Nick: fixed re-ranker and enabled url cache with a 1-hour TTL

This commit is contained in:
Nicolas 2024-12-31 18:06:07 -03:00
parent 33632d2fe3
commit c3fd13a82b
6 changed files with 28 additions and 18 deletions

3
.gitignore vendored
View File

@ -33,3 +33,6 @@ apps/js-sdk/firecrawl/dist
/examples/haiku_web_crawler/firecrawl_env
/examples/sonnet_web_crawler/firecrawl_env
/examples/internal_link_assitant/firecrawl_env
/apps/api/logs/*
/apps/api/debug/*

View File

@ -42,7 +42,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
if (!cacheRedis) return;
try {
await cacheRedis.set(key, JSON.stringify(entry));
await cacheRedis.set(key, JSON.stringify(entry), "EX", 3600); // 1 hour in seconds
} catch (error) {
logger.warn("Failed to save to cache", { key, error });
}

View File

@ -0,0 +1,7 @@
/**
 * Tunable limits and score thresholds used when selecting and ranking links.
 */
export const extractConfig = {
  /** Upper bound on how many mapped links are kept before any ranking is done. */
  MAX_INITIAL_RANKING_LIMIT: 1000,

  /** Upper bound on how many ranked links are kept; links past this are dropped. */
  MAX_RANKING_LIMIT: 20,

  /** Score threshold applied on the first filtering pass. */
  INITIAL_SCORE_THRESHOLD: 0.75,

  /** Looser score threshold tried when the first pass yields too few links. */
  FALLBACK_SCORE_THRESHOLD: 0.5,

  /** Minimum number of links the filtering step should end up with. */
  MIN_REQUIRED_LINKS: 1,
};

View File

@ -3,15 +3,13 @@ import { performRanking } from "../ranker";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { logger } from "../logger";
import { CohereClient } from "cohere-ai";
import { extractConfig } from "./config";
const cohere = new CohereClient({
token: process.env.COHERE_API_KEY,
});
const MAX_RANKING_LIMIT = 10;
const INITIAL_SCORE_THRESHOLD = 0.75;
const FALLBACK_SCORE_THRESHOLD = 0.5;
const MIN_REQUIRED_LINKS = 1;
interface RankingResult {
mappedLinks: MapDocument[];
@ -61,32 +59,35 @@ export async function rerankLinks(
searchQuery,
);
// First try with high threshold
let filteredLinks = filterAndProcessLinks(
mappedLinks,
linksAndScores,
INITIAL_SCORE_THRESHOLD,
extractConfig.INITIAL_SCORE_THRESHOLD,
);
// If we don't have enough high-quality links, try with lower threshold
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
logger.info(
`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
`Only found ${filteredLinks.length} links with score > ${extractConfig.INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
);
filteredLinks = filterAndProcessLinks(
mappedLinks,
linksAndScores,
FALLBACK_SCORE_THRESHOLD,
extractConfig.FALLBACK_SCORE_THRESHOLD,
);
if (filteredLinks.length === 0) {
// If still no results, take top N results regardless of score
logger.warn(
`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`,
`No links found with score > ${extractConfig.FALLBACK_SCORE_THRESHOLD}. Taking top ${extractConfig.MIN_REQUIRED_LINKS} results.`,
);
filteredLinks = linksAndScores
.sort((a, b) => b.score - a.score)
.slice(0, MIN_REQUIRED_LINKS)
.slice(0, extractConfig.MIN_REQUIRED_LINKS)
.map((x) => mappedLinks.find((link) => link.url === x.link))
.filter(
(x): x is MapDocument =>
@ -108,7 +109,7 @@ export async function rerankLinks(
}
});
const rankedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
// Mark URLs that will be used in completion
rankedLinks.forEach(link => {
@ -119,7 +120,7 @@ export async function rerankLinks(
});
// Mark URLs that were dropped due to ranking limit
filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => {
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => {
const trace = urlTraces.find(t => t.url === link.url);
if (trace) {
trace.warning = 'Excluded due to ranking limit';

View File

@ -7,8 +7,7 @@ import { generateBasicCompletion } from "../LLM-extraction";
import { buildRefrasedPrompt } from "./build-prompts";
import { logger } from "../logger";
import { rerankLinks } from "./reranker";
const MAX_EXTRACT_LIMIT = 100;
import { extractConfig } from "./config";
interface ProcessUrlOptions {
url: string;
@ -96,8 +95,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
}
// Limit initial set of links
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
// Limit initial set of links (1000)
mappedLinks = mappedLinks.slice(0, extractConfig.MAX_INITIAL_RANKING_LIMIT);
// Perform reranking if prompt is provided
if (options.prompt) {

View File

@ -38,7 +38,7 @@ const useCache =
process.env.CACHE_REDIS_URL !== undefined;
export const engines: Engine[] = [
// ...(useCache ? [ "cache" as const ] : []),
...(useCache ? [ "cache" as const ] : []),
...(useFireEngine
? [
"fire-engine;chrome-cdp" as const,