From c3fd13a82ba16e6c24f8bb69a8d232556dde761e Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 31 Dec 2024 18:06:07 -0300
Subject: [PATCH] Nick: fixed re-ranker and enabled url cache of 2hrs

---
 .gitignore                                 |  3 +++
 apps/api/src/lib/cache.ts                  |  2 +-
 apps/api/src/lib/extract/config.ts         |  7 ++++++
 apps/api/src/lib/extract/reranker.ts       | 25 ++++++++++---------
 apps/api/src/lib/extract/url-processor.ts  |  7 +++---
 .../src/scraper/scrapeURL/engines/index.ts |  2 +-
 6 files changed, 28 insertions(+), 18 deletions(-)
 create mode 100644 apps/api/src/lib/extract/config.ts

diff --git a/.gitignore b/.gitignore
index fc527490..311ee4df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,6 @@ apps/js-sdk/firecrawl/dist
 /examples/haiku_web_crawler/firecrawl_env
 /examples/sonnet_web_crawler/firecrawl_env
 /examples/internal_link_assitant/firecrawl_env
+
+/apps/api/logs/*
+/apps/api/debug/*
\ No newline at end of file
diff --git a/apps/api/src/lib/cache.ts b/apps/api/src/lib/cache.ts
index cbab4e05..ff91fa88 100644
--- a/apps/api/src/lib/cache.ts
+++ b/apps/api/src/lib/cache.ts
@@ -42,7 +42,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
   if (!cacheRedis) return;
 
   try {
-    await cacheRedis.set(key, JSON.stringify(entry));
+    await cacheRedis.set(key, JSON.stringify(entry), "EX", 2 * 60 * 60); // 2 hours in seconds
   } catch (error) {
     logger.warn("Failed to save to cache", { key, error });
   }
diff --git a/apps/api/src/lib/extract/config.ts b/apps/api/src/lib/extract/config.ts
new file mode 100644
index 00000000..f8333b3c
--- /dev/null
+++ b/apps/api/src/lib/extract/config.ts
@@ -0,0 +1,7 @@
+export const extractConfig = {
+  MAX_INITIAL_RANKING_LIMIT: 1000,
+  MAX_RANKING_LIMIT: 20,
+  INITIAL_SCORE_THRESHOLD: 0.75,
+  FALLBACK_SCORE_THRESHOLD: 0.5,
+  MIN_REQUIRED_LINKS: 1,
+};
\ No newline at end of file
diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts
index 2a4e2f62..e5b61741 100644
--- a/apps/api/src/lib/extract/reranker.ts
+++ b/apps/api/src/lib/extract/reranker.ts
@@ -3,15 +3,13 @@ import { performRanking } from "../ranker";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { logger } from "../logger";
 import { CohereClient } from "cohere-ai";
+import { extractConfig } from "./config";
 
 const cohere = new CohereClient({
   token: process.env.COHERE_API_KEY,
 });
 
-const MAX_RANKING_LIMIT = 10;
-const INITIAL_SCORE_THRESHOLD = 0.75;
-const FALLBACK_SCORE_THRESHOLD = 0.5;
-const MIN_REQUIRED_LINKS = 1;
+
 
 interface RankingResult {
   mappedLinks: MapDocument[];
@@ -61,32 +59,35 @@ export async function rerankLinks(
     searchQuery,
   );
 
+  // First try with high threshold
   let filteredLinks = filterAndProcessLinks(
     mappedLinks,
     linksAndScores,
-    INITIAL_SCORE_THRESHOLD,
+    extractConfig.INITIAL_SCORE_THRESHOLD,
   );
+
+  // If we don't have enough high-quality links, try with lower threshold
-  if (filteredLinks.length < MIN_REQUIRED_LINKS) {
+  if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
     logger.info(
-      `Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
+      `Only found ${filteredLinks.length} links with score > ${extractConfig.INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
     );
     filteredLinks = filterAndProcessLinks(
       mappedLinks,
       linksAndScores,
-      FALLBACK_SCORE_THRESHOLD,
+      extractConfig.FALLBACK_SCORE_THRESHOLD,
     );
 
     if (filteredLinks.length === 0) {
       // If still no results, take top N results regardless of score
       logger.warn(
-        `No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`,
+        `No links found with score > ${extractConfig.FALLBACK_SCORE_THRESHOLD}. Taking top ${extractConfig.MIN_REQUIRED_LINKS} results.`,
       );
       filteredLinks = linksAndScores
         .sort((a, b) => b.score - a.score)
-        .slice(0, MIN_REQUIRED_LINKS)
+        .slice(0, extractConfig.MIN_REQUIRED_LINKS)
         .map((x) => mappedLinks.find((link) => link.url === x.link))
         .filter(
           (x): x is MapDocument =>
@@ -108,7 +109,7 @@
     }
   });
 
-  const rankedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
+  const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
 
   // Mark URLs that will be used in completion
   rankedLinks.forEach(link => {
@@ -119,7 +120,7 @@
   });
 
   // Mark URLs that were dropped due to ranking limit
-  filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => {
+  filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => {
     const trace = urlTraces.find(t => t.url === link.url);
     if (trace) {
       trace.warning = 'Excluded due to ranking limit';
diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts
index 4d61a8d3..9f255ad7 100644
--- a/apps/api/src/lib/extract/url-processor.ts
+++ b/apps/api/src/lib/extract/url-processor.ts
@@ -7,8 +7,7 @@ import { generateBasicCompletion } from "../LLM-extraction";
 import { buildRefrasedPrompt } from "./build-prompts";
 import { logger } from "../logger";
 import { rerankLinks } from "./reranker";
-
-const MAX_EXTRACT_LIMIT = 100;
+import { extractConfig } from "./config";
 
 interface ProcessUrlOptions {
   url: string;
@@ -96,8 +95,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
     mappedLinks = [{ url: baseUrl, title: "", description: "" }];
   }
 
-  // Limit initial set of links
-  mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
+  // Limit initial set of links (1000)
+  mappedLinks = mappedLinks.slice(0, extractConfig.MAX_INITIAL_RANKING_LIMIT);
 
   // Perform reranking if prompt is provided
   if (options.prompt) {
diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts
index bb0c485c..bf51ac94 100644
--- a/apps/api/src/scraper/scrapeURL/engines/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/index.ts
@@ -38,7 +38,7 @@
 const useCache = process.env.CACHE_REDIS_URL !== undefined;
 
 export const engines: Engine[] = [
-  // ...(useCache ? [ "cache" as const ] : []),
+  ...(useCache ? [ "cache" as const ] : []),
   ...(useFireEngine
     ? [
        "fire-engine;chrome-cdp" as const,
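
A quick way to sanity-check the new cache expiry outside the API: ioredis accepts the
same "EX" argument that saveEntryToCache now passes, so a small standalone script can
confirm the TTL actually lands on the key. This is a sketch, not code from the patch;
the key name and payload are made up, and it assumes CACHE_REDIS_URL points at a
reachable Redis instance.

    import Redis from "ioredis";

    const redis = new Redis(process.env.CACHE_REDIS_URL ?? "redis://localhost:6379");

    async function checkTtl() {
      // Same call shape as saveEntryToCache: value, then "EX" and a lifetime in seconds.
      await redis.set("cache:ttl-check", JSON.stringify({ ok: true }), "EX", 2 * 60 * 60);

      // Should print a number at or just under 7200 (2 hours).
      console.log("ttl:", await redis.ttl("cache:ttl-check"));
    }

    checkTtl().finally(() => redis.quit());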
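
The behavioral core of the re-ranker fix is the threshold cascade that extractConfig
now drives: keep links above the strict threshold, relax the threshold if too few
survive, and as a last resort take the top-scored links regardless of score. Condensed
into one function it looks roughly like the sketch below; ScoredLink and selectLinks
are illustrative names only, and the real code additionally maps scores back onto
MapDocument objects and records warnings in the url traces.

    import { extractConfig } from "./config";

    interface ScoredLink {
      link: string;
      score: number;
    }

    function selectLinks(scored: ScoredLink[]): ScoredLink[] {
      // Pass 1: keep only high-confidence links (score > 0.75).
      let kept = scored.filter((s) => s.score > extractConfig.INITIAL_SCORE_THRESHOLD);

      // Pass 2: too few survivors, so relax to the fallback threshold (0.5).
      if (kept.length < extractConfig.MIN_REQUIRED_LINKS) {
        kept = scored.filter((s) => s.score > extractConfig.FALLBACK_SCORE_THRESHOLD);
      }

      // Last resort: take the top N by score, ignoring thresholds entirely.
      if (kept.length === 0) {
        kept = [...scored]
          .sort((a, b) => b.score - a.score)
          .slice(0, extractConfig.MIN_REQUIRED_LINKS);
      }

      // Cap at MAX_RANKING_LIMIT (now 20, up from 10); in the real code the
      // overflow is flagged with the "Excluded due to ranking limit" warning.
      return kept.slice(0, extractConfig.MAX_RANKING_LIMIT);
    }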
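
Re-enabling the cache engine is just un-commenting the conditional spread, which keeps
the engine list env-driven: when CACHE_REDIS_URL is unset, useCache is false and the
spread contributes nothing, so behavior is unchanged for deployments without a cache
Redis. A reduced sketch of the pattern, with the Engine union trimmed to a few members
for illustration:

    type Engine = "cache" | "fire-engine;chrome-cdp" | "fetch";

    const useCache = process.env.CACHE_REDIS_URL !== undefined;

    // "cache" is tried first when available; otherwise the array simply starts
    // with the next engine in priority order.
    const engines: Engine[] = [
      ...(useCache ? ["cache" as const] : []),
      "fire-engine;chrome-cdp",
      "fetch",
    ];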