Nick: fixed re-ranker and enabled url cache of 1hr (EX 3600s)

This commit is contained in:
Nicolas 2024-12-31 18:06:07 -03:00
parent 33632d2fe3
commit c3fd13a82b
6 changed files with 28 additions and 18 deletions

3
.gitignore vendored
View File

@ -33,3 +33,6 @@ apps/js-sdk/firecrawl/dist
/examples/haiku_web_crawler/firecrawl_env /examples/haiku_web_crawler/firecrawl_env
/examples/sonnet_web_crawler/firecrawl_env /examples/sonnet_web_crawler/firecrawl_env
/examples/internal_link_assitant/firecrawl_env /examples/internal_link_assitant/firecrawl_env
/apps/api/logs/*
/apps/api/debug/*

View File

@ -42,7 +42,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
if (!cacheRedis) return; if (!cacheRedis) return;
try { try {
await cacheRedis.set(key, JSON.stringify(entry)); await cacheRedis.set(key, JSON.stringify(entry), "EX", 3600); // 1 hour in seconds
} catch (error) { } catch (error) {
logger.warn("Failed to save to cache", { key, error }); logger.warn("Failed to save to cache", { key, error });
} }

View File

@ -0,0 +1,7 @@
export const extractConfig = {
MAX_INITIAL_RANKING_LIMIT: 1000,
MAX_RANKING_LIMIT: 20,
INITIAL_SCORE_THRESHOLD: 0.75,
FALLBACK_SCORE_THRESHOLD: 0.5,
MIN_REQUIRED_LINKS: 1,
};

View File

@ -3,15 +3,13 @@ import { performRanking } from "../ranker";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { logger } from "../logger"; import { logger } from "../logger";
import { CohereClient } from "cohere-ai"; import { CohereClient } from "cohere-ai";
import { extractConfig } from "./config";
const cohere = new CohereClient({ const cohere = new CohereClient({
token: process.env.COHERE_API_KEY, token: process.env.COHERE_API_KEY,
}); });
const MAX_RANKING_LIMIT = 10;
const INITIAL_SCORE_THRESHOLD = 0.75;
const FALLBACK_SCORE_THRESHOLD = 0.5;
const MIN_REQUIRED_LINKS = 1;
interface RankingResult { interface RankingResult {
mappedLinks: MapDocument[]; mappedLinks: MapDocument[];
@ -61,32 +59,35 @@ export async function rerankLinks(
searchQuery, searchQuery,
); );
// First try with high threshold // First try with high threshold
let filteredLinks = filterAndProcessLinks( let filteredLinks = filterAndProcessLinks(
mappedLinks, mappedLinks,
linksAndScores, linksAndScores,
INITIAL_SCORE_THRESHOLD, extractConfig.INITIAL_SCORE_THRESHOLD,
); );
// If we don't have enough high-quality links, try with lower threshold // If we don't have enough high-quality links, try with lower threshold
if (filteredLinks.length < MIN_REQUIRED_LINKS) { if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
logger.info( logger.info(
`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`, `Only found ${filteredLinks.length} links with score > ${extractConfig.INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
); );
filteredLinks = filterAndProcessLinks( filteredLinks = filterAndProcessLinks(
mappedLinks, mappedLinks,
linksAndScores, linksAndScores,
FALLBACK_SCORE_THRESHOLD, extractConfig.FALLBACK_SCORE_THRESHOLD,
); );
if (filteredLinks.length === 0) { if (filteredLinks.length === 0) {
// If still no results, take top N results regardless of score // If still no results, take top N results regardless of score
logger.warn( logger.warn(
`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`, `No links found with score > ${extractConfig.FALLBACK_SCORE_THRESHOLD}. Taking top ${extractConfig.MIN_REQUIRED_LINKS} results.`,
); );
filteredLinks = linksAndScores filteredLinks = linksAndScores
.sort((a, b) => b.score - a.score) .sort((a, b) => b.score - a.score)
.slice(0, MIN_REQUIRED_LINKS) .slice(0, extractConfig.MIN_REQUIRED_LINKS)
.map((x) => mappedLinks.find((link) => link.url === x.link)) .map((x) => mappedLinks.find((link) => link.url === x.link))
.filter( .filter(
(x): x is MapDocument => (x): x is MapDocument =>
@ -108,7 +109,7 @@ export async function rerankLinks(
} }
}); });
const rankedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT); const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
// Mark URLs that will be used in completion // Mark URLs that will be used in completion
rankedLinks.forEach(link => { rankedLinks.forEach(link => {
@ -119,7 +120,7 @@ export async function rerankLinks(
}); });
// Mark URLs that were dropped due to ranking limit // Mark URLs that were dropped due to ranking limit
filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => { filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => {
const trace = urlTraces.find(t => t.url === link.url); const trace = urlTraces.find(t => t.url === link.url);
if (trace) { if (trace) {
trace.warning = 'Excluded due to ranking limit'; trace.warning = 'Excluded due to ranking limit';

View File

@ -7,8 +7,7 @@ import { generateBasicCompletion } from "../LLM-extraction";
import { buildRefrasedPrompt } from "./build-prompts"; import { buildRefrasedPrompt } from "./build-prompts";
import { logger } from "../logger"; import { logger } from "../logger";
import { rerankLinks } from "./reranker"; import { rerankLinks } from "./reranker";
import { extractConfig } from "./config";
const MAX_EXTRACT_LIMIT = 100;
interface ProcessUrlOptions { interface ProcessUrlOptions {
url: string; url: string;
@ -96,8 +95,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
mappedLinks = [{ url: baseUrl, title: "", description: "" }]; mappedLinks = [{ url: baseUrl, title: "", description: "" }];
} }
// Limit initial set of links // Limit initial set of links (1000)
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT); mappedLinks = mappedLinks.slice(0, extractConfig.MAX_INITIAL_RANKING_LIMIT);
// Perform reranking if prompt is provided // Perform reranking if prompt is provided
if (options.prompt) { if (options.prompt) {

View File

@ -38,7 +38,7 @@ const useCache =
process.env.CACHE_REDIS_URL !== undefined; process.env.CACHE_REDIS_URL !== undefined;
export const engines: Engine[] = [ export const engines: Engine[] = [
// ...(useCache ? [ "cache" as const ] : []), ...(useCache ? [ "cache" as const ] : []),
...(useFireEngine ...(useFireEngine
? [ ? [
"fire-engine;chrome-cdp" as const, "fire-engine;chrome-cdp" as const,