mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-08 14:29:04 +08:00
Nick: fixed re-ranker and enabled url cache of 2hrs
This commit is contained in:
parent
33632d2fe3
commit
c3fd13a82b
3
.gitignore
vendored
3
.gitignore
vendored
@ -33,3 +33,6 @@ apps/js-sdk/firecrawl/dist
|
||||
/examples/haiku_web_crawler/firecrawl_env
|
||||
/examples/sonnet_web_crawler/firecrawl_env
|
||||
/examples/internal_link_assitant/firecrawl_env
|
||||
|
||||
/apps/api/logs/*
|
||||
/apps/api/debug/*
|
@ -42,7 +42,7 @@ export async function saveEntryToCache(key: string, entry: CacheEntry) {
|
||||
if (!cacheRedis) return;
|
||||
|
||||
try {
|
||||
await cacheRedis.set(key, JSON.stringify(entry));
|
||||
await cacheRedis.set(key, JSON.stringify(entry), "EX", 3600); // 1 hour in seconds
|
||||
} catch (error) {
|
||||
logger.warn("Failed to save to cache", { key, error });
|
||||
}
|
||||
|
7
apps/api/src/lib/extract/config.ts
Normal file
7
apps/api/src/lib/extract/config.ts
Normal file
@ -0,0 +1,7 @@
|
||||
export const extractConfig = {
|
||||
MAX_INITIAL_RANKING_LIMIT: 1000,
|
||||
MAX_RANKING_LIMIT: 20,
|
||||
INITIAL_SCORE_THRESHOLD: 0.75,
|
||||
FALLBACK_SCORE_THRESHOLD: 0.5,
|
||||
MIN_REQUIRED_LINKS: 1,
|
||||
};
|
@ -3,15 +3,13 @@ import { performRanking } from "../ranker";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { logger } from "../logger";
|
||||
import { CohereClient } from "cohere-ai";
|
||||
import { extractConfig } from "./config";
|
||||
|
||||
const cohere = new CohereClient({
|
||||
token: process.env.COHERE_API_KEY,
|
||||
});
|
||||
|
||||
const MAX_RANKING_LIMIT = 10;
|
||||
const INITIAL_SCORE_THRESHOLD = 0.75;
|
||||
const FALLBACK_SCORE_THRESHOLD = 0.5;
|
||||
const MIN_REQUIRED_LINKS = 1;
|
||||
|
||||
|
||||
interface RankingResult {
|
||||
mappedLinks: MapDocument[];
|
||||
@ -61,32 +59,35 @@ export async function rerankLinks(
|
||||
searchQuery,
|
||||
);
|
||||
|
||||
|
||||
// First try with high threshold
|
||||
let filteredLinks = filterAndProcessLinks(
|
||||
mappedLinks,
|
||||
linksAndScores,
|
||||
INITIAL_SCORE_THRESHOLD,
|
||||
extractConfig.INITIAL_SCORE_THRESHOLD,
|
||||
);
|
||||
|
||||
|
||||
|
||||
// If we don't have enough high-quality links, try with lower threshold
|
||||
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
|
||||
if (filteredLinks.length < extractConfig.MIN_REQUIRED_LINKS) {
|
||||
logger.info(
|
||||
`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
|
||||
`Only found ${filteredLinks.length} links with score > ${extractConfig.INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`,
|
||||
);
|
||||
filteredLinks = filterAndProcessLinks(
|
||||
mappedLinks,
|
||||
linksAndScores,
|
||||
FALLBACK_SCORE_THRESHOLD,
|
||||
extractConfig.FALLBACK_SCORE_THRESHOLD,
|
||||
);
|
||||
|
||||
if (filteredLinks.length === 0) {
|
||||
// If still no results, take top N results regardless of score
|
||||
logger.warn(
|
||||
`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`,
|
||||
`No links found with score > ${extractConfig.FALLBACK_SCORE_THRESHOLD}. Taking top ${extractConfig.MIN_REQUIRED_LINKS} results.`,
|
||||
);
|
||||
filteredLinks = linksAndScores
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, MIN_REQUIRED_LINKS)
|
||||
.slice(0, extractConfig.MIN_REQUIRED_LINKS)
|
||||
.map((x) => mappedLinks.find((link) => link.url === x.link))
|
||||
.filter(
|
||||
(x): x is MapDocument =>
|
||||
@ -108,7 +109,7 @@ export async function rerankLinks(
|
||||
}
|
||||
});
|
||||
|
||||
const rankedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
|
||||
const rankedLinks = filteredLinks.slice(0, extractConfig.MAX_RANKING_LIMIT);
|
||||
|
||||
// Mark URLs that will be used in completion
|
||||
rankedLinks.forEach(link => {
|
||||
@ -119,7 +120,7 @@ export async function rerankLinks(
|
||||
});
|
||||
|
||||
// Mark URLs that were dropped due to ranking limit
|
||||
filteredLinks.slice(MAX_RANKING_LIMIT).forEach(link => {
|
||||
filteredLinks.slice(extractConfig.MAX_RANKING_LIMIT).forEach(link => {
|
||||
const trace = urlTraces.find(t => t.url === link.url);
|
||||
if (trace) {
|
||||
trace.warning = 'Excluded due to ranking limit';
|
||||
|
@ -7,8 +7,7 @@ import { generateBasicCompletion } from "../LLM-extraction";
|
||||
import { buildRefrasedPrompt } from "./build-prompts";
|
||||
import { logger } from "../logger";
|
||||
import { rerankLinks } from "./reranker";
|
||||
|
||||
const MAX_EXTRACT_LIMIT = 100;
|
||||
import { extractConfig } from "./config";
|
||||
|
||||
interface ProcessUrlOptions {
|
||||
url: string;
|
||||
@ -96,8 +95,8 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace
|
||||
mappedLinks = [{ url: baseUrl, title: "", description: "" }];
|
||||
}
|
||||
|
||||
// Limit initial set of links
|
||||
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
|
||||
// Limit initial set of links (1000)
|
||||
mappedLinks = mappedLinks.slice(0, extractConfig.MAX_INITIAL_RANKING_LIMIT);
|
||||
|
||||
// Perform reranking if prompt is provided
|
||||
if (options.prompt) {
|
||||
|
@ -38,7 +38,7 @@ const useCache =
|
||||
process.env.CACHE_REDIS_URL !== undefined;
|
||||
|
||||
export const engines: Engine[] = [
|
||||
// ...(useCache ? [ "cache" as const ] : []),
|
||||
...(useCache ? [ "cache" as const ] : []),
|
||||
...(useFireEngine
|
||||
? [
|
||||
"fire-engine;chrome-cdp" as const,
|
||||
|
Loading…
x
Reference in New Issue
Block a user