mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:29:04 +08:00
rerank with lower threshold + back to map if length = 0
This commit is contained in:
parent
05d79a875a
commit
64d116540f
@ -35,5 +35,5 @@ export function buildRerankerSystemPrompt(): string {
|
||||
}
|
||||
|
||||
export function buildRerankerUserPrompt(searchQuery: string): string {
|
||||
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`;
|
||||
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction.`;
|
||||
}
|
||||
|
@ -158,24 +158,27 @@ function filterAndProcessLinks(
|
||||
}
|
||||
|
||||
export type RerankerResult = {
|
||||
mapDocument: MapDocument[];
|
||||
mapDocument: (MapDocument & { relevanceScore?: number })[];
|
||||
tokensUsed: number;
|
||||
};
|
||||
|
||||
export async function rerankLinksWithLLM(
|
||||
mappedLinks: MapDocument[],
|
||||
searchQuery: string,
|
||||
urlTraces: URLTrace[],
|
||||
): Promise<RerankerResult> {
|
||||
export type RerankerOptions = {
|
||||
links: MapDocument[];
|
||||
searchQuery: string;
|
||||
urlTraces: URLTrace[];
|
||||
};
|
||||
|
||||
export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
|
||||
const { links, searchQuery, urlTraces } = options;
|
||||
const chunkSize = 100;
|
||||
const chunks: MapDocument[][] = [];
|
||||
const TIMEOUT_MS = 20000;
|
||||
const MAX_RETRIES = 2;
|
||||
let totalTokensUsed = 0;
|
||||
|
||||
// Split mappedLinks into chunks of `chunkSize` (100)
|
||||
for (let i = 0; i < mappedLinks.length; i += chunkSize) {
|
||||
chunks.push(mappedLinks.slice(i, i + chunkSize));
|
||||
// Split links into chunks of `chunkSize` (100)
|
||||
for (let i = 0; i < links.length; i += chunkSize) {
|
||||
chunks.push(links.slice(i, i + chunkSize));
|
||||
}
|
||||
|
||||
// console.log(`Total links: ${mappedLinks.length}, Number of chunks: ${chunks.length}`);
|
||||
@ -190,8 +193,9 @@ export async function rerankLinksWithLLM(
|
||||
properties: {
|
||||
url: { type: "string" },
|
||||
relevanceScore: { type: "number" },
|
||||
reason: { type: "string" },
|
||||
},
|
||||
required: ["url", "relevanceScore"],
|
||||
required: ["url", "relevanceScore", "reason"],
|
||||
},
|
||||
},
|
||||
},
|
||||
@ -275,10 +279,15 @@ export async function rerankLinksWithLLM(
|
||||
|
||||
// Map back to MapDocument format, keeping only relevant links
|
||||
const relevantLinks = flattenedResults
|
||||
.map((result) => mappedLinks.find((link) => link.url === result.url))
|
||||
.filter((link): link is MapDocument => link !== undefined);
|
||||
.map((result) => {
|
||||
const link = links.find((link) => link.url === result.url);
|
||||
if (link) {
|
||||
return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0 };
|
||||
}
|
||||
return undefined;
|
||||
})
|
||||
.filter((link): link is NonNullable<typeof link> => link !== undefined);
|
||||
|
||||
// console.log(`Returning ${relevantLinks.length} relevant links`);
|
||||
return {
|
||||
mapDocument: relevantLinks,
|
||||
tokensUsed: totalTokensUsed,
|
||||
|
@ -203,38 +203,52 @@ export async function processUrl(
|
||||
rephrasedPrompt
|
||||
});
|
||||
|
||||
logger.info("Reranking (pass 1)...");
|
||||
const rerankerResult = await rerankLinksWithLLM(
|
||||
mappedLinks,
|
||||
rephrasedPrompt,
|
||||
urlTraces,
|
||||
);
|
||||
mappedLinks = rerankerResult.mapDocument;
|
||||
let rerankedLinks = mappedLinks;
|
||||
logger.info("Reranking pass 1 (threshold 0.6)...");
|
||||
const rerankerResult = await rerankLinksWithLLM({
|
||||
links: rerankedLinks,
|
||||
searchQuery: rephrasedPrompt,
|
||||
urlTraces
|
||||
});
|
||||
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
|
||||
let tokensUsed = rerankerResult.tokensUsed;
|
||||
logger.info("Reranked! (pass 1)", {
|
||||
linkCount: mappedLinks.length,
|
||||
|
||||
logger.info("Reranked! (threshold 0.6)", {
|
||||
linkCount: rerankedLinks.length,
|
||||
});
|
||||
|
||||
// 2nd Pass, useful for when the first pass returns too many links
|
||||
if (mappedLinks.length > 100) {
|
||||
logger.info("Reranking (pass 2)...");
|
||||
const rerankerResult = await rerankLinksWithLLM(
|
||||
mappedLinks,
|
||||
rephrasedPrompt,
|
||||
urlTraces,
|
||||
);
|
||||
mappedLinks = rerankerResult.mapDocument;
|
||||
tokensUsed += rerankerResult.tokensUsed;
|
||||
logger.info("Reranked! (pass 2)", {
|
||||
linkCount: mappedLinks.length,
|
||||
// lower threshold to 0.3 if no links are found
|
||||
if (rerankedLinks.length === 0) {
|
||||
logger.info("No links found. Reranking with threshold 0.3");
|
||||
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.3);
|
||||
logger.info("Reranked! (threshold 0.3)", {
|
||||
linkCount: rerankedLinks.length,
|
||||
});
|
||||
}
|
||||
|
||||
// dumpToFile(
|
||||
// "llm-links.txt",
|
||||
// mappedLinks,
|
||||
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
|
||||
// );
|
||||
// 2nd Pass, useful for when the first pass returns too many links
|
||||
if (rerankedLinks.length > 100) {
|
||||
logger.info("Reranking pass 2 (> 100 links - threshold 0.6)...");
|
||||
const secondPassRerankerResult = await rerankLinksWithLLM({
|
||||
links: rerankedLinks,
|
||||
searchQuery: rephrasedPrompt,
|
||||
urlTraces,
|
||||
});
|
||||
|
||||
if (secondPassRerankerResult.mapDocument.length > 0) {
|
||||
rerankedLinks = secondPassRerankerResult.mapDocument;
|
||||
logger.info("Reranked! (threshold 0.6)", {
|
||||
linkCount: rerankedLinks.length,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// If no relevant links are found, return the original mapped links
|
||||
if (rerankedLinks.length === 0) {
|
||||
logger.info("No links found. Not reranking.");
|
||||
rerankedLinks = mappedLinks;
|
||||
}
|
||||
|
||||
// Remove title and description from mappedLinks
|
||||
mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
|
||||
return mappedLinks.map((x) => x.url);
|
||||
|
Loading…
x
Reference in New Issue
Block a user