rerank with lower threshold + back to map if length = 0

This commit is contained in:
rafaelmmiller 2025-01-24 09:08:16 -03:00
parent 05d79a875a
commit 64d116540f
3 changed files with 63 additions and 40 deletions

View File

@ -35,5 +35,5 @@ export function buildRerankerSystemPrompt(): string {
}
export function buildRerankerUserPrompt(searchQuery: string): string {
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`;
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction.`;
}

View File

@ -158,24 +158,27 @@ function filterAndProcessLinks(
}
export type RerankerResult = {
mapDocument: MapDocument[];
mapDocument: (MapDocument & { relevanceScore?: number })[];
tokensUsed: number;
};
export async function rerankLinksWithLLM(
mappedLinks: MapDocument[],
searchQuery: string,
urlTraces: URLTrace[],
): Promise<RerankerResult> {
/**
 * Options object accepted by `rerankLinksWithLLM`; the function destructures
 * all three fields on entry.
 */
export type RerankerOptions = {
// Candidate links to rerank; the reranker slices this array into fixed-size chunks.
links: MapDocument[];
// Query the links are evaluated against — presumably interpolated into the reranker prompt; TODO confirm.
searchQuery: string;
// URL trace entries supplied by the caller — how they are used is not visible in this excerpt; TODO confirm.
urlTraces: URLTrace[];
};
export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> {
const { links, searchQuery, urlTraces } = options;
const chunkSize = 100;
const chunks: MapDocument[][] = [];
const TIMEOUT_MS = 20000;
const MAX_RETRIES = 2;
let totalTokensUsed = 0;
// Split mappedLinks into chunks of 200
for (let i = 0; i < mappedLinks.length; i += chunkSize) {
chunks.push(mappedLinks.slice(i, i + chunkSize));
// Split links into chunks of chunkSize (100)
for (let i = 0; i < links.length; i += chunkSize) {
chunks.push(links.slice(i, i + chunkSize));
}
// console.log(`Total links: ${mappedLinks.length}, Number of chunks: ${chunks.length}`);
@ -190,8 +193,9 @@ export async function rerankLinksWithLLM(
properties: {
url: { type: "string" },
relevanceScore: { type: "number" },
reason: { type: "string" },
},
required: ["url", "relevanceScore"],
required: ["url", "relevanceScore", "reason"],
},
},
},
@ -275,10 +279,15 @@ export async function rerankLinksWithLLM(
// Map back to MapDocument format, keeping only relevant links
const relevantLinks = flattenedResults
.map((result) => mappedLinks.find((link) => link.url === result.url))
.filter((link): link is MapDocument => link !== undefined);
.map((result) => {
const link = links.find((link) => link.url === result.url);
if (link) {
return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0 };
}
return undefined;
})
.filter((link): link is NonNullable<typeof link> => link !== undefined);
// console.log(`Returning ${relevantLinks.length} relevant links`);
return {
mapDocument: relevantLinks,
tokensUsed: totalTokensUsed,

View File

@ -203,38 +203,52 @@ export async function processUrl(
rephrasedPrompt
});
logger.info("Reranking (pass 1)...");
const rerankerResult = await rerankLinksWithLLM(
mappedLinks,
rephrasedPrompt,
urlTraces,
);
mappedLinks = rerankerResult.mapDocument;
let rerankedLinks = mappedLinks;
logger.info("Reranking pass 1 (threshold 0.6)...");
const rerankerResult = await rerankLinksWithLLM({
links: rerankedLinks,
searchQuery: rephrasedPrompt,
urlTraces
});
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6);
let tokensUsed = rerankerResult.tokensUsed;
logger.info("Reranked! (pass 1)", {
linkCount: mappedLinks.length,
logger.info("Reranked! (threshold 0.6)", {
linkCount: rerankedLinks.length,
});
// 2nd Pass, useful for when the first pass returns too many links
if (mappedLinks.length > 100) {
logger.info("Reranking (pass 2)...");
const rerankerResult = await rerankLinksWithLLM(
mappedLinks,
rephrasedPrompt,
urlTraces,
);
mappedLinks = rerankerResult.mapDocument;
tokensUsed += rerankerResult.tokensUsed;
logger.info("Reranked! (pass 2)", {
linkCount: mappedLinks.length,
// lower threshold to 0.3 if no links are found
if (rerankedLinks.length === 0) {
logger.info("No links found. Reranking with threshold 0.3");
rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.3);
logger.info("Reranked! (threshold 0.3)", {
linkCount: rerankedLinks.length,
});
}
// dumpToFile(
// "llm-links.txt",
// mappedLinks,
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
// );
// 2nd Pass, useful for when the first pass returns too many links
if (rerankedLinks.length > 100) {
logger.info("Reranking pass 2 (> 100 links - threshold 0.6)...");
const secondPassRerankerResult = await rerankLinksWithLLM({
links: rerankedLinks,
searchQuery: rephrasedPrompt,
urlTraces,
});
if (secondPassRerankerResult.mapDocument.length > 0) {
rerankedLinks = secondPassRerankerResult.mapDocument;
logger.info("Reranked! (threshold 0.6)", {
linkCount: rerankedLinks.length,
});
}
}
// If no relevant links are found, return the original mapped links
if (rerankedLinks.length === 0) {
logger.info("No links found. Not reranking.");
rerankedLinks = mappedLinks;
}
// Remove title and description from mappedLinks
mappedLinks = mappedLinks.map((link) => ({ url: link.url }));
return mappedLinks.map((x) => x.url);