From 64d116540fab20676dda18de5dedffa4251b6ed3 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 24 Jan 2025 09:08:16 -0300 Subject: [PATCH] rerank with lower threshold + back to map if lenght = 0 --- apps/api/src/lib/extract/build-prompts.ts | 2 +- apps/api/src/lib/extract/reranker.ts | 35 +++++++----- apps/api/src/lib/extract/url-processor.ts | 66 ++++++++++++++--------- 3 files changed, 63 insertions(+), 40 deletions(-) diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 8996c13d..51ff447d 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -35,5 +35,5 @@ export function buildRerankerSystemPrompt(): string { } export function buildRerankerUserPrompt(searchQuery: string): string { - return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`; + return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. 
Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction.`; } diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index dc23d4cb..eec5dc92 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -158,24 +158,27 @@ function filterAndProcessLinks( } export type RerankerResult = { - mapDocument: MapDocument[]; + mapDocument: (MapDocument & { relevanceScore?: number })[]; tokensUsed: number; }; -export async function rerankLinksWithLLM( - mappedLinks: MapDocument[], - searchQuery: string, - urlTraces: URLTrace[], -): Promise<RerankerResult> { +export type RerankerOptions = { + links: MapDocument[]; + searchQuery: string; + urlTraces: URLTrace[]; +}; + +export async function rerankLinksWithLLM(options: RerankerOptions): Promise<RerankerResult> { + const { links, searchQuery, urlTraces } = options; const chunkSize = 100; const chunks: MapDocument[][] = []; const TIMEOUT_MS = 20000; const MAX_RETRIES = 2; let totalTokensUsed = 0; - // Split mappedLinks into chunks of 200 - for (let i = 0; i < mappedLinks.length; i += chunkSize) { - chunks.push(mappedLinks.slice(i, i + chunkSize)); + // Split links into chunks of 200 + for (let i = 0; i < links.length; i += chunkSize) { + chunks.push(links.slice(i, i + chunkSize)); } // console.log(`Total links: ${mappedLinks.length}, Number of chunks: ${chunks.length}`); @@ -190,8 +193,9 @@ export async function rerankLinksWithLLM( properties: { url: { type: "string" }, relevanceScore: { type: "number" }, + reason: { type: "string" }, }, - required: ["url", "relevanceScore"], + required: ["url", "relevanceScore", "reason"], }, }, }, @@ -275,10 +279,15 @@ export async function rerankLinksWithLLM( // Map back to MapDocument format, keeping only relevant links const relevantLinks = flattenedResults - .map((result) => mappedLinks.find((link) => link.url === result.url)) - .filter((link): link is MapDocument => link !== undefined); + 
.map((result) => { + const link = links.find((link) => link.url === result.url); + if (link) { + return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0 }; + } + return undefined; + }) + .filter((link): link is NonNullable<typeof link> => link !== undefined); - // console.log(`Returning ${relevantLinks.length} relevant links`); return { mapDocument: relevantLinks, tokensUsed: totalTokensUsed, diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index ab9f6f60..7a265f36 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -203,38 +203,52 @@ export async function processUrl( rephrasedPrompt }); - logger.info("Reranking (pass 1)..."); - const rerankerResult = await rerankLinksWithLLM( - mappedLinks, - rephrasedPrompt, - urlTraces, - ); - mappedLinks = rerankerResult.mapDocument; + let rerankedLinks = mappedLinks; + logger.info("Reranking pass 1 (threshold 0.6)..."); + const rerankerResult = await rerankLinksWithLLM({ + links: rerankedLinks, + searchQuery: rephrasedPrompt, + urlTraces + }); + rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.6); let tokensUsed = rerankerResult.tokensUsed; - logger.info("Reranked! (pass 1)", { - linkCount: mappedLinks.length, + + logger.info("Reranked! (threshold 0.6)", { + linkCount: rerankedLinks.length, }); - // 2nd Pass, useful for when the first pass returns too many links - if (mappedLinks.length > 100) { - logger.info("Reranking (pass 2)..."); - const rerankerResult = await rerankLinksWithLLM( - mappedLinks, - rephrasedPrompt, - urlTraces, - ); - mappedLinks = rerankerResult.mapDocument; - tokensUsed += rerankerResult.tokensUsed; - logger.info("Reranked! (pass 2)", { - linkCount: mappedLinks.length, + // lower threshold to 0.3 if no links are found + if (rerankedLinks.length === 0) { + logger.info("No links found. 
Reranking with threshold 0.3"); + rerankedLinks = rerankerResult.mapDocument.filter((x) => x.relevanceScore && x.relevanceScore > 0.3); + logger.info("Reranked! (threshold 0.3)", { + linkCount: rerankedLinks.length, }); } - // dumpToFile( - // "llm-links.txt", - // mappedLinks, - // (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}` - // ); + // 2nd Pass, useful for when the first pass returns too many links + if (rerankedLinks.length > 100) { + logger.info("Reranking pass 2 (> 100 links - threshold 0.6)..."); + const secondPassRerankerResult = await rerankLinksWithLLM({ + links: rerankedLinks, + searchQuery: rephrasedPrompt, + urlTraces, + }); + + if (secondPassRerankerResult.mapDocument.length > 0) { + rerankedLinks = secondPassRerankerResult.mapDocument; + logger.info("Reranked! (threshold 0.6)", { + linkCount: rerankedLinks.length, + }); + } + } + + // If no relevant links are found, return the original mapped links + if (rerankedLinks.length === 0) { + logger.info("No links found. Not reranking."); + rerankedLinks = mappedLinks; + } + // Remove title and description from mappedLinks mappedLinks = mappedLinks.map((link) => ({ url: link.url })); return mappedLinks.map((x) => x.url);