diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 51ff447d..450e29f5 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -31,9 +31,15 @@ Return only a concise sentece or 2 focused on the essential data points that the } export function buildRerankerSystemPrompt(): string { - return "You are a relevance expert. Analyze the provided URLs and their content to determine their relevance to the user's query and intent. For each URL, assign a relevance score between 0 and 1, where 1 means highly relevant and 0 means not relevant at all. Only include URLs that are actually relevant to the query."; + return `You are a relevance expert scoring links from a website the user is trying to + extract information from. Analyze the provided URLs and their content + to determine their relevance to the user's query and intent. + For each URL, assign a relevance score between 0 and 1, where 1 + means highly relevant and we should extract the content from it and 0 means not relevant at all, we should not extract the content from it. + Always return all the links scored that you are giving. Do not omit links. + Always return the links in the same order they were provided. If the user wants the content from all the links, all links should be scored 1.`; } export function buildRerankerUserPrompt(searchQuery: string): string { - return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction.`; + return `Given these URLs, rank which ones are relevant to the user's extraction intent: "${searchQuery}".`; } diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts index eec5dc92..90d4ca21 100644 --- a/apps/api/src/lib/extract/reranker.ts +++ b/apps/api/src/lib/extract/reranker.ts @@ -8,6 +8,7 @@ import { searchSimilarPages } from "./index/pinecone"; import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract"; import { buildRerankerUserPrompt } from "./build-prompts"; import { buildRerankerSystemPrompt } from "./build-prompts"; +import { dumpToFile } from "./helpers/dump-to-file"; const cohere = new CohereClient({ token: process.env.COHERE_API_KEY, @@ -158,7 +159,7 @@ function filterAndProcessLinks( } export type RerankerResult = { - mapDocument: (MapDocument & { relevanceScore?: number })[]; + mapDocument: (MapDocument & { relevanceScore?: number; reason?: string })[]; tokensUsed: number; }; @@ -170,7 +171,7 @@ export type RerankerOptions = { export async function rerankLinksWithLLM(options: RerankerOptions): Promise { const { links, searchQuery, urlTraces } = options; - const chunkSize = 100; + const chunkSize = 20; const chunks: MapDocument[][] = []; const TIMEOUT_MS = 20000; const MAX_RETRIES = 2; @@ -193,7 +194,7 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise resolve(null), TIMEOUT_MS); }); + // dumpToFile(new Date().toISOString(),[buildRerankerSystemPrompt(), buildRerankerUserPrompt(searchQuery), schema, linksContent]) const completionPromise = generateOpenAICompletions( logger.child({ method: "rerankLinksWithLLM", @@ -233,7 +235,7 @@ export async function rerankLinksWithLLM(options: RerankerOptions): Promise { const link = links.find((link) => link.url === result.url); if (link) { - return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0 }; + return { ...link, relevanceScore: result.relevanceScore ? parseFloat(result.relevanceScore) : 0, reason: result.reason }; } return undefined; })