From a44d9a2d2a010c9f8adc160585577ef68ec70d6b Mon Sep 17 00:00:00 2001 From: Zhaofeng Miao <522856232@qq.com> Date: Tue, 8 Oct 2024 15:19:03 +0800 Subject: [PATCH] feat(adaptive-crawler): optimize relevance detection --- .../src/cloud-functions/adaptive-crawler.ts | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/backend/functions/src/cloud-functions/adaptive-crawler.ts b/backend/functions/src/cloud-functions/adaptive-crawler.ts index 1c69973..e178bfb 100644 --- a/backend/functions/src/cloud-functions/adaptive-crawler.ts +++ b/backend/functions/src/cloud-functions/adaptive-crawler.ts @@ -362,10 +362,9 @@ export class AdaptiveCrawlerHost extends RPCHost { const title = json.data.title; const description = json.data.description; - const rerankQuery = `TITLE: ${title}; DESCRIPTION: ${description}`; const links = json.data.links as Record; - const relevantUrls = await this.getRelevantUrls(token, { query: rerankQuery, links }); + const relevantUrls = await this.getRelevantUrls(token, { title, description, links }); this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`); for (const url of relevantUrls) { @@ -418,9 +417,10 @@ export class AdaptiveCrawlerHost extends RPCHost { } async getRelevantUrls(token: string, { - query, links + title, description, links }: { - query: string; + title: string; + description: string; links: Record; }) { const invalidSuffix = [ @@ -434,6 +434,13 @@ export class AdaptiveCrawlerHost extends RPCHost { .map(([title, link]) => link) .filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix))); + let query = ''; + if (!description) { + query += title; + } else { + query += `TITLE: ${title}; DESCRIPTION: ${description}`; + } + const data = { model: 'jina-reranker-v2-base-multilingual', query, @@ -460,7 +467,8 @@ export class AdaptiveCrawlerHost extends RPCHost { }[]; }; - return json.results.filter(r => r.relevance_score > 0.3).map(r => removeURLHash(r.document.text)); + const highestRelevanceScore = json.results[0]?.relevance_score ?? 0; + return json.results.filter(r => r.relevance_score > Math.max(highestRelevanceScore * 0.6, 0.1)).map(r => removeURLHash(r.document.text)); } getIndex(user?: JinaEmbeddingsTokenAccount) {