feat(adaptive-crawler): optimize relevance detection

This commit is contained in:
Zhaofeng Miao 2024-10-08 15:19:03 +08:00
parent af282eec43
commit a44d9a2d2a

View File

@ -362,10 +362,9 @@ export class AdaptiveCrawlerHost extends RPCHost {
const title = json.data.title;
const description = json.data.description;
const rerankQuery = `TITLE: ${title}; DESCRIPTION: ${description}`;
const links = json.data.links as Record<string, string>;
const relevantUrls = await this.getRelevantUrls(token, { query: rerankQuery, links });
const relevantUrls = await this.getRelevantUrls(token, { title, description, links });
this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
for (const url of relevantUrls) {
@ -418,9 +417,10 @@ export class AdaptiveCrawlerHost extends RPCHost {
}
async getRelevantUrls(token: string, {
query, links
title, description, links
}: {
query: string;
title: string;
description: string;
links: Record<string, string>;
}) {
const invalidSuffix = [
@ -434,6 +434,13 @@ export class AdaptiveCrawlerHost extends RPCHost {
.map(([title, link]) => link)
.filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
let query = '';
if (!description) {
query += title;
} else {
query += `TITLE: ${title}; DESCRIPTION: ${description}`;
}
const data = {
model: 'jina-reranker-v2-base-multilingual',
query,
@ -460,7 +467,8 @@ export class AdaptiveCrawlerHost extends RPCHost {
}[];
};
return json.results.filter(r => r.relevance_score > 0.3).map(r => removeURLHash(r.document.text));
const highestRelevanceScore = json.results[0]?.relevance_score ?? 0;
return json.results.filter(r => r.relevance_score > Math.max(highestRelevanceScore * 0.6, 0.1)).map(r => removeURLHash(r.document.text));
}
getIndex(user?: JinaEmbeddingsTokenAccount) {