Mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git (synced 2025-08-19 06:55:56 +08:00)
feat(adaptive-crawler): optimize relevance detection
This commit is contained in:
parent af282eec43
commit a44d9a2d2a
@@ -362,10 +362,9 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
||||
|
||||
const title = json.data.title;
|
||||
const description = json.data.description;
|
||||
const rerankQuery = `TITLE: ${title}; DESCRIPTION: ${description}`;
|
||||
const links = json.data.links as Record<string, string>;
|
||||
|
||||
const relevantUrls = await this.getRelevantUrls(token, { query: rerankQuery, links });
|
||||
const relevantUrls = await this.getRelevantUrls(token, { title, description, links });
|
||||
this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
|
||||
|
||||
for (const url of relevantUrls) {
|
||||
@@ -418,9 +417,10 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
async getRelevantUrls(token: string, {
|
||||
query, links
|
||||
title, description, links
|
||||
}: {
|
||||
query: string;
|
||||
title: string;
|
||||
description: string;
|
||||
links: Record<string, string>;
|
||||
}) {
|
||||
const invalidSuffix = [
|
||||
@@ -434,6 +434,13 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
||||
.map(([title, link]) => link)
|
||||
.filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
|
||||
|
||||
let query = '';
|
||||
if (!description) {
|
||||
query += title;
|
||||
} else {
|
||||
query += `TITLE: ${title}; DESCRIPTION: ${description}`;
|
||||
}
|
||||
|
||||
const data = {
|
||||
model: 'jina-reranker-v2-base-multilingual',
|
||||
query,
|
||||
@@ -460,7 +467,8 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
||||
}[];
|
||||
};
|
||||
|
||||
return json.results.filter(r => r.relevance_score > 0.3).map(r => removeURLHash(r.document.text));
|
||||
const highestRelevanceScore = json.results[0]?.relevance_score ?? 0;
|
||||
return json.results.filter(r => r.relevance_score > Math.max(highestRelevanceScore * 0.6, 0.1)).map(r => removeURLHash(r.document.text));
|
||||
}
|
||||
|
||||
getIndex(user?: JinaEmbeddingsTokenAccount) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user