mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 18:09:06 +08:00
feat(adaptive-crawler): optimize relevance detection
This commit is contained in:
parent
af282eec43
commit
a44d9a2d2a
@ -362,10 +362,9 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
const title = json.data.title;
|
const title = json.data.title;
|
||||||
const description = json.data.description;
|
const description = json.data.description;
|
||||||
const rerankQuery = `TITLE: ${title}; DESCRIPTION: ${description}`;
|
|
||||||
const links = json.data.links as Record<string, string>;
|
const links = json.data.links as Record<string, string>;
|
||||||
|
|
||||||
const relevantUrls = await this.getRelevantUrls(token, { query: rerankQuery, links });
|
const relevantUrls = await this.getRelevantUrls(token, { title, description, links });
|
||||||
this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
|
this.logger.debug(`Total urls: ${Object.keys(links).length}, relevant urls: ${relevantUrls.length}`);
|
||||||
|
|
||||||
for (const url of relevantUrls) {
|
for (const url of relevantUrls) {
|
||||||
@ -418,9 +417,10 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async getRelevantUrls(token: string, {
|
async getRelevantUrls(token: string, {
|
||||||
query, links
|
title, description, links
|
||||||
}: {
|
}: {
|
||||||
query: string;
|
title: string;
|
||||||
|
description: string;
|
||||||
links: Record<string, string>;
|
links: Record<string, string>;
|
||||||
}) {
|
}) {
|
||||||
const invalidSuffix = [
|
const invalidSuffix = [
|
||||||
@ -434,6 +434,13 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|||||||
.map(([title, link]) => link)
|
.map(([title, link]) => link)
|
||||||
.filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
|
.filter(link => link.startsWith('http') && !invalidSuffix.some(suffix => link.endsWith(suffix)));
|
||||||
|
|
||||||
|
let query = '';
|
||||||
|
if (!description) {
|
||||||
|
query += title;
|
||||||
|
} else {
|
||||||
|
query += `TITLE: ${title}; DESCRIPTION: ${description}`;
|
||||||
|
}
|
||||||
|
|
||||||
const data = {
|
const data = {
|
||||||
model: 'jina-reranker-v2-base-multilingual',
|
model: 'jina-reranker-v2-base-multilingual',
|
||||||
query,
|
query,
|
||||||
@ -460,7 +467,8 @@ export class AdaptiveCrawlerHost extends RPCHost {
|
|||||||
}[];
|
}[];
|
||||||
};
|
};
|
||||||
|
|
||||||
return json.results.filter(r => r.relevance_score > 0.3).map(r => removeURLHash(r.document.text));
|
const highestRelevanceScore = json.results[0]?.relevance_score ?? 0;
|
||||||
|
return json.results.filter(r => r.relevance_score > Math.max(highestRelevanceScore * 0.6, 0.1)).map(r => removeURLHash(r.document.text));
|
||||||
}
|
}
|
||||||
|
|
||||||
getIndex(user?: JinaEmbeddingsTokenAccount) {
|
getIndex(user?: JinaEmbeddingsTokenAccount) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user