mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 06:45:53 +08:00
feat: support getting update time of websites for s.jina.ai
This commit is contained in:
parent
5bbd75a6d6
commit
a15681cba5
@ -21,6 +21,7 @@ import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
|||||||
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
||||||
import { InsufficientBalanceError } from '../services/errors';
|
import { InsufficientBalanceError } from '../services/errors';
|
||||||
import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
||||||
|
import { UpdateTimeService } from '../shared/3rd-party/cloud-flare';
|
||||||
|
|
||||||
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
|
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
|
||||||
|
|
||||||
@ -43,6 +44,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
protected serperSearchService: SerperSearchService,
|
protected serperSearchService: SerperSearchService,
|
||||||
protected crawler: CrawlerHost,
|
protected crawler: CrawlerHost,
|
||||||
protected snapshotFormatter: SnapshotFormatter,
|
protected snapshotFormatter: SnapshotFormatter,
|
||||||
|
protected updateTimeService: UpdateTimeService
|
||||||
) {
|
) {
|
||||||
super(...arguments);
|
super(...arguments);
|
||||||
}
|
}
|
||||||
@ -99,6 +101,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
// Return content by default
|
// Return content by default
|
||||||
const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
|
const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
|
||||||
const withFavicon = Boolean(ctx.get('X-With-Favicons'));
|
const withFavicon = Boolean(ctx.get('X-With-Favicons'));
|
||||||
|
const withUpdatedTime = Boolean(ctx.get('X-With-Updated-Time'));
|
||||||
|
|
||||||
let chargeAmount = 0;
|
let chargeAmount = 0;
|
||||||
const noSlashPath = decodeURIComponent(ctx.path).slice(1);
|
const noSlashPath = decodeURIComponent(ctx.path).slice(1);
|
||||||
@ -174,7 +177,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
||||||
const organicSearchResults = r.organic.slice(0, targetResultCount);
|
const organicSearchResults = r.organic.slice(0, targetResultCount);
|
||||||
if (crawlWithoutContent || count === 0) {
|
if (crawlWithoutContent || count === 0) {
|
||||||
const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon);
|
const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon, withUpdatedTime);
|
||||||
lastScrapped = fakeResults;
|
lastScrapped = fakeResults;
|
||||||
chargeAmount = this.assignChargeAmount(!crawlWithoutContent ? lastScrapped : [], count);
|
chargeAmount = this.assignChargeAmount(!crawlWithoutContent ? lastScrapped : [], count);
|
||||||
|
|
||||||
@ -188,7 +191,8 @@ export class SearcherHost extends RPCHost {
|
|||||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
|
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
|
||||||
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
||||||
count,
|
count,
|
||||||
withFavicon
|
withFavicon,
|
||||||
|
withUpdatedTime
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
||||||
@ -347,6 +351,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
searchResults?: SerperSearchResponse['organic'],
|
searchResults?: SerperSearchResponse['organic'],
|
||||||
withContent: boolean = false,
|
withContent: boolean = false,
|
||||||
withFavicon: boolean = false,
|
withFavicon: boolean = false,
|
||||||
|
withUpdatedTime: boolean = false,
|
||||||
) {
|
) {
|
||||||
const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
|
const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
|
||||||
|
|
||||||
@ -380,6 +385,15 @@ export class SearcherHost extends RPCHost {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (withUpdatedTime) {
|
||||||
|
const updatedTime = await this.getUpdatedTime(upstreamSearchResult.link);
|
||||||
|
result.lastUpdatedTime = updatedTime;
|
||||||
|
dataItems.push({
|
||||||
|
key: 'lastUpdatedTime',
|
||||||
|
label: 'Last Update Time',
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
result.toString = function () {
|
result.toString = function () {
|
||||||
const self = this as any;
|
const self = this as any;
|
||||||
return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
|
return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
|
||||||
@ -401,6 +415,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
crawlerOptions?: CrawlerOptions,
|
crawlerOptions?: CrawlerOptions,
|
||||||
count?: number,
|
count?: number,
|
||||||
withFavicon?: boolean,
|
withFavicon?: boolean,
|
||||||
|
withUpdatedTime?: boolean,
|
||||||
) {
|
) {
|
||||||
if (!searchResults) {
|
if (!searchResults) {
|
||||||
return;
|
return;
|
||||||
@ -441,10 +456,21 @@ export class SearcherHost extends RPCHost {
|
|||||||
});
|
});
|
||||||
}).map(async (x) => {
|
}).map(async (x) => {
|
||||||
const page = await x;
|
const page = await x;
|
||||||
if (withFavicon && page.url) {
|
|
||||||
const url = new URL(page.url);
|
await Promise.allSettled([
|
||||||
page.favicon = await this.getFavicon(url.origin);
|
async () => {
|
||||||
}
|
if (withFavicon && page.url) {
|
||||||
|
const url = new URL(page.url);
|
||||||
|
page.favicon = await this.getFavicon(url.origin);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
async () => {
|
||||||
|
if (withUpdatedTime && page.url) {
|
||||||
|
const updatedTime = await this.getUpdatedTime(page.url);
|
||||||
|
page.lastUpdatedTime = updatedTime;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
].map(f => f()));
|
||||||
return page;
|
return page;
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -476,7 +502,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
||||||
return `[${i + 1}] Title: ${this.title}
|
return `[${i + 1}] Title: ${this.title}
|
||||||
[${i + 1}] URL Source: ${this.url}
|
[${i + 1}] URL Source: ${this.url}
|
||||||
[${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
|
[${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}${this.lastUpdatedTime !== undefined ? `\n[${i + 1}] Last Update Time: ${this.lastUpdatedTime}]` : ''}
|
||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -514,7 +540,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return `[${i + 1}] Title: ${this.title}
|
return `[${i + 1}] Title: ${this.title}
|
||||||
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
|
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}${this.lastUpdatedTime !== undefined ? `\n[${i + 1}] Last Update Time: ${this.lastUpdatedTime}]` : ''}
|
||||||
[${i + 1}] Markdown Content:
|
[${i + 1}] Markdown Content:
|
||||||
${this.content}
|
${this.content}
|
||||||
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
||||||
@ -553,6 +579,16 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
|||||||
return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount;
|
return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Looks up a best-guess "last updated" value for a page.
 *
 * Delegates to `this.updateTimeService.guessUpdateTime(url)` and returns
 * `response.data.bestGuess` from the result.
 *
 * @param url - URL passed through unchanged to the update-time service.
 * @returns The service's `bestGuess`, or an empty string if the lookup throws.
 *
 * NOTE(review): the failure value is `''`, not `undefined`. Formatters in this
 * file that test `lastUpdatedTime !== undefined` will therefore still emit a
 * "Last Update Time" line with an empty value — confirm this is intended.
 */
async getUpdatedTime(url: string) {
|
||||||
|
try {
|
||||||
|
const response = await this.updateTimeService.guessUpdateTime(url);
|
||||||
|
return response.data.bestGuess;
|
||||||
|
} catch (error: any) {
|
||||||
|
// Any thrown error is caught here: log it as a warning and fall through to the fallback value.
this.logger.warn(`Failed to get updated time`, { err: marshalErrorLike(error) });
|
||||||
|
// Fallback value returned instead of rethrowing.
return '';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async getFavicon(domain: string) {
|
async getFavicon(domain: string) {
|
||||||
const url = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`;
|
const url = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`;
|
||||||
|
|
||||||
|
@ -35,6 +35,7 @@ export interface FormattedPage {
|
|||||||
images?: { [k: string]: string; } | [string, string][];
|
images?: { [k: string]: string; } | [string, string][];
|
||||||
warning?: string;
|
warning?: string;
|
||||||
favicon?: string;
|
favicon?: string;
|
||||||
|
lastUpdatedTime?: string;
|
||||||
usage?: {
|
usage?: {
|
||||||
total_tokens?: number;
|
total_tokens?: number;
|
||||||
totalTokens?: number;
|
totalTokens?: number;
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit f7d65a8b12fa32d3d6fa46585d73693cba7b14e3
|
Subproject commit e9469f86ad90f01af7ceaffe90bd0e91531b331d
|
Loading…
x
Reference in New Issue
Block a user