feat: support getting update time of websites for s.jina.ai

This commit is contained in:
Aaron Ji 2025-03-10 18:03:06 +08:00
parent 5bbd75a6d6
commit a15681cba5
3 changed files with 46 additions and 9 deletions

View File

@ -21,6 +21,7 @@ import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { InsufficientBalanceError } from '../services/errors'; import { InsufficientBalanceError } from '../services/errors';
import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
import { UpdateTimeService } from '../shared/3rd-party/cloud-flare';
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES); const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
@ -43,6 +44,7 @@ export class SearcherHost extends RPCHost {
protected serperSearchService: SerperSearchService, protected serperSearchService: SerperSearchService,
protected crawler: CrawlerHost, protected crawler: CrawlerHost,
protected snapshotFormatter: SnapshotFormatter, protected snapshotFormatter: SnapshotFormatter,
protected updateTimeService: UpdateTimeService
) { ) {
super(...arguments); super(...arguments);
} }
@ -99,6 +101,7 @@ export class SearcherHost extends RPCHost {
// Return content by default // Return content by default
const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content'); const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
const withFavicon = Boolean(ctx.get('X-With-Favicons')); const withFavicon = Boolean(ctx.get('X-With-Favicons'));
const withUpdatedTime = Boolean(ctx.get('X-With-Updated-Time'));
let chargeAmount = 0; let chargeAmount = 0;
const noSlashPath = decodeURIComponent(ctx.path).slice(1); const noSlashPath = decodeURIComponent(ctx.path).slice(1);
@ -174,7 +177,7 @@ export class SearcherHost extends RPCHost {
const targetResultCount = crawlWithoutContent ? count : count + 2; const targetResultCount = crawlWithoutContent ? count : count + 2;
const organicSearchResults = r.organic.slice(0, targetResultCount); const organicSearchResults = r.organic.slice(0, targetResultCount);
if (crawlWithoutContent || count === 0) { if (crawlWithoutContent || count === 0) {
const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon); const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon, withUpdatedTime);
lastScrapped = fakeResults; lastScrapped = fakeResults;
chargeAmount = this.assignChargeAmount(!crawlWithoutContent ? lastScrapped : [], count); chargeAmount = this.assignChargeAmount(!crawlWithoutContent ? lastScrapped : [], count);
@ -188,7 +191,8 @@ export class SearcherHost extends RPCHost {
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts, const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }), CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
count, count,
withFavicon withFavicon,
withUpdatedTime
); );
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) { if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
@ -347,6 +351,7 @@ export class SearcherHost extends RPCHost {
searchResults?: SerperSearchResponse['organic'], searchResults?: SerperSearchResponse['organic'],
withContent: boolean = false, withContent: boolean = false,
withFavicon: boolean = false, withFavicon: boolean = false,
withUpdatedTime: boolean = false,
) { ) {
const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith; const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
@ -380,6 +385,15 @@ export class SearcherHost extends RPCHost {
}); });
} }
if (withUpdatedTime) {
const updatedTime = await this.getUpdatedTime(upstreamSearchResult.link);
result.lastUpdatedTime = updatedTime;
dataItems.push({
key: 'lastUpdatedTime',
label: 'Last Update Time',
});
}
result.toString = function () { result.toString = function () {
const self = this as any; const self = this as any;
return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n'; return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
@ -401,6 +415,7 @@ export class SearcherHost extends RPCHost {
crawlerOptions?: CrawlerOptions, crawlerOptions?: CrawlerOptions,
count?: number, count?: number,
withFavicon?: boolean, withFavicon?: boolean,
withUpdatedTime?: boolean,
) { ) {
if (!searchResults) { if (!searchResults) {
return; return;
@ -441,10 +456,21 @@ export class SearcherHost extends RPCHost {
}); });
}).map(async (x) => { }).map(async (x) => {
const page = await x; const page = await x;
if (withFavicon && page.url) {
const url = new URL(page.url); await Promise.allSettled([
page.favicon = await this.getFavicon(url.origin); async () => {
} if (withFavicon && page.url) {
const url = new URL(page.url);
page.favicon = await this.getFavicon(url.origin);
}
},
async () => {
if (withUpdatedTime && page.url) {
const updatedTime = await this.getUpdatedTime(page.url);
page.lastUpdatedTime = updatedTime;
}
}
].map(f => f()));
return page; return page;
}); });
@ -476,7 +502,7 @@ export class SearcherHost extends RPCHost {
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : ''; const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
return `[${i + 1}] Title: ${this.title} return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url} [${i + 1}] URL Source: ${this.url}
[${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''} [${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}${this.lastUpdatedTime !== undefined ? `\n[${i + 1}] Last Update Time: ${this.lastUpdatedTime}]` : ''}
`; `;
} }
@ -514,7 +540,7 @@ export class SearcherHost extends RPCHost {
} }
return `[${i + 1}] Title: ${this.title} return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''} [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}${this.lastUpdatedTime !== undefined ? `\n[${i + 1}] Last Update Time: ${this.lastUpdatedTime}]` : ''}
[${i + 1}] Markdown Content: [${i + 1}] Markdown Content:
${this.content} ${this.content}
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`; ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
@ -553,6 +579,16 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount; return _.every(results, (x) => this.pageQualified(x)) && results.length >= targetResultCount;
} }
/**
 * Looks up a best-guess "last updated" time for the given page URL.
 *
 * Delegates to `this.updateTimeService.guessUpdateTime(url)` and returns the
 * `data.bestGuess` field of its response.
 *
 * This is a best-effort lookup: any error thrown while calling the service or
 * reading the response is logged as a warning and an empty string is returned
 * instead of propagating the failure to the caller.
 *
 * @param url - URL of the page whose update time should be guessed.
 * @returns The service's best guess, or `''` when the lookup fails.
 */
async getUpdatedTime(url: string) {
    try {
        const { data } = await this.updateTimeService.guessUpdateTime(url);

        return data.bestGuess;
    } catch (cause: any) {
        // Swallow the failure on purpose; only record it in the log.
        const logPayload = { err: marshalErrorLike(cause) };
        this.logger.warn(`Failed to get updated time`, logPayload);

        return '';
    }
}
async getFavicon(domain: string) { async getFavicon(domain: string) {
const url = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`; const url = `https://www.google.com/s2/favicons?sz=32&domain_url=${domain}`;

View File

@ -35,6 +35,7 @@ export interface FormattedPage {
images?: { [k: string]: string; } | [string, string][]; images?: { [k: string]: string; } | [string, string][];
warning?: string; warning?: string;
favicon?: string; favicon?: string;
lastUpdatedTime?: string;
usage?: { usage?: {
total_tokens?: number; total_tokens?: number;
totalTokens?: number; totalTokens?: number;

@ -1 +1 @@
Subproject commit f7d65a8b12fa32d3d6fa46585d73693cba7b14e3 Subproject commit e9469f86ad90f01af7ceaffe90bd0e91531b331d