feat: support fetching favicon (#1155)

* feat: support fetching favicon

* Update backend/functions/src/dto/scrapping-options.ts

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

* Update backend/functions/src/dto/scrapping-options.ts

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

* chore: remove 'withFavicon' from CrawlerOptions

---------

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
This commit is contained in:
Aaron Ji 2025-03-04 11:32:42 +08:00 committed by GitHub
parent cd6a4eedf9
commit 21ae52a55a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -92,6 +92,7 @@ export class SearcherHost extends RPCHost {
// Return content by default // Return content by default
const respondWith = ctx.req.get('X-Respond-With') ?? 'content'; const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
const crawlWithoutContent = respondWith.includes('no-content'); const crawlWithoutContent = respondWith.includes('no-content');
const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
let chargeAmount = 0; let chargeAmount = 0;
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1); const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
@ -164,7 +165,7 @@ export class SearcherHost extends RPCHost {
const targetResultCount = crawlWithoutContent ? count : count + 2; const targetResultCount = crawlWithoutContent ? count : count + 2;
const organicSearchResults = r.organic.slice(0, targetResultCount); const organicSearchResults = r.organic.slice(0, targetResultCount);
if (crawlWithoutContent || count === 0) { if (crawlWithoutContent || count === 0) {
const fakeResults = await this.fakeResult(crawlerOptions.respondWith, organicSearchResults, !crawlWithoutContent); const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon);
lastScrapped = fakeResults; lastScrapped = fakeResults;
if (!crawlWithoutContent) { if (!crawlWithoutContent) {
chargeAmount = this.assignChargeAmount(lastScrapped); chargeAmount = this.assignChargeAmount(lastScrapped);
@ -181,6 +182,7 @@ export class SearcherHost extends RPCHost {
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts, const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }), CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
count, count,
withFavicon
); );
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
@ -328,10 +330,13 @@ export class SearcherHost extends RPCHost {
} }
async fakeResult( async fakeResult(
mode: string | 'markdown' | 'html' | 'text' | 'screenshot', crawlerOptions: CrawlerOptions,
searchResults?: SerperSearchResponse['organic'], searchResults?: SerperSearchResponse['organic'],
withContent: boolean = false withContent: boolean = false,
withFavicon: boolean = false,
) { ) {
const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
if (!searchResults) { if (!searchResults) {
return []; return [];
} }
@ -352,7 +357,8 @@ export class SearcherHost extends RPCHost {
if (withContent) { if (withContent) {
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''; result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
} }
if (mode.includes('no-content')) {
if (withFavicon) {
const url = new URL(upstreamSearchResult.link); const url = new URL(upstreamSearchResult.link);
result.favicon = await this.getFavicon(url.origin); result.favicon = await this.getFavicon(url.origin);
dataItems.push({ dataItems.push({
@ -381,6 +387,7 @@ export class SearcherHost extends RPCHost {
options?: ExtraScrappingOptions, options?: ExtraScrappingOptions,
crawlerOptions?: CrawlerOptions, crawlerOptions?: CrawlerOptions,
count?: number, count?: number,
withFavicon?: boolean,
) { ) {
if (!searchResults) { if (!searchResults) {
return; return;
@ -391,9 +398,11 @@ export class SearcherHost extends RPCHost {
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) { for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
const mapped = scrapped.map((x, i) => { const mapped = scrapped.map((x, i) => {
const upstreamSearchResult = searchResults[i]; const upstreamSearchResult = searchResults[i];
const url = upstreamSearchResult.link;
if (!x) { if (!x) {
return { return {
url: upstreamSearchResult.link, url,
title: upstreamSearchResult.title, title: upstreamSearchResult.title,
description: upstreamSearchResult.snippet, description: upstreamSearchResult.snippet,
content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : '' content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
@ -412,12 +421,19 @@ export class SearcherHost extends RPCHost {
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) }); this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
return { return {
url: upstreamSearchResult.link, url,
title: upstreamSearchResult.title, title: upstreamSearchResult.title,
description: upstreamSearchResult.snippet, description: upstreamSearchResult.snippet,
content: x.text, content: x.text,
}; };
}); });
}).map(async (x) => {
const page = await x;
if (withFavicon && page.url) {
const url = new URL(page.url);
page.favicon = await this.getFavicon(url.origin);
}
return page;
}); });
const resultArray = await Promise.all(mapped) as FormattedPage[]; const resultArray = await Promise.all(mapped) as FormattedPage[];
@ -448,11 +464,11 @@ export class SearcherHost extends RPCHost {
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : ''; const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
return `[${i + 1}] Title: ${this.title} return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url} [${i + 1}] URL Source: ${this.url}
[${i + 1}] Description: ${this.description}${textRep} [${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
`; `;
} }
return `[${i + 1}] No content available for ${this.url}`; return `[${i + 1}] No content available for ${this.url}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}`;
} }
const mixins = []; const mixins = [];
@ -486,7 +502,7 @@ export class SearcherHost extends RPCHost {
} }
return `[${i + 1}] Title: ${this.title} return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''} [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
[${i + 1}] Markdown Content: [${i + 1}] Markdown Content:
${this.content} ${this.content}
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`; ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;