feat: support fetching favicon (#1155)

* feat: support fetching favicon

* Update backend/functions/src/dto/scrapping-options.ts

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

* Update backend/functions/src/dto/scrapping-options.ts

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

* chore: remove 'withFavicon' from CrawlerOptions

---------

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
This commit is contained in:
Aaron Ji 2025-03-04 11:32:42 +08:00 committed by GitHub
parent cd6a4eedf9
commit 21ae52a55a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -92,6 +92,7 @@ export class SearcherHost extends RPCHost {
// Return content by default
const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
const crawlWithoutContent = respondWith.includes('no-content');
const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
let chargeAmount = 0;
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
@@ -164,7 +165,7 @@ export class SearcherHost extends RPCHost {
const targetResultCount = crawlWithoutContent ? count : count + 2;
const organicSearchResults = r.organic.slice(0, targetResultCount);
if (crawlWithoutContent || count === 0) {
const fakeResults = await this.fakeResult(crawlerOptions.respondWith, organicSearchResults, !crawlWithoutContent);
const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon);
lastScrapped = fakeResults;
if (!crawlWithoutContent) {
chargeAmount = this.assignChargeAmount(lastScrapped);
@@ -181,6 +182,7 @@ export class SearcherHost extends RPCHost {
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
count,
withFavicon
);
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
@@ -328,10 +330,13 @@ export class SearcherHost extends RPCHost {
}
async fakeResult(
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
crawlerOptions: CrawlerOptions,
searchResults?: SerperSearchResponse['organic'],
withContent: boolean = false
withContent: boolean = false,
withFavicon: boolean = false,
) {
const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
if (!searchResults) {
return [];
}
@@ -352,7 +357,8 @@ export class SearcherHost extends RPCHost {
if (withContent) {
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
}
if (mode.includes('no-content')) {
if (withFavicon) {
const url = new URL(upstreamSearchResult.link);
result.favicon = await this.getFavicon(url.origin);
dataItems.push({
@@ -381,6 +387,7 @@ export class SearcherHost extends RPCHost {
options?: ExtraScrappingOptions,
crawlerOptions?: CrawlerOptions,
count?: number,
withFavicon?: boolean,
) {
if (!searchResults) {
return;
@@ -391,9 +398,11 @@ export class SearcherHost extends RPCHost {
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
const mapped = scrapped.map((x, i) => {
const upstreamSearchResult = searchResults[i];
const url = upstreamSearchResult.link;
if (!x) {
return {
url: upstreamSearchResult.link,
url,
title: upstreamSearchResult.title,
description: upstreamSearchResult.snippet,
content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
@@ -412,12 +421,19 @@ export class SearcherHost extends RPCHost {
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
return {
url: upstreamSearchResult.link,
url,
title: upstreamSearchResult.title,
description: upstreamSearchResult.snippet,
content: x.text,
};
});
}).map(async (x) => {
const page = await x;
if (withFavicon && page.url) {
const url = new URL(page.url);
page.favicon = await this.getFavicon(url.origin);
}
return page;
});
const resultArray = await Promise.all(mapped) as FormattedPage[];
@@ -448,11 +464,11 @@ export class SearcherHost extends RPCHost {
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}
[${i + 1}] Description: ${this.description}${textRep}
[${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
`;
}
return `[${i + 1}] No content available for ${this.url}`;
return `[${i + 1}] No content available for ${this.url}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}`;
}
const mixins = [];
@@ -486,7 +502,7 @@ export class SearcherHost extends RPCHost {
}
return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
[${i + 1}] Markdown Content:
${this.content}
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;