From 21ae52a55a9c6849965d473ca912d01439045e67 Mon Sep 17 00:00:00 2001 From: Aaron Ji <127167174+DresAaron@users.noreply.github.com> Date: Tue, 4 Mar 2025 11:32:42 +0800 Subject: [PATCH] feat: support fetching favicon (#1155) * feat: support fetching favicon * Update backend/functions/src/dto/scrapping-options.ts Co-authored-by: Yanlong Wang * Update backend/functions/src/dto/scrapping-options.ts Co-authored-by: Yanlong Wang * chore: remove 'withFavicon' from CrawlerOptions --------- Co-authored-by: Yanlong Wang --- .../src/cloud-functions/searcher-serper.ts | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/backend/functions/src/cloud-functions/searcher-serper.ts b/backend/functions/src/cloud-functions/searcher-serper.ts index dd35b3a..e6df24c 100644 --- a/backend/functions/src/cloud-functions/searcher-serper.ts +++ b/backend/functions/src/cloud-functions/searcher-serper.ts @@ -92,6 +92,7 @@ export class SearcherHost extends RPCHost { // Return content by default const respondWith = ctx.req.get('X-Respond-With') ?? 'content'; const crawlWithoutContent = respondWith.includes('no-content'); + const withFavicon = ctx.req.get('X-With-Favicons') === 'true'; let chargeAmount = 0; const noSlashPath = decodeURIComponent(ctx.req.path).slice(1); @@ -164,7 +165,7 @@ export class SearcherHost extends RPCHost { const targetResultCount = crawlWithoutContent ? count : count + 2; const organicSearchResults = r.organic.slice(0, targetResultCount); if (crawlWithoutContent || count === 0) { - const fakeResults = await this.fakeResult(crawlerOptions.respondWith, organicSearchResults, !crawlWithoutContent); + const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon); lastScrapped = fakeResults; if (!crawlWithoutContent) { chargeAmount = this.assignChargeAmount(lastScrapped); @@ -181,6 +182,7 @@ export class SearcherHost extends RPCHost { const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts, CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }), count, + withFavicon ); if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { @@ -328,10 +330,13 @@ export class SearcherHost extends RPCHost { } async fakeResult( - mode: string | 'markdown' | 'html' | 'text' | 'screenshot', + crawlerOptions: CrawlerOptions, searchResults?: SerperSearchResponse['organic'], - withContent: boolean = false + withContent: boolean = false, + withFavicon: boolean = false, ) { + const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith; + if (!searchResults) { return []; } @@ -352,7 +357,8 @@ export class SearcherHost extends RPCHost { if (withContent) { result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''; } - if (mode.includes('no-content')) { + + if (withFavicon) { const url = new URL(upstreamSearchResult.link); result.favicon = await this.getFavicon(url.origin); dataItems.push({ @@ -381,6 +387,7 @@ export class SearcherHost extends RPCHost { options?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions, count?: number, + withFavicon?: boolean, ) { if (!searchResults) { return; @@ -391,9 +398,11 @@ export class SearcherHost extends RPCHost { for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) { const mapped = scrapped.map((x, i) => { const upstreamSearchResult = searchResults[i]; + const url = upstreamSearchResult.link; + if (!x) { return { - url: upstreamSearchResult.link, + url, title: upstreamSearchResult.title, description: upstreamSearchResult.snippet, content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : '' @@ -412,12 +421,19 @@ export class SearcherHost extends RPCHost { this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) }); return { - url: upstreamSearchResult.link, + url, title: upstreamSearchResult.title, description: upstreamSearchResult.snippet, content: x.text, }; }); + }).map(async (x) => { + const page = await x; + if (withFavicon && page.url) { + const url = new URL(page.url); + page.favicon = await this.getFavicon(url.origin); + } + return page; }); const resultArray = await Promise.all(mapped) as FormattedPage[]; @@ -448,11 +464,11 @@ export class SearcherHost extends RPCHost { const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : ''; return `[${i + 1}] Title: ${this.title} [${i + 1}] URL Source: ${this.url} -[${i + 1}] Description: ${this.description}${textRep} +[${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''} `; } - return `[${i + 1}] No content available for ${this.url}`; + return `[${i + 1}] No content available for ${this.url}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}`; } const mixins = []; @@ -486,7 +502,7 @@ export class SearcherHost extends RPCHost { } return `[${i + 1}] Title: ${this.title} -[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''} +[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''} [${i + 1}] Markdown Content: ${this.content} ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;