mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 05:05:59 +08:00
feat: support fetching favicon (#1155)
* feat: support fetching favicon
* Update backend/functions/src/dto/scrapping-options.ts
  Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
* Update backend/functions/src/dto/scrapping-options.ts
  Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
* chore: remove 'withFavicon' from CrawlerOptions

---------

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
This commit is contained in:
parent
cd6a4eedf9
commit
21ae52a55a
@ -92,6 +92,7 @@ export class SearcherHost extends RPCHost {
|
||||
// Return content by default
|
||||
const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
|
||||
const crawlWithoutContent = respondWith.includes('no-content');
|
||||
const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
|
||||
|
||||
let chargeAmount = 0;
|
||||
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
|
||||
@ -164,7 +165,7 @@ export class SearcherHost extends RPCHost {
|
||||
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
||||
const organicSearchResults = r.organic.slice(0, targetResultCount);
|
||||
if (crawlWithoutContent || count === 0) {
|
||||
const fakeResults = await this.fakeResult(crawlerOptions.respondWith, organicSearchResults, !crawlWithoutContent);
|
||||
const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon);
|
||||
lastScrapped = fakeResults;
|
||||
if (!crawlWithoutContent) {
|
||||
chargeAmount = this.assignChargeAmount(lastScrapped);
|
||||
@ -181,6 +182,7 @@ export class SearcherHost extends RPCHost {
|
||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
|
||||
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
||||
count,
|
||||
withFavicon
|
||||
);
|
||||
|
||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||
@ -328,10 +330,13 @@ export class SearcherHost extends RPCHost {
|
||||
}
|
||||
|
||||
async fakeResult(
|
||||
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
||||
crawlerOptions: CrawlerOptions,
|
||||
searchResults?: SerperSearchResponse['organic'],
|
||||
withContent: boolean = false
|
||||
withContent: boolean = false,
|
||||
withFavicon: boolean = false,
|
||||
) {
|
||||
const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
|
||||
|
||||
if (!searchResults) {
|
||||
return [];
|
||||
}
|
||||
@ -352,7 +357,8 @@ export class SearcherHost extends RPCHost {
|
||||
if (withContent) {
|
||||
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
|
||||
}
|
||||
if (mode.includes('no-content')) {
|
||||
|
||||
if (withFavicon) {
|
||||
const url = new URL(upstreamSearchResult.link);
|
||||
result.favicon = await this.getFavicon(url.origin);
|
||||
dataItems.push({
|
||||
@ -381,6 +387,7 @@ export class SearcherHost extends RPCHost {
|
||||
options?: ExtraScrappingOptions,
|
||||
crawlerOptions?: CrawlerOptions,
|
||||
count?: number,
|
||||
withFavicon?: boolean,
|
||||
) {
|
||||
if (!searchResults) {
|
||||
return;
|
||||
@ -391,9 +398,11 @@ export class SearcherHost extends RPCHost {
|
||||
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
||||
const mapped = scrapped.map((x, i) => {
|
||||
const upstreamSearchResult = searchResults[i];
|
||||
const url = upstreamSearchResult.link;
|
||||
|
||||
if (!x) {
|
||||
return {
|
||||
url: upstreamSearchResult.link,
|
||||
url,
|
||||
title: upstreamSearchResult.title,
|
||||
description: upstreamSearchResult.snippet,
|
||||
content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
|
||||
@ -412,12 +421,19 @@ export class SearcherHost extends RPCHost {
|
||||
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
|
||||
|
||||
return {
|
||||
url: upstreamSearchResult.link,
|
||||
url,
|
||||
title: upstreamSearchResult.title,
|
||||
description: upstreamSearchResult.snippet,
|
||||
content: x.text,
|
||||
};
|
||||
});
|
||||
}).map(async (x) => {
|
||||
const page = await x;
|
||||
if (withFavicon && page.url) {
|
||||
const url = new URL(page.url);
|
||||
page.favicon = await this.getFavicon(url.origin);
|
||||
}
|
||||
return page;
|
||||
});
|
||||
|
||||
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
||||
@ -448,11 +464,11 @@ export class SearcherHost extends RPCHost {
|
||||
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
||||
return `[${i + 1}] Title: ${this.title}
|
||||
[${i + 1}] URL Source: ${this.url}
|
||||
[${i + 1}] Description: ${this.description}${textRep}
|
||||
[${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
|
||||
`;
|
||||
}
|
||||
|
||||
return `[${i + 1}] No content available for ${this.url}`;
|
||||
return `[${i + 1}] No content available for ${this.url}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}`;
|
||||
}
|
||||
|
||||
const mixins = [];
|
||||
@ -486,7 +502,7 @@ export class SearcherHost extends RPCHost {
|
||||
}
|
||||
|
||||
return `[${i + 1}] Title: ${this.title}
|
||||
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
|
||||
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
|
||||
[${i + 1}] Markdown Content:
|
||||
${this.content}
|
||||
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
||||
|
Loading…
x
Reference in New Issue
Block a user