mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 02:25:53 +08:00
feat: support fetching favicon (#1155)
* feat: support fetching favicon

* Update backend/functions/src/dto/scrapping-options.ts

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

* Update backend/functions/src/dto/scrapping-options.ts

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>

* chore: remove 'withFavicon' from CrawlerOptions

---------

Co-authored-by: Yanlong Wang <yanlong.wang@naiver.org>
This commit is contained in:
parent
cd6a4eedf9
commit
21ae52a55a
@ -92,6 +92,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
// Return content by default
|
// Return content by default
|
||||||
const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
|
const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
|
||||||
const crawlWithoutContent = respondWith.includes('no-content');
|
const crawlWithoutContent = respondWith.includes('no-content');
|
||||||
|
const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
|
||||||
|
|
||||||
let chargeAmount = 0;
|
let chargeAmount = 0;
|
||||||
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
|
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
|
||||||
@ -164,7 +165,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
const targetResultCount = crawlWithoutContent ? count : count + 2;
|
||||||
const organicSearchResults = r.organic.slice(0, targetResultCount);
|
const organicSearchResults = r.organic.slice(0, targetResultCount);
|
||||||
if (crawlWithoutContent || count === 0) {
|
if (crawlWithoutContent || count === 0) {
|
||||||
const fakeResults = await this.fakeResult(crawlerOptions.respondWith, organicSearchResults, !crawlWithoutContent);
|
const fakeResults = await this.fakeResult(crawlerOptions, organicSearchResults, !crawlWithoutContent, withFavicon);
|
||||||
lastScrapped = fakeResults;
|
lastScrapped = fakeResults;
|
||||||
if (!crawlWithoutContent) {
|
if (!crawlWithoutContent) {
|
||||||
chargeAmount = this.assignChargeAmount(lastScrapped);
|
chargeAmount = this.assignChargeAmount(lastScrapped);
|
||||||
@ -181,6 +182,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
|
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.organic.slice(0, targetResultCount), crawlOpts,
|
||||||
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
CrawlerOptions.from({ ...crawlerOptions, cacheTolerance: crawlerOptions.cacheTolerance ?? this.pageCacheToleranceMs }),
|
||||||
count,
|
count,
|
||||||
|
withFavicon
|
||||||
);
|
);
|
||||||
|
|
||||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||||
@ -328,10 +330,13 @@ export class SearcherHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fakeResult(
|
async fakeResult(
|
||||||
mode: string | 'markdown' | 'html' | 'text' | 'screenshot',
|
crawlerOptions: CrawlerOptions,
|
||||||
searchResults?: SerperSearchResponse['organic'],
|
searchResults?: SerperSearchResponse['organic'],
|
||||||
withContent: boolean = false
|
withContent: boolean = false,
|
||||||
|
withFavicon: boolean = false,
|
||||||
) {
|
) {
|
||||||
|
const mode: string | 'markdown' | 'html' | 'text' | 'screenshot' = crawlerOptions.respondWith;
|
||||||
|
|
||||||
if (!searchResults) {
|
if (!searchResults) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
@ -352,7 +357,8 @@ export class SearcherHost extends RPCHost {
|
|||||||
if (withContent) {
|
if (withContent) {
|
||||||
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
|
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
|
||||||
}
|
}
|
||||||
if (mode.includes('no-content')) {
|
|
||||||
|
if (withFavicon) {
|
||||||
const url = new URL(upstreamSearchResult.link);
|
const url = new URL(upstreamSearchResult.link);
|
||||||
result.favicon = await this.getFavicon(url.origin);
|
result.favicon = await this.getFavicon(url.origin);
|
||||||
dataItems.push({
|
dataItems.push({
|
||||||
@ -381,6 +387,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
options?: ExtraScrappingOptions,
|
options?: ExtraScrappingOptions,
|
||||||
crawlerOptions?: CrawlerOptions,
|
crawlerOptions?: CrawlerOptions,
|
||||||
count?: number,
|
count?: number,
|
||||||
|
withFavicon?: boolean,
|
||||||
) {
|
) {
|
||||||
if (!searchResults) {
|
if (!searchResults) {
|
||||||
return;
|
return;
|
||||||
@ -391,9 +398,11 @@ export class SearcherHost extends RPCHost {
|
|||||||
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
||||||
const mapped = scrapped.map((x, i) => {
|
const mapped = scrapped.map((x, i) => {
|
||||||
const upstreamSearchResult = searchResults[i];
|
const upstreamSearchResult = searchResults[i];
|
||||||
|
const url = upstreamSearchResult.link;
|
||||||
|
|
||||||
if (!x) {
|
if (!x) {
|
||||||
return {
|
return {
|
||||||
url: upstreamSearchResult.link,
|
url,
|
||||||
title: upstreamSearchResult.title,
|
title: upstreamSearchResult.title,
|
||||||
description: upstreamSearchResult.snippet,
|
description: upstreamSearchResult.snippet,
|
||||||
content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
|
content: ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''
|
||||||
@ -412,12 +421,19 @@ export class SearcherHost extends RPCHost {
|
|||||||
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
|
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url: upstreamSearchResult.link,
|
url,
|
||||||
title: upstreamSearchResult.title,
|
title: upstreamSearchResult.title,
|
||||||
description: upstreamSearchResult.snippet,
|
description: upstreamSearchResult.snippet,
|
||||||
content: x.text,
|
content: x.text,
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
}).map(async (x) => {
|
||||||
|
const page = await x;
|
||||||
|
if (withFavicon && page.url) {
|
||||||
|
const url = new URL(page.url);
|
||||||
|
page.favicon = await this.getFavicon(url.origin);
|
||||||
|
}
|
||||||
|
return page;
|
||||||
});
|
});
|
||||||
|
|
||||||
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
const resultArray = await Promise.all(mapped) as FormattedPage[];
|
||||||
@ -448,11 +464,11 @@ export class SearcherHost extends RPCHost {
|
|||||||
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
const textRep = x.textRepresentation ? `\n[${i + 1}] Content: \n${x.textRepresentation}` : '';
|
||||||
return `[${i + 1}] Title: ${this.title}
|
return `[${i + 1}] Title: ${this.title}
|
||||||
[${i + 1}] URL Source: ${this.url}
|
[${i + 1}] URL Source: ${this.url}
|
||||||
[${i + 1}] Description: ${this.description}${textRep}
|
[${i + 1}] Description: ${this.description}${textRep}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
|
||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
|
|
||||||
return `[${i + 1}] No content available for ${this.url}`;
|
return `[${i + 1}] No content available for ${this.url}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
const mixins = [];
|
const mixins = [];
|
||||||
@ -486,7 +502,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return `[${i + 1}] Title: ${this.title}
|
return `[${i + 1}] Title: ${this.title}
|
||||||
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
|
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}${this.favicon !== undefined ? `\n[${i + 1}] Favicon: ${this.favicon}` : ''}
|
||||||
[${i + 1}] Markdown Content:
|
[${i + 1}] Markdown Content:
|
||||||
${this.content}
|
${this.content}
|
||||||
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user