fix: search descriptions

This commit is contained in:
yanlong.wang 2024-06-05 19:47:04 +08:00
parent 165cce6c91
commit a9936d322e
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 41 additions and 6 deletions

View File

@@ -870,6 +870,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
this.threadLocal.set('withImagesSummary', opts.withImagesSummary); this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
this.threadLocal.set('cacheTolerance', opts.cacheTolerance); this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
this.threadLocal.set('userAgent', opts.userAgent); this.threadLocal.set('userAgent', opts.userAgent);
if (opts.timeout) {
this.threadLocal.set('timeout', opts.timeout * 1000);
}
const crawlOpts: ExtraScrappingOptions = { const crawlOpts: ExtraScrappingOptions = {
proxyUrl: opts.proxyUrl, proxyUrl: opts.proxyUrl,
@@ -878,6 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
waitForSelector: opts.waitForSelector, waitForSelector: opts.waitForSelector,
targetSelector: opts.targetSelector, targetSelector: opts.targetSelector,
overrideUserAgent: opts.userAgent, overrideUserAgent: opts.userAgent,
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
}; };
return crawlOpts; return crawlOpts;

View File

@@ -30,7 +30,7 @@ export class SearcherHost extends RPCHost {
cacheValidMs = 1000 * 3600; cacheValidMs = 1000 * 3600;
pageCacheToleranceMs = 1000 * 3600 * 24; pageCacheToleranceMs = 1000 * 3600 * 24;
reasonableDelayMs = 10_000; reasonableDelayMs = 15_000;
targetResultCount = 5; targetResultCount = 5;
@@ -163,6 +163,10 @@ export class SearcherHost extends RPCHost {
throw new AssertionFailureError(`No search results available for query ${searchQuery}`); throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
} }
if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
delete crawlOpts.timeoutMs;
}
const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts, const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
crawlerOptions.cacheTolerance || this.pageCacheToleranceMs crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
); );
@@ -213,7 +217,7 @@ export class SearcherHost extends RPCHost {
chargeAmount = this.getChargeAmount(lastScrapped); chargeAmount = this.getChargeAmount(lastScrapped);
rpcReflect.return(lastScrapped); rpcReflect.return(lastScrapped);
earlyReturn = true; earlyReturn = true;
}, this.reasonableDelayMs); }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
}; };
for await (const scrapped of it) { for await (const scrapped of it) {
@@ -259,7 +263,7 @@ export class SearcherHost extends RPCHost {
chargeAmount = this.getChargeAmount(lastScrapped); chargeAmount = this.getChargeAmount(lastScrapped);
rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null })); rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
earlyReturn = true; earlyReturn = true;
}, this.reasonableDelayMs); }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
}; };
for await (const scrapped of it) { for await (const scrapped of it) {
@@ -317,7 +321,12 @@ export class SearcherHost extends RPCHost {
description: upstreamSearchResult.description, description: upstreamSearchResult.description,
}; };
} }
return this.crawler.formatSnapshot(mode, x, urls[i]); return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
r.title ??= upstreamSearchResult.title;
r.description = upstreamSearchResult.description;
return r;
});
}); });
const resultArray = await Promise.all(mapped) as FormattedPage[]; const resultArray = await Promise.all(mapped) as FormattedPage[];
@@ -343,7 +352,7 @@ export class SearcherHost extends RPCHost {
return { return {
...x, ...x,
toString(this: any) { toString(this: any) {
if (this.description) { if (!this.content && this.description) {
if (this.title) { if (this.title) {
return `[${i + 1}] Title: ${this.title} return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url} [${i + 1}] URL Source: ${this.url}
@@ -355,6 +364,9 @@ export class SearcherHost extends RPCHost {
} }
const mixins = []; const mixins = [];
if (this.description) {
mixins.push(`[${i + 1}] Description: ${this.description}`);
}
if (this.publishedTime) { if (this.publishedTime) {
mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`); mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
} }

View File

@@ -91,6 +91,11 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
in: 'header', in: 'header',
schema: { type: 'string' } schema: { type: 'string' }
}, },
'X-Timeout': {
description: `Specify timeout in seconds. Max 180.`,
in: 'header',
schema: { type: 'string' }
},
} }
} }
} }
@@ -142,6 +147,11 @@ export class CrawlerOptions extends AutoCastable {
@Prop() @Prop()
userAgent?: string; userAgent?: string;
/**
 * Optional per-request timeout, expressed in seconds (callers multiply it by
 * 1000 to obtain milliseconds).
 *
 * The `validate` hook accepts only values in the range (0, 180].
 *
 * NOTE(review): `from()` also assigns this field directly from the parsed
 * `x-timeout` request header; whether that assignment is re-checked against
 * this validator is not visible here — confirm.
 */
@Prop({
validate: (v: number) => v > 0 && v <= 180,
})
timeout?: number;
static override from(input: any) { static override from(input: any) {
const instance = super.from(input) as CrawlerOptions; const instance = super.from(input) as CrawlerOptions;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@@ -178,6 +188,11 @@ export class CrawlerOptions extends AutoCastable {
instance.cacheTolerance = cacheTolerance; instance.cacheTolerance = cacheTolerance;
} }
let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
if (!isNaN(timeoutSeconds)) {
instance.timeout = timeoutSeconds;
}
const targetSelector = ctx?.req.get('x-target-selector'); const targetSelector = ctx?.req.get('x-target-selector');
instance.targetSelector ??= targetSelector; instance.targetSelector ??= targetSelector;
const waitForSelector = ctx?.req.get('x-wait-for-selector'); const waitForSelector = ctx?.req.get('x-wait-for-selector');

View File

@@ -66,6 +66,7 @@ export interface ScrappingOptions {
waitForSelector?: string; waitForSelector?: string;
minIntervalMs?: number; minIntervalMs?: number;
overrideUserAgent?: string; overrideUserAgent?: string;
timeoutMs?: number;
} }
@@ -449,7 +450,10 @@ document.addEventListener('load', handlePageLoad);
); );
}); });
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 }) const gotoPromise = page.goto(url, {
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
timeout: options?.timeoutMs || 30_000
})
.catch((err) => { .catch((err) => {
this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) }); this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
return Promise.reject(new AssertionFailureError({ return Promise.reject(new AssertionFailureError({