From a9936d322e83b563bcd0ae1b124ed228999dcaa3 Mon Sep 17 00:00:00 2001
From: "yanlong.wang"
Date: Wed, 5 Jun 2024 19:47:04 +0800
Subject: [PATCH] fix: search descriptions

---
 .../functions/src/cloud-functions/crawler.ts  |  4 ++++
 .../functions/src/cloud-functions/searcher.ts | 22 +++++++++++++++++-----
 .../functions/src/dto/scrapping-options.ts    | 15 +++++++++++++++
 backend/functions/src/services/puppeteer.ts   |  6 +++++-
 4 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts
index a611ed7..71696fa 100644
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@@ -870,6 +870,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
         this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
         this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
         this.threadLocal.set('userAgent', opts.userAgent);
+        if (opts.timeout) {
+            this.threadLocal.set('timeout', opts.timeout * 1000);
+        }
 
         const crawlOpts: ExtraScrappingOptions = {
             proxyUrl: opts.proxyUrl,
@@ -878,6 +881,7 @@
             waitForSelector: opts.waitForSelector,
             targetSelector: opts.targetSelector,
             overrideUserAgent: opts.userAgent,
+            timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
         };
 
         return crawlOpts;
diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts
index b219160..4f59340 100644
--- a/backend/functions/src/cloud-functions/searcher.ts
+++ b/backend/functions/src/cloud-functions/searcher.ts
@@ -30,7 +30,7 @@ export class SearcherHost extends RPCHost {
     cacheValidMs = 1000 * 3600;
     pageCacheToleranceMs = 1000 * 3600 * 24;
 
-    reasonableDelayMs = 10_000;
+    reasonableDelayMs = 15_000;
 
     targetResultCount = 5;
 
@@ -163,6 +163,10 @@ export class SearcherHost extends RPCHost {
             throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
         }
 
+        if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
+            delete crawlOpts.timeoutMs;
+        }
+
         const it = this.fetchSearchResults(crawlerOptions.respondWith, r.web?.results, crawlOpts,
             crawlerOptions.cacheTolerance || this.pageCacheToleranceMs
         );
@@ -213,7 +217,7 @@ export class SearcherHost extends RPCHost {
                 chargeAmount = this.getChargeAmount(lastScrapped);
                 rpcReflect.return(lastScrapped);
                 earlyReturn = true;
-            }, this.reasonableDelayMs);
+            }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
         };
 
         for await (const scrapped of it) {
@@ -259,7 +263,7 @@
                 chargeAmount = this.getChargeAmount(lastScrapped);
                 rpcReflect.return(assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }));
                 earlyReturn = true;
-            }, this.reasonableDelayMs);
+            }, ((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs);
         };
 
         for await (const scrapped of it) {
@@ -317,7 +321,12 @@
                     description: upstreamSearchResult.description,
                 };
             }
-            return this.crawler.formatSnapshot(mode, x, urls[i]);
+            return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
+                r.title ??= upstreamSearchResult.title;
+                r.description = upstreamSearchResult.description;
+
+                return r;
+            });
         });
 
         const resultArray = await Promise.all(mapped) as FormattedPage[];
@@ -343,7 +352,7 @@
             return {
                 ...x,
                 toString(this: any) {
-                    if (this.description) {
+                    if (!this.content && this.description) {
                         if (this.title) {
                             return `[${i + 1}] Title: ${this.title}
 [${i + 1}] URL Source: ${this.url}
@@ -355,6 +364,9 @@
                     }
 
                     const mixins = [];
+                    if (this.description) {
+                        mixins.push(`[${i + 1}] Description: ${this.description}`);
+                    }
                     if (this.publishedTime) {
                         mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
                     }
diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts
index 2db7cd1..e56dd61 100644
--- a/backend/functions/src/dto/scrapping-options.ts
+++ b/backend/functions/src/dto/scrapping-options.ts
@@ -91,6 +91,11 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
             in: 'header',
             schema: { type: 'string' }
         },
+        'X-Timeout': {
+            description: `Specify timeout in seconds. Max 180.`,
+            in: 'header',
+            schema: { type: 'string' }
+        },
         }
     }
 }
@@ -142,6 +147,11 @@ export class CrawlerOptions extends AutoCastable {
     @Prop()
     userAgent?: string;
 
+    @Prop({
+        validate: (v: number) => v > 0 && v <= 180,
+    })
+    timeout?: number;
+
     static override from(input: any) {
         const instance = super.from(input) as CrawlerOptions;
         const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@@ -178,6 +188,11 @@
             instance.cacheTolerance = cacheTolerance;
         }
 
+        let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
+        if (!isNaN(timeoutSeconds)) {
+            instance.timeout = timeoutSeconds;
+        }
+
         const targetSelector = ctx?.req.get('x-target-selector');
         instance.targetSelector ??= targetSelector;
         const waitForSelector = ctx?.req.get('x-wait-for-selector');
diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts
index 5e23427..dd29235 100644
--- a/backend/functions/src/services/puppeteer.ts
+++ b/backend/functions/src/services/puppeteer.ts
@@ -66,6 +66,7 @@ export interface ScrappingOptions {
     waitForSelector?: string;
     minIntervalMs?: number;
     overrideUserAgent?: string;
+    timeoutMs?: number;
 }
 
 
@@ -449,7 +450,10 @@ document.addEventListener('load', handlePageLoad);
         );
     });
 
-    const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
+    const gotoPromise = page.goto(url, {
+        waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
+        timeout: options?.timeoutMs || 30_000
+    })
         .catch((err) => {
             this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
             return Promise.reject(new AssertionFailureError({
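
Reviewer notes, with illustrative sketches; nothing below is part of the patch itself.

The timeout plumbing spans all four files, so it is worth restating the rules in one place: CrawlerOptions.from() reads X-Timeout as seconds and validate() constrains it to 0 < v <= 180; crawler.ts converts it to timeoutMs on the scrapping options; searcher.ts drops sub-30 s values before they reach page navigation (so per-page loads keep the stock 30 s budget) but still uses the raw value for its early-return delay, which otherwise defaults to the new 15 s reasonableDelayMs; puppeteer.ts finally passes timeoutMs || 30_000 to page.goto(). A standalone TypeScript distillation of those rules, assuming hypothetical names (planTimeouts, both constants) that do not exist in the repo:

    // Sketch only: mirrors the timeout rules introduced by this patch.
    interface TimeoutPlan {
        perPageTimeoutMs?: number;   // forwarded to page.goto(); undefined keeps the stock 30 s
        earlyReturnDelayMs: number;  // how long the searcher waits before returning partial results
    }

    const REASONABLE_DELAY_MS = 15_000;  // the patched default in searcher.ts
    const MIN_PAGE_TIMEOUT_MS = 30_000;  // searcher.ts deletes timeoutMs below this

    function planTimeouts(timeoutSeconds?: number): TimeoutPlan {
        const requestedMs = (timeoutSeconds || 0) * 1000;
        return {
            // `if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) { delete crawlOpts.timeoutMs; }`
            perPageTimeoutMs: requestedMs >= MIN_PAGE_TIMEOUT_MS ? requestedMs : undefined,
            // `((crawlerOptions.timeout || 0) * 1000) || this.reasonableDelayMs`
            earlyReturnDelayMs: requestedMs || REASONABLE_DELAY_MS,
        };
    }

    console.log(planTimeouts(60));  // { perPageTimeoutMs: 60000, earlyReturnDelayMs: 60000 }
    console.log(planTimeouts(10));  // { perPageTimeoutMs: undefined, earlyReturnDelayMs: 10000 }
    console.log(planTimeouts());    // { perPageTimeoutMs: undefined, earlyReturnDelayMs: 15000 }

Note the asymmetry: a 10 s X-Timeout shortens the search early-return to 10 s yet leaves each page load at 30 s, which matches the guard added in searcher.ts.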
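For completeness, an illustrative client call. The host and the URL-in-path layout are placeholder assumptions about the deployment; only the X-Timeout header semantics (seconds, read via ctx.req.get('x-timeout'), max 180) come from this patch:

    // Hypothetical endpoint; X-Timeout is the header this patch introduces.
    async function fetchWithTimeout(target: string, timeoutSeconds: number): Promise<string> {
        const resp = await fetch(`https://reader.example.com/${target}`, {
            headers: { 'X-Timeout': String(timeoutSeconds) },  // validate() rejects values outside (0, 180]
        });
        return resp.text();
    }

    fetchWithTimeout('https://example.org', 60).then(console.log).catch(console.error);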
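On the searcher side, the user-visible fix is in toString(): previously any result with a description rendered in the short, description-only form; now that form is reserved for results whose scrape produced no content, and the description instead joins the mixin lines of full results (the .then() hook on formatSnapshot is what backfills r.description from the upstream search result). A standalone sketch of the new branching; the full-content template at the end is simplified, not the repo's exact string:

    // Sketch only: mirrors the patched branching of toString() in searcher.ts.
    interface FormattedPage {
        title?: string;
        url?: string;
        description?: string;
        content?: string;
        publishedTime?: string;
    }

    function formatSearchResult(x: FormattedPage, i: number): string {
        // Patched gate: the short form now also requires that no content was scraped.
        if (!x.content && x.description) {
            return [
                `[${i + 1}] Title: ${x.title}`,
                `[${i + 1}] URL Source: ${x.url}`,
                `[${i + 1}] Description: ${x.description}`,
            ].join('\n');
        }

        const mixins: string[] = [];
        // New in this patch: full results keep their search snippet as a mixin.
        if (x.description) {
            mixins.push(`[${i + 1}] Description: ${x.description}`);
        }
        if (x.publishedTime) {
            mixins.push(`[${i + 1}] Published Time: ${x.publishedTime}`);
        }

        // Simplified layout for results that do have content.
        return [
            `[${i + 1}] Title: ${x.title}`,
            `[${i + 1}] URL Source: ${x.url}`,
            ...mixins,
            `[${i + 1}] Markdown Content:`,
            x.content ?? '',
        ].join('\n');
    }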