From d2afa9ddc256861dd60ee8a3bcef9be99f0c4194 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 17 Apr 2025 22:08:17 +0800 Subject: [PATCH] fix: timeout respect --- src/api/crawler.ts | 12 +- src/api/searcher-serper.ts | 5 - src/dto/crawler-options.ts | 5 +- src/services/puppeteer.ts | 469 ++++--------------------------------- 4 files changed, 63 insertions(+), 428 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index d0826f4..2788755 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -116,6 +116,10 @@ export class CrawlerHost extends RPCHost { if (snapshot.isIntermediate) { return; } + if (!snapshot.lastMutationIdle) { + // Never reached mutationIdle, presumably too short timeout + return; + } if (options.locale) { Reflect.set(snapshot, 'locale', options.locale); } @@ -313,7 +317,6 @@ export class CrawlerHost extends RPCHost { throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); } } - const crawlOpts = await this.configure(crawlerOptions); if (crawlerOptions.robotsTxt) { await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt); @@ -461,7 +464,6 @@ export class CrawlerHost extends RPCHost { } throw new AssertionFailureError(`No content available for URL ${targetUrl}`); } - const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, crawlerOptions); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { @@ -798,6 +800,8 @@ export class CrawlerHost extends RPCHost { } if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) { + const sideLoadSnapshotPermitted = crawlerOpts?.browserIsNotRequired() && + [RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(crawlerOpts.presumedRespondTiming); try { const altOpts = { ...crawlOpts }; let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ? @@ -832,7 +836,7 @@ export class CrawlerHost extends RPCHost { let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html); draftSnapshot.title ??= analyzed.title; draftSnapshot.isIntermediate = true; - if (crawlerOpts?.browserIsNotRequired()) { + if (sideLoadSnapshotPermitted) { yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts); } let fallbackProxyIsUsed = false; @@ -858,7 +862,7 @@ export class CrawlerHost extends RPCHost { analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html); if (proxyLoaded.status === 200 || analyzed.tokens >= 200) { proxySnapshot.isIntermediate = true; - if (crawlerOpts?.browserIsNotRequired()) { + if (sideLoadSnapshotPermitted) { yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts); } sideLoaded = proxyLoaded; diff --git a/src/api/searcher-serper.ts b/src/api/searcher-serper.ts index 8314c8f..8cb8c67 100644 --- a/src/api/searcher-serper.ts +++ b/src/api/searcher-serper.ts @@ -318,11 +318,6 @@ export class SearcherHost extends RPCHost { throw new AssertionFailureError(`No search results available for query ${searchQuery}`); } - if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) { - delete crawlOpts.timeoutMs; - } - - let lastScrapped: any[] | undefined; const targetResultCount = crawlWithoutContent ? count : count + 2; const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x)); diff --git a/src/dto/crawler-options.ts b/src/dto/crawler-options.ts index 2004706..01dea59 100644 --- a/src/dto/crawler-options.ts +++ b/src/dto/crawler-options.ts @@ -655,8 +655,11 @@ export class CrawlerOptions extends AutoCastable { if (this.respondWith.includes('lm')) { return false; } + if (this.withIframe) { + return false; + } - return false; + return !snapshot.isIntermediate; } isCacheQueryApplicable() { diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 67840af..5621a1a 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -846,7 +846,6 @@ export class PuppeteerControl extends AsyncService { async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator { // parsedUrl.search = ''; const url = parsedUrl.toString(); - let snapshot: PageSnapshot | undefined; let screenshot: Buffer | undefined; let pageshot: Buffer | undefined; @@ -1097,7 +1096,7 @@ export class PuppeteerControl extends AsyncService { nextSnapshotDeferred.promise.finally(() => { this.off('crippled', crippleListener); }); - let finalized = false; + let successfullyDone = false; const hdl = (s: any) => { if (snapshot === s) { return; @@ -1143,6 +1142,39 @@ export class PuppeteerControl extends AsyncService { goToOptions.referer = options.referer; } + let waitForPromise: Promise | undefined; + let finalizationPromise: Promise | undefined; + const doFinalization = async () => { + if (!waitForPromise) { + successfullyDone = true; + } + try { + const pSubFrameSnapshots = this.snapshotChildFrames(page); + snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; + screenshot = (await this.takeScreenShot(page)) || screenshot; + pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; + if (snapshot) { + snapshot.childFrames = await pSubFrameSnapshots; + } + } catch (err: any) { + this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err }); + } + if (!snapshot?.html) { + return; + } + + this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); + this.emit( + 'crawled', + { + ...snapshot, + status: navigationResponse?.status(), + statusText: navigationResponse?.statusText(), + pdfs: _.uniq(pdfUrls), screenshot, pageshot, + }, + { ...options, url: parsedUrl } + ); + }; const delayPromise = delay(timeout); const gotoPromise = page.goto(url, goToOptions) .catch((err) => { @@ -1170,50 +1202,14 @@ export class PuppeteerControl extends AsyncService { // Calling evaluate directly may stall the process. if (!snapshot) { if (stuff instanceof Error) { - finalized = true; throw stuff; } } await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise]) .catch(() => void 0); - try { - const pSubFrameSnapshots = this.snapshotChildFrames(page); - snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = (await this.takeScreenShot(page)) || screenshot; - pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; - if (snapshot) { - snapshot.childFrames = await pSubFrameSnapshots; - } - } catch (err: any) { - this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err }); - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - if (!snapshot?.html) { - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - - finalized = true; - if (snapshot?.html) { - this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); - this.emit( - 'crawled', - { - ...snapshot, - status: navigationResponse?.status(), - statusText: navigationResponse?.statusText(), - pdfs: _.uniq(pdfUrls), screenshot, pageshot, - }, - { ...options, url: parsedUrl } - ); - } + finalizationPromise = doFinalization(); + return stuff; }); - let waitForPromise: Promise | undefined; if (options.waitForSelector) { const t0 = Date.now(); waitForPromise = nextSnapshotDeferred.promise.then(() => { @@ -1224,19 +1220,12 @@ export class PuppeteerControl extends AsyncService { const p = (Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) : page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout })) - .then(async () => { - const pSubFrameSnapshots = this.snapshotChildFrames(page); - snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = (await this.takeScreenShot(page)) || screenshot; - pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot; - if (snapshot) { - snapshot.childFrames = await pSubFrameSnapshots; - } - finalized = true; + .then(() => { + successfullyDone = true; + finalizationPromise = doFinalization(); }) .catch((err) => { this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err }); - waitForPromise = undefined; }); return p as any; }); @@ -1254,11 +1243,8 @@ export class PuppeteerControl extends AsyncService { } let error; await Promise.race(ckpt).catch((err) => error = err); - if (finalized && !error) { + if (successfullyDone && !error) { if (!snapshot && !screenshot) { - if (error) { - throw error; - } throw new AssertionFailureError(`Could not extract any meaningful content from the page`); } yield { @@ -1286,10 +1272,20 @@ export class PuppeteerControl extends AsyncService { if (error) { throw error; } + if (successfullyDone) { + break; + } } + await finalizationPromise; + yield { + ...snapshot, + status: navigationResponse?.status(), + statusText: navigationResponse?.statusText(), + pdfs: _.uniq(pdfUrls), screenshot, pageshot + } as PageSnapshot; } finally { this.pagePhase.set(page, 'background'); - (waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => { + Promise.allSettled([gotoPromise, waitForPromise, finalizationPromise]).finally(() => { page.off('snapshot', hdl); this.ditchPage(page); }); @@ -1329,369 +1325,6 @@ export class PuppeteerControl extends AsyncService { return r.filter(Boolean); } - async simpleScrap(parsedUrl: URL, options: ScrappingOptions = {}): Promise { - // parsedUrl.search = ''; - const url = parsedUrl.toString(); - let snapshot: PageSnapshot | undefined; - let navigationResponse: HTTPResponse | undefined; - const page = await this.getNextPage(); - this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx); - this.pagePhase.set(page, 'active'); - page.on('response', (resp) => { - this.blackHoleDetector.itWorked(); - const req = resp.request(); - if (req.frame() === page.mainFrame() && req.isNavigationRequest()) { - navigationResponse = resp; - } - if (!resp.ok()) { - return; - } - }); - page.on('request', async (req) => { - if (req.isInterceptResolutionHandled()) { - return; - }; - const reqUrlParsed = new URL(req.url()); - if (!reqUrlParsed.protocol.startsWith('http')) { - const overrides = req.continueRequestOverrides(); - - return req.continue(overrides, 0); - } - const typ = req.resourceType(); - if (typ === 'media') { - // Non-cooperative answer to block all media requests. - return req.abort('blockedbyclient'); - } - if (!options.proxyResources) { - const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ); - if (!isDocRequest) { - if (options.extraHeaders) { - const overrides = req.continueRequestOverrides(); - const continueArgs = [{ - ...overrides, - headers: { - ...req.headers(), - ...overrides?.headers, - ...options.extraHeaders, - } - }, 1] as const; - - return req.continue(continueArgs[0], continueArgs[1]); - } - const overrides = req.continueRequestOverrides(); - - return req.continue(overrides, 0); - } - } - const sideload = options.sideLoad; - - const impersonate = sideload?.impersonate[reqUrlParsed.href]; - if (impersonate) { - let body; - if (impersonate.body) { - body = await readFile(await impersonate.body.filePath); - if (req.isInterceptResolutionHandled()) { - return; - } - } - return req.respond({ - status: impersonate.status, - headers: impersonate.headers, - contentType: impersonate.contentType, - body: body ? Uint8Array.from(body) : undefined, - }, 999); - } - - const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin]; - const ctx = this.lifeCycleTrack.get(page); - if (proxy && ctx) { - return await this.asyncLocalContext.bridge(ctx, async () => { - try { - const curled = await this.curlControl.sideLoad(reqUrlParsed, { - ...options, - method: req.method(), - body: req.postData(), - extraHeaders: { - ...req.headers(), - ...options.extraHeaders, - }, - proxyUrl: proxy - }); - if (req.isInterceptResolutionHandled()) { - return; - }; - - if (curled.chain.length === 1) { - if (!curled.file) { - return req.respond({ - status: curled.status, - headers: _.omit(curled.headers, 'result'), - contentType: curled.contentType, - }, 3); - } - const body = await readFile(await curled.file.filePath); - if (req.isInterceptResolutionHandled()) { - return; - }; - return req.respond({ - status: curled.status, - headers: _.omit(curled.headers, 'result'), - contentType: curled.contentType, - body: Uint8Array.from(body), - }, 3); - } - options.sideLoad ??= curled.sideLoadOpts; - _.merge(options.sideLoad, curled.sideLoadOpts); - const firstReq = curled.chain[0]; - - return req.respond({ - status: firstReq.result!.code, - headers: _.omit(firstReq, 'result'), - }, 3); - } catch (err: any) { - this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy }); - } - if (req.isInterceptResolutionHandled()) { - return; - }; - const overrides = req.continueRequestOverrides(); - const continueArgs = [{ - ...overrides, - headers: { - ...req.headers(), - ...overrides?.headers, - ...options.extraHeaders, - } - }, 1] as const; - - return req.continue(continueArgs[0], continueArgs[1]); - }); - } - - if (req.isInterceptResolutionHandled()) { - return; - }; - const overrides = req.continueRequestOverrides(); - const continueArgs = [{ - ...overrides, - headers: { - ...req.headers(), - ...overrides?.headers, - ...options.extraHeaders, - } - }, 1] as const; - - return req.continue(continueArgs[0], continueArgs[1]); - }); - - const sn = this.snMap.get(page); - this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); - if (options.locale) { - // Add headers via request interception to walk around this bug - // https://github.com/puppeteer/puppeteer/issues/10235 - // await page.setExtraHTTPHeaders({ - // 'Accept-Language': options.locale - // }); - - await page.evaluateOnNewDocument(() => { - Object.defineProperty(navigator, "language", { - get: function () { - return options.locale; - } - }); - Object.defineProperty(navigator, "languages", { - get: function () { - return [options.locale]; - } - }); - }); - } - - if (options.cookies) { - const mapped = options.cookies.map((x) => { - const draft: CookieParam = { - name: x.name, - value: encodeURIComponent(x.value), - secure: x.secure, - domain: x.domain, - path: x.path, - expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined, - sameSite: x.sameSite as any, - }; - if (!draft.expires && x.maxAge) { - draft.expires = Math.floor(Date.now() / 1000) + x.maxAge; - } - if (!draft.domain) { - draft.url = parsedUrl.toString(); - } - - return draft; - }); - try { - await page.setCookie(...mapped); - } catch (err: any) { - this.logger.warn(`Page ${sn}: Failed to set cookies`, { err }); - throw new ParamValidationError({ - path: 'cookies', - message: `Failed to set cookies: ${err?.message}` - }); - } - } - if (options.overrideUserAgent) { - await page.setUserAgent(options.overrideUserAgent); - } - if (options.viewport) { - await page.setViewport(options.viewport); - } - - let nextSnapshotDeferred = Defer(); - const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` })); - this.once('crippled', crippleListener); - nextSnapshotDeferred.promise.finally(() => { - this.off('crippled', crippleListener); - }); - let finalized = false; - const hdl = (s: any) => { - if (snapshot === s) { - return; - } - snapshot = s; - if (snapshot) { - const kit = this.pageReqCtrl.get(page); - snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt; - snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt; - } - if (s?.maxElemDepth && s.maxElemDepth > 256) { - return; - } - if (s?.elemCount && s.elemCount > 10_000) { - return; - } - nextSnapshotDeferred.resolve(s); - nextSnapshotDeferred = Defer(); - this.once('crippled', crippleListener); - nextSnapshotDeferred.promise.finally(() => { - this.off('crippled', crippleListener); - }); - }; - page.on('snapshot', hdl); - page.once('abuse', (event: any) => { - this.emit('abuse', { ...event, url: parsedUrl }); - if (snapshot?.href && parsedUrl.href !== snapshot.href) { - this.emit('abuse', { ...event, url: snapshot.href }); - } - - nextSnapshotDeferred.reject( - new SecurityCompromiseError(`Abuse detected: ${event.reason}`) - ); - }); - - const timeout = options.timeoutMs || 30_000; - const goToOptions: GoToOptions = { - waitUntil: ['load', 'domcontentloaded', 'networkidle0'], - timeout, - }; - - if (options.referer) { - goToOptions.referer = options.referer; - } - - const gotoPromise = page.goto(url, goToOptions) - .catch((err) => { - if (err instanceof TimeoutError) { - this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err }); - return new AssertionFailureError({ - message: `Failed to goto ${url}: ${err}`, - cause: err, - }); - } - - this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err }); - return new AssertionFailureError({ - message: `Failed to goto ${url}: ${err}`, - cause: err, - }); - }).then(async (stuff) => { - // This check is necessary because without snapshot, the condition of the page is unclear - // Calling evaluate directly may stall the process. - if (!snapshot) { - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - try { - snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - } catch (err: any) { - this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err }); - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - if (!snapshot?.html) { - if (stuff instanceof Error) { - finalized = true; - throw stuff; - } - } - - finalized = true; - if (snapshot?.html) { - this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); - this.emit( - 'crawled', - { - ...snapshot, - status: navigationResponse?.status(), - statusText: navigationResponse?.statusText(), - }, - { ...options, url: parsedUrl } - ); - } - }); - - try { - while (true) { - const ckpt = [nextSnapshotDeferred.promise, gotoPromise]; - if (options.minIntervalMs) { - ckpt.push(delay(options.minIntervalMs)); - } - let error; - await Promise.race(ckpt).catch((err) => error = err); - if (finalized && !error) { - if (!snapshot) { - if (error) { - throw error; - } - throw new AssertionFailureError(`Could not extract any meaningful content from the page`); - } - return { - ...snapshot, - status: navigationResponse?.status(), - statusText: navigationResponse?.statusText(), - } as PageSnapshot; - } - - if (snapshot?.lastMutationIdle) { - return { - ...snapshot, - status: navigationResponse?.status(), - statusText: navigationResponse?.statusText(), - } as PageSnapshot; - } - if (error) { - throw error; - } - } - } finally { - this.pagePhase.set(page, 'background'); - page.off('snapshot', hdl); - this.ditchPage(page); - nextSnapshotDeferred.resolve(); - } - } - } const puppeteerControl = container.resolve(PuppeteerControl);