fix: respect the requested timeout

This commit is contained in:
Yanlong Wang 2025-04-17 22:08:17 +08:00
parent c795cdb7b3
commit d2afa9ddc2
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 63 additions and 428 deletions

View File

@ -116,6 +116,10 @@ export class CrawlerHost extends RPCHost {
if (snapshot.isIntermediate) {
return;
}
if (!snapshot.lastMutationIdle) {
// Never reached mutationIdle, presumably because the timeout was too short
return;
}
if (options.locale) {
Reflect.set(snapshot, 'locale', options.locale);
}
@ -313,7 +317,6 @@ export class CrawlerHost extends RPCHost {
throw new SecurityCompromiseError(`Domain ${targetUrl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
}
const crawlOpts = await this.configure(crawlerOptions);
if (crawlerOptions.robotsTxt) {
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
@ -461,7 +464,6 @@ export class CrawlerHost extends RPCHost {
}
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
@ -798,6 +800,8 @@ export class CrawlerHost extends RPCHost {
}
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && !this.knownUrlThatSideLoadingWouldCrashTheBrowser(urlToCrawl)) {
const sideLoadSnapshotPermitted = crawlerOpts?.browserIsNotRequired() &&
[RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(crawlerOpts.presumedRespondTiming);
try {
const altOpts = { ...crawlOpts };
let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
@ -832,7 +836,7 @@ export class CrawlerHost extends RPCHost {
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
draftSnapshot.title ??= analyzed.title;
draftSnapshot.isIntermediate = true;
if (crawlerOpts?.browserIsNotRequired()) {
if (sideLoadSnapshotPermitted) {
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
}
let fallbackProxyIsUsed = false;
@ -858,7 +862,7 @@ export class CrawlerHost extends RPCHost {
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
proxySnapshot.isIntermediate = true;
if (crawlerOpts?.browserIsNotRequired()) {
if (sideLoadSnapshotPermitted) {
yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
}
sideLoaded = proxyLoaded;

View File

@ -318,11 +318,6 @@ export class SearcherHost extends RPCHost {
throw new AssertionFailureError(`No search results available for query ${searchQuery}`);
}
if (crawlOpts.timeoutMs && crawlOpts.timeoutMs < 30_000) {
delete crawlOpts.timeoutMs;
}
let lastScrapped: any[] | undefined;
const targetResultCount = crawlWithoutContent ? count : count + 2;
const trimmedResults = results.filter((x) => Boolean(x.link)).slice(0, targetResultCount).map((x) => this.mapToFinalResults(x));

View File

@ -655,8 +655,11 @@ export class CrawlerOptions extends AutoCastable {
if (this.respondWith.includes('lm')) {
return false;
}
if (this.withIframe) {
return false;
}
return false;
return !snapshot.isIntermediate;
}
isCacheQueryApplicable() {

View File

@ -846,7 +846,6 @@ export class PuppeteerControl extends AsyncService {
async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator<PageSnapshot | undefined> {
// parsedUrl.search = '';
const url = parsedUrl.toString();
let snapshot: PageSnapshot | undefined;
let screenshot: Buffer | undefined;
let pageshot: Buffer | undefined;
@ -1097,7 +1096,7 @@ export class PuppeteerControl extends AsyncService {
nextSnapshotDeferred.promise.finally(() => {
this.off('crippled', crippleListener);
});
let finalized = false;
let successfullyDone = false;
const hdl = (s: any) => {
if (snapshot === s) {
return;
@ -1143,6 +1142,39 @@ export class PuppeteerControl extends AsyncService {
goToOptions.referer = options.referer;
}
let waitForPromise: Promise<any> | undefined;
let finalizationPromise: Promise<any> | undefined;
// Takes the final snapshot of the page (DOM snapshot, viewport screenshot,
// full-page screenshot and child-frame snapshots) and, when the snapshot has
// HTML, emits a 'crawled' event carrying it. Failures while capturing are
// logged and swallowed, so the returned promise does not reject because of them.
const doFinalization = async () => {
// With no selector wait registered, reaching finalization is what marks the
// scrape as successfully done.
if (!waitForPromise) {
successfullyDone = true;
}
try {
// Start child-frame snapshotting first; it is awaited only after the main
// snapshot and both screenshots have been taken.
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
// Keep the previously captured shots if a new one comes back falsy.
screenshot = (await this.takeScreenShot(page)) || screenshot;
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
} catch (err: any) {
// Best effort: whatever `snapshot` held before the failure stays in place.
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
}
// Nothing usable was captured, so there is nothing to announce.
if (!snapshot?.html) {
return;
}
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
// Publish the snapshot, the navigation response status and the collected
// PDF URLs / screenshots to 'crawled' listeners.
this.emit(
'crawled',
{
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
},
{ ...options, url: parsedUrl }
);
};
const delayPromise = delay(timeout);
const gotoPromise = page.goto(url, goToOptions)
.catch((err) => {
@ -1170,50 +1202,14 @@ export class PuppeteerControl extends AsyncService {
// Calling evaluate directly may stall the process.
if (!snapshot) {
if (stuff instanceof Error) {
finalized = true;
throw stuff;
}
}
await Promise.race([Promise.allSettled([...pageScriptEvaluations, ...frameScriptEvaluations]), delayPromise])
.catch(() => void 0);
try {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = (await this.takeScreenShot(page)) || screenshot;
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
} catch (err: any) {
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
if (stuff instanceof Error) {
finalized = true;
throw stuff;
}
}
if (!snapshot?.html) {
if (stuff instanceof Error) {
finalized = true;
throw stuff;
}
}
finalized = true;
if (snapshot?.html) {
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
this.emit(
'crawled',
{
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
},
{ ...options, url: parsedUrl }
);
}
finalizationPromise = doFinalization();
return stuff;
});
let waitForPromise: Promise<any> | undefined;
if (options.waitForSelector) {
const t0 = Date.now();
waitForPromise = nextSnapshotDeferred.promise.then(() => {
@ -1224,19 +1220,12 @@ export class PuppeteerControl extends AsyncService {
const p = (Array.isArray(options.waitForSelector) ?
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
.then(async () => {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = (await this.takeScreenShot(page)) || screenshot;
pageshot = (await this.takeScreenShot(page, { fullPage: true })) || pageshot;
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
finalized = true;
.then(() => {
successfullyDone = true;
finalizationPromise = doFinalization();
})
.catch((err) => {
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
waitForPromise = undefined;
});
return p as any;
});
@ -1254,11 +1243,8 @@ export class PuppeteerControl extends AsyncService {
}
let error;
await Promise.race(ckpt).catch((err) => error = err);
if (finalized && !error) {
if (successfullyDone && !error) {
if (!snapshot && !screenshot) {
if (error) {
throw error;
}
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
}
yield {
@ -1286,10 +1272,20 @@ export class PuppeteerControl extends AsyncService {
if (error) {
throw error;
}
if (successfullyDone) {
break;
}
}
await finalizationPromise;
yield {
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: _.uniq(pdfUrls), screenshot, pageshot
} as PageSnapshot;
} finally {
this.pagePhase.set(page, 'background');
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
Promise.allSettled([gotoPromise, waitForPromise, finalizationPromise]).finally(() => {
page.off('snapshot', hdl);
this.ditchPage(page);
});
@ -1329,369 +1325,6 @@ export class PuppeteerControl extends AsyncService {
return r.filter(Boolean);
}
/**
 * Loads `parsedUrl` in a page obtained from `getNextPage()` and resolves with
 * one snapshot of it, decorated with the navigation response status.
 *
 * The method resolves as soon as either the navigation has been finalized or a
 * snapshot reporting `lastMutationIdle` has arrived, whichever comes first.
 *
 * @param parsedUrl - The URL to load.
 * @param options - Scraping options; this method reads `proxyResources`,
 *   `extraHeaders`, `sideLoad`, `proxyUrl`, `locale`, `cookies`,
 *   `overrideUserAgent`, `viewport`, `timeoutMs` (falls back to 30s),
 *   `referer` and `minIntervalMs`.
 * @returns The latest snapshot plus `status` / `statusText` of the main-frame
 *   navigation response.
 * @throws ParamValidationError when the supplied cookies cannot be set.
 * @throws AssertionFailureError when navigation failed before any usable
 *   snapshot existed, or when no content could be extracted.
 * @throws ServiceCrashedError when a 'crippled' event fires while waiting.
 * @throws SecurityCompromiseError when the page reports abuse.
 */
async simpleScrap(parsedUrl: URL, options: ScrappingOptions = {}): Promise<PageSnapshot> {
    // parsedUrl.search = '';
    const url = parsedUrl.toString();
    let snapshot: PageSnapshot | undefined;
    let navigationResponse: HTTPResponse | undefined;
    const page = await this.getNextPage();
    this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
    this.pagePhase.set(page, 'active');
    page.on('response', (resp) => {
        this.blackHoleDetector.itWorked();
        const req = resp.request();
        // Remember the response of the main-frame navigation for status reporting.
        if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
            navigationResponse = resp;
        }
        if (!resp.ok()) {
            return;
        }
    });
    page.on('request', async (req) => {
        if (req.isInterceptResolutionHandled()) {
            return;
        }
        const reqUrlParsed = new URL(req.url());
        if (!reqUrlParsed.protocol.startsWith('http')) {
            const overrides = req.continueRequestOverrides();

            return req.continue(overrides, 0);
        }
        const typ = req.resourceType();
        if (typ === 'media') {
            // Non-cooperative answer to block all media requests.
            return req.abort('blockedbyclient');
        }
        if (!options.proxyResources) {
            const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
            if (!isDocRequest) {
                // Sub-resources bypass side-loading/proxying; only extra headers are applied.
                if (options.extraHeaders) {
                    const overrides = req.continueRequestOverrides();
                    const continueArgs = [{
                        ...overrides,
                        headers: {
                            ...req.headers(),
                            ...overrides?.headers,
                            ...options.extraHeaders,
                        }
                    }, 1] as const;

                    return req.continue(continueArgs[0], continueArgs[1]);
                }
                const overrides = req.continueRequestOverrides();

                return req.continue(overrides, 0);
            }
        }
        const sideload = options.sideLoad;

        // A pre-recorded response for this exact URL takes precedence over everything else.
        const impersonate = sideload?.impersonate[reqUrlParsed.href];
        if (impersonate) {
            let body;
            if (impersonate.body) {
                body = await readFile(await impersonate.body.filePath);
                // The request may have been resolved by someone else while the file was read.
                if (req.isInterceptResolutionHandled()) {
                    return;
                }
            }
            return req.respond({
                status: impersonate.status,
                headers: impersonate.headers,
                contentType: impersonate.contentType,
                body: body ? Uint8Array.from(body) : undefined,
            }, 999);
        }

        const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
        const ctx = this.lifeCycleTrack.get(page);
        if (proxy && ctx) {
            // Fetch the resource out-of-browser through the proxy and answer the request with it.
            return await this.asyncLocalContext.bridge(ctx, async () => {
                try {
                    const curled = await this.curlControl.sideLoad(reqUrlParsed, {
                        ...options,
                        method: req.method(),
                        body: req.postData(),
                        extraHeaders: {
                            ...req.headers(),
                            ...options.extraHeaders,
                        },
                        proxyUrl: proxy
                    });
                    if (req.isInterceptResolutionHandled()) {
                        return;
                    }

                    if (curled.chain.length === 1) {
                        if (!curled.file) {
                            return req.respond({
                                status: curled.status,
                                headers: _.omit(curled.headers, 'result'),
                                contentType: curled.contentType,
                            }, 3);
                        }
                        const body = await readFile(await curled.file.filePath);
                        if (req.isInterceptResolutionHandled()) {
                            return;
                        }
                        return req.respond({
                            status: curled.status,
                            headers: _.omit(curled.headers, 'result'),
                            contentType: curled.contentType,
                            body: Uint8Array.from(body),
                        }, 3);
                    }
                    // Multi-hop chain: record the side-load data and replay only the first hop.
                    options.sideLoad ??= curled.sideLoadOpts;
                    _.merge(options.sideLoad, curled.sideLoadOpts);
                    const firstReq = curled.chain[0];

                    return req.respond({
                        status: firstReq.result!.code,
                        headers: _.omit(firstReq, 'result'),
                    }, 3);
                } catch (err: any) {
                    this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
                }
                // Side-loading failed: fall back to letting the browser perform the request.
                if (req.isInterceptResolutionHandled()) {
                    return;
                }
                const overrides = req.continueRequestOverrides();
                const continueArgs = [{
                    ...overrides,
                    headers: {
                        ...req.headers(),
                        ...overrides?.headers,
                        ...options.extraHeaders,
                    }
                }, 1] as const;

                return req.continue(continueArgs[0], continueArgs[1]);
            });
        }

        if (req.isInterceptResolutionHandled()) {
            return;
        }
        const overrides = req.continueRequestOverrides();
        const continueArgs = [{
            ...overrides,
            headers: {
                ...req.headers(),
                ...overrides?.headers,
                ...options.extraHeaders,
            }
        }, 1] as const;

        return req.continue(continueArgs[0], continueArgs[1]);
    });

    const sn = this.snMap.get(page);
    this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
    if (options.locale) {
        // Add headers via request interception to walk around this bug
        // https://github.com/puppeteer/puppeteer/issues/10235
        // await page.setExtraHTTPHeaders({
        //     'Accept-Language': options.locale
        // });

        // The callback is serialized and executed inside the page, where the
        // Node-side `options` object does not exist. The locale is therefore
        // handed over as an argument instead of being captured by closure.
        await page.evaluateOnNewDocument((locale: string) => {
            Object.defineProperty(navigator, "language", {
                get: function () {
                    return locale;
                }
            });
            Object.defineProperty(navigator, "languages", {
                get: function () {
                    return [locale];
                }
            });
        }, options.locale);
    }

    if (options.cookies) {
        const mapped = options.cookies.map((x) => {
            const draft: CookieParam = {
                name: x.name,
                value: encodeURIComponent(x.value),
                secure: x.secure,
                domain: x.domain,
                path: x.path,
                expires: x.expires ? Math.floor(x.expires.valueOf() / 1000) : undefined,
                sameSite: x.sameSite as any,
            };
            // Derive an absolute expiry (in seconds) from maxAge when none was given.
            if (!draft.expires && x.maxAge) {
                draft.expires = Math.floor(Date.now() / 1000) + x.maxAge;
            }
            // Without an explicit domain, scope the cookie to the target URL.
            if (!draft.domain) {
                draft.url = parsedUrl.toString();
            }
            return draft;
        });
        try {
            await page.setCookie(...mapped);
        } catch (err: any) {
            this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
            throw new ParamValidationError({
                path: 'cookies',
                message: `Failed to set cookies: ${err?.message}`
            });
        }
    }
    if (options.overrideUserAgent) {
        await page.setUserAgent(options.overrideUserAgent);
    }
    if (options.viewport) {
        await page.setViewport(options.viewport);
    }

    // Deferred that settles on the next accepted snapshot; it is re-armed after every resolve.
    let nextSnapshotDeferred = Defer();
    const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
    this.once('crippled', crippleListener);
    nextSnapshotDeferred.promise.finally(() => {
        this.off('crippled', crippleListener);
    });
    let finalized = false;
    const hdl = (s: any) => {
        if (snapshot === s) {
            return;
        }
        snapshot = s;
        if (snapshot) {
            const kit = this.pageReqCtrl.get(page);
            snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
            snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
        }
        // Very deep or very large DOMs are recorded but do not wake up the waiting loop.
        if (s?.maxElemDepth && s.maxElemDepth > 256) {
            return;
        }
        if (s?.elemCount && s.elemCount > 10_000) {
            return;
        }
        nextSnapshotDeferred.resolve(s);
        nextSnapshotDeferred = Defer();
        this.once('crippled', crippleListener);
        nextSnapshotDeferred.promise.finally(() => {
            this.off('crippled', crippleListener);
        });
    };
    page.on('snapshot', hdl);
    page.once('abuse', (event: any) => {
        this.emit('abuse', { ...event, url: parsedUrl });
        // Also report the URL the page ended up on when it differs from the requested one.
        if (snapshot?.href && parsedUrl.href !== snapshot.href) {
            this.emit('abuse', { ...event, url: snapshot.href });
        }

        nextSnapshotDeferred.reject(
            new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
        );
    });

    const timeout = options.timeoutMs || 30_000;
    const goToOptions: GoToOptions = {
        waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
        timeout,
    };

    if (options.referer) {
        goToOptions.referer = options.referer;
    }

    const gotoPromise = page.goto(url, goToOptions)
        .catch((err) => {
            // Navigation errors are turned into values so the follow-up step can
            // decide whether an already captured snapshot is good enough.
            if (err instanceof TimeoutError) {
                this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
                return new AssertionFailureError({
                    message: `Failed to goto ${url}: ${err}`,
                    cause: err,
                });
            }

            this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err });
            return new AssertionFailureError({
                message: `Failed to goto ${url}: ${err}`,
                cause: err,
            });
        }).then(async (stuff) => {
            // This check is necessary because without snapshot, the condition of the page is unclear
            // Calling evaluate directly may stall the process.
            if (!snapshot) {
                if (stuff instanceof Error) {
                    finalized = true;
                    throw stuff;
                }
            }
            try {
                snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
            } catch (err: any) {
                this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err });
                if (stuff instanceof Error) {
                    finalized = true;
                    throw stuff;
                }
            }
            if (!snapshot?.html) {
                if (stuff instanceof Error) {
                    finalized = true;
                    throw stuff;
                }
            }

            finalized = true;
            if (snapshot?.html) {
                this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
                this.emit(
                    'crawled',
                    {
                        ...snapshot,
                        status: navigationResponse?.status(),
                        statusText: navigationResponse?.statusText(),
                    },
                    { ...options, url: parsedUrl }
                );
            }
        });

    try {
        while (true) {
            // Wake up on the next snapshot, on navigation completion, or on the polling interval.
            const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
            if (options.minIntervalMs) {
                ckpt.push(delay(options.minIntervalMs));
            }
            let error;
            await Promise.race(ckpt).catch((err) => error = err);
            if (finalized && !error) {
                if (!snapshot) {
                    throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
                }
                return {
                    ...snapshot,
                    status: navigationResponse?.status(),
                    statusText: navigationResponse?.statusText(),
                } as PageSnapshot;
            }
            // A snapshot that reports mutation idle is returned even if an error surfaced.
            if (snapshot?.lastMutationIdle) {
                return {
                    ...snapshot,
                    status: navigationResponse?.status(),
                    statusText: navigationResponse?.statusText(),
                } as PageSnapshot;
            }
            if (error) {
                throw error;
            }
        }
    } finally {
        this.pagePhase.set(page, 'background');
        page.off('snapshot', hdl);
        this.ditchPage(page);
        // Settle the pending deferred so its `finally` hook detaches the 'crippled' listener.
        nextSnapshotDeferred.resolve();
    }
}
}
const puppeteerControl = container.resolve(PuppeteerControl);