fix: bring back content-based respond timing

This commit is contained in:
Yanlong Wang 2025-03-19 19:03:51 +08:00
parent 3a40db2590
commit 2a30fce1cc
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 21 additions and 13 deletions

View File

@ -793,7 +793,7 @@ export class CrawlerHost extends RPCHost {
if (!sideLoaded.file) { if (!sideLoaded.file) {
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`); throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
} }
let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile( const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(
urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName
).catch((err) => { ).catch((err) => {
if (err instanceof ApplicationError) { if (err instanceof ApplicationError) {
@ -809,6 +809,9 @@ export class CrawlerHost extends RPCHost {
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html); let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
draftSnapshot.title ??= analyzed.title; draftSnapshot.title ??= analyzed.title;
draftSnapshot.isIntermediate = true; draftSnapshot.isIntermediate = true;
if (crawlerOpts?.browserIsNotRequired()) {
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
}
let fallbackProxyIsUsed = false; let fallbackProxyIsUsed = false;
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) && if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
(analyzed.tokens < 42 || sideLoaded.status !== 200) (analyzed.tokens < 42 || sideLoaded.status !== 200)
@ -825,19 +828,19 @@ export class CrawlerHost extends RPCHost {
} }
return Promise.reject(err); return Promise.reject(err);
}); });
if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) {
}
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html); analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) { if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
draftSnapshot = proxySnapshot; proxySnapshot.isIntermediate = true;
draftSnapshot.isIntermediate = true; if (crawlerOpts?.browserIsNotRequired()) {
yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
}
sideLoaded = proxyLoaded; sideLoaded = proxyLoaded;
fallbackProxyIsUsed = true; fallbackProxyIsUsed = true;
} }
} }
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
yield draftSnapshot;
}
if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) { if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href }); this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts; crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;

View File

@ -25,6 +25,7 @@ export enum ENGINE_TYPE {
export enum RESPOND_TIMING { export enum RESPOND_TIMING {
HTML = 'html', HTML = 'html',
VISIBLE_CONTENT = 'visible-content',
MUTATION_IDLE = 'mutation-idle', MUTATION_IDLE = 'mutation-idle',
RESOURCE_IDLE = 'resource-idle', RESOURCE_IDLE = 'resource-idle',
MEDIA_IDLE = 'media-idle', MEDIA_IDLE = 'media-idle',
@ -222,11 +223,12 @@ class Viewport extends AutoCastable {
}, },
'X-Respond-Timing': { 'X-Respond-Timing': {
description: `Explicitly specify the respond timing. One of the following:\n\n` + description: `Explicitly specify the respond timing. One of the following:\n\n` +
`- html: unrendered HTML is enough to return\n` + `- html: directly return unrendered HTML\n` +
`- visible-content: return immediately when any content becomes available\n` +
`- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` + `- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
`- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading for at least 0.5s\n` + `- resource-idle: wait for no additional resources that would affect page logic and content has SUCCEEDED loading in 0.5s\n` +
`- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading for at least 0.5s\n` + `- media-idle: wait for no additional resources, including media resources, has SUCCEEDED loading in 0.5s\n` +
`- network-idle: wait for full load of webpage, as usual.\n\n`, `- network-idle: wait for full load of webpage, also known as networkidle0.\n\n`,
in: 'header', in: 'header',
schema: { type: 'string' } schema: { type: 'string' }
}, },
@ -600,7 +602,7 @@ export class CrawlerOptions extends AutoCastable {
if (this.respondTiming) { if (this.respondTiming) {
return this.respondTiming; return this.respondTiming;
} }
if (this.timeout) { if (this.timeout && this.timeout >= 20) {
return RESPOND_TIMING.NETWORK_IDLE; return RESPOND_TIMING.NETWORK_IDLE;
} }
if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) { if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
@ -636,6 +638,9 @@ export class CrawlerOptions extends AutoCastable {
if (this.injectFrameScript?.length || this.injectPageScript?.length) { if (this.injectFrameScript?.length || this.injectPageScript?.length) {
return false; return false;
} }
if (presumedTiming === RESPOND_TIMING.VISIBLE_CONTENT && snapshot.parsed?.content) {
return true;
}
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) { if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
return true; return true;
} }
@ -677,7 +682,7 @@ export class CrawlerOptions extends AutoCastable {
} }
browserIsNotRequired() { browserIsNotRequired() {
if (this.respondTiming && this.respondTiming !== RESPOND_TIMING.HTML) { if (this.respondTiming && ![RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(this.respondTiming)) {
return false; return false;
} }
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) { if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {