mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-02 04:40:37 +08:00
fix: bring back content based respond timing
This commit is contained in:
parent
3a40db2590
commit
2a30fce1cc
@ -793,7 +793,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (!sideLoaded.file) {
|
if (!sideLoaded.file) {
|
||||||
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
||||||
}
|
}
|
||||||
let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(
|
const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(
|
||||||
urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName
|
urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName
|
||||||
).catch((err) => {
|
).catch((err) => {
|
||||||
if (err instanceof ApplicationError) {
|
if (err instanceof ApplicationError) {
|
||||||
@ -809,6 +809,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
||||||
draftSnapshot.title ??= analyzed.title;
|
draftSnapshot.title ??= analyzed.title;
|
||||||
draftSnapshot.isIntermediate = true;
|
draftSnapshot.isIntermediate = true;
|
||||||
|
if (crawlerOpts?.browserIsNotRequired()) {
|
||||||
|
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
|
||||||
|
}
|
||||||
let fallbackProxyIsUsed = false;
|
let fallbackProxyIsUsed = false;
|
||||||
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
||||||
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
||||||
@ -825,19 +828,19 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
return Promise.reject(err);
|
return Promise.reject(err);
|
||||||
});
|
});
|
||||||
|
if (proxyLoaded.status === 200 && crawlerOpts?.browserIsNotRequired()) {
|
||||||
|
}
|
||||||
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
||||||
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
||||||
draftSnapshot = proxySnapshot;
|
proxySnapshot.isIntermediate = true;
|
||||||
draftSnapshot.isIntermediate = true;
|
if (crawlerOpts?.browserIsNotRequired()) {
|
||||||
|
yield this.jsdomControl.narrowSnapshot(proxySnapshot, crawlOpts);
|
||||||
|
}
|
||||||
sideLoaded = proxyLoaded;
|
sideLoaded = proxyLoaded;
|
||||||
fallbackProxyIsUsed = true;
|
fallbackProxyIsUsed = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
|
|
||||||
yield draftSnapshot;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
|
if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
|
||||||
this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
|
this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
|
||||||
crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
|
crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
|
||||||
|
@ -25,6 +25,7 @@ export enum ENGINE_TYPE {
|
|||||||
|
|
||||||
export enum RESPOND_TIMING {
|
export enum RESPOND_TIMING {
|
||||||
HTML = 'html',
|
HTML = 'html',
|
||||||
|
VISIBLE_CONTENT = 'visible-content',
|
||||||
MUTATION_IDLE = 'mutation-idle',
|
MUTATION_IDLE = 'mutation-idle',
|
||||||
RESOURCE_IDLE = 'resource-idle',
|
RESOURCE_IDLE = 'resource-idle',
|
||||||
MEDIA_IDLE = 'media-idle',
|
MEDIA_IDLE = 'media-idle',
|
||||||
@ -222,11 +223,12 @@ class Viewport extends AutoCastable {
|
|||||||
},
|
},
|
||||||
'X-Respond-Timing': {
|
'X-Respond-Timing': {
|
||||||
description: `Explicitly specify the respond timing. One of the following:\n\n` +
|
description: `Explicitly specify the respond timing. One of the following:\n\n` +
|
||||||
`- html: unrendered HTML is enough to return\n` +
|
`- html: directly return unrendered HTML\n` +
|
||||||
|
`- visible-content: return immediately when any content becomes available\n` +
|
||||||
`- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
|
`- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
|
||||||
`- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading for at least 0.5s\n` +
|
`- resource-idle: wait for no additional resources that would affect page logic and content has SUCCEEDED loading in 0.5s\n` +
|
||||||
`- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading for at least 0.5s\n` +
|
`- media-idle: wait for no additional resources, including media resources, has SUCCEEDED loading in 0.5s\n` +
|
||||||
`- network-idle: wait for full load of webpage, as usual.\n\n`,
|
`- network-idle: wait for full load of webpage, also known as networkidle0.\n\n`,
|
||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
@ -600,7 +602,7 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if (this.respondTiming) {
|
if (this.respondTiming) {
|
||||||
return this.respondTiming;
|
return this.respondTiming;
|
||||||
}
|
}
|
||||||
if (this.timeout) {
|
if (this.timeout && this.timeout >= 20) {
|
||||||
return RESPOND_TIMING.NETWORK_IDLE;
|
return RESPOND_TIMING.NETWORK_IDLE;
|
||||||
}
|
}
|
||||||
if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
|
if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
|
||||||
@ -636,6 +638,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (presumedTiming === RESPOND_TIMING.VISIBLE_CONTENT && snapshot.parsed?.content) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -677,7 +682,7 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
browserIsNotRequired() {
|
browserIsNotRequired() {
|
||||||
if (this.respondTiming && this.respondTiming !== RESPOND_TIMING.HTML) {
|
if (this.respondTiming && ![RESPOND_TIMING.HTML, RESPOND_TIMING.VISIBLE_CONTENT].includes(this.respondTiming)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user