mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 06:15:58 +08:00
fix: scrap timing
This commit is contained in:
parent
323590647e
commit
ec7c2ab52c
@ -12,6 +12,7 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|||||||
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
||||||
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
||||||
import { Readability } from '@mozilla/readability';
|
import { Readability } from '@mozilla/readability';
|
||||||
|
import { TimeoutError } from 'puppeteer';
|
||||||
const tldExtract = require('tld-extract');
|
const tldExtract = require('tld-extract');
|
||||||
|
|
||||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||||
@ -370,9 +371,6 @@ const handlePageLoad = () => {
|
|||||||
if (window.haltSnapshot) {
|
if (window.haltSnapshot) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (document.readyState === 'loading') {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const thisTextLength = (document.body.innerText || '').length;
|
const thisTextLength = (document.body.innerText || '').length;
|
||||||
const deltaLength = Math.abs(thisTextLength - lastTextLength);
|
const deltaLength = Math.abs(thisTextLength - lastTextLength);
|
||||||
if (10 * deltaLength < lastTextLength) {
|
if (10 * deltaLength < lastTextLength) {
|
||||||
@ -383,7 +381,7 @@ const handlePageLoad = () => {
|
|||||||
window.reportSnapshot(r);
|
window.reportSnapshot(r);
|
||||||
lastTextLength = thisTextLength;
|
lastTextLength = thisTextLength;
|
||||||
};
|
};
|
||||||
setInterval(handlePageLoad, 500);
|
setInterval(handlePageLoad, 800);
|
||||||
document.addEventListener('readystatechange', handlePageLoad);
|
document.addEventListener('readystatechange', handlePageLoad);
|
||||||
document.addEventListener('load', handlePageLoad);
|
document.addEventListener('load', handlePageLoad);
|
||||||
`);
|
`);
|
||||||
@ -495,49 +493,94 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const timeout = options?.timeoutMs || 30_000;
|
||||||
|
|
||||||
const gotoPromise = page.goto(url, {
|
const gotoPromise = page.goto(url, {
|
||||||
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
||||||
timeout: options?.timeoutMs || 30_000
|
timeout,
|
||||||
})
|
})
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
|
if (err instanceof TimeoutError) {
|
||||||
|
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) });
|
||||||
|
return new AssertionFailureError({
|
||||||
|
message: `Failed to goto ${url}: ${err}`,
|
||||||
|
cause: err,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) });
|
||||||
return Promise.reject(new AssertionFailureError({
|
return Promise.reject(new AssertionFailureError({
|
||||||
message: `Failed to goto ${url}: ${err}`,
|
message: `Failed to goto ${url}: ${err}`,
|
||||||
cause: err,
|
cause: err,
|
||||||
}));
|
}));
|
||||||
}).finally(async () => {
|
}).then(async (stuff) => {
|
||||||
if (!snapshot?.html) {
|
// This check is necessary because without snapshot, the condition of the page is unclear
|
||||||
|
// Calling evaluate directly may stall the process.
|
||||||
|
if (!snapshot) {
|
||||||
|
if (stuff instanceof Error) {
|
||||||
finalized = true;
|
finalized = true;
|
||||||
return;
|
throw stuff;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
try {
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot();
|
||||||
if ((!snapshot.title || !snapshot.parsed?.content) && !(snapshot.pdfs?.length)) {
|
} catch (err: any) {
|
||||||
|
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
|
||||||
|
if (stuff instanceof Error) {
|
||||||
|
finalized = true;
|
||||||
|
throw stuff;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!snapshot?.html) {
|
||||||
|
if (stuff instanceof Error) {
|
||||||
|
finalized = true;
|
||||||
|
throw stuff;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
|
||||||
const salvaged = await this.salvage(url, page);
|
const salvaged = await this.salvage(url, page);
|
||||||
if (salvaged) {
|
if (salvaged) {
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} catch (err: any) {
|
||||||
|
this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err: marshalErrorLike(err) });
|
||||||
|
}
|
||||||
|
|
||||||
finalized = true;
|
finalized = true;
|
||||||
|
if (snapshot?.html) {
|
||||||
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||||
this.emit(
|
this.emit(
|
||||||
'crawled',
|
'crawled',
|
||||||
{ ...snapshot, screenshot },
|
{ ...snapshot, screenshot },
|
||||||
{ ...options, url: parsedUrl }
|
{ ...options, url: parsedUrl }
|
||||||
);
|
);
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
let waitForPromise: Promise<any> | undefined;
|
||||||
if (options?.waitForSelector) {
|
if (options?.waitForSelector) {
|
||||||
const waitPromise = Array.isArray(options.waitForSelector) ? Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x))) : page.waitForSelector(options.waitForSelector);
|
const t0 = Date.now();
|
||||||
waitPromise
|
waitForPromise = nextSnapshotDeferred.promise.then(() => {
|
||||||
|
const t1 = Date.now();
|
||||||
|
const elapsed = t1 - t0;
|
||||||
|
const remaining = timeout - elapsed;
|
||||||
|
const thisTimeout = remaining > 100 ? remaining : 100;
|
||||||
|
const p = (Array.isArray(options.waitForSelector) ?
|
||||||
|
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
||||||
|
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
||||||
.then(async () => {
|
.then(async () => {
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot();
|
||||||
finalized = true;
|
finalized = true;
|
||||||
nextSnapshotDeferred.resolve(snapshot);
|
|
||||||
})
|
})
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
||||||
|
waitForPromise = undefined;
|
||||||
|
});
|
||||||
|
return p as any;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -545,12 +588,21 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
let lastHTML = snapshot?.html;
|
let lastHTML = snapshot?.html;
|
||||||
while (true) {
|
while (true) {
|
||||||
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
const ckpt = [nextSnapshotDeferred.promise, gotoPromise];
|
||||||
|
if (waitForPromise) {
|
||||||
|
ckpt.push(waitForPromise);
|
||||||
|
}
|
||||||
if (options?.minIntervalMs) {
|
if (options?.minIntervalMs) {
|
||||||
ckpt.push(delay(options.minIntervalMs));
|
ckpt.push(delay(options.minIntervalMs));
|
||||||
}
|
}
|
||||||
let error;
|
let error;
|
||||||
await Promise.race(ckpt).catch((err) => error = err);
|
await Promise.race(ckpt).catch((err) => error = err);
|
||||||
if (finalized && !error) {
|
if (finalized && !error) {
|
||||||
|
if (!snapshot && !screenshot) {
|
||||||
|
if (error) {
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
||||||
|
}
|
||||||
yield { ...snapshot, screenshot } as PageSnapshot;
|
yield { ...snapshot, screenshot } as PageSnapshot;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -566,7 +618,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
gotoPromise.finally(() => {
|
(waitForPromise ? Promise.allSettled([gotoPromise, waitForPromise]) : gotoPromise).finally(() => {
|
||||||
page.off('snapshot', hdl);
|
page.off('snapshot', hdl);
|
||||||
this.ditchPage(page);
|
this.ditchPage(page);
|
||||||
});
|
});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user