mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-14 07:06:02 +08:00
improve: cache usage and detection of js-required pages
This commit is contained in:
parent
a5e3c2d281
commit
ff595c2b4c
@ -509,7 +509,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return digest;
|
return digest;
|
||||||
}
|
}
|
||||||
|
|
||||||
async queryCache(urlToCrawl: URL, cacheTolerance: number) {
|
async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
|
||||||
const digest = this.getUrlDigest(urlToCrawl);
|
const digest = this.getUrlDigest(urlToCrawl);
|
||||||
|
|
||||||
const cache = (
|
const cache = (
|
||||||
@ -526,8 +526,10 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}))
|
}))
|
||||||
)?.[0];
|
)?.[0];
|
||||||
|
|
||||||
|
yield cache;
|
||||||
|
|
||||||
if (!cache) {
|
if (!cache) {
|
||||||
return undefined;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const age = Date.now() - cache.createdAt.valueOf();
|
const age = Date.now() - cache.createdAt.valueOf();
|
||||||
@ -561,7 +563,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
yield {
|
||||||
isFresh: !stale,
|
isFresh: !stale,
|
||||||
...cache,
|
...cache,
|
||||||
snapshot: {
|
snapshot: {
|
||||||
@ -585,7 +587,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
url: urlToCrawl.toString(),
|
url: urlToCrawl.toString(),
|
||||||
createdAt: nowDate,
|
createdAt: nowDate,
|
||||||
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
|
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
|
||||||
htmlModifiedByJs: snapshot.htmlModifiedByJs,
|
htmlSignificantlyModifiedByJs: snapshot.htmlSignificantlyModifiedByJs,
|
||||||
urlPathDigest: digest,
|
urlPathDigest: digest,
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -726,19 +728,21 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let cache;
|
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
|
||||||
|
const cacheIt = this.queryCache(urlToCrawl, cacheTolerance);
|
||||||
|
|
||||||
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
let cache = (await cacheIt.next()).value;
|
||||||
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
|
if (cache?.htmlSignificantlyModifiedByJs === false) {
|
||||||
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cache?.htmlModifiedByJs === false) {
|
|
||||||
if (crawlerOpts) {
|
if (crawlerOpts) {
|
||||||
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
|
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
||||||
|
cache = (await cacheIt.next()).value;
|
||||||
|
}
|
||||||
|
cacheIt.return(undefined);
|
||||||
|
|
||||||
if (cache?.isFresh &&
|
if (cache?.isFresh &&
|
||||||
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
||||||
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
||||||
|
@ -22,7 +22,7 @@ export class Crawled extends FirestoreRecord {
|
|||||||
urlPathDigest!: string;
|
urlPathDigest!: string;
|
||||||
|
|
||||||
@Prop()
|
@Prop()
|
||||||
htmlModifiedByJs?: boolean;
|
htmlSignificantlyModifiedByJs?: boolean;
|
||||||
|
|
||||||
@Prop()
|
@Prop()
|
||||||
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
||||||
|
@ -615,9 +615,6 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const presumedTiming = this.presumedRespondTiming;
|
const presumedTiming = this.presumedRespondTiming;
|
||||||
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
|
if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
||||||
@ -636,10 +633,12 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -6,8 +6,7 @@ import { container, singleton } from 'tsyringe';
|
|||||||
|
|
||||||
import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer';
|
import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer';
|
||||||
import type { Cookie } from 'set-cookie-parser';
|
import type { Cookie } from 'set-cookie-parser';
|
||||||
import puppeteer from 'puppeteer-extra';
|
import puppeteer, { TimeoutError } from 'puppeteer';
|
||||||
import { TimeoutError } from 'puppeteer';
|
|
||||||
|
|
||||||
import { Defer, Deferred } from 'civkit/defer';
|
import { Defer, Deferred } from 'civkit/defer';
|
||||||
import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
|
import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
|
||||||
@ -15,7 +14,6 @@ import { AsyncService } from 'civkit/async-service';
|
|||||||
import { FancyFile } from 'civkit/fancy-file';
|
import { FancyFile } from 'civkit/fancy-file';
|
||||||
import { delay } from 'civkit/timeout';
|
import { delay } from 'civkit/timeout';
|
||||||
|
|
||||||
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|
||||||
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
|
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
|
||||||
import { CurlControl } from './curl';
|
import { CurlControl } from './curl';
|
||||||
import { BlackHoleDetector } from './blackhole-detector';
|
import { BlackHoleDetector } from './blackhole-detector';
|
||||||
@ -55,7 +53,7 @@ export interface PageSnapshot {
|
|||||||
href: string;
|
href: string;
|
||||||
rebase?: string;
|
rebase?: string;
|
||||||
html: string;
|
html: string;
|
||||||
htmlModifiedByJs?: boolean;
|
htmlSignificantlyModifiedByJs?: boolean;
|
||||||
shadowExpanded?: string;
|
shadowExpanded?: string;
|
||||||
text: string;
|
text: string;
|
||||||
status?: number;
|
status?: number;
|
||||||
@ -110,11 +108,6 @@ export interface ScrappingOptions {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
puppeteer.use(puppeteerBlockResources({
|
|
||||||
blockedTypes: new Set(['media']),
|
|
||||||
interceptResolutionPriority: 1,
|
|
||||||
}));
|
|
||||||
|
|
||||||
const SIMULATE_SCROLL = `
|
const SIMULATE_SCROLL = `
|
||||||
(function () {
|
(function () {
|
||||||
function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
|
function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
|
||||||
@ -265,7 +258,7 @@ function briefImgs(elem) {
|
|||||||
};
|
};
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
function getMaxDepthAndCountUsingTreeWalker(root) {
|
function getMaxDepthAndElemCountUsingTreeWalker(root=document.documentElement) {
|
||||||
let maxDepth = 0;
|
let maxDepth = 0;
|
||||||
let currentDepth = 0;
|
let currentDepth = 0;
|
||||||
let elementCount = 0;
|
let elementCount = 0;
|
||||||
@ -378,11 +371,10 @@ function shadowDomPresent(rootElement = document.documentElement) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let lastMutationIdle = 0;
|
let lastMutationIdle = 0;
|
||||||
let initialHTML;
|
let initialAnalytics;
|
||||||
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
||||||
|
|
||||||
function giveSnapshot(stopActiveSnapshot) {
|
function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
|
||||||
initialHTML ??= document.documentElement?.outerHTML;
|
|
||||||
if (stopActiveSnapshot) {
|
if (stopActiveSnapshot) {
|
||||||
window.haltSnapshot = true;
|
window.haltSnapshot = true;
|
||||||
}
|
}
|
||||||
@ -392,13 +384,18 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
} catch (err) {
|
} catch (err) {
|
||||||
void 0;
|
void 0;
|
||||||
}
|
}
|
||||||
const domAnalysis = getMaxDepthAndCountUsingTreeWalker(document.documentElement);
|
const domAnalysis = overrideDomAnalysis || getMaxDepthAndElemCountUsingTreeWalker(document.documentElement);
|
||||||
|
initialAnalytics ??= domAnalysis;
|
||||||
|
|
||||||
|
const thisElemCount = domAnalysis.elementCount;
|
||||||
|
const initialElemCount = initialAnalytics.elementCount;
|
||||||
|
Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON)
|
||||||
const r = {
|
const r = {
|
||||||
title: document.title,
|
title: document.title,
|
||||||
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
||||||
href: document.location.href,
|
href: document.location.href,
|
||||||
html: document.documentElement?.outerHTML,
|
html: document.documentElement?.outerHTML,
|
||||||
htmlModifiedByJs: false,
|
htmlSignificantlyModifiedByJs: Boolean(Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON) > 0.1),
|
||||||
text: document.body?.innerText,
|
text: document.body?.innerText,
|
||||||
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
||||||
parsed: parsed,
|
parsed: parsed,
|
||||||
@ -407,9 +404,6 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
elemCount: domAnalysis.elementCount,
|
elemCount: domAnalysis.elementCount,
|
||||||
lastMutationIdle,
|
lastMutationIdle,
|
||||||
};
|
};
|
||||||
if (initialHTML) {
|
|
||||||
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
|
|
||||||
}
|
|
||||||
if (document.baseURI !== r.href) {
|
if (document.baseURI !== r.href) {
|
||||||
r.rebase = document.baseURI;
|
r.rebase = document.baseURI;
|
||||||
}
|
}
|
||||||
@ -446,6 +440,7 @@ function waitForSelector(selectorText) {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
window.getMaxDepthAndElemCountUsingTreeWalker = getMaxDepthAndElemCountUsingTreeWalker;
|
||||||
window.waitForSelector = waitForSelector;
|
window.waitForSelector = waitForSelector;
|
||||||
window.giveSnapshot = giveSnapshot;
|
window.giveSnapshot = giveSnapshot;
|
||||||
window.briefImgs = briefImgs;
|
window.briefImgs = briefImgs;
|
||||||
@ -566,7 +561,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
this.browser = await puppeteer.launch({
|
this.browser = await puppeteer.launch({
|
||||||
timeout: 10_000,
|
timeout: 10_000,
|
||||||
headless: true,
|
headless: false,
|
||||||
executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
|
executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
|
||||||
args: ['--disable-dev-shm-usage']
|
args: ['--disable-dev-shm-usage']
|
||||||
}).catch((err: any) => {
|
}).catch((err: any) => {
|
||||||
@ -735,23 +730,45 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
await page.evaluateOnNewDocument(`
|
await page.evaluateOnNewDocument(`
|
||||||
(function () {
|
(function () {
|
||||||
if (window.self === window.top) {
|
if (window.self === window.top) {
|
||||||
let lastTextLength = 0;
|
let lastAnalytics;
|
||||||
|
let lastReportedAt = 0;
|
||||||
const handlePageLoad = () => {
|
const handlePageLoad = () => {
|
||||||
const thisTextLength = (document.body.innerText || '').length;
|
const now = Date.now();
|
||||||
const deltaLength = Math.abs(thisTextLength - lastTextLength);
|
const dt = now - lastReportedAt;
|
||||||
if (10 * deltaLength < lastTextLength) {
|
const previousAnalytics = lastAnalytics;
|
||||||
// Change is not significant
|
const thisAnalytics = getMaxDepthAndElemCountUsingTreeWalker();
|
||||||
return;
|
let dElem = 0;
|
||||||
}
|
|
||||||
lastTextLength = thisTextLength;
|
|
||||||
if (window.haltSnapshot) {
|
if (window.haltSnapshot) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const r = giveSnapshot();
|
|
||||||
|
const thisElemCount = thisAnalytics.elementCount;
|
||||||
|
if (previousAnalytics) {
|
||||||
|
const previousElemCount = previousAnalytics.elementCount;
|
||||||
|
|
||||||
|
const delta = Math.abs(thisElemCount - previousElemCount);
|
||||||
|
dElem = delta /(previousElemCount + Number.EPSILON);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dt < 1500 && dElem < 0.1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
lastAnalytics = thisAnalytics;
|
||||||
|
lastReportedAt = now;
|
||||||
|
|
||||||
|
const r = giveSnapshot(false, lastAnalytics);
|
||||||
window.reportSnapshot(r);
|
window.reportSnapshot(r);
|
||||||
};
|
};
|
||||||
document.addEventListener('readystatechange', handlePageLoad);
|
document.addEventListener('readystatechange', ()=> {
|
||||||
|
if (document.readyState === 'interactive') {
|
||||||
|
handlePageLoad();
|
||||||
|
}
|
||||||
|
});
|
||||||
document.addEventListener('load', handlePageLoad);
|
document.addEventListener('load', handlePageLoad);
|
||||||
|
window.addEventListener('load', handlePageLoad);
|
||||||
|
document.addEventListener('DOMContentLoaded', handlePageLoad);
|
||||||
document.addEventListener('mutationIdle', handlePageLoad);
|
document.addEventListener('mutationIdle', handlePageLoad);
|
||||||
}
|
}
|
||||||
document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true });
|
document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true });
|
||||||
@ -772,11 +789,13 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
if (this.__loadedPage.length) {
|
if (this.__loadedPage.length) {
|
||||||
thePage = this.__loadedPage.shift();
|
thePage = this.__loadedPage.shift();
|
||||||
if (this.__loadedPage.length <= 1) {
|
if (this.__loadedPage.length <= 1) {
|
||||||
this.newPage()
|
process.nextTick(() => {
|
||||||
.then((r) => this.__loadedPage.push(r))
|
this.newPage()
|
||||||
.catch((err) => {
|
.then((r) => this.__loadedPage.push(r))
|
||||||
this.logger.warn(`Failed to load new page ahead of time`, { err });
|
.catch((err) => {
|
||||||
});
|
this.logger.warn(`Failed to load new page ahead of time`, { err });
|
||||||
|
});
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -860,6 +879,10 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
return req.continue(overrides, 0);
|
return req.continue(overrides, 0);
|
||||||
}
|
}
|
||||||
const typ = req.resourceType();
|
const typ = req.resourceType();
|
||||||
|
if (typ === 'media') {
|
||||||
|
// Non-cooperative answer to block all media requests.
|
||||||
|
return req.abort('blockedbyclient');
|
||||||
|
}
|
||||||
if (!options.proxyResources) {
|
if (!options.proxyResources) {
|
||||||
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
|
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
|
||||||
if (!isDocRequest) {
|
if (!isDocRequest) {
|
||||||
@ -925,7 +948,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
status: curled.status,
|
status: curled.status,
|
||||||
headers: _.omit(curled.headers, 'result'),
|
headers: _.omit(curled.headers, 'result'),
|
||||||
contentType: curled.contentType,
|
contentType: curled.contentType,
|
||||||
}, 999);
|
}, 3);
|
||||||
}
|
}
|
||||||
const body = await readFile(await curled.file.filePath);
|
const body = await readFile(await curled.file.filePath);
|
||||||
if (req.isInterceptResolutionHandled()) {
|
if (req.isInterceptResolutionHandled()) {
|
||||||
@ -936,7 +959,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
headers: _.omit(curled.headers, 'result'),
|
headers: _.omit(curled.headers, 'result'),
|
||||||
contentType: curled.contentType,
|
contentType: curled.contentType,
|
||||||
body: Uint8Array.from(body),
|
body: Uint8Array.from(body),
|
||||||
}, 999);
|
}, 3);
|
||||||
}
|
}
|
||||||
options.sideLoad ??= curled.sideLoadOpts;
|
options.sideLoad ??= curled.sideLoadOpts;
|
||||||
_.merge(options.sideLoad, curled.sideLoadOpts);
|
_.merge(options.sideLoad, curled.sideLoadOpts);
|
||||||
@ -945,7 +968,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
return req.respond({
|
return req.respond({
|
||||||
status: firstReq.result!.code,
|
status: firstReq.result!.code,
|
||||||
headers: _.omit(firstReq, 'result'),
|
headers: _.omit(firstReq, 'result'),
|
||||||
}, 999);
|
}, 3);
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
|
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user