improve: cache usage and detection of js-required pages

This commit is contained in:
yanlong.wang 2025-03-13 15:55:12 +08:00
parent a5e3c2d281
commit ff595c2b4c
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 79 additions and 53 deletions

View File

@ -509,7 +509,7 @@ export class CrawlerHost extends RPCHost {
return digest; return digest;
} }
async queryCache(urlToCrawl: URL, cacheTolerance: number) { async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
const digest = this.getUrlDigest(urlToCrawl); const digest = this.getUrlDigest(urlToCrawl);
const cache = ( const cache = (
@ -526,8 +526,10 @@ export class CrawlerHost extends RPCHost {
})) }))
)?.[0]; )?.[0];
yield cache;
if (!cache) { if (!cache) {
return undefined; return;
} }
const age = Date.now() - cache.createdAt.valueOf(); const age = Date.now() - cache.createdAt.valueOf();
@ -561,7 +563,7 @@ export class CrawlerHost extends RPCHost {
return undefined; return undefined;
} }
return { yield {
isFresh: !stale, isFresh: !stale,
...cache, ...cache,
snapshot: { snapshot: {
@ -585,7 +587,7 @@ export class CrawlerHost extends RPCHost {
url: urlToCrawl.toString(), url: urlToCrawl.toString(),
createdAt: nowDate, createdAt: nowDate,
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs), expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
htmlModifiedByJs: snapshot.htmlModifiedByJs, htmlSignificantlyModifiedByJs: snapshot.htmlSignificantlyModifiedByJs,
urlPathDigest: digest, urlPathDigest: digest,
}); });
@ -726,19 +728,21 @@ export class CrawlerHost extends RPCHost {
return; return;
} }
let cache; const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
const cacheIt = this.queryCache(urlToCrawl, cacheTolerance);
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) { let cache = (await cacheIt.next()).value;
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs; if (cache?.htmlSignificantlyModifiedByJs === false) {
cache = await this.queryCache(urlToCrawl, cacheTolerance);
}
if (cache?.htmlModifiedByJs === false) {
if (crawlerOpts) { if (crawlerOpts) {
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML; crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
} }
} }
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
cache = (await cacheIt.next()).value;
}
cacheIt.return(undefined);
if (cache?.isFresh && if (cache?.isFresh &&
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale) (_.get(cache.snapshot, 'locale') === crawlOpts?.locale)

View File

@ -22,7 +22,7 @@ export class Crawled extends FirestoreRecord {
urlPathDigest!: string; urlPathDigest!: string;
@Prop() @Prop()
htmlModifiedByJs?: boolean; htmlSignificantlyModifiedByJs?: boolean;
@Prop() @Prop()
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; }; snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };

View File

@ -615,9 +615,6 @@ export class CrawlerOptions extends AutoCastable {
return false; return false;
} }
const presumedTiming = this.presumedRespondTiming; const presumedTiming = this.presumedRespondTiming;
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
return true;
}
if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) { if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
const now = Date.now(); const now = Date.now();
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) { if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
@ -636,10 +633,12 @@ export class CrawlerOptions extends AutoCastable {
return true; return true;
} }
} }
if (this.injectFrameScript?.length || this.injectPageScript?.length) { if (this.injectFrameScript?.length || this.injectPageScript?.length) {
return false; return false;
} }
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
return true;
}
if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) { if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
return false; return false;
} }

View File

@ -6,8 +6,7 @@ import { container, singleton } from 'tsyringe';
import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer'; import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer';
import type { Cookie } from 'set-cookie-parser'; import type { Cookie } from 'set-cookie-parser';
import puppeteer from 'puppeteer-extra'; import puppeteer, { TimeoutError } from 'puppeteer';
import { TimeoutError } from 'puppeteer';
import { Defer, Deferred } from 'civkit/defer'; import { Defer, Deferred } from 'civkit/defer';
import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc'; import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
@ -15,7 +14,6 @@ import { AsyncService } from 'civkit/async-service';
import { FancyFile } from 'civkit/fancy-file'; import { FancyFile } from 'civkit/fancy-file';
import { delay } from 'civkit/timeout'; import { delay } from 'civkit/timeout';
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors'; import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
import { CurlControl } from './curl'; import { CurlControl } from './curl';
import { BlackHoleDetector } from './blackhole-detector'; import { BlackHoleDetector } from './blackhole-detector';
@ -55,7 +53,7 @@ export interface PageSnapshot {
href: string; href: string;
rebase?: string; rebase?: string;
html: string; html: string;
htmlModifiedByJs?: boolean; htmlSignificantlyModifiedByJs?: boolean;
shadowExpanded?: string; shadowExpanded?: string;
text: string; text: string;
status?: number; status?: number;
@ -110,11 +108,6 @@ export interface ScrappingOptions {
} }
puppeteer.use(puppeteerBlockResources({
blockedTypes: new Set(['media']),
interceptResolutionPriority: 1,
}));
const SIMULATE_SCROLL = ` const SIMULATE_SCROLL = `
(function () { (function () {
function createIntersectionObserverEntry(target, isIntersecting, timestamp) { function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
@ -265,7 +258,7 @@ function briefImgs(elem) {
}; };
}); });
} }
function getMaxDepthAndCountUsingTreeWalker(root) { function getMaxDepthAndElemCountUsingTreeWalker(root=document.documentElement) {
let maxDepth = 0; let maxDepth = 0;
let currentDepth = 0; let currentDepth = 0;
let elementCount = 0; let elementCount = 0;
@ -378,11 +371,10 @@ function shadowDomPresent(rootElement = document.documentElement) {
} }
let lastMutationIdle = 0; let lastMutationIdle = 0;
let initialHTML; let initialAnalytics;
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now()); document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
function giveSnapshot(stopActiveSnapshot) { function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
initialHTML ??= document.documentElement?.outerHTML;
if (stopActiveSnapshot) { if (stopActiveSnapshot) {
window.haltSnapshot = true; window.haltSnapshot = true;
} }
@ -392,13 +384,18 @@ function giveSnapshot(stopActiveSnapshot) {
} catch (err) { } catch (err) {
void 0; void 0;
} }
const domAnalysis = getMaxDepthAndCountUsingTreeWalker(document.documentElement); const domAnalysis = overrideDomAnalysis || getMaxDepthAndElemCountUsingTreeWalker(document.documentElement);
initialAnalytics ??= domAnalysis;
const thisElemCount = domAnalysis.elementCount;
const initialElemCount = initialAnalytics.elementCount;
Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON)
const r = { const r = {
title: document.title, title: document.title,
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '', description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
href: document.location.href, href: document.location.href,
html: document.documentElement?.outerHTML, html: document.documentElement?.outerHTML,
htmlModifiedByJs: false, htmlSignificantlyModifiedByJs: Boolean(Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON) > 0.1),
text: document.body?.innerText, text: document.body?.innerText,
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined, shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
parsed: parsed, parsed: parsed,
@ -407,9 +404,6 @@ function giveSnapshot(stopActiveSnapshot) {
elemCount: domAnalysis.elementCount, elemCount: domAnalysis.elementCount,
lastMutationIdle, lastMutationIdle,
}; };
if (initialHTML) {
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
}
if (document.baseURI !== r.href) { if (document.baseURI !== r.href) {
r.rebase = document.baseURI; r.rebase = document.baseURI;
} }
@ -446,6 +440,7 @@ function waitForSelector(selectorText) {
}); });
}); });
} }
window.getMaxDepthAndElemCountUsingTreeWalker = getMaxDepthAndElemCountUsingTreeWalker;
window.waitForSelector = waitForSelector; window.waitForSelector = waitForSelector;
window.giveSnapshot = giveSnapshot; window.giveSnapshot = giveSnapshot;
window.briefImgs = briefImgs; window.briefImgs = briefImgs;
@ -566,7 +561,7 @@ export class PuppeteerControl extends AsyncService {
} }
this.browser = await puppeteer.launch({ this.browser = await puppeteer.launch({
timeout: 10_000, timeout: 10_000,
headless: true, headless: false,
executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH, executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
args: ['--disable-dev-shm-usage'] args: ['--disable-dev-shm-usage']
}).catch((err: any) => { }).catch((err: any) => {
@ -735,23 +730,45 @@ export class PuppeteerControl extends AsyncService {
await page.evaluateOnNewDocument(` await page.evaluateOnNewDocument(`
(function () { (function () {
if (window.self === window.top) { if (window.self === window.top) {
let lastTextLength = 0; let lastAnalytics;
let lastReportedAt = 0;
const handlePageLoad = () => { const handlePageLoad = () => {
const thisTextLength = (document.body.innerText || '').length; const now = Date.now();
const deltaLength = Math.abs(thisTextLength - lastTextLength); const dt = now - lastReportedAt;
if (10 * deltaLength < lastTextLength) { const previousAnalytics = lastAnalytics;
// Change is not significant const thisAnalytics = getMaxDepthAndElemCountUsingTreeWalker();
return; let dElem = 0;
}
lastTextLength = thisTextLength;
if (window.haltSnapshot) { if (window.haltSnapshot) {
return; return;
} }
const r = giveSnapshot();
const thisElemCount = thisAnalytics.elementCount;
if (previousAnalytics) {
const previousElemCount = previousAnalytics.elementCount;
const delta = Math.abs(thisElemCount - previousElemCount);
dElem = delta /(previousElemCount + Number.EPSILON);
}
if (dt < 1500 && dElem < 0.1) {
return;
}
lastAnalytics = thisAnalytics;
lastReportedAt = now;
const r = giveSnapshot(false, lastAnalytics);
window.reportSnapshot(r); window.reportSnapshot(r);
}; };
document.addEventListener('readystatechange', handlePageLoad); document.addEventListener('readystatechange', ()=> {
if (document.readyState === 'interactive') {
handlePageLoad();
}
});
document.addEventListener('load', handlePageLoad); document.addEventListener('load', handlePageLoad);
window.addEventListener('load', handlePageLoad);
document.addEventListener('DOMContentLoaded', handlePageLoad);
document.addEventListener('mutationIdle', handlePageLoad); document.addEventListener('mutationIdle', handlePageLoad);
} }
document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true }); document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true });
@ -772,11 +789,13 @@ export class PuppeteerControl extends AsyncService {
if (this.__loadedPage.length) { if (this.__loadedPage.length) {
thePage = this.__loadedPage.shift(); thePage = this.__loadedPage.shift();
if (this.__loadedPage.length <= 1) { if (this.__loadedPage.length <= 1) {
this.newPage() process.nextTick(() => {
.then((r) => this.__loadedPage.push(r)) this.newPage()
.catch((err) => { .then((r) => this.__loadedPage.push(r))
this.logger.warn(`Failed to load new page ahead of time`, { err }); .catch((err) => {
}); this.logger.warn(`Failed to load new page ahead of time`, { err });
});
});
} }
} }
@ -860,6 +879,10 @@ export class PuppeteerControl extends AsyncService {
return req.continue(overrides, 0); return req.continue(overrides, 0);
} }
const typ = req.resourceType(); const typ = req.resourceType();
if (typ === 'media') {
// Non-cooperative answer to block all media requests.
return req.abort('blockedbyclient');
}
if (!options.proxyResources) { if (!options.proxyResources) {
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ); const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
if (!isDocRequest) { if (!isDocRequest) {
@ -925,7 +948,7 @@ export class PuppeteerControl extends AsyncService {
status: curled.status, status: curled.status,
headers: _.omit(curled.headers, 'result'), headers: _.omit(curled.headers, 'result'),
contentType: curled.contentType, contentType: curled.contentType,
}, 999); }, 3);
} }
const body = await readFile(await curled.file.filePath); const body = await readFile(await curled.file.filePath);
if (req.isInterceptResolutionHandled()) { if (req.isInterceptResolutionHandled()) {
@ -936,7 +959,7 @@ export class PuppeteerControl extends AsyncService {
headers: _.omit(curled.headers, 'result'), headers: _.omit(curled.headers, 'result'),
contentType: curled.contentType, contentType: curled.contentType,
body: Uint8Array.from(body), body: Uint8Array.from(body),
}, 999); }, 3);
} }
options.sideLoad ??= curled.sideLoadOpts; options.sideLoad ??= curled.sideLoadOpts;
_.merge(options.sideLoad, curled.sideLoadOpts); _.merge(options.sideLoad, curled.sideLoadOpts);
@ -945,7 +968,7 @@ export class PuppeteerControl extends AsyncService {
return req.respond({ return req.respond({
status: firstReq.result!.code, status: firstReq.result!.code,
headers: _.omit(firstReq, 'result'), headers: _.omit(firstReq, 'result'),
}, 999); }, 3);
} catch (err: any) { } catch (err: any) {
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy }); this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
} }