improve: cache usage and detection of js-required pages

This commit is contained in:
yanlong.wang 2025-03-13 15:55:12 +08:00
parent a5e3c2d281
commit ff595c2b4c
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 79 additions and 53 deletions

View File

@ -509,7 +509,7 @@ export class CrawlerHost extends RPCHost {
return digest;
}
async queryCache(urlToCrawl: URL, cacheTolerance: number) {
async *queryCache(urlToCrawl: URL, cacheTolerance: number) {
const digest = this.getUrlDigest(urlToCrawl);
const cache = (
@ -526,8 +526,10 @@ export class CrawlerHost extends RPCHost {
}))
)?.[0];
yield cache;
if (!cache) {
return undefined;
return;
}
const age = Date.now() - cache.createdAt.valueOf();
@ -561,7 +563,7 @@ export class CrawlerHost extends RPCHost {
return undefined;
}
return {
yield {
isFresh: !stale,
...cache,
snapshot: {
@ -585,7 +587,7 @@ export class CrawlerHost extends RPCHost {
url: urlToCrawl.toString(),
createdAt: nowDate,
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
htmlModifiedByJs: snapshot.htmlModifiedByJs,
htmlSignificantlyModifiedByJs: snapshot.htmlSignificantlyModifiedByJs,
urlPathDigest: digest,
});
@ -726,19 +728,21 @@ export class CrawlerHost extends RPCHost {
return;
}
let cache;
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
const cacheIt = this.queryCache(urlToCrawl, cacheTolerance);
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs;
cache = await this.queryCache(urlToCrawl, cacheTolerance);
}
if (cache?.htmlModifiedByJs === false) {
let cache = (await cacheIt.next()).value;
if (cache?.htmlSignificantlyModifiedByJs === false) {
if (crawlerOpts) {
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
}
}
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
cache = (await cacheIt.next()).value;
}
cacheIt.return(undefined);
if (cache?.isFresh &&
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)

View File

@ -22,7 +22,7 @@ export class Crawled extends FirestoreRecord {
urlPathDigest!: string;
@Prop()
htmlModifiedByJs?: boolean;
htmlSignificantlyModifiedByJs?: boolean;
@Prop()
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };

View File

@ -615,9 +615,6 @@ export class CrawlerOptions extends AutoCastable {
return false;
}
const presumedTiming = this.presumedRespondTiming;
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
return true;
}
if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
const now = Date.now();
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
@ -636,10 +633,12 @@ export class CrawlerOptions extends AutoCastable {
return true;
}
}
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
return false;
}
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
return true;
}
if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
return false;
}

View File

@ -6,8 +6,7 @@ import { container, singleton } from 'tsyringe';
import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer';
import type { Cookie } from 'set-cookie-parser';
import puppeteer from 'puppeteer-extra';
import { TimeoutError } from 'puppeteer';
import puppeteer, { TimeoutError } from 'puppeteer';
import { Defer, Deferred } from 'civkit/defer';
import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
@ -15,7 +14,6 @@ import { AsyncService } from 'civkit/async-service';
import { FancyFile } from 'civkit/fancy-file';
import { delay } from 'civkit/timeout';
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
import { CurlControl } from './curl';
import { BlackHoleDetector } from './blackhole-detector';
@ -55,7 +53,7 @@ export interface PageSnapshot {
href: string;
rebase?: string;
html: string;
htmlModifiedByJs?: boolean;
htmlSignificantlyModifiedByJs?: boolean;
shadowExpanded?: string;
text: string;
status?: number;
@ -110,11 +108,6 @@ export interface ScrappingOptions {
}
puppeteer.use(puppeteerBlockResources({
blockedTypes: new Set(['media']),
interceptResolutionPriority: 1,
}));
const SIMULATE_SCROLL = `
(function () {
function createIntersectionObserverEntry(target, isIntersecting, timestamp) {
@ -265,7 +258,7 @@ function briefImgs(elem) {
};
});
}
function getMaxDepthAndCountUsingTreeWalker(root) {
function getMaxDepthAndElemCountUsingTreeWalker(root=document.documentElement) {
let maxDepth = 0;
let currentDepth = 0;
let elementCount = 0;
@ -378,11 +371,10 @@ function shadowDomPresent(rootElement = document.documentElement) {
}
let lastMutationIdle = 0;
let initialHTML;
let initialAnalytics;
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
function giveSnapshot(stopActiveSnapshot) {
initialHTML ??= document.documentElement?.outerHTML;
function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) {
if (stopActiveSnapshot) {
window.haltSnapshot = true;
}
@ -392,13 +384,18 @@ function giveSnapshot(stopActiveSnapshot) {
} catch (err) {
void 0;
}
const domAnalysis = getMaxDepthAndCountUsingTreeWalker(document.documentElement);
const domAnalysis = overrideDomAnalysis || getMaxDepthAndElemCountUsingTreeWalker(document.documentElement);
initialAnalytics ??= domAnalysis;
const thisElemCount = domAnalysis.elementCount;
const initialElemCount = initialAnalytics.elementCount;
Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON)
const r = {
title: document.title,
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
href: document.location.href,
html: document.documentElement?.outerHTML,
htmlModifiedByJs: false,
htmlSignificantlyModifiedByJs: Boolean(Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON) > 0.1),
text: document.body?.innerText,
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
parsed: parsed,
@ -407,9 +404,6 @@ function giveSnapshot(stopActiveSnapshot) {
elemCount: domAnalysis.elementCount,
lastMutationIdle,
};
if (initialHTML) {
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
}
if (document.baseURI !== r.href) {
r.rebase = document.baseURI;
}
@ -446,6 +440,7 @@ function waitForSelector(selectorText) {
});
});
}
window.getMaxDepthAndElemCountUsingTreeWalker = getMaxDepthAndElemCountUsingTreeWalker;
window.waitForSelector = waitForSelector;
window.giveSnapshot = giveSnapshot;
window.briefImgs = briefImgs;
@ -566,7 +561,7 @@ export class PuppeteerControl extends AsyncService {
}
this.browser = await puppeteer.launch({
timeout: 10_000,
headless: true,
headless: false,
executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH,
args: ['--disable-dev-shm-usage']
}).catch((err: any) => {
@ -735,23 +730,45 @@ export class PuppeteerControl extends AsyncService {
await page.evaluateOnNewDocument(`
(function () {
if (window.self === window.top) {
let lastTextLength = 0;
let lastAnalytics;
let lastReportedAt = 0;
const handlePageLoad = () => {
const thisTextLength = (document.body.innerText || '').length;
const deltaLength = Math.abs(thisTextLength - lastTextLength);
if (10 * deltaLength < lastTextLength) {
// Change is not significant
return;
}
lastTextLength = thisTextLength;
const now = Date.now();
const dt = now - lastReportedAt;
const previousAnalytics = lastAnalytics;
const thisAnalytics = getMaxDepthAndElemCountUsingTreeWalker();
let dElem = 0;
if (window.haltSnapshot) {
return;
}
const r = giveSnapshot();
const thisElemCount = thisAnalytics.elementCount;
if (previousAnalytics) {
const previousElemCount = previousAnalytics.elementCount;
const delta = Math.abs(thisElemCount - previousElemCount);
dElem = delta /(previousElemCount + Number.EPSILON);
}
if (dt < 1500 && dElem < 0.1) {
return;
}
lastAnalytics = thisAnalytics;
lastReportedAt = now;
const r = giveSnapshot(false, lastAnalytics);
window.reportSnapshot(r);
};
document.addEventListener('readystatechange', handlePageLoad);
document.addEventListener('readystatechange', ()=> {
if (document.readyState === 'interactive') {
handlePageLoad();
}
});
document.addEventListener('load', handlePageLoad);
window.addEventListener('load', handlePageLoad);
document.addEventListener('DOMContentLoaded', handlePageLoad);
document.addEventListener('mutationIdle', handlePageLoad);
}
document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true });
@ -772,11 +789,13 @@ export class PuppeteerControl extends AsyncService {
if (this.__loadedPage.length) {
thePage = this.__loadedPage.shift();
if (this.__loadedPage.length <= 1) {
this.newPage()
.then((r) => this.__loadedPage.push(r))
.catch((err) => {
this.logger.warn(`Failed to load new page ahead of time`, { err });
});
process.nextTick(() => {
this.newPage()
.then((r) => this.__loadedPage.push(r))
.catch((err) => {
this.logger.warn(`Failed to load new page ahead of time`, { err });
});
});
}
}
@ -860,6 +879,10 @@ export class PuppeteerControl extends AsyncService {
return req.continue(overrides, 0);
}
const typ = req.resourceType();
if (typ === 'media') {
// Non-cooperative answer to block all media requests.
return req.abort('blockedbyclient');
}
if (!options.proxyResources) {
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
if (!isDocRequest) {
@ -925,7 +948,7 @@ export class PuppeteerControl extends AsyncService {
status: curled.status,
headers: _.omit(curled.headers, 'result'),
contentType: curled.contentType,
}, 999);
}, 3);
}
const body = await readFile(await curled.file.filePath);
if (req.isInterceptResolutionHandled()) {
@ -936,7 +959,7 @@ export class PuppeteerControl extends AsyncService {
headers: _.omit(curled.headers, 'result'),
contentType: curled.contentType,
body: Uint8Array.from(body),
}, 999);
}, 3);
}
options.sideLoad ??= curled.sideLoadOpts;
_.merge(options.sideLoad, curled.sideLoadOpts);
@ -945,7 +968,7 @@ export class PuppeteerControl extends AsyncService {
return req.respond({
status: firstReq.result!.code,
headers: _.omit(firstReq, 'result'),
}, 999);
}, 3);
} catch (err: any) {
this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy });
}