From ff595c2b4ca2b0000fe3213ea3af408c882c45ab Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Thu, 13 Mar 2025 15:55:12 +0800 Subject: [PATCH] improve: cache usage and detection of js-required pages --- src/api/crawler.ts | 26 +++++----- src/db/crawled.ts | 2 +- src/dto/crawler-options.ts | 7 ++- src/services/puppeteer.ts | 97 +++++++++++++++++++++++--------------- 4 files changed, 79 insertions(+), 53 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 7dc5e56..8d44297 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -509,7 +509,7 @@ export class CrawlerHost extends RPCHost { return digest; } - async queryCache(urlToCrawl: URL, cacheTolerance: number) { + async *queryCache(urlToCrawl: URL, cacheTolerance: number) { const digest = this.getUrlDigest(urlToCrawl); const cache = ( @@ -526,8 +526,10 @@ export class CrawlerHost extends RPCHost { })) )?.[0]; + yield cache; + if (!cache) { - return undefined; + return; } const age = Date.now() - cache.createdAt.valueOf(); @@ -561,7 +563,7 @@ export class CrawlerHost extends RPCHost { return undefined; } - return { + yield { isFresh: !stale, ...cache, snapshot: { @@ -585,7 +587,7 @@ export class CrawlerHost extends RPCHost { url: urlToCrawl.toString(), createdAt: nowDate, expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs), - htmlModifiedByJs: snapshot.htmlModifiedByJs, + htmlSignificantlyModifiedByJs: snapshot.htmlSignificantlyModifiedByJs, urlPathDigest: digest, }); @@ -726,19 +728,21 @@ export class CrawlerHost extends RPCHost { return; } - let cache; + const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs; + const cacheIt = this.queryCache(urlToCrawl, cacheTolerance); - if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) { - const cacheTolerance = crawlerOpts?.cacheTolerance ?? this.cacheValidMs; - cache = await this.queryCache(urlToCrawl, cacheTolerance); - } - - if (cache?.htmlModifiedByJs === false) { + let cache = (await cacheIt.next()).value; + if (cache?.htmlSignificantlyModifiedByJs === false) { if (crawlerOpts) { crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML; } } + if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) { + cache = (await cacheIt.next()).value; + } + cacheIt.return(undefined); + if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) && (_.get(cache.snapshot, 'locale') === crawlOpts?.locale) diff --git a/src/db/crawled.ts b/src/db/crawled.ts index ee5379f..100da62 100644 --- a/src/db/crawled.ts +++ b/src/db/crawled.ts @@ -22,7 +22,7 @@ export class Crawled extends FirestoreRecord { urlPathDigest!: string; @Prop() - htmlModifiedByJs?: boolean; + htmlSignificantlyModifiedByJs?: boolean; @Prop() snapshot?: PageSnapshot & { screenshot: never; pageshot: never; }; diff --git a/src/dto/crawler-options.ts b/src/dto/crawler-options.ts index a33471b..1199c9d 100644 --- a/src/dto/crawler-options.ts +++ b/src/dto/crawler-options.ts @@ -615,9 +615,6 @@ export class CrawlerOptions extends AutoCastable { return false; } const presumedTiming = this.presumedRespondTiming; - if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) { - return true; - } if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) { const now = Date.now(); if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) { @@ -636,10 +633,12 @@ export class CrawlerOptions extends AutoCastable { return true; } } - if (this.injectFrameScript?.length || this.injectPageScript?.length) { return false; } + if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) { + return true; + } if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) { return false; } diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 2f29f29..e35b587 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -6,8 +6,7 @@ import { container, singleton } from 'tsyringe'; import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer'; import type { Cookie } from 'set-cookie-parser'; -import puppeteer from 'puppeteer-extra'; -import { TimeoutError } from 'puppeteer'; +import puppeteer, { TimeoutError } from 'puppeteer'; import { Defer, Deferred } from 'civkit/defer'; import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc'; @@ -15,7 +14,6 @@ import { AsyncService } from 'civkit/async-service'; import { FancyFile } from 'civkit/fancy-file'; import { delay } from 'civkit/timeout'; -import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors'; import { CurlControl } from './curl'; import { BlackHoleDetector } from './blackhole-detector'; @@ -55,7 +53,7 @@ export interface PageSnapshot { href: string; rebase?: string; html: string; - htmlModifiedByJs?: boolean; + htmlSignificantlyModifiedByJs?: boolean; shadowExpanded?: string; text: string; status?: number; @@ -110,11 +108,6 @@ export interface ScrappingOptions { } -puppeteer.use(puppeteerBlockResources({ - blockedTypes: new Set(['media']), - interceptResolutionPriority: 1, -})); - const SIMULATE_SCROLL = ` (function () { function createIntersectionObserverEntry(target, isIntersecting, timestamp) { @@ -265,7 +258,7 @@ function briefImgs(elem) { }; }); } -function getMaxDepthAndCountUsingTreeWalker(root) { +function getMaxDepthAndElemCountUsingTreeWalker(root=document.documentElement) { let maxDepth = 0; let currentDepth = 0; let elementCount = 0; @@ -378,11 +371,10 @@ function shadowDomPresent(rootElement = document.documentElement) { } let lastMutationIdle = 0; -let initialHTML; +let initialAnalytics; document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now()); -function giveSnapshot(stopActiveSnapshot) { - initialHTML ??= document.documentElement?.outerHTML; +function giveSnapshot(stopActiveSnapshot, overrideDomAnalysis) { if (stopActiveSnapshot) { window.haltSnapshot = true; } @@ -392,13 +384,18 @@ function giveSnapshot(stopActiveSnapshot) { } catch (err) { void 0; } - const domAnalysis = getMaxDepthAndCountUsingTreeWalker(document.documentElement); + const domAnalysis = overrideDomAnalysis || getMaxDepthAndElemCountUsingTreeWalker(document.documentElement); + initialAnalytics ??= domAnalysis; + + const thisElemCount = domAnalysis.elementCount; + const initialElemCount = initialAnalytics.elementCount; + Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON) const r = { title: document.title, description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '', href: document.location.href, html: document.documentElement?.outerHTML, - htmlModifiedByJs: false, + htmlSignificantlyModifiedByJs: Boolean(Math.abs(thisElemCount - initialElemCount) / (initialElemCount + Number.EPSILON) > 0.1), text: document.body?.innerText, shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined, parsed: parsed, @@ -407,9 +404,6 @@ function giveSnapshot(stopActiveSnapshot) { elemCount: domAnalysis.elementCount, lastMutationIdle, }; - if (initialHTML) { - r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded; - } if (document.baseURI !== r.href) { r.rebase = document.baseURI; } @@ -446,6 +440,7 @@ function waitForSelector(selectorText) { }); }); } +window.getMaxDepthAndElemCountUsingTreeWalker = getMaxDepthAndElemCountUsingTreeWalker; window.waitForSelector = waitForSelector; window.giveSnapshot = giveSnapshot; window.briefImgs = briefImgs; @@ -566,7 +561,7 @@ export class PuppeteerControl extends AsyncService { } this.browser = await puppeteer.launch({ timeout: 10_000, - headless: true, + headless: false, executablePath: process.env.OVERRIDE_CHROME_EXECUTABLE_PATH, args: ['--disable-dev-shm-usage'] }).catch((err: any) => { @@ -735,23 +730,45 @@ export class PuppeteerControl extends AsyncService { await page.evaluateOnNewDocument(` (function () { if (window.self === window.top) { - let lastTextLength = 0; + let lastAnalytics; + let lastReportedAt = 0; const handlePageLoad = () => { - const thisTextLength = (document.body.innerText || '').length; - const deltaLength = Math.abs(thisTextLength - lastTextLength); - if (10 * deltaLength < lastTextLength) { - // Change is not significant - return; - } - lastTextLength = thisTextLength; + const now = Date.now(); + const dt = now - lastReportedAt; + const previousAnalytics = lastAnalytics; + const thisAnalytics = getMaxDepthAndElemCountUsingTreeWalker(); + let dElem = 0; + if (window.haltSnapshot) { return; } - const r = giveSnapshot(); + + const thisElemCount = thisAnalytics.elementCount; + if (previousAnalytics) { + const previousElemCount = previousAnalytics.elementCount; + + const delta = Math.abs(thisElemCount - previousElemCount); + dElem = delta /(previousElemCount + Number.EPSILON); + } + + if (dt < 1500 && dElem < 0.1) { + return; + } + + lastAnalytics = thisAnalytics; + lastReportedAt = now; + + const r = giveSnapshot(false, lastAnalytics); window.reportSnapshot(r); }; - document.addEventListener('readystatechange', handlePageLoad); + document.addEventListener('readystatechange', ()=> { + if (document.readyState === 'interactive') { + handlePageLoad(); + } + }); document.addEventListener('load', handlePageLoad); + window.addEventListener('load', handlePageLoad); + document.addEventListener('DOMContentLoaded', handlePageLoad); document.addEventListener('mutationIdle', handlePageLoad); } document.addEventListener('DOMContentLoaded', ()=> window.simulateScroll(), { once: true }); @@ -772,11 +789,13 @@ export class PuppeteerControl extends AsyncService { if (this.__loadedPage.length) { thePage = this.__loadedPage.shift(); if (this.__loadedPage.length <= 1) { - this.newPage() - .then((r) => this.__loadedPage.push(r)) - .catch((err) => { - this.logger.warn(`Failed to load new page ahead of time`, { err }); - }); + process.nextTick(() => { + this.newPage() + .then((r) => this.__loadedPage.push(r)) + .catch((err) => { + this.logger.warn(`Failed to load new page ahead of time`, { err }); + }); + }); } } @@ -860,6 +879,10 @@ export class PuppeteerControl extends AsyncService { return req.continue(overrides, 0); } const typ = req.resourceType(); + if (typ === 'media') { + // Non-cooperative answer to block all media requests. + return req.abort('blockedbyclient'); + } if (!options.proxyResources) { const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ); if (!isDocRequest) { @@ -925,7 +948,7 @@ export class PuppeteerControl extends AsyncService { status: curled.status, headers: _.omit(curled.headers, 'result'), contentType: curled.contentType, - }, 999); + }, 3); } const body = await readFile(await curled.file.filePath); if (req.isInterceptResolutionHandled()) { @@ -936,7 +959,7 @@ export class PuppeteerControl extends AsyncService { headers: _.omit(curled.headers, 'result'), contentType: curled.contentType, body: Uint8Array.from(body), - }, 999); + }, 3); } options.sideLoad ??= curled.sideLoadOpts; _.merge(options.sideLoad, curled.sideLoadOpts); @@ -945,7 +968,7 @@ export class PuppeteerControl extends AsyncService { return req.respond({ status: firstReq.result!.code, headers: _.omit(firstReq, 'result'), - }, 999); + }, 3); } catch (err: any) { this.logger.warn(`Failed to sideload browser request ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err, proxy }); }