From f7dbadffb714268a626c06e829f4ca8947ba81f0 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Wed, 12 Mar 2025 18:04:22 +0800 Subject: [PATCH] behavior change: ditch content based return timing, adopt mutationIdle as default timing --- src/api/crawler.ts | 83 ++++++-------------------------------- src/dto/crawler-options.ts | 77 ++++++++++++++++++++++++++++++----- src/services/puppeteer.ts | 45 ++++++++++++++++----- 3 files changed, 116 insertions(+), 89 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 2911b9e..6dfdae4 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -19,7 +19,6 @@ import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } import { Crawled } from '../db/crawled'; import { DomainBlockade } from '../db/domain-blockade'; -import { DomainProfile } from '../db/domain-profile'; import { OutputServerEventStream } from '../lib/transform-server-event-stream'; import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; @@ -317,6 +316,9 @@ export class CrawlerHost extends RPCHost { if (crawlerOptions.robotsTxt) { await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt); } + if (rpcReflect.signal.aborted) { + return; + } if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) { const sseStream = new OutputServerEventStream(); rpcReflect.return(sseStream); @@ -363,10 +365,7 @@ export class CrawlerHost extends RPCHost { if (rpcReflect.signal.aborted) { break; } - if (!crawlerOptions.isEarlyReturnApplicable()) { - continue; - } - if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) { + if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) { continue; } @@ -412,11 +411,7 @@ export class CrawlerHost extends RPCHost { if (rpcReflect.signal.aborted) { break; } - if (!crawlerOptions.isEarlyReturnApplicable()) { - continue; - } - - if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) { + if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) { continue; } @@ -427,13 +422,11 @@ export class CrawlerHost extends RPCHost { } if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { - return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } ); } if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { - return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } } ); @@ -705,7 +698,11 @@ export class CrawlerHost extends RPCHost { return; } - if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) { + if ( + crawlOpts?.engine === ENGINE_TYPE.CURL || + // deprecated name + crawlOpts?.engine === 'direct' + ) { const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ? await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) : await this.curlControl.sideLoad(urlToCrawl, crawlOpts); @@ -779,6 +776,7 @@ export class CrawlerHost extends RPCHost { let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html); draftSnapshot.title ??= analyzed.title; + draftSnapshot.isIntermediate = true; let fallbackProxyIsUsed = false; if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) && (analyzed.tokens < 42 || sideLoaded.status !== 200) @@ -798,6 +796,7 @@ export class CrawlerHost extends RPCHost { analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html); if (proxyLoaded.status === 200 || analyzed.tokens >= 200) { draftSnapshot = proxySnapshot; + draftSnapshot.isIntermediate = true; sideLoaded = proxyLoaded; fallbackProxyIsUsed = true; } @@ -986,7 +985,7 @@ export class CrawlerHost extends RPCHost { crawlOpts.extraHeaders['Accept-Language'] = opts.locale; } - if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) { + if (opts.respondWith.includes(CONTENT_FORMAT.VLM)) { crawlOpts.favorScreenshot = true; } @@ -1142,62 +1141,6 @@ export class CrawlerHost extends RPCHost { return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs); } - async exploreDirectEngine(knownSnapshot: PageSnapshot) { - const realUrl = new URL(knownSnapshot.href); - const { digest, path } = this.getDomainProfileUrlDigest(realUrl); - const profile = await DomainProfile.fromFirestore(digest); - - if (!profile) { - const record = DomainProfile.from({ - _id: digest, - origin: realUrl.origin.toLowerCase(), - path, - triggerUrl: realUrl.href, - engine: knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT, - createdAt: new Date(), - expireAt: new Date(Date.now() + this.domainProfileRetentionMs), - }); - await DomainProfile.save(record); - - return; - } - - if (profile.engine === ENGINE_TYPE.BROWSER) { - // Mixed engine, always use browser - return; - } - - profile.origin = realUrl.origin.toLowerCase(); - profile.triggerUrl = realUrl.href; - profile.path = path; - profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT; - profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs); - - await DomainProfile.save(profile); - - return; - } - - async snapshotNotGoodEnough(snapshot: PageSnapshot) { - if (snapshot.pdfs?.length) { - return false; - } - if (!snapshot.title) { - return true; - } - if (snapshot.parsed?.content) { - return false; - } - if (snapshot.html) { - const r = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html); - const tokens = r.tokens; - if (tokens < 200) { - return true; - } - } - return false; - } - getDomainProfileUrlDigest(url: URL) { const pathname = url.pathname; const pathVec = pathname.split('/'); diff --git a/src/dto/crawler-options.ts b/src/dto/crawler-options.ts index 2de8cc1..9d774fb 100644 --- a/src/dto/crawler-options.ts +++ b/src/dto/crawler-options.ts @@ -3,6 +3,7 @@ import { FancyFile } from 'civkit/fancy-file'; import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser'; import { Context } from '../services/registry'; import { TurnDownTweakableOptions } from './turndown-tweakable-options'; +import type { PageSnapshot } from '../services/puppeteer'; export enum CONTENT_FORMAT { CONTENT = 'content', @@ -18,12 +19,18 @@ export enum CONTENT_FORMAT { export enum ENGINE_TYPE { AUTO = 'auto', BROWSER = 'browser', - DIRECT = 'direct', - VLM = 'vlm', - READER_LM = 'readerlm-v2', + CURL = 'curl', CF_BROWSER_RENDERING = 'cf-browser-rendering', } +export enum RESPOND_TIMING { + HTML = 'html', + MUTATION_IDLE = 'mutation-idle', + RESOURCE_IDLE = 'resource-idle', + MEDIA_IDLE = 'media-idle', + NETWORK_IDLE = 'network-idle', +} + const CONTENT_FORMAT_VALUES = new Set(Object.values(CONTENT_FORMAT)); export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const; @@ -213,6 +220,15 @@ class Viewport extends AutoCastable { in: 'header', schema: { type: 'string' } }, + 'X-Respond-Timing': { + description: `Explicitly specify the respond timing. One of the following:\n\n` + + `- html: unrendered HTML is enough to return\n` + + `- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` + + `- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading for at least 0.5s\n` + + `- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading for at least 0.5s\n\n`, + in: 'header', + schema: { type: 'string' } + }, 'X-Engine': { description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering', in: 'header', @@ -405,6 +421,11 @@ export class CrawlerOptions extends AutoCastable { @Prop() markdown?: TurnDownTweakableOptions; + @Prop({ + type: RESPOND_TIMING, + }) + respondTiming?: RESPOND_TIMING; + static override from(input: any) { const instance = super.from(input) as CrawlerOptions; const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined; @@ -498,10 +519,10 @@ export class CrawlerOptions extends AutoCastable { if (instance.engine) { instance.engine = instance.engine.toLowerCase(); } - if (instance.engine === ENGINE_TYPE.VLM) { + if (instance.engine === 'vlm') { instance.engine = ENGINE_TYPE.BROWSER; instance.respondWith = CONTENT_FORMAT.VLM; - } else if (instance.engine === ENGINE_TYPE.READER_LM) { + } else if (instance.engine === 'readerlm-v2') { instance.engine = ENGINE_TYPE.AUTO; instance.respondWith = CONTENT_FORMAT.READER_LM; } @@ -558,6 +579,16 @@ export class CrawlerOptions extends AutoCastable { const dnt = ctx?.get('dnt'); instance.doNotTrack ??= (parseInt(dnt || '') || null); + const respondTiming = ctx?.get('x-respond-timing'); + if (respondTiming) { + instance.respondTiming ??= respondTiming as RESPOND_TIMING; + } + instance.respondTiming ??= ( + instance.timeout || + instance.respondWith.includes('shot') || + instance.respondWith.includes('vlm') + ) ? RESPOND_TIMING.MEDIA_IDLE : RESPOND_TIMING.MUTATION_IDLE; + if (instance.cacheTolerance) { instance.cacheTolerance = instance.cacheTolerance * 1000; } @@ -569,11 +600,36 @@ export class CrawlerOptions extends AutoCastable { return instance; } - isEarlyReturnApplicable() { - if (this.timeout !== undefined) { + isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) { + if (this.waitForSelector?.length) { return false; } - if (this.waitForSelector?.length) { + if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) { + return true; + } + if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded) { + const now = Date.now(); + if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) { + return true; + } + } + if ((this.respondWith.includes('vlm') || this.respondWith.includes('pageshot')) && !snapshot.pageshot) { + return false; + } + if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) { + return false; + } + if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) { + return true; + } + if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded) { + const now = Date.now(); + if ((snapshot.lastContentResourceLoaded + 500) < now) { + return true; + } + } + + if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) { return false; } if (this.injectFrameScript?.length || this.injectPageScript?.length) { @@ -583,7 +639,7 @@ export class CrawlerOptions extends AutoCastable { return false; } - return true; + return false; } isCacheQueryApplicable() { @@ -611,6 +667,9 @@ export class CrawlerOptions extends AutoCastable { } browserIsNotRequired() { + if (this.respondTiming && this.respondTiming !== RESPOND_TIMING.HTML) { + return false; + } if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) { return false; } diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index df9b37a..6f17d8e 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -55,13 +55,10 @@ export interface PageSnapshot { href: string; rebase?: string; html: string; - htmlModifiedByJs?: boolean; shadowExpanded?: string; text: string; status?: number; statusText?: string; - isIntermediate?: boolean; - isFromCache?: boolean; parsed?: Partial | null; screenshot?: Buffer; pageshot?: Buffer; @@ -70,6 +67,11 @@ export interface PageSnapshot { maxElemDepth?: number; elemCount?: number; childFrames?: PageSnapshot[]; + isIntermediate?: boolean; + isFromCache?: boolean; + lastMutationIdle?: number; + lastContentResourceLoaded?: number; + lastMediaResourceLoaded?: number; } export interface ExtendedSnapshot extends PageSnapshot { @@ -374,9 +376,10 @@ function shadowDomPresent(rootElement = document.documentElement) { return false; } -let initialHTML; +let lastMutationIdle = 0; +document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now()); + function giveSnapshot(stopActiveSnapshot) { - initialHTML ??= document.documentElement?.outerHTML; if (stopActiveSnapshot) { window.haltSnapshot = true; } @@ -392,17 +395,14 @@ function giveSnapshot(stopActiveSnapshot) { description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '', href: document.location.href, html: document.documentElement?.outerHTML, - htmlModifiedByJs: false, text: document.body?.innerText, shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined, parsed: parsed, imgs: [], maxElemDepth: domAnalysis.maxDepth, elemCount: domAnalysis.elementCount, + lastMutationIdle, }; - if (initialHTML) { - r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded; - } if (document.baseURI !== r.href) { r.rebase = document.baseURI; } @@ -445,9 +445,20 @@ window.briefImgs = briefImgs; })(); `; +const documentResourceTypes = new Set([ + 'document', 'script', 'xhr', 'fetch', 'prefetch', 'eventsource', 'websocket', 'preflight' +]); +const mediaResourceTypes = new Set([ + 'stylesheet', 'image', 'font', 'media' +]); + + class PageReqCtrlKit { reqSet: Set = new Set(); blockers: Deferred[] = []; + lastResourceLoadedAt: number = 0; + lastContentResourceLoadedAt: number = 0; + lastMediaResourceLoadedAt: number = 0; constructor( public concurrency: number, @@ -472,6 +483,15 @@ class PageReqCtrlKit { this.reqSet.delete(req); const deferred = this.blockers.shift(); deferred?.resolve(); + const now = Date.now(); + this.lastResourceLoadedAt = now; + const typ = req.resourceType(); + if (documentResourceTypes.has(typ)) { + this.lastContentResourceLoadedAt = now; + } + if (mediaResourceTypes.has(typ)) { + this.lastMediaResourceLoadedAt = now; + } } } @@ -491,7 +511,7 @@ export class PuppeteerControl extends AsyncService { lastPageCratedAt: number = 0; ua: string = ''; - concurrentRequestsPerPage: number = 16; + concurrentRequestsPerPage: number = 32; pageReqCtrl = new WeakMap(); lastReqSentAt: number = 0; @@ -1050,6 +1070,11 @@ export class PuppeteerControl extends AsyncService { return; } snapshot = s; + if (snapshot) { + const kit = this.pageReqCtrl.get(page); + snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt; + snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt; + } if (s?.maxElemDepth && s.maxElemDepth > 256) { return; }