behavior change: ditch content-based return timing, adopt mutationIdle as default timing

This commit is contained in:
Yanlong Wang 2025-03-12 18:04:22 +08:00
parent 5141814bc9
commit f7dbadffb7
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 116 additions and 89 deletions

View File

@ -19,7 +19,6 @@ import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE }
import { Crawled } from '../db/crawled';
import { DomainBlockade } from '../db/domain-blockade';
import { DomainProfile } from '../db/domain-profile';
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@ -317,6 +316,9 @@ export class CrawlerHost extends RPCHost {
if (crawlerOptions.robotsTxt) {
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
}
if (rpcReflect.signal.aborted) {
return;
}
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream);
@ -363,10 +365,7 @@ export class CrawlerHost extends RPCHost {
if (rpcReflect.signal.aborted) {
break;
}
if (!crawlerOptions.isEarlyReturnApplicable()) {
continue;
}
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
continue;
}
@ -412,11 +411,7 @@ export class CrawlerHost extends RPCHost {
if (rpcReflect.signal.aborted) {
break;
}
if (!crawlerOptions.isEarlyReturnApplicable()) {
continue;
}
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
continue;
}
@ -427,13 +422,11 @@ export class CrawlerHost extends RPCHost {
}
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
@ -705,7 +698,11 @@ export class CrawlerHost extends RPCHost {
return;
}
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
if (
crawlOpts?.engine === ENGINE_TYPE.CURL ||
// deprecated name
crawlOpts?.engine === 'direct'
) {
const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
@ -779,6 +776,7 @@ export class CrawlerHost extends RPCHost {
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
draftSnapshot.title ??= analyzed.title;
draftSnapshot.isIntermediate = true;
let fallbackProxyIsUsed = false;
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
(analyzed.tokens < 42 || sideLoaded.status !== 200)
@ -798,6 +796,7 @@ export class CrawlerHost extends RPCHost {
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
draftSnapshot = proxySnapshot;
draftSnapshot.isIntermediate = true;
sideLoaded = proxyLoaded;
fallbackProxyIsUsed = true;
}
@ -986,7 +985,7 @@ export class CrawlerHost extends RPCHost {
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
}
if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
if (opts.respondWith.includes(CONTENT_FORMAT.VLM)) {
crawlOpts.favorScreenshot = true;
}
@ -1142,62 +1141,6 @@ export class CrawlerHost extends RPCHost {
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
}
/**
 * Records, per domain profile digest, which engine a known snapshot suggests.
 *
 * The digest and path are derived from the snapshot's `href` via
 * `getDomainProfileUrlDigest`. A snapshot whose HTML was modified by
 * JavaScript maps to `ENGINE_TYPE.BROWSER`; otherwise to `ENGINE_TYPE.DIRECT`.
 *
 * - No stored profile: a new one is created and saved.
 * - Stored profile already on BROWSER: left untouched.
 * - Otherwise: the stored profile is refreshed in place and saved.
 *
 * @param knownSnapshot snapshot whose `href` and `htmlModifiedByJs` are read
 */
async exploreDirectEngine(knownSnapshot: PageSnapshot) {
    const snapshotUrl = new URL(knownSnapshot.href);
    const { digest, path } = this.getDomainProfileUrlDigest(snapshotUrl);
    const existing = await DomainProfile.fromFirestore(digest);

    if (existing?.engine === ENGINE_TYPE.BROWSER) {
        // Mixed engine, always use browser
        return;
    }

    const origin = snapshotUrl.origin.toLowerCase();
    const engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;

    if (existing) {
        // Refresh the stored profile with what this snapshot tells us.
        existing.origin = origin;
        existing.triggerUrl = snapshotUrl.href;
        existing.path = path;
        existing.engine = engine;
        existing.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
        await DomainProfile.save(existing);

        return;
    }

    // First time this digest is seen: create a fresh profile.
    const created = DomainProfile.from({
        _id: digest,
        origin,
        path,
        triggerUrl: snapshotUrl.href,
        engine,
        createdAt: new Date(),
        expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
    });
    await DomainProfile.save(created);
}
/**
 * Tells whether a snapshot is still too thin to be used.
 *
 * Checks are applied in this order:
 * 1. any entry in `pdfs` makes the snapshot good enough;
 * 2. a missing title makes it not good enough;
 * 3. parsed content makes it good enough;
 * 4. otherwise, when HTML is present, it is not good enough if the
 *    lightweight HTML analysis reports fewer than 200 tokens.
 *
 * @param snapshot the page snapshot to judge
 * @returns `true` when the snapshot is NOT good enough
 */
async snapshotNotGoodEnough(snapshot: PageSnapshot) {
    if (snapshot.pdfs?.length) {
        return false;
    }
    if (!snapshot.title) {
        return true;
    }
    // Parsed content is sufficient; with no HTML there is nothing left to measure.
    if (snapshot.parsed?.content || !snapshot.html) {
        return false;
    }

    const { tokens } = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);

    return tokens < 200;
}
getDomainProfileUrlDigest(url: URL) {
const pathname = url.pathname;
const pathVec = pathname.split('/');

View File

@ -3,6 +3,7 @@ import { FancyFile } from 'civkit/fancy-file';
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
import { Context } from '../services/registry';
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
import type { PageSnapshot } from '../services/puppeteer';
export enum CONTENT_FORMAT {
CONTENT = 'content',
@ -18,12 +19,18 @@ export enum CONTENT_FORMAT {
/**
 * Identifiers for the crawling engine a request may ask for.
 *
 * CrawlerOptions.from lower-cases the requested engine before comparing it,
 * so every value here is lower-case.
 */
export enum ENGINE_TYPE {
AUTO = 'auto',
BROWSER = 'browser',
// NOTE(review): the crawler host labels the literal 'direct' a "deprecated name" and
// routes it the same way as CURL — confirm whether this member should remain.
DIRECT = 'direct',
// CrawlerOptions.from rewrites this engine to BROWSER and sets respondWith to CONTENT_FORMAT.VLM.
VLM = 'vlm',
// CrawlerOptions.from rewrites this engine to AUTO and sets respondWith to CONTENT_FORMAT.READER_LM.
READER_LM = 'readerlm-v2',
// The crawler host side-loads the URL (curlControl.sideLoad or an allocated proxy) for this engine.
CURL = 'curl',
CF_BROWSER_RENDERING = 'cf-browser-rendering',
}
/**
 * How far page loading must progress before a snapshot may be returned early.
 *
 * Selected through the `X-Respond-Timing` header or the `respondTiming` option.
 * When unset, CrawlerOptions.from picks MEDIA_IDLE if a timeout is set or the
 * response format includes 'shot' or 'vlm', and MUTATION_IDLE otherwise.
 */
export enum RESPOND_TIMING {
// Unrendered HTML is enough: accepted as soon as the snapshot carries html.
HTML = 'html',
// Accepted once the snapshot reports a mutationIdle timestamp
// (header text: DOM mutations settled and unchanged for at least 0.2s).
MUTATION_IDLE = 'mutation-idle',
// Accepted once more than 500ms have passed since the last content resource finished loading.
RESOURCE_IDLE = 'resource-idle',
// Accepted once more than 500ms have passed since the later of the last media
// and last content resource finished loading.
MEDIA_IDLE = 'media-idle',
// Never accepted for an early response by isSnapshotAcceptableForEarlyResponse.
// NOTE(review): not listed in the X-Respond-Timing header description — confirm whether that is intentional.
NETWORK_IDLE = 'network-idle',
}
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
@ -213,6 +220,15 @@ class Viewport extends AutoCastable {
in: 'header',
schema: { type: 'string' }
},
'X-Respond-Timing': {
description: `Explicitly specify the respond timing. One of the following:\n\n` +
`- html: unrendered HTML is enough to return\n` +
`- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
`- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading for at least 0.5s\n` +
`- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading for at least 0.5s\n\n`,
in: 'header',
schema: { type: 'string' }
},
'X-Engine': {
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering',
in: 'header',
@ -405,6 +421,11 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
markdown?: TurnDownTweakableOptions;
@Prop({
type: RESPOND_TIMING,
})
respondTiming?: RESPOND_TIMING;
static override from(input: any) {
const instance = super.from(input) as CrawlerOptions;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
@ -498,10 +519,10 @@ export class CrawlerOptions extends AutoCastable {
if (instance.engine) {
instance.engine = instance.engine.toLowerCase();
}
if (instance.engine === ENGINE_TYPE.VLM) {
if (instance.engine === 'vlm') {
instance.engine = ENGINE_TYPE.BROWSER;
instance.respondWith = CONTENT_FORMAT.VLM;
} else if (instance.engine === ENGINE_TYPE.READER_LM) {
} else if (instance.engine === 'readerlm-v2') {
instance.engine = ENGINE_TYPE.AUTO;
instance.respondWith = CONTENT_FORMAT.READER_LM;
}
@ -558,6 +579,16 @@ export class CrawlerOptions extends AutoCastable {
const dnt = ctx?.get('dnt');
instance.doNotTrack ??= (parseInt(dnt || '') || null);
const respondTiming = ctx?.get('x-respond-timing');
if (respondTiming) {
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
}
instance.respondTiming ??= (
instance.timeout ||
instance.respondWith.includes('shot') ||
instance.respondWith.includes('vlm')
) ? RESPOND_TIMING.MEDIA_IDLE : RESPOND_TIMING.MUTATION_IDLE;
if (instance.cacheTolerance) {
instance.cacheTolerance = instance.cacheTolerance * 1000;
}
@ -569,11 +600,36 @@ export class CrawlerOptions extends AutoCastable {
return instance;
}
isEarlyReturnApplicable() {
if (this.timeout !== undefined) {
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
if (this.waitForSelector?.length) {
return false;
}
if (this.waitForSelector?.length) {
if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
return true;
}
if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded) {
const now = Date.now();
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
return true;
}
}
if ((this.respondWith.includes('vlm') || this.respondWith.includes('pageshot')) && !snapshot.pageshot) {
return false;
}
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
return false;
}
if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
return true;
}
if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded) {
const now = Date.now();
if ((snapshot.lastContentResourceLoaded + 500) < now) {
return true;
}
}
if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
return false;
}
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
@ -583,7 +639,7 @@ export class CrawlerOptions extends AutoCastable {
return false;
}
return true;
return false;
}
isCacheQueryApplicable() {
@ -611,6 +667,9 @@ export class CrawlerOptions extends AutoCastable {
}
browserIsNotRequired() {
if (this.respondTiming && this.respondTiming !== RESPOND_TIMING.HTML) {
return false;
}
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
return false;
}

View File

@ -55,13 +55,10 @@ export interface PageSnapshot {
href: string;
rebase?: string;
html: string;
htmlModifiedByJs?: boolean;
shadowExpanded?: string;
text: string;
status?: number;
statusText?: string;
isIntermediate?: boolean;
isFromCache?: boolean;
parsed?: Partial<ReadabilityParsed> | null;
screenshot?: Buffer;
pageshot?: Buffer;
@ -70,6 +67,11 @@ export interface PageSnapshot {
maxElemDepth?: number;
elemCount?: number;
childFrames?: PageSnapshot[];
isIntermediate?: boolean;
isFromCache?: boolean;
lastMutationIdle?: number;
lastContentResourceLoaded?: number;
lastMediaResourceLoaded?: number;
}
export interface ExtendedSnapshot extends PageSnapshot {
@ -374,9 +376,10 @@ function shadowDomPresent(rootElement = document.documentElement) {
return false;
}
let initialHTML;
let lastMutationIdle = 0;
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
function giveSnapshot(stopActiveSnapshot) {
initialHTML ??= document.documentElement?.outerHTML;
if (stopActiveSnapshot) {
window.haltSnapshot = true;
}
@ -392,17 +395,14 @@ function giveSnapshot(stopActiveSnapshot) {
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
href: document.location.href,
html: document.documentElement?.outerHTML,
htmlModifiedByJs: false,
text: document.body?.innerText,
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
parsed: parsed,
imgs: [],
maxElemDepth: domAnalysis.maxDepth,
elemCount: domAnalysis.elementCount,
lastMutationIdle,
};
if (initialHTML) {
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
}
if (document.baseURI !== r.href) {
r.rebase = document.baseURI;
}
@ -445,9 +445,20 @@ window.briefImgs = briefImgs;
})();
`;
/**
 * Request resource types counted as "content" resources: when such a request
 * finishes, PageReqCtrlKit updates `lastContentResourceLoadedAt`.
 */
const documentResourceTypes = new Set([
    'document',
    'script',
    'xhr',
    'fetch',
    'prefetch',
    'eventsource',
    'websocket',
    'preflight',
]);

/**
 * Request resource types counted as "media" resources: when such a request
 * finishes, PageReqCtrlKit updates `lastMediaResourceLoadedAt`.
 */
const mediaResourceTypes = new Set([
    'stylesheet',
    'image',
    'font',
    'media',
]);
class PageReqCtrlKit {
reqSet: Set<HTTPRequest> = new Set();
blockers: Deferred<void>[] = [];
lastResourceLoadedAt: number = 0;
lastContentResourceLoadedAt: number = 0;
lastMediaResourceLoadedAt: number = 0;
constructor(
public concurrency: number,
@ -472,6 +483,15 @@ class PageReqCtrlKit {
this.reqSet.delete(req);
const deferred = this.blockers.shift();
deferred?.resolve();
const now = Date.now();
this.lastResourceLoadedAt = now;
const typ = req.resourceType();
if (documentResourceTypes.has(typ)) {
this.lastContentResourceLoadedAt = now;
}
if (mediaResourceTypes.has(typ)) {
this.lastMediaResourceLoadedAt = now;
}
}
}
@ -491,7 +511,7 @@ export class PuppeteerControl extends AsyncService {
lastPageCratedAt: number = 0;
ua: string = '';
concurrentRequestsPerPage: number = 16;
concurrentRequestsPerPage: number = 32;
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
lastReqSentAt: number = 0;
@ -1050,6 +1070,11 @@ export class PuppeteerControl extends AsyncService {
return;
}
snapshot = s;
if (snapshot) {
const kit = this.pageReqCtrl.get(page);
snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
}
if (s?.maxElemDepth && s.maxElemDepth > 256) {
return;
}