mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-14 00:36:18 +08:00
behavior change: ditch content based return timing, adopt mutationIdle as default timing
This commit is contained in:
parent
5141814bc9
commit
f7dbadffb7
@ -19,7 +19,6 @@ import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE }
|
||||
|
||||
import { Crawled } from '../db/crawled';
|
||||
import { DomainBlockade } from '../db/domain-blockade';
|
||||
import { DomainProfile } from '../db/domain-profile';
|
||||
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
||||
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
@ -317,6 +316,9 @@ export class CrawlerHost extends RPCHost {
|
||||
if (crawlerOptions.robotsTxt) {
|
||||
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
||||
}
|
||||
if (rpcReflect.signal.aborted) {
|
||||
return;
|
||||
}
|
||||
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
||||
const sseStream = new OutputServerEventStream();
|
||||
rpcReflect.return(sseStream);
|
||||
@ -363,10 +365,7 @@ export class CrawlerHost extends RPCHost {
|
||||
if (rpcReflect.signal.aborted) {
|
||||
break;
|
||||
}
|
||||
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
||||
continue;
|
||||
}
|
||||
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
||||
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -412,11 +411,7 @@ export class CrawlerHost extends RPCHost {
|
||||
if (rpcReflect.signal.aborted) {
|
||||
break;
|
||||
}
|
||||
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
||||
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -427,13 +422,11 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||
);
|
||||
}
|
||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||
);
|
||||
@ -705,7 +698,11 @@ export class CrawlerHost extends RPCHost {
|
||||
return;
|
||||
}
|
||||
|
||||
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
||||
if (
|
||||
crawlOpts?.engine === ENGINE_TYPE.CURL ||
|
||||
// deprecated name
|
||||
crawlOpts?.engine === 'direct'
|
||||
) {
|
||||
const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
||||
await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
|
||||
await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
|
||||
@ -779,6 +776,7 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
||||
draftSnapshot.title ??= analyzed.title;
|
||||
draftSnapshot.isIntermediate = true;
|
||||
let fallbackProxyIsUsed = false;
|
||||
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
||||
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
||||
@ -798,6 +796,7 @@ export class CrawlerHost extends RPCHost {
|
||||
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
||||
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
||||
draftSnapshot = proxySnapshot;
|
||||
draftSnapshot.isIntermediate = true;
|
||||
sideLoaded = proxyLoaded;
|
||||
fallbackProxyIsUsed = true;
|
||||
}
|
||||
@ -986,7 +985,7 @@ export class CrawlerHost extends RPCHost {
|
||||
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
||||
}
|
||||
|
||||
if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
||||
if (opts.respondWith.includes(CONTENT_FORMAT.VLM)) {
|
||||
crawlOpts.favorScreenshot = true;
|
||||
}
|
||||
|
||||
@ -1142,62 +1141,6 @@ export class CrawlerHost extends RPCHost {
|
||||
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
||||
}
|
||||
|
||||
/**
 * Records, per domain-profile digest, which engine a URL should be crawled with,
 * based on whether the given snapshot's HTML was modified by JavaScript.
 *
 * NOTE(review): the surrounding hunk header lists 62 old lines against 6 new
 * ones, so this method appears to be deleted by this commit - confirm before
 * relying on it.
 *
 * @param knownSnapshot Snapshot whose `href` and `htmlModifiedByJs` drive the profile.
 * @returns Nothing; every path resolves to `undefined`.
 */
async exploreDirectEngine(knownSnapshot: PageSnapshot) {
|
||||
const realUrl = new URL(knownSnapshot.href);
|
||||
const { digest, path } = this.getDomainProfileUrlDigest(realUrl);
|
||||
const profile = await DomainProfile.fromFirestore(digest);
|
||||
|
||||
// No stored profile for this digest yet: create one and stop.
if (!profile) {
|
||||
const record = DomainProfile.from({
|
||||
_id: digest,
|
||||
origin: realUrl.origin.toLowerCase(),
|
||||
path,
|
||||
triggerUrl: realUrl.href,
|
||||
// JS-modified HTML is recorded as BROWSER, otherwise DIRECT.
engine: knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT,
|
||||
createdAt: new Date(),
|
||||
expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
|
||||
});
|
||||
await DomainProfile.save(record);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// A profile already marked BROWSER is left untouched (not even its expiry is refreshed).
if (profile.engine === ENGINE_TYPE.BROWSER) {
|
||||
// Mixed engine, always use browser
|
||||
return;
|
||||
}
|
||||
|
||||
// Existing non-BROWSER profile: overwrite it with the latest observation and extend its expiry.
profile.origin = realUrl.origin.toLowerCase();
|
||||
profile.triggerUrl = realUrl.href;
|
||||
profile.path = path;
|
||||
profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;
|
||||
profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
|
||||
|
||||
await DomainProfile.save(profile);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/**
 * Content-based heuristic that decides whether a snapshot is still too thin
 * to be returned early.
 *
 * NOTE(review): the surrounding hunk header lists 62 old lines against 6 new
 * ones, so this method appears to be deleted by this commit (the commit
 * subject says content-based return timing is being dropped) - confirm.
 *
 * @param snapshot Snapshot to evaluate.
 * @returns `true` when the snapshot is judged NOT good enough, `false` otherwise.
 */
async snapshotNotGoodEnough(snapshot: PageSnapshot) {
|
||||
// Any PDF entry makes the snapshot acceptable, regardless of the other fields.
if (snapshot.pdfs?.length) {
|
||||
return false;
|
||||
}
|
||||
// A missing or empty title is treated as "not ready yet".
if (!snapshot.title) {
|
||||
return true;
|
||||
}
|
||||
// Non-empty parsed content is sufficient on its own.
if (snapshot.parsed?.content) {
|
||||
return false;
|
||||
}
|
||||
// Otherwise fall back to a token count of the raw HTML; fewer than 200 tokens is too thin.
if (snapshot.html) {
|
||||
const r = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
|
||||
const tokens = r.tokens;
|
||||
if (tokens < 200) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Titled snapshot with no HTML, or with at least 200 tokens: acceptable.
return false;
|
||||
}
|
||||
|
||||
getDomainProfileUrlDigest(url: URL) {
|
||||
const pathname = url.pathname;
|
||||
const pathVec = pathname.split('/');
|
||||
|
@ -3,6 +3,7 @@ import { FancyFile } from 'civkit/fancy-file';
|
||||
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||
import { Context } from '../services/registry';
|
||||
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
||||
import type { PageSnapshot } from '../services/puppeteer';
|
||||
|
||||
export enum CONTENT_FORMAT {
|
||||
CONTENT = 'content',
|
||||
@ -18,12 +19,18 @@ export enum CONTENT_FORMAT {
|
||||
/**
 * String identifiers for the crawling engine.
 *
 * All values are lower-case; `CrawlerOptions.from` lower-cases the requested
 * engine before comparing against them.
 *
 * NOTE(review): this listing comes from a diff view that shows old and new
 * members together; the hunk line counts suggest DIRECT, VLM and READER_LM are
 * removed and CURL is added by this commit - confirm against the final file.
 */
export enum ENGINE_TYPE {
|
||||
AUTO = 'auto',
|
||||
BROWSER = 'browser',
|
||||
// The crawler code in this view accepts the literal 'direct' as a deprecated name alongside CURL.
DIRECT = 'direct',
|
||||
// `CrawlerOptions.from` maps a requested 'vlm' engine to BROWSER with respondWith = CONTENT_FORMAT.VLM.
VLM = 'vlm',
|
||||
// `CrawlerOptions.from` maps a requested 'readerlm-v2' engine to AUTO with respondWith = CONTENT_FORMAT.READER_LM.
READER_LM = 'readerlm-v2',
|
||||
CURL = 'curl',
|
||||
CF_BROWSER_RENDERING = 'cf-browser-rendering',
|
||||
}
|
||||
|
||||
/**
 * Point in the page lifecycle at which a snapshot may be returned early.
 * Selected via the `X-Respond-Timing` header or the `respondTiming` option.
 *
 * When unset, `CrawlerOptions.from` defaults it to MEDIA_IDLE if a timeout is
 * given or `respondWith` contains 'shot' or 'vlm', and to MUTATION_IDLE otherwise.
 */
export enum RESPOND_TIMING {
|
||||
/** Unrendered HTML is enough to return. */
HTML = 'html',
|
||||
/** Wait for DOM mutations to settle (header docs: unchanged for at least 0.2s). */
MUTATION_IDLE = 'mutation-idle',
|
||||
/** Wait until no content-affecting resource has finished loading for 0.5s. */
RESOURCE_IDLE = 'resource-idle',
|
||||
/** Like RESOURCE_IDLE, but media resources (stylesheet/image/font/media) are also considered. */
MEDIA_IDLE = 'media-idle',
|
||||
/**
 * Never accepted for early response by `isSnapshotAcceptableForEarlyResponse`.
 * NOTE(review): presumably the crawl then runs until the browser's own
 * network-idle / final snapshot - confirm in PuppeteerControl.
 */
NETWORK_IDLE = 'network-idle',
|
||||
}
|
||||
|
||||
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
||||
|
||||
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
||||
@ -213,6 +220,15 @@ class Viewport extends AutoCastable {
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Respond-Timing': {
|
||||
description: `Explicitly specify the respond timing. One of the following:\n\n` +
|
||||
`- html: unrendered HTML is enough to return\n` +
|
||||
`- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
|
||||
`- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading for at least 0.5s\n` +
|
||||
`- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading for at least 0.5s\n\n`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Engine': {
|
||||
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering',
|
||||
in: 'header',
|
||||
@ -405,6 +421,11 @@ export class CrawlerOptions extends AutoCastable {
|
||||
@Prop()
|
||||
markdown?: TurnDownTweakableOptions;
|
||||
|
||||
@Prop({
|
||||
type: RESPOND_TIMING,
|
||||
})
|
||||
respondTiming?: RESPOND_TIMING;
|
||||
|
||||
static override from(input: any) {
|
||||
const instance = super.from(input) as CrawlerOptions;
|
||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
||||
@ -498,10 +519,10 @@ export class CrawlerOptions extends AutoCastable {
|
||||
if (instance.engine) {
|
||||
instance.engine = instance.engine.toLowerCase();
|
||||
}
|
||||
if (instance.engine === ENGINE_TYPE.VLM) {
|
||||
if (instance.engine === 'vlm') {
|
||||
instance.engine = ENGINE_TYPE.BROWSER;
|
||||
instance.respondWith = CONTENT_FORMAT.VLM;
|
||||
} else if (instance.engine === ENGINE_TYPE.READER_LM) {
|
||||
} else if (instance.engine === 'readerlm-v2') {
|
||||
instance.engine = ENGINE_TYPE.AUTO;
|
||||
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
||||
}
|
||||
@ -558,6 +579,16 @@ export class CrawlerOptions extends AutoCastable {
|
||||
const dnt = ctx?.get('dnt');
|
||||
instance.doNotTrack ??= (parseInt(dnt || '') || null);
|
||||
|
||||
const respondTiming = ctx?.get('x-respond-timing');
|
||||
if (respondTiming) {
|
||||
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
|
||||
}
|
||||
instance.respondTiming ??= (
|
||||
instance.timeout ||
|
||||
instance.respondWith.includes('shot') ||
|
||||
instance.respondWith.includes('vlm')
|
||||
) ? RESPOND_TIMING.MEDIA_IDLE : RESPOND_TIMING.MUTATION_IDLE;
|
||||
|
||||
if (instance.cacheTolerance) {
|
||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||
}
|
||||
@ -569,11 +600,36 @@ export class CrawlerOptions extends AutoCastable {
|
||||
return instance;
|
||||
}
|
||||
|
||||
isEarlyReturnApplicable() {
|
||||
if (this.timeout !== undefined) {
|
||||
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
|
||||
if (this.waitForSelector?.length) {
|
||||
return false;
|
||||
}
|
||||
if (this.waitForSelector?.length) {
|
||||
if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
||||
return true;
|
||||
}
|
||||
if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded) {
|
||||
const now = Date.now();
|
||||
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if ((this.respondWith.includes('vlm') || this.respondWith.includes('pageshot')) && !snapshot.pageshot) {
|
||||
return false;
|
||||
}
|
||||
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
|
||||
return false;
|
||||
}
|
||||
if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
|
||||
return true;
|
||||
}
|
||||
if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded) {
|
||||
const now = Date.now();
|
||||
if ((snapshot.lastContentResourceLoaded + 500) < now) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
||||
return false;
|
||||
}
|
||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||
@ -583,7 +639,7 @@ export class CrawlerOptions extends AutoCastable {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
isCacheQueryApplicable() {
|
||||
@ -611,6 +667,9 @@ export class CrawlerOptions extends AutoCastable {
|
||||
}
|
||||
|
||||
browserIsNotRequired() {
|
||||
if (this.respondTiming && this.respondTiming !== RESPOND_TIMING.HTML) {
|
||||
return false;
|
||||
}
|
||||
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -55,13 +55,10 @@ export interface PageSnapshot {
|
||||
href: string;
|
||||
rebase?: string;
|
||||
html: string;
|
||||
htmlModifiedByJs?: boolean;
|
||||
shadowExpanded?: string;
|
||||
text: string;
|
||||
status?: number;
|
||||
statusText?: string;
|
||||
isIntermediate?: boolean;
|
||||
isFromCache?: boolean;
|
||||
parsed?: Partial<ReadabilityParsed> | null;
|
||||
screenshot?: Buffer;
|
||||
pageshot?: Buffer;
|
||||
@ -70,6 +67,11 @@ export interface PageSnapshot {
|
||||
maxElemDepth?: number;
|
||||
elemCount?: number;
|
||||
childFrames?: PageSnapshot[];
|
||||
isIntermediate?: boolean;
|
||||
isFromCache?: boolean;
|
||||
lastMutationIdle?: number;
|
||||
lastContentResourceLoaded?: number;
|
||||
lastMediaResourceLoaded?: number;
|
||||
}
|
||||
|
||||
export interface ExtendedSnapshot extends PageSnapshot {
|
||||
@ -374,9 +376,10 @@ function shadowDomPresent(rootElement = document.documentElement) {
|
||||
return false;
|
||||
}
|
||||
|
||||
let initialHTML;
|
||||
let lastMutationIdle = 0;
|
||||
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
||||
|
||||
function giveSnapshot(stopActiveSnapshot) {
|
||||
initialHTML ??= document.documentElement?.outerHTML;
|
||||
if (stopActiveSnapshot) {
|
||||
window.haltSnapshot = true;
|
||||
}
|
||||
@ -392,17 +395,14 @@ function giveSnapshot(stopActiveSnapshot) {
|
||||
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
||||
href: document.location.href,
|
||||
html: document.documentElement?.outerHTML,
|
||||
htmlModifiedByJs: false,
|
||||
text: document.body?.innerText,
|
||||
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
||||
parsed: parsed,
|
||||
imgs: [],
|
||||
maxElemDepth: domAnalysis.maxDepth,
|
||||
elemCount: domAnalysis.elementCount,
|
||||
lastMutationIdle,
|
||||
};
|
||||
if (initialHTML) {
|
||||
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
|
||||
}
|
||||
if (document.baseURI !== r.href) {
|
||||
r.rebase = document.baseURI;
|
||||
}
|
||||
@ -445,9 +445,20 @@ window.briefImgs = briefImgs;
|
||||
})();
|
||||
`;
|
||||
|
||||
// Resource types (compared against `req.resourceType()`) whose completion
// bumps `PageReqCtrlKit.lastContentResourceLoadedAt`.
const documentResourceTypes = new Set([
|
||||
'document', 'script', 'xhr', 'fetch', 'prefetch', 'eventsource', 'websocket', 'preflight'
|
||||
]);
|
||||
// Resource types (compared against `req.resourceType()`) whose completion
// bumps `PageReqCtrlKit.lastMediaResourceLoadedAt`.
const mediaResourceTypes = new Set([
|
||||
'stylesheet', 'image', 'font', 'media'
|
||||
]);
|
||||
|
||||
|
||||
class PageReqCtrlKit {
|
||||
reqSet: Set<HTTPRequest> = new Set();
|
||||
blockers: Deferred<void>[] = [];
|
||||
lastResourceLoadedAt: number = 0;
|
||||
lastContentResourceLoadedAt: number = 0;
|
||||
lastMediaResourceLoadedAt: number = 0;
|
||||
|
||||
constructor(
|
||||
public concurrency: number,
|
||||
@ -472,6 +483,15 @@ class PageReqCtrlKit {
|
||||
this.reqSet.delete(req);
|
||||
const deferred = this.blockers.shift();
|
||||
deferred?.resolve();
|
||||
const now = Date.now();
|
||||
this.lastResourceLoadedAt = now;
|
||||
const typ = req.resourceType();
|
||||
if (documentResourceTypes.has(typ)) {
|
||||
this.lastContentResourceLoadedAt = now;
|
||||
}
|
||||
if (mediaResourceTypes.has(typ)) {
|
||||
this.lastMediaResourceLoadedAt = now;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -491,7 +511,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
lastPageCratedAt: number = 0;
|
||||
ua: string = '';
|
||||
|
||||
concurrentRequestsPerPage: number = 16;
|
||||
concurrentRequestsPerPage: number = 32;
|
||||
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
|
||||
|
||||
lastReqSentAt: number = 0;
|
||||
@ -1050,6 +1070,11 @@ export class PuppeteerControl extends AsyncService {
|
||||
return;
|
||||
}
|
||||
snapshot = s;
|
||||
if (snapshot) {
|
||||
const kit = this.pageReqCtrl.get(page);
|
||||
snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
|
||||
snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
|
||||
}
|
||||
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
||||
return;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user