mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 14:15:52 +08:00
behavior change: ditch content-based return timing, adopt mutationIdle as the default timing
This commit is contained in:
parent
5141814bc9
commit
f7dbadffb7
@ -19,7 +19,6 @@ import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE }
|
|||||||
|
|
||||||
import { Crawled } from '../db/crawled';
|
import { Crawled } from '../db/crawled';
|
||||||
import { DomainBlockade } from '../db/domain-blockade';
|
import { DomainBlockade } from '../db/domain-blockade';
|
||||||
import { DomainProfile } from '../db/domain-profile';
|
|
||||||
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
||||||
|
|
||||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||||
@ -317,6 +316,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (crawlerOptions.robotsTxt) {
|
if (crawlerOptions.robotsTxt) {
|
||||||
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
||||||
}
|
}
|
||||||
|
if (rpcReflect.signal.aborted) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
||||||
const sseStream = new OutputServerEventStream();
|
const sseStream = new OutputServerEventStream();
|
||||||
rpcReflect.return(sseStream);
|
rpcReflect.return(sseStream);
|
||||||
@ -363,10 +365,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (rpcReflect.signal.aborted) {
|
if (rpcReflect.signal.aborted) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -412,11 +411,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (rpcReflect.signal.aborted) {
|
if (rpcReflect.signal.aborted) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
if (!scrapped || !crawlerOptions.isSnapshotAcceptableForEarlyResponse(scrapped)) {
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -427,13 +422,11 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||||
);
|
);
|
||||||
@ -705,7 +698,11 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
if (
|
||||||
|
crawlOpts?.engine === ENGINE_TYPE.CURL ||
|
||||||
|
// deprecated name
|
||||||
|
crawlOpts?.engine === 'direct'
|
||||||
|
) {
|
||||||
const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
||||||
await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
|
await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
|
||||||
await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
|
await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
|
||||||
@ -779,6 +776,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
||||||
draftSnapshot.title ??= analyzed.title;
|
draftSnapshot.title ??= analyzed.title;
|
||||||
|
draftSnapshot.isIntermediate = true;
|
||||||
let fallbackProxyIsUsed = false;
|
let fallbackProxyIsUsed = false;
|
||||||
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
if (((!crawlOpts?.allocProxy || crawlOpts.allocProxy === 'none') && !crawlOpts?.proxyUrl) &&
|
||||||
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
(analyzed.tokens < 42 || sideLoaded.status !== 200)
|
||||||
@ -798,6 +796,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
||||||
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
||||||
draftSnapshot = proxySnapshot;
|
draftSnapshot = proxySnapshot;
|
||||||
|
draftSnapshot.isIntermediate = true;
|
||||||
sideLoaded = proxyLoaded;
|
sideLoaded = proxyLoaded;
|
||||||
fallbackProxyIsUsed = true;
|
fallbackProxyIsUsed = true;
|
||||||
}
|
}
|
||||||
@ -986,7 +985,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
if (opts.respondWith.includes(CONTENT_FORMAT.VLM)) {
|
||||||
crawlOpts.favorScreenshot = true;
|
crawlOpts.favorScreenshot = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1142,62 +1141,6 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
||||||
}
|
}
|
||||||
|
|
||||||
async exploreDirectEngine(knownSnapshot: PageSnapshot) {
|
|
||||||
const realUrl = new URL(knownSnapshot.href);
|
|
||||||
const { digest, path } = this.getDomainProfileUrlDigest(realUrl);
|
|
||||||
const profile = await DomainProfile.fromFirestore(digest);
|
|
||||||
|
|
||||||
if (!profile) {
|
|
||||||
const record = DomainProfile.from({
|
|
||||||
_id: digest,
|
|
||||||
origin: realUrl.origin.toLowerCase(),
|
|
||||||
path,
|
|
||||||
triggerUrl: realUrl.href,
|
|
||||||
engine: knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT,
|
|
||||||
createdAt: new Date(),
|
|
||||||
expireAt: new Date(Date.now() + this.domainProfileRetentionMs),
|
|
||||||
});
|
|
||||||
await DomainProfile.save(record);
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (profile.engine === ENGINE_TYPE.BROWSER) {
|
|
||||||
// Mixed engine, always use browser
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
profile.origin = realUrl.origin.toLowerCase();
|
|
||||||
profile.triggerUrl = realUrl.href;
|
|
||||||
profile.path = path;
|
|
||||||
profile.engine = knownSnapshot.htmlModifiedByJs ? ENGINE_TYPE.BROWSER : ENGINE_TYPE.DIRECT;
|
|
||||||
profile.expireAt = new Date(Date.now() + this.domainProfileRetentionMs);
|
|
||||||
|
|
||||||
await DomainProfile.save(profile);
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
async snapshotNotGoodEnough(snapshot: PageSnapshot) {
|
|
||||||
if (snapshot.pdfs?.length) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!snapshot.title) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (snapshot.parsed?.content) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (snapshot.html) {
|
|
||||||
const r = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
|
|
||||||
const tokens = r.tokens;
|
|
||||||
if (tokens < 200) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
getDomainProfileUrlDigest(url: URL) {
|
getDomainProfileUrlDigest(url: URL) {
|
||||||
const pathname = url.pathname;
|
const pathname = url.pathname;
|
||||||
const pathVec = pathname.split('/');
|
const pathVec = pathname.split('/');
|
||||||
|
@ -3,6 +3,7 @@ import { FancyFile } from 'civkit/fancy-file';
|
|||||||
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||||
import { Context } from '../services/registry';
|
import { Context } from '../services/registry';
|
||||||
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
||||||
|
import type { PageSnapshot } from '../services/puppeteer';
|
||||||
|
|
||||||
export enum CONTENT_FORMAT {
|
export enum CONTENT_FORMAT {
|
||||||
CONTENT = 'content',
|
CONTENT = 'content',
|
||||||
@ -18,12 +19,18 @@ export enum CONTENT_FORMAT {
|
|||||||
export enum ENGINE_TYPE {
|
export enum ENGINE_TYPE {
|
||||||
AUTO = 'auto',
|
AUTO = 'auto',
|
||||||
BROWSER = 'browser',
|
BROWSER = 'browser',
|
||||||
DIRECT = 'direct',
|
CURL = 'curl',
|
||||||
VLM = 'vlm',
|
|
||||||
READER_LM = 'readerlm-v2',
|
|
||||||
CF_BROWSER_RENDERING = 'cf-browser-rendering',
|
CF_BROWSER_RENDERING = 'cf-browser-rendering',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export enum RESPOND_TIMING {
|
||||||
|
HTML = 'html',
|
||||||
|
MUTATION_IDLE = 'mutation-idle',
|
||||||
|
RESOURCE_IDLE = 'resource-idle',
|
||||||
|
MEDIA_IDLE = 'media-idle',
|
||||||
|
NETWORK_IDLE = 'network-idle',
|
||||||
|
}
|
||||||
|
|
||||||
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
||||||
|
|
||||||
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
export const IMAGE_RETENTION_MODES = ['none', 'all', 'alt', 'all_p', 'alt_p'] as const;
|
||||||
@ -213,6 +220,15 @@ class Viewport extends AutoCastable {
|
|||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
|
'X-Respond-Timing': {
|
||||||
|
description: `Explicitly specify the respond timing. One of the following:\n\n` +
|
||||||
|
`- html: unrendered HTML is enough to return\n` +
|
||||||
|
`- mutation-idle: wait for DOM mutations to settle and remain unchanged for at least 0.2s\n` +
|
||||||
|
`- resource-idle: wait for no additional resources that would affect page logic and content SUCCEEDED loading for at least 0.5s\n` +
|
||||||
|
`- media-idle: wait for no additional resources, including media resources, SUCCEEDED loading for at least 0.5s\n\n`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
'X-Engine': {
|
'X-Engine': {
|
||||||
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering',
|
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, cf-browser-rendering',
|
||||||
in: 'header',
|
in: 'header',
|
||||||
@ -405,6 +421,11 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop()
|
@Prop()
|
||||||
markdown?: TurnDownTweakableOptions;
|
markdown?: TurnDownTweakableOptions;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
type: RESPOND_TIMING,
|
||||||
|
})
|
||||||
|
respondTiming?: RESPOND_TIMING;
|
||||||
|
|
||||||
static override from(input: any) {
|
static override from(input: any) {
|
||||||
const instance = super.from(input) as CrawlerOptions;
|
const instance = super.from(input) as CrawlerOptions;
|
||||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
||||||
@ -498,10 +519,10 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if (instance.engine) {
|
if (instance.engine) {
|
||||||
instance.engine = instance.engine.toLowerCase();
|
instance.engine = instance.engine.toLowerCase();
|
||||||
}
|
}
|
||||||
if (instance.engine === ENGINE_TYPE.VLM) {
|
if (instance.engine === 'vlm') {
|
||||||
instance.engine = ENGINE_TYPE.BROWSER;
|
instance.engine = ENGINE_TYPE.BROWSER;
|
||||||
instance.respondWith = CONTENT_FORMAT.VLM;
|
instance.respondWith = CONTENT_FORMAT.VLM;
|
||||||
} else if (instance.engine === ENGINE_TYPE.READER_LM) {
|
} else if (instance.engine === 'readerlm-v2') {
|
||||||
instance.engine = ENGINE_TYPE.AUTO;
|
instance.engine = ENGINE_TYPE.AUTO;
|
||||||
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
||||||
}
|
}
|
||||||
@ -558,6 +579,16 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
const dnt = ctx?.get('dnt');
|
const dnt = ctx?.get('dnt');
|
||||||
instance.doNotTrack ??= (parseInt(dnt || '') || null);
|
instance.doNotTrack ??= (parseInt(dnt || '') || null);
|
||||||
|
|
||||||
|
const respondTiming = ctx?.get('x-respond-timing');
|
||||||
|
if (respondTiming) {
|
||||||
|
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
|
||||||
|
}
|
||||||
|
instance.respondTiming ??= (
|
||||||
|
instance.timeout ||
|
||||||
|
instance.respondWith.includes('shot') ||
|
||||||
|
instance.respondWith.includes('vlm')
|
||||||
|
) ? RESPOND_TIMING.MEDIA_IDLE : RESPOND_TIMING.MUTATION_IDLE;
|
||||||
|
|
||||||
if (instance.cacheTolerance) {
|
if (instance.cacheTolerance) {
|
||||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||||
}
|
}
|
||||||
@ -569,11 +600,36 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
return instance;
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
isEarlyReturnApplicable() {
|
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
|
||||||
if (this.timeout !== undefined) {
|
if (this.waitForSelector?.length) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (this.waitForSelector?.length) {
|
if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded) {
|
||||||
|
const now = Date.now();
|
||||||
|
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ((this.respondWith.includes('vlm') || this.respondWith.includes('pageshot')) && !snapshot.pageshot) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded) {
|
||||||
|
const now = Date.now();
|
||||||
|
if ((snapshot.lastContentResourceLoaded + 500) < now) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||||
@ -583,7 +639,7 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
isCacheQueryApplicable() {
|
isCacheQueryApplicable() {
|
||||||
@ -611,6 +667,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
browserIsNotRequired() {
|
browserIsNotRequired() {
|
||||||
|
if (this.respondTiming && this.respondTiming !== RESPOND_TIMING.HTML) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
if (this.respondWith.includes(CONTENT_FORMAT.PAGESHOT) || this.respondWith.includes(CONTENT_FORMAT.SCREENSHOT)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -55,13 +55,10 @@ export interface PageSnapshot {
|
|||||||
href: string;
|
href: string;
|
||||||
rebase?: string;
|
rebase?: string;
|
||||||
html: string;
|
html: string;
|
||||||
htmlModifiedByJs?: boolean;
|
|
||||||
shadowExpanded?: string;
|
shadowExpanded?: string;
|
||||||
text: string;
|
text: string;
|
||||||
status?: number;
|
status?: number;
|
||||||
statusText?: string;
|
statusText?: string;
|
||||||
isIntermediate?: boolean;
|
|
||||||
isFromCache?: boolean;
|
|
||||||
parsed?: Partial<ReadabilityParsed> | null;
|
parsed?: Partial<ReadabilityParsed> | null;
|
||||||
screenshot?: Buffer;
|
screenshot?: Buffer;
|
||||||
pageshot?: Buffer;
|
pageshot?: Buffer;
|
||||||
@ -70,6 +67,11 @@ export interface PageSnapshot {
|
|||||||
maxElemDepth?: number;
|
maxElemDepth?: number;
|
||||||
elemCount?: number;
|
elemCount?: number;
|
||||||
childFrames?: PageSnapshot[];
|
childFrames?: PageSnapshot[];
|
||||||
|
isIntermediate?: boolean;
|
||||||
|
isFromCache?: boolean;
|
||||||
|
lastMutationIdle?: number;
|
||||||
|
lastContentResourceLoaded?: number;
|
||||||
|
lastMediaResourceLoaded?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ExtendedSnapshot extends PageSnapshot {
|
export interface ExtendedSnapshot extends PageSnapshot {
|
||||||
@ -374,9 +376,10 @@ function shadowDomPresent(rootElement = document.documentElement) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
let initialHTML;
|
let lastMutationIdle = 0;
|
||||||
|
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
||||||
|
|
||||||
function giveSnapshot(stopActiveSnapshot) {
|
function giveSnapshot(stopActiveSnapshot) {
|
||||||
initialHTML ??= document.documentElement?.outerHTML;
|
|
||||||
if (stopActiveSnapshot) {
|
if (stopActiveSnapshot) {
|
||||||
window.haltSnapshot = true;
|
window.haltSnapshot = true;
|
||||||
}
|
}
|
||||||
@ -392,17 +395,14 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
||||||
href: document.location.href,
|
href: document.location.href,
|
||||||
html: document.documentElement?.outerHTML,
|
html: document.documentElement?.outerHTML,
|
||||||
htmlModifiedByJs: false,
|
|
||||||
text: document.body?.innerText,
|
text: document.body?.innerText,
|
||||||
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
||||||
parsed: parsed,
|
parsed: parsed,
|
||||||
imgs: [],
|
imgs: [],
|
||||||
maxElemDepth: domAnalysis.maxDepth,
|
maxElemDepth: domAnalysis.maxDepth,
|
||||||
elemCount: domAnalysis.elementCount,
|
elemCount: domAnalysis.elementCount,
|
||||||
|
lastMutationIdle,
|
||||||
};
|
};
|
||||||
if (initialHTML) {
|
|
||||||
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
|
|
||||||
}
|
|
||||||
if (document.baseURI !== r.href) {
|
if (document.baseURI !== r.href) {
|
||||||
r.rebase = document.baseURI;
|
r.rebase = document.baseURI;
|
||||||
}
|
}
|
||||||
@ -445,9 +445,20 @@ window.briefImgs = briefImgs;
|
|||||||
})();
|
})();
|
||||||
`;
|
`;
|
||||||
|
|
||||||
|
const documentResourceTypes = new Set([
|
||||||
|
'document', 'script', 'xhr', 'fetch', 'prefetch', 'eventsource', 'websocket', 'preflight'
|
||||||
|
]);
|
||||||
|
const mediaResourceTypes = new Set([
|
||||||
|
'stylesheet', 'image', 'font', 'media'
|
||||||
|
]);
|
||||||
|
|
||||||
|
|
||||||
class PageReqCtrlKit {
|
class PageReqCtrlKit {
|
||||||
reqSet: Set<HTTPRequest> = new Set();
|
reqSet: Set<HTTPRequest> = new Set();
|
||||||
blockers: Deferred<void>[] = [];
|
blockers: Deferred<void>[] = [];
|
||||||
|
lastResourceLoadedAt: number = 0;
|
||||||
|
lastContentResourceLoadedAt: number = 0;
|
||||||
|
lastMediaResourceLoadedAt: number = 0;
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
public concurrency: number,
|
public concurrency: number,
|
||||||
@ -472,6 +483,15 @@ class PageReqCtrlKit {
|
|||||||
this.reqSet.delete(req);
|
this.reqSet.delete(req);
|
||||||
const deferred = this.blockers.shift();
|
const deferred = this.blockers.shift();
|
||||||
deferred?.resolve();
|
deferred?.resolve();
|
||||||
|
const now = Date.now();
|
||||||
|
this.lastResourceLoadedAt = now;
|
||||||
|
const typ = req.resourceType();
|
||||||
|
if (documentResourceTypes.has(typ)) {
|
||||||
|
this.lastContentResourceLoadedAt = now;
|
||||||
|
}
|
||||||
|
if (mediaResourceTypes.has(typ)) {
|
||||||
|
this.lastMediaResourceLoadedAt = now;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -491,7 +511,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
lastPageCratedAt: number = 0;
|
lastPageCratedAt: number = 0;
|
||||||
ua: string = '';
|
ua: string = '';
|
||||||
|
|
||||||
concurrentRequestsPerPage: number = 16;
|
concurrentRequestsPerPage: number = 32;
|
||||||
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
|
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
|
||||||
|
|
||||||
lastReqSentAt: number = 0;
|
lastReqSentAt: number = 0;
|
||||||
@ -1050,6 +1070,11 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
snapshot = s;
|
snapshot = s;
|
||||||
|
if (snapshot) {
|
||||||
|
const kit = this.pageReqCtrl.get(page);
|
||||||
|
snapshot.lastContentResourceLoaded = kit?.lastContentResourceLoadedAt;
|
||||||
|
snapshot.lastMediaResourceLoaded = kit?.lastMediaResourceLoadedAt;
|
||||||
|
}
|
||||||
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user