fix: track whether snapshot HTML was modified by JS

This commit is contained in:
Yanlong Wang 2025-03-12 22:26:10 +08:00
parent 6027963670
commit 9415c6a2be
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 38 additions and 13 deletions

View File

@ -15,7 +15,7 @@ import { Defer } from 'civkit/defer';
import { retryWith } from 'civkit/decorators';
import { FancyFile } from 'civkit/fancy-file';
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';
import { Crawled } from '../db/crawled';
import { DomainBlockade } from '../db/domain-blockade';
@ -585,6 +585,7 @@ export class CrawlerHost extends RPCHost {
url: urlToCrawl.toString(),
createdAt: nowDate,
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
htmlModifiedByJs: snapshot.htmlModifiedByJs,
urlPathDigest: digest,
});
@ -732,6 +733,12 @@ export class CrawlerHost extends RPCHost {
cache = await this.queryCache(urlToCrawl, cacheTolerance);
}
if (cache?.htmlModifiedByJs === false) {
if (crawlerOpts) {
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
}
}
if (cache?.isFresh &&
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)

View File

@ -21,6 +21,9 @@ export class Crawled extends FirestoreRecord {
})
urlPathDigest!: string;
@Prop()
htmlModifiedByJs?: boolean;
@Prop()
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };

View File

@ -584,13 +584,6 @@ export class CrawlerOptions extends AutoCastable {
if (respondTiming) {
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
}
if (instance.timeout) {
instance.respondTiming ??= RESPOND_TIMING.NETWORK_IDLE;
}
if (instance.respondWith.includes('shot') || instance.respondWith.includes('vlm')) {
instance.respondTiming ??= RESPOND_TIMING.MEDIA_IDLE;
}
instance.respondTiming ??= RESPOND_TIMING.RESOURCE_IDLE;
if (instance.cacheTolerance) {
instance.cacheTolerance = instance.cacheTolerance * 1000;
@ -603,14 +596,29 @@ export class CrawlerOptions extends AutoCastable {
return instance;
}
/**
 * Resolves the respond timing that applies to these options, without
 * writing the result back to `respondTiming`.
 *
 * Precedence (first match wins):
 *  1. `respondTiming`, when it holds a truthy value;
 *  2. `RESPOND_TIMING.NETWORK_IDLE`, when `timeout` is truthy;
 *  3. `RESPOND_TIMING.MEDIA_IDLE`, when `respondWith` includes 'shot' or 'vlm';
 *  4. `RESPOND_TIMING.RESOURCE_IDLE` otherwise.
 *
 * @returns the explicit timing if set, else the fallback chosen by the rules above.
 */
get presumedRespondTiming() {
    const explicitTiming = this.respondTiming;
    if (explicitTiming) {
        return explicitTiming;
    }
    if (this.timeout) {
        return RESPOND_TIMING.NETWORK_IDLE;
    }

    // Both checks are kept as literal `includes` calls, evaluated in the same order as before.
    const mentionsShotOrVlm = this.respondWith.includes('shot') || this.respondWith.includes('vlm');

    return mentionsShotOrVlm ? RESPOND_TIMING.MEDIA_IDLE : RESPOND_TIMING.RESOURCE_IDLE;
}
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
if (this.waitForSelector?.length) {
return false;
}
if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
const presumedTiming = this.presumedRespondTiming;
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
return true;
}
if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
const now = Date.now();
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
return true;
@ -622,7 +630,7 @@ export class CrawlerOptions extends AutoCastable {
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
return false;
}
if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
if (presumedTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
const now = Date.now();
if ((snapshot.lastContentResourceLoaded + 500) < now) {
return true;
@ -632,10 +640,10 @@ export class CrawlerOptions extends AutoCastable {
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
return false;
}
if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
return false;
}
if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
if (presumedTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
return true;
}
if (this.respondWith.includes('lm')) {

View File

@ -55,6 +55,7 @@ export interface PageSnapshot {
href: string;
rebase?: string;
html: string;
htmlModifiedByJs?: boolean;
shadowExpanded?: string;
text: string;
status?: number;
@ -377,9 +378,11 @@ function shadowDomPresent(rootElement = document.documentElement) {
}
let lastMutationIdle = 0;
let initialHTML;
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
function giveSnapshot(stopActiveSnapshot) {
initialHTML ??= document.documentElement?.outerHTML;
if (stopActiveSnapshot) {
window.haltSnapshot = true;
}
@ -395,6 +398,7 @@ function giveSnapshot(stopActiveSnapshot) {
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
href: document.location.href,
html: document.documentElement?.outerHTML,
htmlModifiedByJs: false,
text: document.body?.innerText,
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
parsed: parsed,
@ -403,6 +407,9 @@ function giveSnapshot(stopActiveSnapshot) {
elemCount: domAnalysis.elementCount,
lastMutationIdle,
};
if (initialHTML) {
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
}
if (document.baseURI !== r.href) {
r.rebase = document.baseURI;
}