mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-18 15:35:57 +08:00
fix: track if snapshot html modified by js
This commit is contained in:
parent
6027963670
commit
9415c6a2be
@ -15,7 +15,7 @@ import { Defer } from 'civkit/defer';
|
||||
import { retryWith } from 'civkit/decorators';
|
||||
import { FancyFile } from 'civkit/fancy-file';
|
||||
|
||||
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
|
||||
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';
|
||||
|
||||
import { Crawled } from '../db/crawled';
|
||||
import { DomainBlockade } from '../db/domain-blockade';
|
||||
@ -585,6 +585,7 @@ export class CrawlerHost extends RPCHost {
|
||||
url: urlToCrawl.toString(),
|
||||
createdAt: nowDate,
|
||||
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
|
||||
htmlModifiedByJs: snapshot.htmlModifiedByJs,
|
||||
urlPathDigest: digest,
|
||||
});
|
||||
|
||||
@ -732,6 +733,12 @@ export class CrawlerHost extends RPCHost {
|
||||
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
||||
}
|
||||
|
||||
if (cache?.htmlModifiedByJs === false) {
|
||||
if (crawlerOpts) {
|
||||
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
|
||||
}
|
||||
}
|
||||
|
||||
if (cache?.isFresh &&
|
||||
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
||||
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
||||
|
@ -21,6 +21,9 @@ export class Crawled extends FirestoreRecord {
|
||||
})
|
||||
urlPathDigest!: string;
|
||||
|
||||
@Prop()
|
||||
htmlModifiedByJs?: boolean;
|
||||
|
||||
@Prop()
|
||||
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
||||
|
||||
|
@ -584,13 +584,6 @@ export class CrawlerOptions extends AutoCastable {
|
||||
if (respondTiming) {
|
||||
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
|
||||
}
|
||||
if (instance.timeout) {
|
||||
instance.respondTiming ??= RESPOND_TIMING.NETWORK_IDLE;
|
||||
}
|
||||
if (instance.respondWith.includes('shot') || instance.respondWith.includes('vlm')) {
|
||||
instance.respondTiming ??= RESPOND_TIMING.MEDIA_IDLE;
|
||||
}
|
||||
instance.respondTiming ??= RESPOND_TIMING.RESOURCE_IDLE;
|
||||
|
||||
if (instance.cacheTolerance) {
|
||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||
@ -603,14 +596,29 @@ export class CrawlerOptions extends AutoCastable {
|
||||
return instance;
|
||||
}
|
||||
|
||||
get presumedRespondTiming() {
|
||||
if (this.respondTiming) {
|
||||
return this.respondTiming;
|
||||
}
|
||||
if (this.timeout) {
|
||||
return RESPOND_TIMING.NETWORK_IDLE;
|
||||
}
|
||||
if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
|
||||
return RESPOND_TIMING.MEDIA_IDLE;
|
||||
}
|
||||
|
||||
return RESPOND_TIMING.RESOURCE_IDLE;
|
||||
}
|
||||
|
||||
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
|
||||
if (this.waitForSelector?.length) {
|
||||
return false;
|
||||
}
|
||||
if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
||||
const presumedTiming = this.presumedRespondTiming;
|
||||
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
||||
return true;
|
||||
}
|
||||
if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
|
||||
if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
|
||||
const now = Date.now();
|
||||
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
||||
return true;
|
||||
@ -622,7 +630,7 @@ export class CrawlerOptions extends AutoCastable {
|
||||
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
|
||||
return false;
|
||||
}
|
||||
if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
|
||||
if (presumedTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
|
||||
const now = Date.now();
|
||||
if ((snapshot.lastContentResourceLoaded + 500) < now) {
|
||||
return true;
|
||||
@ -632,10 +640,10 @@ export class CrawlerOptions extends AutoCastable {
|
||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||
return false;
|
||||
}
|
||||
if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
||||
if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
||||
return false;
|
||||
}
|
||||
if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
|
||||
if (presumedTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
|
||||
return true;
|
||||
}
|
||||
if (this.respondWith.includes('lm')) {
|
||||
|
@ -55,6 +55,7 @@ export interface PageSnapshot {
|
||||
href: string;
|
||||
rebase?: string;
|
||||
html: string;
|
||||
htmlModifiedByJs?: boolean;
|
||||
shadowExpanded?: string;
|
||||
text: string;
|
||||
status?: number;
|
||||
@ -377,9 +378,11 @@ function shadowDomPresent(rootElement = document.documentElement) {
|
||||
}
|
||||
|
||||
let lastMutationIdle = 0;
|
||||
let initialHTML;
|
||||
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
||||
|
||||
function giveSnapshot(stopActiveSnapshot) {
|
||||
initialHTML ??= document.documentElement?.outerHTML;
|
||||
if (stopActiveSnapshot) {
|
||||
window.haltSnapshot = true;
|
||||
}
|
||||
@ -395,6 +398,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
||||
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
||||
href: document.location.href,
|
||||
html: document.documentElement?.outerHTML,
|
||||
htmlModifiedByJs: false,
|
||||
text: document.body?.innerText,
|
||||
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
||||
parsed: parsed,
|
||||
@ -403,6 +407,9 @@ function giveSnapshot(stopActiveSnapshot) {
|
||||
elemCount: domAnalysis.elementCount,
|
||||
lastMutationIdle,
|
||||
};
|
||||
if (initialHTML) {
|
||||
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
|
||||
}
|
||||
if (document.baseURI !== r.href) {
|
||||
r.rebase = document.baseURI;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user