mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 19:59:10 +08:00
fix: track if snapshot html modified by js
This commit is contained in:
parent
6027963670
commit
9415c6a2be
@ -15,7 +15,7 @@ import { Defer } from 'civkit/defer';
|
|||||||
import { retryWith } from 'civkit/decorators';
|
import { retryWith } from 'civkit/decorators';
|
||||||
import { FancyFile } from 'civkit/fancy-file';
|
import { FancyFile } from 'civkit/fancy-file';
|
||||||
|
|
||||||
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
|
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options';
|
||||||
|
|
||||||
import { Crawled } from '../db/crawled';
|
import { Crawled } from '../db/crawled';
|
||||||
import { DomainBlockade } from '../db/domain-blockade';
|
import { DomainBlockade } from '../db/domain-blockade';
|
||||||
@ -585,6 +585,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
url: urlToCrawl.toString(),
|
url: urlToCrawl.toString(),
|
||||||
createdAt: nowDate,
|
createdAt: nowDate,
|
||||||
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
|
expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
|
||||||
|
htmlModifiedByJs: snapshot.htmlModifiedByJs,
|
||||||
urlPathDigest: digest,
|
urlPathDigest: digest,
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -732,6 +733,12 @@ export class CrawlerHost extends RPCHost {
|
|||||||
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (cache?.htmlModifiedByJs === false) {
|
||||||
|
if (crawlerOpts) {
|
||||||
|
crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (cache?.isFresh &&
|
if (cache?.isFresh &&
|
||||||
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
||||||
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
||||||
|
@ -21,6 +21,9 @@ export class Crawled extends FirestoreRecord {
|
|||||||
})
|
})
|
||||||
urlPathDigest!: string;
|
urlPathDigest!: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
htmlModifiedByJs?: boolean;
|
||||||
|
|
||||||
@Prop()
|
@Prop()
|
||||||
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
||||||
|
|
||||||
|
@ -584,13 +584,6 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if (respondTiming) {
|
if (respondTiming) {
|
||||||
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
|
instance.respondTiming ??= respondTiming as RESPOND_TIMING;
|
||||||
}
|
}
|
||||||
if (instance.timeout) {
|
|
||||||
instance.respondTiming ??= RESPOND_TIMING.NETWORK_IDLE;
|
|
||||||
}
|
|
||||||
if (instance.respondWith.includes('shot') || instance.respondWith.includes('vlm')) {
|
|
||||||
instance.respondTiming ??= RESPOND_TIMING.MEDIA_IDLE;
|
|
||||||
}
|
|
||||||
instance.respondTiming ??= RESPOND_TIMING.RESOURCE_IDLE;
|
|
||||||
|
|
||||||
if (instance.cacheTolerance) {
|
if (instance.cacheTolerance) {
|
||||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||||
@ -603,14 +596,29 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
return instance;
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
get presumedRespondTiming() {
|
||||||
|
if (this.respondTiming) {
|
||||||
|
return this.respondTiming;
|
||||||
|
}
|
||||||
|
if (this.timeout) {
|
||||||
|
return RESPOND_TIMING.NETWORK_IDLE;
|
||||||
|
}
|
||||||
|
if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) {
|
||||||
|
return RESPOND_TIMING.MEDIA_IDLE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return RESPOND_TIMING.RESOURCE_IDLE;
|
||||||
|
}
|
||||||
|
|
||||||
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
|
isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) {
|
||||||
if (this.waitForSelector?.length) {
|
if (this.waitForSelector?.length) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
const presumedTiming = this.presumedRespondTiming;
|
||||||
|
if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
|
if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) {
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) {
|
||||||
return true;
|
return true;
|
||||||
@ -622,7 +630,7 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
|
if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
|
if (presumedTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) {
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
if ((snapshot.lastContentResourceLoaded + 500) < now) {
|
if ((snapshot.lastContentResourceLoaded + 500) < now) {
|
||||||
return true;
|
return true;
|
||||||
@ -632,10 +640,10 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
|
if (presumedTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (this.respondWith.includes('lm')) {
|
if (this.respondWith.includes('lm')) {
|
||||||
|
@ -55,6 +55,7 @@ export interface PageSnapshot {
|
|||||||
href: string;
|
href: string;
|
||||||
rebase?: string;
|
rebase?: string;
|
||||||
html: string;
|
html: string;
|
||||||
|
htmlModifiedByJs?: boolean;
|
||||||
shadowExpanded?: string;
|
shadowExpanded?: string;
|
||||||
text: string;
|
text: string;
|
||||||
status?: number;
|
status?: number;
|
||||||
@ -377,9 +378,11 @@ function shadowDomPresent(rootElement = document.documentElement) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let lastMutationIdle = 0;
|
let lastMutationIdle = 0;
|
||||||
|
let initialHTML;
|
||||||
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now());
|
||||||
|
|
||||||
function giveSnapshot(stopActiveSnapshot) {
|
function giveSnapshot(stopActiveSnapshot) {
|
||||||
|
initialHTML ??= document.documentElement?.outerHTML;
|
||||||
if (stopActiveSnapshot) {
|
if (stopActiveSnapshot) {
|
||||||
window.haltSnapshot = true;
|
window.haltSnapshot = true;
|
||||||
}
|
}
|
||||||
@ -395,6 +398,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '',
|
||||||
href: document.location.href,
|
href: document.location.href,
|
||||||
html: document.documentElement?.outerHTML,
|
html: document.documentElement?.outerHTML,
|
||||||
|
htmlModifiedByJs: false,
|
||||||
text: document.body?.innerText,
|
text: document.body?.innerText,
|
||||||
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
||||||
parsed: parsed,
|
parsed: parsed,
|
||||||
@ -403,6 +407,9 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
elemCount: domAnalysis.elementCount,
|
elemCount: domAnalysis.elementCount,
|
||||||
lastMutationIdle,
|
lastMutationIdle,
|
||||||
};
|
};
|
||||||
|
if (initialHTML) {
|
||||||
|
r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded;
|
||||||
|
}
|
||||||
if (document.baseURI !== r.href) {
|
if (document.baseURI !== r.href) {
|
||||||
r.rebase = document.baseURI;
|
r.rebase = document.baseURI;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user