diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 6dfdae4..7dc5e56 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -15,7 +15,7 @@ import { Defer } from 'civkit/defer'; import { retryWith } from 'civkit/decorators'; import { FancyFile } from 'civkit/fancy-file'; -import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options'; +import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE, RESPOND_TIMING } from '../dto/crawler-options'; import { Crawled } from '../db/crawled'; import { DomainBlockade } from '../db/domain-blockade'; @@ -585,6 +585,7 @@ export class CrawlerHost extends RPCHost { url: urlToCrawl.toString(), createdAt: nowDate, expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs), + htmlModifiedByJs: snapshot.htmlModifiedByJs, urlPathDigest: digest, }); @@ -732,6 +733,12 @@ export class CrawlerHost extends RPCHost { cache = await this.queryCache(urlToCrawl, cacheTolerance); } + if (cache?.htmlModifiedByJs === false) { + if (crawlerOpts) { + crawlerOpts.respondTiming ??= RESPOND_TIMING.HTML; + } + } + if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) && (_.get(cache.snapshot, 'locale') === crawlOpts?.locale) diff --git a/src/db/crawled.ts b/src/db/crawled.ts index 4ec64f1..ee5379f 100644 --- a/src/db/crawled.ts +++ b/src/db/crawled.ts @@ -21,6 +21,9 @@ export class Crawled extends FirestoreRecord { }) urlPathDigest!: string; + @Prop() + htmlModifiedByJs?: boolean; + @Prop() snapshot?: PageSnapshot & { screenshot: never; pageshot: never; }; diff --git a/src/dto/crawler-options.ts b/src/dto/crawler-options.ts index a358a64..a33471b 100644 --- a/src/dto/crawler-options.ts +++ b/src/dto/crawler-options.ts @@ -584,13 +584,6 @@ export class CrawlerOptions extends AutoCastable { if (respondTiming) { instance.respondTiming ??= respondTiming as RESPOND_TIMING; } - if (instance.timeout) { - instance.respondTiming ??= RESPOND_TIMING.NETWORK_IDLE; - } - if (instance.respondWith.includes('shot') || instance.respondWith.includes('vlm')) { - instance.respondTiming ??= RESPOND_TIMING.MEDIA_IDLE; - } - instance.respondTiming ??= RESPOND_TIMING.RESOURCE_IDLE; if (instance.cacheTolerance) { instance.cacheTolerance = instance.cacheTolerance * 1000; @@ -603,14 +596,29 @@ export class CrawlerOptions extends AutoCastable { return instance; } + get presumedRespondTiming() { + if (this.respondTiming) { + return this.respondTiming; + } + if (this.timeout) { + return RESPOND_TIMING.NETWORK_IDLE; + } + if (this.respondWith.includes('shot') || this.respondWith.includes('vlm')) { + return RESPOND_TIMING.MEDIA_IDLE; + } + + return RESPOND_TIMING.RESOURCE_IDLE; + } + isSnapshotAcceptableForEarlyResponse(snapshot: PageSnapshot) { if (this.waitForSelector?.length) { return false; } - if (this.respondTiming === RESPOND_TIMING.HTML && snapshot.html) { + const presumedTiming = this.presumedRespondTiming; + if (presumedTiming === RESPOND_TIMING.HTML && snapshot.html) { return true; } - if (this.respondTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) { + if (presumedTiming === RESPOND_TIMING.MEDIA_IDLE && snapshot.lastMediaResourceLoaded && snapshot.lastMutationIdle) { const now = Date.now(); if ((Math.max(snapshot.lastMediaResourceLoaded, snapshot.lastContentResourceLoaded || 0) + 500) < now) { return true; @@ -622,7 +630,7 @@ export class CrawlerOptions extends AutoCastable { if ((this.respondWith.includes('vlm') || this.respondWith.includes('screenshot')) && !snapshot.screenshot) { return false; } - if (this.respondTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) { + if (presumedTiming === RESPOND_TIMING.RESOURCE_IDLE && snapshot.lastContentResourceLoaded && snapshot.lastMutationIdle) { const now = Date.now(); if ((snapshot.lastContentResourceLoaded + 500) < now) { return true; @@ -632,10 +640,10 @@ export class CrawlerOptions extends AutoCastable { if (this.injectFrameScript?.length || this.injectPageScript?.length) { return false; } - if (this.respondTiming === RESPOND_TIMING.NETWORK_IDLE) { + if (presumedTiming === RESPOND_TIMING.NETWORK_IDLE) { return false; } - if (this.respondTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) { + if (presumedTiming === RESPOND_TIMING.MUTATION_IDLE && snapshot.lastMutationIdle) { return true; } if (this.respondWith.includes('lm')) { diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 6f17d8e..2f29f29 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -55,6 +55,7 @@ export interface PageSnapshot { href: string; rebase?: string; html: string; + htmlModifiedByJs?: boolean; shadowExpanded?: string; text: string; status?: number; @@ -377,9 +378,11 @@ function shadowDomPresent(rootElement = document.documentElement) { } let lastMutationIdle = 0; +let initialHTML; document.addEventListener('mutationIdle', ()=> lastMutationIdle = Date.now()); function giveSnapshot(stopActiveSnapshot) { + initialHTML ??= document.documentElement?.outerHTML; if (stopActiveSnapshot) { window.haltSnapshot = true; } @@ -395,6 +398,7 @@ function giveSnapshot(stopActiveSnapshot) { description: document.head?.querySelector('meta[name="description"]')?.getAttribute('content') ?? '', href: document.location.href, html: document.documentElement?.outerHTML, + htmlModifiedByJs: false, text: document.body?.innerText, shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined, parsed: parsed, @@ -403,6 +407,9 @@ function giveSnapshot(stopActiveSnapshot) { elemCount: domAnalysis.elementCount, lastMutationIdle, }; + if (initialHTML) { + r.htmlModifiedByJs = initialHTML !== r.html && !r.shadowExpanded; + } if (document.baseURI !== r.href) { r.rebase = document.baseURI; }