diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 936eafa..d3ec7aa 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -590,33 +590,26 @@ export class CrawlerHost extends RPCHost { } async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) { + let overrideFinalSnapshot; if (crawlerOpts?.html) { - const fakeSnapshot = { + overrideFinalSnapshot = { href: urlToCrawl.toString(), html: crawlerOpts.html, title: '', text: '', } as PageSnapshot; - - yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts); - - return; } if (crawlerOpts?.pdf) { const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64'); const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`; - const fakeSnapshot = { + overrideFinalSnapshot = { href: urlToCrawl.toString(), html: ``, title: '', text: '', pdfs: [pdfDataUrl], } as PageSnapshot; - - yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts); - - return; } if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) { @@ -668,6 +661,12 @@ export class CrawlerHost extends RPCHost { return; } + if (overrideFinalSnapshot) { + yield this.jsdomControl.narrowSnapshot(overrideFinalSnapshot, crawlOpts); + + return; + } + let cache; if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {