This commit is contained in:
yanlong.wang 2025-01-15 17:50:03 +08:00
parent 06f359309e
commit 6be6051aa7
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37

View File

@@ -590,33 +590,26 @@ export class CrawlerHost extends RPCHost {
} }
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) { async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
let overrideFinalSnapshot;
if (crawlerOpts?.html) { if (crawlerOpts?.html) {
const fakeSnapshot = { overrideFinalSnapshot = {
href: urlToCrawl.toString(), href: urlToCrawl.toString(),
html: crawlerOpts.html, html: crawlerOpts.html,
title: '', title: '',
text: '', text: '',
} as PageSnapshot; } as PageSnapshot;
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
return;
} }
if (crawlerOpts?.pdf) { if (crawlerOpts?.pdf) {
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64'); const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`; const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
const fakeSnapshot = { overrideFinalSnapshot = {
href: urlToCrawl.toString(), href: urlToCrawl.toString(),
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`, html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
title: '', title: '',
text: '', text: '',
pdfs: [pdfDataUrl], pdfs: [pdfDataUrl],
} as PageSnapshot; } as PageSnapshot;
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
return;
} }
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) { if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
@@ -668,6 +661,12 @@ export class CrawlerHost extends RPCHost {
return; return;
} }
if (overrideFinalSnapshot) {
yield this.jsdomControl.narrowSnapshot(overrideFinalSnapshot, crawlOpts);
return;
}
let cache; let cache;
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) { if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {