mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 23:45:56 +08:00
fix
This commit is contained in:
parent
06f359309e
commit
6be6051aa7
@ -590,33 +590,26 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ExtraScrappingOptions, crawlerOpts?: CrawlerOptions) {
|
||||||
|
let overrideFinalSnapshot;
|
||||||
if (crawlerOpts?.html) {
|
if (crawlerOpts?.html) {
|
||||||
const fakeSnapshot = {
|
overrideFinalSnapshot = {
|
||||||
href: urlToCrawl.toString(),
|
href: urlToCrawl.toString(),
|
||||||
html: crawlerOpts.html,
|
html: crawlerOpts.html,
|
||||||
title: '',
|
title: '',
|
||||||
text: '',
|
text: '',
|
||||||
} as PageSnapshot;
|
} as PageSnapshot;
|
||||||
|
|
||||||
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crawlerOpts?.pdf) {
|
if (crawlerOpts?.pdf) {
|
||||||
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
||||||
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
||||||
const fakeSnapshot = {
|
overrideFinalSnapshot = {
|
||||||
href: urlToCrawl.toString(),
|
href: urlToCrawl.toString(),
|
||||||
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
||||||
title: '',
|
title: '',
|
||||||
text: '',
|
text: '',
|
||||||
pdfs: [pdfDataUrl],
|
pdfs: [pdfDataUrl],
|
||||||
} as PageSnapshot;
|
} as PageSnapshot;
|
||||||
|
|
||||||
yield this.jsdomControl.narrowSnapshot(fakeSnapshot, crawlOpts);
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
||||||
@ -668,6 +661,12 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (overrideFinalSnapshot) {
|
||||||
|
yield this.jsdomControl.narrowSnapshot(overrideFinalSnapshot, crawlOpts);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
let cache;
|
let cache;
|
||||||
|
|
||||||
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user