mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 21:05:54 +08:00
fix: pdf detection
This commit is contained in:
parent
94170db060
commit
607407f740
@ -11,6 +11,7 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|||||||
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
||||||
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
||||||
import { TimeoutError } from 'puppeteer';
|
import { TimeoutError } from 'puppeteer';
|
||||||
|
import _ from 'lodash';
|
||||||
const tldExtract = require('tld-extract');
|
const tldExtract = require('tld-extract');
|
||||||
|
|
||||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||||
@ -114,13 +115,6 @@ function briefImgs(elem) {
|
|||||||
};
|
};
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
function briefPDFs() {
|
|
||||||
const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
|
|
||||||
|
|
||||||
return pdfTags.map((x)=> {
|
|
||||||
return x.src === 'about:blank' ? document.location.href : x.src;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
function getMaxDepthAndCountUsingTreeWalker(root) {
|
function getMaxDepthAndCountUsingTreeWalker(root) {
|
||||||
let maxDepth = 0;
|
let maxDepth = 0;
|
||||||
let currentDepth = 0;
|
let currentDepth = 0;
|
||||||
@ -178,7 +172,6 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
text: document.body?.innerText,
|
text: document.body?.innerText,
|
||||||
parsed: parsed,
|
parsed: parsed,
|
||||||
imgs: [],
|
imgs: [],
|
||||||
pdfs: briefPDFs(),
|
|
||||||
maxElemDepth: domAnalysis.maxDepth,
|
maxElemDepth: domAnalysis.maxDepth,
|
||||||
elemCount: domAnalysis.elementCount,
|
elemCount: domAnalysis.elementCount,
|
||||||
};
|
};
|
||||||
@ -324,7 +317,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
t0 ??= Date.now();
|
t0 ??= Date.now();
|
||||||
const requestUrl = req.url();
|
const requestUrl = req.url();
|
||||||
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') {
|
||||||
return req.abort('blockedbyclient', 1000);
|
return req.abort('blockedbyclient', 1000);
|
||||||
}
|
}
|
||||||
const tldParsed = tldExtract(requestUrl);
|
const tldParsed = tldExtract(requestUrl);
|
||||||
@ -469,7 +462,19 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
let snapshot: PageSnapshot | undefined;
|
let snapshot: PageSnapshot | undefined;
|
||||||
let screenshot: Buffer | undefined;
|
let screenshot: Buffer | undefined;
|
||||||
let pageshot: Buffer | undefined;
|
let pageshot: Buffer | undefined;
|
||||||
|
const pdfUrls: string[] = [];
|
||||||
const page = await this.getNextPage();
|
const page = await this.getNextPage();
|
||||||
|
page.on('response', (resp) => {
|
||||||
|
if (!resp.ok()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const headers = resp.headers();
|
||||||
|
const url = resp.url();
|
||||||
|
const contentType = headers['content-type'];
|
||||||
|
if (contentType?.toLowerCase().includes('application/pdf')) {
|
||||||
|
pdfUrls.push(url);
|
||||||
|
}
|
||||||
|
});
|
||||||
const sn = this.snMap.get(page);
|
const sn = this.snMap.get(page);
|
||||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||||
|
|
||||||
@ -619,7 +624,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||||
this.emit(
|
this.emit(
|
||||||
'crawled',
|
'crawled',
|
||||||
{ ...snapshot, screenshot, pageshot },
|
{ ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot, },
|
||||||
{ ...options, url: parsedUrl }
|
{ ...options, url: parsedUrl }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -672,7 +677,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
}
|
}
|
||||||
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
||||||
}
|
}
|
||||||
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||||
@ -681,7 +686,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
lastHTML = snapshot.html;
|
lastHTML = snapshot.html;
|
||||||
}
|
}
|
||||||
if (snapshot || screenshot) {
|
if (snapshot || screenshot) {
|
||||||
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
yield { ...snapshot, pdfs: _.uniq(pdfUrls), screenshot, pageshot } as PageSnapshot;
|
||||||
}
|
}
|
||||||
if (error) {
|
if (error) {
|
||||||
throw error;
|
throw error;
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 4af413f68207157d099ee99a2c056298b833dcd1
|
Subproject commit d2b0fbf184b4c77e80e8d1dd36b3f4d1807e0e09
|
Loading…
x
Reference in New Issue
Block a user