mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 00:45:57 +08:00
fix: dos abuse
This commit is contained in:
parent
57641c4608
commit
1bcb5a742e
@ -881,7 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
|
||||
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
|
||||
} catch (err: any) {
|
||||
if (cache) {
|
||||
if (cache && !(err instanceof SecurityCompromiseError)) {
|
||||
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
||||
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
||||
return;
|
||||
|
@ -52,6 +52,7 @@ export interface PageSnapshot {
|
||||
screenshot?: Buffer;
|
||||
imgs?: ImgBrief[];
|
||||
pdfs?: string[];
|
||||
maxElemDepth?: number;
|
||||
}
|
||||
|
||||
export interface ExtendedSnapshot extends PageSnapshot {
|
||||
@ -235,6 +236,32 @@ function briefPDFs() {
|
||||
return x.src === 'about:blank' ? document.location.href : x.src;
|
||||
});
|
||||
}
|
||||
function getMaxDepthUsingTreeWalker(root) {
|
||||
let maxDepth = 0;
|
||||
let currentDepth = 0;
|
||||
|
||||
const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
|
||||
|
||||
while (true) {
|
||||
maxDepth = Math.max(maxDepth, currentDepth);
|
||||
|
||||
if (treeWalker.firstChild()) {
|
||||
currentDepth++;
|
||||
} else {
|
||||
while (!treeWalker.nextSibling() && currentDepth > 0) {
|
||||
treeWalker.parentNode();
|
||||
currentDepth--;
|
||||
}
|
||||
|
||||
if (currentDepth <= 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return maxDepth + 1;
|
||||
}
|
||||
|
||||
function giveSnapshot(stopActiveSnapshot) {
|
||||
if (stopActiveSnapshot) {
|
||||
window.haltSnapshot = true;
|
||||
@ -254,6 +281,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
||||
parsed: parsed,
|
||||
imgs: [],
|
||||
pdfs: briefPDFs(),
|
||||
maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
|
||||
};
|
||||
if (parsed && parsed.content) {
|
||||
const elem = document.createElement('div');
|
||||
@ -277,7 +305,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
||||
|
||||
const domainSet = new Set<string>();
|
||||
let reqCounter = 0;
|
||||
const t0 = Date.now();
|
||||
let t0: number | undefined;
|
||||
let halt = false;
|
||||
|
||||
page.on('request', (req) => {
|
||||
@ -285,6 +313,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
||||
if (halt) {
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
t0 ??= Date.now();
|
||||
const requestUrl = req.url();
|
||||
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
@ -446,6 +475,10 @@ document.addEventListener('load', handlePageLoad);
|
||||
if (snapshot === s) {
|
||||
return;
|
||||
}
|
||||
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
||||
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` });
|
||||
return;
|
||||
}
|
||||
snapshot = s;
|
||||
nextSnapshotDeferred.resolve(s);
|
||||
nextSnapshotDeferred = Defer();
|
||||
|
Loading…
x
Reference in New Issue
Block a user