mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 11:15:57 +08:00
fix: dos abuse
This commit is contained in:
parent
57641c4608
commit
1bcb5a742e
@ -881,7 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
|
|
||||||
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
|
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
if (cache) {
|
if (cache && !(err instanceof SecurityCompromiseError)) {
|
||||||
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
|
||||||
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
||||||
return;
|
return;
|
||||||
|
@ -52,6 +52,7 @@ export interface PageSnapshot {
|
|||||||
screenshot?: Buffer;
|
screenshot?: Buffer;
|
||||||
imgs?: ImgBrief[];
|
imgs?: ImgBrief[];
|
||||||
pdfs?: string[];
|
pdfs?: string[];
|
||||||
|
maxElemDepth?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ExtendedSnapshot extends PageSnapshot {
|
export interface ExtendedSnapshot extends PageSnapshot {
|
||||||
@ -235,6 +236,32 @@ function briefPDFs() {
|
|||||||
return x.src === 'about:blank' ? document.location.href : x.src;
|
return x.src === 'about:blank' ? document.location.href : x.src;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
function getMaxDepthUsingTreeWalker(root) {
|
||||||
|
let maxDepth = 0;
|
||||||
|
let currentDepth = 0;
|
||||||
|
|
||||||
|
const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
maxDepth = Math.max(maxDepth, currentDepth);
|
||||||
|
|
||||||
|
if (treeWalker.firstChild()) {
|
||||||
|
currentDepth++;
|
||||||
|
} else {
|
||||||
|
while (!treeWalker.nextSibling() && currentDepth > 0) {
|
||||||
|
treeWalker.parentNode();
|
||||||
|
currentDepth--;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (currentDepth <= 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return maxDepth + 1;
|
||||||
|
}
|
||||||
|
|
||||||
function giveSnapshot(stopActiveSnapshot) {
|
function giveSnapshot(stopActiveSnapshot) {
|
||||||
if (stopActiveSnapshot) {
|
if (stopActiveSnapshot) {
|
||||||
window.haltSnapshot = true;
|
window.haltSnapshot = true;
|
||||||
@ -254,6 +281,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
parsed: parsed,
|
parsed: parsed,
|
||||||
imgs: [],
|
imgs: [],
|
||||||
pdfs: briefPDFs(),
|
pdfs: briefPDFs(),
|
||||||
|
maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
|
||||||
};
|
};
|
||||||
if (parsed && parsed.content) {
|
if (parsed && parsed.content) {
|
||||||
const elem = document.createElement('div');
|
const elem = document.createElement('div');
|
||||||
@ -277,7 +305,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
|
|
||||||
const domainSet = new Set<string>();
|
const domainSet = new Set<string>();
|
||||||
let reqCounter = 0;
|
let reqCounter = 0;
|
||||||
const t0 = Date.now();
|
let t0: number | undefined;
|
||||||
let halt = false;
|
let halt = false;
|
||||||
|
|
||||||
page.on('request', (req) => {
|
page.on('request', (req) => {
|
||||||
@ -285,6 +313,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
if (halt) {
|
if (halt) {
|
||||||
return req.abort('blockedbyclient', 1000);
|
return req.abort('blockedbyclient', 1000);
|
||||||
}
|
}
|
||||||
|
t0 ??= Date.now();
|
||||||
const requestUrl = req.url();
|
const requestUrl = req.url();
|
||||||
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
||||||
return req.abort('blockedbyclient', 1000);
|
return req.abort('blockedbyclient', 1000);
|
||||||
@ -446,6 +475,10 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
if (snapshot === s) {
|
if (snapshot === s) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
||||||
|
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` });
|
||||||
|
return;
|
||||||
|
}
|
||||||
snapshot = s;
|
snapshot = s;
|
||||||
nextSnapshotDeferred.resolve(s);
|
nextSnapshotDeferred.resolve(s);
|
||||||
nextSnapshotDeferred = Defer();
|
nextSnapshotDeferred = Defer();
|
||||||
@ -516,7 +549,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
ckpt.push(delay(options.minIntervalMs));
|
ckpt.push(delay(options.minIntervalMs));
|
||||||
}
|
}
|
||||||
let error;
|
let error;
|
||||||
await Promise.race(ckpt).catch((err)=> error = err);
|
await Promise.race(ckpt).catch((err) => error = err);
|
||||||
if (finalized) {
|
if (finalized) {
|
||||||
yield { ...snapshot, screenshot } as PageSnapshot;
|
yield { ...snapshot, screenshot } as PageSnapshot;
|
||||||
break;
|
break;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user