fix: dos abuse

This commit is contained in:
yanlong.wang 2024-07-01 18:40:14 +08:00
parent 57641c4608
commit 1bcb5a742e
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 36 additions and 3 deletions

View File

@ -881,7 +881,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts); yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
} catch (err: any) { } catch (err: any) {
if (cache) { if (cache && !(err instanceof SecurityCompromiseError)) {
this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) }); this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts); yield this.puppeteerControl.narrowSnapshot(cache.snapshot, crawlOpts);
return; return;

View File

@ -52,6 +52,7 @@ export interface PageSnapshot {
screenshot?: Buffer; screenshot?: Buffer;
imgs?: ImgBrief[]; imgs?: ImgBrief[];
pdfs?: string[]; pdfs?: string[];
maxElemDepth?: number;
} }
export interface ExtendedSnapshot extends PageSnapshot { export interface ExtendedSnapshot extends PageSnapshot {
@ -235,6 +236,32 @@ function briefPDFs() {
return x.src === 'about:blank' ? document.location.href : x.src; return x.src === 'about:blank' ? document.location.href : x.src;
}); });
} }
function getMaxDepthUsingTreeWalker(root) {
let maxDepth = 0;
let currentDepth = 0;
const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
while (true) {
maxDepth = Math.max(maxDepth, currentDepth);
if (treeWalker.firstChild()) {
currentDepth++;
} else {
while (!treeWalker.nextSibling() && currentDepth > 0) {
treeWalker.parentNode();
currentDepth--;
}
if (currentDepth <= 0) {
break;
}
}
}
return maxDepth + 1;
}
function giveSnapshot(stopActiveSnapshot) { function giveSnapshot(stopActiveSnapshot) {
if (stopActiveSnapshot) { if (stopActiveSnapshot) {
window.haltSnapshot = true; window.haltSnapshot = true;
@ -254,6 +281,7 @@ function giveSnapshot(stopActiveSnapshot) {
parsed: parsed, parsed: parsed,
imgs: [], imgs: [],
pdfs: briefPDFs(), pdfs: briefPDFs(),
maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
}; };
if (parsed && parsed.content) { if (parsed && parsed.content) {
const elem = document.createElement('div'); const elem = document.createElement('div');
@ -277,7 +305,7 @@ function giveSnapshot(stopActiveSnapshot) {
const domainSet = new Set<string>(); const domainSet = new Set<string>();
let reqCounter = 0; let reqCounter = 0;
const t0 = Date.now(); let t0: number | undefined;
let halt = false; let halt = false;
page.on('request', (req) => { page.on('request', (req) => {
@ -285,6 +313,7 @@ function giveSnapshot(stopActiveSnapshot) {
if (halt) { if (halt) {
return req.abort('blockedbyclient', 1000); return req.abort('blockedbyclient', 1000);
} }
t0 ??= Date.now();
const requestUrl = req.url(); const requestUrl = req.url();
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') { if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
return req.abort('blockedbyclient', 1000); return req.abort('blockedbyclient', 1000);
@ -446,6 +475,10 @@ document.addEventListener('load', handlePageLoad);
if (snapshot === s) { if (snapshot === s) {
return; return;
} }
if (s?.maxElemDepth && s.maxElemDepth > 256) {
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` });
return;
}
snapshot = s; snapshot = s;
nextSnapshotDeferred.resolve(s); nextSnapshotDeferred.resolve(s);
nextSnapshotDeferred = Defer(); nextSnapshotDeferred = Defer();
@ -516,7 +549,7 @@ document.addEventListener('load', handlePageLoad);
ckpt.push(delay(options.minIntervalMs)); ckpt.push(delay(options.minIntervalMs));
} }
let error; let error;
await Promise.race(ckpt).catch((err)=> error = err); await Promise.race(ckpt).catch((err) => error = err);
if (finalized) { if (finalized) {
yield { ...snapshot, screenshot } as PageSnapshot; yield { ...snapshot, screenshot } as PageSnapshot;
break; break;