mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 20:15:56 +08:00
fix: another approach to suspected DoS abuse
This commit is contained in:
parent
e658e8102c
commit
0a33207f8f
@ -375,12 +375,22 @@ export class CrawlerHost extends RPCHost {
|
|||||||
let contentText = '';
|
let contentText = '';
|
||||||
const imageSummary = {} as { [k: string]: string; };
|
const imageSummary = {} as { [k: string]: string; };
|
||||||
const imageIdxTrack = new Map<string, number[]>();
|
const imageIdxTrack = new Map<string, number[]>();
|
||||||
|
const uid = this.threadLocal.get('uid');
|
||||||
do {
|
do {
|
||||||
if (pdfMode) {
|
if (pdfMode) {
|
||||||
contentText = snapshot.parsed?.content || snapshot.text;
|
contentText = snapshot.parsed?.content || snapshot.text;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
snapshot.maxElemDepth! > 256 ||
|
||||||
|
(!uid && snapshot.elemCount! > 10_000) ||
|
||||||
|
snapshot.text.length > 70_000
|
||||||
|
) {
|
||||||
|
contentText = snapshot.text;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
||||||
let toBeTurnedToMd = jsDomElementOfHTML;
|
let toBeTurnedToMd = jsDomElementOfHTML;
|
||||||
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
||||||
|
@ -334,6 +334,15 @@ export class SearcherHost extends RPCHost {
|
|||||||
r.description = upstreamSearchResult.description;
|
r.description = upstreamSearchResult.description;
|
||||||
|
|
||||||
return r;
|
return r;
|
||||||
|
}).catch((err)=> {
|
||||||
|
this.logger.error(`Failed to format snapshot for ${urls[i].href}`, { err: marshalErrorLike(err) });
|
||||||
|
|
||||||
|
return {
|
||||||
|
url: upstreamSearchResult.url,
|
||||||
|
title: upstreamSearchResult.title,
|
||||||
|
description: upstreamSearchResult.description,
|
||||||
|
content: x.text,
|
||||||
|
};
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -11,7 +11,6 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
|||||||
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
||||||
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
||||||
import { TimeoutError } from 'puppeteer';
|
import { TimeoutError } from 'puppeteer';
|
||||||
import { AsyncContext } from '../shared';
|
|
||||||
const tldExtract = require('tld-extract');
|
const tldExtract = require('tld-extract');
|
||||||
|
|
||||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||||
@ -129,7 +128,7 @@ function getMaxDepthAndCountUsingTreeWalker(root) {
|
|||||||
NodeFilter.SHOW_ELEMENT,
|
NodeFilter.SHOW_ELEMENT,
|
||||||
(node) => {
|
(node) => {
|
||||||
const nodeName = node.nodeName.toLowerCase();
|
const nodeName = node.nodeName.toLowerCase();
|
||||||
return (nodeName === 'svg' || nodeName === 'code') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
|
return (nodeName === 'svg') ? NodeFilter.FILTER_REJECT : NodeFilter.FILTER_ACCEPT;
|
||||||
},
|
},
|
||||||
false
|
false
|
||||||
);
|
);
|
||||||
@ -215,7 +214,6 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
protected threadLocal: AsyncContext,
|
|
||||||
) {
|
) {
|
||||||
super(...arguments);
|
super(...arguments);
|
||||||
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
||||||
@ -491,17 +489,13 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
if (snapshot === s) {
|
if (snapshot === s) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
snapshot = s;
|
||||||
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
if (s?.maxElemDepth && s.maxElemDepth > 256) {
|
||||||
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: DOM tree too deep` });
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (s?.elemCount && s.elemCount > 20_000) {
|
if (s?.elemCount && s.elemCount > 10_000) {
|
||||||
if (!this.threadLocal.get('uid')) {
|
return;
|
||||||
page.emit('abuse', { url, page, sn, reason: `DoS attack suspected: too many DOM elements` });
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
snapshot = s;
|
|
||||||
nextSnapshotDeferred.resolve(s);
|
nextSnapshotDeferred.resolve(s);
|
||||||
nextSnapshotDeferred = Defer();
|
nextSnapshotDeferred = Defer();
|
||||||
this.once('crippled', crippleListener);
|
this.once('crippled', crippleListener);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user