From d2bebec60fe6b13958caaf3aacf6a392b7db83c7 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Sat, 1 Jun 2024 02:01:12 +0800 Subject: [PATCH] fix: abuse blocker --- backend/functions/package-lock.json | 4 ++-- backend/functions/package.json | 1 + backend/functions/src/cloud-functions/crawler.ts | 2 +- backend/functions/src/services/puppeteer.ts | 9 ++++++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index f7038ab..2071418 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -38,6 +38,7 @@ "set-cookie-parser": "^2.6.0", "stripe": "^11.11.0", "tiktoken": "^1.0.10", + "tld-extract": "^2.1.0", "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", "undici": "^5.24.0" @@ -11306,8 +11307,7 @@ "node_modules/tld-extract": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/tld-extract/-/tld-extract-2.1.0.tgz", - "integrity": "sha512-Y9QHWIoDQPJJVm3/pOC7kOfOj7vsNSVZl4JGoEHb605FiwZgIfzSMyU0HC0wYw5Cx8435vaG1yGZtIm1yiQGOw==", - "optional": true + "integrity": "sha512-Y9QHWIoDQPJJVm3/pOC7kOfOj7vsNSVZl4JGoEHb605FiwZgIfzSMyU0HC0wYw5Cx8435vaG1yGZtIm1yiQGOw==" }, "node_modules/tmpl": { "version": "1.0.5", diff --git a/backend/functions/package.json b/backend/functions/package.json index ad55c3f..fa5aaa6 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -58,6 +58,7 @@ "set-cookie-parser": "^2.6.0", "stripe": "^11.11.0", "tiktoken": "^1.0.10", + "tld-extract": "^2.1.0", "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", "undici": "^5.24.0" diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 7e6b8b2..efd8d23 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -65,7 +65,7 @@ export class CrawlerHost extends RPCHost { cacheRetentionMs = 1000 * 3600 * 24 * 7; cacheValidMs = 1000 * 3600; urlValidMs = 1000 * 3600 * 4; - abuseBlockMs = 1000 * 3600 * 24; + abuseBlockMs = 1000 * 3600; constructor( protected globalLogger: Logger, diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 52ac4a6..a91b5c4 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -12,6 +12,7 @@ import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy'; import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors'; import { Readability } from '@mozilla/readability'; +const tldExtract = require('tld-extract'); const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); @@ -279,8 +280,10 @@ function giveSnapshot(stopActiveSnapshot) { if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') { return req.abort('blockedbyclient', 1000); } + const tldParsed = tldExtract(requestUrl); + domainSet.add(tldParsed.domain); + const parsedUrl = new URL(requestUrl); - domainSet.add(parsedUrl.hostname); if ( parsedUrl.hostname === 'localhost' || @@ -291,13 +294,13 @@ function giveSnapshot(stopActiveSnapshot) { return req.abort('blockedbyclient', 1000); } - if (reqCounter > 200) { + if (reqCounter > 2000) { page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests: ${reqCounter}` }); return req.abort('blockedbyclient', 1000); } - if (domainSet.size > 51) { + if (domainSet.size > 200) { page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains (${domainSet.size})` }); return req.abort('blockedbyclient', 1000);