From 43dee08dcc6d2f7989d2e2963421c9f353e91d10 Mon Sep 17 00:00:00 2001
From: Yanlong Wang
Date: Sat, 1 Jun 2024 00:57:51 +0800
Subject: [PATCH] security: detect abuse

---
Review notes (text in this section is not part of the commit message):

- Line breaks, indentation and blank context lines were reconstructed from a
  whitespace-collapsed copy of this patch. Hunk line counts are consistent,
  but verify the context against the target tree before applying.
- crawler.ts, last hunk: the blockade lookup now ignores records whose
  `expireAt` has passed. The abuse handler stores `expireAt = now +
  abuseBlockMs`, but the lookup previously blocked on any record for the
  domain, expired or not. `.limit(1)` was dropped so an expired record cannot
  hide an active one. Records without `expireAt` still block.
  NOTE(review): assumes `expireAt` is hydrated as a `Date`, as declared on
  `DomainBlockade` -- confirm against `FirestoreRecord`.

 .../functions/src/cloud-functions/crawler.ts  | 26 +++++++++++++++-
 backend/functions/src/db/domain-blockade.ts   | 30 +++++++++++++++++++
 backend/functions/src/services/puppeteer.ts   | 27 ++++++++++++++++-
 thinapps-shared                               |  2 +-
 4 files changed, 82 insertions(+), 3 deletions(-)
 create mode 100644 backend/functions/src/db/domain-blockade.ts

diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts
index 9890a95..7e6b8b2 100644
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@@ -5,7 +5,7 @@ import {
     AssertionFailureError, ParamValidationError, Defer,
 } from 'civkit';
 import { singleton } from 'tsyringe';
-import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
+import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
 import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
 import _ from 'lodash';
 import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@@ -22,6 +22,7 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai';
 import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
 import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 import { PDFExtractor } from '../services/pdf-extract';
+import { DomainBlockade } from '../db/domain-blockade';
 
 const md5Hasher = new HashManager('md5', 'hex');
 
@@ -64,6 +65,7 @@ export class CrawlerHost extends RPCHost {
     cacheRetentionMs = 1000 * 3600 * 24 * 7;
     cacheValidMs = 1000 * 3600;
     urlValidMs = 1000 * 3600 * 4;
+    abuseBlockMs = 1000 * 3600 * 24;
 
     constructor(
         protected globalLogger: Logger,
@@ -87,6 +89,21 @@ export class CrawlerHost extends RPCHost {
 
             await this.setToCache(options.url, snapshot);
         });
+
+        puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
+            this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });
+
+            await DomainBlockade.save(DomainBlockade.from({
+                domain: abuseEvent.url.hostname.toLowerCase(),
+                triggerReason: `${abuseEvent.reason}`,
+                triggerUrl: abuseEvent.url.toString(),
+                createdAt: new Date(),
+                expireAt: new Date(Date.now() + this.abuseBlockMs),
+            })).catch((err) => {
+                this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: marshalErrorLike(err) });
+            });
+
+        });
     }
 
     override async init() {
@@ -617,6 +634,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
                 path: 'url'
             });
         }
+        const blockade = (await DomainBlockade.fromFirestoreQuery(
+            DomainBlockade.COLLECTION.where('domain', '==', urlToCrawl.hostname.toLowerCase())
+        )).find((candidate) => !candidate.expireAt || candidate.expireAt.getTime() > Date.now()); // expired blockades no longer block
+
+        if (blockade) {
+            throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
+        }
 
         const crawlOpts = this.configure(crawlerOptions);
 
diff --git a/backend/functions/src/db/domain-blockade.ts b/backend/functions/src/db/domain-blockade.ts
new file mode 100644
index 0000000..72955a2
--- /dev/null
+++ b/backend/functions/src/db/domain-blockade.ts
@@ -0,0 +1,30 @@
+import { Also, Prop } from 'civkit';
+import { FirestoreRecord } from '../shared/lib/firestore';
+
+@Also({
+    dictOf: Object
+})
+export class DomainBlockade extends FirestoreRecord {
+    static override collectionName = 'domainBlockades';
+
+    override _id!: string;
+
+    @Prop({
+        required: true
+    })
+    domain!: string;
+
+    @Prop({ required: true })
+    triggerReason!: string;
+
+    @Prop()
+    triggerUrl?: string;
+
+    @Prop()
+    createdAt!: Date;
+
+    @Prop()
+    expireAt?: Date;
+
+    [k: string]: any;
+}
diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts
index 5afdbe0..c0ca96e 100644
--- a/backend/functions/src/services/puppeteer.ts
+++ b/backend/functions/src/services/puppeteer.ts
@@ -10,7 +10,7 @@ import puppeteer from 'puppeteer-extra';
 
 import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
 import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
-import { ServiceCrashedError } from '../shared/lib/errors';
+import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
 import { Readability } from '@mozilla/readability';
 
 const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@@ -270,17 +270,36 @@ function giveSnapshot(stopActiveSnapshot) {
 
         await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
 
+        const domainSet = new Set();
+        let reqCounter = 0;
+
         page.on('request', (req) => {
+            reqCounter++;
             const requestUrl = req.url();
             if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
                 return req.abort('blockedbyclient', 1000);
             }
 
             const parsedUrl = new URL(requestUrl);
+            domainSet.add(parsedUrl.hostname);
             if (
                 parsedUrl.hostname === 'localhost' ||
                 parsedUrl.hostname.startsWith('127.')
             ) {
+                page.emit('abuse', { url: requestUrl, page, sn, reason: `Suspicious action: Request to localhost: ${requestUrl}` });
+
+                return req.abort('blockedbyclient', 1000);
+            }
+
+            if (reqCounter > 200) {
+                page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests: ${reqCounter}` });
+
+                return req.abort('blockedbyclient', 1000);
+            }
+
+            if (domainSet.size > 21) {
+                page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains (${domainSet.size})` });
+
                 return req.abort('blockedbyclient', 1000);
             }
 
@@ -408,6 +427,12 @@ document.addEventListener('load', handlePageLoad);
                 });
             };
             page.on('snapshot', hdl);
+            page.once('abuse', (event: any) => {
+                this.emit('abuse', { ...event, url: parsedUrl });
+                nextSnapshotDeferred.reject(
+                    new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
+                );
+            });
 
             const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
                 .catch((err) => {
diff --git a/thinapps-shared b/thinapps-shared
index a3a13b1..d360d01 160000
--- a/thinapps-shared
+++ b/thinapps-shared
@@ -1 +1 @@
-Subproject commit a3a13b13fbef8e9f5d388bde6fca6b459e6f92a6
+Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1