security: detect abuse

This commit is contained in:
Yanlong Wang 2024-06-01 00:57:51 +08:00
parent 908157b61e
commit 43dee08dcc
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 82 additions and 3 deletions

View File

@ -5,7 +5,7 @@ import {
AssertionFailureError, ParamValidationError, Defer,
} from 'civkit';
import { singleton } from 'tsyringe';
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@ -22,6 +22,7 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai';
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { PDFExtractor } from '../services/pdf-extract';
import { DomainBlockade } from '../db/domain-blockade';
const md5Hasher = new HashManager('md5', 'hex');
@ -64,6 +65,7 @@ export class CrawlerHost extends RPCHost {
// Duration settings, all expressed in milliseconds.
cacheRetentionMs = 1000 * 3600 * 24 * 7; // 7 days
cacheValidMs = 1000 * 3600; // 1 hour
urlValidMs = 1000 * 3600 * 4; // 4 hours
// 24 hours; added to the current time to compute the `expireAt` of a
// DomainBlockade created when the puppeteer layer reports abuse.
abuseBlockMs = 1000 * 3600 * 24;
constructor(
protected globalLogger: Logger,
@ -87,6 +89,21 @@ export class CrawlerHost extends RPCHost {
await this.setToCache(options.url, snapshot);
});
// When the browser layer reports abuse, record a time-limited blockade for the
// offending page's hostname. Persistence is best-effort: a failed save is only
// logged and never rethrown out of the listener.
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
    const { url: offendingUrl, reason, sn } = abuseEvent;
    const offendingHost = offendingUrl.hostname;

    this.logger.warn(`Abuse detected on ${offendingUrl}, blocking ${offendingHost}`, { reason, sn });

    // The blockade is keyed by the lowercased hostname and expires
    // `abuseBlockMs` milliseconds from now.
    const blockadeRecord = DomainBlockade.from({
        domain: offendingHost.toLowerCase(),
        triggerReason: `${reason}`,
        triggerUrl: offendingUrl.toString(),
        createdAt: new Date(),
        expireAt: new Date(Date.now() + this.abuseBlockMs),
    });

    await DomainBlockade.save(blockadeRecord).catch((err) => {
        this.logger.warn(`Failed to save domain blockade for ${offendingHost}`, { err: marshalErrorLike(err) });
    });
});
}
override async init() {
@ -617,6 +634,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
path: 'url'
});
}
// Refuse to crawl hosts that have an ACTIVE abuse blockade.
//
// The lookup previously took the first record for the domain and never looked
// at `expireAt`, so a domain stayed blocked for as long as the record existed,
// even though the error message promises "blocked until <expireAt>".
// Expiry is evaluated here in code rather than with a Firestore range filter so
// that (a) records without `expireAt` still count as permanent blocks and
// (b) no additional composite index is required. `limit(1)` is dropped because
// the single returned record could be an expired one hiding an active one.
// NOTE(review): assumes `fromFirestoreQuery` resolves to an array and that
// `expireAt` is a Date (or Date-parsable) once loaded — confirm against
// FirestoreRecord. A value that cannot be parsed is treated as still active,
// which matches the previous (always-block) behavior.
const blockadeCandidates = await DomainBlockade.fromFirestoreQuery(
    DomainBlockade.COLLECTION.where('domain', '==', urlToCrawl.hostname.toLowerCase())
);
const blockadeCheckedAtMs = Date.now();
const blockade = blockadeCandidates.find((candidate) => {
    if (!candidate.expireAt) {
        // No expiry recorded: permanent blockade.
        return true;
    }
    const expireAtMs = new Date(candidate.expireAt).valueOf();

    return Number.isNaN(expireAtMs) || expireAtMs > blockadeCheckedAtMs;
});
if (blockade) {
    throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
const crawlOpts = this.configure(crawlerOptions);

View File

@ -0,0 +1,30 @@
import { Also, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
/**
 * Firestore-backed record describing a domain that is blocked from crawling.
 *
 * Stored in the `domainBlockades` collection. The crawler writes one of these
 * when abuse is detected on a page and looks records up by `domain` before
 * crawling a URL.
 */
// NOTE(review): `dictOf: Object` presumably tells civkit to accept arbitrary
// extra keys, matching the index signature below — confirm against civkit docs.
@Also({
dictOf: Object
})
export class DomainBlockade extends FirestoreRecord {
static override collectionName = 'domainBlockades';
override _id!: string;
/** Blocked hostname; the crawler writes and queries it in lowercase. */
@Prop({
required: true
})
domain!: string;
/** Human-readable reason for the blockade. */
@Prop({ required: true })
triggerReason!: string;
/** URL of the page on which the abuse was detected, when known. */
@Prop()
triggerUrl?: string;
/** When the blockade record was created. */
@Prop()
createdAt!: Date;
/** When the blockade lapses; the crawler reports a missing value as "Eternally". */
@Prop()
expireAt?: Date;
// Allows arbitrary additional properties on the record.
[k: string]: any;
}

View File

@ -10,7 +10,7 @@ import puppeteer from 'puppeteer-extra';
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
import { ServiceCrashedError } from '../shared/lib/errors';
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
import { Readability } from '@mozilla/readability';
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@ -270,17 +270,36 @@ function giveSnapshot(stopActiveSnapshot) {
await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
const domainSet = new Set<string>();
let reqCounter = 0;
page.on('request', (req) => {
reqCounter++;
const requestUrl = req.url();
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
return req.abort('blockedbyclient', 1000);
}
const parsedUrl = new URL(requestUrl);
domainSet.add(parsedUrl.hostname);
// Guard 1: the page must never reach services on the crawler host itself.
// `URL#hostname` is already lowercased and IPv4-normalised for http(s) URLs,
// so only a trailing root dot ("localhost.") needs stripping before comparing.
// Besides `localhost` and 127.0.0.0/8 this also rejects `*.localhost`,
// `0.0.0.0` and the IPv6 loopback `[::1]`, which previously slipped through.
const loopbackCandidateHost = parsedUrl.hostname.replace(/\.$/, '');
if (
    loopbackCandidateHost === 'localhost' ||
    loopbackCandidateHost.endsWith('.localhost') ||
    loopbackCandidateHost.startsWith('127.') ||
    loopbackCandidateHost === '0.0.0.0' ||
    loopbackCandidateHost === '[::1]'
) {
    page.emit('abuse', { url: requestUrl, page, sn, reason: `Suspicious action: Request to localhost: ${requestUrl}` });

    return req.abort('blockedbyclient', 1000);
}

// Guard 2: more than 200 requests counted by this interceptor.
if (reqCounter > 200) {
    page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests: ${reqCounter}` });

    return req.abort('blockedbyclient', 1000);
}

// Guard 3: requests fanned out to more than 21 distinct hostnames.
if (domainSet.size > 21) {
    page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains (${domainSet.size})` });

    return req.abort('blockedbyclient', 1000);
}
@ -408,6 +427,12 @@ document.addEventListener('load', handlePageLoad);
});
};
page.on('snapshot', hdl);
// First abuse report from this page: forward it on this emitter with `url`
// replaced by `parsedUrl`, then reject the pending snapshot deferred.
page.once('abuse', (abuseInfo: any) => {
    const forwardedEvent = { ...abuseInfo, url: parsedUrl };
    this.emit('abuse', forwardedEvent);

    const rejection = new SecurityCompromiseError(`Abuse detected: ${abuseInfo.reason}`);
    nextSnapshotDeferred.reject(rejection);
});
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
.catch((err) => {

@ -1 +1 @@
Subproject commit a3a13b13fbef8e9f5d388bde6fca6b459e6f92a6
Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1