mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 05:06:04 +08:00
security: detect abuse
This commit is contained in:
parent
908157b61e
commit
43dee08dcc
@ -5,7 +5,7 @@ import {
|
||||
AssertionFailureError, ParamValidationError, Defer,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||
import _ from 'lodash';
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
@ -22,6 +22,7 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||
import { PDFExtractor } from '../services/pdf-extract';
|
||||
import { DomainBlockade } from '../db/domain-blockade';
|
||||
|
||||
const md5Hasher = new HashManager('md5', 'hex');
|
||||
|
||||
@ -64,6 +65,7 @@ export class CrawlerHost extends RPCHost {
|
||||
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
||||
cacheValidMs = 1000 * 3600;
|
||||
urlValidMs = 1000 * 3600 * 4;
|
||||
abuseBlockMs = 1000 * 3600 * 24;
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
@ -87,6 +89,21 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
await this.setToCache(options.url, snapshot);
|
||||
});
|
||||
|
||||
// When the browser layer reports abuse, log it and persist a blockade for the offending hostname.
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
    const offendingUrl = abuseEvent.url;

    this.logger.warn(`Abuse detected on ${offendingUrl}, blocking ${offendingUrl.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });

    // The blockade is keyed by the lower-cased hostname and expires abuseBlockMs from now.
    const blockadeRecord = DomainBlockade.from({
        domain: offendingUrl.hostname.toLowerCase(),
        triggerReason: `${abuseEvent.reason}`,
        triggerUrl: offendingUrl.toString(),
        createdAt: new Date(),
        expireAt: new Date(Date.now() + this.abuseBlockMs),
    });

    // Best effort: a failed write is logged and otherwise ignored.
    await DomainBlockade.save(blockadeRecord).catch((err) => {
        this.logger.warn(`Failed to save domain blockade for ${offendingUrl.hostname}`, { err: marshalErrorLike(err) });
    });
});
|
||||
}
|
||||
|
||||
override async init() {
|
||||
@ -617,6 +634,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
path: 'url'
|
||||
});
|
||||
}
|
||||
// Refuse to crawl hosts that currently have an abuse blockade on record.
// Every record for the hostname is fetched (rather than a single one) because a host
// can accumulate several blockades over time, and an already-expired record must not
// hide a newer one that is still in force.
const blockadeCheckedAt = Date.now();
const blockade = (await DomainBlockade.fromFirestoreQuery(
    DomainBlockade.COLLECTION.where('domain', '==', urlToCrawl.hostname.toLowerCase())
)).find((record) => {
    const expiresAtMs = record.expireAt instanceof Date ? record.expireAt.getTime() : NaN;

    // A record stops blocking only once its expiry is provably in the past. A missing
    // or unreadable expireAt yields NaN, the comparison below is false, and the record
    // keeps blocking -- consistent with the 'Eternally' wording in the error message.
    return !(expiresAtMs <= blockadeCheckedAt);
});

if (blockade) {
    throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
|
||||
|
||||
const crawlOpts = this.configure(crawlerOptions);
|
||||
|
||||
|
30
backend/functions/src/db/domain-blockade.ts
Normal file
30
backend/functions/src/db/domain-blockade.ts
Normal file
@ -0,0 +1,30 @@
|
||||
import { Also, Prop } from 'civkit';
|
||||
import { FirestoreRecord } from '../shared/lib/firestore';
|
||||
|
||||
/**
 * Firestore record describing a domain that has been blocked, stored in the
 * `domainBlockades` collection.
 */
@Also({ dictOf: Object })
export class DomainBlockade extends FirestoreRecord {
    static override collectionName = 'domainBlockades';

    override _id!: string;

    /** Blocked domain (required). */
    @Prop({ required: true })
    domain!: string;

    /** Human-readable explanation of why the blockade was created (required). */
    @Prop({ required: true })
    triggerReason!: string;

    /** URL associated with the event that triggered the blockade, when known. */
    @Prop()
    triggerUrl?: string;

    /** Creation time of the record. */
    @Prop()
    createdAt!: Date;

    /** Time at which the blockade expires; optional. */
    @Prop()
    expireAt?: Date;

    // Open index signature: arbitrary additional keys are permitted on the record.
    [k: string]: any;
}
|
@ -10,7 +10,7 @@ import puppeteer from 'puppeteer-extra';
|
||||
|
||||
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
||||
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
||||
import { ServiceCrashedError } from '../shared/lib/errors';
|
||||
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
|
||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||
@ -270,17 +270,36 @@ function giveSnapshot(stopActiveSnapshot) {
|
||||
|
||||
await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
|
||||
|
||||
const domainSet = new Set<string>();
|
||||
let reqCounter = 0;
|
||||
|
||||
page.on('request', (req) => {
|
||||
reqCounter++;
|
||||
const requestUrl = req.url();
|
||||
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
const parsedUrl = new URL(requestUrl);
|
||||
domainSet.add(parsedUrl.hostname);
|
||||
|
||||
if (
|
||||
parsedUrl.hostname === 'localhost' ||
|
||||
parsedUrl.hostname.startsWith('127.')
|
||||
) {
|
||||
page.emit('abuse', { url: requestUrl, page, sn, reason: `Suspicious action: Request to localhost: ${requestUrl}` });
|
||||
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
|
||||
if (reqCounter > 200) {
|
||||
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests: ${reqCounter}` });
|
||||
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
|
||||
if (domainSet.size > 21) {
|
||||
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains (${domainSet.size})` });
|
||||
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
|
||||
@ -408,6 +427,12 @@ document.addEventListener('load', handlePageLoad);
|
||||
});
|
||||
};
|
||||
page.on('snapshot', hdl);
|
||||
// On the first abuse event from this page: re-emit it from this object with the
// event's url replaced by parsedUrl, then reject the pending snapshot deferred.
page.once('abuse', (abuseEvent: any) => {
    const forwardedEvent = { ...abuseEvent, url: parsedUrl };
    this.emit('abuse', forwardedEvent);

    const abuseError = new SecurityCompromiseError(`Abuse detected: ${abuseEvent.reason}`);
    nextSnapshotDeferred.reject(abuseError);
});
|
||||
|
||||
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
||||
.catch((err) => {
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit a3a13b13fbef8e9f5d388bde6fca6b459e6f92a6
|
||||
Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1
|
Loading…
x
Reference in New Issue
Block a user