mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 19:15:57 +08:00
security: detect abuse
This commit is contained in:
parent
908157b61e
commit
43dee08dcc
@ -5,7 +5,7 @@ import {
|
|||||||
AssertionFailureError, ParamValidationError, Defer,
|
AssertionFailureError, ParamValidationError, Defer,
|
||||||
} from 'civkit';
|
} from 'civkit';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
||||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||||
@ -22,6 +22,7 @@ import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
|||||||
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
||||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||||
import { PDFExtractor } from '../services/pdf-extract';
|
import { PDFExtractor } from '../services/pdf-extract';
|
||||||
|
import { DomainBlockade } from '../db/domain-blockade';
|
||||||
|
|
||||||
const md5Hasher = new HashManager('md5', 'hex');
|
const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
@ -64,6 +65,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
||||||
cacheValidMs = 1000 * 3600;
|
cacheValidMs = 1000 * 3600;
|
||||||
urlValidMs = 1000 * 3600 * 4;
|
urlValidMs = 1000 * 3600 * 4;
|
||||||
|
abuseBlockMs = 1000 * 3600 * 24;
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
@ -87,6 +89,21 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
await this.setToCache(options.url, snapshot);
|
await this.setToCache(options.url, snapshot);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Reacts to 'abuse' events emitted by puppeteerControl: logs a warning, then
// records a DomainBlockade for the offending URL's hostname.
// The save is best-effort: a failed write is logged by the .catch() below and
// is not rethrown from this handler.
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
||||||
|
this.logger.warn(`Abuse detected on ${abuseEvent.url}, blocking ${abuseEvent.url.hostname}`, { reason: abuseEvent.reason, sn: abuseEvent.sn });
|
||||||
|

||||||
|
await DomainBlockade.save(DomainBlockade.from({
|
||||||
|
// Hostname is stored lowercased; the lookup elsewhere in this diff lowercases the hostname too.
domain: abuseEvent.url.hostname.toLowerCase(),
|
||||||
|
triggerReason: `${abuseEvent.reason}`,
|
||||||
|
triggerUrl: abuseEvent.url.toString(),
|
||||||
|
createdAt: new Date(),
|
||||||
|
// Expiry is "now + abuseBlockMs"; abuseBlockMs is declared on this class as 1000 * 3600 * 24 ms.
expireAt: new Date(Date.now() + this.abuseBlockMs),
|
||||||
|
})).catch((err) => {
|
||||||
|
this.logger.warn(`Failed to save domain blockade for ${abuseEvent.url.hostname}`, { err: marshalErrorLike(err) });
|
||||||
|
});
|
||||||
|

||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
override async init() {
|
override async init() {
|
||||||
@ -617,6 +634,13 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
path: 'url'
|
path: 'url'
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
// Refuse to crawl hosts that have a recorded blockade: fetch at most one
// DomainBlockade whose `domain` equals the lowercased target hostname.
// NOTE(review): the query filters on `domain` only and never compares
// `expireAt` with the current time, so a record past its expiry still blocks
// the domain unless something else removes it (presumably a Firestore TTL
// policy on `expireAt` — confirm).
const blockade = (await DomainBlockade.fromFirestoreQuery(
|
||||||
|
DomainBlockade.COLLECTION.where('domain', '==', urlToCrawl.hostname.toLowerCase()).limit(1)
|
||||||
|
))[0];
|
||||||
|

||||||
|
if (blockade) {
|
||||||
|
// The message falls back to 'Eternally' / 'site' when the record has no expireAt / triggerUrl.
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
|
||||||
|
}
|
||||||
|
|
||||||
const crawlOpts = this.configure(crawlerOptions);
|
const crawlOpts = this.configure(crawlerOptions);
|
||||||
|
|
||||||
|
30
backend/functions/src/db/domain-blockade.ts
Normal file
30
backend/functions/src/db/domain-blockade.ts
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
import { Also, Prop } from 'civkit';
|
||||||
|
import { FirestoreRecord } from '../shared/lib/firestore';
|
||||||
|
|
||||||
|
/**
 * Firestore record marking a domain as blocked after abuse was detected
 * while crawling it. Records live in the `domainBlockades` collection.
 *
 * In this commit the crawler writes one record per 'abuse' event and looks
 * records up by `domain` before crawling a URL.
 */
// NOTE(review): `dictOf: Object` presumably tells civkit to accept arbitrary
// extra keys, matching the index signature at the bottom — confirm against civkit.
@Also({
|
||||||
|
dictOf: Object
|
||||||
|
})
|
||||||
|
export class DomainBlockade extends FirestoreRecord {
|
||||||
|
static override collectionName = 'domainBlockades';
|
||||||
|

||||||
|
override _id!: string;
|
||||||
|

||||||
|
/** Blocked hostname (required). The crawler stores and queries it lowercased. */
@Prop({
|
||||||
|
required: true
|
||||||
|
})
|
||||||
|
domain!: string;
|
||||||
|

||||||
|
/** Human-readable reason reported by the abuse detector (required). */
@Prop({ required: true })
|
||||||
|
triggerReason!: string;
|
||||||
|

||||||
|
/** URL whose crawl triggered the blockade; optional. */
@Prop()
|
||||||
|
triggerUrl?: string;
|
||||||
|

||||||
|
/** Creation time. Declared with `!` but not marked `required` in @Prop. */
@Prop()
|
||||||
|
createdAt!: Date;
|
||||||
|

||||||
|
/** Time after which the blockade is meant to lapse; when absent the crawler reports the block as 'Eternally'. */
@Prop()
|
||||||
|
expireAt?: Date;
|
||||||
|

||||||
|
// Index signature: any additional string-keyed property is allowed and typed as `any`.
[k: string]: any;
|
||||||
|
}
|
@ -10,7 +10,7 @@ import puppeteer from 'puppeteer-extra';
|
|||||||
|
|
||||||
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
||||||
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
||||||
import { ServiceCrashedError } from '../shared/lib/errors';
|
import { SecurityCompromiseError, ServiceCrashedError } from '../shared/lib/errors';
|
||||||
import { Readability } from '@mozilla/readability';
|
import { Readability } from '@mozilla/readability';
|
||||||
|
|
||||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||||
@ -270,17 +270,36 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
|
|
||||||
await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
|
await page.goto('about:blank', { waitUntil: 'domcontentloaded' });
|
||||||
|
|
||||||
|
const domainSet = new Set<string>();
|
||||||
|
let reqCounter = 0;
|
||||||
|
|
||||||
page.on('request', (req) => {
|
page.on('request', (req) => {
|
||||||
|
reqCounter++;
|
||||||
const requestUrl = req.url();
|
const requestUrl = req.url();
|
||||||
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
if (!requestUrl.startsWith("http:") && !requestUrl.startsWith("https:") && requestUrl !== 'about:blank') {
|
||||||
return req.abort('blockedbyclient', 1000);
|
return req.abort('blockedbyclient', 1000);
|
||||||
}
|
}
|
||||||
const parsedUrl = new URL(requestUrl);
|
const parsedUrl = new URL(requestUrl);
|
||||||
|
domainSet.add(parsedUrl.hostname);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
parsedUrl.hostname === 'localhost' ||
|
parsedUrl.hostname === 'localhost' ||
|
||||||
parsedUrl.hostname.startsWith('127.')
|
parsedUrl.hostname.startsWith('127.')
|
||||||
) {
|
) {
|
||||||
|
page.emit('abuse', { url: requestUrl, page, sn, reason: `Suspicious action: Request to localhost: ${requestUrl}` });
|
||||||
|
|
||||||
|
return req.abort('blockedbyclient', 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (reqCounter > 200) {
|
||||||
|
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many requests: ${reqCounter}` });
|
||||||
|
|
||||||
|
return req.abort('blockedbyclient', 1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (domainSet.size > 21) {
|
||||||
|
page.emit('abuse', { url: requestUrl, page, sn, reason: `DDoS attack suspected: Too many domains (${domainSet.size})` });
|
||||||
|
|
||||||
return req.abort('blockedbyclient', 1000);
|
return req.abort('blockedbyclient', 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -408,6 +427,12 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
page.on('snapshot', hdl);
|
page.on('snapshot', hdl);
|
||||||
|
// Handle the first 'abuse' event raised on this page (registered with `once`,
// so later events on the same registration are ignored here): forward it on
// `this` and fail the pending snapshot with a SecurityCompromiseError.
page.once('abuse', (event: any) => {
|
||||||
|
// The forwarded payload keeps the page-level fields (the request handler emits `page`, `sn`, `reason`)
// but overrides `url` with `parsedUrl` from the enclosing scope — presumably the URL being scraped; confirm.
this.emit('abuse', { ...event, url: parsedUrl });
|
||||||
|
nextSnapshotDeferred.reject(
|
||||||
|
new SecurityCompromiseError(`Abuse detected: ${event.reason}`)
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit a3a13b13fbef8e9f5d388bde6fca6b459e6f92a6
|
Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1
|
Loading…
x
Reference in New Issue
Block a user