fix: potential circular crawling

This commit is contained in:
Yanlong Wang 2024-06-02 23:23:39 +08:00
parent be91371b93
commit f0668a96b4
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 24 additions and 8 deletions

View File

@@ -581,6 +581,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
{ contentType: 'text/plain', envelope: null }
);
}
// Prevent circular crawling
this.puppeteerControl.circuitBreakerHosts.add(
ctx.req.hostname.toLowerCase()
);
if (uid) {
const user = await auth.assertUser();
if (!(user.wallet.total_balance > 0)) {
@@ -638,15 +644,17 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
path: 'url'
});
}
const blockade = (await DomainBlockade.fromFirestoreQuery(
DomainBlockade.COLLECTION
.where('domain', '==', urlToCrawl.hostname.toLowerCase())
.where('expireAt', '>=', new Date())
.limit(1)
))[0];
if (blockade && !uid) {
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
if (!uid) {
const blockade = (await DomainBlockade.fromFirestoreQuery(
DomainBlockade.COLLECTION
.where('domain', '==', urlToCrawl.hostname.toLowerCase())
.where('expireAt', '>=', new Date())
.limit(1)
))[0];
if (blockade) {
throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`);
}
}
const crawlOpts = this.configure(crawlerOptions);

View File

@@ -100,6 +100,8 @@ export class PuppeteerControl extends AsyncService {
livePages = new Set<Page>();
lastPageCratedAt: number = 0;
circuitBreakerHosts: Set<string> = new Set();
constructor(
protected globalLogger: Logger,
) {
@@ -285,6 +287,12 @@ function giveSnapshot(stopActiveSnapshot) {
const parsedUrl = new URL(requestUrl);
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
return req.abort('blockedbyclient', 1000);
}
if (
parsedUrl.hostname === 'localhost' ||
parsedUrl.hostname.startsWith('127.')