From f0668a96b4cbb040c51c3916176bd072a313c7ea Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Sun, 2 Jun 2024 23:23:39 +0800 Subject: [PATCH] fix: potential circular crawling --- .../functions/src/cloud-functions/crawler.ts | 24 ++++++++++++------- backend/functions/src/services/puppeteer.ts | 8 +++++++ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index f9a4e6b..c1fecdc 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -581,6 +581,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; { contentType: 'text/plain', envelope: null } ); } + + // Prevent circular crawling + this.puppeteerControl.circuitBreakerHosts.add( + ctx.req.hostname.toLowerCase() + ); + if (uid) { const user = await auth.assertUser(); if (!(user.wallet.total_balance > 0)) { @@ -638,15 +644,17 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; path: 'url' }); } - const blockade = (await DomainBlockade.fromFirestoreQuery( - DomainBlockade.COLLECTION - .where('domain', '==', urlToCrawl.hostname.toLowerCase()) - .where('expireAt', '>=', new Date()) - .limit(1) - ))[0]; - if (blockade && !uid) { - throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); + if (!uid) { + const blockade = (await DomainBlockade.fromFirestoreQuery( + DomainBlockade.COLLECTION + .where('domain', '==', urlToCrawl.hostname.toLowerCase()) + .where('expireAt', '>=', new Date()) + .limit(1) + ))[0]; + if (blockade) { + throw new SecurityCompromiseError(`Domain ${urlToCrawl.hostname} blocked until ${blockade.expireAt || 'Eternally'} due to previous abuse found on ${blockade.triggerUrl || 'site'}: ${blockade.triggerReason}`); + } } const crawlOpts = this.configure(crawlerOptions); diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index a91b5c4..620ee82 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -100,6 +100,8 @@ export class PuppeteerControl extends AsyncService { livePages = new Set(); lastPageCratedAt: number = 0; + circuitBreakerHosts: Set = new Set(); + constructor( protected globalLogger: Logger, ) { @@ -285,6 +287,12 @@ function giveSnapshot(stopActiveSnapshot) { const parsedUrl = new URL(requestUrl); + if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) { + page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` }); + + return req.abort('blockedbyclient', 1000); + } + if ( parsedUrl.hostname === 'localhost' || parsedUrl.hostname.startsWith('127.')