From e2a187d126cf6b9e680d47320f1b7e7a7ab7a88e Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Mon, 11 Nov 2024 15:30:48 +0800 Subject: [PATCH] fix: crawling IP url --- backend/functions/src/services/puppeteer.ts | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 9b19887..67de96f 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -13,6 +13,7 @@ import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy'; import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors'; import { TimeoutError } from 'puppeteer'; import _ from 'lodash'; +import { isIP } from 'net'; const tldExtract = require('tld-extract'); const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); @@ -570,15 +571,19 @@ export class PuppeteerControl extends AsyncService { if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') { return req.abort('blockedbyclient', 1000); } + + const parsedUrl = new URL(requestUrl); try { - const tldParsed = tldExtract(requestUrl); - domainSet.add(tldParsed.domain); + if (isIP(parsedUrl.hostname)) { + domainSet.add(parsedUrl.hostname); + } else { + const tldParsed = tldExtract(requestUrl); + domainSet.add(tldParsed.domain); + } } catch (err) { return req.abort('blockedbyclient', 1000); } - const parsedUrl = new URL(requestUrl); - if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) { page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` }); return req.abort('blockedbyclient', 1000);