fix: crawling URLs whose host is an IP address

2025-08-18 22:35:56 +08:00 · 2024-11-11 15:30:48 +08:00 · 2024-11-11 15:30:48 +08:00 · e2a187d126
commit e2a187d126
parent 67d4a9f45a
1 changed files with 9 additions and 4 deletions
--- a/backend/functions/src/services/puppeteer.ts
+++ b/backend/functions/src/services/puppeteer.ts
@ -13,6 +13,7 @@ import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
 import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
 import { TimeoutError } from 'puppeteer';
 import _ from 'lodash';
+import { isIP } from 'net';
 const tldExtract = require('tld-extract');

 const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@ -570,15 +571,19 @@ export class PuppeteerControl extends AsyncService {
            if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') {
                return req.abort('blockedbyclient', 1000);
            }
+
+            const parsedUrl = new URL(requestUrl);
            try {
-                const tldParsed = tldExtract(requestUrl);
-                domainSet.add(tldParsed.domain);
+                if (isIP(parsedUrl.hostname)) {
+                    domainSet.add(parsedUrl.hostname);
+                } else {
+                    const tldParsed = tldExtract(requestUrl);
+                    domainSet.add(tldParsed.domain);
+                }
            } catch (err) {
                return req.abort('blockedbyclient', 1000);
            }

-            const parsedUrl = new URL(requestUrl);
-
            if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
                page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
                return req.abort('blockedbyclient', 1000);