mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 22:35:56 +08:00
fix: crawling IP url
This commit is contained in:
parent
67d4a9f45a
commit
e2a187d126
@ -13,6 +13,7 @@ import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
|
||||
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
|
||||
import { TimeoutError } from 'puppeteer';
|
||||
import _ from 'lodash';
|
||||
import { isIP } from 'net';
|
||||
const tldExtract = require('tld-extract');
|
||||
|
||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||
@ -570,15 +571,19 @@ export class PuppeteerControl extends AsyncService {
|
||||
if (!requestUrl.startsWith('http:') && !requestUrl.startsWith('https:') && !requestUrl.startsWith('chrome-extension:') && requestUrl !== 'about:blank') {
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
|
||||
const parsedUrl = new URL(requestUrl);
|
||||
try {
|
||||
const tldParsed = tldExtract(requestUrl);
|
||||
domainSet.add(tldParsed.domain);
|
||||
if (isIP(parsedUrl.hostname)) {
|
||||
domainSet.add(parsedUrl.hostname);
|
||||
} else {
|
||||
const tldParsed = tldExtract(requestUrl);
|
||||
domainSet.add(tldParsed.domain);
|
||||
}
|
||||
} catch (err) {
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
|
||||
const parsedUrl = new URL(requestUrl);
|
||||
|
||||
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
|
||||
page.emit('abuse', { url: requestUrl, page, sn, reason: `Abusive request: ${requestUrl}` });
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
|
Loading…
x
Reference in New Issue
Block a user