fix: guard invalid domain names

This commit is contained in:
Yanlong Wang 2025-03-08 22:21:25 +08:00
parent 4830ff5fda
commit 4ca627c0c5
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 28 additions and 13 deletions

8
package-lock.json generated
View File

@ -17,7 +17,7 @@
"axios": "^1.3.3", "axios": "^1.3.3",
"bcrypt": "^5.1.0", "bcrypt": "^5.1.0",
"busboy": "^1.6.0", "busboy": "^1.6.0",
"civkit": "^0.8.4-6ed9027", "civkit": "^0.8.4-bc8ef5e",
"core-js": "^3.37.1", "core-js": "^3.37.1",
"cors": "^2.8.5", "cors": "^2.8.5",
"dayjs": "^1.11.9", "dayjs": "^1.11.9",
@ -4095,9 +4095,9 @@
} }
}, },
"node_modules/civkit": { "node_modules/civkit": {
"version": "0.8.4-6ed9027", "version": "0.8.4-bc8ef5e",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-6ed9027.tgz", "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-bc8ef5e.tgz",
"integrity": "sha512-VU8Ykik1L16Li9/QZfw5wYsmu3jJYH/zIHbM6Vd2ajRI7Mh4fSO3cXadUntM190BersLW9Fts+qunDPabhIWZA==", "integrity": "sha512-WpybmXgLxUmqrqTeCsWmVFRSEq/3up34kFfByEssXARom5XcvB9uAHzXHhPXmob3m9BGxBXAALD04UBOUq0J4g==",
"license": "AGPL", "license": "AGPL",
"dependencies": { "dependencies": {
"lodash": "^4.17.21", "lodash": "^4.17.21",

View File

@ -25,7 +25,7 @@
"axios": "^1.3.3", "axios": "^1.3.3",
"bcrypt": "^5.1.0", "bcrypt": "^5.1.0",
"busboy": "^1.6.0", "busboy": "^1.6.0",
"civkit": "^0.8.4-6ed9027", "civkit": "^0.8.4-bc8ef5e",
"core-js": "^3.37.1", "core-js": "^3.37.1",
"cors": "^2.8.5", "cors": "^2.8.5",
"dayjs": "^1.11.9", "dayjs": "^1.11.9",

View File

@ -42,6 +42,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { RobotsTxtService } from '../services/robots-text'; import { RobotsTxtService } from '../services/robots-text';
import { lookup } from 'dns/promises';
import { isIP } from 'net';
export interface ExtraScrappingOptions extends ScrappingOptions { export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted'; withIframe?: boolean | 'quoted';
@ -465,7 +467,7 @@ export class CrawlerHost extends RPCHost {
const targetUrlFromGet = originPath.slice(1); const targetUrlFromGet = originPath.slice(1);
if (crawlerOptions.pdf) { if (crawlerOptions.pdf) {
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64'); const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
url = `file://pdf.${md5Hasher.hash(pdfBuf)}`; url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`;
} else if (targetUrlFromGet) { } else if (targetUrlFromGet) {
url = targetUrlFromGet.trim(); url = targetUrlFromGet.trim();
} else if (crawlerOptions.url) { } else if (crawlerOptions.url) {
@ -495,13 +497,26 @@ export class CrawlerHost extends RPCHost {
}); });
} }
if (!['http:', 'https:', 'file:'].includes(result.protocol)) { if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
throw new ParamValidationError({ throw new ParamValidationError({
message: `Invalid protocol ${result.protocol}`, message: `Invalid protocol ${result.protocol}`,
path: 'url' path: 'url'
}); });
} }
if (!isIP(result.hostname)) {
await lookup(result.hostname).catch((err) => {
if (err.code === 'ENOTFOUND') {
return Promise.reject(new ParamValidationError({
message: `Domain '${result.hostname}' could not be resolved`,
path: 'url'
}));
}
return;
});
}
return result; return result;
} }

View File

@ -605,15 +605,15 @@ export class PuppeteerControl extends AsyncService {
} }
const parsedUrl = new URL(requestUrl); const parsedUrl = new URL(requestUrl);
try { if (isIP(parsedUrl.hostname)) {
if (isIP(parsedUrl.hostname)) { domainSet.add(parsedUrl.hostname);
domainSet.add(parsedUrl.hostname); } else {
} else { try {
const tldParsed = tldExtract(requestUrl); const tldParsed = tldExtract(requestUrl);
domainSet.add(tldParsed.domain); domainSet.add(tldParsed.domain);
} catch (_err) {
domainSet.add(parsedUrl.hostname);
} }
} catch (err) {
return req.abort('blockedbyclient', 1000);
} }
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) { if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {