fix: guard invalid domain names

This commit is contained in:
Yanlong Wang 2025-03-08 22:21:25 +08:00
parent 4830ff5fda
commit 4ca627c0c5
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 28 additions and 13 deletions

8
package-lock.json generated
View File

@ -17,7 +17,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.4-6ed9027",
"civkit": "^0.8.4-bc8ef5e",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
@ -4095,9 +4095,9 @@
}
},
"node_modules/civkit": {
"version": "0.8.4-6ed9027",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-6ed9027.tgz",
"integrity": "sha512-VU8Ykik1L16Li9/QZfw5wYsmu3jJYH/zIHbM6Vd2ajRI7Mh4fSO3cXadUntM190BersLW9Fts+qunDPabhIWZA==",
"version": "0.8.4-bc8ef5e",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-bc8ef5e.tgz",
"integrity": "sha512-WpybmXgLxUmqrqTeCsWmVFRSEq/3up34kFfByEssXARom5XcvB9uAHzXHhPXmob3m9BGxBXAALD04UBOUq0J4g==",
"license": "AGPL",
"dependencies": {
"lodash": "^4.17.21",

View File

@ -25,7 +25,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.4-6ed9027",
"civkit": "^0.8.4-bc8ef5e",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",

View File

@ -42,6 +42,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { RobotsTxtService } from '../services/robots-text';
import { lookup } from 'dns/promises';
import { isIP } from 'net';
export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted';
@ -465,7 +467,7 @@ export class CrawlerHost extends RPCHost {
const targetUrlFromGet = originPath.slice(1);
if (crawlerOptions.pdf) {
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
url = `file://pdf.${md5Hasher.hash(pdfBuf)}`;
url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`;
} else if (targetUrlFromGet) {
url = targetUrlFromGet.trim();
} else if (crawlerOptions.url) {
@ -495,13 +497,26 @@ export class CrawlerHost extends RPCHost {
});
}
if (!['http:', 'https:', 'file:'].includes(result.protocol)) {
if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
throw new ParamValidationError({
message: `Invalid protocol ${result.protocol}`,
path: 'url'
});
}
if (!isIP(result.hostname)) {
await lookup(result.hostname).catch((err) => {
if (err.code === 'ENOTFOUND') {
return Promise.reject(new ParamValidationError({
message: `Domain '${result.hostname}' could not be resolved`,
path: 'url'
}));
}
return;
});
}
return result;
}

View File

@ -605,15 +605,15 @@ export class PuppeteerControl extends AsyncService {
}
const parsedUrl = new URL(requestUrl);
try {
if (isIP(parsedUrl.hostname)) {
domainSet.add(parsedUrl.hostname);
} else {
if (isIP(parsedUrl.hostname)) {
domainSet.add(parsedUrl.hostname);
} else {
try {
const tldParsed = tldExtract(requestUrl);
domainSet.add(tldParsed.domain);
} catch (_err) {
domainSet.add(parsedUrl.hostname);
}
} catch (err) {
return req.abort('blockedbyclient', 1000);
}
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {