mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 15:29:05 +08:00
fix: guard invalid domain names
This commit is contained in:
parent
4830ff5fda
commit
4ca627c0c5
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.4-6ed9027",
|
||||
"civkit": "^0.8.4-bc8ef5e",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
@ -4095,9 +4095,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/civkit": {
|
||||
"version": "0.8.4-6ed9027",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-6ed9027.tgz",
|
||||
"integrity": "sha512-VU8Ykik1L16Li9/QZfw5wYsmu3jJYH/zIHbM6Vd2ajRI7Mh4fSO3cXadUntM190BersLW9Fts+qunDPabhIWZA==",
|
||||
"version": "0.8.4-bc8ef5e",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-bc8ef5e.tgz",
|
||||
"integrity": "sha512-WpybmXgLxUmqrqTeCsWmVFRSEq/3up34kFfByEssXARom5XcvB9uAHzXHhPXmob3m9BGxBXAALD04UBOUq0J4g==",
|
||||
"license": "AGPL",
|
||||
"dependencies": {
|
||||
"lodash": "^4.17.21",
|
||||
|
@ -25,7 +25,7 @@
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.4-6ed9027",
|
||||
"civkit": "^0.8.4-bc8ef5e",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
|
@ -42,6 +42,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider';
|
||||
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
||||
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
||||
import { RobotsTxtService } from '../services/robots-text';
|
||||
import { lookup } from 'dns/promises';
|
||||
import { isIP } from 'net';
|
||||
|
||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
withIframe?: boolean | 'quoted';
|
||||
@ -465,7 +467,7 @@ export class CrawlerHost extends RPCHost {
|
||||
const targetUrlFromGet = originPath.slice(1);
|
||||
if (crawlerOptions.pdf) {
|
||||
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
|
||||
url = `file://pdf.${md5Hasher.hash(pdfBuf)}`;
|
||||
url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`;
|
||||
} else if (targetUrlFromGet) {
|
||||
url = targetUrlFromGet.trim();
|
||||
} else if (crawlerOptions.url) {
|
||||
@ -495,13 +497,26 @@ export class CrawlerHost extends RPCHost {
|
||||
});
|
||||
}
|
||||
|
||||
if (!['http:', 'https:', 'file:'].includes(result.protocol)) {
|
||||
if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
|
||||
throw new ParamValidationError({
|
||||
message: `Invalid protocol ${result.protocol}`,
|
||||
path: 'url'
|
||||
});
|
||||
}
|
||||
|
||||
if (!isIP(result.hostname)) {
|
||||
await lookup(result.hostname).catch((err) => {
|
||||
if (err.code === 'ENOTFOUND') {
|
||||
return Promise.reject(new ParamValidationError({
|
||||
message: `Domain '${result.hostname}' could not be resolved`,
|
||||
path: 'url'
|
||||
}));
|
||||
}
|
||||
|
||||
return;
|
||||
});
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -605,15 +605,15 @@ export class PuppeteerControl extends AsyncService {
|
||||
}
|
||||
|
||||
const parsedUrl = new URL(requestUrl);
|
||||
try {
|
||||
if (isIP(parsedUrl.hostname)) {
|
||||
domainSet.add(parsedUrl.hostname);
|
||||
} else {
|
||||
if (isIP(parsedUrl.hostname)) {
|
||||
domainSet.add(parsedUrl.hostname);
|
||||
} else {
|
||||
try {
|
||||
const tldParsed = tldExtract(requestUrl);
|
||||
domainSet.add(tldParsed.domain);
|
||||
} catch (_err) {
|
||||
domainSet.add(parsedUrl.hostname);
|
||||
}
|
||||
} catch (err) {
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
|
||||
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user