mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-20 01:09:06 +08:00
fix: guard invalid domain names
This commit is contained in:
parent
4830ff5fda
commit
4ca627c0c5
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-6ed9027",
|
"civkit": "^0.8.4-bc8ef5e",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
@ -4095,9 +4095,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/civkit": {
|
"node_modules/civkit": {
|
||||||
"version": "0.8.4-6ed9027",
|
"version": "0.8.4-bc8ef5e",
|
||||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-6ed9027.tgz",
|
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-bc8ef5e.tgz",
|
||||||
"integrity": "sha512-VU8Ykik1L16Li9/QZfw5wYsmu3jJYH/zIHbM6Vd2ajRI7Mh4fSO3cXadUntM190BersLW9Fts+qunDPabhIWZA==",
|
"integrity": "sha512-WpybmXgLxUmqrqTeCsWmVFRSEq/3up34kFfByEssXARom5XcvB9uAHzXHhPXmob3m9BGxBXAALD04UBOUq0J4g==",
|
||||||
"license": "AGPL",
|
"license": "AGPL",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"lodash": "^4.17.21",
|
"lodash": "^4.17.21",
|
||||||
|
@ -25,7 +25,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-6ed9027",
|
"civkit": "^0.8.4-bc8ef5e",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
|
@ -42,6 +42,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider';
|
|||||||
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
||||||
import { RobotsTxtService } from '../services/robots-text';
|
import { RobotsTxtService } from '../services/robots-text';
|
||||||
|
import { lookup } from 'dns/promises';
|
||||||
|
import { isIP } from 'net';
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
withIframe?: boolean | 'quoted';
|
withIframe?: boolean | 'quoted';
|
||||||
@ -465,7 +467,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
const targetUrlFromGet = originPath.slice(1);
|
const targetUrlFromGet = originPath.slice(1);
|
||||||
if (crawlerOptions.pdf) {
|
if (crawlerOptions.pdf) {
|
||||||
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
|
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
|
||||||
url = `file://pdf.${md5Hasher.hash(pdfBuf)}`;
|
url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`;
|
||||||
} else if (targetUrlFromGet) {
|
} else if (targetUrlFromGet) {
|
||||||
url = targetUrlFromGet.trim();
|
url = targetUrlFromGet.trim();
|
||||||
} else if (crawlerOptions.url) {
|
} else if (crawlerOptions.url) {
|
||||||
@ -495,13 +497,26 @@ export class CrawlerHost extends RPCHost {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!['http:', 'https:', 'file:'].includes(result.protocol)) {
|
if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
|
||||||
throw new ParamValidationError({
|
throw new ParamValidationError({
|
||||||
message: `Invalid protocol ${result.protocol}`,
|
message: `Invalid protocol ${result.protocol}`,
|
||||||
path: 'url'
|
path: 'url'
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!isIP(result.hostname)) {
|
||||||
|
await lookup(result.hostname).catch((err) => {
|
||||||
|
if (err.code === 'ENOTFOUND') {
|
||||||
|
return Promise.reject(new ParamValidationError({
|
||||||
|
message: `Domain '${result.hostname}' could not be resolved`,
|
||||||
|
path: 'url'
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -605,15 +605,15 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const parsedUrl = new URL(requestUrl);
|
const parsedUrl = new URL(requestUrl);
|
||||||
try {
|
if (isIP(parsedUrl.hostname)) {
|
||||||
if (isIP(parsedUrl.hostname)) {
|
domainSet.add(parsedUrl.hostname);
|
||||||
domainSet.add(parsedUrl.hostname);
|
} else {
|
||||||
} else {
|
try {
|
||||||
const tldParsed = tldExtract(requestUrl);
|
const tldParsed = tldExtract(requestUrl);
|
||||||
domainSet.add(tldParsed.domain);
|
domainSet.add(tldParsed.domain);
|
||||||
|
} catch (_err) {
|
||||||
|
domainSet.add(parsedUrl.hostname);
|
||||||
}
|
}
|
||||||
} catch (err) {
|
|
||||||
return req.abort('blockedbyclient', 1000);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
|
if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user