diff --git a/package-lock.json b/package-lock.json index e01c9a3..20b6af7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,7 +17,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-6ed9027", + "civkit": "^0.8.4-bc8ef5e", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -4095,9 +4095,9 @@ } }, "node_modules/civkit": { - "version": "0.8.4-6ed9027", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-6ed9027.tgz", - "integrity": "sha512-VU8Ykik1L16Li9/QZfw5wYsmu3jJYH/zIHbM6Vd2ajRI7Mh4fSO3cXadUntM190BersLW9Fts+qunDPabhIWZA==", + "version": "0.8.4-bc8ef5e", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-bc8ef5e.tgz", + "integrity": "sha512-WpybmXgLxUmqrqTeCsWmVFRSEq/3up34kFfByEssXARom5XcvB9uAHzXHhPXmob3m9BGxBXAALD04UBOUq0J4g==", "license": "AGPL", "dependencies": { "lodash": "^4.17.21", diff --git a/package.json b/package.json index 3724cc6..f7f89b3 100644 --- a/package.json +++ b/package.json @@ -25,7 +25,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-6ed9027", + "civkit": "^0.8.4-bc8ef5e", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 594d683..77cd822 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -42,6 +42,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider'; import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; import { RobotsTxtService } from '../services/robots-text'; +import { lookup } from 'dns/promises'; +import { isIP } from 'net'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean | 'quoted'; @@ -465,7 +467,7 @@ export class CrawlerHost extends RPCHost { const targetUrlFromGet = originPath.slice(1); if (crawlerOptions.pdf) { const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64'); - url = `file://pdf.${md5Hasher.hash(pdfBuf)}`; + url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`; } else if (targetUrlFromGet) { url = targetUrlFromGet.trim(); } else if (crawlerOptions.url) { @@ -495,13 +497,26 @@ export class CrawlerHost extends RPCHost { }); } - if (!['http:', 'https:', 'file:'].includes(result.protocol)) { + if (!['http:', 'https:', 'blob:'].includes(result.protocol)) { throw new ParamValidationError({ message: `Invalid protocol ${result.protocol}`, path: 'url' }); } + if (!isIP(result.hostname)) { + await lookup(result.hostname).catch((err) => { + if (err.code === 'ENOTFOUND') { + return Promise.reject(new ParamValidationError({ + message: `Domain '${result.hostname}' could not be resolved`, + path: 'url' + })); + } + + return; + }); + } + return result; } diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 446a428..e3cc31e 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -605,15 +605,15 @@ export class PuppeteerControl extends AsyncService { } const parsedUrl = new URL(requestUrl); - try { - if (isIP(parsedUrl.hostname)) { - domainSet.add(parsedUrl.hostname); - } else { + if (isIP(parsedUrl.hostname)) { + domainSet.add(parsedUrl.hostname); + } else { + try { const tldParsed = tldExtract(requestUrl); domainSet.add(tldParsed.domain); + } catch (_err) { + domainSet.add(parsedUrl.hostname); } - } catch (err) { - return req.abort('blockedbyclient', 1000); } if (this.circuitBreakerHosts.has(parsedUrl.hostname.toLowerCase())) {