From 45d1682db02f0c50b680ad350855da4e38ab2122 Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Tue, 11 Mar 2025 19:20:11 +0800 Subject: [PATCH] feat: pdf upload and ip check --- package-lock.json | 8 +- package.json | 2 +- src/api/crawler.ts | 77 ++++------------- src/dto/crawler-options.ts | 7 +- src/services/curl.ts | 11 +-- src/services/misc.ts | 99 +++++++++++++++++++++ src/services/pdf-extract.ts | 16 ++-- src/utils/ip.ts | 167 ++++++++++++++++++++++++++++++++++++ 8 files changed, 306 insertions(+), 81 deletions(-) create mode 100644 src/services/misc.ts create mode 100644 src/utils/ip.ts diff --git a/package-lock.json b/package-lock.json index f12801f..f1c05bf 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,7 +17,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-ef21ac9", + "civkit": "^0.9.0-f7b0ca7", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -3989,9 +3989,9 @@ } }, "node_modules/civkit": { - "version": "0.8.4-ef21ac9", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-ef21ac9.tgz", - "integrity": "sha512-CAGzSIcXeBbYmhweTBqTqoroIpxI/dH87KhlT6MzokOiMpRcs02NJXM5V/KPbZ5hTqT9jii2xGd1CwsvTYZezg==", + "version": "0.9.0-f7b0ca7", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-f7b0ca7.tgz", + "integrity": "sha512-WjF0zRY83Ewvx4fGs1O0PQD2Oyc/RlKCVGiO/LHdwEFwfldTqDE3XWdWv+brZ2GvsIsVVKVa+bEGP0SwJfrRXA==", "license": "AGPL", "dependencies": { "lodash": "^4.17.21", diff --git a/package.json b/package.json index b163074..7f9181f 100644 --- a/package.json +++ b/package.json @@ -26,7 +26,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-ef21ac9", + "civkit": "^0.9.0-f7b0ca7", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", diff --git a/src/api/crawler.ts b/src/api/crawler.ts index d1ac249..2911b9e 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -13,6 +13,7 @@ import { import { marshalErrorLike } from 'civkit/lang'; 
import { Defer } from 'civkit/defer'; import { retryWith } from 'civkit/decorators'; +import { FancyFile } from 'civkit/fancy-file'; import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options'; @@ -43,10 +44,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider'; import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; import { RobotsTxtService } from '../services/robots-text'; -import { lookup } from 'dns/promises'; -import { isIP } from 'net'; - -const normalizeUrl = require('@esm2cjs/normalize-url').default; +import { TempFileManager } from '../services/temp-file'; +import { MiscService } from '../services/misc'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean | 'quoted'; @@ -92,6 +91,8 @@ export class CrawlerHost extends RPCHost { protected rateLimitControl: RateLimitControl, protected threadLocal: AsyncLocalContext, protected robotsTxtService: RobotsTxtService, + protected tempFileManager: TempFileManager, + protected miscService: MiscService, ) { super(...arguments); @@ -472,47 +473,28 @@ export class CrawlerHost extends RPCHost { } async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) { - let url: string; + let url: string = ''; const targetUrlFromGet = originPath.slice(1); if (crawlerOptions.pdf) { - url = `blob://pdf/${randomUUID()}`; + const pdfFile = crawlerOptions.pdf; + const identifier = pdfFile instanceof FancyFile ? 
(await pdfFile.sha256Sum) : randomUUID(); + url = `blob://pdf/${identifier}`; + crawlerOptions.url ??= url; } else if (targetUrlFromGet) { url = targetUrlFromGet.trim(); } else if (crawlerOptions.url) { url = crawlerOptions.url.trim(); - } else { - return null; } - let result: URL; - try { - result = new URL( - normalizeUrl( - url, - { - stripWWW: false, - removeTrailingSlash: false, - removeSingleSlash: false, - sortQueryParameters: false, - } - ) - ); - } catch (err) { + if (!url) { throw new ParamValidationError({ - message: `${err}`, + message: 'No URL provided', path: 'url' }); } - if (!['http:', 'https:', 'blob:'].includes(result.protocol)) { - throw new ParamValidationError({ - message: `Invalid protocol ${result.protocol}`, - path: 'url' - }); - } - - + const result = await this.miscService.assertNormalizedUrl(url); if (this.puppeteerControl.circuitBreakerHosts.has(result.hostname.toLowerCase())) { throw new SecurityCompromiseError({ message: `Circular hostname: ${result.protocol}`, @@ -520,31 +502,6 @@ export class CrawlerHost extends RPCHost { }); } - const isIp = isIP(result.hostname); - - if ( - (result.hostname === 'localhost') || - (isIp && result.hostname.startsWith('127.')) - ) { - throw new SecurityCompromiseError({ - message: `Suspicious action: Request to localhost: ${result}`, - path: 'url' - }); - } - - if (!isIp && result.protocol !== 'blob:') { - await lookup(result.hostname).catch((err) => { - if (err.code === 'ENOTFOUND') { - return Promise.reject(new ParamValidationError({ - message: `Domain '${result.hostname}' could not be resolved`, - path: 'url' - })); - } - - return; - }); - } - return result; } @@ -733,14 +690,14 @@ export class CrawlerHost extends RPCHost { } if (crawlerOpts?.pdf) { - const pdfBuf = crawlerOpts.pdf instanceof Blob ? 
await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64'); - const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`; + const pdfFile = crawlerOpts.pdf instanceof FancyFile ? crawlerOpts.pdf : this.tempFileManager.cacheBuffer(Buffer.from(crawlerOpts.pdf, 'base64')); + const pdfLocalPath = pathToFileURL((await pdfFile.filePath)); const snapshot = { href: urlToCrawl.toString(), - html: ``, + html: ``, title: '', text: '', - pdfs: [pdfDataUrl], + pdfs: [pdfLocalPath.href], } as PageSnapshot; yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts); diff --git a/src/dto/crawler-options.ts b/src/dto/crawler-options.ts index a562c0c..a89f3d2 100644 --- a/src/dto/crawler-options.ts +++ b/src/dto/crawler-options.ts @@ -1,4 +1,5 @@ -import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined +import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit/civ-rpc'; +import { FancyFile } from 'civkit/fancy-file'; import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser'; import { Context } from '../services/registry'; import { TurnDownTweakableOptions } from './turndown-tweakable-options'; @@ -277,9 +278,9 @@ export class CrawlerOptions extends AutoCastable { @Prop({ desc: 'Base64 encoded PDF.', - type: [File, String] + type: [FancyFile, String] }) - pdf?: File | string; + pdf?: FancyFile | string; @Prop({ default: CONTENT_FORMAT.CONTENT, diff --git a/src/services/curl.ts b/src/services/curl.ts index f29a981..8b6ff7b 100644 --- a/src/services/curl.ts +++ b/src/services/curl.ts @@ -109,6 +109,8 @@ export class CurlControl extends AsyncService { curl.setOpt(Curl.option.SSL_VERIFYPEER, false); curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000); curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000); + curl.setOpt(Curl.option.LOW_SPEED_LIMIT, 
import { singleton } from 'tsyringe';
import { AsyncService } from 'civkit/async-service';
import { ParamValidationError } from 'civkit/civ-rpc';
import { SecurityCompromiseError } from '../shared/lib/errors';
import { isIP } from 'node:net';
import { isIPInNonPublicRange } from '../utils/ip';
import { GlobalLogger } from './logger';
import { lookup } from 'node:dns/promises';
import { Threaded } from './threaded';

const normalizeUrl = require('@esm2cjs/normalize-url').default;

/**
 * Miscellaneous helpers shared by API hosts; currently only URL validation.
 */
@singleton()
export class MiscService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
    ) {
        super(...arguments);
    }

    /** Waits for dependencies, then announces readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Normalizes `input` into a `URL` and rejects targets that must not be crawled.
     *
     * Checks, in order:
     *  1. the input must parse after normalization;
     *  2. the protocol must be `http:`, `https:` or `blob:`;
     *  3. the host must not be `localhost` or a literal non-public IP;
     *  4. for non-`blob:` domain names, every resolved address must be public.
     *
     * NOTE(review): `@Threaded()` presumably dispatches this call to a worker thread;
     * confirm against `./threaded`.
     *
     * @param input Raw URL text supplied by the caller.
     * @returns The normalized `URL`.
     * @throws ParamValidationError when the URL is malformed, uses an unsupported
     *   protocol, or its domain cannot be resolved (`ENOTFOUND`).
     * @throws SecurityCompromiseError when the target is localhost or a non-public IP.
     */
    @Threaded()
    async assertNormalizedUrl(input: string) {
        const normalizeOptions = {
            stripWWW: false,
            removeTrailingSlash: false,
            removeSingleSlash: false,
            sortQueryParameters: false,
        };

        let parsed: URL;
        try {
            parsed = new URL(normalizeUrl(input, normalizeOptions));
        } catch (err) {
            throw new ParamValidationError({
                message: `${err}`,
                path: 'url'
            });
        }

        const protocolAllowed = ['http:', 'https:', 'blob:'].includes(parsed.protocol);
        if (!protocolAllowed) {
            throw new ParamValidationError({
                message: `Invalid protocol ${parsed.protocol}`,
                path: 'url'
            });
        }

        // URL keeps the square brackets around IPv6 literals; strip them before IP checks.
        const bareHost = parsed.hostname.startsWith('[') ? parsed.hostname.slice(1, -1) : parsed.hostname;
        const ipFamily = isIP(bareHost);
        if (
            (parsed.hostname === 'localhost') ||
            (ipFamily && isIPInNonPublicRange(bareHost))
        ) {
            this.logger.warn(`Suspicious action: Request to localhost or non-public IP: ${bareHost}`, { href: parsed.href });
            throw new SecurityCompromiseError({
                message: `Suspicious action: Request to localhost or non-public IP: ${bareHost}`,
                path: 'url'
            });
        }

        // Literal IPs were vetted above and blob: URLs are never resolved.
        if (ipFamily || parsed.protocol === 'blob:') {
            return parsed;
        }

        let addresses: { address: string }[] | undefined;
        try {
            addresses = await lookup(parsed.hostname, { all: true });
        } catch (err: any) {
            if (err.code === 'ENOTFOUND') {
                throw new ParamValidationError({
                    message: `Domain '${parsed.hostname}' could not be resolved`,
                    path: 'url'
                });
            }
            // Any other resolver failure is tolerated: the address check is skipped.
        }

        const offending = addresses?.find((entry) => isIPInNonPublicRange(entry.address));
        if (offending) {
            this.logger.warn(`Suspicious action: Domain resolved to non-public IP: ${parsed.hostname} => ${offending.address}`, { href: parsed.href, ip: offending.address });
            throw new SecurityCompromiseError({
                message: `Suspicious action: Domain resolved to non-public IP: ${offending.address}`,
                path: 'url'
            });
        }

        return parsed;
    }

}
+++ b/src/services/pdf-extract.ts @@ -274,19 +274,19 @@ export class PDFExtractor extends AsyncService { return { meta: meta.info as Record, content: mdChunks.join(''), text: rawChunks.join('') }; } - async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) { + async cachedExtract(url: string, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) { if (!url) { return undefined; } - const nameUrl = alternativeUrl || url.toString(); + let nameUrl = alternativeUrl || url; const digest = md5Hasher.hash(nameUrl); - const data = url; - if (typeof url === 'string' && this.isDataUrl(url)) { - url = `dataurl://digest:${digest}`; + if (this.isDataUrl(url)) { + nameUrl = `blob://pdf:${digest}`; } - const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0]; + const cache: PDFContent | undefined = nameUrl.startsWith('blob:') ? 
import { isIPv4, isIPv6 } from 'net';

/**
 * Parses a textual IP address into its network-order bytes.
 *
 * IPv6 input may use `::` compression, carry a trailing dotted-quad
 * (e.g. `::ffff:1.2.3.4`) and a `%zone` suffix (the zone is ignored).
 *
 * @param ip IPv4 or IPv6 address text.
 * @returns A 4-byte buffer for IPv4, a 16-byte buffer for IPv6.
 * @throws Error when `ip` is not a valid IP address.
 */
export function parseIp(ip: string): Buffer {
    if (isIPv4(ip)) {
        return Buffer.from(ip.split('.').map(Number));
    }

    if (isIPv6(ip)) {
        const zoneAt = ip.indexOf('%');
        let text = zoneAt === -1 ? ip : ip.slice(0, zoneAt);

        if (text.includes('.')) {
            // A dotted quad occupies 32 bits, i.e. TWO 16-bit groups. Rewrite it as
            // two hex groups so `::` expansion and group positions stay correct.
            const lastColon = text.lastIndexOf(':');
            const v4 = parseIp(text.slice(lastColon + 1));
            const hi = v4.readUInt16BE(0).toString(16);
            const lo = v4.readUInt16BE(2).toString(16);
            text = `${text.slice(0, lastColon + 1)}${hi}:${lo}`;
        }

        let groups: string[];
        if (text.includes('::')) {
            const [head = '', tail = ''] = text.split('::');
            const left = head ? head.split(':') : [];
            const right = tail ? tail.split(':') : [];
            const zeros = Array<string>(Math.max(0, 8 - left.length - right.length)).fill('0');
            groups = [...left, ...zeros, ...right];
        } else {
            groups = text.split(':');
        }
        if (groups.length !== 8) {
            throw new Error('Invalid IPv6 address');
        }

        const buf = Buffer.alloc(16);
        groups.forEach((group, idx) => {
            buf.writeUInt16BE(parseInt(group, 16), idx * 2);
        });

        return buf;
    }

    throw new Error('Invalid IP address');
}

/**
 * Parses CIDR notation into a `[network, mask]` pair of equal-length buffers.
 * Host bits of the network address are cleared.
 *
 * A bare address without `/prefix` is treated as a single-host network
 * (`/32` or `/128`) rather than silently matching every address.
 *
 * @param cidr e.g. `10.0.0.0/8` or `fc00::/7`.
 * @throws Error when the address part is invalid.
 * @throws RangeError when the prefix is not an integer within the address width.
 */
export function parseCIDR(cidr: string): [Buffer, Buffer] {
    const slashAt = cidr.indexOf('/');
    const network = parseIp(slashAt === -1 ? cidr : cidr.slice(0, slashAt));
    const totalBits = network.byteLength * 8;

    let prefixBits = totalBits;
    if (slashAt !== -1) {
        const prefixTxt = cidr.slice(slashAt + 1);
        prefixBits = /^\d+$/.test(prefixTxt) ? Number(prefixTxt) : NaN;
        if (!(prefixBits <= totalBits)) {
            throw new RangeError(`Invalid CIDR prefix: ${cidr}`);
        }
    }

    const mask = Buffer.alloc(network.byteLength);
    for (let i = 0; i < network.byteLength; i++) {
        // Number of prefix bits that fall inside byte i (0..8).
        const bits = Math.min(8, Math.max(0, prefixBits - i * 8));
        const byteMask = (0xff << (8 - bits)) & 0xff;
        mask.writeUInt8(byteMask, i);
        network.writeUInt8(network.readUInt8(i) & byteMask, i);
    }

    return [network, mask];
}

/**
 * A parsed CIDR network that can test addresses for membership.
 */
export class CIDR {
    /** Network address bytes with host bits cleared. */
    buff: Buffer;
    /** Netmask bytes, same length as `buff`. */
    mask: Buffer;
    /** The original CIDR text. */
    text: string;

    constructor(cidr: string) {
        this.text = cidr;
        const [network, mask] = parseCIDR(cidr);
        this.buff = network;
        this.mask = mask;
    }

    toString() {
        return this.text;
    }

    /** IP family of the network: `4` or `6`. */
    get family() {
        return this.buff.byteLength === 4 ? 4 : 6;
    }

    /**
     * Tells whether `ip` belongs to this network. Addresses of the other
     * family never match.
     *
     * @param ip Address text or already-parsed address bytes.
     * @throws Error when `ip` is a string that is not a valid IP address.
     */
    test(ip: string | Buffer): boolean {
        const candidate = typeof ip === 'string' ? parseIp(ip) : ip;

        if (candidate.byteLength !== this.buff.byteLength) {
            return false;
        }

        for (let i = 0; i < this.buff.byteLength; i++) {
            const m = this.mask.readUInt8(i);
            if (m === 0) {
                // Masks are contiguous: every remaining byte is host bits.
                break;
            }
            if ((candidate.readUInt8(i) & m) !== this.buff.readUInt8(i)) {
                return false;
            }
        }

        return true;
    }
}

const nonPublicNetworks4 = [
    '0.0.0.0/8',

    '10.0.0.0/8',
    '172.16.0.0/12',
    '192.168.0.0/16',

    '127.0.0.0/8',
    '255.255.255.255/32',
    '169.254.0.0/16',
    '224.0.0.0/4',

    '100.64.0.0/10',
    '240.0.0.0/4',
];


const nonPublicNetworks6 = [
    'fc00::/7',
    'fe80::/10',
    'ff00::/8',

    '::127.0.0.0/104',
    '::1/128',
    '::/128',
];

const nonPublicCIDRs = [...nonPublicNetworks4, ...nonPublicNetworks6].map(cidr => new CIDR(cidr));

// First 96 bits of IPv4-mapped (::ffff:a.b.c.d) and IPv4-compatible (::a.b.c.d) addresses.
const ipv4MappedPrefix = Buffer.from('00000000000000000000ffff', 'hex');
const ipv4CompatiblePrefix = Buffer.alloc(12);

/** Returns the IPv4 bytes embedded in an IPv4-mapped/-compatible IPv6 address, if any. */
function extractEmbeddedIPv4(addr: Buffer): Buffer | undefined {
    if (addr.byteLength !== 16) {
        return undefined;
    }
    const head = addr.subarray(0, 12);
    if (head.equals(ipv4MappedPrefix) || head.equals(ipv4CompatiblePrefix)) {
        return addr.subarray(12);
    }

    return undefined;
}

/**
 * Tells whether `ip` falls into a private, loopback, link-local, multicast or
 * otherwise non-public range.
 *
 * IPv6 addresses that merely wrap an IPv4 address (`::ffff:10.0.0.1`,
 * `::10.0.0.1`) are additionally checked against the IPv4 ranges, so the
 * wrapping cannot be used to sneak past the filter.
 *
 * @param ip IPv4 or IPv6 address text.
 * @throws Error when `ip` is not a valid IP address.
 */
export function isIPInNonPublicRange(ip: string) {
    const parsed = parseIp(ip);
    const embedded = extractEmbeddedIPv4(parsed);
    const candidates = embedded ? [parsed, embedded] : [parsed];

    return candidates.some((addr) => nonPublicCIDRs.some((cidr) => cidr.test(addr)));
}