feat: pdf upload and ip check

This commit is contained in:
yanlong.wang 2025-03-11 19:20:11 +08:00
parent 7d4102e96e
commit 45d1682db0
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
8 changed files with 306 additions and 81 deletions

8
package-lock.json generated
View File

@ -17,7 +17,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.4-ef21ac9",
"civkit": "^0.9.0-f7b0ca7",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
@ -3989,9 +3989,9 @@
}
},
"node_modules/civkit": {
"version": "0.8.4-ef21ac9",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-ef21ac9.tgz",
"integrity": "sha512-CAGzSIcXeBbYmhweTBqTqoroIpxI/dH87KhlT6MzokOiMpRcs02NJXM5V/KPbZ5hTqT9jii2xGd1CwsvTYZezg==",
"version": "0.9.0-f7b0ca7",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-f7b0ca7.tgz",
"integrity": "sha512-WjF0zRY83Ewvx4fGs1O0PQD2Oyc/RlKCVGiO/LHdwEFwfldTqDE3XWdWv+brZ2GvsIsVVKVa+bEGP0SwJfrRXA==",
"license": "AGPL",
"dependencies": {
"lodash": "^4.17.21",

View File

@ -26,7 +26,7 @@
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.4-ef21ac9",
"civkit": "^0.9.0-f7b0ca7",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",

View File

@ -13,6 +13,7 @@ import {
import { marshalErrorLike } from 'civkit/lang';
import { Defer } from 'civkit/defer';
import { retryWith } from 'civkit/decorators';
import { FancyFile } from 'civkit/fancy-file';
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
@ -43,10 +44,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { RobotsTxtService } from '../services/robots-text';
import { lookup } from 'dns/promises';
import { isIP } from 'net';
const normalizeUrl = require('@esm2cjs/normalize-url').default;
import { TempFileManager } from '../services/temp-file';
import { MiscService } from '../services/misc';
export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted';
@ -92,6 +91,8 @@ export class CrawlerHost extends RPCHost {
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncLocalContext,
protected robotsTxtService: RobotsTxtService,
protected tempFileManager: TempFileManager,
protected miscService: MiscService,
) {
super(...arguments);
@ -472,47 +473,28 @@ export class CrawlerHost extends RPCHost {
}
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
let url: string;
let url: string = '';
const targetUrlFromGet = originPath.slice(1);
if (crawlerOptions.pdf) {
url = `blob://pdf/${randomUUID()}`;
const pdfFile = crawlerOptions.pdf;
const identifier = pdfFile instanceof FancyFile ? (await pdfFile.sha256Sum) : randomUUID();
url = `blob://pdf/${identifier}`;
crawlerOptions.url ??= url;
} else if (targetUrlFromGet) {
url = targetUrlFromGet.trim();
} else if (crawlerOptions.url) {
url = crawlerOptions.url.trim();
} else {
return null;
}
let result: URL;
try {
result = new URL(
normalizeUrl(
url,
{
stripWWW: false,
removeTrailingSlash: false,
removeSingleSlash: false,
sortQueryParameters: false,
}
)
);
} catch (err) {
if (!url) {
throw new ParamValidationError({
message: `${err}`,
message: 'No URL provided',
path: 'url'
});
}
if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
throw new ParamValidationError({
message: `Invalid protocol ${result.protocol}`,
path: 'url'
});
}
const result = await this.miscService.assertNormalizedUrl(url);
if (this.puppeteerControl.circuitBreakerHosts.has(result.hostname.toLowerCase())) {
throw new SecurityCompromiseError({
message: `Circular hostname: ${result.protocol}`,
@ -520,31 +502,6 @@ export class CrawlerHost extends RPCHost {
});
}
const isIp = isIP(result.hostname);
if (
(result.hostname === 'localhost') ||
(isIp && result.hostname.startsWith('127.'))
) {
throw new SecurityCompromiseError({
message: `Suspicious action: Request to localhost: ${result}`,
path: 'url'
});
}
if (!isIp && result.protocol !== 'blob:') {
await lookup(result.hostname).catch((err) => {
if (err.code === 'ENOTFOUND') {
return Promise.reject(new ParamValidationError({
message: `Domain '${result.hostname}' could not be resolved`,
path: 'url'
}));
}
return;
});
}
return result;
}
@ -733,14 +690,14 @@ export class CrawlerHost extends RPCHost {
}
if (crawlerOpts?.pdf) {
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
const pdfFile = crawlerOpts.pdf instanceof FancyFile ? crawlerOpts.pdf : this.tempFileManager.cacheBuffer(Buffer.from(crawlerOpts.pdf, 'base64'));
const pdfLocalPath = pathToFileURL((await pdfFile.filePath));
const snapshot = {
href: urlToCrawl.toString(),
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${crawlerOpts.url}"></body></html>`,
title: '',
text: '',
pdfs: [pdfDataUrl],
pdfs: [pdfLocalPath.href],
} as PageSnapshot;
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);

View File

@ -1,4 +1,5 @@
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit/civ-rpc';
import { FancyFile } from 'civkit/fancy-file';
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
import { Context } from '../services/registry';
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
@ -277,9 +278,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop({
desc: 'Base64 encoded PDF.',
type: [File, String]
type: [FancyFile, String]
})
pdf?: File | string;
pdf?: FancyFile | string;
@Prop({
default: CONTENT_FORMAT.CONTENT,

View File

@ -109,6 +109,8 @@ export class CurlControl extends AsyncService {
curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000);
curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
curl.setOpt(Curl.option.LOW_SPEED_LIMIT, 32768);
curl.setOpt(Curl.option.LOW_SPEED_TIME, 5_000);
if (crawlOpts?.method) {
curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
}
@ -401,12 +403,12 @@ export class CurlControl extends AsyncService {
digestCurlCode(code: CurlCode, msg: string) {
switch (code) {
// 400 User errors
case CurlCode.CURLE_COULDNT_RESOLVE_HOST:
{
return new AssertionFailureError(msg);
}
case CurlCode.CURLE_COULDNT_RESOLVE_HOST: {
return new AssertionFailureError(msg);
}
// Maybe retry, but don't retry with curl again
case CurlCode.CURLE_OPERATION_TIMEDOUT:
case CurlCode.CURLE_UNSUPPORTED_PROTOCOL:
case CurlCode.CURLE_PEER_FAILED_VERIFICATION: {
return new ServiceBadApproachError(msg);
@ -417,7 +419,6 @@ export class CurlControl extends AsyncService {
case CurlCode.CURLE_SEND_ERROR:
case CurlCode.CURLE_RECV_ERROR:
case CurlCode.CURLE_GOT_NOTHING:
case CurlCode.CURLE_OPERATION_TIMEDOUT:
case CurlCode.CURLE_SSL_CONNECT_ERROR:
case CurlCode.CURLE_QUIC_CONNECT_ERROR:
case CurlCode.CURLE_COULDNT_RESOLVE_PROXY:

99
src/services/misc.ts Normal file
View File

@ -0,0 +1,99 @@
import { singleton } from 'tsyringe';
import { AsyncService } from 'civkit/async-service';
import { ParamValidationError } from 'civkit/civ-rpc';
import { SecurityCompromiseError } from '../shared/lib/errors';
import { isIP } from 'node:net';
import { isIPInNonPublicRange } from '../utils/ip';
import { GlobalLogger } from './logger';
import { lookup } from 'node:dns/promises';
import { Threaded } from './threaded';
const normalizeUrl = require('@esm2cjs/normalize-url').default;
/**
 * Service holding small request-level helpers.
 *
 * At present it exposes a single helper, {@link MiscService.assertNormalizedUrl},
 * which normalizes a user supplied URL and refuses targets that point at
 * localhost or at non-public IP addresses.
 */
@singleton()
export class MiscService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
    ) {
        super(...arguments);
    }

    /** Waits for dependencies, then announces readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Normalize `input` into a `URL` and verify that it is an acceptable target.
     *
     * Checks, in order:
     *  1. the text normalizes and parses as a URL;
     *  2. the protocol is `http:`, `https:` or `blob:`;
     *  3. the host is neither `localhost` nor a literal non-public IP;
     *  4. for non-IP, non-blob hosts: the name resolves, and none of the
     *     resolved addresses is non-public.
     *
     * @param input Raw URL text.
     * @returns The normalized URL.
     * @throws ParamValidationError when the URL cannot be parsed, uses another
     *         protocol, or its domain does not resolve (`ENOTFOUND`).
     * @throws SecurityCompromiseError when the target is localhost or non-public.
     */
    @Threaded()
    async assertNormalizedUrl(input: string) {
        let result: URL;
        try {
            const normalizedText = normalizeUrl(
                input,
                {
                    stripWWW: false,
                    removeTrailingSlash: false,
                    removeSingleSlash: false,
                    sortQueryParameters: false,
                }
            );
            result = new URL(normalizedText);
        } catch (err) {
            throw new ParamValidationError({
                message: `${err}`,
                path: 'url'
            });
        }

        const acceptedProtocols = ['http:', 'https:', 'blob:'];
        if (!acceptedProtocols.includes(result.protocol)) {
            throw new ParamValidationError({
                message: `Invalid protocol ${result.protocol}`,
                path: 'url'
            });
        }

        // URL keeps the square brackets around IPv6 literals; drop them before IP checks.
        const bareHost = result.hostname.startsWith('[') ? result.hostname.slice(1, -1) : result.hostname;
        const hostIsIp = isIP(bareHost) !== 0;

        const pointsAtLocalhost = result.hostname === 'localhost';
        if (pointsAtLocalhost || (hostIsIp && isIPInNonPublicRange(bareHost))) {
            const complaint = `Suspicious action: Request to localhost or non-public IP: ${bareHost}`;
            this.logger.warn(complaint, { href: result.href });
            throw new SecurityCompromiseError({
                message: complaint,
                path: 'url'
            });
        }

        // Literal IPs were vetted above and blob: URLs have no host to resolve.
        if (hostIsIp || result.protocol === 'blob:') {
            return result;
        }

        let resolved;
        try {
            resolved = await lookup(result.hostname, { all: true });
        } catch (err: any) {
            if (err.code === 'ENOTFOUND') {
                throw new ParamValidationError({
                    message: `Domain '${result.hostname}' could not be resolved`,
                    path: 'url'
                });
            }
            // Any other resolver failure is tolerated: the address check below is skipped.
        }

        for (const { address } of resolved ?? []) {
            if (!isIPInNonPublicRange(address)) {
                continue;
            }
            this.logger.warn(`Suspicious action: Domain resolved to non-public IP: ${result.hostname} => ${address}`, { href: result.href, ip: address });
            throw new SecurityCompromiseError({
                message: `Suspicious action: Domain resolved to non-public IP: ${address}`,
                path: 'url'
            });
        }

        return result;
    }
}

View File

@ -274,19 +274,19 @@ export class PDFExtractor extends AsyncService {
return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
}
async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
async cachedExtract(url: string, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
if (!url) {
return undefined;
}
const nameUrl = alternativeUrl || url.toString();
let nameUrl = alternativeUrl || url;
const digest = md5Hasher.hash(nameUrl);
const data = url;
if (typeof url === 'string' && this.isDataUrl(url)) {
url = `dataurl://digest:${digest}`;
if (this.isDataUrl(url)) {
nameUrl = `blob://pdf:${digest}`;
}
const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
const cache: PDFContent | undefined = nameUrl.startsWith('blob:') ? undefined :
(await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
if (cache) {
const age = Date.now() - cache?.createdAt.valueOf();
@ -324,13 +324,13 @@ export class PDFExtractor extends AsyncService {
let extracted;
try {
extracted = await this.extract(data);
extracted = await this.extract(url);
} catch (err: any) {
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
}
if (!this.asyncLocalContext.ctx.DNT) {
if (!this.asyncLocalContext.ctx.DNT && !nameUrl.startsWith('blob:')) {
const theID = randomUUID();
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });

167
src/utils/ip.ts Normal file
View File

@ -0,0 +1,167 @@
import { isIPv4, isIPv6 } from 'net';
/**
 * Convert a textual IP address into its raw bytes (most significant byte first).
 *
 * Supported forms:
 *  - IPv4 dotted quad, e.g. `192.168.0.1`
 *  - IPv6, with or without `::` compression, e.g. `fe80::1`
 *  - IPv6 with a dotted IPv4 tail, e.g. `::ffff:127.0.0.1`
 *  - any IPv6 form followed by a zone index (`%eth0`), which is ignored
 *
 * @param ip Address text.
 * @returns A 4-byte Buffer for IPv4 input, a 16-byte Buffer for IPv6 input.
 * @throws Error (`Invalid IP address`) when `ip` is neither IPv4 nor IPv6.
 */
export function parseIp(ip: string): Buffer {
    if (isIPv4(ip)) {
        return Buffer.from(ip.split('.').map(Number));
    }
    if (!isIPv6(ip)) {
        throw new Error('Invalid IP address');
    }

    // A zone index only names a local interface; it is not part of the address bits.
    const zoneAt = ip.indexOf('%');
    const addr = zoneAt === -1 ? ip : ip.slice(0, zoneAt);

    if (addr.includes('.')) {
        const cut = addr.lastIndexOf(':');
        const ipv4Bytes = parseIp(addr.slice(cut + 1));
        // The dotted tail is 32 bits wide, i.e. TWO 16-bit groups. Substituting
        // two zero groups keeps every preceding group in its correct position.
        const ipv6Bytes = parseIp(`${addr.slice(0, cut + 1)}0:0`);
        ipv4Bytes.copy(ipv6Bytes, 12);

        return ipv6Bytes;
    }

    let groups: string[];
    if (addr.includes('::')) {
        // Expand `::` into as many zero groups as needed to reach 8 groups.
        const [head = '', tail = ''] = addr.split('::');
        const left = head ? head.split(':') : [];
        const right = tail ? tail.split(':') : [];
        const filler = new Array<string>(8 - left.length - right.length).fill('0');
        groups = [...left, ...filler, ...right];
    } else {
        groups = addr.split(':');
    }

    const buf = Buffer.alloc(16);
    for (let i = 0; i < groups.length; i++) {
        buf.writeUInt16BE(parseInt(groups[i], 16), i * 2);
    }

    return buf;
}
/**
 * Parse CIDR notation (`address/prefix`) into a network address and a bit mask.
 *
 * Host bits of the returned network address are cleared, so
 * `(candidate[i] & mask[i]) === network[i]` for every byte `i` exactly when the
 * candidate lies inside the network.
 *
 * @param cidr Text such as `10.0.0.0/8` or `fc00::/7`. A bare address without
 *             `/prefix` is treated as a single-host network (`/32` or `/128`).
 * @returns `[network, mask]`, both with the byte length of the address family.
 * @throws Error when the address is invalid, or when the prefix is not a plain
 *         decimal number within the address width.
 */
export function parseCIDR(cidr: string): [Buffer, Buffer] {
    const [ip, prefixTxt] = cidr.split('/');
    const buf = parseIp(ip);
    const totalBits = buf.byteLength * 8;

    // A missing/garbled prefix used to become NaN, which silently produced an
    // all-zero mask, i.e. a network matching every address.
    let prefixBits = totalBits;
    if (prefixTxt !== undefined) {
        if (!/^\d+$/.test(prefixTxt)) {
            throw new Error(`Invalid CIDR prefix length: ${cidr}`);
        }
        prefixBits = parseInt(prefixTxt, 10);
        if (prefixBits > totalBits) {
            throw new Error(`CIDR prefix length out of range: ${cidr}`);
        }
    }

    const maskBuf = Buffer.alloc(buf.byteLength);
    for (let byteOffset = 0; byteOffset < buf.byteLength; byteOffset++) {
        // Number of prefix bits that fall into this byte: 8, 0, or a partial count.
        const bitsHere = Math.min(8, Math.max(0, prefixBits - byteOffset * 8));
        const mask = (0xff << (8 - bitsHere)) & 0xff;
        maskBuf[byteOffset] = mask;
        buf[byteOffset] = buf[byteOffset] & mask;
    }

    return [buf, maskBuf];
}
/**
 * A parsed CIDR network that can be asked whether an address belongs to it.
 */
export class CIDR {
    /** Network address bytes as returned by `parseCIDR`. */
    buff: Buffer;
    /** Per-byte bit mask as returned by `parseCIDR`. */
    mask: Buffer;
    /** The CIDR text this instance was built from. */
    text: string;

    /**
     * @param cidr CIDR text, e.g. `10.0.0.0/8`.
     */
    constructor(cidr: string) {
        this.text = cidr;
        const [network, netmask] = parseCIDR(cidr);
        this.buff = network;
        this.mask = netmask;
    }

    /** @returns The original CIDR text. */
    toString() {
        return this.text;
    }

    /** IP family of the network: `4` for a 4-byte address, otherwise `6`. */
    get family() {
        return this.buff.byteLength === 4 ? 4 : 6;
    }

    /**
     * Check whether an address lies inside this network.
     *
     * @param ip Address text, or already-parsed address bytes.
     * @returns `true` when the address is in the network; `false` otherwise,
     *          including when the address width differs from the network's.
     */
    test(ip: string | Buffer): boolean {
        const candidate = typeof ip === 'string' ? parseIp(ip) : ip;
        const width = this.buff.byteLength;
        if (candidate.byteLength !== width) {
            return false;
        }

        for (let idx = 0; idx < width; idx++) {
            const maskByte = this.mask[idx];
            // First fully unmasked byte: everything before it already matched.
            if (maskByte === 0) {
                return true;
            }
            if ((candidate[idx] & maskByte) !== this.buff[idx]) {
                return false;
            }
        }

        return true;
    }
}
// IPv4 networks that are not part of the public internet.
const nonPublicNetworks4 = [
    '0.0.0.0/8',            // "this network"; 0.0.0.0 is commonly answered by the local host
    '10.0.0.0/8',           // private use
    '100.64.0.0/10',        // carrier-grade NAT shared address space
    '127.0.0.0/8',          // loopback
    '169.254.0.0/16',       // link-local
    '172.16.0.0/12',        // private use
    '192.168.0.0/16',       // private use
    '198.18.0.0/15',        // network benchmarking
    '224.0.0.0/4',          // multicast
    '240.0.0.0/4',          // reserved
    '255.255.255.255/32',   // limited broadcast
];

// IPv6 networks that are not part of the public internet.
const nonPublicNetworks6 = [
    '::/128',               // unspecified address
    '::1/128',              // loopback
    '::127.0.0.0/104',      // IPv4 loopback embedded in the low 32 bits
    'fc00::/7',             // unique local
    'fe80::/10',            // link-local
    'ff00::/8',             // multicast
];

// Parsed once at module load; the order of entries does not affect matching.
const nonPublicCIDRs = [...nonPublicNetworks4, ...nonPublicNetworks6].map(cidr => new CIDR(cidr));
/**
 * Tell whether an address belongs to one of the known non-public networks.
 *
 * IPv6 addresses that merely wrap an IPv4 address — IPv4-mapped
 * (`::ffff:a.b.c.d`) and the deprecated IPv4-compatible form (`::a.b.c.d`) —
 * are additionally judged by the embedded IPv4 address, so that e.g.
 * `::ffff:a00:1` (10.0.0.1) is not mistaken for a public address.
 *
 * @param ip IPv4 or IPv6 address text.
 * @returns `true` when the address, or the IPv4 address embedded in it,
 *          matches a non-public network.
 * @throws Error when `ip` is not a valid IP address (raised by `parseIp`).
 */
export function isIPInNonPublicRange(ip: string) {
    const parsed = parseIp(ip);
    const candidates: Buffer[] = [parsed];

    if (parsed.byteLength === 16) {
        const highBytesZero = parsed.subarray(0, 10).every((byte) => byte === 0);
        const marker = parsed.readUInt16BE(10);
        // Bytes 10-11 are 0xffff for IPv4-mapped and 0x0000 for IPv4-compatible.
        if (highBytesZero && (marker === 0xffff || marker === 0)) {
            candidates.push(parsed.subarray(12));
        }
    }

    return candidates.some((addr) => nonPublicCIDRs.some((cidr) => cidr.test(addr)));
}