mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-19 00:05:59 +08:00
feat: pdf upload and ip check
This commit is contained in:
parent
7d4102e96e
commit
45d1682db0
8
package-lock.json
generated
8
package-lock.json
generated
@ -17,7 +17,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-ef21ac9",
|
"civkit": "^0.9.0-f7b0ca7",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
@ -3989,9 +3989,9 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/civkit": {
|
"node_modules/civkit": {
|
||||||
"version": "0.8.4-ef21ac9",
|
"version": "0.9.0-f7b0ca7",
|
||||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-ef21ac9.tgz",
|
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.9.0-f7b0ca7.tgz",
|
||||||
"integrity": "sha512-CAGzSIcXeBbYmhweTBqTqoroIpxI/dH87KhlT6MzokOiMpRcs02NJXM5V/KPbZ5hTqT9jii2xGd1CwsvTYZezg==",
|
"integrity": "sha512-WjF0zRY83Ewvx4fGs1O0PQD2Oyc/RlKCVGiO/LHdwEFwfldTqDE3XWdWv+brZ2GvsIsVVKVa+bEGP0SwJfrRXA==",
|
||||||
"license": "AGPL",
|
"license": "AGPL",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"lodash": "^4.17.21",
|
"lodash": "^4.17.21",
|
||||||
|
@ -26,7 +26,7 @@
|
|||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"busboy": "^1.6.0",
|
"busboy": "^1.6.0",
|
||||||
"civkit": "^0.8.4-ef21ac9",
|
"civkit": "^0.9.0-f7b0ca7",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
|
@ -13,6 +13,7 @@ import {
|
|||||||
import { marshalErrorLike } from 'civkit/lang';
|
import { marshalErrorLike } from 'civkit/lang';
|
||||||
import { Defer } from 'civkit/defer';
|
import { Defer } from 'civkit/defer';
|
||||||
import { retryWith } from 'civkit/decorators';
|
import { retryWith } from 'civkit/decorators';
|
||||||
|
import { FancyFile } from 'civkit/fancy-file';
|
||||||
|
|
||||||
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
|
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
|
||||||
|
|
||||||
@ -43,10 +44,8 @@ import { ProxyProvider } from '../shared/services/proxy-provider';
|
|||||||
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
||||||
import { RobotsTxtService } from '../services/robots-text';
|
import { RobotsTxtService } from '../services/robots-text';
|
||||||
import { lookup } from 'dns/promises';
|
import { TempFileManager } from '../services/temp-file';
|
||||||
import { isIP } from 'net';
|
import { MiscService } from '../services/misc';
|
||||||
|
|
||||||
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
withIframe?: boolean | 'quoted';
|
withIframe?: boolean | 'quoted';
|
||||||
@ -92,6 +91,8 @@ export class CrawlerHost extends RPCHost {
|
|||||||
protected rateLimitControl: RateLimitControl,
|
protected rateLimitControl: RateLimitControl,
|
||||||
protected threadLocal: AsyncLocalContext,
|
protected threadLocal: AsyncLocalContext,
|
||||||
protected robotsTxtService: RobotsTxtService,
|
protected robotsTxtService: RobotsTxtService,
|
||||||
|
protected tempFileManager: TempFileManager,
|
||||||
|
protected miscService: MiscService,
|
||||||
) {
|
) {
|
||||||
super(...arguments);
|
super(...arguments);
|
||||||
|
|
||||||
@ -472,47 +473,28 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
||||||
let url: string;
|
let url: string = '';
|
||||||
|
|
||||||
const targetUrlFromGet = originPath.slice(1);
|
const targetUrlFromGet = originPath.slice(1);
|
||||||
if (crawlerOptions.pdf) {
|
if (crawlerOptions.pdf) {
|
||||||
url = `blob://pdf/${randomUUID()}`;
|
const pdfFile = crawlerOptions.pdf;
|
||||||
|
const identifier = pdfFile instanceof FancyFile ? (await pdfFile.sha256Sum) : randomUUID();
|
||||||
|
url = `blob://pdf/${identifier}`;
|
||||||
|
crawlerOptions.url ??= url;
|
||||||
} else if (targetUrlFromGet) {
|
} else if (targetUrlFromGet) {
|
||||||
url = targetUrlFromGet.trim();
|
url = targetUrlFromGet.trim();
|
||||||
} else if (crawlerOptions.url) {
|
} else if (crawlerOptions.url) {
|
||||||
url = crawlerOptions.url.trim();
|
url = crawlerOptions.url.trim();
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let result: URL;
|
if (!url) {
|
||||||
try {
|
|
||||||
result = new URL(
|
|
||||||
normalizeUrl(
|
|
||||||
url,
|
|
||||||
{
|
|
||||||
stripWWW: false,
|
|
||||||
removeTrailingSlash: false,
|
|
||||||
removeSingleSlash: false,
|
|
||||||
sortQueryParameters: false,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
);
|
|
||||||
} catch (err) {
|
|
||||||
throw new ParamValidationError({
|
throw new ParamValidationError({
|
||||||
message: `${err}`,
|
message: 'No URL provided',
|
||||||
path: 'url'
|
path: 'url'
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!['http:', 'https:', 'blob:'].includes(result.protocol)) {
|
const result = await this.miscService.assertNormalizedUrl(url);
|
||||||
throw new ParamValidationError({
|
|
||||||
message: `Invalid protocol ${result.protocol}`,
|
|
||||||
path: 'url'
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
if (this.puppeteerControl.circuitBreakerHosts.has(result.hostname.toLowerCase())) {
|
if (this.puppeteerControl.circuitBreakerHosts.has(result.hostname.toLowerCase())) {
|
||||||
throw new SecurityCompromiseError({
|
throw new SecurityCompromiseError({
|
||||||
message: `Circular hostname: ${result.protocol}`,
|
message: `Circular hostname: ${result.protocol}`,
|
||||||
@ -520,31 +502,6 @@ export class CrawlerHost extends RPCHost {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const isIp = isIP(result.hostname);
|
|
||||||
|
|
||||||
if (
|
|
||||||
(result.hostname === 'localhost') ||
|
|
||||||
(isIp && result.hostname.startsWith('127.'))
|
|
||||||
) {
|
|
||||||
throw new SecurityCompromiseError({
|
|
||||||
message: `Suspicious action: Request to localhost: ${result}`,
|
|
||||||
path: 'url'
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!isIp && result.protocol !== 'blob:') {
|
|
||||||
await lookup(result.hostname).catch((err) => {
|
|
||||||
if (err.code === 'ENOTFOUND') {
|
|
||||||
return Promise.reject(new ParamValidationError({
|
|
||||||
message: `Domain '${result.hostname}' could not be resolved`,
|
|
||||||
path: 'url'
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -733,14 +690,14 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (crawlerOpts?.pdf) {
|
if (crawlerOpts?.pdf) {
|
||||||
const pdfBuf = crawlerOpts.pdf instanceof Blob ? await crawlerOpts.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOpts.pdf, 'base64');
|
const pdfFile = crawlerOpts.pdf instanceof FancyFile ? crawlerOpts.pdf : this.tempFileManager.cacheBuffer(Buffer.from(crawlerOpts.pdf, 'base64'));
|
||||||
const pdfDataUrl = `data:application/pdf;base64,${pdfBuf.toString('base64')}`;
|
const pdfLocalPath = pathToFileURL((await pdfFile.filePath));
|
||||||
const snapshot = {
|
const snapshot = {
|
||||||
href: urlToCrawl.toString(),
|
href: urlToCrawl.toString(),
|
||||||
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${pdfDataUrl}"></body></html>`,
|
html: `<!DOCTYPE html><html><head></head><body style="height: 100%; width: 100%; overflow: hidden; margin:0px; background-color: rgb(82, 86, 89);"><embed style="position:absolute; left: 0; top: 0;" width="100%" height="100%" src="${crawlerOpts.url}"></body></html>`,
|
||||||
title: '',
|
title: '',
|
||||||
text: '',
|
text: '',
|
||||||
pdfs: [pdfDataUrl],
|
pdfs: [pdfLocalPath.href],
|
||||||
} as PageSnapshot;
|
} as PageSnapshot;
|
||||||
|
|
||||||
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit/civ-rpc';
|
||||||
|
import { FancyFile } from 'civkit/fancy-file';
|
||||||
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||||
import { Context } from '../services/registry';
|
import { Context } from '../services/registry';
|
||||||
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
import { TurnDownTweakableOptions } from './turndown-tweakable-options';
|
||||||
@ -277,9 +278,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
desc: 'Base64 encoded PDF.',
|
desc: 'Base64 encoded PDF.',
|
||||||
type: [File, String]
|
type: [FancyFile, String]
|
||||||
})
|
})
|
||||||
pdf?: File | string;
|
pdf?: FancyFile | string;
|
||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
default: CONTENT_FORMAT.CONTENT,
|
default: CONTENT_FORMAT.CONTENT,
|
||||||
|
@ -109,6 +109,8 @@ export class CurlControl extends AsyncService {
|
|||||||
curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
|
curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
|
||||||
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000);
|
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000);
|
||||||
curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
|
curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
|
||||||
|
curl.setOpt(Curl.option.LOW_SPEED_LIMIT, 32768);
|
||||||
|
curl.setOpt(Curl.option.LOW_SPEED_TIME, 5_000);
|
||||||
if (crawlOpts?.method) {
|
if (crawlOpts?.method) {
|
||||||
curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
|
curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
|
||||||
}
|
}
|
||||||
@ -401,12 +403,12 @@ export class CurlControl extends AsyncService {
|
|||||||
digestCurlCode(code: CurlCode, msg: string) {
|
digestCurlCode(code: CurlCode, msg: string) {
|
||||||
switch (code) {
|
switch (code) {
|
||||||
// 400 User errors
|
// 400 User errors
|
||||||
case CurlCode.CURLE_COULDNT_RESOLVE_HOST:
|
case CurlCode.CURLE_COULDNT_RESOLVE_HOST: {
|
||||||
{
|
|
||||||
return new AssertionFailureError(msg);
|
return new AssertionFailureError(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Maybe retry but dont retry with curl again
|
// Maybe retry but dont retry with curl again
|
||||||
|
case CurlCode.CURLE_OPERATION_TIMEDOUT:
|
||||||
case CurlCode.CURLE_UNSUPPORTED_PROTOCOL:
|
case CurlCode.CURLE_UNSUPPORTED_PROTOCOL:
|
||||||
case CurlCode.CURLE_PEER_FAILED_VERIFICATION: {
|
case CurlCode.CURLE_PEER_FAILED_VERIFICATION: {
|
||||||
return new ServiceBadApproachError(msg);
|
return new ServiceBadApproachError(msg);
|
||||||
@ -417,7 +419,6 @@ export class CurlControl extends AsyncService {
|
|||||||
case CurlCode.CURLE_SEND_ERROR:
|
case CurlCode.CURLE_SEND_ERROR:
|
||||||
case CurlCode.CURLE_RECV_ERROR:
|
case CurlCode.CURLE_RECV_ERROR:
|
||||||
case CurlCode.CURLE_GOT_NOTHING:
|
case CurlCode.CURLE_GOT_NOTHING:
|
||||||
case CurlCode.CURLE_OPERATION_TIMEDOUT:
|
|
||||||
case CurlCode.CURLE_SSL_CONNECT_ERROR:
|
case CurlCode.CURLE_SSL_CONNECT_ERROR:
|
||||||
case CurlCode.CURLE_QUIC_CONNECT_ERROR:
|
case CurlCode.CURLE_QUIC_CONNECT_ERROR:
|
||||||
case CurlCode.CURLE_COULDNT_RESOLVE_PROXY:
|
case CurlCode.CURLE_COULDNT_RESOLVE_PROXY:
|
||||||
|
99
src/services/misc.ts
Normal file
99
src/services/misc.ts
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
import { singleton } from 'tsyringe';
|
||||||
|
import { AsyncService } from 'civkit/async-service';
|
||||||
|
import { ParamValidationError } from 'civkit/civ-rpc';
|
||||||
|
import { SecurityCompromiseError } from '../shared/lib/errors';
|
||||||
|
import { isIP } from 'node:net';
|
||||||
|
import { isIPInNonPublicRange } from '../utils/ip';
|
||||||
|
import { GlobalLogger } from './logger';
|
||||||
|
import { lookup } from 'node:dns/promises';
|
||||||
|
import { Threaded } from './threaded';
|
||||||
|
|
||||||
|
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
||||||
|
|
||||||
|
/**
 * Miscellaneous shared helpers. At present this only hosts URL
 * normalization plus the checks that stop requests to local or
 * non-public network targets.
 */
@singleton()
export class MiscService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
    ) {
        super(...arguments);
    }

    /** Waits for dependencies, then announces readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Normalizes `input` into a URL and asserts that it is an acceptable
     * crawl target.
     *
     * Checks, in order:
     *  1. the input parses as a URL after normalization;
     *  2. the protocol is `http:`, `https:` or `blob:`;
     *  3. the host is neither `localhost` nor a literal IP in a non-public range;
     *  4. for non-IP, non-blob hosts: every address the name resolves to is public.
     *
     * NOTE(review): `@Threaded()` presumably off-loads this call to a worker;
     * confirm against `./threaded`.
     *
     * @param input Raw URL text supplied by the caller.
     * @returns The normalized URL object.
     * @throws ParamValidationError on unparsable input, unsupported protocol,
     *         or a hostname that does not resolve (ENOTFOUND).
     * @throws SecurityCompromiseError when the target is local / non-public.
     */
    @Threaded()
    async assertNormalizedUrl(input: string) {
        let parsedUrl: URL;
        try {
            const normalized = normalizeUrl(input, {
                stripWWW: false,
                removeTrailingSlash: false,
                removeSingleSlash: false,
                sortQueryParameters: false,
            });
            parsedUrl = new URL(normalized);
        } catch (err) {
            throw new ParamValidationError({
                message: `${err}`,
                path: 'url'
            });
        }

        const acceptedProtocols = ['http:', 'https:', 'blob:'];
        if (!acceptedProtocols.includes(parsedUrl.protocol)) {
            throw new ParamValidationError({
                message: `Invalid protocol ${parsedUrl.protocol}`,
                path: 'url'
            });
        }

        // URL keeps IPv6 literals bracketed ("[::1]"); drop the brackets before IP checks.
        const rawHost = parsedUrl.hostname;
        const bareHost = rawHost.startsWith('[') ? rawHost.slice(1, -1) : rawHost;
        const ipFamily = isIP(bareHost);

        if (rawHost === 'localhost' || (ipFamily && isIPInNonPublicRange(bareHost))) {
            const reason = `Suspicious action: Request to localhost or non-public IP: ${bareHost}`;
            this.logger.warn(reason, { href: parsedUrl.href });
            throw new SecurityCompromiseError({
                message: reason,
                path: 'url'
            });
        }

        // Literal IPs were vetted above and blob: URLs have no network host to resolve.
        if (ipFamily || parsedUrl.protocol === 'blob:') {
            return parsedUrl;
        }

        // Only ENOTFOUND is surfaced to the caller. Any other resolver failure
        // yields `undefined`, so the address check below is skipped (fails open).
        const addresses = await lookup(rawHost, { all: true }).catch((err) => {
            if (err.code !== 'ENOTFOUND') {
                return undefined;
            }

            throw new ParamValidationError({
                message: `Domain '${rawHost}' could not be resolved`,
                path: 'url'
            });
        });

        for (const entry of addresses ?? []) {
            if (!isIPInNonPublicRange(entry.address)) {
                continue;
            }
            this.logger.warn(`Suspicious action: Domain resolved to non-public IP: ${rawHost} => ${entry.address}`, { href: parsedUrl.href, ip: entry.address });
            throw new SecurityCompromiseError({
                message: `Suspicious action: Domain resolved to non-public IP: ${entry.address}`,
                path: 'url'
            });
        }

        return parsedUrl;
    }

}
|
@ -274,19 +274,19 @@ export class PDFExtractor extends AsyncService {
|
|||||||
return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
|
return { meta: meta.info as Record<string, any>, content: mdChunks.join(''), text: rawChunks.join('') };
|
||||||
}
|
}
|
||||||
|
|
||||||
async cachedExtract(url: string | URL, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
|
async cachedExtract(url: string, cacheTolerance: number = 1000 * 3600 * 24, alternativeUrl?: string) {
|
||||||
if (!url) {
|
if (!url) {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
const nameUrl = alternativeUrl || url.toString();
|
let nameUrl = alternativeUrl || url;
|
||||||
const digest = md5Hasher.hash(nameUrl);
|
const digest = md5Hasher.hash(nameUrl);
|
||||||
|
|
||||||
const data = url;
|
if (this.isDataUrl(url)) {
|
||||||
if (typeof url === 'string' && this.isDataUrl(url)) {
|
nameUrl = `blob://pdf:${digest}`;
|
||||||
url = `dataurl://digest:${digest}`;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const cache: PDFContent | undefined = (await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
|
const cache: PDFContent | undefined = nameUrl.startsWith('blob:') ? undefined :
|
||||||
|
(await PDFContent.fromFirestoreQuery(PDFContent.COLLECTION.where('urlDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];
|
||||||
|
|
||||||
if (cache) {
|
if (cache) {
|
||||||
const age = Date.now() - cache?.createdAt.valueOf();
|
const age = Date.now() - cache?.createdAt.valueOf();
|
||||||
@ -324,13 +324,13 @@ export class PDFExtractor extends AsyncService {
|
|||||||
let extracted;
|
let extracted;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
extracted = await this.extract(data);
|
extracted = await this.extract(url);
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
|
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err, url, nameUrl });
|
||||||
throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
|
throw new AssertionFailureError(`Unable to process ${nameUrl} as pdf: ${err?.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!this.asyncLocalContext.ctx.DNT) {
|
if (!this.asyncLocalContext.ctx.DNT && !nameUrl.startsWith('blob:')) {
|
||||||
const theID = randomUUID();
|
const theID = randomUUID();
|
||||||
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
|
await this.firebaseObjectStorage.saveFile(`pdfs/${theID}`,
|
||||||
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
|
Buffer.from(JSON.stringify(extracted), 'utf-8'), { contentType: 'application/json' });
|
||||||
|
167
src/utils/ip.ts
Normal file
167
src/utils/ip.ts
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
import { isIPv4, isIPv6 } from 'net';
|
||||||
|
|
||||||
|
/**
 * Parses a textual IP address into its raw network-order bytes.
 *
 * Supported forms:
 *  - dotted-quad IPv4 (`127.0.0.1`) -> 4-byte buffer;
 *  - IPv6 with or without `::` compression -> 16-byte buffer;
 *  - IPv6 with an embedded IPv4 tail, compressed (`::ffff:10.0.0.1`) or
 *    fully written out (`0:0:0:0:0:ffff:10.0.0.1`);
 *  - IPv6 with a zone id (`fe80::1%eth0`); the zone id is ignored.
 *
 * @param ip Textual IP address.
 * @returns A 4-byte (IPv4) or 16-byte (IPv6) buffer.
 * @throws Error('Invalid IP address') when `ip` is neither valid IPv4 nor IPv6.
 */
export function parseIp(ip: string): Buffer {
    if (isIPv4(ip)) {
        // isIPv4 guarantees exactly four decimal octets in 0..255.
        return Buffer.from(ip.split('.').map(Number));
    }

    if (isIPv6(ip)) {
        // A zone id only scopes the address to an interface; it is not part of the 128 bits.
        const zoneAt = ip.indexOf('%');
        let addr = zoneAt === -1 ? ip : ip.slice(0, zoneAt);

        // An embedded dotted-quad occupies the last 32 bits, i.e. TWO 16-bit
        // groups. Rewrite it as two hex groups so the group count stays correct
        // for both compressed and fully written-out addresses.
        const lastColon = addr.lastIndexOf(':');
        const tail = addr.slice(lastColon + 1);
        if (tail.includes('.')) {
            const [a, b, c, d] = tail.split('.').map(Number);
            const high = ((a << 8) | b).toString(16);
            const low = ((c << 8) | d).toString(16);
            addr = `${addr.slice(0, lastColon + 1)}${high}:${low}`;
        }

        // Expand the `::` shorthand into the zero groups it stands for.
        const sides = addr.split('::');
        let groups: string[];
        if (sides.length === 1) {
            groups = addr.split(':');
        } else {
            const left = sides[0] ? sides[0].split(':') : [];
            const right = sides[1] ? sides[1].split(':') : [];
            const zeros: string[] = Array(8 - left.length - right.length).fill('0');
            groups = [...left, ...zeros, ...right];
        }

        const buf = Buffer.alloc(16);
        groups.forEach((group, index) => {
            buf.writeUInt16BE(parseInt(group, 16), index * 2);
        });

        return buf;
    }

    throw new Error('Invalid IP address');
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Parses CIDR notation (`10.0.0.0/8`, `fc00::/7`) into a network address
 * and a netmask, both as raw bytes of equal length.
 *
 * Host bits of the returned network address are cleared, so
 * `(address & mask) === network` is the membership test.
 *
 * A bare IP without `/prefix` is treated as a single-host route (/32 for
 * IPv4, /128 for IPv6). Previously the missing prefix parsed as NaN and
 * produced an all-zero mask that matched every address.
 *
 * @param cidr CIDR text.
 * @returns `[network, mask]`.
 * @throws Error when the IP part is invalid, or the prefix is not a decimal
 *         integer within the address width.
 */
export function parseCIDR(cidr: string): [Buffer, Buffer] {
    const slashAt = cidr.indexOf('/');
    const ipText = slashAt === -1 ? cidr : cidr.slice(0, slashAt);

    const network = parseIp(ipText);
    const totalBits = network.byteLength * 8;

    let prefixBits = totalBits;
    if (slashAt !== -1) {
        const prefixText = cidr.slice(slashAt + 1);
        prefixBits = /^\d+$/.test(prefixText) ? parseInt(prefixText, 10) : NaN;
        // The negated comparison also rejects NaN.
        if (!(prefixBits <= totalBits)) {
            throw new Error(`Invalid CIDR prefix length: ${cidr}`);
        }
    }

    const mask = Buffer.alloc(network.byteLength);
    for (let i = 0; i < network.byteLength; i++) {
        // Number of prefix bits that fall inside this byte (0..8).
        const bitsInByte = Math.min(8, Math.max(0, prefixBits - i * 8));
        const byteMask = (0xff << (8 - bitsInByte)) & 0xff;

        mask[i] = byteMask;
        network[i] &= byteMask;
    }

    return [network, mask];
}
|
||||||
|
|
||||||
|
/**
 * A parsed CIDR network that can be tested for address membership.
 */
export class CIDR {
    /** Network address bytes with host bits cleared. */
    buff: Buffer;
    /** Netmask bytes, same length as `buff`. */
    mask: Buffer;
    /** The CIDR text this instance was built from. */
    text: string;

    /**
     * @param cidr CIDR text, e.g. `192.168.0.0/16`.
     */
    constructor(cidr: string) {
        const [network, netmask] = parseCIDR(cidr);

        this.text = cidr;
        this.buff = network;
        this.mask = netmask;
    }

    /** Returns the original CIDR text. */
    toString() {
        return this.text;
    }

    /** Address family, derived from the network byte length: 4 or 6. */
    get family() {
        if (this.buff.byteLength === 4) {
            return 4;
        }

        return 6;
    }

    /**
     * Checks whether an address belongs to this network.
     *
     * @param ip Textual IP, or already-parsed address bytes.
     * @returns `true` when the address is inside the network; `false` when
     *          it is outside or of a different byte length.
     */
    test(ip: string | Buffer): boolean {
        const candidate = typeof ip === 'string' ? parseIp(ip) : ip;
        const width = this.buff.byteLength;

        if (candidate.byteLength !== width) {
            return false;
        }

        for (let i = 0; i < width; i++) {
            const maskByte = this.mask[i];

            // A zero mask byte ends the comparison: everything so far matched.
            if (maskByte === 0) {
                return true;
            }

            if ((candidate[i] & maskByte) !== this.buff[i]) {
                return false;
            }
        }

        return true;
    }
}
|
||||||
|
|
||||||
|
// IPv4 ranges that must never be treated as public crawl targets.
const nonPublicNetworks4 = [
    // "This network"; 0.0.0.0 commonly reaches the local host.
    '0.0.0.0/8',

    // RFC 1918 private ranges.
    '10.0.0.0/8',
    '172.16.0.0/12',
    '192.168.0.0/16',

    // Loopback, limited broadcast, link-local, multicast.
    '127.0.0.0/8',
    '255.255.255.255/32',
    '169.254.0.0/16',
    '224.0.0.0/4',

    // Carrier-grade NAT shared space, reserved space.
    '100.64.0.0/10',
    '240.0.0.0/4',
];

// IPv6 ranges that must never be treated as public crawl targets.
const nonPublicNetworks6 = [
    // Unique-local, link-local, multicast.
    'fc00::/7',
    'fe80::/10',
    'ff00::/8',

    // Loopback. `::/128` below is only the unspecified address and
    // `::127.0.0.0/104` only IPv4-compatible loopback; neither covers `::1`.
    '::1/128',
    '::127.0.0.0/104',
    '::/128',
];

// Parsed once at module load; consulted on every range check.
const nonPublicCIDRs = [...nonPublicNetworks4, ...nonPublicNetworks6].map((cidr) => new CIDR(cidr));
|
||||||
|
|
||||||
|
/**
 * Tells whether an IP address falls in any known non-public range
 * (private, loopback, link-local, multicast, reserved, ...).
 *
 * IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`) are unwrapped to the
 * embedded IPv4 address first. Otherwise `::ffff:127.0.0.1` or
 * `::ffff:10.0.0.1` would be 16 bytes long, never be compared with the
 * IPv4 ranges, and be reported as public.
 *
 * @param ip Textual IPv4 or IPv6 address.
 * @returns `true` when the address is inside a non-public range.
 * @throws Error when `ip` is not a valid IP address.
 */
export function isIPInNonPublicRange(ip: string): boolean {
    let parsed = parseIp(ip);

    if (parsed.byteLength === 16) {
        // IPv4-mapped layout: 80 zero bits, 16 one bits, then the IPv4 address.
        const zeroPrefix = parsed.subarray(0, 10).every((byte) => byte === 0);
        if (zeroPrefix && parsed[10] === 0xff && parsed[11] === 0xff) {
            parsed = parsed.subarray(12);
        }
    }

    return nonPublicCIDRs.some((cidr) => cidr.test(parsed));
}
|
Loading…
x
Reference in New Issue
Block a user