fix: url check

This commit is contained in:
yanlong.wang 2025-03-10 18:23:30 +08:00
parent 5bbd75a6d6
commit dceb361a7a
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37

View File

@@ -46,6 +46,8 @@ import { RobotsTxtService } from '../services/robots-text';
import { lookup } from 'dns/promises'; import { lookup } from 'dns/promises';
import { isIP } from 'net'; import { isIP } from 'net';
const normalizeUrl = require('@esm2cjs/normalize-url').default;
export interface ExtraScrappingOptions extends ScrappingOptions { export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted'; withIframe?: boolean | 'quoted';
withShadowDom?: boolean; withShadowDom?: boolean;
@@ -474,8 +476,7 @@ export class CrawlerHost extends RPCHost {
const targetUrlFromGet = originPath.slice(1); const targetUrlFromGet = originPath.slice(1);
if (crawlerOptions.pdf) { if (crawlerOptions.pdf) {
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64'); url = `blob://pdf/${randomUUID()}`;
url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`;
} else if (targetUrlFromGet) { } else if (targetUrlFromGet) {
url = targetUrlFromGet.trim(); url = targetUrlFromGet.trim();
} else if (crawlerOptions.url) { } else if (crawlerOptions.url) {
@@ -485,7 +486,6 @@ export class CrawlerHost extends RPCHost {
} }
let result: URL; let result: URL;
const normalizeUrl = require('@esm2cjs/normalize-url').default;
try { try {
result = new URL( result = new URL(
normalizeUrl( normalizeUrl(