mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-14 11:45:56 +08:00
fix: url check
This commit is contained in:
parent
5bbd75a6d6
commit
dceb361a7a
@ -46,6 +46,8 @@ import { RobotsTxtService } from '../services/robots-text';
|
||||
import { lookup } from 'dns/promises';
|
||||
import { isIP } from 'net';
|
||||
|
||||
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
||||
|
||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
withIframe?: boolean | 'quoted';
|
||||
withShadowDom?: boolean;
|
||||
@ -474,8 +476,7 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
const targetUrlFromGet = originPath.slice(1);
|
||||
if (crawlerOptions.pdf) {
|
||||
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
|
||||
url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`;
|
||||
url = `blob://pdf/${randomUUID()}`;
|
||||
} else if (targetUrlFromGet) {
|
||||
url = targetUrlFromGet.trim();
|
||||
} else if (crawlerOptions.url) {
|
||||
@ -485,7 +486,6 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
let result: URL;
|
||||
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
||||
try {
|
||||
result = new URL(
|
||||
normalizeUrl(
|
||||
|
Loading…
x
Reference in New Issue
Block a user