mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-14 22:05:57 +08:00
fix: url check
This commit is contained in:
parent
5bbd75a6d6
commit
dceb361a7a
@ -46,6 +46,8 @@ import { RobotsTxtService } from '../services/robots-text';
|
|||||||
import { lookup } from 'dns/promises';
|
import { lookup } from 'dns/promises';
|
||||||
import { isIP } from 'net';
|
import { isIP } from 'net';
|
||||||
|
|
||||||
|
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
withIframe?: boolean | 'quoted';
|
withIframe?: boolean | 'quoted';
|
||||||
withShadowDom?: boolean;
|
withShadowDom?: boolean;
|
||||||
@ -474,8 +476,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
const targetUrlFromGet = originPath.slice(1);
|
const targetUrlFromGet = originPath.slice(1);
|
||||||
if (crawlerOptions.pdf) {
|
if (crawlerOptions.pdf) {
|
||||||
const pdfBuf = crawlerOptions.pdf instanceof Blob ? await crawlerOptions.pdf.arrayBuffer().then((x) => Buffer.from(x)) : Buffer.from(crawlerOptions.pdf, 'base64');
|
url = `blob://pdf/${randomUUID()}`;
|
||||||
url = `blob://pdf/${md5Hasher.hash(pdfBuf)}`;
|
|
||||||
} else if (targetUrlFromGet) {
|
} else if (targetUrlFromGet) {
|
||||||
url = targetUrlFromGet.trim();
|
url = targetUrlFromGet.trim();
|
||||||
} else if (crawlerOptions.url) {
|
} else if (crawlerOptions.url) {
|
||||||
@ -485,7 +486,6 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let result: URL;
|
let result: URL;
|
||||||
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
|
||||||
try {
|
try {
|
||||||
result = new URL(
|
result = new URL(
|
||||||
normalizeUrl(
|
normalizeUrl(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user