mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 08:49:03 +08:00
fix: filter out invalid URLs from crawl links
This commit is contained in:
parent
dae1408e66
commit
55009e51f5
@ -267,9 +267,18 @@ export class WebCrawler {
|
|||||||
public filterURL(href: string, url: string): string | null {
|
public filterURL(href: string, url: string): string | null {
|
||||||
let fullUrl = href;
|
let fullUrl = href;
|
||||||
if (!href.startsWith("http")) {
|
if (!href.startsWith("http")) {
|
||||||
fullUrl = new URL(href, this.baseUrl).toString();
|
try {
|
||||||
|
fullUrl = new URL(href, this.baseUrl).toString();
|
||||||
|
} catch (_) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let urlObj;
|
||||||
|
try {
|
||||||
|
urlObj = new URL(fullUrl);
|
||||||
|
} catch (_) {
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
const urlObj = new URL(fullUrl);
|
|
||||||
const path = urlObj.pathname;
|
const path = urlObj.pathname;
|
||||||
|
|
||||||
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
||||||
|
Loading…
x
Reference in New Issue
Block a user