fix: filter out invalid URLs from crawl links

This commit is contained in:
Gergő Móricz 2024-08-21 20:49:25 +02:00
parent dae1408e66
commit 55009e51f5

View File

@@ -267,9 +267,18 @@ export class WebCrawler {
public filterURL(href: string, url: string): string | null {
let fullUrl = href;
if (!href.startsWith("http")) {
fullUrl = new URL(href, this.baseUrl).toString();
try {
fullUrl = new URL(href, this.baseUrl).toString();
} catch (_) {
return null;
}
}
let urlObj;
try {
urlObj = new URL(fullUrl);
} catch (_) {
return null;
}
const urlObj = new URL(fullUrl);
const path = urlObj.pathname;
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS