fix: filter out invalid URLs from crawl links

2025-08-12 01:59:04 +08:00 · 2024-08-21 20:49:25 +02:00 · 2024-08-21 20:49:25 +02:00 · 55009e51f5
commit 55009e51f5
parent dae1408e66
1 changed file with 11 additions and 2 deletions
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -267,9 +267,18 @@ export class WebCrawler {
  public filterURL(href: string, url: string): string | null {
    let fullUrl = href;
    if (!href.startsWith("http")) {
-      fullUrl = new URL(href, this.baseUrl).toString();
+      try {
+        fullUrl = new URL(href, this.baseUrl).toString();
+      } catch (_) {
+        return null;
+      }
+    }
+    let urlObj;
+    try {
+      urlObj = new URL(fullUrl);
+    } catch (_) {
+      return null;
    }
-    const urlObj = new URL(fullUrl);
    const path = urlObj.pathname;

    if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS