From 55009e51f5338c05a913da3c2a9b2e88c1d9f5f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Wed, 21 Aug 2024 20:49:25 +0200
Subject: [PATCH] fix: filter out invalid URLs from crawl links

---
 apps/api/src/scraper/WebScraper/crawler.ts | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index af3a9d69..02894cfc 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -267,9 +267,18 @@ export class WebCrawler {
   public filterURL(href: string, url: string): string | null {
     let fullUrl = href;
     if (!href.startsWith("http")) {
-      fullUrl = new URL(href, this.baseUrl).toString();
+      try {
+        fullUrl = new URL(href, this.baseUrl).toString();
+      } catch (_) {
+        return null;
+      }
+    }
+    let urlObj;
+    try {
+      urlObj = new URL(fullUrl);
+    } catch (_) {
+      return null;
     }
-    const urlObj = new URL(fullUrl);
     const path = urlObj.pathname;
 
     if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
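
For reference, the patch above wraps both `new URL()` constructions in try/catch so that a malformed href is dropped (by returning null) rather than throwing and interrupting link extraction. The following is a minimal standalone TypeScript sketch of that same pattern, for illustration only: `safeResolve` and the example URLs are hypothetical and not part of the repository, and the real filterURL goes on to use the parsed URL's pathname.

// Illustrative sketch (not part of the patch): mirrors the patched filterURL
// flow — resolve relative hrefs against a base URL and return null for
// anything `new URL()` cannot parse, instead of letting the error propagate.
function safeResolve(href: string, baseUrl: string): string | null {
  let fullUrl = href;
  if (!href.startsWith("http")) {
    try {
      fullUrl = new URL(href, baseUrl).toString();
    } catch (_) {
      return null; // href could not be resolved into a valid URL
    }
  }
  try {
    // Re-parse to also reject hrefs that start with "http" but are still malformed.
    return new URL(fullUrl).toString();
  } catch (_) {
    return null;
  }
}

// Example usage: malformed links are filtered out rather than crashing the crawl.
console.log(safeResolve("/docs", "https://example.com"));   // "https://example.com/docs"
console.log(safeResolve("http://", "https://example.com")); // null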