From 55009e51f5338c05a913da3c2a9b2e88c1d9f5f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Wed, 21 Aug 2024 20:49:25 +0200
Subject: [PATCH] fix: filter out invalid URLs from crawl links

---
 apps/api/src/scraper/WebScraper/crawler.ts | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index af3a9d69..02894cfc 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -267,9 +267,18 @@ export class WebCrawler {
   public filterURL(href: string, url: string): string | null {
     let fullUrl = href;
     if (!href.startsWith("http")) {
-      fullUrl = new URL(href, this.baseUrl).toString();
+      try {
+        fullUrl = new URL(href, this.baseUrl).toString();
+      } catch (_) {
+        return null;
+      }
+    }
+    let urlObj;
+    try {
+      urlObj = new URL(fullUrl);
+    } catch (_) {
+      return null;
     }
-    const urlObj = new URL(fullUrl);
     const path = urlObj.pathname;
 
     if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
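
For reference, the patch above wraps both `new URL()` constructions in try/catch so that a malformed href is dropped (by returning null) rather than throwing and interrupting link extraction. The following is a minimal standalone TypeScript sketch of that same pattern, for illustration only: `safeResolve` and the example URLs are hypothetical and not part of the repository, and the real filterURL goes on to use the parsed URL's pathname.

// Illustrative sketch (not part of the patch): mirrors the patched filterURL
// flow — resolve relative hrefs against a base URL and return null for
// anything `new URL()` cannot parse, instead of letting the error propagate.
function safeResolve(href: string, baseUrl: string): string | null {
  let fullUrl = href;
  if (!href.startsWith("http")) {
    try {
      fullUrl = new URL(href, baseUrl).toString();
    } catch (_) {
      return null; // href could not be resolved into a valid URL
    }
  }
  try {
    // Re-parse to also reject hrefs that start with "http" but are still malformed.
    return new URL(fullUrl).toString();
  } catch (_) {
    return null;
  }
}

// Example usage: malformed links are filtered out rather than crashing the crawl.
console.log(safeResolve("/docs", "https://example.com"));   // "https://example.com/docs"
console.log(safeResolve("http://", "https://example.com")); // null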