From a684bd3c5d765e263dfec15aae113339bede4991 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 23 Jul 2024 09:07:23 -0300 Subject: [PATCH] added regex for links in sitemap --- apps/api/src/scraper/WebScraper/crawler.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 59b53642..00d51853 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -64,6 +64,14 @@ export class WebCrawler { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { return sitemapLinks .filter((link) => { + + // if link is not a complete url, add the base url + link = link.trim(); + const isCompleteUrl = new RegExp('^(?:[a-z+]+:)?//', 'i'); + if (!isCompleteUrl.test(link)){ + link = this.baseUrl + link; + } + const url = new URL(link); const path = url.pathname;