Update apps/api/src/scraper/WebScraper/crawler.ts

no need for regex

Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
2025-08-06 08:06:01 +08:00 · 2024-07-24 08:33:00 -03:00 · 2024-07-24 08:33:00 -03:00 · 5e728c1a4d
commit 5e728c1a4d
parent a684bd3c5d
1 changed file with 1 addition and 9 deletions
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -64,15 +64,7 @@ export class WebCrawler {
  private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
    return sitemapLinks
      .filter((link) => {
-        
+        const url = new URL(link.trim(), this.baseUrl);
-        // if link is not a complete url, add the base url
-        link = link.trim();
-        const isCompleteUrl = new RegExp('^(?:[a-z+]+:)?//', 'i');
-        if (!isCompleteUrl.test(link)){
-          link = this.baseUrl + link;
-        }
-        const url = new URL(link);
        const path = url.pathname;
        const depth = getURLDepth(url.toString());