From e1c9cbf70906213cbacd3dd5e6665be74017e78f Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 20 Aug 2024 09:11:58 -0300
Subject: [PATCH] bug fixed. crawl should not stop if sitemap url is invalid

---
 apps/api/src/scraper/WebScraper/crawler.ts             | 8 +++++++-
 apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 1 -
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 79e4bf18..af3a9d69 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -69,7 +69,13 @@ export class WebCrawler {
   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
     return sitemapLinks
       .filter((link) => {
-        const url = new URL(link.trim(), this.baseUrl);
+        let url: URL;
+        try {
+          url = new URL(link.trim(), this.baseUrl);
+        } catch (error) {
+          Logger.debug(`Error processing link: ${link} | Error: ${error.message}`);
+          return false;
+        }
         const path = url.pathname;
 
         const depth = getURLDepth(url.toString());
diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
index e427f582..7c24fab4 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
@@ -73,7 +73,6 @@ export async function scrapWithFireEngine({
     );
 
     if (pageOptions?.useFastMode) {
-      console.log('using tlsclient')
       fireEngineOptionsParam.engine = "tlsclient";
       engine = "tlsclient";
     }
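
For context, a minimal standalone sketch of the defensive URL-parsing pattern the crawler.ts hunk introduces: a malformed sitemap entry is logged and skipped instead of throwing and aborting the crawl. The helper name `filterInvalidLinks` and the console-based logging are illustrative assumptions, not part of the repository.

    // Sketch of the pattern from the crawler.ts change above.
    // `filterInvalidLinks` and console.debug are illustrative, not repo code.
    function filterInvalidLinks(sitemapLinks: string[], baseUrl: string): URL[] {
      const valid: URL[] = [];
      for (const link of sitemapLinks) {
        let url: URL;
        try {
          // new URL() throws a TypeError on malformed input such as "http://"
          url = new URL(link.trim(), baseUrl);
        } catch (error) {
          console.debug(`Error processing link: ${link} | Error: ${(error as Error).message}`);
          continue; // skip the bad entry, keep the rest of the sitemap
        }
        valid.push(url);
      }
      return valid;
    }

    // Example: the malformed entry is dropped, valid absolute and relative links survive.
    const links = filterInvalidLinks(
      ["https://example.com/docs", "http://", "/pricing"],
      "https://example.com",
    );
    console.log(links.map((u) => u.toString()));
    // -> ["https://example.com/docs", "https://example.com/pricing"]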