From 180801225b50b12662991e2a6e4f8f16c728dbdd Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 14 Oct 2024 15:44:45 -0300 Subject: [PATCH] fix/check files on crawl --- apps/api/src/scraper/WebScraper/crawler.ts | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index d5dadaf8..009a5933 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -136,6 +136,10 @@ export class WebCrawler { return false; } + if (this.isFile(link)) { + return false; + } + return true; }) .slice(0, limit); @@ -478,7 +482,14 @@ export class WebCrawler { ".webp", ".inc" ]; - return fileExtensions.some((ext) => url.toLowerCase().endsWith(ext)); + + try { + const urlWithoutQuery = url.split('?')[0].toLowerCase(); + return fileExtensions.some((ext) => urlWithoutQuery.endsWith(ext)); + } catch (error) { + Logger.error(`Error processing URL in isFile: ${error}`); + return false; + } } private isSocialMediaOrEmail(url: string): boolean {