From f0e95ce39927929a1d7578dfac6c932aef01dc3c Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Thu, 18 Jul 2024 21:49:37 +0200 Subject: [PATCH] fix(WebCrawler): filter out file URLs when taking URLs from sitemap --- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- apps/api/src/scraper/WebScraper/sitemap.ts | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 80705dbd..f97230ff 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -383,7 +383,7 @@ export class WebCrawler { return linkDomain === baseDomain; } - private isFile(url: string): boolean { + public isFile(url: string): boolean { const fileExtensions = [ ".png", ".jpg", diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index c9368f41..460aeca6 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -2,6 +2,7 @@ import axios from "axios"; import { axiosTimeout } from "../../lib/timeout"; import { parseStringPromise } from "xml2js"; import { scrapWithFireEngine } from "./scrapers/fireEngine"; +import { WebCrawler } from "./crawler"; export async function getLinksFromSitemap( { @@ -41,7 +42,7 @@ export async function getLinksFromSitemap( } } else if (root && root.url) { for (const url of root.url) { - if (url.loc && url.loc.length > 0) { + if (url.loc && url.loc.length > 0 && !WebCrawler.prototype.isFile(url.loc[0])) { allUrls.push(url.loc[0]); } }