From a922aac80594693ea14a253518e33eeaccba22f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?=
Date: Thu, 23 Jan 2025 07:10:07 +0100
Subject: [PATCH] fix(crawler): dumb sitemap limit

---
 apps/api/src/scraper/WebScraper/crawler.ts | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 442a4f5e..f883e4c5 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -27,6 +27,7 @@ export class WebCrawler {
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
   private logger: typeof _logger;
+  private sitemapsHit: Set<string> = new Set();
 
   constructor({
     jobId,
@@ -531,10 +532,22 @@
     url: string,
     urlsHandler: (urls: string[]) => unknown,
   ): Promise<number> {
+    if (this.sitemapsHit.size >= 5) {
+      this.logger.warn("Sitemap limit of 5 hit, not hitting this one.");
+      return 0;
+    }
+
     const sitemapUrl = url.endsWith(".xml")
       ? url
       : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
 
+    if (this.sitemapsHit.has(sitemapUrl)) {
+      this.logger.warn("This sitemap has already been hit.", { sitemapUrl });
+      return 0;
+    }
+
+    this.sitemapsHit.add(sitemapUrl);
+
     let sitemapCount: number = 0;
 
     // Try to get sitemap from the provided URL first