From a922aac80594693ea14a253518e33eeaccba22f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?=
Date: Thu, 23 Jan 2025 07:10:07 +0100
Subject: [PATCH] fix(crawler): dumb sitemap limit

---
 apps/api/src/scraper/WebScraper/crawler.ts | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 442a4f5e..f883e4c5 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -27,6 +27,7 @@ export class WebCrawler {
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
   private logger: typeof _logger;
+  private sitemapsHit: Set<string> = new Set();
 
   constructor({
     jobId,
@@ -531,10 +532,22 @@
     url: string,
     urlsHandler: (urls: string[]) => unknown,
   ): Promise<number> {
+    if (this.sitemapsHit.size >= 5) {
+      this.logger.warn("Sitemap limit of 5 hit, not hitting this one.");
+      return 0;
+    }
+
     const sitemapUrl = url.endsWith(".xml")
       ? url
       : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
 
+    if (this.sitemapsHit.has(sitemapUrl)) {
+      this.logger.warn("This sitemap has already been hit.", { sitemapUrl });
+      return 0;
+    }
+
+    this.sitemapsHit.add(sitemapUrl);
+
     let sitemapCount: number = 0;
 
     // Try to get sitemap from the provided URL first