diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index ceaba6ef..0cb50808 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -532,20 +532,10 @@ export class WebCrawler { url: string, urlsHandler: (urls: string[]) => unknown, ): Promise { - if (this.sitemapsHit.size >= 5) { - this.logger.warn("Sitemap limit of 5 hit, not hitting this one."); - return 0; - } - const sitemapUrl = url.endsWith(".xml") ? url : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`; - if (this.sitemapsHit.has(sitemapUrl)) { - this.logger.warn("This sitemap has already been hit.", { sitemapUrl }); - return 0; - } - this.sitemapsHit.add(sitemapUrl); let sitemapCount: number = 0; @@ -556,6 +546,7 @@ export class WebCrawler { { sitemapUrl, urlsHandler, mode: "fire-engine" }, this.logger, this.jobId, + this.sitemapsHit, ); } catch (error) { this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, { @@ -597,6 +588,7 @@ export class WebCrawler { }, this.logger, this.jobId, + this.sitemapsHit, ); } catch (error) { this.logger.debug( @@ -621,6 +613,7 @@ export class WebCrawler { { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" }, this.logger, this.jobId, + this.sitemapsHit, ); } catch (error) { this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { @@ -635,6 +628,7 @@ export class WebCrawler { { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" }, this.logger, this.jobId, + this.sitemapsHit, ); } } diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index aba0e0ba..7cb9e274 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -19,7 +19,20 @@ export async function getLinksFromSitemap( }, logger: Logger, crawlId: string, + sitemapsHit: Set, ): Promise { + if (sitemapsHit.size >= 5) { + logger.warn("Sitemap limit of 5 hit, not hitting this one."); + return 0; + } + + if (sitemapsHit.has(sitemapUrl)) { + logger.warn("This sitemap has already been hit.", { sitemapUrl }); + return 0; + } + + sitemapsHit.add(sitemapUrl); + try { let content: string = ""; try { @@ -126,7 +139,7 @@ export async function getLinksFromSitemap( .map((sitemap) => sitemap.loc[0].trim()); const sitemapPromises: Promise[] = sitemapUrls.map((sitemapUrl) => - getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId), + getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit), ); const results = await Promise.all(sitemapPromises); @@ -149,6 +162,7 @@ export async function getLinksFromSitemap( { sitemapUrl: sitemapUrl, urlsHandler, mode }, logger, crawlId, + sitemapsHit, ), ); count += (await Promise.all(sitemapPromises)).reduce(