fix(crawler): dumb sitemap limit

Móricz Gergő 2025-01-23 07:10:07 +01:00
parent 51a0e233e3
commit a922aac805


@@ -27,6 +27,7 @@ export class WebCrawler {
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
   private logger: typeof _logger;
+  private sitemapsHit: Set<string> = new Set();
 
   constructor({
     jobId,
@@ -531,10 +532,22 @@
     url: string,
     urlsHandler: (urls: string[]) => unknown,
   ): Promise<number> {
+    if (this.sitemapsHit.size >= 5) {
+      this.logger.warn("Sitemap limit of 5 hit, not hitting this one.");
+      return 0;
+    }
+
     const sitemapUrl = url.endsWith(".xml")
       ? url
       : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
 
+    if (this.sitemapsHit.has(sitemapUrl)) {
+      this.logger.warn("This sitemap has already been hit.", { sitemapUrl });
+      return 0;
+    }
+
+    this.sitemapsHit.add(sitemapUrl);
+
     let sitemapCount: number = 0;
 
     // Try to get sitemap from the provided URL first
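For context, here is a minimal standalone sketch of the dedupe-plus-cap pattern this diff introduces. The SitemapGuard class, MAX_SITEMAPS constant, and shouldFetch method are illustrative names invented for this example, not part of the actual codebase; in the commit itself the Set and both checks live directly inside the WebCrawler method shown above.

// Sketch only: names below are hypothetical, the real logic sits
// inline in WebCrawler rather than in a separate class.
const MAX_SITEMAPS = 5;

class SitemapGuard {
  private sitemapsHit: Set<string> = new Set();

  // Mirror the diff's normalization: URLs already ending in ".xml"
  // are used as-is; otherwise "/sitemap.xml" is appended.
  private toSitemapUrl(url: string): string {
    return url.endsWith(".xml")
      ? url
      : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
  }

  // Returns true exactly when a fetch is allowed: the global cap has
  // not been reached and this normalized sitemap URL is new. On
  // success the URL is recorded so a retry counts as a duplicate.
  shouldFetch(url: string): boolean {
    if (this.sitemapsHit.size >= MAX_SITEMAPS) return false;
    const sitemapUrl = this.toSitemapUrl(url);
    if (this.sitemapsHit.has(sitemapUrl)) return false;
    this.sitemapsHit.add(sitemapUrl);
    return true;
  }
}

const guard = new SitemapGuard();
console.log(guard.shouldFetch("https://example.com"));  // true: first hit
console.log(guard.shouldFetch("https://example.com/")); // false: normalizes to the same sitemap URL

Because normalization happens before the Set lookup, variant spellings of the same root resolve to one key and are caught as duplicates, while the size check caps sitemap fetches at five per crawl regardless of how many distinct hosts are encountered.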