fix(crawler): move sitemap deduplication deeper into the process

Móricz Gergő 2025-01-23 08:10:46 +01:00
parent aa2c369060
commit 72198123cb
2 changed files with 19 additions and 11 deletions
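In short: before this change, the 5-sitemap cap and the already-seen check ran once, in WebCrawler, before the initial fetch. getLinksFromSitemap recurses into sitemap indexes, so child sitemaps discovered there were fetched with neither guard. The fix threads the crawl-wide sitemapsHit set into getLinksFromSitemap and runs both checks there, on every call, recursive ones included.

A minimal TypeScript sketch of the pattern, not the project's actual code: walkSitemap and fetchChildSitemaps are hypothetical stand-ins for getLinksFromSitemap and its XML handling; the cap of 5 and the log messages mirror the diff.

type Logger = { warn: (msg: string, meta?: object) => void };

// Hypothetical stand-in for the fetch-and-parse step; the real code pulls
// <sitemap><loc> entries out of a sitemap index.
declare function fetchChildSitemaps(url: string): Promise<string[]>;

async function walkSitemap(
  sitemapUrl: string,
  logger: Logger,
  sitemapsHit: Set<string>, // one shared instance across all recursive calls
): Promise<number> {
  // Both guards now run on every call, not just the top-level one.
  if (sitemapsHit.size >= 5) {
    logger.warn("Sitemap limit of 5 hit, not hitting this one.");
    return 0;
  }
  if (sitemapsHit.has(sitemapUrl)) {
    logger.warn("This sitemap has already been hit.", { sitemapUrl });
    return 0;
  }
  sitemapsHit.add(sitemapUrl);

  const children = await fetchChildSitemaps(sitemapUrl);

  // Because the same Set flows through every depth, a repeated child or a
  // cycle between two indexes is skipped on re-entry, not fetched again.
  const counts = await Promise.all(
    children.map((child) => walkSitemap(child, logger, sitemapsHit)),
  );
  return 1 + counts.reduce((sum, n) => sum + n, 0);
}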

View File

@@ -532,20 +532,10 @@ export class WebCrawler {
     url: string,
     urlsHandler: (urls: string[]) => unknown,
   ): Promise<number> {
-    if (this.sitemapsHit.size >= 5) {
-      this.logger.warn("Sitemap limit of 5 hit, not hitting this one.");
-      return 0;
-    }
-
     const sitemapUrl = url.endsWith(".xml")
       ? url
       : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
 
-    if (this.sitemapsHit.has(sitemapUrl)) {
-      this.logger.warn("This sitemap has already been hit.", { sitemapUrl });
-      return 0;
-    }
-
     this.sitemapsHit.add(sitemapUrl);
 
     let sitemapCount: number = 0;
@@ -556,6 +546,7 @@ export class WebCrawler {
         { sitemapUrl, urlsHandler, mode: "fire-engine" },
         this.logger,
         this.jobId,
+        this.sitemapsHit,
       );
     } catch (error) {
       this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
@@ -597,6 +588,7 @@ export class WebCrawler {
         },
         this.logger,
         this.jobId,
+        this.sitemapsHit,
       );
     } catch (error) {
       this.logger.debug(
@@ -621,6 +613,7 @@ export class WebCrawler {
         { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
         this.logger,
         this.jobId,
+        this.sitemapsHit,
       );
     } catch (error) {
       this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
@@ -635,6 +628,7 @@ export class WebCrawler {
         { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
         this.logger,
         this.jobId,
+        this.sitemapsHit,
       );
     }
   }

View File

@@ -19,7 +19,20 @@ export async function getLinksFromSitemap(
   },
   logger: Logger,
   crawlId: string,
+  sitemapsHit: Set<string>,
 ): Promise<number> {
+  if (sitemapsHit.size >= 5) {
+    logger.warn("Sitemap limit of 5 hit, not hitting this one.");
+    return 0;
+  }
+
+  if (sitemapsHit.has(sitemapUrl)) {
+    logger.warn("This sitemap has already been hit.", { sitemapUrl });
+    return 0;
+  }
+
+  sitemapsHit.add(sitemapUrl);
+
   try {
     let content: string = "";
     try {
@@ -126,7 +139,7 @@ export async function getLinksFromSitemap(
       .map((sitemap) => sitemap.loc[0].trim());
 
     const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
-      getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId),
+      getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit),
     );
 
     const results = await Promise.all(sitemapPromises);
@@ -149,6 +162,7 @@ export async function getLinksFromSitemap(
           { sitemapUrl: sitemapUrl, urlsHandler, mode },
           logger,
           crawlId,
+          sitemapsHit,
         ),
       );
       count += (await Promise.all(sitemapPromises)).reduce(
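A hypothetical call of the sketch above, one Set per crawl, mirroring how WebCrawler now passes this.sitemapsHit at every call site (run inside an async context):

const logger: Logger = { warn: (msg, meta) => console.warn(msg, meta) };
const sitemapsHit = new Set<string>();
const count = await walkSitemap("https://example.com/sitemap.xml", logger, sitemapsHit);
console.log(`visited ${count} sitemap(s)`, [...sitemapsHit]);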