From aa2c369060300c44a506b2af66bd9324701b8ffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Thu, 23 Jan 2025 07:19:00 +0100 Subject: [PATCH] feat(sitemap): propagate crawlid --- apps/api/src/scraper/WebScraper/crawler.ts | 4 ++++ apps/api/src/scraper/WebScraper/sitemap.ts | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index f883e4c5..ceaba6ef 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -555,6 +555,7 @@ export class WebCrawler { sitemapCount = await getLinksFromSitemap( { sitemapUrl, urlsHandler, mode: "fire-engine" }, this.logger, + this.jobId, ); } catch (error) { this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, { @@ -595,6 +596,7 @@ export class WebCrawler { mode: "fire-engine", }, this.logger, + this.jobId, ); } catch (error) { this.logger.debug( @@ -618,6 +620,7 @@ export class WebCrawler { sitemapCount += await getLinksFromSitemap( { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" }, this.logger, + this.jobId, ); } catch (error) { this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { @@ -631,6 +634,7 @@ export class WebCrawler { sitemapCount += await getLinksFromSitemap( { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" }, this.logger, + this.jobId, ); } } diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index bdd5cd65..aba0e0ba 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -18,13 +18,14 @@ export async function getLinksFromSitemap( mode?: "axios" | "fire-engine"; }, logger: Logger, + crawlId: string, ): Promise { try { let content: string = ""; try { if (mode === "fire-engine" && useFireEngine) { const fetchResponse = await scrapeURL( - "sitemap", + "sitemap;" + crawlId, sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fetch" }, @@ -79,7 +80,7 @@ export async function getLinksFromSitemap( } } else { const fetchResponse = await scrapeURL( - "sitemap", + "sitemap;" + crawlId, sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fetch" }, @@ -125,7 +126,7 @@ export async function getLinksFromSitemap( .map((sitemap) => sitemap.loc[0].trim()); const sitemapPromises: Promise[] = sitemapUrls.map((sitemapUrl) => - getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger), + getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId), ); const results = await Promise.all(sitemapPromises); @@ -147,6 +148,7 @@ export async function getLinksFromSitemap( getLinksFromSitemap( { sitemapUrl: sitemapUrl, urlsHandler, mode }, logger, + crawlId, ), ); count += (await Promise.all(sitemapPromises)).reduce(