feat(sitemap): propagate crawlid

This commit is contained in:
Móricz Gergő 2025-01-23 07:19:00 +01:00
parent a922aac805
commit aa2c369060
2 changed files with 9 additions and 3 deletions

View File

@@ -555,6 +555,7 @@ export class WebCrawler {
         sitemapCount = await getLinksFromSitemap(
           { sitemapUrl, urlsHandler, mode: "fire-engine" },
           this.logger,
+          this.jobId,
         );
       } catch (error) {
         this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
@@ -595,6 +596,7 @@ export class WebCrawler {
             mode: "fire-engine",
           },
           this.logger,
+          this.jobId,
         );
       } catch (error) {
         this.logger.debug(
@@ -618,6 +620,7 @@ export class WebCrawler {
         sitemapCount += await getLinksFromSitemap(
           { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
           this.logger,
+          this.jobId,
         );
       } catch (error) {
         this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
@@ -631,6 +634,7 @@ export class WebCrawler {
         sitemapCount += await getLinksFromSitemap(
           { sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
           this.logger,
+          this.jobId,
         );
       }
     }

View File

@@ -18,13 +18,14 @@ export async function getLinksFromSitemap(
     mode?: "axios" | "fire-engine";
   },
   logger: Logger,
+  crawlId: string,
 ): Promise<number> {
   try {
     let content: string = "";
     try {
       if (mode === "fire-engine" && useFireEngine) {
         const fetchResponse = await scrapeURL(
-          "sitemap",
+          "sitemap;" + crawlId,
           sitemapUrl,
           scrapeOptions.parse({ formats: ["rawHtml"] }),
           { forceEngine: "fetch" },
@@ -79,7 +80,7 @@ export async function getLinksFromSitemap(
         }
       } else {
         const fetchResponse = await scrapeURL(
-          "sitemap",
+          "sitemap;" + crawlId,
           sitemapUrl,
           scrapeOptions.parse({ formats: ["rawHtml"] }),
           { forceEngine: "fetch" },
@@ -125,7 +126,7 @@ export async function getLinksFromSitemap(
       .map((sitemap) => sitemap.loc[0].trim());
     const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
-      getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger),
+      getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId),
     );
     const results = await Promise.all(sitemapPromises);
@@ -147,6 +148,7 @@ export async function getLinksFromSitemap(
         getLinksFromSitemap(
           { sitemapUrl: sitemapUrl, urlsHandler, mode },
           logger,
+          crawlId,
         ),
       );
       count += (await Promise.all(sitemapPromises)).reduce(