mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 16:59:05 +08:00
feat(sitemap): propagate crawlid
This commit is contained in:
parent
a922aac805
commit
aa2c369060
@@ -555,6 +555,7 @@ export class WebCrawler {
|
|||||||
sitemapCount = await getLinksFromSitemap(
|
sitemapCount = await getLinksFromSitemap(
|
||||||
{ sitemapUrl, urlsHandler, mode: "fire-engine" },
|
{ sitemapUrl, urlsHandler, mode: "fire-engine" },
|
||||||
this.logger,
|
this.logger,
|
||||||
|
this.jobId,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
|
this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
|
||||||
@@ -595,6 +596,7 @@ export class WebCrawler {
|
|||||||
mode: "fire-engine",
|
mode: "fire-engine",
|
||||||
},
|
},
|
||||||
this.logger,
|
this.logger,
|
||||||
|
this.jobId,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(
|
this.logger.debug(
|
||||||
@@ -618,6 +620,7 @@ export class WebCrawler {
|
|||||||
sitemapCount += await getLinksFromSitemap(
|
sitemapCount += await getLinksFromSitemap(
|
||||||
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
|
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
|
||||||
this.logger,
|
this.logger,
|
||||||
|
this.jobId,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
|
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
|
||||||
@@ -631,6 +634,7 @@ export class WebCrawler {
|
|||||||
sitemapCount += await getLinksFromSitemap(
|
sitemapCount += await getLinksFromSitemap(
|
||||||
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
|
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
|
||||||
this.logger,
|
this.logger,
|
||||||
|
this.jobId,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -18,13 +18,14 @@ export async function getLinksFromSitemap(
|
|||||||
mode?: "axios" | "fire-engine";
|
mode?: "axios" | "fire-engine";
|
||||||
},
|
},
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
|
crawlId: string,
|
||||||
): Promise<number> {
|
): Promise<number> {
|
||||||
try {
|
try {
|
||||||
let content: string = "";
|
let content: string = "";
|
||||||
try {
|
try {
|
||||||
if (mode === "fire-engine" && useFireEngine) {
|
if (mode === "fire-engine" && useFireEngine) {
|
||||||
const fetchResponse = await scrapeURL(
|
const fetchResponse = await scrapeURL(
|
||||||
"sitemap",
|
"sitemap;" + crawlId,
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||||
{ forceEngine: "fetch" },
|
{ forceEngine: "fetch" },
|
||||||
@@ -79,7 +80,7 @@ export async function getLinksFromSitemap(
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const fetchResponse = await scrapeURL(
|
const fetchResponse = await scrapeURL(
|
||||||
"sitemap",
|
"sitemap;" + crawlId,
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||||
{ forceEngine: "fetch" },
|
{ forceEngine: "fetch" },
|
||||||
@@ -125,7 +126,7 @@ export async function getLinksFromSitemap(
|
|||||||
.map((sitemap) => sitemap.loc[0].trim());
|
.map((sitemap) => sitemap.loc[0].trim());
|
||||||
|
|
||||||
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
||||||
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger),
|
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId),
|
||||||
);
|
);
|
||||||
|
|
||||||
const results = await Promise.all(sitemapPromises);
|
const results = await Promise.all(sitemapPromises);
|
||||||
@@ -147,6 +148,7 @@ export async function getLinksFromSitemap(
|
|||||||
getLinksFromSitemap(
|
getLinksFromSitemap(
|
||||||
{ sitemapUrl: sitemapUrl, urlsHandler, mode },
|
{ sitemapUrl: sitemapUrl, urlsHandler, mode },
|
||||||
logger,
|
logger,
|
||||||
|
crawlId,
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
count += (await Promise.all(sitemapPromises)).reduce(
|
count += (await Promise.all(sitemapPromises)).reduce(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user