mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-13 02:09:01 +08:00
fix(crawler): move sitemap deduplication deeper into the process
This commit is contained in:
parent
aa2c369060
commit
72198123cb
@ -532,20 +532,10 @@ export class WebCrawler {
|
|||||||
url: string,
|
url: string,
|
||||||
urlsHandler: (urls: string[]) => unknown,
|
urlsHandler: (urls: string[]) => unknown,
|
||||||
): Promise<number> {
|
): Promise<number> {
|
||||||
if (this.sitemapsHit.size >= 5) {
|
|
||||||
this.logger.warn("Sitemap limit of 5 hit, not hitting this one.");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
const sitemapUrl = url.endsWith(".xml")
|
const sitemapUrl = url.endsWith(".xml")
|
||||||
? url
|
? url
|
||||||
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
||||||
|
|
||||||
if (this.sitemapsHit.has(sitemapUrl)) {
|
|
||||||
this.logger.warn("This sitemap has already been hit.", { sitemapUrl });
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
this.sitemapsHit.add(sitemapUrl);
|
this.sitemapsHit.add(sitemapUrl);
|
||||||
|
|
||||||
let sitemapCount: number = 0;
|
let sitemapCount: number = 0;
|
||||||
@ -556,6 +546,7 @@ export class WebCrawler {
|
|||||||
{ sitemapUrl, urlsHandler, mode: "fire-engine" },
|
{ sitemapUrl, urlsHandler, mode: "fire-engine" },
|
||||||
this.logger,
|
this.logger,
|
||||||
this.jobId,
|
this.jobId,
|
||||||
|
this.sitemapsHit,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
|
this.logger.debug(`Failed to fetch sitemap from ${sitemapUrl}`, {
|
||||||
@ -597,6 +588,7 @@ export class WebCrawler {
|
|||||||
},
|
},
|
||||||
this.logger,
|
this.logger,
|
||||||
this.jobId,
|
this.jobId,
|
||||||
|
this.sitemapsHit,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(
|
this.logger.debug(
|
||||||
@ -621,6 +613,7 @@ export class WebCrawler {
|
|||||||
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
|
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
|
||||||
this.logger,
|
this.logger,
|
||||||
this.jobId,
|
this.jobId,
|
||||||
|
this.sitemapsHit,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
|
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
|
||||||
@ -635,6 +628,7 @@ export class WebCrawler {
|
|||||||
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
|
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
|
||||||
this.logger,
|
this.logger,
|
||||||
this.jobId,
|
this.jobId,
|
||||||
|
this.sitemapsHit,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -19,7 +19,20 @@ export async function getLinksFromSitemap(
|
|||||||
},
|
},
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
crawlId: string,
|
crawlId: string,
|
||||||
|
sitemapsHit: Set<string>,
|
||||||
): Promise<number> {
|
): Promise<number> {
|
||||||
|
if (sitemapsHit.size >= 5) {
|
||||||
|
logger.warn("Sitemap limit of 5 hit, not hitting this one.");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sitemapsHit.has(sitemapUrl)) {
|
||||||
|
logger.warn("This sitemap has already been hit.", { sitemapUrl });
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
sitemapsHit.add(sitemapUrl);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
let content: string = "";
|
let content: string = "";
|
||||||
try {
|
try {
|
||||||
@ -126,7 +139,7 @@ export async function getLinksFromSitemap(
|
|||||||
.map((sitemap) => sitemap.loc[0].trim());
|
.map((sitemap) => sitemap.loc[0].trim());
|
||||||
|
|
||||||
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
|
||||||
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId),
|
getLinksFromSitemap({ sitemapUrl, urlsHandler, mode }, logger, crawlId, sitemapsHit),
|
||||||
);
|
);
|
||||||
|
|
||||||
const results = await Promise.all(sitemapPromises);
|
const results = await Promise.all(sitemapPromises);
|
||||||
@ -149,6 +162,7 @@ export async function getLinksFromSitemap(
|
|||||||
{ sitemapUrl: sitemapUrl, urlsHandler, mode },
|
{ sitemapUrl: sitemapUrl, urlsHandler, mode },
|
||||||
logger,
|
logger,
|
||||||
crawlId,
|
crawlId,
|
||||||
|
sitemapsHit,
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
count += (await Promise.all(sitemapPromises)).reduce(
|
count += (await Promise.all(sitemapPromises)).reduce(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user