mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 15:59:04 +08:00
fix(crawler): dumb sitemap limit
This commit is contained in:
parent
51a0e233e3
commit
a922aac805
@ -27,6 +27,7 @@ export class WebCrawler {
|
|||||||
private allowSubdomains: boolean;
|
private allowSubdomains: boolean;
|
||||||
private ignoreRobotsTxt: boolean;
|
private ignoreRobotsTxt: boolean;
|
||||||
private logger: typeof _logger;
|
private logger: typeof _logger;
|
||||||
|
private sitemapsHit: Set<string> = new Set();
|
||||||
|
|
||||||
constructor({
|
constructor({
|
||||||
jobId,
|
jobId,
|
||||||
@ -531,10 +532,22 @@ export class WebCrawler {
|
|||||||
url: string,
|
url: string,
|
||||||
urlsHandler: (urls: string[]) => unknown,
|
urlsHandler: (urls: string[]) => unknown,
|
||||||
): Promise<number> {
|
): Promise<number> {
|
||||||
|
if (this.sitemapsHit.size >= 5) {
|
||||||
|
this.logger.warn("Sitemap limit of 5 hit, not hitting this one.");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
const sitemapUrl = url.endsWith(".xml")
|
const sitemapUrl = url.endsWith(".xml")
|
||||||
? url
|
? url
|
||||||
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
||||||
|
|
||||||
|
if (this.sitemapsHit.has(sitemapUrl)) {
|
||||||
|
this.logger.warn("This sitemap has already been hit.", { sitemapUrl });
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.sitemapsHit.add(sitemapUrl);
|
||||||
|
|
||||||
let sitemapCount: number = 0;
|
let sitemapCount: number = 0;
|
||||||
|
|
||||||
// Try to get sitemap from the provided URL first
|
// Try to get sitemap from the provided URL first
|
||||||
|
Loading…
x
Reference in New Issue
Block a user