feat(crawl): add maxDiscoveryDepth (#1329)

Gergő Móricz 2025-03-12 18:46:57 +01:00 committed by GitHub
parent d855f5a567
commit 7cf2e52fe6
6 changed files with 44 additions and 1 deletion


@@ -53,4 +53,21 @@ describe("Crawl tests", () => {
       }
     }
   }, 120000);
+
+  it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      ignoreSitemap: true,
+      maxDiscoveryDepth: 1,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/);
+      }
+    }
+  }, 120000);
 });
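
The new test asserts that with maxDiscoveryDepth: 1 no individual blog posts come back: posts are only reachable through the /blog index, i.e. at least two discovery hops from the seed. A sketch of the depth bookkeeping the assertion relies on (the example URLs and their depths are illustrative, not taken from the crawl output):

// Illustrative only: which discovery depth each kind of page sits at when the
// crawl is seeded from the homepage. With maxDiscoveryDepth: 1, depth-0 and
// depth-1 pages are scraped, but links found on depth-1 pages are discarded.
const illustrativeDepths: Record<string, number> = {
  "https://firecrawl.dev": 0, // seed job
  "https://firecrawl.dev/blog": 1, // discovered on the seed page
  "https://firecrawl.dev/blog/some-post": 2, // discovered on /blog, never enqueued here
};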


@@ -440,6 +440,7 @@ const crawlerOptions = z
     includePaths: z.string().array().default([]),
     excludePaths: z.string().array().default([]),
     maxDepth: z.number().default(10), // default?
+    maxDiscoveryDepth: z.number().optional(),
     limit: z.number().default(10000), // default?
     allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
     allowExternalLinks: z.boolean().default(false),
@@ -793,6 +794,8 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
     regexOnFullURL: x.regexOnFullURL,
+    maxDiscoveryDepth: x.maxDiscoveryDepth,
+    currentDiscoveryDepth: 0,
   };
 }
@@ -814,7 +817,8 @@ export function fromLegacyCrawlerOptions(x: any): {
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
       ignoreQueryParameters: x.ignoreQueryParameters,
       regexOnFullURL: x.regexOnFullURL,
-    }),
+      maxDiscoveryDepth: x.maxDiscoveryDepth,
+    }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
     },
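
The schema change is additive: maxDiscoveryDepth is optional with no default, so existing crawl requests are untouched, and toLegacyCrawlerOptions seeds the crawl with currentDiscoveryDepth: 0. A minimal sketch of a request that uses the new option, assuming the v1 /crawl endpoint accepts these crawler options at the top level of the JSON body:

// Sketch only: the endpoint shape and auth header are assumptions based on the
// public v1 API, not taken from this diff.
const res = await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    limit: 10,
    maxDiscoveryDepth: 1, // new: stop following links found more than one hop from the seed
  }),
});
const { id } = await res.json(); // crawl id to poll for results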


@@ -379,6 +379,7 @@ export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
   newBase?: string,
+  crawlerOptions?: any,
 ): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
@@ -399,6 +400,8 @@
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
     regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
+    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
+    currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,
   });
   if (sc.robots !== undefined) {
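
Note where each value lives: maxDiscoveryDepth is read from the stored crawl (sc.crawlerOptions), so every job in the crawl shares the same limit, while currentDiscoveryDepth comes from the individual job's crawlerOptions and falls back to 0 for the seed job. A small sketch of that resolution, using a hypothetical helper name:

// Hypothetical helper illustrating the split between crawl-wide and per-job
// state; the real code does this inline when constructing the WebCrawler.
function resolveDiscoveryDepth(
  sc: { crawlerOptions?: { maxDiscoveryDepth?: number } },
  jobCrawlerOptions?: { currentDiscoveryDepth?: number },
): { maxDiscoveryDepth?: number; currentDiscoveryDepth: number } {
  return {
    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth, // shared by the whole crawl
    currentDiscoveryDepth: jobCrawlerOptions?.currentDiscoveryDepth ?? 0, // per job; 0 for the seed
  };
}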


@@ -31,6 +31,8 @@ export class WebCrawler {
   private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
+  private maxDiscoveryDepth: number | undefined;
+  private currentDiscoveryDepth: number;
   constructor({
     jobId,
@@ -47,6 +49,8 @@
     allowSubdomains = false,
     ignoreRobotsTxt = false,
     regexOnFullURL = false,
+    maxDiscoveryDepth,
+    currentDiscoveryDepth,
   }: {
     jobId: string;
     initialUrl: string;
@@ -62,6 +66,8 @@
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
     regexOnFullURL?: boolean;
+    maxDiscoveryDepth?: number;
+    currentDiscoveryDepth?: number;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -81,6 +87,8 @@
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
     this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
+    this.maxDiscoveryDepth = maxDiscoveryDepth;
+    this.currentDiscoveryDepth = currentDiscoveryDepth ?? 0;
   }
   public filterLinks(
@@ -89,6 +97,11 @@
     maxDepth: number,
     fromMap: boolean = false,
   ): string[] {
+    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) {
+      this.logger.debug("Max discovery depth hit, filtering off all links", { currentDiscoveryDepth: this.currentDiscoveryDepth, maxDiscoveryDepth: this.maxDiscoveryDepth });
+      return [];
+    }
+
     // If the initial URL is a sitemap.xml, skip filtering
     if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
       return sitemapLinks.slice(0, limit);
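
The guard runs before any other filtering, so once a job's discovery depth has reached the limit, every link it found is dropped and nothing deeper is ever enqueued. Because the check is a strict-equality comparison against a possibly undefined maxDiscoveryDepth, crawls that never set the option are unaffected. A standalone sketch of just this cutoff (the real filterLinks also applies includes/excludes, robots.txt, maxDepth, and the limit):

// Simplified model of the cutoff added above.
function filterByDiscoveryDepth(
  links: string[],
  currentDiscoveryDepth: number,
  maxDiscoveryDepth?: number,
): string[] {
  // `number === undefined` is always false, so an unset limit disables the check.
  if (currentDiscoveryDepth === maxDiscoveryDepth) {
    return [];
  }
  return links;
}

filterByDiscoveryDepth(["https://firecrawl.dev/blog"], 0, 1); // kept: the seed job may still discover
filterByDiscoveryDepth(["https://firecrawl.dev/blog/post"], 1, 1); // []: limit reached, crawl stops here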


@@ -1044,6 +1044,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         job.data.crawl_id,
         sc,
         doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
+        job.data.crawlerOptions,
       );
       const links = crawler.filterLinks(
@@ -1088,6 +1089,10 @@
           team_id: sc.team_id,
           scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
           internalOptions: sc.internalOptions,
+          crawlerOptions: {
+            ...sc.crawlerOptions,
+            currentDiscoveryDepth: (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
+          },
           plan: job.data.plan,
           origin: job.data.origin,
           crawl_id: job.data.crawl_id,
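
On the worker side, every link that survives filtering becomes a new scrape job whose crawlerOptions copy the crawl-wide options and stamp the child with the parent's depth plus one; the seed job carries no currentDiscoveryDepth, so the ?? 0 fallback makes its children depth 1. A self-contained sketch of that step, with a hypothetical enqueueScrape standing in for the worker's real job-creation call:

// Simplified sketch of how discovered links are enqueued with an incremented
// discovery depth. enqueueScrape is a hypothetical stand-in, not the worker's API.
async function enqueueDiscoveredLinks(
  links: string[],
  crawlId: string,
  crawlWideOptions: Record<string, unknown>,
  parentJobOptions: { currentDiscoveryDepth?: number } | undefined,
  enqueueScrape: (job: {
    url: string;
    crawl_id: string;
    crawlerOptions: Record<string, unknown>;
  }) => Promise<void>,
): Promise<void> {
  for (const url of links) {
    await enqueueScrape({
      url,
      crawl_id: crawlId,
      crawlerOptions: {
        ...crawlWideOptions,
        // each child sits one discovery hop further from the seed than its parent
        currentDiscoveryDepth: (parentJobOptions?.currentDiscoveryDepth ?? 0) + 1,
      },
    });
  }
}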


@@ -160,6 +160,7 @@ export interface CrawlParams {
   includePaths?: string[];
   excludePaths?: string[];
   maxDepth?: number;
+  maxDiscoveryDepth?: number;
   limit?: number;
   allowBackwardLinks?: boolean;
   allowExternalLinks?: boolean;
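
On the JS SDK side the option is just another optional field on CrawlParams. A usage sketch mirroring the e2e test above, assuming the published @mendable/firecrawl-js client and its crawlUrl(url, params) method:

import FirecrawlApp from "@mendable/firecrawl-js";

// Assumes the API key is provided via the environment; replace as needed.
const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? "" });

const result = await app.crawlUrl("https://firecrawl.dev", {
  maxDiscoveryDepth: 1, // scrape the seed page and its direct links, nothing deeper
  limit: 10,
});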