feat(crawl): add maxDiscoveryDepth (#1329)

Gergő Móricz 2025-03-12 18:46:57 +01:00 committed by GitHub
parent d855f5a567
commit 7cf2e52fe6
6 changed files with 44 additions and 1 deletion


@@ -53,4 +53,21 @@ describe("Crawl tests", () => {
       }
     }
   }, 120000);
+
+  it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      ignoreSitemap: true,
+      maxDiscoveryDepth: 1,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/);
+      }
+    }
+  }, 120000);
 });
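
The new test asserts that with maxDiscoveryDepth: 1 no individual blog posts come back: posts are only reachable through the /blog index, i.e. at least two discovery hops from the seed. A sketch of the depth bookkeeping the assertion relies on (the example URLs and their depths are illustrative, not taken from the crawl output):

// Illustrative only: which discovery depth each kind of page sits at when the
// crawl is seeded from the homepage. With maxDiscoveryDepth: 1, depth-0 and
// depth-1 pages are scraped, but links found on depth-1 pages are discarded.
const illustrativeDepths: Record<string, number> = {
  "https://firecrawl.dev": 0, // seed job
  "https://firecrawl.dev/blog": 1, // discovered on the seed page
  "https://firecrawl.dev/blog/some-post": 2, // discovered on /blog, never enqueued here
};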


@@ -440,6 +440,7 @@ const crawlerOptions = z
     includePaths: z.string().array().default([]),
     excludePaths: z.string().array().default([]),
     maxDepth: z.number().default(10), // default?
+    maxDiscoveryDepth: z.number().optional(),
     limit: z.number().default(10000), // default?
     allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
     allowExternalLinks: z.boolean().default(false),
@@ -793,6 +794,8 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
     regexOnFullURL: x.regexOnFullURL,
+    maxDiscoveryDepth: x.maxDiscoveryDepth,
+    currentDiscoveryDepth: 0,
   };
 }
@@ -814,7 +817,8 @@ export function fromLegacyCrawlerOptions(x: any): {
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
       ignoreQueryParameters: x.ignoreQueryParameters,
       regexOnFullURL: x.regexOnFullURL,
-    }),
+      maxDiscoveryDepth: x.maxDiscoveryDepth,
+    }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
     },
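
The schema change is additive: maxDiscoveryDepth is optional with no default, so existing crawl requests are untouched, and toLegacyCrawlerOptions seeds the crawl with currentDiscoveryDepth: 0. A minimal sketch of a request that uses the new option, assuming the v1 /crawl endpoint accepts these crawler options at the top level of the JSON body:

// Sketch only: the endpoint shape and auth header are assumptions based on the
// public v1 API, not taken from this diff.
const res = await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    limit: 10,
    maxDiscoveryDepth: 1, // new: stop following links found more than one hop from the seed
  }),
});
const { id } = await res.json(); // crawl id to poll for results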


@@ -379,6 +379,7 @@ export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
   newBase?: string,
+  crawlerOptions?: any,
 ): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
@@ -399,6 +400,8 @@
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
     regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
+    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
+    currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,
   });
   if (sc.robots !== undefined) {
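
Note where each value lives: maxDiscoveryDepth is read from the stored crawl (sc.crawlerOptions), so every job in the crawl shares the same limit, while currentDiscoveryDepth comes from the individual job's crawlerOptions and falls back to 0 for the seed job. A small sketch of that resolution, using a hypothetical helper name:

// Hypothetical helper illustrating the split between crawl-wide and per-job
// state; the real code does this inline when constructing the WebCrawler.
function resolveDiscoveryDepth(
  sc: { crawlerOptions?: { maxDiscoveryDepth?: number } },
  jobCrawlerOptions?: { currentDiscoveryDepth?: number },
): { maxDiscoveryDepth?: number; currentDiscoveryDepth: number } {
  return {
    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth, // shared by the whole crawl
    currentDiscoveryDepth: jobCrawlerOptions?.currentDiscoveryDepth ?? 0, // per job; 0 for the seed
  };
}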


@@ -31,6 +31,8 @@ export class WebCrawler {
   private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
+  private maxDiscoveryDepth: number | undefined;
+  private currentDiscoveryDepth: number;
   constructor({
     jobId,
@@ -47,6 +49,8 @@
     allowSubdomains = false,
     ignoreRobotsTxt = false,
     regexOnFullURL = false,
+    maxDiscoveryDepth,
+    currentDiscoveryDepth,
   }: {
     jobId: string;
     initialUrl: string;
@@ -62,6 +66,8 @@
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
     regexOnFullURL?: boolean;
+    maxDiscoveryDepth?: number;
+    currentDiscoveryDepth?: number;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -81,6 +87,8 @@
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
     this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
+    this.maxDiscoveryDepth = maxDiscoveryDepth;
+    this.currentDiscoveryDepth = currentDiscoveryDepth ?? 0;
   }
   public filterLinks(
@@ -89,6 +97,11 @@
     maxDepth: number,
     fromMap: boolean = false,
   ): string[] {
+    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) {
+      this.logger.debug("Max discovery depth hit, filtering off all links", { currentDiscoveryDepth: this.currentDiscoveryDepth, maxDiscoveryDepth: this.maxDiscoveryDepth });
+      return [];
+    }
+
     // If the initial URL is a sitemap.xml, skip filtering
     if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
       return sitemapLinks.slice(0, limit);
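
The guard runs before any other filtering, so once a job's discovery depth has reached the limit, every link it found is dropped and nothing deeper is ever enqueued. Because the check is a strict-equality comparison against a possibly undefined maxDiscoveryDepth, crawls that never set the option are unaffected. A standalone sketch of just this cutoff (the real filterLinks also applies includes/excludes, robots.txt, maxDepth, and the limit):

// Simplified model of the cutoff added above.
function filterByDiscoveryDepth(
  links: string[],
  currentDiscoveryDepth: number,
  maxDiscoveryDepth?: number,
): string[] {
  // `number === undefined` is always false, so an unset limit disables the check.
  if (currentDiscoveryDepth === maxDiscoveryDepth) {
    return [];
  }
  return links;
}

filterByDiscoveryDepth(["https://firecrawl.dev/blog"], 0, 1); // kept: the seed job may still discover
filterByDiscoveryDepth(["https://firecrawl.dev/blog/post"], 1, 1); // []: limit reached, crawl stops here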


@@ -1044,6 +1044,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         job.data.crawl_id,
         sc,
         doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
+        job.data.crawlerOptions,
       );
       const links = crawler.filterLinks(
@@ -1088,6 +1089,10 @@
           team_id: sc.team_id,
           scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
           internalOptions: sc.internalOptions,
+          crawlerOptions: {
+            ...sc.crawlerOptions,
+            currentDiscoveryDepth: (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
+          },
           plan: job.data.plan,
           origin: job.data.origin,
           crawl_id: job.data.crawl_id,
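
On the worker side, every link that survives filtering becomes a new scrape job whose crawlerOptions copy the crawl-wide options and stamp the child with the parent's depth plus one; the seed job carries no currentDiscoveryDepth, so the ?? 0 fallback makes its children depth 1. A self-contained sketch of that step, with a hypothetical enqueueScrape standing in for the worker's real job-creation call:

// Simplified sketch of how discovered links are enqueued with an incremented
// discovery depth. enqueueScrape is a hypothetical stand-in, not the worker's API.
async function enqueueDiscoveredLinks(
  links: string[],
  crawlId: string,
  crawlWideOptions: Record<string, unknown>,
  parentJobOptions: { currentDiscoveryDepth?: number } | undefined,
  enqueueScrape: (job: {
    url: string;
    crawl_id: string;
    crawlerOptions: Record<string, unknown>;
  }) => Promise<void>,
): Promise<void> {
  for (const url of links) {
    await enqueueScrape({
      url,
      crawl_id: crawlId,
      crawlerOptions: {
        ...crawlWideOptions,
        // each child sits one discovery hop further from the seed than its parent
        currentDiscoveryDepth: (parentJobOptions?.currentDiscoveryDepth ?? 0) + 1,
      },
    });
  }
}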


@@ -160,6 +160,7 @@ export interface CrawlParams {
   includePaths?: string[];
   excludePaths?: string[];
   maxDepth?: number;
+  maxDiscoveryDepth?: number;
   limit?: number;
   allowBackwardLinks?: boolean;
   allowExternalLinks?: boolean;
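
On the JS SDK side the option is just another optional field on CrawlParams. A usage sketch mirroring the e2e test above, assuming the published @mendable/firecrawl-js client and its crawlUrl(url, params) method:

import FirecrawlApp from "@mendable/firecrawl-js";

// Assumes the API key is provided via the environment; replace as needed.
const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? "" });

const result = await app.crawlUrl("https://firecrawl.dev", {
  maxDiscoveryDepth: 1, // scrape the seed page and its direct links, nothing deeper
  limit: 10,
});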