feat(crawl): add maxDiscoveryDepth (#1329)
Parent: d855f5a567
Commit: 7cf2e52fe6
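As implemented below, maxDiscoveryDepth caps how many link-discovery hops the crawler follows from the start URL, which counts as depth 0. With maxDiscoveryDepth: 1, pages linked from the start page are still crawled, but links discovered on those pages are filtered off. This is distinct from the existing maxDepth option, which limits URL path depth rather than hop count.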
@@ -53,4 +53,21 @@ describe("Crawl tests", () => {
       }
     }
   }, 120000);
+
+  it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      ignoreSitemap: true,
+      maxDiscoveryDepth: 1,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/);
+      }
+    }
+  }, 120000);
 });
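The assertion works because individual blog posts sit two hops from the homepage (home → /blog → /blog/<slug>): with maxDiscoveryDepth: 1, discovery stops after the pages linked directly from firecrawl.dev, so no /blog/... URL should appear in the results. This relies on the homepage not linking straight to individual posts.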
@@ -440,6 +440,7 @@ const crawlerOptions = z
     includePaths: z.string().array().default([]),
     excludePaths: z.string().array().default([]),
     maxDepth: z.number().default(10), // default?
+    maxDiscoveryDepth: z.number().optional(),
     limit: z.number().default(10000), // default?
     allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
     allowExternalLinks: z.boolean().default(false),
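A quick sketch of what the schema change means for request parsing. This is a reduced stand-in for the real crawlerOptions object, keeping only the fields shown above; the input is hypothetical:

import { z } from "zod";

// Reduced stand-in for the crawlerOptions schema above.
const crawlerOptionsSketch = z.object({
  maxDepth: z.number().default(10),
  maxDiscoveryDepth: z.number().optional(),
  limit: z.number().default(10000),
});

const parsed = crawlerOptionsSketch.parse({ maxDiscoveryDepth: 1 });
// parsed.maxDiscoveryDepth === 1; maxDepth and limit take their defaults.
// Omitting the field leaves it undefined, which disables the new limit.
console.log(parsed); // { maxDepth: 10, maxDiscoveryDepth: 1, limit: 10000 }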
@@ -793,6 +794,8 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
     regexOnFullURL: x.regexOnFullURL,
+    maxDiscoveryDepth: x.maxDiscoveryDepth,
+    currentDiscoveryDepth: 0,
   };
 }
 
@@ -814,6 +817,7 @@ export function fromLegacyCrawlerOptions(x: any): {
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
     regexOnFullURL: x.regexOnFullURL,
+    maxDiscoveryDepth: x.maxDiscoveryDepth,
   }),
   internalOptions: {
     v0CrawlOnlyUrls: x.returnOnlyUrls,
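Note the asymmetry between the two mappings: toLegacyCrawlerOptions seeds currentDiscoveryDepth: 0, establishing the initial crawl job as depth zero, while fromLegacyCrawlerOptions round-trips only the user-facing maxDiscoveryDepth.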
@@ -379,6 +379,7 @@ export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
   newBase?: string,
+  crawlerOptions?: any,
 ): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
@@ -399,6 +400,8 @@ export function crawlToCrawler(
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
     regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
+    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
+    currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,
   });
 
   if (sc.robots !== undefined) {
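The new crawlerOptions parameter lets callers pass per-job state into the crawler: maxDiscoveryDepth always comes from the stored crawl's user-supplied options, while currentDiscoveryDepth comes from the specific job that triggered this step and falls back to 0 when no job options are supplied, as on the initial kickoff.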
@@ -31,6 +31,8 @@ export class WebCrawler {
   private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
+  private maxDiscoveryDepth: number | undefined;
+  private currentDiscoveryDepth: number;
 
   constructor({
     jobId,
@@ -47,6 +49,8 @@ export class WebCrawler {
     allowSubdomains = false,
     ignoreRobotsTxt = false,
     regexOnFullURL = false,
+    maxDiscoveryDepth,
+    currentDiscoveryDepth,
   }: {
     jobId: string;
     initialUrl: string;
@@ -62,6 +66,8 @@ export class WebCrawler {
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
     regexOnFullURL?: boolean;
+    maxDiscoveryDepth?: number;
+    currentDiscoveryDepth?: number;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -81,6 +87,8 @@ export class WebCrawler {
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
     this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
+    this.maxDiscoveryDepth = maxDiscoveryDepth;
+    this.currentDiscoveryDepth = currentDiscoveryDepth ?? 0;
   }
 
   public filterLinks(
@@ -89,6 +97,11 @@ export class WebCrawler {
     maxDepth: number,
     fromMap: boolean = false,
   ): string[] {
+    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) {
+      this.logger.debug("Max discovery depth hit, filtering off all links", { currentDiscoveryDepth: this.currentDiscoveryDepth, maxDiscoveryDepth: this.maxDiscoveryDepth });
+      return [];
+    }
+
     // If the initial URL is a sitemap.xml, skip filtering
     if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
       return sitemapLinks.slice(0, limit);
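Taken on its own, the new guard is a simple depth gate: a crawler instance constructed at the maximum discovery depth reports zero outgoing links, so nothing further is enqueued. A minimal self-contained sketch of that behavior; DepthGate is a hypothetical stand-in for WebCrawler, not the real class:

// Minimal model of the discovery-depth gate added to filterLinks.
class DepthGate {
  constructor(
    private currentDiscoveryDepth: number,
    private maxDiscoveryDepth?: number,
  ) {}

  filterLinks(links: string[]): string[] {
    // At the maximum depth, all discovered links are filtered off.
    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) return [];
    return links;
  }
}

const links = ["https://example.com/a", "https://example.com/b"];
console.log(new DepthGate(0, 1).filterLinks(links).length); // 2 — still discovering
console.log(new DepthGate(1, 1).filterLinks(links).length); // 0 — gate closed
// With maxDiscoveryDepth undefined the strict equality never holds,
// so discovery is unlimited, as before this change.
console.log(new DepthGate(5).filterLinks(links).length);    // 2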
@@ -1044,6 +1044,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         job.data.crawl_id,
         sc,
         doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
+        job.data.crawlerOptions,
       );
 
       const links = crawler.filterLinks(
@@ -1088,6 +1089,10 @@ async function processJob(job: Job & { id: string }, token: string) {
           team_id: sc.team_id,
           scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
           internalOptions: sc.internalOptions,
+          crawlerOptions: {
+            ...sc.crawlerOptions,
+            currentDiscoveryDepth: (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
+          },
           plan: job.data.plan,
           origin: job.data.origin,
           crawl_id: job.data.crawl_id,
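Each child job carries its parent's depth plus one, so currentDiscoveryDepth tracks BFS levels from the start URL. A simplified model of that propagation, with hypothetical names, not the actual queue worker:

// Simplified model of how currentDiscoveryDepth propagates through the queue.
type CrawlJob = { url: string; currentDiscoveryDepth: number };

function discoverChildren(
  parent: CrawlJob,
  links: string[],
  maxDiscoveryDepth?: number,
): CrawlJob[] {
  // Parents already at the cap discover nothing (the filterLinks gate)...
  if (parent.currentDiscoveryDepth === maxDiscoveryDepth) return [];
  // ...otherwise each child job records the parent's depth plus one.
  return links.map((url) => ({
    url,
    currentDiscoveryDepth: parent.currentDiscoveryDepth + 1,
  }));
}

const root: CrawlJob = { url: "https://firecrawl.dev", currentDiscoveryDepth: 0 };
const level1 = discoverChildren(root, ["https://firecrawl.dev/blog"], 1);
// level1[0].currentDiscoveryDepth === 1
// discoverChildren(level1[0], ["https://firecrawl.dev/blog/post"], 1) === []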
@@ -160,6 +160,7 @@ export interface CrawlParams {
   includePaths?: string[];
   excludePaths?: string[];
   maxDepth?: number;
+  maxDiscoveryDepth?: number;
   limit?: number;
   allowBackwardLinks?: boolean;
   allowExternalLinks?: boolean;
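With the field on CrawlParams, SDK callers can pass it straight through. A sketch assuming the @mendable/firecrawl-js v1 client and a reachable API; the API key is a placeholder:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function main() {
  // Crawl the start page plus pages one discovery hop away, capped at 10 pages.
  const res = await app.crawlUrl("https://firecrawl.dev", {
    maxDiscoveryDepth: 1,
    limit: 10,
  });
  if (res.success) {
    console.log(`crawled ${res.data.length} pages`);
  }
}

main();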