feat(crawl): includes/excludes fixes (FIR-1300) (#1303)

* feat(crawl): includes/excludes fixes pt. 1

* fix(snips): billing tests

* drop the logs

* fix(ci): add replica url

* feat(crawl): drop initial scrape if it's not included

* feat(ci): more verbose logging

* fix crawl path in test

* fix(ci): wait for api

* fix(snips/scrape/ad): test for more pixels

* feat(js-sdk/crawl): add regexOnFullURL
Gergő Móricz 2025-03-06 17:05:15 +01:00 committed by GitHub
parent f8df18ed6a
commit e1cfe1da48
11 changed files with 81 additions and 10 deletions

@@ -15,6 +15,7 @@ env:
   ENV: ${{ secrets.ENV }}
   TEST_SUITE_SELF_HOSTED: true
   USE_GO_MARKDOWN_PARSER: true
+  FIRECRAWL_DEBUG_FILTER_LINKS: true
 jobs:
   test:

@@ -21,6 +21,7 @@ env:
   SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
   SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
   SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
+  SUPABASE_REPLICA_URL: ${{ secrets.SUPABASE_REPLICA_URL }}
   TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: true
@@ -72,18 +73,20 @@ jobs:
           chmod +x html-to-markdown.so
         working-directory: ./apps/api/sharedLibs/go-html-to-md
       - name: Start the application
-        run: npm start &
+        run: npm start > api.log 2>&1 &
         working-directory: ./apps/api
         id: start_app
       - name: Start worker
-        run: npm run workers &
+        run: npm run workers > worker.log 2>&1 &
        working-directory: ./apps/api
         id: start_workers
       - name: Start index worker
         run: npm run index-worker &
         working-directory: ./apps/api
         id: start_index_worker
+      - name: Wait for API
+        run: pnpx wait-on tcp:3002 -t 15s
       - name: Run snippet tests
         run: |
           npm run test:snips
-        working-directory: ./apps/api
+        working-directory: ./apps/api

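For readers following the CI change: the new step shells out to `pnpx wait-on tcp:3002 -t 15s` before the snippet tests run. A rough TypeScript equivalent of that readiness check, for illustration only (waitForTcp is a made-up name; it just retries a TCP connection with Node's net module until the port accepts or the timeout expires):

// Illustrative sketch of what the wait-on step does.
import net from "node:net";

async function waitForTcp(port: number, host = "127.0.0.1", timeoutMs = 15_000): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const connected = await new Promise<boolean>((resolve) => {
      const socket = net.connect({ port, host });
      socket.once("connect", () => { socket.destroy(); resolve(true); });
      socket.once("error", () => { socket.destroy(); resolve(false); });
    });
    if (connected) return;
    await new Promise((r) => setTimeout(r, 500)); // brief backoff between attempts
  }
  throw new Error(`Port ${port} did not open within ${timeoutMs}ms`);
}

// e.g. await waitForTcp(3002);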

@@ -125,13 +125,18 @@ describe("Billing tests", () => {
         })
       ]);
+      expect(crawl1.success).toBe(true);
+      expect(crawl2.success).toBe(true);
       // sum: x+5y credits
       await sleepForBatchBilling();
       const rc2 = (await creditUsage()).remaining_credits;
-      expect(rc1 - rc2).toBe(crawl1.body.completed + crawl2.body.completed * 5);
+      if (crawl1.success && crawl2.success) {
+        expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
+      }
     }, 300000);
     it("bills map correctly", async () => {

@@ -7,4 +7,50 @@ describe("Crawl tests", () => {
       limit: 10,
     });
   }, 120000);
+  it.concurrent("filters URLs properly", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^/pricing$"],
+      limit: 10,
+    });
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+  it.concurrent("filters URLs properly when using regexOnFullURL", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
+      regexOnFullURL: true,
+      limit: 10,
+    });
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+  it.concurrent("discovers URLs properly when origin is not included", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      includePaths: ["^/blog"],
+      ignoreSitemap: true,
+      limit: 10,
+    });
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/);
+      }
+    }
+  }, 120000);
 });

@@ -1,7 +1,7 @@
 import { configDotenv } from "dotenv";
 configDotenv();
-import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput } from "../../controllers/v1/types";
+import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse } from "../../controllers/v1/types";
 import request from "supertest";
 // =========================================
@@ -69,7 +69,7 @@ function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>)
   expect(response.body.data.length).toBeGreaterThan(0);
 }
-export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
+export async function crawl(body: CrawlRequestInput): Promise<CrawlStatusResponse> {
   const cs = await crawlStart(body);
   expectCrawlStartToSucceed(cs);
@@ -82,7 +82,7 @@ export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlSta
   } while (x.body.status === "scraping");
   expectCrawlToSucceed(x);
-  return x;
+  return x.body;
 }
 // =========================================

@@ -81,7 +81,7 @@ describe("Scrape tests", () => {
       blockAds: false,
     });
-    expect(response.markdown).toContain(".g.doubleclick.net/");
+    expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
   }, 30000);
 });

@@ -448,6 +448,7 @@ const crawlerOptions = z
     ignoreSitemap: z.boolean().default(false),
     deduplicateSimilarURLs: z.boolean().default(true),
     ignoreQueryParameters: z.boolean().default(false),
+    regexOnFullURL: z.boolean().default(false),
   })
   .strict(strictMessage);
@@ -791,6 +792,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
+    regexOnFullURL: x.regexOnFullURL,
   };
 }
@@ -811,6 +813,7 @@ export function fromLegacyCrawlerOptions(x: any): {
       ignoreSitemap: x.ignoreSitemap,
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
       ignoreQueryParameters: x.ignoreQueryParameters,
+      regexOnFullURL: x.regexOnFullURL,
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,

@@ -398,6 +398,7 @@ export function crawlToCrawler(
       sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+    regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
   });
   if (sc.robots !== undefined) {

@@ -28,6 +28,7 @@ export class WebCrawler {
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
+  private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
@@ -45,6 +46,7 @@
     allowExternalContentLinks = false,
     allowSubdomains = false,
     ignoreRobotsTxt = false,
+    regexOnFullURL = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -59,6 +61,7 @@
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
+    regexOnFullURL?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -76,6 +79,7 @@
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
+    this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
   }
@@ -115,11 +119,13 @@
       return false;
     }
+    const excincPath = this.regexOnFullURL ? link : path;
     // Check if the link should be excluded
     if (this.excludes.length > 0 && this.excludes[0] !== "") {
       if (
         this.excludes.some((excludePattern) =>
-          new RegExp(excludePattern).test(path),
+          new RegExp(excludePattern).test(excincPath),
         )
       ) {
         if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
@@ -133,7 +139,7 @@
     if (this.includes.length > 0 && this.includes[0] !== "") {
       if (
         !this.includes.some((includePattern) =>
-          new RegExp(includePattern).test(path),
+          new RegExp(includePattern).test(excincPath),
         )
       ) {
         if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {

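To summarize the crawler change above: include and exclude patterns are now tested against either the URL path (default) or the full URL when regexOnFullURL is set. A minimal standalone sketch of that decision, with isAllowed as an illustrative name rather than the crawler's actual method:

// Sketch only: the patterns match the full URL when regexOnFullURL is true,
// otherwise just the pathname, mirroring the excincPath logic in the diff.
function isAllowed(
  link: string,
  includes: string[],
  excludes: string[],
  regexOnFullURL: boolean,
): boolean {
  const path = new URL(link).pathname;
  const subject = regexOnFullURL ? link : path;

  if (excludes.length > 0 && excludes[0] !== "" &&
      excludes.some((pattern) => new RegExp(pattern).test(subject))) {
    return false;
  }
  if (includes.length > 0 && includes[0] !== "" &&
      !includes.some((pattern) => new RegExp(pattern).test(subject))) {
    return false;
  }
  return true;
}

// "^/pricing$" matches https://firecrawl.dev/pricing by default, while
// "^https://(www\\.)?firecrawl\\.dev/pricing$" only matches with regexOnFullURL: true.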

@@ -1112,6 +1112,11 @@ async function processJob(job: Job & { id: string }, token: string) {
           // });
         }
       }
+      // Only run check after adding new jobs for discovery - mogery
+      if (job.data.isCrawlSourceScrape && crawler.filterLinks([doc.metadata.url ?? doc.metadata.sourceURL!], 1, sc.crawlerOptions?.maxDepth ?? 10).length === 0) {
+        throw new Error("Source URL is not allowed by includePaths/excludePaths rules")
+      }
     }
   }

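The guard above runs only after the discovered links have been enqueued, so an initial scrape that falls outside includePaths/excludePaths still seeds discovery but is not kept as a result. A hedged restatement of the same check in isolation (assertSourceDocumentAllowed is a made-up name):

// Sketch under assumed names: the source page is always scraped so its links can
// seed discovery, but its own document is kept only if it passes the same
// include/exclude filtering as any other link.
function assertSourceDocumentAllowed(
  isCrawlSourceScrape: boolean,
  sourceUrl: string,
  filterLinks: (links: string[], limit: number, maxDepth: number) => string[],
  maxDepth: number = 10,
): void {
  if (isCrawlSourceScrape && filterLinks([sourceUrl], 1, maxDepth).length === 0) {
    throw new Error("Source URL is not allowed by includePaths/excludePaths rules");
  }
}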

@@ -173,6 +173,7 @@ export interface CrawlParams {
   };
   deduplicateSimilarURLs?: boolean;
   ignoreQueryParameters?: boolean;
+  regexOnFullURL?: boolean;
 }
 /**
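For SDK users, a hedged usage sketch of the new CrawlParams field, assuming the JS SDK's FirecrawlApp and crawlUrl entry points; only the regexOnFullURL option is introduced by this change:

// Usage sketch; FirecrawlApp / crawlUrl are assumed SDK entry points.
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

// Match includePaths against the full URL rather than just the path.
const crawlResult = await app.crawlUrl("https://firecrawl.dev/pricing", {
  includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
  regexOnFullURL: true,
  limit: 10,
});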