feat(crawl): includes/excludes fixes (FIR-1300) (#1303)

* feat(crawl): includes/excludes fixes pt. 1

* fix(snips): billing tests

* drop the logs

* fix(ci): add replica url

* feat(crawl): drop initial scrape if it's not included

* feat(ci): more verbose logging

* fix crawl path in test

* fix(ci): wait for api

* fix(snips/scrape/ad): test for more pixels

* feat(js-sdk/crawl): add regexOnFullURL
Gergő Móricz authored 2025-03-06 17:05:15 +01:00, committed by GitHub
parent f8df18ed6a, commit e1cfe1da48
11 changed files with 81 additions and 10 deletions


@@ -15,6 +15,7 @@ env:
   ENV: ${{ secrets.ENV }}
   TEST_SUITE_SELF_HOSTED: true
   USE_GO_MARKDOWN_PARSER: true
+  FIRECRAWL_DEBUG_FILTER_LINKS: true

 jobs:
   test:


@@ -21,6 +21,7 @@ env:
   SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
   SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
   SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
+  SUPABASE_REPLICA_URL: ${{ secrets.SUPABASE_REPLICA_URL }}
   TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: true
@@ -72,17 +73,19 @@ jobs:
           chmod +x html-to-markdown.so
         working-directory: ./apps/api/sharedLibs/go-html-to-md
       - name: Start the application
-        run: npm start &
+        run: npm start > api.log 2>&1 &
         working-directory: ./apps/api
         id: start_app
       - name: Start worker
-        run: npm run workers &
+        run: npm run workers > worker.log 2>&1 &
         working-directory: ./apps/api
         id: start_workers
       - name: Start index worker
         run: npm run index-worker &
         working-directory: ./apps/api
         id: start_index_worker
+      - name: Wait for API
+        run: pnpx wait-on tcp:3002 -t 15s
       - name: Run snippet tests
         run: |
           npm run test:snips


@@ -125,13 +125,18 @@ describe("Billing tests", () => {
       })
     ]);

+    expect(crawl1.success).toBe(true);
+    expect(crawl2.success).toBe(true);
+
     // sum: x+5y credits

     await sleepForBatchBilling();

     const rc2 = (await creditUsage()).remaining_credits;

-    expect(rc1 - rc2).toBe(crawl1.body.completed + crawl2.body.completed * 5);
+    if (crawl1.success && crawl2.success) {
+      expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
+    }
   }, 300000);

   it("bills map correctly", async () => {


@@ -7,4 +7,50 @@ describe("Crawl tests", () => {
       limit: 10,
     });
   }, 120000);
+
+  it.concurrent("filters URLs properly", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^/pricing$"],
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+
+  it.concurrent("filters URLs properly when using regexOnFullURL", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
+      regexOnFullURL: true,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+
+  it.concurrent("discovers URLs properly when origin is not included", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      includePaths: ["^/blog"],
+      ignoreSitemap: true,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/);
+      }
+    }
+  }, 120000);
 });


@@ -1,7 +1,7 @@
 import { configDotenv } from "dotenv";
 configDotenv();

-import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput } from "../../controllers/v1/types";
+import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse } from "../../controllers/v1/types";
 import request from "supertest";

 // =========================================
@@ -69,7 +69,7 @@ function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>) {
   expect(response.body.data.length).toBeGreaterThan(0);
 }

-export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
+export async function crawl(body: CrawlRequestInput): Promise<CrawlStatusResponse> {
   const cs = await crawlStart(body);
   expectCrawlStartToSucceed(cs);
@@ -82,7 +82,7 @@ export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
   } while (x.body.status === "scraping");

   expectCrawlToSucceed(x);
-  return x;
+  return x.body;
 }

 // =========================================
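
The crawl() test helper now resolves to the crawl status body rather than the raw supertest response, which is why the tests above narrow on res.success before touching completed or data. A minimal sketch of the discriminated union this relies on (field names taken from the tests; the real CrawlStatusResponse in controllers/v1/types is the authoritative, richer shape):

// Illustrative only -- simplified from what the tests exercise.
type IllustrativeCrawlStatusResponse =
  | { success: false; error: string }
  | {
      success: true;
      completed: number;
      data: { metadata: { url?: string; sourceURL?: string } }[];
    };

function logFirstPage(res: IllustrativeCrawlStatusResponse) {
  if (res.success) {
    // Inside this branch TypeScript knows `completed` and `data` exist.
    console.log(res.completed, res.data[0]?.metadata.sourceURL);
  }
}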


@@ -81,7 +81,7 @@ describe("Scrape tests", () => {
         blockAds: false,
       });

-      expect(response.markdown).toContain(".g.doubleclick.net/");
+      expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
     }, 30000);
   });


@@ -448,6 +448,7 @@ const crawlerOptions = z
     ignoreSitemap: z.boolean().default(false),
     deduplicateSimilarURLs: z.boolean().default(true),
     ignoreQueryParameters: z.boolean().default(false),
+    regexOnFullURL: z.boolean().default(false),
   })
   .strict(strictMessage);
@@ -791,6 +792,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
+    regexOnFullURL: x.regexOnFullURL,
   };
 }
@@ -811,6 +813,7 @@ export function fromLegacyCrawlerOptions(x: any): {
       ignoreSitemap: x.ignoreSitemap,
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
       ignoreQueryParameters: x.ignoreQueryParameters,
+      regexOnFullURL: x.regexOnFullURL,
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
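
Because the new flag is declared with z.boolean().default(false), request bodies that omit it keep parsing unchanged and the crawler sees an explicit false. A minimal standalone sketch of that behaviour, using an illustrative two-field schema rather than the full crawlerOptions object:

import { z } from "zod";

// Hypothetical subset of the crawler options schema.
const options = z
  .object({
    ignoreQueryParameters: z.boolean().default(false),
    regexOnFullURL: z.boolean().default(false),
  })
  .strict();

console.log(options.parse({}));
// => { ignoreQueryParameters: false, regexOnFullURL: false }
console.log(options.parse({ regexOnFullURL: true }));
// => { ignoreQueryParameters: false, regexOnFullURL: true }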


@@ -398,6 +398,7 @@ export function crawlToCrawler(
       sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+    regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
   });

   if (sc.robots !== undefined) {


@@ -28,6 +28,7 @@ export class WebCrawler {
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
+  private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
@@ -45,6 +46,7 @@
     allowExternalContentLinks = false,
     allowSubdomains = false,
     ignoreRobotsTxt = false,
+    regexOnFullURL = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -59,6 +61,7 @@
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
+    regexOnFullURL?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -76,6 +79,7 @@
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
+    this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
   }
@@ -115,11 +119,13 @@
         return false;
       }

+      const excincPath = this.regexOnFullURL ? link : path;
+
       // Check if the link should be excluded
       if (this.excludes.length > 0 && this.excludes[0] !== "") {
         if (
           this.excludes.some((excludePattern) =>
-            new RegExp(excludePattern).test(path),
+            new RegExp(excludePattern).test(excincPath),
           )
         ) {
           if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
@@ -133,7 +139,7 @@
       if (this.includes.length > 0 && this.includes[0] !== "") {
         if (
           !this.includes.some((includePattern) =>
-            new RegExp(includePattern).test(path),
+            new RegExp(includePattern).test(excincPath),
           )
         ) {
           if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
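
The net effect of regexOnFullURL is that include/exclude patterns are tested against the full URL instead of just the pathname, which is what lets a pattern like "^https://(www\\.)?firecrawl\\.dev/pricing$" match. A rough standalone sketch of that selection logic (hypothetical helper, simplified from filterLinks above):

// Hypothetical, simplified include/exclude check.
function isAllowed(
  link: string,
  includes: string[],
  excludes: string[],
  regexOnFullURL: boolean,
): boolean {
  const path = new URL(link).pathname;
  const subject = regexOnFullURL ? link : path; // what the patterns run against

  if (excludes.length > 0 && excludes.some((p) => new RegExp(p).test(subject))) {
    return false;
  }
  if (includes.length > 0 && !includes.some((p) => new RegExp(p).test(subject))) {
    return false;
  }
  return true;
}

isAllowed("https://firecrawl.dev/pricing", ["^/pricing$"], [], false);                                // true (path match)
isAllowed("https://firecrawl.dev/pricing", ["^https://(www\\.)?firecrawl\\.dev/pricing$"], [], true); // true (full-URL match)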


@@ -1112,6 +1112,11 @@ async function processJob(job: Job & { id: string }, token: string) {
           //   });
         }
       }
+
+      // Only run check after adding new jobs for discovery - mogery
+      if (job.data.isCrawlSourceScrape && crawler.filterLinks([doc.metadata.url ?? doc.metadata.sourceURL!], 1, sc.crawlerOptions?.maxDepth ?? 10).length === 0) {
+        throw new Error("Source URL is not allowed by includePaths/excludePaths rules")
+      }
     }
   }
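
This is the change behind "drop initial scrape if it's not included": after discovery has been queued, the source scrape itself is re-checked against the include/exclude rules and rejected if it would not have been crawled. A rough sketch of that guard, with filterLinks passed in as a dependency rather than taken from the real WebCrawler (assumed shape: links, limit, maxDepth in, allowed subset out):

// Hypothetical guard mirroring the worker logic above.
function assertSourceUrlAllowed(
  isCrawlSourceScrape: boolean,
  sourceUrl: string,
  filterLinks: (links: string[], limit: number, maxDepth: number) => string[],
  maxDepth = 10,
): void {
  if (isCrawlSourceScrape && filterLinks([sourceUrl], 1, maxDepth).length === 0) {
    throw new Error("Source URL is not allowed by includePaths/excludePaths rules");
  }
}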


@@ -173,6 +173,7 @@ export interface CrawlParams {
   };
   deduplicateSimilarURLs?: boolean;
   ignoreQueryParameters?: boolean;
+  regexOnFullURL?: boolean;
 }

 /**
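
With regexOnFullURL now on CrawlParams, the behaviour can be requested from the JS SDK. A minimal sketch, assuming the SDK's default FirecrawlApp export and its crawlUrl(url, params) method; adjust to the SDK version in use:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? "" });

// Match the include pattern against the whole URL, not just the path.
const result = await app.crawlUrl("https://firecrawl.dev/pricing", {
  includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
  regexOnFullURL: true,
  limit: 10,
});

console.log(result);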