diff --git a/.github/workflows/test-server-self-host.yml b/.github/workflows/test-server-self-host.yml
index 301353df..26d3aad3 100644
--- a/.github/workflows/test-server-self-host.yml
+++ b/.github/workflows/test-server-self-host.yml
@@ -15,6 +15,7 @@ env:
   ENV: ${{ secrets.ENV }}
   TEST_SUITE_SELF_HOSTED: true
   USE_GO_MARKDOWN_PARSER: true
+  FIRECRAWL_DEBUG_FILTER_LINKS: true
 
 jobs:
   test:
diff --git a/.github/workflows/test-server.yml b/.github/workflows/test-server.yml
index 188cbeb7..66156261 100644
--- a/.github/workflows/test-server.yml
+++ b/.github/workflows/test-server.yml
@@ -21,6 +21,7 @@ env:
   SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
   SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
   SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
+  SUPABASE_REPLICA_URL: ${{ secrets.SUPABASE_REPLICA_URL }}
   TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: true
@@ -72,18 +73,20 @@ jobs:
           chmod +x html-to-markdown.so
         working-directory: ./apps/api/sharedLibs/go-html-to-md
       - name: Start the application
-        run: npm start &
+        run: npm start > api.log 2>&1 &
         working-directory: ./apps/api
         id: start_app
       - name: Start worker
-        run: npm run workers &
+        run: npm run workers > worker.log 2>&1 &
         working-directory: ./apps/api
         id: start_workers
       - name: Start index worker
        run: npm run index-worker &
         working-directory: ./apps/api
         id: start_index_worker
+      - name: Wait for API
+        run: pnpx wait-on tcp:3002 -t 15s
       - name: Run snippet tests
         run: |
           npm run test:snips
-        working-directory: ./apps/api
+        working-directory: ./apps/api
\ No newline at end of file
diff --git a/apps/api/src/__tests__/snips/billing.test.ts b/apps/api/src/__tests__/snips/billing.test.ts
index 314c3f7a..b3639fcb 100644
--- a/apps/api/src/__tests__/snips/billing.test.ts
+++ b/apps/api/src/__tests__/snips/billing.test.ts
@@ -125,13 +125,18 @@ describe("Billing tests", () => {
       })
     ]);
 
+    expect(crawl1.success).toBe(true);
+    expect(crawl2.success).toBe(true);
+
     // sum: x+5y credits
 
     await sleepForBatchBilling();
 
     const rc2 = (await creditUsage()).remaining_credits;
 
-    expect(rc1 - rc2).toBe(crawl1.body.completed + crawl2.body.completed * 5);
+    if (crawl1.success && crawl2.success) {
+      expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
+    }
   }, 300000);
 
   it("bills map correctly", async () => {
diff --git a/apps/api/src/__tests__/snips/crawl.test.ts b/apps/api/src/__tests__/snips/crawl.test.ts
index 67d5a181..f388243d 100644
--- a/apps/api/src/__tests__/snips/crawl.test.ts
+++ b/apps/api/src/__tests__/snips/crawl.test.ts
@@ -7,4 +7,50 @@ describe("Crawl tests", () => {
       limit: 10,
     });
   }, 120000);
+
+  it.concurrent("filters URLs properly", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^/pricing$"],
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+
+  it.concurrent("filters URLs properly when using regexOnFullURL", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
+      regexOnFullURL: true,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+
+  it.concurrent("discovers URLs properly when origin is not included", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      includePaths: ["^/blog"],
+      ignoreSitemap: true,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/);
+      }
+    }
+  }, 120000);
 });
diff --git a/apps/api/src/__tests__/snips/lib.ts b/apps/api/src/__tests__/snips/lib.ts
index c296f465..fb1f6cff 100644
--- a/apps/api/src/__tests__/snips/lib.ts
+++ b/apps/api/src/__tests__/snips/lib.ts
@@ -1,7 +1,7 @@
 import { configDotenv } from "dotenv";
 configDotenv();
 
-import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput } from "../../controllers/v1/types";
+import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse } from "../../controllers/v1/types";
 import request from "supertest";
 
 // =========================================
@@ -69,7 +69,7 @@ function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>)
   expect(response.body.data.length).toBeGreaterThan(0);
 }
 
-export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
+export async function crawl(body: CrawlRequestInput): Promise<CrawlStatusResponse> {
   const cs = await crawlStart(body);
   expectCrawlStartToSucceed(cs);
 
@@ -82,7 +82,7 @@ export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
     blockAds: false,
   });
 
-  expect(response.markdown).toContain(".g.doubleclick.net/");
+  expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
 }, 30000);
 });
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index efb5e094..4f946f03 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -448,6 +448,7 @@ const crawlerOptions = z
     ignoreSitemap: z.boolean().default(false),
     deduplicateSimilarURLs: z.boolean().default(true),
     ignoreQueryParameters: z.boolean().default(false),
+    regexOnFullURL: z.boolean().default(false),
   })
   .strict(strictMessage);
 
@@ -791,6 +792,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
+    regexOnFullURL: x.regexOnFullURL,
   };
 }
 
@@ -811,6 +813,7 @@ export function fromLegacyCrawlerOptions(x: any): {
       ignoreSitemap: x.ignoreSitemap,
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
       ignoreQueryParameters: x.ignoreQueryParameters,
+      regexOnFullURL: x.regexOnFullURL,
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 526ba235..256d7435 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -398,6 +398,7 @@ export function crawlToCrawler(
       sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+    regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
   });
 
   if (sc.robots !== undefined) {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 126520e2..ea93110a 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -28,6 +28,7 @@ export class WebCrawler {
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
+  private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
 
@@ -45,6 +46,7 @@ export class WebCrawler {
     allowExternalContentLinks = false,
     allowSubdomains = false,
     ignoreRobotsTxt = false,
+    regexOnFullURL = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -59,6 +61,7 @@ export class WebCrawler {
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
+    regexOnFullURL?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -76,6 +79,7 @@ export class WebCrawler {
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
+    this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
   }
 
@@ -115,11 +119,13 @@ export class WebCrawler {
         return false;
       }
 
+      const excincPath = this.regexOnFullURL ? link : path;
+
       // Check if the link should be excluded
       if (this.excludes.length > 0 && this.excludes[0] !== "") {
         if (
           this.excludes.some((excludePattern) =>
-            new RegExp(excludePattern).test(path),
+            new RegExp(excludePattern).test(excincPath),
           )
         ) {
           if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
@@ -133,7 +139,7 @@
       if (this.includes.length > 0 && this.includes[0] !== "") {
         if (
           !this.includes.some((includePattern) =>
-            new RegExp(includePattern).test(path),
+            new RegExp(includePattern).test(excincPath),
           )
         ) {
           if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 86375ec2..6287cf95 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -1112,6 +1112,11 @@ async function processJob(job: Job & { id: string }, token: string) {
           // });
         }
       }
+
+      // Only run check after adding new jobs for discovery - mogery
+      if (job.data.isCrawlSourceScrape && crawler.filterLinks([doc.metadata.url ?? doc.metadata.sourceURL!], 1, sc.crawlerOptions?.maxDepth ?? 10).length === 0) {
+        throw new Error("Source URL is not allowed by includePaths/excludePaths rules")
+      }
     }
   }
 
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 6d3093b8..006e3737 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -173,6 +173,7 @@ export interface CrawlParams {
   };
   deduplicateSimilarURLs?: boolean;
   ignoreQueryParameters?: boolean;
+  regexOnFullURL?: boolean;
 }
 
 /**
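
For reviewers, a minimal sketch of how the new regexOnFullURL flag could be exercised from the JS SDK once the CrawlParams change above ships. It mirrors the regex used in the new crawl.test.ts case; the FirecrawlApp.crawlUrl call, includePaths option, and the success/completed fields on the result are assumptions based on the existing SDK surface, not documented behavior of this PR.

import FirecrawlApp from "@mendable/firecrawl-js";

// Sketch only: assumes the regexOnFullURL field added to CrawlParams in this diff
// is available in the published SDK types.
const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? "" });

async function main() {
  const result = await app.crawlUrl("https://firecrawl.dev/pricing", {
    // With regexOnFullURL: true, include/exclude patterns are matched against the
    // full URL ("https://firecrawl.dev/pricing") rather than just the path ("/pricing").
    includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
    regexOnFullURL: true,
    limit: 10,
  });

  if (result.success) {
    console.log(`crawled ${result.completed} page(s)`);
  }
}

main();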