feat(crawl): includes/excludes fixes (FIR-1300) (#1303)

* feat(crawl): includes/excludes fixes pt. 1

* fix(snips): billing tests

* drop the logs

* fix(ci): add replica url

* feat(crawl): drop initial scrape if it's not included

* feat(ci): more verbose logging

* fix crawl path in test

* fix(ci): wait for api

* fix(snips/scrape/ad): test for more pixels

* feat(js-sdk/crawl): add regexOnFullURL
Gergő Móricz 2025-03-06 17:05:15 +01:00 committed by GitHub
parent f8df18ed6a
commit e1cfe1da48
11 changed files with 81 additions and 10 deletions

@@ -15,6 +15,7 @@ env:
   ENV: ${{ secrets.ENV }}
   TEST_SUITE_SELF_HOSTED: true
   USE_GO_MARKDOWN_PARSER: true
+  FIRECRAWL_DEBUG_FILTER_LINKS: true
 jobs:
   test:

@@ -21,6 +21,7 @@ env:
   SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
   SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
   SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
+  SUPABASE_REPLICA_URL: ${{ secrets.SUPABASE_REPLICA_URL }}
   TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: true
@@ -72,18 +73,20 @@ jobs:
           chmod +x html-to-markdown.so
         working-directory: ./apps/api/sharedLibs/go-html-to-md
       - name: Start the application
-        run: npm start &
+        run: npm start > api.log 2>&1 &
         working-directory: ./apps/api
         id: start_app
       - name: Start worker
-        run: npm run workers &
+        run: npm run workers > worker.log 2>&1 &
        working-directory: ./apps/api
         id: start_workers
       - name: Start index worker
         run: npm run index-worker &
         working-directory: ./apps/api
         id: start_index_worker
+      - name: Wait for API
+        run: pnpx wait-on tcp:3002 -t 15s
       - name: Run snippet tests
         run: |
           npm run test:snips
-        working-directory: ./apps/api
+        working-directory: ./apps/api

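For readers following the CI change: the new step shells out to `pnpx wait-on tcp:3002 -t 15s` before the snippet tests run. A rough TypeScript equivalent of that readiness check, for illustration only (waitForTcp is a made-up name; it just retries a TCP connection with Node's net module until the port accepts or the timeout expires):

// Illustrative sketch of what the wait-on step does.
import net from "node:net";

async function waitForTcp(port: number, host = "127.0.0.1", timeoutMs = 15_000): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    const connected = await new Promise<boolean>((resolve) => {
      const socket = net.connect({ port, host });
      socket.once("connect", () => { socket.destroy(); resolve(true); });
      socket.once("error", () => { socket.destroy(); resolve(false); });
    });
    if (connected) return;
    await new Promise((r) => setTimeout(r, 500)); // brief backoff between attempts
  }
  throw new Error(`Port ${port} did not open within ${timeoutMs}ms`);
}

// e.g. await waitForTcp(3002);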

@@ -125,13 +125,18 @@ describe("Billing tests", () => {
         })
       ]);
+      expect(crawl1.success).toBe(true);
+      expect(crawl2.success).toBe(true);
       // sum: x+5y credits
       await sleepForBatchBilling();
       const rc2 = (await creditUsage()).remaining_credits;
-      expect(rc1 - rc2).toBe(crawl1.body.completed + crawl2.body.completed * 5);
+      if (crawl1.success && crawl2.success) {
+        expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
+      }
     }, 300000);
     it("bills map correctly", async () => {

@@ -7,4 +7,50 @@ describe("Crawl tests", () => {
       limit: 10,
     });
   }, 120000);
+  it.concurrent("filters URLs properly", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^/pricing$"],
+      limit: 10,
+    });
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+  it.concurrent("filters URLs properly when using regexOnFullURL", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
+      regexOnFullURL: true,
+      limit: 10,
+    });
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+  it.concurrent("discovers URLs properly when origin is not included", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      includePaths: ["^/blog"],
+      ignoreSitemap: true,
+      limit: 10,
+    });
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/);
+      }
+    }
+  }, 120000);
 });

@@ -1,7 +1,7 @@
 import { configDotenv } from "dotenv";
 configDotenv();
-import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput } from "../../controllers/v1/types";
+import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse } from "../../controllers/v1/types";
 import request from "supertest";
 // =========================================
@@ -69,7 +69,7 @@ function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>)
   expect(response.body.data.length).toBeGreaterThan(0);
 }
-export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
+export async function crawl(body: CrawlRequestInput): Promise<CrawlStatusResponse> {
   const cs = await crawlStart(body);
   expectCrawlStartToSucceed(cs);
@@ -82,7 +82,7 @@ export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlSta
   } while (x.body.status === "scraping");
   expectCrawlToSucceed(x);
-  return x;
+  return x.body;
 }
 // =========================================

@@ -81,7 +81,7 @@ describe("Scrape tests", () => {
       blockAds: false,
     });
-    expect(response.markdown).toContain(".g.doubleclick.net/");
+    expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
   }, 30000);
 });

@@ -448,6 +448,7 @@ const crawlerOptions = z
     ignoreSitemap: z.boolean().default(false),
     deduplicateSimilarURLs: z.boolean().default(true),
     ignoreQueryParameters: z.boolean().default(false),
+    regexOnFullURL: z.boolean().default(false),
   })
   .strict(strictMessage);
@@ -791,6 +792,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
+    regexOnFullURL: x.regexOnFullURL,
   };
 }
@@ -811,6 +813,7 @@ export function fromLegacyCrawlerOptions(x: any): {
       ignoreSitemap: x.ignoreSitemap,
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
       ignoreQueryParameters: x.ignoreQueryParameters,
+      regexOnFullURL: x.regexOnFullURL,
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,

@@ -398,6 +398,7 @@ export function crawlToCrawler(
       sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+    regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
   });
   if (sc.robots !== undefined) {

@@ -28,6 +28,7 @@ export class WebCrawler {
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
+  private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
@@ -45,6 +46,7 @@
     allowExternalContentLinks = false,
     allowSubdomains = false,
     ignoreRobotsTxt = false,
+    regexOnFullURL = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -59,6 +61,7 @@
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
+    regexOnFullURL?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -76,6 +79,7 @@
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
+    this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
   }
@@ -115,11 +119,13 @@
       return false;
     }
+    const excincPath = this.regexOnFullURL ? link : path;
     // Check if the link should be excluded
     if (this.excludes.length > 0 && this.excludes[0] !== "") {
       if (
         this.excludes.some((excludePattern) =>
-          new RegExp(excludePattern).test(path),
+          new RegExp(excludePattern).test(excincPath),
         )
       ) {
         if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
@@ -133,7 +139,7 @@
     if (this.includes.length > 0 && this.includes[0] !== "") {
       if (
         !this.includes.some((includePattern) =>
-          new RegExp(includePattern).test(path),
+          new RegExp(includePattern).test(excincPath),
         )
       ) {
         if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {

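To summarize the crawler change above: include and exclude patterns are now tested against either the URL path (default) or the full URL when regexOnFullURL is set. A minimal standalone sketch of that decision, with isAllowed as an illustrative name rather than the crawler's actual method:

// Sketch only: the patterns match the full URL when regexOnFullURL is true,
// otherwise just the pathname, mirroring the excincPath logic in the diff.
function isAllowed(
  link: string,
  includes: string[],
  excludes: string[],
  regexOnFullURL: boolean,
): boolean {
  const path = new URL(link).pathname;
  const subject = regexOnFullURL ? link : path;

  if (excludes.length > 0 && excludes[0] !== "" &&
      excludes.some((pattern) => new RegExp(pattern).test(subject))) {
    return false;
  }
  if (includes.length > 0 && includes[0] !== "" &&
      !includes.some((pattern) => new RegExp(pattern).test(subject))) {
    return false;
  }
  return true;
}

// "^/pricing$" matches https://firecrawl.dev/pricing by default, while
// "^https://(www\\.)?firecrawl\\.dev/pricing$" only matches with regexOnFullURL: true.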

@@ -1112,6 +1112,11 @@ async function processJob(job: Job & { id: string }, token: string) {
           // });
         }
       }
+      // Only run check after adding new jobs for discovery - mogery
+      if (job.data.isCrawlSourceScrape && crawler.filterLinks([doc.metadata.url ?? doc.metadata.sourceURL!], 1, sc.crawlerOptions?.maxDepth ?? 10).length === 0) {
+        throw new Error("Source URL is not allowed by includePaths/excludePaths rules")
+      }
     }
   }

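The guard above runs only after the discovered links have been enqueued, so an initial scrape that falls outside includePaths/excludePaths still seeds discovery but is not kept as a result. A hedged restatement of the same check in isolation (assertSourceDocumentAllowed is a made-up name):

// Sketch under assumed names: the source page is always scraped so its links can
// seed discovery, but its own document is kept only if it passes the same
// include/exclude filtering as any other link.
function assertSourceDocumentAllowed(
  isCrawlSourceScrape: boolean,
  sourceUrl: string,
  filterLinks: (links: string[], limit: number, maxDepth: number) => string[],
  maxDepth: number = 10,
): void {
  if (isCrawlSourceScrape && filterLinks([sourceUrl], 1, maxDepth).length === 0) {
    throw new Error("Source URL is not allowed by includePaths/excludePaths rules");
  }
}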

@@ -173,6 +173,7 @@ export interface CrawlParams {
   };
   deduplicateSimilarURLs?: boolean;
   ignoreQueryParameters?: boolean;
+  regexOnFullURL?: boolean;
 }
 /**
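For SDK users, a hedged usage sketch of the new CrawlParams field, assuming the JS SDK's FirecrawlApp and crawlUrl entry points; only the regexOnFullURL option is introduced by this change:

// Usage sketch; FirecrawlApp / crawlUrl are assumed SDK entry points.
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

// Match includePaths against the full URL rather than just the path.
const crawlResult = await app.crawlUrl("https://firecrawl.dev/pricing", {
  includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
  regexOnFullURL: true,
  limit: 10,
});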