feat(crawl): includes/excludes fixes (FIR-1300) (#1303)
* feat(crawl): includes/excludes fixes pt. 1
* fix(snips): billing tests
* drop tha logs
* fix(ci): add replica url
* feat(crawl): drop initial scrape if it's not included
* feat(ci): more verbose logging
* fix crawl path in test
* fix(ci): wait for api
* fix(snips/scrape/ad): test for more pixels
* feat(js-sdk/crawl): add regexOnFullURL
This commit is contained in:
parent f8df18ed6a
commit e1cfe1da48
.github/workflows/test-server-self-host.yml (vendored, 1 change)
@@ -15,6 +15,7 @@ env:
   ENV: ${{ secrets.ENV }}
   TEST_SUITE_SELF_HOSTED: true
   USE_GO_MARKDOWN_PARSER: true
+  FIRECRAWL_DEBUG_FILTER_LINKS: true
 
 jobs:
   test:
.github/workflows/test-server.yml (vendored, 9 changes)
@@ -21,6 +21,7 @@ env:
   SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
   SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
   SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
+  SUPABASE_REPLICA_URL: ${{ secrets.SUPABASE_REPLICA_URL }}
   TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: true
@@ -72,18 +73,20 @@ jobs:
           chmod +x html-to-markdown.so
         working-directory: ./apps/api/sharedLibs/go-html-to-md
       - name: Start the application
-        run: npm start &
+        run: npm start > api.log 2>&1 &
         working-directory: ./apps/api
         id: start_app
       - name: Start worker
-        run: npm run workers &
+        run: npm run workers > worker.log 2>&1 &
        working-directory: ./apps/api
         id: start_workers
       - name: Start index worker
         run: npm run index-worker &
         working-directory: ./apps/api
         id: start_index_worker
+      - name: Wait for API
+        run: pnpx wait-on tcp:3002 -t 15s
       - name: Run snippet tests
         run: |
           npm run test:snips
-        working-directory: ./apps/api
+        working-directory: ./apps/api
@@ -125,13 +125,18 @@ describe("Billing tests", () => {
         })
       ]);
 
+      expect(crawl1.success).toBe(true);
+      expect(crawl2.success).toBe(true);
+
       // sum: x+5y credits
 
       await sleepForBatchBilling();
 
       const rc2 = (await creditUsage()).remaining_credits;
 
-      expect(rc1 - rc2).toBe(crawl1.body.completed + crawl2.body.completed * 5);
+      if (crawl1.success && crawl2.success) {
+        expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
+      }
    }, 300000);

    it("bills map correctly", async () => {
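The "x + 5y" comment is the billing model under test: if crawl1 completes x pages and crawl2 completes y pages, the batch should debit x + 5y credits, i.e. crawl2's pages bill at five times the base per-page rate (the reason for the 5x multiplier is not shown in this diff). A minimal sketch of that arithmetic, with hypothetical page counts:

// Expected credit spend implied by the assertion above: x + 5y.
// The 1x/5x per-page rates are read off the test, not a billing table.
function expectedSpend(crawl1Completed: number, crawl2Completed: number): number {
  return crawl1Completed + crawl2Completed * 5;
}

expectedSpend(3, 2); // => 13, so rc1 - rc2 should be 13 credits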
@@ -7,4 +7,50 @@ describe("Crawl tests", () => {
       limit: 10,
     });
   }, 120000);
+
+  it.concurrent("filters URLs properly", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^/pricing$"],
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+
+  it.concurrent("filters URLs properly when using regexOnFullURL", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
+      regexOnFullURL: true,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+
+  it.concurrent("discovers URLs properly when origin is not included", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      includePaths: ["^/blog"],
+      ignoreSitemap: true,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/);
+      }
+    }
+  }, 120000);
 });
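The second test above is the point of the new flag: by default, includePaths/excludePaths regexes are tested against only the URL path, so a pattern anchored to the scheme and host can never match; with regexOnFullURL: true they are tested against the full URL. A small illustrative sketch (matchTarget is my own helper, not Firecrawl code):

// How the regex match target changes with regexOnFullURL.
function matchTarget(link: string, regexOnFullURL: boolean): string {
  return regexOnFullURL ? link : new URL(link).pathname;
}

const link = "https://firecrawl.dev/pricing";

// Path-relative pattern: works against the default (path-only) target.
new RegExp("^/pricing$").test(matchTarget(link, false)); // true

// Host-anchored pattern: only matches once regexOnFullURL is on.
const full = "^https://(www\\.)?firecrawl\\.dev/pricing$";
new RegExp(full).test(matchTarget(link, false)); // false ("/pricing" alone)
new RegExp(full).test(matchTarget(link, true));  // true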
@@ -1,7 +1,7 @@
 import { configDotenv } from "dotenv";
 configDotenv();
 
-import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput } from "../../controllers/v1/types";
+import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse } from "../../controllers/v1/types";
 import request from "supertest";
 
 // =========================================
@@ -69,7 +69,7 @@ function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>)
   expect(response.body.data.length).toBeGreaterThan(0);
 }
 
-export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
+export async function crawl(body: CrawlRequestInput): Promise<CrawlStatusResponse> {
   const cs = await crawlStart(body);
   expectCrawlStartToSucceed(cs);
 
@@ -82,7 +82,7 @@ export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
   } while (x.body.status === "scraping");
 
   expectCrawlToSucceed(x);
-  return x;
+  return x.body;
 }
 
 // =========================================
@@ -81,7 +81,7 @@ describe("Scrape tests", () => {
         blockAds: false,
       });
 
-      expect(response.markdown).toContain(".g.doubleclick.net/");
+      expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
     }, 30000);
   });
@@ -448,6 +448,7 @@ const crawlerOptions = z
     ignoreSitemap: z.boolean().default(false),
     deduplicateSimilarURLs: z.boolean().default(true),
     ignoreQueryParameters: z.boolean().default(false),
+    regexOnFullURL: z.boolean().default(false),
   })
   .strict(strictMessage);
 
@@ -791,6 +792,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
+    regexOnFullURL: x.regexOnFullURL,
   };
 }
 
@@ -811,6 +813,7 @@ export function fromLegacyCrawlerOptions(x: any): {
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
+    regexOnFullURL: x.regexOnFullURL,
   }),
   internalOptions: {
     v0CrawlOnlyUrls: x.returnOnlyUrls,
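Because crawlerOptions is a zod schema with regexOnFullURL defaulting to false, the flag is opt-in per request and is threaded through both legacy option converters above. A sketch of a v1 crawl request using it; the endpoint URL and auth header are assumptions based on the public API shape, not part of this diff:

// Hypothetical v1 crawl request opting into full-URL regex matching.
const res = await fetch("https://api.firecrawl.dev/v1/crawl", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR_API_KEY", // placeholder key
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev/pricing",
    includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
    regexOnFullURL: true, // defaults to false per the schema above
    limit: 10,
  }),
});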
@@ -398,6 +398,7 @@ export function crawlToCrawler(
       sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+    regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
   });
 
   if (sc.robots !== undefined) {
@@ -28,6 +28,7 @@ export class WebCrawler {
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
+  private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
 
@@ -45,6 +46,7 @@ export class WebCrawler {
     allowExternalContentLinks = false,
     allowSubdomains = false,
     ignoreRobotsTxt = false,
+    regexOnFullURL = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -59,6 +61,7 @@ export class WebCrawler {
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
+    regexOnFullURL?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -76,6 +79,7 @@ export class WebCrawler {
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
+    this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
   }
 
@@ -115,11 +119,13 @@ export class WebCrawler {
       return false;
     }
 
+    const excincPath = this.regexOnFullURL ? link : path;
+
     // Check if the link should be excluded
     if (this.excludes.length > 0 && this.excludes[0] !== "") {
       if (
         this.excludes.some((excludePattern) =>
-          new RegExp(excludePattern).test(path),
+          new RegExp(excludePattern).test(excincPath),
         )
       ) {
         if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
@@ -133,7 +139,7 @@ export class WebCrawler {
     if (this.includes.length > 0 && this.includes[0] !== "") {
       if (
         !this.includes.some((includePattern) =>
-          new RegExp(includePattern).test(path),
+          new RegExp(includePattern).test(excincPath),
         )
       ) {
         if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
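The excincPath ("exclude/include path") variable is the core of the fix: it is the single string both the exclude and include regexes are tested against, switching from the bare path to the full link when regexOnFullURL is set. A condensed sketch of the resulting filter logic (simplified from WebCrawler above, not the complete filterLinks implementation):

// Condensed include/exclude check mirroring the diff above.
function passesFilters(
  link: string,   // absolute URL of the candidate link
  path: string,   // its pathname, e.g. "/pricing"
  includes: string[],
  excludes: string[],
  regexOnFullURL: boolean,
): boolean {
  const excincPath = regexOnFullURL ? link : path;

  // Excluded if any exclude pattern matches.
  if (excludes.length > 0 && excludes[0] !== "" &&
      excludes.some((p) => new RegExp(p).test(excincPath))) {
    return false;
  }
  // Dropped if include patterns exist and none match.
  if (includes.length > 0 && includes[0] !== "" &&
      !includes.some((p) => new RegExp(p).test(excincPath))) {
    return false;
  }
  return true;
}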
@@ -1112,6 +1112,11 @@ async function processJob(job: Job & { id: string }, token: string) {
         // });
       }
     }
 
+    // Only run check after adding new jobs for discovery - mogery
+    if (job.data.isCrawlSourceScrape && crawler.filterLinks([doc.metadata.url ?? doc.metadata.sourceURL!], 1, sc.crawlerOptions?.maxDepth ?? 10).length === 0) {
+      throw new Error("Source URL is not allowed by includePaths/excludePaths rules")
+    }
   }
 }
@@ -173,6 +173,7 @@ export interface CrawlParams {
   };
   deduplicateSimilarURLs?: boolean;
   ignoreQueryParameters?: boolean;
+  regexOnFullURL?: boolean;
 }
 
 /**
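With regexOnFullURL now part of CrawlParams, JS SDK callers can set the flag directly. A minimal usage sketch; the FirecrawlApp/crawlUrl call shape is assumed from the SDK's public interface rather than shown in this diff:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" }); // placeholder key

// Host-anchored include pattern, matched against the full URL.
const result = await app.crawlUrl("https://firecrawl.dev/pricing", {
  includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
  regexOnFullURL: true, // new in this commit; defaults to false
  limit: 10,
});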