feat(crawl): includes/excludes fixes (FIR-1300) (#1303)
* feat(crawl): includes/excludes fixes pt. 1
* fix(snips): billing tests
* drop the logs
* fix(ci): add replica url
* feat(crawl): drop initial scrape if it's not included
* feat(ci): more verbose logging
* fix crawl path in test
* fix(ci): wait for api
* fix(snips/scrape/ad): test for more pixels
* feat(js-sdk/crawl): add regexOnFullURL
parent f8df18ed6a
commit e1cfe1da48
.github/workflows/test-server-self-host.yml (1 line changed)

@@ -15,6 +15,7 @@ env:
   ENV: ${{ secrets.ENV }}
   TEST_SUITE_SELF_HOSTED: true
   USE_GO_MARKDOWN_PARSER: true
+  FIRECRAWL_DEBUG_FILTER_LINKS: true

 jobs:
   test:
.github/workflows/test-server.yml (7 lines changed)

@@ -21,6 +21,7 @@ env:
   SUPABASE_ANON_TOKEN: ${{ secrets.SUPABASE_ANON_TOKEN }}
   SUPABASE_SERVICE_TOKEN: ${{ secrets.SUPABASE_SERVICE_TOKEN }}
   SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
+  SUPABASE_REPLICA_URL: ${{ secrets.SUPABASE_REPLICA_URL }}
   TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
   FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
   USE_DB_AUTHENTICATION: true

@@ -72,17 +73,19 @@ jobs:
           chmod +x html-to-markdown.so
         working-directory: ./apps/api/sharedLibs/go-html-to-md
       - name: Start the application
-        run: npm start &
+        run: npm start > api.log 2>&1 &
         working-directory: ./apps/api
         id: start_app
       - name: Start worker
-        run: npm run workers &
+        run: npm run workers > worker.log 2>&1 &
         working-directory: ./apps/api
         id: start_workers
       - name: Start index worker
         run: npm run index-worker &
         working-directory: ./apps/api
         id: start_index_worker
+      - name: Wait for API
+        run: pnpx wait-on tcp:3002 -t 15s
       - name: Run snippet tests
         run: |
           npm run test:snips
@@ -125,13 +125,18 @@ describe("Billing tests", () => {
       })
     ]);

+    expect(crawl1.success).toBe(true);
+    expect(crawl2.success).toBe(true);
+
     // sum: x+5y credits

     await sleepForBatchBilling();

     const rc2 = (await creditUsage()).remaining_credits;

-    expect(rc1 - rc2).toBe(crawl1.body.completed + crawl2.body.completed * 5);
+    if (crawl1.success && crawl2.success) {
+      expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5);
+    }
   }, 300000);

   it("bills map correctly", async () => {
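The billing assertion now reads the typed helper result instead of a raw supertest response, and the arithmetic only runs once both crawls are known to have succeeded. A minimal sketch of why the guard matters, assuming the crawl result is a success-discriminated union (the shape below is a hypothetical stand-in for the real v1 types):

// Hypothetical stand-in for the v1 CrawlStatusResponse union.
type CrawlResult =
  | { success: false; error: string }
  | { success: true; completed: number; data: unknown[] };

declare const crawl1: CrawlResult;

// expect(crawl1.success).toBe(true) fails the test at runtime on error,
// but only an if-guard narrows the type for the compiler:
if (crawl1.success) {
  // Here crawl1.completed type-checks; outside the guard it would not.
  const credits = crawl1.completed * 5;
}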
@@ -7,4 +7,50 @@ describe("Crawl tests", () => {
       limit: 10,
     });
   }, 120000);
+
+  it.concurrent("filters URLs properly", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^/pricing$"],
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+
+  it.concurrent("filters URLs properly when using regexOnFullURL", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev/pricing",
+      includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
+      regexOnFullURL: true,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.completed).toBe(1);
+      expect(res.data[0].metadata.sourceURL).toBe("https://firecrawl.dev/pricing");
+    }
+  }, 120000);
+
+  it.concurrent("discovers URLs properly when origin is not included", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      includePaths: ["^/blog"],
+      ignoreSitemap: true,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog/);
+      }
+    }
+  }, 120000);
 });
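These tests pin down the two matching modes. By default, include/exclude patterns are tested against the URL's pathname, so the same page needs a full-URL pattern once regexOnFullURL is set. A quick sketch using only standard URL parsing:

const url = new URL("https://firecrawl.dev/pricing");

// Default mode: patterns see only the pathname.
new RegExp("^/pricing$").test(url.pathname); // true

// regexOnFullURL mode: patterns see the whole URL.
new RegExp("^https://(www\\.)?firecrawl\\.dev/pricing$").test(url.href); // true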
@@ -1,7 +1,7 @@
 import { configDotenv } from "dotenv";
 configDotenv();

-import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput } from "../../controllers/v1/types";
+import { ScrapeRequestInput, Document, ExtractRequestInput, ExtractResponse, CrawlRequestInput, MapRequestInput, BatchScrapeRequestInput, SearchRequestInput, CrawlStatusResponse } from "../../controllers/v1/types";
 import request from "supertest";

 // =========================================

@@ -69,7 +69,7 @@ function expectCrawlToSucceed(response: Awaited<ReturnType<typeof crawlStatus>>)
   expect(response.body.data.length).toBeGreaterThan(0);
 }

-export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlStatus> {
+export async function crawl(body: CrawlRequestInput): Promise<CrawlStatusResponse> {
   const cs = await crawlStart(body);
   expectCrawlStartToSucceed(cs);

@@ -82,7 +82,7 @@ export async function crawl(body: CrawlRequestInput): ReturnType<typeof crawlSta
   } while (x.body.status === "scraping");

   expectCrawlToSucceed(x);
-  return x;
+  return x.body;
 }

 // =========================================
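With crawl() now resolving to the status body itself, typed as CrawlStatusResponse, call sites drop the .body indirection. A sketch of how the updated helper reads in a test:

// Before: crawl() resolved to a supertest response, so tests reached into .body.
// const status = (await crawl({ url: "https://firecrawl.dev", limit: 10 })).body;
// After: the helper returns the body directly, already typed.
const status = await crawl({ url: "https://firecrawl.dev", limit: 10 });
if (status.success) console.log(status.completed);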
@@ -81,7 +81,7 @@ describe("Scrape tests", () => {
       blockAds: false,
     });

-    expect(response.markdown).toContain(".g.doubleclick.net/");
+    expect(response.markdown).toMatch(/(\.g\.doubleclick\.net|amazon-adsystem\.com)\//);
   }, 30000);
 });
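The ad-pixel assertion is loosened from a single substring to an alternation, so the test passes whichever ad network serves the pixel. Verifying the regex against both hosts (hypothetical URLs):

const adHost = /(\.g\.doubleclick\.net|amazon-adsystem\.com)\//;
adHost.test("https://x.g.doubleclick.net/pixel"); // true
adHost.test("https://c.amazon-adsystem.com/ad");  // true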
@@ -448,6 +448,7 @@ const crawlerOptions = z
     ignoreSitemap: z.boolean().default(false),
     deduplicateSimilarURLs: z.boolean().default(true),
     ignoreQueryParameters: z.boolean().default(false),
+    regexOnFullURL: z.boolean().default(false),
   })
   .strict(strictMessage);

@@ -791,6 +792,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
+    regexOnFullURL: x.regexOnFullURL,
   };
 }

@@ -811,6 +813,7 @@ export function fromLegacyCrawlerOptions(x: any): {
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
+    regexOnFullURL: x.regexOnFullURL,
   }),
   internalOptions: {
     v0CrawlOnlyUrls: x.returnOnlyUrls,
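Because the schema defaults regexOnFullURL to false, existing crawl requests keep path-only matching unless they opt in. A minimal sketch of that default, assuming standard zod semantics:

import { z } from "zod";

const crawlerOptions = z
  .object({ regexOnFullURL: z.boolean().default(false) })
  .strict();

crawlerOptions.parse({});                       // { regexOnFullURL: false }
crawlerOptions.parse({ regexOnFullURL: true }); // { regexOnFullURL: true }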
@@ -398,6 +398,7 @@ export function crawlToCrawler(
       sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+    regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
   });

   if (sc.robots !== undefined) {
@@ -28,6 +28,7 @@ export class WebCrawler {
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
   private ignoreRobotsTxt: boolean;
+  private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();

@@ -45,6 +46,7 @@ export class WebCrawler {
     allowExternalContentLinks = false,
     allowSubdomains = false,
     ignoreRobotsTxt = false,
+    regexOnFullURL = false,
   }: {
     jobId: string;
     initialUrl: string;

@@ -59,6 +61,7 @@ export class WebCrawler {
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
+    regexOnFullURL?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;

@@ -76,6 +79,7 @@ export class WebCrawler {
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
+    this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
   }

@@ -115,11 +119,13 @@ export class WebCrawler {
       return false;
     }

+    const excincPath = this.regexOnFullURL ? link : path;
+
     // Check if the link should be excluded
     if (this.excludes.length > 0 && this.excludes[0] !== "") {
       if (
         this.excludes.some((excludePattern) =>
-          new RegExp(excludePattern).test(path),
+          new RegExp(excludePattern).test(excincPath),
         )
       ) {
         if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {

@@ -133,7 +139,7 @@ export class WebCrawler {
     if (this.includes.length > 0 && this.includes[0] !== "") {
       if (
         !this.includes.some((includePattern) =>
-          new RegExp(includePattern).test(path),
+          new RegExp(includePattern).test(excincPath),
         )
       ) {
         if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
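The two filterLinks hunks above carry the feature: a single variable decides what the patterns are matched against. A condensed sketch of that rule with hypothetical standalone inputs (the real method also applies depth limits, robots.txt, deduplication, and other checks):

function passesIncludeExclude(
  link: string,    // full URL, e.g. "https://firecrawl.dev/pricing"
  path: string,    // pathname only, e.g. "/pricing"
  includes: string[],
  excludes: string[],
  regexOnFullURL: boolean,
): boolean {
  // With regexOnFullURL, patterns run against the whole URL;
  // otherwise against the pathname, as before.
  const excincPath = regexOnFullURL ? link : path;

  if (excludes.length > 0 && excludes[0] !== "") {
    if (excludes.some((p) => new RegExp(p).test(excincPath))) return false;
  }
  if (includes.length > 0 && includes[0] !== "") {
    if (!includes.some((p) => new RegExp(p).test(excincPath))) return false;
  }
  return true;
}

// Path mode: "^/pricing$" matches the pathname.
passesIncludeExclude("https://firecrawl.dev/pricing", "/pricing", ["^/pricing$"], [], false); // true
// Full-URL mode: the same page matched by a full-URL pattern.
passesIncludeExclude("https://firecrawl.dev/pricing", "/pricing", ["^https://(www\\.)?firecrawl\\.dev/pricing$"], [], true); // true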
@@ -1112,6 +1112,11 @@ async function processJob(job: Job & { id: string }, token: string) {
         //   });
         }
       }
+
+      // Only run check after adding new jobs for discovery - mogery
+      if (job.data.isCrawlSourceScrape && crawler.filterLinks([doc.metadata.url ?? doc.metadata.sourceURL!], 1, sc.crawlerOptions?.maxDepth ?? 10).length === 0) {
+        throw new Error("Source URL is not allowed by includePaths/excludePaths rules")
+      }
     }
   }
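This is the "drop initial scrape if it's not included" change from the commit message: after discovery jobs have been queued, the crawl's own entry page is re-run through the same link filter, and the job is failed rather than returned when the rules exclude it. A hedged sketch of the guard's intent, with a hypothetical callback standing in for crawler.filterLinks:

function assertSourceAllowed(
  sourceURL: string,
  maxDepth: number,
  // Stand-in for crawler.filterLinks(links, limit, maxDepth).
  filterLinks: (links: string[], limit: number, maxDepth: number) => string[],
): void {
  // Re-check only the source URL, with a limit of 1; an empty result
  // means includePaths/excludePaths would have filtered it out.
  if (filterLinks([sourceURL], 1, maxDepth).length === 0) {
    throw new Error("Source URL is not allowed by includePaths/excludePaths rules");
  }
}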
@@ -173,6 +173,7 @@ export interface CrawlParams {
   };
   deduplicateSimilarURLs?: boolean;
   ignoreQueryParameters?: boolean;
+  regexOnFullURL?: boolean;
 }

 /**
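A usage sketch for the new option, assuming the JS SDK's FirecrawlApp/crawlUrl entry points; the API key and URLs are placeholders:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

// Match the full URL (scheme + host + path) instead of just the path.
const result = await app.crawlUrl("https://firecrawl.dev/pricing", {
  includePaths: ["^https://(www\\.)?firecrawl\\.dev/pricing$"],
  regexOnFullURL: true,
  limit: 10,
});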