Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-04-18 12:09:42 +08:00)
feat(crawl): add maxDiscoveryDepth (#1329)
This commit is contained in:
parent: d855f5a567
commit: 7cf2e52fe6
@@ -53,4 +53,21 @@ describe("Crawl tests", () => {
       }
     }
   }, 120000);
+
+  it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      ignoreSitemap: true,
+      maxDiscoveryDepth: 1,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/);
+      }
+    }
+  }, 120000);
 });
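The new test drives the feature end to end: with maxDiscoveryDepth: 1 only the start URL and the pages it links to directly are crawled, so deeper blog posts must not appear in the results. Outside the test helper, the same option can be sent straight to the crawl endpoint. A rough sketch follows; the endpoint URL, auth header, and response fields are assumptions based on the public v1 API, not taken from this diff.

// Hypothetical caller: start a depth-limited crawl over HTTP.
// Endpoint shape and response fields are assumptions, not part of this commit.
async function startShallowCrawl(apiKey: string): Promise<string> {
  const res = await fetch("https://api.firecrawl.dev/v1/crawl", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      url: "https://firecrawl.dev",
      ignoreSitemap: true,
      maxDiscoveryDepth: 1, // crawl the start page and its direct links, nothing deeper
      limit: 10,
    }),
  });
  const body = await res.json();
  if (!body.success) throw new Error(`crawl did not start: ${JSON.stringify(body)}`);
  return body.id; // crawl job id to poll for the finished pages
}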
@@ -440,6 +440,7 @@ const crawlerOptions = z
     includePaths: z.string().array().default([]),
     excludePaths: z.string().array().default([]),
     maxDepth: z.number().default(10), // default?
+    maxDiscoveryDepth: z.number().optional(),
     limit: z.number().default(10000), // default?
     allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
     allowExternalLinks: z.boolean().default(false),
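Unlike maxDepth and limit, the new field has no default, so it parses to undefined when the caller omits it and the crawler treats that as "no cap on discovery depth". A minimal sketch of that parsing behaviour, reduced to the three relevant keys (the real schema has many more):

import { z } from "zod";

// Trimmed-down model of the options schema above, for illustration only.
const crawlerOptionsSlice = z.object({
  maxDepth: z.number().default(10),
  maxDiscoveryDepth: z.number().optional(),
  limit: z.number().default(10000),
});

const parsed = crawlerOptionsSlice.parse({});
// parsed.maxDepth === 10, parsed.limit === 10000
// parsed.maxDiscoveryDepth === undefined -> discovery depth is unlimited

const capped = crawlerOptionsSlice.parse({ maxDiscoveryDepth: 1 });
// capped.maxDiscoveryDepth === 1 -> links found on depth-1 pages are dropped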
@@ -793,6 +794,8 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
     regexOnFullURL: x.regexOnFullURL,
+    maxDiscoveryDepth: x.maxDiscoveryDepth,
+    currentDiscoveryDepth: 0,
   };
 }

@@ -814,7 +817,8 @@ export function fromLegacyCrawlerOptions(x: any): {
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
       ignoreQueryParameters: x.ignoreQueryParameters,
       regexOnFullURL: x.regexOnFullURL,
+      maxDiscoveryDepth: x.maxDiscoveryDepth,
     }),
     internalOptions: {
       v0CrawlOnlyUrls: x.returnOnlyUrls,
     },
@@ -379,6 +379,7 @@ export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
   newBase?: string,
+  crawlerOptions?: any,
 ): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
@@ -399,6 +400,8 @@ export function crawlToCrawler(
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
     regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
+    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
+    currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,
   });

   if (sc.robots !== undefined) {
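crawlToCrawler now merges two sources of truth: the stored crawl (sc) carries the crawl-wide cap chosen at kickoff, while the per-job crawlerOptions carry how deep the page currently being processed was discovered. A standalone model of that split, with assumed type and function names (not the repo's own types):

// Standalone model of the two inputs, mirroring the defaults above.
type StoredCrawlLike = { crawlerOptions?: { maxDiscoveryDepth?: number } };
type JobCrawlerOptionsLike = { currentDiscoveryDepth?: number };

function resolveDepthSettings(sc: StoredCrawlLike, jobOpts?: JobCrawlerOptionsLike) {
  return {
    // crawl-wide cap, chosen once when the crawl is created
    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
    // how deep the page currently being processed was discovered
    currentDiscoveryDepth: jobOpts?.currentDiscoveryDepth ?? 0,
  };
}

// e.g. a page one hop from the start URL of a crawl capped at depth 2:
resolveDepthSettings({ crawlerOptions: { maxDiscoveryDepth: 2 } }, { currentDiscoveryDepth: 1 });
// -> { maxDiscoveryDepth: 2, currentDiscoveryDepth: 1 }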
@@ -31,6 +31,8 @@ export class WebCrawler {
   private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
+  private maxDiscoveryDepth: number | undefined;
+  private currentDiscoveryDepth: number;

   constructor({
     jobId,
@@ -47,6 +49,8 @@
     allowSubdomains = false,
     ignoreRobotsTxt = false,
     regexOnFullURL = false,
+    maxDiscoveryDepth,
+    currentDiscoveryDepth,
   }: {
     jobId: string;
     initialUrl: string;
@@ -62,6 +66,8 @@
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
     regexOnFullURL?: boolean;
+    maxDiscoveryDepth?: number;
+    currentDiscoveryDepth?: number;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -81,6 +87,8 @@
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
     this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
+    this.maxDiscoveryDepth = maxDiscoveryDepth;
+    this.currentDiscoveryDepth = currentDiscoveryDepth ?? 0;
   }

   public filterLinks(
@@ -89,6 +97,11 @@
     maxDepth: number,
     fromMap: boolean = false,
   ): string[] {
+    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) {
+      this.logger.debug("Max discovery depth hit, filtering off all links", { currentDiscoveryDepth: this.currentDiscoveryDepth, maxDiscoveryDepth: this.maxDiscoveryDepth });
+      return [];
+    }
+
     // If the initial URL is a sitemap.xml, skip filtering
     if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
       return sitemapLinks.slice(0, limit);
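The gate sits at the top of filterLinks: once the depth the current page was discovered at equals the cap, every outgoing link is dropped, so a crawl with maxDiscoveryDepth: 1 still scrapes the pages found on the start URL (depth 1) but never follows their links. A standalone sketch of the same comparison, using assumed names, shows the off-by-one semantics:

// Standalone model of the depth gate (not the repo's own code).
function discoverableLinks(
  links: string[],
  currentDiscoveryDepth: number,
  maxDiscoveryDepth?: number,
): string[] {
  // An undefined cap never triggers the gate, matching the optional schema field.
  if (maxDiscoveryDepth !== undefined && currentDiscoveryDepth === maxDiscoveryDepth) {
    return []; // this page may still be scraped, but its links are not followed
  }
  return links;
}

// With a cap of 1:
discoverableLinks(["https://firecrawl.dev/pricing"], 0, 1);   // start URL: links kept
discoverableLinks(["https://firecrawl.dev/blog/post"], 1, 1); // depth-1 page: links dropped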
@@ -1044,6 +1044,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         job.data.crawl_id,
         sc,
         doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
+        job.data.crawlerOptions,
       );

       const links = crawler.filterLinks(
@@ -1088,6 +1089,10 @@
           team_id: sc.team_id,
           scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
           internalOptions: sc.internalOptions,
+          crawlerOptions: {
+            ...sc.crawlerOptions,
+            currentDiscoveryDepth: (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
+          },
           plan: job.data.plan,
           origin: job.data.origin,
           crawl_id: job.data.crawl_id,
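Each time a crawled page enqueues its links, the child jobs are stamped with the parent's depth plus one, and the gate in filterLinks stops the recursion once that stamp reaches the cap. A small standalone simulation under those assumptions (not repo code) makes the progression concrete:

// Simulation of the depth bookkeeping across crawl generations.
function simulateDiscovery(maxDiscoveryDepth: number, generations: number): void {
  let currentDiscoveryDepth = 0; // seeded to 0 for the kickoff job, as in toLegacyCrawlerOptions
  for (let i = 0; i < generations; i++) {
    const blocked = currentDiscoveryDepth === maxDiscoveryDepth;
    console.log(`depth ${currentDiscoveryDepth}: links ${blocked ? "dropped" : "followed"}`);
    if (blocked) return;
    // children enqueued by this generation carry parent depth + 1
    currentDiscoveryDepth = currentDiscoveryDepth + 1;
  }
}

simulateDiscovery(1, 5);
// depth 0: links followed
// depth 1: links dropped   -> the crawl contains the start URL plus its direct links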
@@ -160,6 +160,7 @@ export interface CrawlParams {
   includePaths?: string[];
   excludePaths?: string[];
   maxDepth?: number;
+  maxDiscoveryDepth?: number;
   limit?: number;
   allowBackwardLinks?: boolean;
   allowExternalLinks?: boolean;
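On the SDK side the field is purely additive, so callers pass it alongside the existing options. A usage sketch with the JS SDK; the client class and method names come from the published @mendable/firecrawl-js package and are assumptions as far as this diff is concerned:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

// Crawl the start page and the pages it links to directly, but go no deeper.
const result = await app.crawlUrl("https://firecrawl.dev", {
  maxDiscoveryDepth: 1,
  limit: 10,
});

if (result.success) {
  console.log(`crawled ${result.data.length} pages`);
}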