feat(crawl): add maxDiscoveryDepth (#1329)
Parent: d855f5a567
Commit: 7cf2e52fe6
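As implemented below, maxDiscoveryDepth caps how many link-discovery hops the crawler follows from the start URL, which counts as depth 0. With maxDiscoveryDepth: 1, pages linked from the start page are still crawled, but links discovered on those pages are filtered off. This is distinct from the existing maxDepth option, which limits URL path depth rather than hop count.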
@@ -53,4 +53,21 @@ describe("Crawl tests", () => {
       }
     }
   }, 120000);
+
+  it.concurrent("discovers URLs properly when maxDiscoveryDepth is provided", async () => {
+    const res = await crawl({
+      url: "https://firecrawl.dev",
+      ignoreSitemap: true,
+      maxDiscoveryDepth: 1,
+      limit: 10,
+    });
+
+    expect(res.success).toBe(true);
+    if (res.success) {
+      expect(res.data.length).toBeGreaterThan(1);
+      for (const page of res.data) {
+        expect(page.metadata.url ?? page.metadata.sourceURL).not.toMatch(/^https:\/\/(www\.)?firecrawl\.dev\/blog\/.+$/);
+      }
+    }
+  }, 120000);
 });
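The assertion works because individual blog posts sit two hops from the homepage (home → /blog → /blog/<slug>): with maxDiscoveryDepth: 1, discovery stops after the pages linked directly from firecrawl.dev, so no /blog/... URL should appear in the results. This relies on the homepage not linking straight to individual posts.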
@@ -440,6 +440,7 @@ const crawlerOptions = z
     includePaths: z.string().array().default([]),
     excludePaths: z.string().array().default([]),
     maxDepth: z.number().default(10), // default?
+    maxDiscoveryDepth: z.number().optional(),
     limit: z.number().default(10000), // default?
     allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
     allowExternalLinks: z.boolean().default(false),
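A quick sketch of what the schema change means for request parsing. This is a reduced stand-in for the real crawlerOptions object, keeping only the fields shown above; the input is hypothetical:

import { z } from "zod";

// Reduced stand-in for the crawlerOptions schema above.
const crawlerOptionsSketch = z.object({
  maxDepth: z.number().default(10),
  maxDiscoveryDepth: z.number().optional(),
  limit: z.number().default(10000),
});

const parsed = crawlerOptionsSketch.parse({ maxDiscoveryDepth: 1 });
// parsed.maxDiscoveryDepth === 1; maxDepth and limit take their defaults.
// Omitting the field leaves it undefined, which disables the new limit.
console.log(parsed); // { maxDepth: 10, maxDiscoveryDepth: 1, limit: 10000 }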
@@ -793,6 +794,8 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
     regexOnFullURL: x.regexOnFullURL,
+    maxDiscoveryDepth: x.maxDiscoveryDepth,
+    currentDiscoveryDepth: 0,
   };
 }
 
@@ -814,6 +817,7 @@ export function fromLegacyCrawlerOptions(x: any): {
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
     regexOnFullURL: x.regexOnFullURL,
+    maxDiscoveryDepth: x.maxDiscoveryDepth,
   }),
   internalOptions: {
     v0CrawlOnlyUrls: x.returnOnlyUrls,
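Note the asymmetry between the two mappings: toLegacyCrawlerOptions seeds currentDiscoveryDepth: 0, establishing the initial crawl job as depth zero, while fromLegacyCrawlerOptions round-trips only the user-facing maxDiscoveryDepth.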
@@ -379,6 +379,7 @@ export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
   newBase?: string,
+  crawlerOptions?: any,
 ): WebCrawler {
   const crawler = new WebCrawler({
     jobId: id,
@@ -399,6 +400,8 @@ export function crawlToCrawler(
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
     ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
     regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
+    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
+    currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,
   });
 
   if (sc.robots !== undefined) {
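The new crawlerOptions parameter lets callers pass per-job state into the crawler: maxDiscoveryDepth always comes from the stored crawl's user-supplied options, while currentDiscoveryDepth comes from the specific job that triggered this step and falls back to 0 when no job options are supplied, as on the initial kickoff.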
@@ -31,6 +31,8 @@ export class WebCrawler {
   private regexOnFullURL: boolean;
   private logger: typeof _logger;
   private sitemapsHit: Set<string> = new Set();
+  private maxDiscoveryDepth: number | undefined;
+  private currentDiscoveryDepth: number;
 
   constructor({
     jobId,
@@ -47,6 +49,8 @@ export class WebCrawler {
     allowSubdomains = false,
     ignoreRobotsTxt = false,
     regexOnFullURL = false,
+    maxDiscoveryDepth,
+    currentDiscoveryDepth,
   }: {
     jobId: string;
     initialUrl: string;
@@ -62,6 +66,8 @@ export class WebCrawler {
     allowSubdomains?: boolean;
     ignoreRobotsTxt?: boolean;
     regexOnFullURL?: boolean;
+    maxDiscoveryDepth?: number;
+    currentDiscoveryDepth?: number;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -81,6 +87,8 @@ export class WebCrawler {
     this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
     this.regexOnFullURL = regexOnFullURL ?? false;
     this.logger = _logger.child({ crawlId: this.jobId, module: "WebCrawler" });
+    this.maxDiscoveryDepth = maxDiscoveryDepth;
+    this.currentDiscoveryDepth = currentDiscoveryDepth ?? 0;
   }
 
   public filterLinks(
@@ -89,6 +97,11 @@ export class WebCrawler {
     maxDepth: number,
     fromMap: boolean = false,
   ): string[] {
+    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) {
+      this.logger.debug("Max discovery depth hit, filtering off all links", { currentDiscoveryDepth: this.currentDiscoveryDepth, maxDiscoveryDepth: this.maxDiscoveryDepth });
+      return [];
+    }
+
     // If the initial URL is a sitemap.xml, skip filtering
     if (this.initialUrl.endsWith("sitemap.xml") && fromMap) {
       return sitemapLinks.slice(0, limit);
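Taken on its own, the new guard is a simple depth gate: a crawler instance constructed at the maximum discovery depth reports zero outgoing links, so nothing further is enqueued. A minimal self-contained sketch of that behavior; DepthGate is a hypothetical stand-in for WebCrawler, not the real class:

// Minimal model of the discovery-depth gate added to filterLinks.
class DepthGate {
  constructor(
    private currentDiscoveryDepth: number,
    private maxDiscoveryDepth?: number,
  ) {}

  filterLinks(links: string[]): string[] {
    // At the maximum depth, all discovered links are filtered off.
    if (this.currentDiscoveryDepth === this.maxDiscoveryDepth) return [];
    return links;
  }
}

const links = ["https://example.com/a", "https://example.com/b"];
console.log(new DepthGate(0, 1).filterLinks(links).length); // 2 — still discovering
console.log(new DepthGate(1, 1).filterLinks(links).length); // 0 — gate closed
// With maxDiscoveryDepth undefined the strict equality never holds,
// so discovery is unlimited, as before this change.
console.log(new DepthGate(5).filterLinks(links).length);    // 2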
@@ -1044,6 +1044,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         job.data.crawl_id,
         sc,
         doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
+        job.data.crawlerOptions,
       );
 
       const links = crawler.filterLinks(
@@ -1088,6 +1089,10 @@ async function processJob(job: Job & { id: string }, token: string) {
           team_id: sc.team_id,
           scrapeOptions: scrapeOptions.parse(sc.scrapeOptions),
           internalOptions: sc.internalOptions,
+          crawlerOptions: {
+            ...sc.crawlerOptions,
+            currentDiscoveryDepth: (job.data.crawlerOptions?.currentDiscoveryDepth ?? 0) + 1,
+          },
           plan: job.data.plan,
           origin: job.data.origin,
           crawl_id: job.data.crawl_id,
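Each child job carries its parent's depth plus one, so currentDiscoveryDepth tracks BFS levels from the start URL. A simplified model of that propagation, with hypothetical names, not the actual queue worker:

// Simplified model of how currentDiscoveryDepth propagates through the queue.
type CrawlJob = { url: string; currentDiscoveryDepth: number };

function discoverChildren(
  parent: CrawlJob,
  links: string[],
  maxDiscoveryDepth?: number,
): CrawlJob[] {
  // Parents already at the cap discover nothing (the filterLinks gate)...
  if (parent.currentDiscoveryDepth === maxDiscoveryDepth) return [];
  // ...otherwise each child job records the parent's depth plus one.
  return links.map((url) => ({
    url,
    currentDiscoveryDepth: parent.currentDiscoveryDepth + 1,
  }));
}

const root: CrawlJob = { url: "https://firecrawl.dev", currentDiscoveryDepth: 0 };
const level1 = discoverChildren(root, ["https://firecrawl.dev/blog"], 1);
// level1[0].currentDiscoveryDepth === 1
// discoverChildren(level1[0], ["https://firecrawl.dev/blog/post"], 1) === []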
@@ -160,6 +160,7 @@ export interface CrawlParams {
   includePaths?: string[];
   excludePaths?: string[];
   maxDepth?: number;
+  maxDiscoveryDepth?: number;
   limit?: number;
   allowBackwardLinks?: boolean;
   allowExternalLinks?: boolean;
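With the field on CrawlParams, SDK callers can pass it straight through. A sketch assuming the @mendable/firecrawl-js v1 client and a reachable API; the API key is a placeholder:

import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function main() {
  // Crawl the start page plus pages one discovery hop away, capped at 10 pages.
  const res = await app.crawlUrl("https://firecrawl.dev", {
    maxDiscoveryDepth: 1,
    limit: 10,
  });
  if (res.success) {
    console.log(`crawled ${res.data.length} pages`);
  }
}

main();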