Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-13 23:35:52 +08:00
ignoreRobotsTxt parameter

This commit is contained in:
parent 2988a56ee5
commit 5ddb7eb922
@@ -231,6 +231,7 @@ const crawlerOptions = z.object({
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
   allowSubdomains: z.boolean().default(false),
+  ignoreRobotsTxt: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
   deduplicateSimilarURLs: z.boolean().default(true),
   ignoreQueryParameters: z.boolean().default(false),
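The new flag lands in the v1 crawlerOptions zod schema with .default(false), so it is strictly opt-in: requests that omit it keep the old, robots-compliant behavior. A minimal sketch of that parsing behavior, with the schema cut down to two of the fields above:

    import { z } from "zod";

    // Abbreviated sketch of the crawlerOptions schema; only two fields kept.
    const crawlerOptions = z.object({
      ignoreRobotsTxt: z.boolean().default(false),
      ignoreSitemap: z.boolean().default(true),
    });

    // Omitted keys fall back to their declared defaults, so existing API
    // clients that never send ignoreRobotsTxt are unaffected.
    console.log(crawlerOptions.parse({}));
    // => { ignoreRobotsTxt: false, ignoreSitemap: true }
    console.log(crawlerOptions.parse({ ignoreRobotsTxt: true }));
    // => { ignoreRobotsTxt: true, ignoreSitemap: true }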
@@ -504,6 +505,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     allowSubdomains: x.allowSubdomains,
+    ignoreRobotsTxt: x.ignoreRobotsTxt,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
@@ -520,6 +522,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
     allowBackwardLinks: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     allowSubdomains: x.allowSubdomains,
+    ignoreRobotsTxt: x.ignoreRobotsTxt,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
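These two adapters bridge a naming mismatch: the v1 API says allowBackwardLinks and allowExternalLinks where the legacy crawler says allowBackwardCrawling and allowExternalContentLinks, while ignoreRobotsTxt (like allowSubdomains and the rest) keeps one name on both sides. A reduced sketch of the round trip, with the types trimmed to two fields:

    // Trimmed shapes; the real types carry all the fields in the hunks above.
    type CrawlerOptions = { allowBackwardLinks: boolean; ignoreRobotsTxt: boolean };
    type LegacyOptions = { allowBackwardCrawling: boolean; ignoreRobotsTxt: boolean };

    const toLegacy = (x: CrawlerOptions): LegacyOptions => ({
      allowBackwardCrawling: x.allowBackwardLinks, // renamed across the boundary
      ignoreRobotsTxt: x.ignoreRobotsTxt,          // passed through unchanged
    });

    const fromLegacy = (x: LegacyOptions): CrawlerOptions => ({
      allowBackwardLinks: x.allowBackwardCrawling,
      ignoreRobotsTxt: x.ignoreRobotsTxt,
    });

    // The mapping is lossless in both directions for these fields:
    console.log(fromLegacy(toLegacy({ allowBackwardLinks: true, ignoreRobotsTxt: true })));
    // => { allowBackwardLinks: true, ignoreRobotsTxt: true }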
@@ -181,6 +181,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
     allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
     allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
+    ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
   });
 
   if (sc.robots !== undefined) {
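The ?? false matters for crawls persisted before this commit: their stored crawlerOptions have no ignoreRobotsTxt key (and the optional chaining also covers a missing options object entirely), so old jobs resolve to the safe default. A small illustration:

    // Hypothetical shape of a crawl stored before ignoreRobotsTxt existed.
    const oldStored: { crawlerOptions?: { ignoreRobotsTxt?: boolean } } = {
      crawlerOptions: {},
    };

    // Same expression shape as the hunk above: undefined collapses to false.
    console.log(oldStored.crawlerOptions?.ignoreRobotsTxt ?? false); // false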
@@ -24,6 +24,7 @@ export class WebCrawler {
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
+  private ignoreRobotsTxt: boolean;
 
   constructor({
     jobId,
@@ -38,6 +39,7 @@ export class WebCrawler {
     allowBackwardCrawling = false,
     allowExternalContentLinks = false,
     allowSubdomains = false,
+    ignoreRobotsTxt = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -51,6 +53,7 @@ export class WebCrawler {
     allowBackwardCrawling?: boolean;
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
+    ignoreRobotsTxt?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -67,6 +70,7 @@ export class WebCrawler {
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
+    this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
   }
 
   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
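Taken together, the four WebCrawler hunks are the standard plumbing for one new option on a class built around a destructured options object: a private field, a destructured parameter with a default, its type annotation, and the constructor assignment. A stripped-down sketch of the pattern, with every unrelated member omitted:

    // Minimal sketch of the WebCrawler options pattern; one flag only.
    class CrawlerSketch {
      private ignoreRobotsTxt: boolean;

      constructor({
        ignoreRobotsTxt = false, // destructuring default covers an omitted key
      }: {
        ignoreRobotsTxt?: boolean;
      }) {
        // The extra `?? false` in the real constructor is belt and braces:
        // the destructuring default already handles undefined, so this only
        // fires if null is passed explicitly.
        this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
      }

      get ignoresRobots(): boolean {
        return this.ignoreRobotsTxt;
      }
    }

    console.log(new CrawlerSketch({}).ignoresRobots);                        // false
    console.log(new CrawlerSketch({ ignoreRobotsTxt: true }).ignoresRobots); // true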
@@ -137,7 +141,7 @@ export class WebCrawler {
         }
       }
 
-      const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
+      const isAllowed = this.ignoreRobotsTxt ? true : (this.robots.isAllowed(link, "FireCrawlAgent") ?? true);
       // Check if the link is disallowed by robots.txt
       if (!isAllowed) {
         logger.debug(`Link disallowed by robots.txt: ${link}`);
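The changed line is a guard ternary: when the flag is set, the robots verdict is never consulted; otherwise the old behavior is preserved, including the ?? true fallback that treats links with no applicable robots rule as allowed. A minimal sketch of the same shape, with a hypothetical stand-in for the robots-parser call:

    // Hypothetical stand-in for this.robots.isAllowed(link, "FireCrawlAgent"):
    // undefined models "no rule matched this path".
    const robotsVerdict = (link: string): boolean | undefined =>
      link.includes("/private/") ? false : undefined;

    const isLinkAllowed = (link: string, ignoreRobotsTxt: boolean): boolean =>
      ignoreRobotsTxt ? true : (robotsVerdict(link) ?? true);

    console.log(isLinkAllowed("https://example.com/private/a", false)); // false
    console.log(isLinkAllowed("https://example.com/private/a", true));  // true — flag wins
    console.log(isLinkAllowed("https://example.com/blog", false));      // true — no rule, allowed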
@@ -202,7 +206,7 @@ export class WebCrawler {
       if (this.isInternalLink(fullUrl) &&
         this.noSections(fullUrl) &&
         !this.matchesExcludes(path) &&
-        this.isRobotsAllowed(fullUrl)
+        this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
       ) {
         return fullUrl;
       }
@@ -255,8 +259,8 @@ export class WebCrawler {
     return links;
   }
 
-  private isRobotsAllowed(url: string): boolean {
-    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  private isRobotsAllowed(url: string, ignoreRobotsTxt: boolean = false): boolean {
+    return (ignoreRobotsTxt ? true : (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true))
   }
 
   private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
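After this change, isRobotsAllowed has three layers of fallback, all resolving to "allowed": the explicit bypass flag, the case where no robots.txt parser is loaded, and the case where the parser has no applicable rule. Note the flag arrives as a per-call parameter (passed as this.ignoreRobotsTxt at the call site above) rather than being read from the instance inside the helper. A truth-table sketch of the expression, with a faked parser (hypothetical, not the real robots-parser API):

    // Faked minimal parser; undefined means "no rule matched" (hypothetical).
    type FakeRobots = { isAllowed: (url: string, ua: string) => boolean | undefined };

    const isRobotsAllowed = (
      robots: FakeRobots | undefined,
      url: string,
      ignoreRobotsTxt: boolean = false,
    ): boolean =>
      ignoreRobotsTxt
        ? true
        : robots
          ? (robots.isAllowed(url, "FireCrawlAgent") ?? true)
          : true;

    const denyAll: FakeRobots = { isAllowed: () => false };

    console.log(isRobotsAllowed(denyAll, "https://example.com/", true));  // true  — flag wins
    console.log(isRobotsAllowed(denyAll, "https://example.com/", false)); // false — parser verdict
    console.log(isRobotsAllowed(undefined, "https://example.com/"));      // true  — no parser loaded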