mendableai/firecrawl
commit 5ddb7eb922 (parent 2988a56ee5): … parameter
@@ -231,6 +231,7 @@ const crawlerOptions = z.object({
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
   allowSubdomains: z.boolean().default(false),
+  ignoreRobotsTxt: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
   deduplicateSimilarURLs: z.boolean().default(true),
   ignoreQueryParameters: z.boolean().default(false),
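
This hunk introduces the option at the API boundary: a new ignoreRobotsTxt flag in the v1 crawler options schema, defaulting to false so existing requests keep enforcing robots.txt. A runnable sketch of the default behavior, reduced to the fields visible above (the real schema has more):

import { z } from "zod";

// Reduced to the fields shown in this hunk; the real crawlerOptions schema is larger.
const crawlerOptions = z.object({
  allowBackwardLinks: z.boolean().default(false),
  allowExternalLinks: z.boolean().default(false),
  allowSubdomains: z.boolean().default(false),
  ignoreRobotsTxt: z.boolean().default(false),
  ignoreSitemap: z.boolean().default(true),
  deduplicateSimilarURLs: z.boolean().default(true),
  ignoreQueryParameters: z.boolean().default(false),
});

// A request body that omits the flag parses with the safe default.
console.log(crawlerOptions.parse({}).ignoreRobotsTxt);                         // false
console.log(crawlerOptions.parse({ ignoreRobotsTxt: true }).ignoreRobotsTxt);  // true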
@@ -504,6 +505,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     allowSubdomains: x.allowSubdomains,
+    ignoreRobotsTxt: x.ignoreRobotsTxt,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
@@ -520,6 +522,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
     allowBackwardLinks: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     allowSubdomains: x.allowSubdomains,
+    ignoreRobotsTxt: x.ignoreRobotsTxt,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
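
In both converters the flag keeps its name, so it survives the toLegacyCrawlerOptions / fromLegacyCrawlerOptions round trip unchanged, unlike the renamed allowBackwardLinks / allowBackwardCrawling pair. A minimal sketch of that mapping, with the shapes abbreviated to two fields:

// Abbreviated shapes; the real types carry every option shown in the hunks above.
type CrawlerOptions = { allowBackwardLinks: boolean; ignoreRobotsTxt: boolean };
type LegacyCrawlerOptions = { allowBackwardCrawling: boolean; ignoreRobotsTxt: boolean };

function toLegacy(x: CrawlerOptions): LegacyCrawlerOptions {
  // allowBackwardLinks is renamed; ignoreRobotsTxt passes through as-is.
  return { allowBackwardCrawling: x.allowBackwardLinks, ignoreRobotsTxt: x.ignoreRobotsTxt };
}

function fromLegacy(x: LegacyCrawlerOptions): CrawlerOptions {
  return { allowBackwardLinks: x.allowBackwardCrawling, ignoreRobotsTxt: x.ignoreRobotsTxt };
}

// Round trip preserves both flags.
console.log(fromLegacy(toLegacy({ allowBackwardLinks: true, ignoreRobotsTxt: true })));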
@@ -181,6 +181,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
     allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
     allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
+    ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
   });

   if (sc.robots !== undefined) {
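
Here the flag is read back out of a StoredCrawl when a crawler is rehydrated. The ?? false fallback is what keeps crawls persisted before this commit working: their crawlerOptions have no ignoreRobotsTxt key, so they quietly stay on the enforcing behavior. A toy illustration:

// A crawl stored before this commit lacks the key entirely.
const oldStored: { crawlerOptions?: { ignoreRobotsTxt?: boolean } } = { crawlerOptions: {} };
const newStored: { crawlerOptions?: { ignoreRobotsTxt?: boolean } } = {
  crawlerOptions: { ignoreRobotsTxt: true },
};

console.log(oldStored.crawlerOptions?.ignoreRobotsTxt ?? false); // false: robots.txt still enforced
console.log(newStored.crawlerOptions?.ignoreRobotsTxt ?? false); // true: flag honored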
@@ -24,6 +24,7 @@ export class WebCrawler {
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
+  private ignoreRobotsTxt: boolean;

   constructor({
     jobId,
@@ -38,6 +39,7 @@ export class WebCrawler {
     allowBackwardCrawling = false,
     allowExternalContentLinks = false,
     allowSubdomains = false,
+    ignoreRobotsTxt = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -51,6 +53,7 @@ export class WebCrawler {
     allowBackwardCrawling?: boolean;
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
+    ignoreRobotsTxt?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -67,6 +70,7 @@ export class WebCrawler {
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
+    this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
   }

   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
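
One small redundancy in this wiring: the destructuring default (= false) already turns an omitted or undefined argument into false, so the trailing ?? false is a no-op for every value the declared type allows. It does mirror the pattern of the surrounding fields, though:

// The destructuring default fires when the property is absent or undefined;
// `?? false` afterwards cannot change the result for any boolean input.
function wire({ ignoreRobotsTxt = false }: { ignoreRobotsTxt?: boolean }): boolean {
  return ignoreRobotsTxt ?? false;
}

console.log(wire({}));                        // false
console.log(wire({ ignoreRobotsTxt: true })); // true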
@@ -137,7 +141,7 @@ export class WebCrawler {
       }
     }

-    const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
+    const isAllowed = this.ignoreRobotsTxt ? true : (this.robots.isAllowed(link, "FireCrawlAgent") ?? true);
     // Check if the link is disallowed by robots.txt
     if (!isAllowed) {
       logger.debug(`Link disallowed by robots.txt: ${link}`);
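
The rewritten check short-circuits before the robots parser is consulted at all, and the trailing ?? true still maps a missing verdict to "allowed". A standalone sketch, assuming this.robots is a robots-parser instance (an assumption; the hunk only shows the isAllowed call):

import robotsParser from "robots-parser"; // assumed to be the library behind `this.robots`

const robots = robotsParser(
  "https://example.com/robots.txt",
  "User-agent: *\nDisallow: /private/",
);

const link = "https://example.com/private/page";

// Mirrors the changed line: with the flag set, the parser is never asked;
// without it, `?? true` maps an undefined verdict (e.g. an off-host URL) to allowed.
const check = (ignoreRobotsTxt: boolean) =>
  ignoreRobotsTxt ? true : (robots.isAllowed(link, "FireCrawlAgent") ?? true);

console.log(check(true));  // true:  flag bypasses robots.txt
console.log(check(false)); // false: /private/ is disallowed for this agent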
@@ -202,7 +206,7 @@ export class WebCrawler {
       if (this.isInternalLink(fullUrl) &&
         this.noSections(fullUrl) &&
         !this.matchesExcludes(path) &&
-        this.isRobotsAllowed(fullUrl)
+        this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
       ) {
         return fullUrl;
       }
@@ -255,8 +259,8 @@ export class WebCrawler {
     return links;
   }

-  private isRobotsAllowed(url: string): boolean {
-    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  private isRobotsAllowed(url: string, ignoreRobotsTxt: boolean = false): boolean {
+    return (ignoreRobotsTxt ? true : (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true))
   }

   private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
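
Net effect of the last two hunks: filterLinks passes this.ignoreRobotsTxt explicitly, while the parameter default of false keeps any other caller of isRobotsAllowed on the old enforcing behavior. A reduced model of the predicate with the parser verdict stubbed out:

// `verdict` stands in for this.robots.isAllowed(url, "FireCrawlAgent"):
// boolean when robots.txt has an opinion, undefined when it does not
// (a missing parser collapses into the same allowed-by-default case).
function isRobotsAllowed(verdict: boolean | undefined, ignoreRobotsTxt = false): boolean {
  return ignoreRobotsTxt ? true : (verdict ?? true);
}

console.log(isRobotsAllowed(false));        // false: disallowed and enforced
console.log(isRobotsAllowed(false, true));  // true:  the flag overrides the parser
console.log(isRobotsAllowed(undefined));    // true:  no verdict defaults to allowed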