parameter

rafaelmmiller 2024-11-29 16:44:54 -03:00
parent 2988a56ee5
commit 5ddb7eb922
3 changed files with 12 additions and 4 deletions

View File

@@ -231,6 +231,7 @@ const crawlerOptions = z.object({
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
   allowSubdomains: z.boolean().default(false),
+  ignoreRobotsTxt: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
   deduplicateSimilarURLs: z.boolean().default(true),
   ignoreQueryParameters: z.boolean().default(false),
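The new flag follows the same pattern as its siblings: optional on input, defaulting to false, so existing callers keep respecting robots.txt. A minimal sketch of that behavior, using a cut-down copy of the schema above (not the real export):

import { z } from "zod";

// Cut-down copy of the schema above, keeping only two fields for illustration.
const crawlerOptions = z.object({
  ignoreRobotsTxt: z.boolean().default(false),
  ignoreSitemap: z.boolean().default(true),
});

// Omitting the field yields the safe default: robots.txt is respected.
console.log(crawlerOptions.parse({}));
// { ignoreRobotsTxt: false, ignoreSitemap: true }

// Callers must opt in explicitly.
console.log(crawlerOptions.parse({ ignoreRobotsTxt: true }).ignoreRobotsTxt);
// true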
@@ -504,6 +505,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     allowSubdomains: x.allowSubdomains,
+    ignoreRobotsTxt: x.ignoreRobotsTxt,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
@@ -520,6 +522,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
     allowBackwardLinks: x.allowBackwardCrawling,
     allowExternalLinks: x.allowExternalContentLinks,
     allowSubdomains: x.allowSubdomains,
+    ignoreRobotsTxt: x.ignoreRobotsTxt,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
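Unlike allowBackwardLinks, which is renamed to allowBackwardCrawling in the legacy shape, the new flag keeps the same key in both directions, so the round trip through the two converters is lossless. A hypothetical check, reusing the converters from the hunks above:

// Assumes the crawlerOptions schema and both converters are in scope.
const opts = crawlerOptions.parse({ ignoreRobotsTxt: true });
const legacy = toLegacyCrawlerOptions(opts);
console.log(legacy.ignoreRobotsTxt); // true (same key, no rename)
console.log(fromLegacyCrawlerOptions(legacy).crawlOptions.ignoreRobotsTxt); // true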

View File

@@ -181,6 +181,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
     allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
     allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
+    ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
   });

   if (sc.robots !== undefined) {
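The ?? false fallback matters for crawls persisted before this commit: their stored crawlerOptions have no ignoreRobotsTxt key, and optional chaining plus nullish coalescing resolves that to false instead of undefined. A standalone sketch of the same expression:

// StoredCrawl reduced to the one field that matters here (an assumption).
type StoredCrawlish = { crawlerOptions?: { ignoreRobotsTxt?: boolean } };

const preCommitCrawl: StoredCrawlish = { crawlerOptions: {} };
const optedInCrawl: StoredCrawlish = { crawlerOptions: { ignoreRobotsTxt: true } };

console.log(preCommitCrawl.crawlerOptions?.ignoreRobotsTxt ?? false); // false
console.log(optedInCrawl.crawlerOptions?.ignoreRobotsTxt ?? false);   // true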

View File

@@ -24,6 +24,7 @@ export class WebCrawler {
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
+  private ignoreRobotsTxt: boolean;

   constructor({
     jobId,
@@ -38,6 +39,7 @@
     allowBackwardCrawling = false,
     allowExternalContentLinks = false,
     allowSubdomains = false,
+    ignoreRobotsTxt = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -51,6 +53,7 @@
     allowBackwardCrawling?: boolean;
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
+    ignoreRobotsTxt?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -67,6 +70,7 @@
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
+    this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
   }

   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
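Together, the four hunks above are the usual plumbing for a new constructor option: a private field, a destructured parameter with a default, its type in the options object, and the assignment. A hypothetical instantiation (the jobId and initialUrl values are invented, and other options are left at their defaults):

const crawler = new WebCrawler({
  jobId: "job-123",                  // invented for illustration
  initialUrl: "https://example.com", // invented for illustration
  ignoreRobotsTxt: true,             // the new option: skip robots.txt checks
});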
@@ -137,7 +141,7 @@
       }
     }

-    const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
+    const isAllowed = this.ignoreRobotsTxt ? true : (this.robots.isAllowed(link, "FireCrawlAgent") ?? true);
     // Check if the link is disallowed by robots.txt
     if (!isAllowed) {
       logger.debug(`Link disallowed by robots.txt: ${link}`);
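The bypass is a short-circuit: when the flag is set, the robots verdict is never consulted; when it is consulted, an undefined verdict (no matching rule) still defaults to allowed. A self-contained sketch using the robots-parser package, which the isAllowed(url, userAgent) call suggests is in use here (an assumption):

import robotsParser from "robots-parser";

// Illustrative robots.txt; the URL and rules are invented.
const robots = robotsParser(
  "https://example.com/robots.txt",
  "User-agent: *\nDisallow: /private/",
);

function isLinkAllowed(link: string, ignoreRobotsTxt: boolean): boolean {
  // Flag first, then the parser's verdict, then the permissive default.
  return ignoreRobotsTxt
    ? true
    : (robots.isAllowed(link, "FireCrawlAgent") ?? true);
}

console.log(isLinkAllowed("https://example.com/private/page", false)); // false
console.log(isLinkAllowed("https://example.com/private/page", true));  // true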
@@ -202,7 +206,7 @@
       if (this.isInternalLink(fullUrl) &&
         this.noSections(fullUrl) &&
         !this.matchesExcludes(path) &&
-        this.isRobotsAllowed(fullUrl)
+        this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
       ) {
         return fullUrl;
       }
@@ -255,8 +259,8 @@
     return links;
   }

-  private isRobotsAllowed(url: string): boolean {
-    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  private isRobotsAllowed(url: string, ignoreRobotsTxt: boolean = false): boolean {
+    return (ignoreRobotsTxt ? true : (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true))
   }

   private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
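The helper now has three tiers: the per-crawl flag wins outright, a missing parser (robots.txt not fetched, or the fetch failed) is treated as permissive, and only then does the parsed verdict apply, with undefined again falling back to allowed. The same precedence as a free function, with the types reduced to a minimum (a sketch, not the class's API):

type RobotsLike = { isAllowed(url: string, ua: string): boolean | undefined };

function isRobotsAllowed(
  url: string,
  ignoreRobotsTxt: boolean,
  robots?: RobotsLike,
): boolean {
  if (ignoreRobotsTxt) return true; // tier 1: explicit bypass
  if (!robots) return true;         // tier 2: no robots.txt loaded
  return robots.isAllowed(url, "FireCrawlAgent") ?? true; // tier 3: verdict, default allow
}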