diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index f354c640..639589ee 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -231,6 +231,7 @@ const crawlerOptions = z.object({
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
   allowSubdomains: z.boolean().default(false),
+  ignoreRobotsTxt: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
   deduplicateSimilarURLs: z.boolean().default(true),
   ignoreQueryParameters: z.boolean().default(false),
@@ -504,6 +505,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
     allowBackwardCrawling: x.allowBackwardLinks,
     allowExternalContentLinks: x.allowExternalLinks,
     allowSubdomains: x.allowSubdomains,
+    ignoreRobotsTxt: x.ignoreRobotsTxt,
     ignoreSitemap: x.ignoreSitemap,
     deduplicateSimilarURLs: x.deduplicateSimilarURLs,
     ignoreQueryParameters: x.ignoreQueryParameters,
@@ -520,6 +522,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
       allowBackwardLinks: x.allowBackwardCrawling,
       allowExternalLinks: x.allowExternalContentLinks,
       allowSubdomains: x.allowSubdomains,
+      ignoreRobotsTxt: x.ignoreRobotsTxt,
       ignoreSitemap: x.ignoreSitemap,
       deduplicateSimilarURLs: x.deduplicateSimilarURLs,
       ignoreQueryParameters: x.ignoreQueryParameters,
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index 3928eb11..ba7487bd 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -181,6 +181,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
     allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
     allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
+    ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
   });
 
   if (sc.robots !== undefined) {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index a7c7de75..2296b095 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -24,6 +24,7 @@ export class WebCrawler {
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
   private allowSubdomains: boolean;
+  private ignoreRobotsTxt: boolean;
 
   constructor({
     jobId,
@@ -38,6 +39,7 @@ export class WebCrawler {
     allowBackwardCrawling = false,
     allowExternalContentLinks = false,
     allowSubdomains = false,
+    ignoreRobotsTxt = false,
   }: {
     jobId: string;
     initialUrl: string;
@@ -51,6 +53,7 @@ export class WebCrawler {
     allowBackwardCrawling?: boolean;
     allowExternalContentLinks?: boolean;
     allowSubdomains?: boolean;
+    ignoreRobotsTxt?: boolean;
   }) {
     this.jobId = jobId;
     this.initialUrl = initialUrl;
@@ -67,6 +70,7 @@ export class WebCrawler {
     this.allowBackwardCrawling = allowBackwardCrawling ?? false;
     this.allowExternalContentLinks = allowExternalContentLinks ?? false;
     this.allowSubdomains = allowSubdomains ?? false;
+    this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
   }
 
   public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@@ -137,7 +141,7 @@ export class WebCrawler {
         }
       }
 
-      const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
+      const isAllowed = this.ignoreRobotsTxt ? true : (this.robots.isAllowed(link, "FireCrawlAgent") ?? true);
       // Check if the link is disallowed by robots.txt
       if (!isAllowed) {
         logger.debug(`Link disallowed by robots.txt: ${link}`);
@@ -202,7 +206,7 @@ export class WebCrawler {
       if (this.isInternalLink(fullUrl) &&
         this.noSections(fullUrl) &&
         !this.matchesExcludes(path) &&
-        this.isRobotsAllowed(fullUrl)
+        this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
       ) {
         return fullUrl;
       }
@@ -255,8 +259,8 @@ export class WebCrawler {
     return links;
   }
 
-  private isRobotsAllowed(url: string): boolean {
-    return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+  private isRobotsAllowed(url: string, ignoreRobotsTxt: boolean = false): boolean {
+    return (ignoreRobotsTxt ? true : (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true))
   }
 
   private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
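
Not part of the diff: a minimal construction sketch showing how the new flag is passed through the WebCrawler constructor, assuming (per the optional `?` markers in the hunks above) that only jobId and initialUrl are required fields; the import path and values are illustrative placeholders.

    import { WebCrawler } from "./scraper/WebScraper/crawler"; // illustrative path; adjust for the importing module

    // With ignoreRobotsTxt enabled, filterLinks() and isRobotsAllowed() treat
    // every URL as allowed instead of consulting the parsed robots.txt rules.
    const crawler = new WebCrawler({
      jobId: "example-job-id",           // placeholder
      initialUrl: "https://example.com", // placeholder
      ignoreRobotsTxt: true,             // option added by this diff; defaults to false
    });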