parameter

rafaelmmiller 2024-11-29 16:44:54 -03:00
parent 2988a56ee5
commit 5ddb7eb922
3 changed files with 12 additions and 4 deletions

@@ -231,6 +231,7 @@ const crawlerOptions = z.object({
 allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
 allowExternalLinks: z.boolean().default(false),
 allowSubdomains: z.boolean().default(false),
+ignoreRobotsTxt: z.boolean().default(false),
 ignoreSitemap: z.boolean().default(true),
 deduplicateSimilarURLs: z.boolean().default(true),
 ignoreQueryParameters: z.boolean().default(false),
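
The hunk above registers the flag in the v1 `crawlerOptions` zod schema. Because the field is declared with `.default(false)`, requests that omit it keep the previous robots-respecting behavior. A minimal sketch of the default mechanics, with the schema trimmed to the one new field (the real schema declares many more options):

import { z } from "zod";

// Trimmed schema: only the field this commit adds.
const crawlerOptions = z.object({
  ignoreRobotsTxt: z.boolean().default(false),
});

console.log(crawlerOptions.parse({}));                        // { ignoreRobotsTxt: false }
console.log(crawlerOptions.parse({ ignoreRobotsTxt: true })); // { ignoreRobotsTxt: true }
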
@@ -504,6 +505,7 @@ export function toLegacyCrawlerOptions(x: CrawlerOptions) {
 allowBackwardCrawling: x.allowBackwardLinks,
 allowExternalContentLinks: x.allowExternalLinks,
 allowSubdomains: x.allowSubdomains,
+ignoreRobotsTxt: x.ignoreRobotsTxt,
 ignoreSitemap: x.ignoreSitemap,
 deduplicateSimilarURLs: x.deduplicateSimilarURLs,
 ignoreQueryParameters: x.ignoreQueryParameters,
@@ -520,6 +522,7 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
 allowBackwardLinks: x.allowBackwardCrawling,
 allowExternalLinks: x.allowExternalContentLinks,
 allowSubdomains: x.allowSubdomains,
+ignoreRobotsTxt: x.ignoreRobotsTxt,
 ignoreSitemap: x.ignoreSitemap,
 deduplicateSimilarURLs: x.deduplicateSimilarURLs,
 ignoreQueryParameters: x.ignoreQueryParameters,
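
The two hunks above thread the flag through both directions of the legacy-options conversion, so it survives the round trip between the v1 request shape and the internal crawler shape. Unlike `allowBackwardLinks`, which is renamed to `allowBackwardCrawling` on the legacy side, `ignoreRobotsTxt` keeps the same name in both shapes. A reduced sketch, trimmed to two fields:

type CrawlerOptions = { allowBackwardLinks: boolean; ignoreRobotsTxt: boolean };
type LegacyOptions = { allowBackwardCrawling: boolean; ignoreRobotsTxt: boolean };

const toLegacy = (x: CrawlerOptions): LegacyOptions => ({
  allowBackwardCrawling: x.allowBackwardLinks,
  ignoreRobotsTxt: x.ignoreRobotsTxt,
});

const fromLegacy = (x: LegacyOptions): CrawlerOptions => ({
  allowBackwardLinks: x.allowBackwardCrawling,
  ignoreRobotsTxt: x.ignoreRobotsTxt,
});

// The flag survives the round trip unchanged:
console.log(fromLegacy(toLegacy({ allowBackwardLinks: false, ignoreRobotsTxt: true })));
// { allowBackwardLinks: false, ignoreRobotsTxt: true }
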

@@ -181,6 +181,7 @@ export function crawlToCrawler(id: string, sc: StoredCrawl, newBase?: string): W
 allowBackwardCrawling: sc.crawlerOptions?.allowBackwardCrawling ?? false,
 allowExternalContentLinks: sc.crawlerOptions?.allowExternalContentLinks ?? false,
 allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
+ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
 });

 if (sc.robots !== undefined) {
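
In `crawlToCrawler`, which rebuilds a `WebCrawler` from a stored crawl, the flag is read with a `?? false` fallback, so crawl records persisted before this commit (which lack the key) keep the old robots-respecting behavior:

// Sketch: the nullish-coalescing fallback for pre-existing stored crawls.
const storedCrawlerOptions: { ignoreRobotsTxt?: boolean } = {}; // persisted before this change
console.log(storedCrawlerOptions.ignoreRobotsTxt ?? false);     // false: old crawls stay compliant
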

@@ -24,6 +24,7 @@ export class WebCrawler {
 private allowBackwardCrawling: boolean;
 private allowExternalContentLinks: boolean;
 private allowSubdomains: boolean;
+private ignoreRobotsTxt: boolean;

 constructor({
 jobId,
@@ -38,6 +39,7 @@
 allowBackwardCrawling = false,
 allowExternalContentLinks = false,
 allowSubdomains = false,
+ignoreRobotsTxt = false,
 }: {
 jobId: string;
 initialUrl: string;
@@ -51,6 +53,7 @@
 allowBackwardCrawling?: boolean;
 allowExternalContentLinks?: boolean;
 allowSubdomains?: boolean;
+ignoreRobotsTxt?: boolean;
 }) {
 this.jobId = jobId;
 this.initialUrl = initialUrl;
@@ -67,6 +70,7 @@
 this.allowBackwardCrawling = allowBackwardCrawling ?? false;
 this.allowExternalContentLinks = allowExternalContentLinks ?? false;
 this.allowSubdomains = allowSubdomains ?? false;
+this.ignoreRobotsTxt = ignoreRobotsTxt ?? false;
 }

 public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
@@ -137,7 +141,7 @@
 }
 }

-const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true;
+const isAllowed = this.ignoreRobotsTxt ? true : (this.robots.isAllowed(link, "FireCrawlAgent") ?? true);
 // Check if the link is disallowed by robots.txt
 if (!isAllowed) {
 logger.debug(`Link disallowed by robots.txt: ${link}`);
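
The core behavior change is this one-line guard in the link-filtering path: when `ignoreRobotsTxt` is set, the robots.txt verdict is never consulted. Note the pre-existing `?? true` fallback, which treats URLs that robots.txt does not mention as allowed. A standalone sketch of the decision, with the robots parser stubbed as a plain verdict:

// `robotsVerdict` stands in for this.robots.isAllowed(link, "FireCrawlAgent"),
// which can return undefined for URLs the robots.txt file does not mention.
function linkAllowed(ignoreRobotsTxt: boolean, robotsVerdict: boolean | undefined): boolean {
  return ignoreRobotsTxt ? true : (robotsVerdict ?? true);
}

console.log(linkAllowed(true, false));      // true: the flag overrides a disallow
console.log(linkAllowed(false, false));     // false: a robots.txt disallow is honored
console.log(linkAllowed(false, undefined)); // true: unmentioned URLs default to allowed
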
@@ -202,7 +206,7 @@
 if (this.isInternalLink(fullUrl) &&
 this.noSections(fullUrl) &&
 !this.matchesExcludes(path) &&
-this.isRobotsAllowed(fullUrl)
+this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
 ) {
 return fullUrl;
 }
@@ -255,8 +259,8 @@
 return links;
 }

-private isRobotsAllowed(url: string): boolean {
-return (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true)
+private isRobotsAllowed(url: string, ignoreRobotsTxt: boolean = false): boolean {
+return (ignoreRobotsTxt ? true : (this.robots ? (this.robots.isAllowed(url, "FireCrawlAgent") ?? true) : true))
 }

 private matchesExcludes(url: string, onlyDomains: boolean = false): boolean {
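
Finally, `isRobotsAllowed` takes the flag as a parameter that defaults to `false` instead of reading the instance field directly, so any call site that passes only a URL keeps compiling and keeps the old robots-respecting behavior; the call site updated in the previous hunk opts into the bypass explicitly by forwarding `this.ignoreRobotsTxt`. In sketch form, both call shapes remain valid after the refactor:

// this.isRobotsAllowed(url)                        - honors robots.txt, as before
// this.isRobotsAllowed(url, this.ignoreRobotsTxt)  - bypasses it when the flag is set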