feat(crawler): handle cross-origin redirects differently than same-origin redirects (#1279)

2025-04-18 12:09:42 +08:00 · 2025-03-02 13:32:46 +01:00 · 2025-03-02 13:32:46 +01:00 · e8c698d613
commit e8c698d613
parent fea249c568
2 changed files with 31 additions and 1 deletion
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -109,6 +109,9 @@ export class WebCrawler {

        // Check if the link exceeds the maximum depth allowed
        if (depth > maxDepth) {
+          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
+            this.logger.debug(`${link} DEPTH FAIL`);
+          }
          return false;
        }

@@ -119,6 +122,9 @@ export class WebCrawler {
              new RegExp(excludePattern).test(path),
            )
          ) {
+            if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
+              this.logger.debug(`${link} EXCLUDE FAIL`);
+            }
            return false;
          }
        }
@@ -130,6 +136,9 @@ export class WebCrawler {
              new RegExp(includePattern).test(path),
            )
          ) {
+            if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
+              this.logger.debug(`${link} INCLUDE FAIL`);
+            }
            return false;
          }
        }
@@ -140,6 +149,9 @@ export class WebCrawler {
        try {
          normalizedLink = new URL(link);
        } catch (_) {
+          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
+            this.logger.debug(`${link} URL PARSE FAIL`);
+          }
          return false;
        }
        const initialHostname = normalizedInitialUrl.hostname.replace(
@@ -158,6 +170,9 @@ export class WebCrawler {
          if (
            !normalizedLink.pathname.startsWith(normalizedInitialUrl.pathname)
          ) {
+            if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
+              this.logger.debug(`${link} BACKWARDS FAIL ${normalizedLink.pathname} ${normalizedInitialUrl.pathname}`);
+            }
            return false;
          }
        }
@@ -171,13 +186,22 @@ export class WebCrawler {
            method: "filterLinks",
            link,
          });
+          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
+            this.logger.debug(`${link} ROBOTS FAIL`);
+          }
          return false;
        }

        if (this.isFile(link)) {
+          if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
+            this.logger.debug(`${link} FILE FAIL`);
+          }
          return false;
        }

+        if (process.env.FIRECRAWL_DEBUG_FILTER_LINKS) {
+          this.logger.debug(`${link} OK`);
+        }
        return true;
      })
      .slice(0, limit);
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -973,7 +973,13 @@ async function processJob(job: Job & { id: string }, token: string) {
          ); // TODO: make this its own error type that is ignored by error tracking
        }

-        if (job.data.isCrawlSourceScrape) {
+        // Only re-set originUrl when the hostname of doc.metadata.url differs
+        // from that of doc.metadata.sourceURL, i.e. on a cross-domain redirect.
+        // If we also did this for same-domain redirects, e.g. / -> /introduction
+        // (like our docs site does), it would break crawling the entire site
+        // without allowBackwardsCrawling - mogery
+        const isHostnameDifferent = normalizeUrlOnlyHostname(doc.metadata.url) !== normalizeUrlOnlyHostname(doc.metadata.sourceURL);
+        if (job.data.isCrawlSourceScrape && isHostnameDifferent) {
          // TODO: re-fetch sitemap for redirect target domain
          sc.originUrl = doc.metadata.url;
          await saveCrawl(job.data.crawl_id, sc);