fix(crawl): never invalidate the first crawl scrape if it redirects

2025-08-14 19:06:01 +08:00 · 2025-01-07 13:56:41 +01:00 · 2025-01-07 13:56:41 +01:00 · 7d73ebdbf1
commit 7d73ebdbf1
parent b96b97ed72
2 changed files with 9 additions and 3 deletions
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -558,6 +558,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
          crawl_id: job.data.crawl_id,
          webhook: job.data.webhook,
          v1: job.data.v1,
+          isCrawlSourceScrape: true,
        },
        {
          priority: 15,
@@ -721,10 +722,14 @@ async function processJob(job: Job & { id: string }, token: string) {
        if (
          crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === null
        ) {
+          if (job.data.isCrawlSourceScrape) {
+            // TODO: re-fetch sitemap for redirect target domain
+          } else {
            throw new Error(
              "Redirected target URL is not allowed by crawlOptions",
            ); // TODO: make this its own error type that is ignored by error tracking
          }
+        }

        if (isUrlBlocked(doc.metadata.url)) {
          throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -44,6 +44,7 @@ export interface WebScraperOptions {
  webhook?: z.infer<typeof webhookSchema>;
  v1?: boolean;
  is_scrape?: boolean;
+  isCrawlSourceScrape?: boolean;
 }

 export interface RunWebScraperParams {