feat(queue-worker): fix redirect slipping

Gergő Móricz 2024-12-15 20:16:29 +01:00
parent 126b46ee2c
commit 30fa78cd9e
4 changed files with 76 additions and 21 deletions


@@ -481,33 +481,30 @@ async function processJob(job: Job & { id: string }, token: string) {
         normalizeURL(doc.metadata.url, sc) !==
           normalizeURL(doc.metadata.sourceURL, sc)
       ) {
-        logger.debug(
-          "Was redirected, removing old URL and locking new URL...",
-          { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
-        );
-        // Remove the old URL from visited unique due to checking for limit
-        // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL)
-        await redisConnection.srem(
-          "crawl:" + job.data.crawl_id + ":visited_unique",
-          normalizeURL(doc.metadata.sourceURL, sc),
-        );
         const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
         const p2 = generateURLPermutations(
           normalizeURL(doc.metadata.sourceURL, sc),
         );
-        // In crawls, we should only crawl a redirected page once, no matter how many times it is redirected to, or if it's been discovered by the crawler before.
-        // This can prevent flakiness with race conditions.
-        // Lock the new URL
-        const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url);
-        if (
-          job.data.crawlerOptions !== null &&
-          !lockRes &&
-          JSON.stringify(p1) !== JSON.stringify(p2)
-        ) {
-          throw new RacedRedirectError();
-        }
+        if (JSON.stringify(p1) !== JSON.stringify(p2)) {
+          logger.debug(
+            "Was redirected, removing old URL and locking new URL...",
+            { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
+          );
+          // Prevent redirect target from being visited in the crawl again
+          // See lockURL
+          const x = await redisConnection.sadd(
+            "crawl:" + job.data.crawl_id + ":visited",
+            ...p1.map(x => x.href),
+          );
+          const lockRes = x === p1.length;
+          if (job.data.crawlerOptions !== null && !lockRes) {
+            throw new RacedRedirectError();
+          }
+        }
       }
       logger.debug("Logging job to DB...");
@@ -678,6 +675,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       logger.debug("Declaring job as done...");
       await addCrawlJobDone(job.data.crawl_id, job.id, false);
+      await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc));
       logger.debug("Logging job to DB...");
       await logJob(
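
Read together, the two hunks above change how a redirect target is deduplicated. Previously the worker removed the source URL from the crawl's :visited_unique set right away and relied on lockURL plus a permutation comparison to catch races, which evidently still let redirected pages slip through and be crawled again. Now, when the permutation sets of the source and target differ, the worker SADDs every permutation of the target into the crawl's :visited set and only proceeds if every permutation was newly added; the :visited_unique cleanup instead happens once the job is declared done (second hunk). A minimal sketch of that SADD-as-lock trick, using an illustrative helper name that does not exist in the codebase:

// Sketch only: SADD returns how many members were newly inserted, so adding
// all permutations of the redirect target and comparing the count against the
// number of permutations is a single-command "have we seen this URL?" check.
async function tryLockPermutations(redisConnection, crawlId, permutations) {
  const added = await redisConnection.sadd(
    "crawl:" + crawlId + ":visited",
    ...permutations.map((u) => u.href),
  );
  // false if any permutation was already in the set (another worker got there first)
  return added === permutations.length;
}

If two workers race on the same redirect target, only the one whose SADD inserts all permutations continues; the other sees a lower count and, on crawls, throws RacedRedirectError.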


@@ -0,0 +1,14 @@
require("dotenv").config();
const Redis = require("ioredis");
const crawlId = process.argv[2];
const redisConnection = new Redis(process.env.REDIS_URL, {
maxRetriesPerRequest: null,
});
(async () => {
const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999);
await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
process.exit(0);
})();
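
This first new helper (its path is not shown in this extract) dumps a crawl's :visited_unique set to <crawlId>-visited.txt, stripping the protocol and sorting the entries so the file can be compared against the API dump produced by urldump.js below. One caveat: SSCAN is cursor-based and COUNT is only a hint, so a single call may not return every member on a large crawl. A complete dump would loop the cursor, roughly like this (sketch only, not part of the commit):

async function scanAll(redisConnection, key) {
  // SSCAN returns [nextCursor, batch]; keep scanning until the cursor wraps back to "0".
  const members = [];
  let cursor = "0";
  do {
    const [next, batch] = await redisConnection.sscan(key, cursor, "COUNT", 1000);
    members.push(...batch);
    cursor = next;
  } while (cursor !== "0");
  return members;
}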

apps/api/utils/urldump.js

@@ -0,0 +1,43 @@
require("dotenv").config();
//const baseUrl = "https://api.firecrawl.dev";
const baseUrl = "http://localhost:3002";
const crawlId = process.argv[2];
(async () => {
let url = baseUrl + "/v1/crawl/" + crawlId;
let urls = [];
while (url) {
let res;
while (true) {
try {
res = (await (await fetch(url, {
headers: {
"Authorization": "Bearer " + process.env.TEST_API_KEY
}
})).json());
break;
} catch (e) {
console.error(e);
}
}
console.log(res.data.length);
if (res.data.length === 0) {
break;
}
urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL));
url = res.next;
if (url !== undefined) {
const o = new URL(url)
o.protocol = new URL(baseUrl).protocol;
url = o.href;
}
}
await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
})();
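
urldump.js walks the paginated /v1/crawl/:id endpoint (retrying failed fetches and following res.next, rewritten to the local protocol) and writes the URLs the API reports to <crawlId>-urls.txt in the same protocol-stripped, sorted format. Since both dumps share a format, a simple set difference is enough to spot URLs the API returned but that were never marked visited, or vice versa. A comparison sketch along those lines (hypothetical script, not part of the commit):

// compare.js <crawlId>: run after both dump scripts have written their files.
const fs = require("fs");
const crawlId = process.argv[2];
const readSet = (suffix) =>
  new Set(fs.readFileSync(crawlId + suffix, "utf8").split("\n").filter(Boolean));
const visited = readSet("-visited.txt");
const returned = readSet("-urls.txt");
for (const url of returned) {
  if (!visited.has(url)) console.log("returned but not in visited_unique:", url);
}
for (const url of visited) {
  if (!returned.has(url)) console.log("in visited_unique but not returned:", url);
}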