Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-13 05:09:03 +08:00)

feat(queue-worker): fix redirect slipping

parent 126b46ee2c
commit 30fa78cd9e
@@ -481,35 +481,32 @@ async function processJob(job: Job & { id: string }, token: string) {
         normalizeURL(doc.metadata.url, sc) !==
           normalizeURL(doc.metadata.sourceURL, sc)
       ) {
-        logger.debug(
-          "Was redirected, removing old URL and locking new URL...",
-          { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
-        );
-        // Remove the old URL from visited unique due to checking for limit
-        // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL)
-        await redisConnection.srem(
-          "crawl:" + job.data.crawl_id + ":visited_unique",
-          normalizeURL(doc.metadata.sourceURL, sc),
-        );
 
         const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
         const p2 = generateURLPermutations(
           normalizeURL(doc.metadata.sourceURL, sc),
         );
 
-        // In crawls, we should only crawl a redirected page once, no matter how many times it is redirected to, or if it's been discovered by the crawler before.
-        // This can prevent flakiness with race conditions.
-        // Lock the new URL
-        const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url);
-        if (
-          job.data.crawlerOptions !== null &&
-          !lockRes &&
-          JSON.stringify(p1) !== JSON.stringify(p2)
-        ) {
-          throw new RacedRedirectError();
+        if (JSON.stringify(p1) !== JSON.stringify(p2)) {
+          logger.debug(
+            "Was redirected, removing old URL and locking new URL...",
+            { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
+          );
+
+          // Prevent redirect target from being visited in the crawl again
+          // See lockURL
+          const x = await redisConnection.sadd(
+            "crawl:" + job.data.crawl_id + ":visited",
+            ...p1.map(x => x.href),
+          );
+          const lockRes = x === p1.length;
+
+          if (job.data.crawlerOptions !== null && !lockRes) {
+            throw new RacedRedirectError();
+          }
         }
       }
 
       logger.debug("Logging job to DB...");
       await logJob(
         {
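The new path above replaces the lockURL call plus the visited_unique SREM with a single SADD into the crawl's :visited set: every permutation of the redirect target is added in one command, and the lock counts as acquired only if all of them were newly inserted (SADD returns the number of members actually added). Below is a minimal standalone sketch of that pattern in TypeScript, assuming ioredis; the permutations helper and the REDIS_URL fallback are simplified stand-ins, not the repo's generateURLPermutations.

import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379", {
  maxRetriesPerRequest: null,
});

// Simplified stand-in for generateURLPermutations: http/https variants only.
function permutations(url: string): URL[] {
  return ["http:", "https:"].map((protocol) => {
    const u = new URL(url);
    u.protocol = protocol;
    return u;
  });
}

// SADD reports how many members were newly added, so the lock is held only if
// no other worker has already marked any permutation of this URL as visited.
async function lockRedirectTarget(crawlId: string, url: string): Promise<boolean> {
  const perms = permutations(url);
  const added = await redis.sadd(
    "crawl:" + crawlId + ":visited",
    ...perms.map((p) => p.href),
  );
  return added === perms.length;
}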
@@ -678,6 +675,7 @@ async function processJob(job: Job & { id: string }, token: string) {
 
       logger.debug("Declaring job as done...");
       await addCrawlJobDone(job.data.crawl_id, job.id, false);
+      await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc));
 
       logger.debug("Logging job to DB...");
       await logJob(
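The single added line mirrors the bookkeeping seen in the previous hunk: crawl:<id>:visited_unique is the set checked against the crawl limit (per the comment removed above), and here the finished job's normalized URL is taken back out of it. A minimal sketch of that call, assuming ioredis; the normalize helper is a simplified stand-in for normalizeURL, and the exact success/failure semantics of the surrounding path are not shown in the diff.

import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379", {
  maxRetriesPerRequest: null,
});

// Simplified stand-in for normalizeURL: drop the fragment and a trailing slash.
function normalize(url: string): string {
  const u = new URL(url);
  u.hash = "";
  return u.href.endsWith("/") ? u.href.slice(0, -1) : u.href;
}

// Remove a job's URL from the unique-visited set once the job has been
// declared done, so it no longer occupies a slot in the crawl's limit count.
async function releaseUniqueVisit(crawlId: string, url: string): Promise<void> {
  await redis.srem("crawl:" + crawlId + ":visited_unique", normalize(url));
}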
apps/api/utils/urldump-redis.js (new file, 14 lines)

@@ -0,0 +1,14 @@
+require("dotenv").config();
+const Redis = require("ioredis");
+
+const crawlId = process.argv[2];
+
+const redisConnection = new Redis(process.env.REDIS_URL, {
+  maxRetriesPerRequest: null,
+});
+
+(async () => {
+  const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999);
+  await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
+  process.exit(0);
+})();
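urldump-redis.js takes a single SSCAN pass with COUNT 999, which is fine for small crawls, but SSCAN is cursor-based and COUNT is only a hint, so one call is not guaranteed to return the whole set. A sketch of the full cursor loop in TypeScript, assuming ioredis, in case the set is larger:

import Redis from "ioredis";

async function dumpVisitedUnique(redis: Redis, crawlId: string): Promise<string[]> {
  const key = "crawl:" + crawlId + ":visited_unique";
  const members: string[] = [];
  let cursor = "0";
  do {
    // SSCAN returns [nextCursor, batch]; iterate until the cursor comes back as "0".
    const [next, batch] = await redis.sscan(key, cursor, "COUNT", 999);
    cursor = next;
    members.push(...batch);
  } while (cursor !== "0");
  // Strip the protocol and sort, mirroring the dump format used by the script above.
  return members.map((m) => m.split("://").slice(1).join("://")).sort();
}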
apps/api/utils/urldump.js (new file, 43 lines)

@@ -0,0 +1,43 @@
+require("dotenv").config();
+
+//const baseUrl = "https://api.firecrawl.dev";
+const baseUrl = "http://localhost:3002";
+const crawlId = process.argv[2];
+
+(async () => {
+  let url = baseUrl + "/v1/crawl/" + crawlId;
+  let urls = [];
+
+  while (url) {
+    let res;
+
+    while (true) {
+      try {
+        res = (await (await fetch(url, {
+          headers: {
+            "Authorization": "Bearer " + process.env.TEST_API_KEY
+          }
+        })).json());
+        break;
+      } catch (e) {
+        console.error(e);
+      }
+    }
+
+    console.log(res.data.length);
+    if (res.data.length === 0) {
+      break;
+    }
+
+    urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL));
+
+    url = res.next;
+    if (url !== undefined) {
+      const o = new URL(url)
+      o.protocol = new URL(baseUrl).protocol;
+      url = o.href;
+    }
+  }
+
+  await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
+})();
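The two utilities write the same format (protocol-stripped, sorted URLs, one per line), which suggests they are meant to be compared for the same crawl id to spot pages that appear in the /v1/crawl results but slipped past the visited_unique bookkeeping, or vice versa. That workflow is an inference, not stated in the commit; a minimal comparison sketch in TypeScript, assuming both dumps already exist in the working directory:

import { readFile } from "fs/promises";

(async () => {
  const crawlId = process.argv[2];
  // Outputs of urldump.js and urldump-redis.js for the same crawl id.
  const apiUrls = new Set((await readFile(crawlId + "-urls.txt", "utf8")).split("\n").filter(Boolean));
  const redisUrls = new Set((await readFile(crawlId + "-visited.txt", "utf8")).split("\n").filter(Boolean));
  const onlyInApi = [...apiUrls].filter((u) => !redisUrls.has(u));
  const onlyInRedis = [...redisUrls].filter((u) => !apiUrls.has(u));
  console.log("in API results but not in visited_unique:", onlyInApi.length);
  console.log("in visited_unique but not in API results:", onlyInRedis.length);
})();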