diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 9fd8861b..c2d2e2c6 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -481,33 +481,30 @@ async function processJob(job: Job & { id: string }, token: string) {
         normalizeURL(doc.metadata.url, sc) !==
           normalizeURL(doc.metadata.sourceURL, sc)
       ) {
 
-        logger.debug(
-          "Was redirected, removing old URL and locking new URL...",
-          { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
-        );
-        // Remove the old URL from visited unique due to checking for limit
-        // Do not remove from :visited otherwise it will keep crawling the original URL (sourceURL)
-        await redisConnection.srem(
-          "crawl:" + job.data.crawl_id + ":visited_unique",
-          normalizeURL(doc.metadata.sourceURL, sc),
-        );
-
         const p1 = generateURLPermutations(normalizeURL(doc.metadata.url, sc));
         const p2 = generateURLPermutations(
           normalizeURL(doc.metadata.sourceURL, sc),
         );
 
-        // In crawls, we should only crawl a redirected page once, no matter how many; times it is redirected to, or if it's been discovered by the crawler before.
-        // This can prevent flakiness with race conditions.
-        // Lock the new URL
-        const lockRes = await lockURL(job.data.crawl_id, sc, doc.metadata.url);
-        if (
-          job.data.crawlerOptions !== null &&
-          !lockRes &&
-          JSON.stringify(p1) !== JSON.stringify(p2)
-        ) {
-          throw new RacedRedirectError();
+        if (JSON.stringify(p1) !== JSON.stringify(p2)) {
+          logger.debug(
+            "Was redirected, removing old URL and locking new URL...",
+            { oldUrl: doc.metadata.sourceURL, newUrl: doc.metadata.url },
+          );
+
+          // Prevent redirect target from being visited in the crawl again
+          // See lockURL
+          const x = await redisConnection.sadd(
+            "crawl:" + job.data.crawl_id + ":visited",
+            ...p1.map(x => x.href),
+          );
+          const lockRes = x === p1.length;
+
+          if (job.data.crawlerOptions !== null && !lockRes) {
+            throw new RacedRedirectError();
+          }
         }
+      }
 
       logger.debug("Logging job to DB...");
@@ -678,6 +675,7 @@ async function processJob(job: Job & { id: string }, token: string) {
 
       logger.debug("Declaring job as done...");
       await addCrawlJobDone(job.data.crawl_id, job.id, false);
+      await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(job.data.url, sc));
 
       logger.debug("Logging job to DB...");
       await logJob(
diff --git a/apps/api/logview.js b/apps/api/utils/logview.js
similarity index 100%
rename from apps/api/logview.js
rename to apps/api/utils/logview.js
diff --git a/apps/api/utils/urldump-redis.js b/apps/api/utils/urldump-redis.js
new file mode 100644
index 00000000..fdd6090c
--- /dev/null
+++ b/apps/api/utils/urldump-redis.js
@@ -0,0 +1,14 @@
+require("dotenv").config();
+const Redis = require("ioredis");
+
+const crawlId = process.argv[2];
+
+const redisConnection = new Redis(process.env.REDIS_URL, {
+  maxRetriesPerRequest: null,
+});
+
+(async () => {
+  const res = await redisConnection.sscan("crawl:" + crawlId + ":visited_unique", 0, "COUNT", 999);
+  await require("fs/promises").writeFile(crawlId + "-visited.txt", res[1].map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
+  process.exit(0);
+})();
\ No newline at end of file
diff --git a/apps/api/utils/urldump.js b/apps/api/utils/urldump.js
new file mode 100644
index 00000000..3583f7c6
--- /dev/null
+++ b/apps/api/utils/urldump.js
@@ -0,0 +1,43 @@
+require("dotenv").config();
+
+//const baseUrl = "https://api.firecrawl.dev";
+const baseUrl = "http://localhost:3002";
+const crawlId = process.argv[2];
+
+(async () => {
+  let url = baseUrl + "/v1/crawl/" + crawlId;
+  let urls = [];
+
+  while (url) {
+    let res;
+
+    while (true) {
+      try {
+        res = (await (await fetch(url, {
+          headers: {
+            "Authorization": "Bearer " + process.env.TEST_API_KEY
+          }
+        })).json());
+        break;
+      } catch (e) {
+        console.error(e);
+      }
+    }
+
+    console.log(res.data.length);
+    if (res.data.length === 0) {
+      break;
+    }
+
+    urls.push(...res.data.map(x => x.metadata.url ?? x.metadata.sourceURL));
+
+    url = res.next;
+    if (url !== undefined) {
+      const o = new URL(url)
+      o.protocol = new URL(baseUrl).protocol;
+      url = o.href;
+    }
+  }
+
+  await require("fs/promises").writeFile(crawlId + "-urls.txt", urls.map(x => x.split("://").slice(1).join("://")).sort().join("\n"));
+})();
\ No newline at end of file