From 43530b3b4d8b0dd873c1f636ca0f0cd6b88078ad Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 3 Dec 2024 17:53:17 -0300 Subject: [PATCH] Nick: fixed n-1 w/ Rafa --- apps/api/src/services/queue-worker.ts | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index e06c3ed3..61748312 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -20,6 +20,7 @@ import { addCrawlJobDone, crawlToCrawler, finishCrawl, + generateURLPermutations, getCrawl, getCrawlJobs, lockURL, @@ -422,7 +423,16 @@ async function processJob(job: Job & { id: string }, token: string) { const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; if (doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== normalizeURL(doc.metadata.sourceURL, sc)) { - logger.debug("Was redirected, locking new URL..."); + logger.debug("Was redirected, removing old URL and locking new URL..."); + // Remove the old URL from visited sets + await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", normalizeURL(doc.metadata.sourceURL, sc)); + if (sc.crawlerOptions?.deduplicateSimilarURLs) { + const permutations = generateURLPermutations(doc.metadata.sourceURL).map(x => x.href); + await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited", ...permutations); + } + await redisConnection.srem("crawl:" + job.data.crawl_id + ":visited_unique", normalizeURL(doc.metadata.sourceURL, sc)); + + // Lock the new URL await lockURL(job.data.crawl_id, sc, doc.metadata.url); }