From 570809aa59eac5c8bb820842637c5ebf80c44259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 4 Apr 2025 22:12:59 +0200 Subject: [PATCH] fix(unvisitedUrls): filter with crawler Fixes #1410 --- apps/api/src/services/queue-worker.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index a7da10f7..36f8ed2d 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -118,7 +118,19 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { const lastUrlsSet = new Set(lastUrls); - const univistedUrls = Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)); + const crawler = crawlToCrawler( + job.data.crawl_id, + sc, + sc.originUrl!, + job.data.crawlerOptions, + ); + + const univistedUrls = crawler.filterLinks( + Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)), + Infinity, + sc.crawlerOptions.maxDepth ?? 10, + ); + const addableJobCount = sc.crawlerOptions.limit === undefined ? Infinity : (sc.crawlerOptions.limit - await getDoneJobsOrderedLength(job.data.crawl_id)); console.log(sc.originUrl!, univistedUrls, visitedUrls, lastUrls, addableJobCount);