fix(unvisitedUrls): filter with crawler

Fixes #1410
This commit is contained in:
Gergő Móricz 2025-04-04 22:12:59 +02:00
parent 6bed5eca50
commit 570809aa59

View File

@@ -118,7 +118,19 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
const lastUrlsSet = new Set(lastUrls); const lastUrlsSet = new Set(lastUrls);
const univistedUrls = Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)); const crawler = crawlToCrawler(
job.data.crawl_id,
sc,
sc.originUrl!,
job.data.crawlerOptions,
);
const univistedUrls = crawler.filterLinks(
Array.from(lastUrlsSet).filter(x => !visitedUrls.has(x)),
Infinity,
sc.crawlerOptions.maxDepth ?? 10,
);
const addableJobCount = sc.crawlerOptions.limit === undefined ? Infinity : (sc.crawlerOptions.limit - await getDoneJobsOrderedLength(job.data.crawl_id)); const addableJobCount = sc.crawlerOptions.limit === undefined ? Infinity : (sc.crawlerOptions.limit - await getDoneJobsOrderedLength(job.data.crawl_id));
console.log(sc.originUrl!, univistedUrls, visitedUrls, lastUrls, addableJobCount); console.log(sc.originUrl!, univistedUrls, visitedUrls, lastUrls, addableJobCount);