From 258c67ce673986e8354eecc2e592c7c0f9abd7dd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 1 Oct 2024 14:20:23 -0300 Subject: [PATCH] Revert "feat(queue-worker): always crawl links from content even if sitemapped" This reverts commit 3c045c43a446bb7895892338c881cd7bc4f77cbf. --- apps/api/src/services/queue-worker.ts | 70 +++++++++++++++------------ 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 672f8b76..37e14baf 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -300,41 +300,49 @@ async function processJob(job: Job, token: string) { const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; - if (!sc.cancelled) { - const crawler = crawlToCrawler(job.data.crawl_id, sc); + if (!job.data.sitemapped) { + if (!sc.cancelled) { + const crawler = crawlToCrawler(job.data.crawl_id, sc); - const links = crawler.filterLinks( - crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl), - Infinity, - sc.crawlerOptions?.maxDepth ?? 10 - ); + const links = crawler.filterLinks( + crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl), + Infinity, + sc.crawlerOptions?.maxDepth ?? 10 + ); - for (const link of links) { - if (await lockURL(job.data.crawl_id, sc, link)) { - const jobPriority = await getJobPriority({ - plan: sc.plan as PlanType, - team_id: sc.team_id, - basePriority: job.data.crawl_id ? 
20 : 10, - }); - const jobId = uuidv4(); - - const newJob = await addScrapeJob( - { - url: link, - mode: "single_urls", - crawlerOptions: sc.crawlerOptions, + for (const link of links) { + if (await lockURL(job.data.crawl_id, sc, link)) { + // This seems to work really well + const jobPriority = await getJobPriority({ + plan: sc.plan as PlanType, team_id: sc.team_id, - pageOptions: sc.pageOptions, - origin: job.data.origin, - crawl_id: job.data.crawl_id, - v1: job.data.v1, - }, - {}, - jobId, - jobPriority - ); + basePriority: job.data.crawl_id ? 20 : 10, + }); + const jobId = uuidv4(); - await addCrawlJob(job.data.crawl_id, newJob.id); + // console.log("plan: ", sc.plan); + // console.log("team_id: ", sc.team_id) + // console.log("base priority: ", job.data.crawl_id ? 20 : 10) + // console.log("job priority: " , jobPriority, "\n\n\n") + + const newJob = await addScrapeJob( + { + url: link, + mode: "single_urls", + crawlerOptions: sc.crawlerOptions, + team_id: sc.team_id, + pageOptions: sc.pageOptions, + origin: job.data.origin, + crawl_id: job.data.crawl_id, + v1: job.data.v1, + }, + {}, + jobId, + jobPriority + ); + + await addCrawlJob(job.data.crawl_id, newJob.id); + } } } }