From 877f072e3c51e8fe11258ff2635dc993692eeb21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Mon, 9 Dec 2024 23:40:44 +0100
Subject: [PATCH] feat: crawl log parser (poc)

---
 apps/api/logview.js                   | 23 +++++++++++++++++++++++
 apps/api/src/services/queue-worker.ts |  4 ++--
 2 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 apps/api/logview.js

diff --git a/apps/api/logview.js b/apps/api/logview.js
new file mode 100644
index 00000000..17032b2e
--- /dev/null
+++ b/apps/api/logview.js
@@ -0,0 +1,23 @@
+const fs = require("fs");
+
+const logs = fs.readFileSync("log-20780c8a-52f5-4af7-ac48-62997d11ec9b.log", "utf8")
+  .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
+
+const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];
+
+const urlFilter = x => new URL(x).pathname.slice(1) || "root"
+
+for (const crawlId of crawlIds) {
+  const crawlLogs = logs.filter(x => x.crawlId === crawlId);
+
+  const jobAdds = crawlLogs.filter(x => x.jobPriority !== undefined && x.message.startsWith("Added job for URL "));
+  const jobStarts = crawlLogs.filter(x => x.message.startsWith("🐂 Worker taking job"));
+
+  fs.writeFileSync(crawlId + ".md",
+    "```mermaid\nflowchart LR\n  " +
+    jobStarts.map(x => `${x.jobId}[${urlFilter(x.url)}]`).join("\n  ") + "\n  " +
+    jobAdds.map(x => `${x.jobId}[${urlFilter(jobStarts.find(y => y.jobId === x.jobId).url)}] --> ${x.newJobId}[${urlFilter(x.url)}]`).join("\n  ") +
+    "\n```\n\nURLs scraped: (" + jobStarts.length + ")\n" +
+    jobStarts.map(x => "- " + x.url).join("\n")
+  );
+}
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 78578395..9d25848b 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -346,7 +346,7 @@ workerFun(getScrapeQueue(), processJobInternal);
 
 async function processJob(job: Job & { id: string }, token: string) {
   const logger = _logger.child({ module: "queue-worker", method: "processJob", jobId: job.id, scrapeId: job.id, crawlId: job.data?.crawl_id ?? undefined });
-  logger.info(`🐂 Worker taking job ${job.id}`);
+  logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
 
   // Check if the job URL is researchhub and block it immediately
   // TODO: remove this once solve the root issue
@@ -505,7 +505,7 @@ async function processJob(job: Job & { id: string }, token: string) {
           );
 
           await addCrawlJob(job.data.crawl_id, jobId);
-          logger.debug("Added job for URL " + JSON.stringify(link), { jobPriority, url: link });
+          logger.debug("Added job for URL " + JSON.stringify(link), { jobPriority, url: link, newJobId: jobId });
         } else {
           logger.debug("Could not lock URL " + JSON.stringify(link), { url: link });
         }
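
For reference, logview.js assumes each line of the log file is a self-contained JSON object, with the child-logger metadata (`crawlId`, `jobId`) and the per-call metadata (`url`, `jobPriority`, `newJobId`) flattened into the same object. A minimal sketch of that shape — the field names and message prefixes come from the code above, but the IDs and URLs here are made up:

```js
// Illustrative only: three hand-written log lines in the shape logview.js
// parses. Field names match the logger calls patched into queue-worker.ts;
// the crawl/job IDs and URLs are invented for the example.
const sample = [
  // "🐂 Worker taking job" entries label the nodes (jobId + url)
  `{"crawlId":"c1","jobId":"a","message":"🐂 Worker taking job a","url":"https://example.com/"}`,
  // "Added job for URL" entries draw the edges (jobId --> newJobId)
  `{"crawlId":"c1","jobId":"a","newJobId":"b","jobPriority":20,"message":"Added job for URL \\"https://example.com/docs\\"","url":"https://example.com/docs"}`,
  `{"crawlId":"c1","jobId":"b","message":"🐂 Worker taking job b","url":"https://example.com/docs"}`,
].join("\n");

// Fed through logview.js (with its hardcoded filename pointed at this data),
// this would produce c1.md along the lines of:
//
//   flowchart LR
//     a[root]
//     b[docs]
//     a[root] --> b[docs]
//
//   URLs scraped: (2)
//   - https://example.com/
//   - https://example.com/docs
```

One caveat of the PoC worth noting: the edge mapping looks up the parent via `jobStarts.find(y => y.jobId === x.jobId).url`, so it assumes every job that added children also has a "taking job" entry in the same log; on a truncated log that lookup returns `undefined` and the script throws.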