mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 20:49:02 +08:00
feat: crawl log parser (poc)
This commit is contained in:
parent
4dbe0e6236
commit
877f072e3c
23
apps/api/logview.js
Normal file
23
apps/api/logview.js
Normal file
@@ -0,0 +1,23 @@
|
||||
const fs = require("fs");
|
||||
|
||||
// Path of the newline-delimited JSON log to analyse. Taken from the first
// command-line argument when one is given; otherwise falls back to the log
// file this proof of concept was originally written against.
const logPath = process.argv[2] || "log-20780c8a-52f5-4af7-ac48-62997d11ec9b.log";

// Every non-blank line of the file is parsed as one JSON log entry.
// A malformed line makes JSON.parse throw, which aborts the script.
const logs = fs.readFileSync(logPath, "utf8")
  .split("\n")
  .filter(line => line.trim().length > 0)
  .map(line => JSON.parse(line));
|
||||
|
||||
// Distinct crawl ids found in the log; entries whose crawlId is missing or
// otherwise falsy are ignored.
const crawlIds = Array.from(new Set(logs.map(entry => entry.crawlId).filter(Boolean)));
|
||||
|
||||
// Turns an absolute URL into a short label: its path without the leading
// slash, or "root" when nothing is left. Throws if the input is not a valid
// absolute URL (the URL constructor rejects it).
const urlFilter = x => {
  const { pathname } = new URL(x);
  const label = pathname.slice(1);
  return label || "root";
};
|
||||
|
||||
// For every crawl found in the log, write "<crawlId>.md" containing a Mermaid
// flowchart (one node per started job, one edge per job that was added by
// another job) followed by the list of URLs that were scraped.
for (const crawlId of crawlIds) {
  const crawlLogs = logs.filter(x => x.crawlId === crawlId);

  // "Added job" entries carry the id of the newly queued job in newJobId,
  // while jobId is the id attached to the entry itself (the adding job).
  const jobAdds = crawlLogs.filter(x => x.jobPriority !== undefined && x.message.startsWith("Added job for URL "));
  const jobStarts = crawlLogs.filter(x => x.message.startsWith("🐂 Worker taking job"));

  // Index start entries by job id so each edge resolves its parent in O(1).
  // Only the first entry per id is kept, matching Array.prototype.find.
  const startByJobId = new Map();
  for (const start of jobStarts) {
    if (!startByJobId.has(start.jobId)) {
      startByJobId.set(start.jobId, start);
    }
  }

  const nodeLines = jobStarts.map(x => `${x.jobId}[${urlFilter(x.url)}]`);

  const edgeLines = jobAdds.map(x => {
    const parent = startByJobId.get(x.jobId);
    // The log may lack the parent's start entry (e.g. a truncated log). Keep
    // the edge, but reference the parent by bare id instead of crashing on
    // an undefined lookup.
    const from = parent ? `${x.jobId}[${urlFilter(parent.url)}]` : `${x.jobId}`;
    return `${from} --> ${x.newJobId}[${urlFilter(x.url)}]`;
  });

  fs.writeFileSync(crawlId + ".md",
    "```mermaid\nflowchart LR\n "
    + nodeLines.join("\n ") + "\n "
    + edgeLines.join("\n ")
    + "\n```\n\nURLs scraped: (" + jobStarts.length + ")\n"
    + jobStarts.map(x => "- " + x.url).join("\n")
  );
}
|
@@ -346,7 +346,7 @@ workerFun(getScrapeQueue(), processJobInternal);
|
||||
|
||||
async function processJob(job: Job & { id: string }, token: string) {
|
||||
const logger = _logger.child({ module: "queue-worker", method: "processJob", jobId: job.id, scrapeId: job.id, crawlId: job.data?.crawl_id ?? undefined });
|
||||
logger.info(`🐂 Worker taking job ${job.id}`);
|
||||
logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
|
||||
|
||||
// Check if the job URL is researchhub and block it immediately
|
||||
// TODO: remove this once solve the root issue
|
||||
@@ -505,7 +505,7 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
);
|
||||
|
||||
await addCrawlJob(job.data.crawl_id, jobId);
|
||||
logger.debug("Added job for URL " + JSON.stringify(link), { jobPriority, url: link });
|
||||
logger.debug("Added job for URL " + JSON.stringify(link), { jobPriority, url: link, newJobId: jobId });
|
||||
} else {
|
||||
logger.debug("Could not lock URL " + JSON.stringify(link), { url: link });
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user