diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 5e2ad1e9..7c3e6328 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -107,6 +107,7 @@ export async function crawlController(req: Request, res: Response) {
       pageOptions,
       team_id,
       robots,
+      createdAt: Date.now(),
     };
 
     await saveCrawl(id, sc);
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index d802f431..88d6b716 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -8,6 +8,7 @@ export type StoredCrawl = {
   team_id: string;
   robots?: string;
   cancelled?: boolean;
+  createdAt: number;
 };
 
 export async function saveCrawl(id: string, crawl: StoredCrawl) {
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index c49e697e..61983be0 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -44,7 +44,7 @@ export async function logJob(job: FirecrawlJob) {
       },
     ]);
 
-    if (process.env.POSTHOG_API_KEY) {
+    if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
       let phLog = {
         distinctId: "from-api", //* To identify this on the group level, setting distinctid to a static string per posthog docs: https://posthog.com/docs/product-analytics/group-analytics#advanced-server-side-only-capturing-group-events-without-a-user
         ...(job.team_id !== "preview" && {
@@ -65,7 +65,6 @@ export async function logJob(job: FirecrawlJob) {
           extractor_options: job.extractor_options,
           num_tokens: job.num_tokens,
           retry: job.retry,
-          crawl_id: job.crawl_id,
         },
       };
       posthog.capture(phLog);
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index e8439b5f..d47ad756 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -230,6 +230,21 @@ async function processJob(job: Job, token: string) {
 
       const fullDocs = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue);
 
+      await logJob({
+        job_id: job.data.crawl_id,
+        success: jobStatus === "completed",
+        message: message,
+        num_docs: fullDocs.length,
+        docs: [],
+        time_taken: (Date.now() - sc.createdAt) / 1000,
+        team_id: job.data.team_id,
+        mode: "crawl",
+        url: sc.originUrl,
+        crawlerOptions: sc.crawlerOptions,
+        pageOptions: sc.pageOptions,
+        origin: job.data.origin,
+      });
+
       const data = {
         success: jobStatus !== "failed",
         result: {
@@ -281,9 +296,11 @@ async function processJob(job: Job, token: string) {
       error:
         "Something went wrong... Contact help@mendable.ai or try again." /* etc... */,
     };
 
+    if (job.data.mode === "crawl" || job.data.crawl_id) {
       await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
+    }
 
     await logJob({
       job_id: job.id as string,
       success: false,
@@ -302,6 +319,28 @@ async function processJob(job: Job, token: string) {
       origin: job.data.origin,
       crawl_id: job.data.crawl_id,
     });
+
+    if (job.data.crawl_id) {
+      const sc = await getCrawl(job.data.crawl_id);
+
+      await logJob({
+        job_id: job.data.crawl_id,
+        success: false,
+        message:
+          typeof error === "string"
+            ? error
+            : error.message ?? "Something went wrong... Contact help@mendable.ai",
+        num_docs: 0,
+        docs: [],
+        time_taken: 0,
+        team_id: job.data.team_id,
+        mode: "crawl",
+        url: sc ? sc.originUrl : job.data.url,
+        crawlerOptions: sc ? sc.crawlerOptions : job.data.crawlerOptions,
+        pageOptions: sc ? sc.pageOptions : job.data.pageOptions,
+        origin: job.data.origin,
+      });
+    }
     // done(null, data);
     return data;
   }