diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts
index 60ca0e7f..7040a857 100644
--- a/apps/api/src/controllers/v0/crawl-status.ts
+++ b/apps/api/src/controllers/v0/crawl-status.ts
@@ -9,31 +9,53 @@ import * as Sentry from "@sentry/node";
 import { configDotenv } from "dotenv";
 import { Job } from "bullmq";
 import { toLegacyDocument } from "../v1/types";
+import type { DBJob, PseudoJob } from "../v1/crawl-status";
 configDotenv();
 
-export async function getJobs(crawlId: string, ids: string[]) {
-  const jobs = (
-    await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
-  ).filter((x) => x) as Job[];
-
-  if (process.env.USE_DB_AUTHENTICATION === "true") {
-    const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
-
-    supabaseData.forEach((x) => {
-      const job = jobs.find((y) => y.id === x.job_id);
-      if (job) {
-        job.returnvalue = x.docs;
+export async function getJobs(crawlId: string, ids: string[]): Promise<PseudoJob<any>[]> {
+  const [bullJobs, dbJobs] = await Promise.all([
+    Promise.all(ids.map((x) => getScrapeQueue().getJob(x))).then(x => x.filter(x => x)) as Promise<(Job & { id: string })[]>,
+    process.env.USE_DB_AUTHENTICATION === "true" ? await supabaseGetJobsByCrawlId(crawlId) : [],
+  ]);
+
+  const bullJobMap = new Map<string, PseudoJob<any>>();
+  const dbJobMap = new Map<string, DBJob>();
+
+  for (const job of bullJobs) {
+    bullJobMap.set(job.id, job);
+  }
+
+  for (const job of dbJobs) {
+    dbJobMap.set(job.job_id, job);
+  }
+
+  const jobs: PseudoJob<any>[] = [];
+
+  for (const id of ids) {
+    const bullJob = bullJobMap.get(id);
+    const dbJob = dbJobMap.get(id);
+
+    if (!bullJob && !dbJob) continue;
+
+    const data = dbJob?.docs ?? bullJob?.returnvalue;
+
+    const job: PseudoJob<any> = {
+      id,
+      getState: bullJob ? (() => bullJob.getState()) : (() => dbJob!.success ? "completed" : "failed"),
+      returnvalue: Array.isArray(data)
+        ? data[0]
+        : data,
+      data: {
+        scrapeOptions: bullJob ? bullJob.data.scrapeOptions : dbJob!.page_options,
+      },
+      timestamp: bullJob ? bullJob.timestamp : new Date(dbJob!.date_added).valueOf(),
+      failedReason: (bullJob ? bullJob.failedReason : dbJob!.message) || undefined,
     }
-    });
-  }
-
-  jobs.forEach((job) => {
-    job.returnvalue = Array.isArray(job.returnvalue)
-      ? job.returnvalue[0]
-      : job.returnvalue;
-  });
-
-  return jobs;
+
+    jobs.push(job);
+  }
+
+  return jobs;
 }
 
 export async function crawlStatusController(req: Request, res: Response) {
@@ -93,8 +115,9 @@ export async function crawlStatusController(req: Request, res: Response) {
   if (
     jobs.length > 0 &&
     jobs[0].data &&
-    jobs[0].data.pageOptions &&
-    !jobs[0].data.pageOptions.includeRawHtml
+    jobs[0].data.scrapeOptions &&
+    jobs[0].data.scrapeOptions.formats &&
+    !jobs[0].data.scrapeOptions.formats.includes("rawHtml")
   ) {
     data.forEach((item) => {
       if (item) {
diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts
index a27e791a..d43562b6 100644
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@@ -25,7 +25,7 @@ import { logger } from "../../lib/logger";
 import { supabase_service } from "../../services/supabase";
 configDotenv();
 
-type PseudoJob<T> = {
+export type PseudoJob<T> = {
   id: string,
   getState(): Promise<JobState | "unknown"> | JobState | "unknown",
   returnvalue: T | null,
@@ -33,9 +33,10 @@ type PseudoJob<T> = {
   data: {
     scrapeOptions: any,
   },
+  failedReason?: string,
 }
 
-type DBJob = { docs: any, success: boolean, page_options: any, date_added: any }
+export type DBJob = { docs: any, success: boolean, page_options: any, date_added: any, message: string | null }
 
 export async function getJob(id: string): Promise<PseudoJob<any> | null> {
   const [bullJob, dbJob] = await Promise.all([
@@ -57,6 +58,7 @@ export async function getJob(id: string): Promise<PseudoJob<any> | null> {
       scrapeOptions: bullJob ? bullJob.data.scrapeOptions : dbJob!.page_options,
     },
     timestamp: bullJob ? bullJob.timestamp : new Date(dbJob!.date_added).valueOf(),
+    failedReason: (bullJob ? bullJob.failedReason : dbJob!.message) || undefined,
   }
 
   return job;
@@ -99,6 +101,7 @@ export async function getJobs(ids: string[]): Promise<PseudoJob<any>[]> {
       scrapeOptions: bullJob ? bullJob.data.scrapeOptions : dbJob!.page_options,
     },
     timestamp: bullJob ? bullJob.timestamp : new Date(dbJob!.date_added).valueOf(),
+    failedReason: (bullJob ? bullJob.failedReason : dbJob!.message) || undefined,
   }
 
   jobs.push(job);