diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts
index 4f65bdb1..28948570 100644
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@@ -1,68 +1,85 @@
 import { Response } from "express";
-import { v4 as uuidv4 } from "uuid";
-import { RequestWithAuth } from "./types";
+import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
+import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
+import { getScrapeQueue } from "../../services/queue-service";
+import { supabaseGetJobById } from "../../lib/supabase-jobs";

-export async function crawlStatusController(req: RequestWithAuth, res: Response) {
-  // const job = await getWebScraperQueue().getJob(req.params.jobId);
-  // if (!job) {
-  //   return res.status(404).json({ error: "Job not found" });
-  // }
+async function getJob(id: string) {
+  const job = await getScrapeQueue().getJob(id);
+  if (!job) return job;
+
+  if (process.env.USE_DB_AUTHENTICATION === "true") {
+    const supabaseData = await supabaseGetJobById(id);

-  // const { current, current_url, total, current_step, partialDocs } = await job.progress();
-
-  // let data = job.returnvalue;
-  // if (process.env.USE_DB_AUTHENTICATION === "true") {
-  //   const supabaseData = await supabaseGetJobById(req.params.jobId);
-
-  //   if (supabaseData) {
-  //     data = supabaseData.docs;
-  //   }
-  // }
-
-  // const jobStatus = await job.getState();
-
-  // mock:
-  const id = uuidv4();
-  const result = {
-    totalCount: 100,
-    creditsUsed: 2,
-    expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
-    status: "scraping", // scraping, completed, failed
-    next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
-    data: [{
-      markdown: "test",
-      content: "test",
-      html: "test",
-      rawHtml: "test",
-      linksOnPage: ["test1", "test2"],
-      screenshot: "test",
-      metadata: {
-        title: "test",
-        description: "test",
-        language: "test",
-        sourceURL: "test",
-        statusCode: 200,
-        error: "test"
-      }
-    },
-    {
-      markdown: "test",
-      content: "test",
-      html: "test",
-      rawHtml: "test",
-      linksOnPage: ["test1", "test2"],
-      screenshot: "test",
-      metadata: {
-        title: "test",
-        description: "test",
-        language: "test",
-        sourceURL: "test",
-        statusCode: 200,
-        error: "test"
-      }
-    }]
+    if (supabaseData) {
+      job.returnvalue = supabaseData.docs;
+    }
   }

-  res.status(200).json(result);
+  // the scrape result is stored as a one-element array; unwrap it before converting
+  job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
+
+  return job;
+}
+
+export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
+  const sc = await getCrawl(req.params.jobId);
+  if (!sc) {
+    return res.status(404).json({ success: false, error: "Job not found" });
+  }
+
+  if (sc.team_id !== req.auth.team_id) {
+    return res.status(403).json({ success: false, error: "Forbidden" });
+  }
+
+  const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
+  const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined;
+
+  const jobIDs = await getCrawlJobs(req.params.jobId);
+  const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
+  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
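+  // Pagination window: `skip`/`limit` come from the query string. When no
+  // limit is given, `end` stays undefined and the 10 MB size cap below applies.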
"failed" : "scraping"; + const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId); + const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1); + + let doneJobs = []; + + if (end === undefined) { // determine 10 megabyte limit + let bytes = 0, used = 0; + + while (bytes < 10485760 && used < doneJobsOrder.length) { + const job = await getJob(doneJobsOrder[used]); + + doneJobs.push(job); + bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length; + used++; + } + + doneJobs.splice(doneJobs.length - 1, 1); + used--; + } else { + doneJobs = (await Promise.all(doneJobsOrder.map(async x => await getJob(x)))); + } + + const data = doneJobs.map(x => x.returnvalue); + + const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`); + + nextURL.searchParams.set("skip", (start + data.length).toString()); + + if (typeof req.query.limit === "string") { + nextURL.searchParams.set("limit", req.query.limit); + } + + res.status(200).json({ + status, + totalCount: jobIDs.length, + creditsUsed: jobIDs.length, + expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(), + next: + status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this + ? undefined + : nextURL.href, + data: data.map(x => legacyDocumentConverter(x)), + }); } diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 52fe22e6..ebdcfba3 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -7,7 +7,7 @@ import { getScrapeQueue } from "../../services/queue-service"; import { addScrapeJob } from "../../services/queue-jobs"; import { Logger } from "../../lib/logger"; -export async function crawlController(req: RequestWithAuth, res: Response) { +export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, res: Response) { req.body = crawlRequestSchema.parse(req.body); const id = uuidv4(); diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 49bc86ed..77167b2a 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -4,7 +4,7 @@ import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types"; import { checkTeamCredits } from "../../services/billing/credit_billing"; -export async function mapController(req: RequestWithAuth, res: Response) { +export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response) { req.body = mapRequestSchema.parse(req.body); console.log(req.body); // expected req.body diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 3a7d0e32..449f9106 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; import { Logger } from '../../lib/logger'; -import { Document, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types"; +import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types"; import { billTeam } from "../../services/billing/credit_billing"; import { v4 as uuidv4 } from 'uuid'; import { numTokensFromString } from "../../lib/LLM-extraction/helpers"; @@ -8,7 +8,7 @@ import { addScrapeJob } from 
"../../services/queue-jobs"; import { scrapeQueueEvents } from '../../services/queue-service'; import { logJob } from "../../services/logging/log_job"; -export async function scrapeController(req: RequestWithAuth, res: Response) { +export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response) { req.body = scrapeRequestSchema.parse(req.body); let earlyReturn = false; @@ -101,20 +101,6 @@ export async function scrapeController(req: RequestWithAuth extends Request<{}, ReqBody, ResBody> { +export interface RequestWithMaybeAuth extends Request { auth?: AuthObject; } -export interface RequestWithAuth extends Request<{}, ReqBody, ResBody> { +export interface RequestWithAuth extends Request { auth: AuthObject; } @@ -211,3 +224,20 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { parsePDF: x.parsePDF }; } + +export function legacyDocumentConverter(doc: any): Document { + return { + markdown: doc.markdown, + links: doc.linksOnPage, + rawHtml: doc.rawHtml, + html: doc.html, + screenshot: doc.screenshot ?? doc.fullPageScreenshot, + metadata: { + ...doc.metadata, + pageError: undefined, + pageStatusCode: undefined, + error: doc.metadata.pageError, + statusCode: doc.metadata.pageStatusCode, + }, + } +} diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 9e8a0cf6..9d8802c7 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -26,6 +26,13 @@ export async function getCrawl(id: string): Promise { return JSON.parse(x); } +export async function getCrawlExpiry(id: string): Promise { + const d = new Date(); + const ttl = await redisConnection.ttl(id); + d.setSeconds(d.getSeconds() + ttl); + return d; +} + export async function addCrawlJob(id: string, job_id: string) { await redisConnection.sadd("crawl:" + id + ":jobs", job_id); await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); @@ -38,7 +45,17 @@ export async function addCrawlJobs(id: string, job_ids: string[]) { export async function addCrawlJobDone(id: string, job_id: string) { await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id); + await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id); await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX"); +} + +export async function getDoneJobsOrderedLength(id: string): Promise { + return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered"); +} + +export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise { + return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end); } export async function isCrawlFinished(id: string) { diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index c3ea8633..b2ef5958 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -60,12 +60,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) if (!isIdempotencyValid) { return res.status(409).json({ success: false, error: "Idempotency key already used" }); } - // try { createIdempotencyKey(req); - // } catch (error) { - // Logger.error(error); - // return res.status(500).json({ success: false, error: error.message }); - // } } next(); })() @@ -128,7 +123,18 @@ v1Router.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Res res.status(400).json({ success: false, error: "Bad Request", details: err.errors }); } 
   await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
+  await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
+}
+
+export async function getDoneJobsOrderedLength(id: string): Promise<number> {
+  return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
+}
+
+export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
+  return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end);
 }

 export async function isCrawlFinished(id: string) {
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index c3ea8633..b2ef5958 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -60,12 +60,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
       if (!isIdempotencyValid) {
         return res.status(409).json({ success: false, error: "Idempotency key already used" });
       }
-      // try {
       createIdempotencyKey(req);
-      // } catch (error) {
-      //   Logger.error(error);
-      //   return res.status(500).json({ success: false, error: error.message });
-      // }
     }
     next();
   })()
@@ -128,7 +123,18 @@ v1Router.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Res
     res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
   } else {
     const id = uuidv4();
-    Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + JSON.stringify(err));
+    let verbose = JSON.stringify(err);
+    if (verbose === "{}") {
+      // Error fields (message, name, stack) are non-enumerable, so
+      // JSON.stringify(err) yields "{}"; serialize them explicitly instead.
+      if (err instanceof Error) {
+        verbose = JSON.stringify({
+          message: err.message,
+          name: err.name,
+          stack: err.stack,
+        });
+      }
+    }
+
+    Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
     res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + "" });
   }
 });
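
// ---------------------------------------------------------------------------
// Usage sketch (not part of the diff above): one way a client could consume
// the paginated GET /v1/crawl/:jobId endpoint introduced in crawl-status.ts.
// The host, auth header, and polling delay are assumptions for illustration;
// the skip/`next` semantics follow the controller code in this change.
async function fetchAllCrawlDocuments(jobId: string, apiKey: string): Promise<unknown[]> {
  const docs: unknown[] = [];
  let url: string | undefined = `https://api.firecrawl.dev/v1/crawl/${jobId}`;

  while (url) {
    const res = await fetch(url, { headers: { Authorization: `Bearer ${apiKey}` } });
    const body = await res.json();
    if (!res.ok) throw new Error(body.error ?? "crawl status request failed");

    docs.push(...(body.data ?? []));

    if (body.status === "scraping") {
      // while the crawl is still running, `next` stays set; wait before polling again
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }

    url = body.next; // undefined once all documents have been returned
  }

  return docs;
}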