crawl status and document stuff

commit f20328bdbb
parent 0c057bb649
@@ -1,68 +1,85 @@
 import { Response } from "express";
-import { v4 as uuidv4 } from "uuid";
-import { RequestWithAuth } from "./types";
-
-export async function crawlStatusController(req: RequestWithAuth, res: Response) {
-  // const job = await getWebScraperQueue().getJob(req.params.jobId);
-  // if (!job) {
-  //   return res.status(404).json({ error: "Job not found" });
-  // }
-
-  // const { current, current_url, total, current_step, partialDocs } = await job.progress();
-  // let data = job.returnvalue;
-  // if (process.env.USE_DB_AUTHENTICATION === "true") {
-  //   const supabaseData = await supabaseGetJobById(req.params.jobId);
-
-  //   if (supabaseData) {
-  //     data = supabaseData.docs;
-  //   }
-  // }
-
-  // const jobStatus = await job.getState();
-
-  // mock:
-  const id = uuidv4();
-  const result = {
-    totalCount: 100,
-    creditsUsed: 2,
-    expiresAt: new Date(Date.now() + 24 * 60 * 60 * 1000).getTime(),
-    status: "scraping", // scraping, completed, failed
-    next: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
-    data: [{
-      markdown: "test",
-      content: "test",
-      html: "test",
-      rawHtml: "test",
-      linksOnPage: ["test1", "test2"],
-      screenshot: "test",
-      metadata: {
-        title: "test",
-        description: "test",
-        language: "test",
-        sourceURL: "test",
-        statusCode: 200,
-        error: "test"
-      }
-    },
-    {
-      markdown: "test",
-      content: "test",
-      html: "test",
-      rawHtml: "test",
-      linksOnPage: ["test1", "test2"],
-      screenshot: "test",
-      metadata: {
-        title: "test",
-        description: "test",
-        language: "test",
-        sourceURL: "test",
-        statusCode: 200,
-        error: "test"
-      }
-    }]
-  }
-
-  res.status(200).json(result);
+import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
+import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
+import { getScrapeQueue } from "../../services/queue-service";
+import { supabaseGetJobById } from "../../lib/supabase-jobs";
+
+async function getJob(id: string) {
+  console.log("getting job", id);
+  const job = await getScrapeQueue().getJob(id);
+  if (!job) return job;
+
+  if (process.env.USE_DB_AUTHENTICATION === "true") {
+    const supabaseData = await supabaseGetJobById(id);
+
+    if (supabaseData) {
+      job.returnvalue = supabaseData.docs;
+    }
+  }
+
+  job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
+
+  return job;
+}
+
+export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
+  const sc = await getCrawl(req.params.jobId);
+  if (!sc) {
+    return res.status(404).json({ success: false, error: "Job not found" });
+  }
+
+  if (sc.team_id !== req.auth.team_id) {
+    return res.status(403).json({ success: false, error: "Forbidden" });
+  }
+
+  const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
+  const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined;
+
+  const jobIDs = await getCrawlJobs(req.params.jobId);
+  const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
+  const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
+  const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
+  const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
+
+  let doneJobs = [];
+
+  if (end === undefined) { // determine 10 megabyte limit
+    let bytes = 0, used = 0;
+
+    while (bytes < 10485760 && used < doneJobsOrder.length) {
+      const job = await getJob(doneJobsOrder[used]);
+
+      doneJobs.push(job);
+      bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
+      used++;
+    }
+
+    doneJobs.splice(doneJobs.length - 1, 1);
+    used--;
+  } else {
+    doneJobs = (await Promise.all(doneJobsOrder.map(async x => await getJob(x))));
+  }
+
+  const data = doneJobs.map(x => x.returnvalue);
+
+  const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);
+
+  nextURL.searchParams.set("skip", (start + data.length).toString());
+
+  if (typeof req.query.limit === "string") {
+    nextURL.searchParams.set("limit", req.query.limit);
+  }
+
+  res.status(200).json({
+    status,
+    totalCount: jobIDs.length,
+    creditsUsed: jobIDs.length,
+    expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
+    next:
+      status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
+        ? undefined
+        : nextURL.href,
+    data: data.map(x => legacyDocumentConverter(x)),
+  });
 }
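The new controller pages documents with skip/limit query parameters and returns a next URL until every document has been served. A minimal client-side sketch of how that pagination could be consumed (illustrative only, not part of this commit; the function name, base URL, and API key handling are assumptions):

    // Illustrative sketch: drain a crawl's documents by following the next URL
    // returned by GET /v1/crawl/:jobId.
    async function fetchAllCrawlDocuments(baseUrl: string, apiKey: string, crawlId: string): Promise<unknown[]> {
      const docs: unknown[] = [];
      let url: string | undefined = `${baseUrl}/v1/crawl/${crawlId}?limit=50`;

      while (url) {
        const res = await fetch(url, { headers: { Authorization: `Bearer ${apiKey}` } });
        const body = await res.json();
        if (body.error) throw new Error(body.error); // ErrorResponse branch
        docs.push(...body.data);
        url = body.next; // omitted once the crawl is finished and fully consumed
      }

      return docs;
    }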
@@ -7,7 +7,7 @@ import { getScrapeQueue } from "../../services/queue-service";
 import { addScrapeJob } from "../../services/queue-jobs";
 import { Logger } from "../../lib/logger";
 
-export async function crawlController(req: RequestWithAuth<CrawlResponse, CrawlRequest>, res: Response<CrawlResponse>) {
+export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, res: Response<CrawlResponse>) {
   req.body = crawlRequestSchema.parse(req.body);
 
   const id = uuidv4();
@@ -4,7 +4,7 @@ import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
 import { MapRequest, mapRequestSchema, MapResponse, RequestWithAuth } from "./types";
 import { checkTeamCredits } from "../../services/billing/credit_billing";
 
-export async function mapController(req: RequestWithAuth<MapResponse, MapRequest>, res: Response<MapResponse>) {
+export async function mapController(req: RequestWithAuth<{}, MapResponse, MapRequest>, res: Response<MapResponse>) {
   req.body = mapRequestSchema.parse(req.body);
   console.log(req.body);
   // expected req.body
@@ -1,6 +1,6 @@
 import { Request, Response } from "express";
 import { Logger } from '../../lib/logger';
-import { Document, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
 import { billTeam } from "../../services/billing/credit_billing";
 import { v4 as uuidv4 } from 'uuid';
 import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
@@ -8,7 +8,7 @@ import { addScrapeJob } from "../../services/queue-jobs";
 import { scrapeQueueEvents } from '../../services/queue-service';
 import { logJob } from "../../services/logging/log_job";
 
-export async function scrapeController(req: RequestWithAuth<ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
+export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
   req.body = scrapeRequestSchema.parse(req.body);
   let earlyReturn = false;
 
@@ -101,20 +101,6 @@ export async function scrapeController(req: RequestWithAuth<ScrapeResponse, Scra
 
   return res.status(200).json({
     success: true,
-    data: {
-      markdown: doc.markdown,
-      links: doc.linksOnPage,
-      rawHtml: doc.rawHtml,
-      html: doc.html,
-      screenshot: doc.screenshot,
-      fullPageScreenshot: doc.fullPageScreenshot,
-      metadata: {
-        ...doc.metadata,
-        pageError: undefined,
-        pageStatusCode: undefined,
-        error: doc.metadata.pageError,
-        statusCode: doc.metadata.pageStatusCode,
-      },
-    } as Document
+    data: legacyDocumentConverter(doc),
   });
 }
@@ -171,16 +171,29 @@ export type MapResponse = ErrorResponse | {
   links: string[];
 }
 
+export type CrawlStatusParams = {
+  jobId: string;
+}
+
+export type CrawlStatusResponse = ErrorResponse | {
+  status: "scraping" | "completed" | "failed" | "cancelled",
+  totalCount: number;
+  creditsUsed: number;
+  expiresAt: string;
+  next?: string;
+  data: Document[];
+}
+
 type AuthObject = {
   team_id: string;
   plan: string;
 }
 
-export interface RequestWithMaybeAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
+export interface RequestWithMaybeAuth<ReqParams = {}, ReqBody = undefined, ResBody = undefined> extends Request<ReqParams, ReqBody, ResBody> {
   auth?: AuthObject;
 }
 
-export interface RequestWithAuth<ReqBody = undefined, ResBody = undefined> extends Request<{}, ReqBody, ResBody> {
+export interface RequestWithAuth<ReqParams = {}, ReqBody = undefined, ResBody = undefined> extends Request<ReqParams, ReqBody, ResBody> {
   auth: AuthObject;
 }
 
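The new ReqParams generic on RequestWithAuth and RequestWithMaybeAuth is what lets controllers type their route parameters, e.g. req.params.jobId in the crawl status controller. A small sketch of the intended usage, mirroring the signature used in the diff (the handler itself is illustrative only):

    import { Response } from "express";

    // Illustrative only: the first generic now types req.params.
    async function exampleStatusHandler(
      req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>,
      res: Response<CrawlStatusResponse>,
    ) {
      const jobId: string = req.params.jobId; // typed through CrawlStatusParams
      // ...look up the crawl for jobId and reply with a CrawlStatusResponse, as crawlStatusController does
    }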
@@ -211,3 +224,20 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
     parsePDF: x.parsePDF
   };
 }
+
+export function legacyDocumentConverter(doc: any): Document {
+  return {
+    markdown: doc.markdown,
+    links: doc.linksOnPage,
+    rawHtml: doc.rawHtml,
+    html: doc.html,
+    screenshot: doc.screenshot ?? doc.fullPageScreenshot,
+    metadata: {
+      ...doc.metadata,
+      pageError: undefined,
+      pageStatusCode: undefined,
+      error: doc.metadata.pageError,
+      statusCode: doc.metadata.pageStatusCode,
+    },
+  }
+}
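legacyDocumentConverter maps the legacy scraper output (linksOnPage, fullPageScreenshot, metadata.pageStatusCode, metadata.pageError) onto the v1 Document shape, so the scrape controller and the crawl status controller can return the same format. A sketch of the mapping on made-up input (sample values only):

    const legacyDoc = {
      markdown: "# Example",
      html: "<h1>Example</h1>",
      rawHtml: "<html>...</html>",
      linksOnPage: ["https://example.com/a"],
      fullPageScreenshot: "https://example.com/shot.png",
      metadata: { title: "Example", pageStatusCode: 200, pageError: undefined },
    };

    const doc = legacyDocumentConverter(legacyDoc);
    // doc.links               -> legacyDoc.linksOnPage
    // doc.screenshot          -> legacyDoc.fullPageScreenshot (fallback, since legacyDoc.screenshot is missing)
    // doc.metadata.statusCode -> 200, doc.metadata.error -> undefined
    // pageStatusCode and pageError are blanked out on the converted metadata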
@@ -26,6 +26,13 @@ export async function getCrawl(id: string): Promise<StoredCrawl | null> {
   return JSON.parse(x);
 }
 
+export async function getCrawlExpiry(id: string): Promise<Date> {
+  const d = new Date();
+  const ttl = await redisConnection.ttl(id);
+  d.setSeconds(d.getSeconds() + ttl);
+  return d;
+}
+
 export async function addCrawlJob(id: string, job_id: string) {
   await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
   await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
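getCrawlExpiry converts the remaining Redis TTL of the crawl key into an absolute Date, which the status endpoint then reports as expiresAt. Illustrative use (the id is a placeholder):

    // With roughly 24 hours of TTL left on the key, this resolves to a Date about a day from now.
    const expiresAt = await getCrawlExpiry("example-crawl-id");
    console.log(expiresAt.toISOString()); // ISO timestamp ~24h in the future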
@@ -38,7 +45,17 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
 
 export async function addCrawlJobDone(id: string, job_id: string) {
   await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
+  await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id);
   await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
+  await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
+}
+
+export async function getDoneJobsOrderedLength(id: string): Promise<number> {
+  return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
+}
+
+export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
+  return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end);
 }
 
 export async function isCrawlFinished(id: string) {
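The new crawl:<id>:jobs_done_ordered list is what the status endpoint paginates over: skip/limit are translated into an LRANGE window. A sketch of that correspondence (key name taken from the diff; the indices are just an example):

    // ?skip=20&limit=10 becomes start = 20 and end = 29 in the controller,
    // and getDoneJobsOrdered then runs LRANGE crawl:<id>:jobs_done_ordered 20 29.
    const crawlId = "example-crawl-id"; // placeholder
    const start = 20;
    const end = start + 10 - 1; // 29
    const pageOfJobIds = await getDoneJobsOrdered(crawlId, start, end); // up to 10 job IDs; given the lpush above, index 0 is the most recently finished job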
@@ -60,12 +60,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
     if (!isIdempotencyValid) {
       return res.status(409).json({ success: false, error: "Idempotency key already used" });
     }
-    // try {
     createIdempotencyKey(req);
-    // } catch (error) {
-    //   Logger.error(error);
-    //   return res.status(500).json({ success: false, error: error.message });
-    // }
   }
   next();
 })()
@@ -128,7 +123,18 @@ v1Router.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Res
     res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
   } else {
     const id = uuidv4();
-    Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + JSON.stringify(err));
+    let verbose = JSON.stringify(err);
+    if (verbose === "{}") {
+      if (err instanceof Error) {
+        verbose = JSON.stringify({
+          message: err.message,
+          name: err.name,
+          stack: err.stack,
+        });
+      }
+    }
+
+    Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
     res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id + "" });
   }
 });
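The verbose fallback is needed because an Error's own properties (message, name, stack) are non-enumerable, so JSON.stringify on a bare Error yields "{}". A quick illustration (not part of the commit):

    const e = new Error("boom");
    JSON.stringify(e);                                                     // "{}"
    JSON.stringify({ message: e.message, name: e.name, stack: e.stack }); // contains the useful details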