mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-04-22 14:09:45 +08:00

* feat: new snips test framework * Update mock.ts --------- Co-authored-by: Nicolas <nicolascamara29@gmail.com>
163 lines
4.1 KiB
TypeScript
163 lines
4.1 KiB
TypeScript
import { Logger } from "winston";
|
|
import * as Sentry from "@sentry/node";
|
|
import { z } from "zod";
|
|
|
|
import { robustFetch } from "../../lib/fetch";
|
|
import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
|
|
import { MockState } from "../../lib/mock";
|
|
|
|
const successSchema = z.object({
|
|
jobId: z.string(),
|
|
state: z.literal("completed"),
|
|
processing: z.literal(false),
|
|
|
|
// timeTaken: z.number(),
|
|
content: z.string(),
|
|
url: z.string().optional(),
|
|
|
|
pageStatusCode: z.number(),
|
|
pageError: z.string().optional(),
|
|
|
|
// TODO: this needs to be non-optional, might need fixes on f-e side to ensure reliability
|
|
responseHeaders: z.record(z.string(), z.string()).optional(),
|
|
|
|
// timeTakenCookie: z.number().optional(),
|
|
// timeTakenRequest: z.number().optional(),
|
|
|
|
// legacy: playwright only
|
|
screenshot: z.string().optional(),
|
|
|
|
// new: actions
|
|
screenshots: z.string().array().optional(),
|
|
actionContent: z
|
|
.object({
|
|
url: z.string(),
|
|
html: z.string(),
|
|
})
|
|
.array()
|
|
.optional(),
|
|
|
|
// chrome-cdp only -- file download handler
|
|
file: z
|
|
.object({
|
|
name: z.string(),
|
|
content: z.string(),
|
|
})
|
|
.optional()
|
|
.or(z.null()),
|
|
});
|
|
|
|
export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
|
|
|
|
const processingSchema = z.object({
|
|
jobId: z.string(),
|
|
state: z.enum([
|
|
"delayed",
|
|
"active",
|
|
"waiting",
|
|
"waiting-children",
|
|
"unknown",
|
|
"prioritized",
|
|
]),
|
|
processing: z.boolean(),
|
|
});
|
|
|
|
const failedSchema = z.object({
|
|
jobId: z.string(),
|
|
state: z.literal("failed"),
|
|
processing: z.literal(false),
|
|
error: z.string(),
|
|
});
|
|
|
|
export class StillProcessingError extends Error {
|
|
constructor(jobId: string) {
|
|
super("Job is still under processing", { cause: { jobId } });
|
|
}
|
|
}
|
|
|
|
export async function fireEngineCheckStatus(
|
|
logger: Logger,
|
|
jobId: string,
|
|
mock: MockState | null,
|
|
): Promise<FireEngineCheckStatusSuccess> {
|
|
const fireEngineURL = process.env.FIRE_ENGINE_BETA_URL!;
|
|
|
|
const status = await Sentry.startSpan(
|
|
{
|
|
name: "fire-engine: Check status",
|
|
attributes: {
|
|
jobId,
|
|
},
|
|
},
|
|
async (span) => {
|
|
return await robustFetch({
|
|
url: `${fireEngineURL}/scrape/${jobId}`,
|
|
method: "GET",
|
|
logger: logger.child({ method: "fireEngineCheckStatus/robustFetch" }),
|
|
headers: {
|
|
...(Sentry.isInitialized()
|
|
? {
|
|
"sentry-trace": Sentry.spanToTraceHeader(span),
|
|
baggage: Sentry.spanToBaggageHeader(span),
|
|
}
|
|
: {}),
|
|
},
|
|
mock,
|
|
});
|
|
},
|
|
);
|
|
|
|
const successParse = successSchema.safeParse(status);
|
|
const processingParse = processingSchema.safeParse(status);
|
|
const failedParse = failedSchema.safeParse(status);
|
|
|
|
if (successParse.success) {
|
|
logger.debug("Scrape succeeded!", { jobId });
|
|
return successParse.data;
|
|
} else if (processingParse.success) {
|
|
throw new StillProcessingError(jobId);
|
|
} else if (failedParse.success) {
|
|
logger.debug("Scrape job failed", { status, jobId });
|
|
if (
|
|
typeof status.error === "string" &&
|
|
status.error.includes("Chrome error: ")
|
|
) {
|
|
throw new SiteError(status.error.split("Chrome error: ")[1]);
|
|
} else if (
|
|
typeof status.error === "string" &&
|
|
status.error.includes("File size exceeds")
|
|
) {
|
|
throw new UnsupportedFileError(
|
|
"File size exceeds " + status.error.split("File size exceeds ")[1],
|
|
);
|
|
} else if (
|
|
typeof status.error === "string" &&
|
|
// TODO: improve this later
|
|
status.error.includes("Element")
|
|
) {
|
|
throw new ActionError(status.error.split("Error: ")[1]);
|
|
} else {
|
|
throw new EngineError("Scrape job failed", {
|
|
cause: {
|
|
status,
|
|
jobId,
|
|
},
|
|
});
|
|
}
|
|
} else {
|
|
logger.debug("Check status returned response not matched by any schema", {
|
|
status,
|
|
jobId,
|
|
});
|
|
throw new Error(
|
|
"Check status returned response not matched by any schema",
|
|
{
|
|
cause: {
|
|
status,
|
|
jobId,
|
|
},
|
|
},
|
|
);
|
|
}
|
|
}
|