mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 04:15:53 +08:00
Update index.ts
This commit is contained in:
parent
b8d7f9f257
commit
f9d55efba8
@ -13,102 +13,46 @@ import path from "node:path";
|
|||||||
|
|
||||||
type PDFProcessorResult = { html: string; markdown?: string };
|
type PDFProcessorResult = { html: string; markdown?: string };
|
||||||
|
|
||||||
|
const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB
|
||||||
|
|
||||||
async function scrapePDFWithRunPodMU(
|
async function scrapePDFWithRunPodMU(
|
||||||
meta: Meta,
|
meta: Meta,
|
||||||
tempFilePath: string,
|
tempFilePath: string,
|
||||||
timeToRun: number | undefined,
|
timeToRun: number | undefined,
|
||||||
|
base64Content: string,
|
||||||
): Promise<PDFProcessorResult> {
|
): Promise<PDFProcessorResult> {
|
||||||
meta.logger.debug("Processing PDF document with RunPod MU", {
|
meta.logger.debug("Processing PDF document with RunPod MU", {
|
||||||
tempFilePath,
|
tempFilePath,
|
||||||
});
|
});
|
||||||
|
|
||||||
const fileStat = await stat(tempFilePath);
|
|
||||||
if (fileStat.size > ((2**10)**2)*10) {
|
|
||||||
throw new UnsupportedFileError("File is larger than PDF parser limit (10MiB)");
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(tempFilePath);
|
const result = await robustFetch({
|
||||||
|
url:
|
||||||
const upload = await robustFetch({
|
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
|
||||||
url: "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/run",
|
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
|
Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
|
||||||
},
|
},
|
||||||
body: {
|
body: {
|
||||||
input: {
|
input: {
|
||||||
file_content: (await readFile(tempFilePath)).toString("base64"),
|
file_content: base64Content,
|
||||||
filename: path.basename(tempFilePath) + ".pdf",
|
filename: path.basename(tempFilePath) + ".pdf",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
logger: meta.logger.child({
|
logger: meta.logger.child({
|
||||||
method: "scrapePDFWithRunPodMU/upload/robustFetch",
|
method: "scrapePDFWithRunPodMU/robustFetch",
|
||||||
}),
|
}),
|
||||||
schema: z.object({
|
schema: z.object({
|
||||||
id: z.string(),
|
|
||||||
}),
|
|
||||||
});
|
|
||||||
|
|
||||||
const jobId = upload.id;
|
|
||||||
|
|
||||||
// TODO: timeout, retries
|
|
||||||
const startedAt = Date.now();
|
|
||||||
const timeout = timeToRun ?? 300000;
|
|
||||||
|
|
||||||
while (Date.now() <= startedAt + timeout) {
|
|
||||||
try {
|
|
||||||
const result = await robustFetch({
|
|
||||||
url: `https://api.runpod.ai/v2/${process.env.RUNPOD_MU_POD_ID}/status/${jobId}`,
|
|
||||||
method: "GET",
|
|
||||||
headers: {
|
|
||||||
Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
|
|
||||||
},
|
|
||||||
logger: meta.logger.child({
|
|
||||||
method: "scrapePDFWithRunPodMU/result/robustFetch",
|
|
||||||
}),
|
|
||||||
schema: z.object({
|
|
||||||
status: z.string(),
|
|
||||||
error: z.any().optional(),
|
|
||||||
output: z.object({
|
output: z.object({
|
||||||
markdown: z.string(),
|
markdown: z.string(),
|
||||||
}).optional(),
|
}),
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
if (result.status === "COMPLETED") {
|
|
||||||
return {
|
return {
|
||||||
markdown: result.output!.markdown,
|
markdown: result.output.markdown,
|
||||||
html: await marked.parse(result.output!.markdown, { async: true }),
|
html: await marked.parse(result.output.markdown, { async: true }),
|
||||||
};
|
};
|
||||||
}
|
|
||||||
|
|
||||||
if (result.status === "FAILED") {
|
|
||||||
throw new Error("RunPod MU failed to parse PDF: " + result.error!, { cause: result.error });
|
|
||||||
}
|
|
||||||
|
|
||||||
// result not up yet
|
|
||||||
} catch (e) {
|
|
||||||
if (e instanceof Error && e.message === "Request sent failure status") {
|
|
||||||
// if ((e.cause as any).response.status === 404) {
|
|
||||||
// // no-op, result not up yet
|
|
||||||
// } else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) {
|
|
||||||
// // URL is not a PDF, actually!
|
|
||||||
// meta.logger.debug("URL is not actually a PDF, signalling...");
|
|
||||||
// throw new RemoveFeatureError(["pdf"]);
|
|
||||||
// } else {
|
|
||||||
throw new Error("RunPod MU threw an error", {
|
|
||||||
cause: e.cause,
|
|
||||||
});
|
|
||||||
// }
|
|
||||||
} else {
|
|
||||||
throw e;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
await new Promise<void>((resolve) => setTimeout(() => resolve(), 250));
|
|
||||||
}
|
|
||||||
|
|
||||||
throw new Error("RunPod MU timed out");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapePDFWithParsePDF(
|
async function scrapePDFWithParsePDF(
|
||||||
@ -146,21 +90,14 @@ export async function scrapePDF(
|
|||||||
|
|
||||||
let result: PDFProcessorResult | null = null;
|
let result: PDFProcessorResult | null = null;
|
||||||
|
|
||||||
// First, try parsing with PdfParse
|
const base64Content = (await readFile(tempFilePath)).toString("base64");
|
||||||
result = await scrapePDFWithParsePDF(
|
|
||||||
{
|
|
||||||
...meta,
|
|
||||||
logger: meta.logger.child({
|
|
||||||
method: "scrapePDF/scrapePDFWithParsePDF",
|
|
||||||
}),
|
|
||||||
},
|
|
||||||
tempFilePath,
|
|
||||||
);
|
|
||||||
|
|
||||||
// Then, if output is too short, pass to RunPod MU
|
// Then, if output is too short, pass to RunPod MU
|
||||||
if (
|
if (
|
||||||
result.markdown && result.markdown.length < 500 &&
|
// result.markdown && result.markdown.length < 500 &&
|
||||||
process.env.RUNPOD_MU_API_KEY && process.env.RUNPOD_MU_POD_ID
|
base64Content.length < MAX_FILE_SIZE &&
|
||||||
|
process.env.RUNPOD_MU_API_KEY &&
|
||||||
|
process.env.RUNPOD_MU_POD_ID
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
const muResult = await scrapePDFWithRunPodMU(
|
const muResult = await scrapePDFWithRunPodMU(
|
||||||
@ -172,6 +109,7 @@ export async function scrapePDF(
|
|||||||
},
|
},
|
||||||
tempFilePath,
|
tempFilePath,
|
||||||
timeToRun,
|
timeToRun,
|
||||||
|
base64Content,
|
||||||
);
|
);
|
||||||
result = muResult; // Use LlamaParse result if successful
|
result = muResult; // Use LlamaParse result if successful
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@ -189,6 +127,17 @@ export async function scrapePDF(
|
|||||||
Sentry.captureException(error);
|
Sentry.captureException(error);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// First, try parsing with PdfParse
|
||||||
|
result = await scrapePDFWithParsePDF(
|
||||||
|
{
|
||||||
|
...meta,
|
||||||
|
logger: meta.logger.child({
|
||||||
|
method: "scrapePDF/scrapePDFWithParsePDF",
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
tempFilePath,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
await unlink(tempFilePath);
|
await unlink(tempFilePath);
|
||||||
@ -197,7 +146,7 @@ export async function scrapePDF(
|
|||||||
url: response.url,
|
url: response.url,
|
||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
|
|
||||||
html: result.html,
|
html: result?.html ?? "",
|
||||||
markdown: result.markdown,
|
markdown: result?.markdown ?? "",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user