Update index.ts

This commit is contained in:
Nicolas 2024-12-27 20:54:26 -03:00
parent b8d7f9f257
commit f9d55efba8

View File

@@ -13,102 +13,46 @@ import path from "node:path";
type PDFProcessorResult = { html: string; markdown?: string }; type PDFProcessorResult = { html: string; markdown?: string };
const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB
async function scrapePDFWithRunPodMU( async function scrapePDFWithRunPodMU(
meta: Meta, meta: Meta,
tempFilePath: string, tempFilePath: string,
timeToRun: number | undefined, timeToRun: number | undefined,
base64Content: string,
): Promise<PDFProcessorResult> { ): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with RunPod MU", { meta.logger.debug("Processing PDF document with RunPod MU", {
tempFilePath, tempFilePath,
}); });
const fileStat = await stat(tempFilePath);
if (fileStat.size > ((2**10)**2)*10) {
throw new UnsupportedFileError("File is larger than PDF parser limit (10MiB)");
}
console.log(tempFilePath); const result = await robustFetch({
url:
const upload = await robustFetch({ "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
url: "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/run",
method: "POST", method: "POST",
headers: { headers: {
Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`, Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
}, },
body: { body: {
input: { input: {
file_content: (await readFile(tempFilePath)).toString("base64"), file_content: base64Content,
filename: path.basename(tempFilePath) + ".pdf", filename: path.basename(tempFilePath) + ".pdf",
}, },
}, },
logger: meta.logger.child({ logger: meta.logger.child({
method: "scrapePDFWithRunPodMU/upload/robustFetch", method: "scrapePDFWithRunPodMU/robustFetch",
}), }),
schema: z.object({ schema: z.object({
id: z.string(), output: z.object({
markdown: z.string(),
}),
}), }),
}); });
const jobId = upload.id; return {
markdown: result.output.markdown,
// TODO: timeout, retries html: await marked.parse(result.output.markdown, { async: true }),
const startedAt = Date.now(); };
const timeout = timeToRun ?? 300000;
while (Date.now() <= startedAt + timeout) {
try {
const result = await robustFetch({
url: `https://api.runpod.ai/v2/${process.env.RUNPOD_MU_POD_ID}/status/${jobId}`,
method: "GET",
headers: {
Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
},
logger: meta.logger.child({
method: "scrapePDFWithRunPodMU/result/robustFetch",
}),
schema: z.object({
status: z.string(),
error: z.any().optional(),
output: z.object({
markdown: z.string(),
}).optional(),
}),
});
if (result.status === "COMPLETED") {
return {
markdown: result.output!.markdown,
html: await marked.parse(result.output!.markdown, { async: true }),
};
}
if (result.status === "FAILED") {
throw new Error("RunPod MU failed to parse PDF: " + result.error!, { cause: result.error });
}
// result not up yet
} catch (e) {
if (e instanceof Error && e.message === "Request sent failure status") {
// if ((e.cause as any).response.status === 404) {
// // no-op, result not up yet
// } else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) {
// // URL is not a PDF, actually!
// meta.logger.debug("URL is not actually a PDF, signalling...");
// throw new RemoveFeatureError(["pdf"]);
// } else {
throw new Error("RunPod MU threw an error", {
cause: e.cause,
});
// }
} else {
throw e;
}
}
await new Promise<void>((resolve) => setTimeout(() => resolve(), 250));
}
throw new Error("RunPod MU timed out");
} }
async function scrapePDFWithParsePDF( async function scrapePDFWithParsePDF(
@@ -146,21 +90,14 @@ export async function scrapePDF(
let result: PDFProcessorResult | null = null; let result: PDFProcessorResult | null = null;
// First, try parsing with PdfParse const base64Content = (await readFile(tempFilePath)).toString("base64");
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
// Then, if output is too short, pass to RunPod MU // Then, if output is too short, pass to RunPod MU
if ( if (
result.markdown && result.markdown.length < 500 && // result.markdown && result.markdown.length < 500 &&
process.env.RUNPOD_MU_API_KEY && process.env.RUNPOD_MU_POD_ID base64Content.length < MAX_FILE_SIZE &&
process.env.RUNPOD_MU_API_KEY &&
process.env.RUNPOD_MU_POD_ID
) { ) {
try { try {
const muResult = await scrapePDFWithRunPodMU( const muResult = await scrapePDFWithRunPodMU(
@@ -172,6 +109,7 @@ export async function scrapePDF(
}, },
tempFilePath, tempFilePath,
timeToRun, timeToRun,
base64Content,
); );
result = muResult; // Use LlamaParse result if successful result = muResult; // Use LlamaParse result if successful
} catch (error) { } catch (error) {
@@ -189,6 +127,17 @@ export async function scrapePDF(
Sentry.captureException(error); Sentry.captureException(error);
} }
} }
} else {
// First, try parsing with PdfParse
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
} }
await unlink(tempFilePath); await unlink(tempFilePath);
@@ -197,7 +146,7 @@ export async function scrapePDF(
url: response.url, url: response.url,
statusCode: response.status, statusCode: response.status,
html: result.html, html: result?.html ?? "",
markdown: result.markdown, markdown: result?.markdown ?? "",
}; };
} }