Nick: we are using runpod

This commit is contained in:
Nicolas 2024-12-27 19:59:05 -03:00
parent 5fcf3fa97e
commit b8d7f9f257

View File

@ -13,12 +13,12 @@ import path from "node:path";
type PDFProcessorResult = { html: string; markdown?: string }; type PDFProcessorResult = { html: string; markdown?: string };
async function scrapePDFWithMinerU( async function scrapePDFWithRunPodMU(
meta: Meta, meta: Meta,
tempFilePath: string, tempFilePath: string,
timeToRun: number | undefined, timeToRun: number | undefined,
): Promise<PDFProcessorResult> { ): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with MinerU", { meta.logger.debug("Processing PDF document with RunPod MU", {
tempFilePath, tempFilePath,
}); });
@ -30,10 +30,10 @@ async function scrapePDFWithMinerU(
console.log(tempFilePath); console.log(tempFilePath);
const upload = await robustFetch({ const upload = await robustFetch({
url: "https://api.runpod.ai/v2/" + process.env.MINERU_POD_ID + "/run", url: "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/run",
method: "POST", method: "POST",
headers: { headers: {
Authorization: `Bearer ${process.env.MINERU_API_KEY}`, Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
}, },
body: { body: {
input: { input: {
@ -42,7 +42,7 @@ async function scrapePDFWithMinerU(
}, },
}, },
logger: meta.logger.child({ logger: meta.logger.child({
method: "scrapePDFWithMinerU/upload/robustFetch", method: "scrapePDFWithRunPodMU/upload/robustFetch",
}), }),
schema: z.object({ schema: z.object({
id: z.string(), id: z.string(),
@ -58,13 +58,13 @@ async function scrapePDFWithMinerU(
while (Date.now() <= startedAt + timeout) { while (Date.now() <= startedAt + timeout) {
try { try {
const result = await robustFetch({ const result = await robustFetch({
url: `https://api.runpod.ai/v2/${process.env.MINERU_POD_ID}/status/${jobId}`, url: `https://api.runpod.ai/v2/${process.env.RUNPOD_MU_POD_ID}/status/${jobId}`,
method: "GET", method: "GET",
headers: { headers: {
Authorization: `Bearer ${process.env.MINERU_API_KEY}`, Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
}, },
logger: meta.logger.child({ logger: meta.logger.child({
method: "scrapePDFWithMinerU/result/robustFetch", method: "scrapePDFWithRunPodMU/result/robustFetch",
}), }),
schema: z.object({ schema: z.object({
status: z.string(), status: z.string(),
@ -83,7 +83,7 @@ async function scrapePDFWithMinerU(
} }
if (result.status === "FAILED") { if (result.status === "FAILED") {
throw new Error("MinerU failed to parse PDF: " + result.error!, { cause: result.error }); throw new Error("RunPod MU failed to parse PDF: " + result.error!, { cause: result.error });
} }
// result not up yet // result not up yet
@ -96,7 +96,7 @@ async function scrapePDFWithMinerU(
// meta.logger.debug("URL is not actually a PDF, signalling..."); // meta.logger.debug("URL is not actually a PDF, signalling...");
// throw new RemoveFeatureError(["pdf"]); // throw new RemoveFeatureError(["pdf"]);
// } else { // } else {
throw new Error("MinerU threw an error", { throw new Error("RunPod MU threw an error", {
cause: e.cause, cause: e.cause,
}); });
// } // }
@ -108,7 +108,7 @@ async function scrapePDFWithMinerU(
await new Promise<void>((resolve) => setTimeout(() => resolve(), 250)); await new Promise<void>((resolve) => setTimeout(() => resolve(), 250));
} }
throw new Error("MinerU timed out"); throw new Error("RunPod MU timed out");
} }
async function scrapePDFWithParsePDF( async function scrapePDFWithParsePDF(
@ -157,33 +157,33 @@ export async function scrapePDF(
tempFilePath, tempFilePath,
); );
// Then, if output is too short, pass to MinerU // Then, if output is too short, pass to RunPod MU
if ( if (
result.markdown && result.markdown.length < 500 && result.markdown && result.markdown.length < 500 &&
process.env.MINERU_API_KEY && process.env.MINERU_POD_ID process.env.RUNPOD_MU_API_KEY && process.env.RUNPOD_MU_POD_ID
) { ) {
try { try {
const mineruResult = await scrapePDFWithMinerU( const muResult = await scrapePDFWithRunPodMU(
{ {
...meta, ...meta,
logger: meta.logger.child({ logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithMinerU", method: "scrapePDF/scrapePDFWithRunPodMU",
}), }),
}, },
tempFilePath, tempFilePath,
timeToRun, timeToRun,
); );
result = mineruResult; // Use LlamaParse result if successful result = muResult; // Use LlamaParse result if successful
} catch (error) { } catch (error) {
if (error instanceof Error && error.message === "MinerU timed out") { if (error instanceof Error && error.message === "RunPod MU timed out") {
meta.logger.warn("MinerU timed out -- using parse-pdf result", { meta.logger.warn("RunPod MU timed out -- using parse-pdf result", {
error, error,
}); });
} else if (error instanceof RemoveFeatureError) { } else if (error instanceof RemoveFeatureError) {
throw error; throw error;
} else { } else {
meta.logger.warn( meta.logger.warn(
"MinerU failed to parse PDF -- using parse-pdf result", "RunPod MU failed to parse PDF -- using parse-pdf result",
{ error }, { error },
); );
Sentry.captureException(error); Sentry.captureException(error);