Update index.ts

This commit is contained in:
Nicolas 2024-12-27 20:59:18 -03:00
parent f9d55efba8
commit 1eca61bffb

View File

@ -8,7 +8,7 @@ import escapeHtml from "escape-html";
import PdfParse from "pdf-parse"; import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { RemoveFeatureError, UnsupportedFileError } from "../../error"; import { RemoveFeatureError, UnsupportedFileError } from "../../error";
import { stat, readFile, unlink } from "node:fs/promises"; import { readFile, unlink } from "node:fs/promises";
import path from "node:path"; import path from "node:path";
type PDFProcessorResult = { html: string; markdown?: string }; type PDFProcessorResult = { html: string; markdown?: string };
@ -25,7 +25,6 @@ async function scrapePDFWithRunPodMU(
tempFilePath, tempFilePath,
}); });
const result = await robustFetch({ const result = await robustFetch({
url: url:
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync", "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
@ -92,15 +91,14 @@ export async function scrapePDF(
const base64Content = (await readFile(tempFilePath)).toString("base64"); const base64Content = (await readFile(tempFilePath)).toString("base64");
// Then, if output is too short, pass to RunPod MU // First try RunPod MU if conditions are met
if ( if (
// result.markdown && result.markdown.length < 500 &&
base64Content.length < MAX_FILE_SIZE && base64Content.length < MAX_FILE_SIZE &&
process.env.RUNPOD_MU_API_KEY && process.env.RUNPOD_MU_API_KEY &&
process.env.RUNPOD_MU_POD_ID process.env.RUNPOD_MU_POD_ID
) { ) {
try { try {
const muResult = await scrapePDFWithRunPodMU( result = await scrapePDFWithRunPodMU(
{ {
...meta, ...meta,
logger: meta.logger.child({ logger: meta.logger.child({
@ -111,24 +109,20 @@ export async function scrapePDF(
timeToRun, timeToRun,
base64Content, base64Content,
); );
result = muResult; // Use LlamaParse result if successful
} catch (error) { } catch (error) {
if (error instanceof Error && error.message === "RunPod MU timed out") { if (error instanceof RemoveFeatureError) {
meta.logger.warn("RunPod MU timed out -- using parse-pdf result", {
error,
});
} else if (error instanceof RemoveFeatureError) {
throw error; throw error;
} else {
meta.logger.warn(
"RunPod MU failed to parse PDF -- using parse-pdf result",
{ error },
);
Sentry.captureException(error);
} }
meta.logger.warn(
"RunPod MU failed to parse PDF -- falling back to parse-pdf",
{ error },
);
Sentry.captureException(error);
} }
} else { }
// First, try parsing with PdfParse
// If RunPod MU failed or wasn't attempted, use PdfParse
if (!result) {
result = await scrapePDFWithParsePDF( result = await scrapePDFWithParsePDF(
{ {
...meta, ...meta,
@ -145,7 +139,6 @@ export async function scrapePDF(
return { return {
url: response.url, url: response.url,
statusCode: response.status, statusCode: response.status,
html: result?.html ?? "", html: result?.html ?? "",
markdown: result?.markdown ?? "", markdown: result?.markdown ?? "",
}; };