diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 6bac2ba4..0ea1d579 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -1,4 +1,3 @@ -import { createReadStream, promises as fs } from "node:fs"; import { Meta } from "../.."; import { EngineScrapeResult } from ".."; import * as marked from "marked"; @@ -8,108 +7,51 @@ import * as Sentry from "@sentry/node"; import escapeHtml from "escape-html"; import PdfParse from "pdf-parse"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; -import { RemoveFeatureError } from "../../error"; +import { RemoveFeatureError, UnsupportedFileError } from "../../error"; +import { readFile, unlink } from "node:fs/promises"; +import path from "node:path"; type PDFProcessorResult = { html: string; markdown?: string }; -async function scrapePDFWithLlamaParse( +const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB + +async function scrapePDFWithRunPodMU( meta: Meta, tempFilePath: string, timeToRun: number | undefined, + base64Content: string, ): Promise { - meta.logger.debug("Processing PDF document with LlamaIndex", { + meta.logger.debug("Processing PDF document with RunPod MU", { tempFilePath, }); - const uploadForm = new FormData(); - - // This is utterly stupid but it works! - mogery - uploadForm.append("file", { - [Symbol.toStringTag]: "Blob", - name: tempFilePath, - stream() { - return createReadStream( - tempFilePath, - ) as unknown as ReadableStream; - }, - bytes() { - throw Error("Unimplemented in mock Blob: bytes"); - }, - arrayBuffer() { - throw Error("Unimplemented in mock Blob: arrayBuffer"); - }, - size: (await fs.stat(tempFilePath)).size, - text() { - throw Error("Unimplemented in mock Blob: text"); - }, - slice(start, end, contentType) { - throw Error("Unimplemented in mock Blob: slice"); - }, - type: "application/pdf", - } as Blob); - - const upload = await robustFetch({ - url: "https://api.cloud.llamaindex.ai/api/parsing/upload", + const result = await robustFetch({ + url: + "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync", method: "POST", headers: { - Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`, + Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`, + }, + body: { + input: { + file_content: base64Content, + filename: path.basename(tempFilePath) + ".pdf", + }, }, - body: uploadForm, logger: meta.logger.child({ - method: "scrapePDFWithLlamaParse/upload/robustFetch", + method: "scrapePDFWithRunPodMU/robustFetch", }), schema: z.object({ - id: z.string(), + output: z.object({ + markdown: z.string(), + }), }), }); - const jobId = upload.id; - - // TODO: timeout, retries - const startedAt = Date.now(); - const timeout = timeToRun ?? 300000; - - while (Date.now() <= startedAt + timeout) { - try { - const result = await robustFetch({ - url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`, - method: "GET", - headers: { - Authorization: `Bearer ${process.env.LLAMAPARSE_API_KEY}`, - }, - logger: meta.logger.child({ - method: "scrapePDFWithLlamaParse/result/robustFetch", - }), - schema: z.object({ - markdown: z.string(), - }), - }); - return { - markdown: result.markdown, - html: await marked.parse(result.markdown, { async: true }), - }; - } catch (e) { - if (e instanceof Error && e.message === "Request sent failure status") { - if ((e.cause as any).response.status === 404) { - // no-op, result not up yet - } else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) { - // URL is not a PDF, actually! - meta.logger.debug("URL is not actually a PDF, signalling..."); - throw new RemoveFeatureError(["pdf"]); - } else { - throw new Error("LlamaParse threw an error", { - cause: e.cause, - }); - } - } else { - throw e; - } - } - - await new Promise((resolve) => setTimeout(() => resolve(), 250)); - } - - throw new Error("LlamaParse timed out"); + return { + markdown: result.output.markdown, + html: await marked.parse(result.output.markdown, { async: true }), + }; } async function scrapePDFWithParsePDF( @@ -118,7 +60,7 @@ async function scrapePDFWithParsePDF( ): Promise { meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath }); - const result = await PdfParse(await fs.readFile(tempFilePath)); + const result = await PdfParse(await readFile(tempFilePath)); const escaped = escapeHtml(result.text); return { @@ -147,59 +89,57 @@ export async function scrapePDF( let result: PDFProcessorResult | null = null; - // First, try parsing with PdfParse - result = await scrapePDFWithParsePDF( - { - ...meta, - logger: meta.logger.child({ - method: "scrapePDF/scrapePDFWithParsePDF", - }), - }, - tempFilePath, - ); + const base64Content = (await readFile(tempFilePath)).toString("base64"); - // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse + // First try RunPod MU if conditions are met if ( - result.markdown && - result.markdown.length < 500 && - process.env.LLAMAPARSE_API_KEY + base64Content.length < MAX_FILE_SIZE && + process.env.RUNPOD_MU_API_KEY && + process.env.RUNPOD_MU_POD_ID ) { try { - const llamaResult = await scrapePDFWithLlamaParse( + result = await scrapePDFWithRunPodMU( { ...meta, logger: meta.logger.child({ - method: "scrapePDF/scrapePDFWithLlamaParse", + method: "scrapePDF/scrapePDFWithRunPodMU", }), }, tempFilePath, timeToRun, + base64Content, ); - result = llamaResult; // Use LlamaParse result if successful } catch (error) { - if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { - error, - }); - } else if (error instanceof RemoveFeatureError) { + if (error instanceof RemoveFeatureError) { throw error; - } else { - meta.logger.warn( - "LlamaParse failed to parse PDF -- using parse-pdf result", - { error }, - ); - Sentry.captureException(error); } + meta.logger.warn( + "RunPod MU failed to parse PDF -- falling back to parse-pdf", + { error }, + ); + Sentry.captureException(error); } } - await fs.unlink(tempFilePath); + // If RunPod MU failed or wasn't attempted, use PdfParse + if (!result) { + result = await scrapePDFWithParsePDF( + { + ...meta, + logger: meta.logger.child({ + method: "scrapePDF/scrapePDFWithParsePDF", + }), + }, + tempFilePath, + ); + } + + await unlink(tempFilePath); return { url: response.url, statusCode: response.status, - - html: result.html, - markdown: result.markdown, + html: result?.html ?? "", + markdown: result?.markdown ?? "", }; }