From ed7d15d2af23409e351df5a3574ef565f06a29d6 Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Tue, 17 Dec 2024 09:50:29 -0500 Subject: [PATCH] Update index.ts --- .../scraper/scrapeURL/engines/pdf/index.ts | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 24d5f002..38dc6b5e 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom const { response, tempFilePath } = await downloadFile(meta.id, meta.url); let result: PDFProcessorResult | null = null; - if (process.env.LLAMAPARSE_API_KEY) { + + // First, try parsing with PdfParse + result = await scrapePDFWithParsePDF( + { + ...meta, + logger: meta.logger.child({ + method: "scrapePDF/scrapePDFWithParsePDF", + }), + }, + tempFilePath, + ); + + + // If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse + if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) { try { - result = await scrapePDFWithLlamaParse( + const llamaResult = await scrapePDFWithLlamaParse( { ...meta, logger: meta.logger.child({ @@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom tempFilePath, timeToRun, ); + result = llamaResult; // Use LlamaParse result if successful } catch (error) { if (error instanceof Error && error.message === "LlamaParse timed out") { - meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { + meta.logger.warn("LlamaParse timed out -- using parse-pdf result", { error, }); } else if (error instanceof RemoveFeatureError) { throw error; } else { meta.logger.warn( - "LlamaParse failed to parse PDF -- falling back to parse-pdf", + "LlamaParse failed to parse PDF -- using parse-pdf result", { error }, ); Sentry.captureException(error); @@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom } } - if (result === null) { - result = await scrapePDFWithParsePDF( - { - ...meta, - logger: meta.logger.child({ - method: "scrapePDF/scrapePDFWithParsePDF", - }), - }, - tempFilePath, - ); - } - await fs.unlink(tempFilePath); return {