Update index.ts

This commit is contained in:
Eric Ciarla 2024-12-17 09:50:29 -05:00
parent 654d6c6e0b
commit ed7d15d2af

View File

@ -140,9 +140,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
const { response, tempFilePath } = await downloadFile(meta.id, meta.url); const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
let result: PDFProcessorResult | null = null; let result: PDFProcessorResult | null = null;
if (process.env.LLAMAPARSE_API_KEY) {
// First, try parsing with PdfParse
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
// If the parsed text is non-empty but under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
try { try {
result = await scrapePDFWithLlamaParse( const llamaResult = await scrapePDFWithLlamaParse(
{ {
...meta, ...meta,
logger: meta.logger.child({ logger: meta.logger.child({
@ -152,16 +166,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
tempFilePath, tempFilePath,
timeToRun, timeToRun,
); );
result = llamaResult; // Use LlamaParse result if successful
} catch (error) { } catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") { if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
error, error,
}); });
} else if (error instanceof RemoveFeatureError) { } else if (error instanceof RemoveFeatureError) {
throw error; throw error;
} else { } else {
meta.logger.warn( meta.logger.warn(
"LlamaParse failed to parse PDF -- falling back to parse-pdf", "LlamaParse failed to parse PDF -- using parse-pdf result",
{ error }, { error },
); );
Sentry.captureException(error); Sentry.captureException(error);
@ -169,18 +184,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
} }
} }
if (result === null) {
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
}
await fs.unlink(tempFilePath); await fs.unlink(tempFilePath);
return { return {