Update index.ts: try parse-pdf first, fall back to LlamaParse for short results

This commit is contained in:
Eric Ciarla 2024-12-17 09:50:29 -05:00
parent 654d6c6e0b
commit ed7d15d2af

View File

@ -140,36 +140,8 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
const { response, tempFilePath } = await downloadFile(meta.id, meta.url); const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
let result: PDFProcessorResult | null = null; let result: PDFProcessorResult | null = null;
if (process.env.LLAMAPARSE_API_KEY) {
try {
result = await scrapePDFWithLlamaParse(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithLlamaParse",
}),
},
tempFilePath,
timeToRun,
);
} catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
error,
});
} else if (error instanceof RemoveFeatureError) {
throw error;
} else {
meta.logger.warn(
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
{ error },
);
Sentry.captureException(error);
}
}
}
if (result === null) { // First, try parsing with PdfParse
result = await scrapePDFWithParsePDF( result = await scrapePDFWithParsePDF(
{ {
...meta, ...meta,
@ -179,6 +151,37 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
}, },
tempFilePath, tempFilePath,
); );
// If the parsed text is non-empty but under 500 characters and LLAMAPARSE_API_KEY is set, retry with LlamaParse (note: an empty result does not trigger the retry)
if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
try {
const llamaResult = await scrapePDFWithLlamaParse(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithLlamaParse",
}),
},
tempFilePath,
timeToRun,
);
result = llamaResult; // Use LlamaParse result if successful
} catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- using parse-pdf result", {
error,
});
} else if (error instanceof RemoveFeatureError) {
throw error;
} else {
meta.logger.warn(
"LlamaParse failed to parse PDF -- using parse-pdf result",
{ error },
);
Sentry.captureException(error);
}
}
} }
await fs.unlink(tempFilePath); await fs.unlink(tempFilePath);