mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-22 04:22:07 +08:00
Update index.ts
This commit is contained in:
parent
f9d55efba8
commit
1eca61bffb
@ -8,7 +8,7 @@ import escapeHtml from "escape-html";
|
|||||||
import PdfParse from "pdf-parse";
|
import PdfParse from "pdf-parse";
|
||||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||||
import { RemoveFeatureError, UnsupportedFileError } from "../../error";
|
import { RemoveFeatureError, UnsupportedFileError } from "../../error";
|
||||||
import { stat, readFile, unlink } from "node:fs/promises";
|
import { readFile, unlink } from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
|
|
||||||
type PDFProcessorResult = { html: string; markdown?: string };
|
type PDFProcessorResult = { html: string; markdown?: string };
|
||||||
@ -25,7 +25,6 @@ async function scrapePDFWithRunPodMU(
|
|||||||
tempFilePath,
|
tempFilePath,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
const result = await robustFetch({
|
const result = await robustFetch({
|
||||||
url:
|
url:
|
||||||
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
|
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
|
||||||
@ -92,15 +91,14 @@ export async function scrapePDF(
|
|||||||
|
|
||||||
const base64Content = (await readFile(tempFilePath)).toString("base64");
|
const base64Content = (await readFile(tempFilePath)).toString("base64");
|
||||||
|
|
||||||
// Then, if output is too short, pass to RunPod MU
|
// First try RunPod MU if conditions are met
|
||||||
if (
|
if (
|
||||||
// result.markdown && result.markdown.length < 500 &&
|
|
||||||
base64Content.length < MAX_FILE_SIZE &&
|
base64Content.length < MAX_FILE_SIZE &&
|
||||||
process.env.RUNPOD_MU_API_KEY &&
|
process.env.RUNPOD_MU_API_KEY &&
|
||||||
process.env.RUNPOD_MU_POD_ID
|
process.env.RUNPOD_MU_POD_ID
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
const muResult = await scrapePDFWithRunPodMU(
|
result = await scrapePDFWithRunPodMU(
|
||||||
{
|
{
|
||||||
...meta,
|
...meta,
|
||||||
logger: meta.logger.child({
|
logger: meta.logger.child({
|
||||||
@ -111,24 +109,20 @@ export async function scrapePDF(
|
|||||||
timeToRun,
|
timeToRun,
|
||||||
base64Content,
|
base64Content,
|
||||||
);
|
);
|
||||||
result = muResult; // Use LlamaParse result if successful
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error instanceof Error && error.message === "RunPod MU timed out") {
|
if (error instanceof RemoveFeatureError) {
|
||||||
meta.logger.warn("RunPod MU timed out -- using parse-pdf result", {
|
|
||||||
error,
|
|
||||||
});
|
|
||||||
} else if (error instanceof RemoveFeatureError) {
|
|
||||||
throw error;
|
throw error;
|
||||||
} else {
|
|
||||||
meta.logger.warn(
|
|
||||||
"RunPod MU failed to parse PDF -- using parse-pdf result",
|
|
||||||
{ error },
|
|
||||||
);
|
|
||||||
Sentry.captureException(error);
|
|
||||||
}
|
}
|
||||||
|
meta.logger.warn(
|
||||||
|
"RunPod MU failed to parse PDF -- falling back to parse-pdf",
|
||||||
|
{ error },
|
||||||
|
);
|
||||||
|
Sentry.captureException(error);
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
// First, try parsing with PdfParse
|
|
||||||
|
// If RunPod MU failed or wasn't attempted, use PdfParse
|
||||||
|
if (!result) {
|
||||||
result = await scrapePDFWithParsePDF(
|
result = await scrapePDFWithParsePDF(
|
||||||
{
|
{
|
||||||
...meta,
|
...meta,
|
||||||
@ -145,7 +139,6 @@ export async function scrapePDF(
|
|||||||
return {
|
return {
|
||||||
url: response.url,
|
url: response.url,
|
||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
|
|
||||||
html: result?.html ?? "",
|
html: result?.html ?? "",
|
||||||
markdown: result?.markdown ?? "",
|
markdown: result?.markdown ?? "",
|
||||||
};
|
};
|
||||||
|
Loading…
x
Reference in New Issue
Block a user