fix(scrapeURL/pdf): handle if a presumed PDF link returns HTML (e.g. 404)
parent d9e017e5e2
commit d276a23da0
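When a presumed PDF link actually serves HTML (for example a 404 page), LlamaParse rejects the file as PDF_IS_BROKEN. With this change the PDF engine signals that case with a new RemoveFeatureError, and scrapeURL reacts by dropping the "pdf" feature flag and retrying with the remaining engines instead of failing the scrape.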
apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -8,6 +8,7 @@ import * as Sentry from "@sentry/node";
 import escapeHtml from "escape-html";
 import PdfParse from "pdf-parse";
 import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
+import { RemoveFeatureError } from "../../error";
 
 type PDFProcessorResult = {html: string, markdown?: string};
 
@@ -52,6 +53,10 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
   const jobId = upload.id;
 
   // TODO: timeout, retries
+  const startedAt = Date.now();
+
+  while (Date.now() <= startedAt + (meta.options.timeout ?? 300000)) {
+    try {
       const result = await robustFetch({
         url: `https://api.cloud.llamaindex.ai/api/parsing/job/${jobId}/result/markdown`,
         method: "GET",
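The polling that robustFetch previously handled via tryCount/tryCooldown becomes an explicit loop bounded by meta.options.timeout (300000 ms, i.e. 5 minutes, when unspecified), so each failure response can be inspected. A minimal sketch of the same poll-until-deadline pattern; pollOnce and deadlineMs are illustrative names, not part of the codebase:

// Minimal sketch of the poll-until-deadline pattern above.
async function pollUntilDeadline<T>(
  pollOnce: () => Promise<T | null>, // resolves null while the result is not ready
  deadlineMs: number,
  cooldownMs = 250,
): Promise<T> {
  const startedAt = Date.now();
  while (Date.now() <= startedAt + deadlineMs) {
    const result = await pollOnce();
    if (result !== null) return result;
    // same 250ms cooldown between attempts as the loop above
    await new Promise<void>((resolve) => setTimeout(resolve, cooldownMs));
  }
  throw new Error("Polling timed out");
}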
@@ -62,14 +67,33 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis
         schema: z.object({
           markdown: z.string(),
         }),
-        tryCount: meta.options.timeout !== undefined ? 32 : 1200, // 5 minutes if timeout not specified
-        tryCooldown: 250,
       });
 
       return {
         markdown: result.markdown,
         html: await marked.parse(result.markdown, { async: true }),
       };
+    } catch (e) {
+      if (e instanceof Error && e.message === "Request sent failure status") {
+        if ((e.cause as any).response.status === 404) {
+          // no-op, result not up yet
+        } else if ((e.cause as any).response.body.includes("PDF_IS_BROKEN")) {
+          // URL is not a PDF, actually!
+          meta.logger.debug("URL is not actually a PDF, signalling...");
+          throw new RemoveFeatureError(["pdf"]);
+        } else {
+          throw new Error("LlamaParse threw an error", {
+            cause: e.cause,
+          });
+        }
+      } else {
+        throw e;
+      }
+    }
+
+    await new Promise<void>((resolve) => setTimeout(() => resolve(), 250));
+  }
+
+  throw new Error("LlamaParse timed out");
 }
 
 async function scrapePDFWithParsePDF(meta: Meta, tempFilePath: string): Promise<PDFProcessorResult> {
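The catch block assumes robustFetch rejects with the message "Request sent failure status" and attaches the HTTP response on the error's cause. It then distinguishes three cases: a 404 means the parse job is still running (keep polling), a body containing PDF_IS_BROKEN means the fetched file is not a PDF at all (signal RemoveFeatureError(["pdf"])), and anything else is a genuine LlamaParse failure. A hypothetical restatement of that classification; FailureCause is an assumed shape for what robustFetch puts on error.cause, not a real export:

type FailureCause = { response: { status: number; body: string } };
type LlamaParseFailure = "not-ready" | "not-a-pdf" | "fatal";

function classifyFailure(cause: FailureCause): LlamaParseFailure {
  if (cause.response.status === 404) return "not-ready"; // result not up yet, poll again
  if (cause.response.body.includes("PDF_IS_BROKEN")) return "not-a-pdf"; // throw RemoveFeatureError
  return "fatal"; // wrap and rethrow as a LlamaParse error
}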
@@ -107,10 +131,16 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
         logger: meta.logger.child({ method: "scrapePDF/scrapePDFWithLlamaParse" }),
       }, tempFilePath);
     } catch (error) {
+      if (error instanceof Error && error.message === "LlamaParse timed out") {
+        meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", { error });
+      } else if (error instanceof RemoveFeatureError) {
+        throw error;
+      } else {
         meta.logger.warn("LlamaParse failed to parse PDF -- falling back to parse-pdf", { error });
         Sentry.captureException(error);
+      }
     }
   }
 
   if (result === null) {
     result = await scrapePDFWithParsePDF({
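Note the ordering in this catch: a LlamaParse timeout still falls back to parse-pdf, but RemoveFeatureError is rethrown rather than swallowed by the fallback, since parsing an HTML 404 page as a PDF would only produce garbage; scrapeURL has to learn that the pdf flag is wrong and re-route the URL to the HTML engines.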
apps/api/src/scraper/scrapeURL/error.ts
@@ -33,6 +33,15 @@ export class AddFeatureError extends Error {
   }
 }
 
+export class RemoveFeatureError extends Error {
+  public featureFlags: FeatureFlag[];
+
+  constructor(featureFlags: FeatureFlag[]) {
+    super("Incorrect feature flags have been discovered: " + featureFlags.join(", "));
+    this.featureFlags = featureFlags;
+  }
+}
+
 export class SiteError extends Error {
   public code: string;
   constructor(code: string) {
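RemoveFeatureError mirrors the existing AddFeatureError: an engine throws AddFeatureError to request feature flags it is missing, and now throws RemoveFeatureError to report flags that turned out to be wrong. A self-contained usage sketch; the FeatureFlag union is narrowed to illustrative values here, where the real code imports it from "./engines":

type FeatureFlag = "pdf" | "docx" | "screenshot";

class RemoveFeatureError extends Error {
  public featureFlags: FeatureFlag[];

  constructor(featureFlags: FeatureFlag[]) {
    super("Incorrect feature flags have been discovered: " + featureFlags.join(", "));
    this.featureFlags = featureFlags;
  }
}

try {
  throw new RemoveFeatureError(["pdf"]); // what the PDF engine does on PDF_IS_BROKEN
} catch (error) {
  if (error instanceof RemoveFeatureError) {
    console.log(error.featureFlags); // ["pdf"] -- the caller can now drop the flag
  }
}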
apps/api/src/scraper/scrapeURL/index.ts
@@ -5,7 +5,7 @@ import { Document, ScrapeOptions } from "../../controllers/v1/types";
 import { logger } from "../../lib/logger";
 import { buildFallbackList, Engine, EngineScrapeResult, FeatureFlag, scrapeURLWithEngine } from "./engines";
 import { parseMarkdown } from "../../lib/html-to-markdown";
-import { AddFeatureError, EngineError, NoEnginesLeftError, SiteError, TimeoutError } from "./error";
+import { AddFeatureError, EngineError, NoEnginesLeftError, RemoveFeatureError, SiteError, TimeoutError } from "./error";
 import { executeTransformers } from "./transformers";
 import { LLMRefusalError } from "./transformers/llmExtract";
 import { urlSpecificParams } from "./lib/urlSpecificParams";
@@ -216,7 +216,7 @@ async function scrapeURLLoop(
         startedAt,
         finishedAt: Date.now(),
       };
-    } else if (error instanceof AddFeatureError) {
+    } else if (error instanceof AddFeatureError || error instanceof RemoveFeatureError) {
       throw error;
     } else if (error instanceof LLMRefusalError) {
       results[engine] = {
@@ -293,6 +293,9 @@ export async function scrapeURL(
     if (error instanceof AddFeatureError && meta.internalOptions.forceEngine === undefined) {
       meta.logger.debug("More feature flags requested by scraper: adding " + error.featureFlags.join(", "), { error, existingFlags: meta.featureFlags });
       meta.featureFlags = new Set([...meta.featureFlags].concat(error.featureFlags));
+    } else if (error instanceof RemoveFeatureError && meta.internalOptions.forceEngine === undefined) {
+      meta.logger.debug("Incorrect feature flags reported by scraper: removing " + error.featureFlags.join(","), { error, existingFlags: meta.featureFlags });
+      meta.featureFlags = new Set([...meta.featureFlags].filter(x => !error.featureFlags.includes(x)));
     } else {
       throw error;
     }
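In scrapeURL's retry loop the two signals are symmetric: AddFeatureError grows the flag set with concat, RemoveFeatureError shrinks it with filter, and in both cases the engine fallback list is rebuilt on the next attempt. A small worked sketch of the removal step, with illustrative flag values:

// Worked sketch of the flag-removal step above; the flag values are illustrative.
type FeatureFlag = "pdf" | "docx" | "screenshot";

let featureFlags = new Set<FeatureFlag>(["pdf", "screenshot"]);
const reported: FeatureFlag[] = ["pdf"];

// Same filter expression as the diff: drop every flag the engine reported as wrong.
featureFlags = new Set([...featureFlags].filter(x => !reported.includes(x)));

console.log([...featureFlags]); // ["screenshot"] -- the pdf engine is no longer selected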