mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 05:36:02 +08:00
feat(scrapeURL): handle PDFs behind anti-bot (#1198)
This commit is contained in:
parent
bec52bef6c
commit
55d047b6b3
@ -150,6 +150,16 @@ describe("Scrape tests", () => {
|
||||
});
|
||||
}, 15000);
|
||||
});
|
||||
|
||||
describe("PDF (f-e dependant)", () => {
|
||||
it.concurrent("works for PDFs behind anti-bot", async () => {
|
||||
const response = await scrape({
|
||||
url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
|
||||
});
|
||||
|
||||
expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
|
||||
}, 60000);
|
||||
});
|
||||
}
|
||||
|
||||
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {
|
||||
|
@ -44,7 +44,7 @@ export async function scrapeURLWithFetch(
|
||||
}
|
||||
}
|
||||
|
||||
specialtyScrapeCheck(
|
||||
await specialtyScrapeCheck(
|
||||
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
|
||||
Object.fromEntries(response.headers as any),
|
||||
);
|
||||
|
@ -133,11 +133,12 @@ async function performFireEngineScrape<
|
||||
await new Promise((resolve) => setTimeout(resolve, 250));
|
||||
}
|
||||
|
||||
specialtyScrapeCheck(
|
||||
await specialtyScrapeCheck(
|
||||
logger.child({
|
||||
method: "performFireEngineScrape/specialtyScrapeCheck",
|
||||
}),
|
||||
status.responseHeaders,
|
||||
status,
|
||||
);
|
||||
|
||||
const contentType = (Object.entries(status.responseHeaders ?? {}).find(
|
||||
|
@ -7,9 +7,10 @@ import * as Sentry from "@sentry/node";
|
||||
import escapeHtml from "escape-html";
|
||||
import PdfParse from "pdf-parse";
|
||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||
import { RemoveFeatureError, UnsupportedFileError } from "../../error";
|
||||
import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
|
||||
import { readFile, unlink } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import type { Response } from "undici";
|
||||
|
||||
type PDFProcessorResult = { html: string; markdown?: string };
|
||||
|
||||
@ -88,9 +89,19 @@ export async function scrapePDF(
|
||||
};
|
||||
}
|
||||
|
||||
const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
|
||||
headers: meta.options.headers,
|
||||
});
|
||||
const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
|
||||
? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
|
||||
: await downloadFile(meta.id, meta.url, {
|
||||
headers: meta.options.headers,
|
||||
});
|
||||
|
||||
if ((response as any).headers) { // if downloadFile was used
|
||||
const r: Response = response as any;
|
||||
const ct = r.headers.get("Content-Type");
|
||||
if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
|
||||
throw new PDFAntibotError();
|
||||
}
|
||||
}
|
||||
|
||||
let result: PDFProcessorResult | null = null;
|
||||
|
||||
@ -142,7 +153,7 @@ export async function scrapePDF(
|
||||
await unlink(tempFilePath);
|
||||
|
||||
return {
|
||||
url: response.url,
|
||||
url: response.url ?? meta.url,
|
||||
statusCode: response.status,
|
||||
html: result?.html ?? "",
|
||||
markdown: result?.markdown ?? "",
|
||||
|
@ -72,7 +72,7 @@ export function scrapeURLWithScrapingBee(
|
||||
});
|
||||
}
|
||||
|
||||
specialtyScrapeCheck(
|
||||
await specialtyScrapeCheck(
|
||||
meta.logger.child({
|
||||
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
|
||||
}),
|
||||
|
@ -1,9 +1,30 @@
|
||||
import { Logger } from "winston";
|
||||
import { AddFeatureError } from "../../error";
|
||||
import { FireEngineCheckStatusSuccess } from "../fire-engine/checkStatus";
|
||||
import path from "path";
|
||||
import os from "os";
|
||||
import { writeFile } from "fs/promises";
|
||||
import { Meta } from "../..";
|
||||
|
||||
export function specialtyScrapeCheck(
|
||||
async function feResToPdfPrefetch(feRes: FireEngineCheckStatusSuccess | undefined): Promise<Meta["pdfPrefetch"]> {
|
||||
if (!feRes?.file) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const filePath = path.join(os.tmpdir(), `tempFile-${crypto.randomUUID()}.pdf`);
|
||||
await writeFile(filePath, Buffer.from(feRes.file.content, "base64"))
|
||||
|
||||
return {
|
||||
status: feRes.pageStatusCode,
|
||||
url: feRes.url,
|
||||
filePath,
|
||||
};
|
||||
}
|
||||
|
||||
export async function specialtyScrapeCheck(
|
||||
logger: Logger,
|
||||
headers: Record<string, string> | undefined,
|
||||
feRes?: FireEngineCheckStatusSuccess,
|
||||
) {
|
||||
const contentType = (Object.entries(headers ?? {}).find(
|
||||
(x) => x[0].toLowerCase() === "content-type",
|
||||
@ -18,7 +39,7 @@ export function specialtyScrapeCheck(
|
||||
contentType.startsWith("application/pdf;")
|
||||
) {
|
||||
// .pdf
|
||||
throw new AddFeatureError(["pdf"]);
|
||||
throw new AddFeatureError(["pdf"], await feResToPdfPrefetch(feRes));
|
||||
} else if (
|
||||
contentType ===
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { EngineResultsTracker } from ".";
|
||||
import { EngineResultsTracker, Meta } from ".";
|
||||
import { Engine, FeatureFlag } from "./engines";
|
||||
|
||||
export class EngineError extends Error {
|
||||
@ -28,10 +28,12 @@ export class NoEnginesLeftError extends Error {
|
||||
|
||||
export class AddFeatureError extends Error {
|
||||
public featureFlags: FeatureFlag[];
|
||||
public pdfPrefetch: Meta["pdfPrefetch"];
|
||||
|
||||
constructor(featureFlags: FeatureFlag[]) {
|
||||
constructor(featureFlags: FeatureFlag[], pdfPrefetch?: Meta["pdfPrefetch"]) {
|
||||
super("New feature flags have been discovered: " + featureFlags.join(", "));
|
||||
this.featureFlags = featureFlags;
|
||||
this.pdfPrefetch = pdfPrefetch;
|
||||
}
|
||||
}
|
||||
|
||||
@ -72,3 +74,9 @@ export class UnsupportedFileError extends Error {
|
||||
this.reason = reason;
|
||||
}
|
||||
}
|
||||
|
||||
export class PDFAntibotError extends Error {
|
||||
constructor() {
|
||||
super("PDF scrape was prevented by anti-bot")
|
||||
}
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ import {
|
||||
AddFeatureError,
|
||||
EngineError,
|
||||
NoEnginesLeftError,
|
||||
PDFAntibotError,
|
||||
RemoveFeatureError,
|
||||
SiteError,
|
||||
TimeoutError,
|
||||
@ -49,6 +50,11 @@ export type Meta = {
|
||||
logs: any[];
|
||||
featureFlags: Set<FeatureFlag>;
|
||||
mock: MockState | null;
|
||||
pdfPrefetch: {
|
||||
filePath: string;
|
||||
url?: string;
|
||||
status: number;
|
||||
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
|
||||
};
|
||||
|
||||
function buildFeatureFlags(
|
||||
@ -151,6 +157,7 @@ async function buildMetaObject(
|
||||
options.useMock !== undefined
|
||||
? await loadMock(options.useMock, _logger)
|
||||
: null,
|
||||
pdfPrefetch: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
@ -309,6 +316,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
throw error;
|
||||
} else if (error instanceof UnsupportedFileError) {
|
||||
throw error;
|
||||
} else if (error instanceof PDFAntibotError) {
|
||||
throw error;
|
||||
} else if (error instanceof TimeoutSignal) {
|
||||
throw error;
|
||||
} else {
|
||||
@ -394,6 +403,9 @@ export async function scrapeURL(
|
||||
meta.featureFlags = new Set(
|
||||
[...meta.featureFlags].concat(error.featureFlags),
|
||||
);
|
||||
if (error.pdfPrefetch) {
|
||||
meta.pdfPrefetch = error.pdfPrefetch;
|
||||
}
|
||||
} else if (
|
||||
error instanceof RemoveFeatureError &&
|
||||
meta.internalOptions.forceEngine === undefined
|
||||
@ -408,6 +420,21 @@ export async function scrapeURL(
|
||||
(x) => !error.featureFlags.includes(x),
|
||||
),
|
||||
);
|
||||
} else if (
|
||||
error instanceof PDFAntibotError &&
|
||||
meta.internalOptions.forceEngine === undefined
|
||||
) {
|
||||
if (meta.pdfPrefetch !== undefined) {
|
||||
meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
|
||||
throw error;
|
||||
} else {
|
||||
meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
|
||||
meta.featureFlags = new Set(
|
||||
[...meta.featureFlags].filter(
|
||||
(x) => x !== "pdf",
|
||||
),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
throw error;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user