mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 16:05:58 +08:00
feat(scrapeURL): handle PDFs behind anti-bot (#1198)
This commit is contained in:
parent
bec52bef6c
commit
55d047b6b3
@ -150,6 +150,16 @@ describe("Scrape tests", () => {
|
|||||||
});
|
});
|
||||||
}, 15000);
|
}, 15000);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("PDF (f-e dependant)", () => {
|
||||||
|
it.concurrent("works for PDFs behind anti-bot", async () => {
|
||||||
|
const response = await scrape({
|
||||||
|
url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
|
||||||
|
}, 60000);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {
|
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {
|
||||||
|
@ -44,7 +44,7 @@ export async function scrapeURLWithFetch(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
specialtyScrapeCheck(
|
await specialtyScrapeCheck(
|
||||||
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
|
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
|
||||||
Object.fromEntries(response.headers as any),
|
Object.fromEntries(response.headers as any),
|
||||||
);
|
);
|
||||||
|
@ -133,11 +133,12 @@ async function performFireEngineScrape<
|
|||||||
await new Promise((resolve) => setTimeout(resolve, 250));
|
await new Promise((resolve) => setTimeout(resolve, 250));
|
||||||
}
|
}
|
||||||
|
|
||||||
specialtyScrapeCheck(
|
await specialtyScrapeCheck(
|
||||||
logger.child({
|
logger.child({
|
||||||
method: "performFireEngineScrape/specialtyScrapeCheck",
|
method: "performFireEngineScrape/specialtyScrapeCheck",
|
||||||
}),
|
}),
|
||||||
status.responseHeaders,
|
status.responseHeaders,
|
||||||
|
status,
|
||||||
);
|
);
|
||||||
|
|
||||||
const contentType = (Object.entries(status.responseHeaders ?? {}).find(
|
const contentType = (Object.entries(status.responseHeaders ?? {}).find(
|
||||||
|
@ -7,9 +7,10 @@ import * as Sentry from "@sentry/node";
|
|||||||
import escapeHtml from "escape-html";
|
import escapeHtml from "escape-html";
|
||||||
import PdfParse from "pdf-parse";
|
import PdfParse from "pdf-parse";
|
||||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||||
import { RemoveFeatureError, UnsupportedFileError } from "../../error";
|
import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
|
||||||
import { readFile, unlink } from "node:fs/promises";
|
import { readFile, unlink } from "node:fs/promises";
|
||||||
import path from "node:path";
|
import path from "node:path";
|
||||||
|
import type { Response } from "undici";
|
||||||
|
|
||||||
type PDFProcessorResult = { html: string; markdown?: string };
|
type PDFProcessorResult = { html: string; markdown?: string };
|
||||||
|
|
||||||
@ -88,9 +89,19 @@ export async function scrapePDF(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
|
const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
|
||||||
headers: meta.options.headers,
|
? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
|
||||||
});
|
: await downloadFile(meta.id, meta.url, {
|
||||||
|
headers: meta.options.headers,
|
||||||
|
});
|
||||||
|
|
||||||
|
if ((response as any).headers) { // if downloadFile was used
|
||||||
|
const r: Response = response as any;
|
||||||
|
const ct = r.headers.get("Content-Type");
|
||||||
|
if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
|
||||||
|
throw new PDFAntibotError();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let result: PDFProcessorResult | null = null;
|
let result: PDFProcessorResult | null = null;
|
||||||
|
|
||||||
@ -142,7 +153,7 @@ export async function scrapePDF(
|
|||||||
await unlink(tempFilePath);
|
await unlink(tempFilePath);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url: response.url,
|
url: response.url ?? meta.url,
|
||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
html: result?.html ?? "",
|
html: result?.html ?? "",
|
||||||
markdown: result?.markdown ?? "",
|
markdown: result?.markdown ?? "",
|
||||||
|
@ -72,7 +72,7 @@ export function scrapeURLWithScrapingBee(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
specialtyScrapeCheck(
|
await specialtyScrapeCheck(
|
||||||
meta.logger.child({
|
meta.logger.child({
|
||||||
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
|
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
|
||||||
}),
|
}),
|
||||||
|
@ -1,9 +1,30 @@
|
|||||||
import { Logger } from "winston";
|
import { Logger } from "winston";
|
||||||
import { AddFeatureError } from "../../error";
|
import { AddFeatureError } from "../../error";
|
||||||
|
import { FireEngineCheckStatusSuccess } from "../fire-engine/checkStatus";
|
||||||
|
import path from "path";
|
||||||
|
import os from "os";
|
||||||
|
import { writeFile } from "fs/promises";
|
||||||
|
import { Meta } from "../..";
|
||||||
|
|
||||||
export function specialtyScrapeCheck(
|
async function feResToPdfPrefetch(feRes: FireEngineCheckStatusSuccess | undefined): Promise<Meta["pdfPrefetch"]> {
|
||||||
|
if (!feRes?.file) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const filePath = path.join(os.tmpdir(), `tempFile-${crypto.randomUUID()}.pdf`);
|
||||||
|
await writeFile(filePath, Buffer.from(feRes.file.content, "base64"))
|
||||||
|
|
||||||
|
return {
|
||||||
|
status: feRes.pageStatusCode,
|
||||||
|
url: feRes.url,
|
||||||
|
filePath,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function specialtyScrapeCheck(
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
headers: Record<string, string> | undefined,
|
headers: Record<string, string> | undefined,
|
||||||
|
feRes?: FireEngineCheckStatusSuccess,
|
||||||
) {
|
) {
|
||||||
const contentType = (Object.entries(headers ?? {}).find(
|
const contentType = (Object.entries(headers ?? {}).find(
|
||||||
(x) => x[0].toLowerCase() === "content-type",
|
(x) => x[0].toLowerCase() === "content-type",
|
||||||
@ -18,7 +39,7 @@ export function specialtyScrapeCheck(
|
|||||||
contentType.startsWith("application/pdf;")
|
contentType.startsWith("application/pdf;")
|
||||||
) {
|
) {
|
||||||
// .pdf
|
// .pdf
|
||||||
throw new AddFeatureError(["pdf"]);
|
throw new AddFeatureError(["pdf"], await feResToPdfPrefetch(feRes));
|
||||||
} else if (
|
} else if (
|
||||||
contentType ===
|
contentType ===
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import { EngineResultsTracker } from ".";
|
import { EngineResultsTracker, Meta } from ".";
|
||||||
import { Engine, FeatureFlag } from "./engines";
|
import { Engine, FeatureFlag } from "./engines";
|
||||||
|
|
||||||
export class EngineError extends Error {
|
export class EngineError extends Error {
|
||||||
@ -28,10 +28,12 @@ export class NoEnginesLeftError extends Error {
|
|||||||
|
|
||||||
export class AddFeatureError extends Error {
|
export class AddFeatureError extends Error {
|
||||||
public featureFlags: FeatureFlag[];
|
public featureFlags: FeatureFlag[];
|
||||||
|
public pdfPrefetch: Meta["pdfPrefetch"];
|
||||||
|
|
||||||
constructor(featureFlags: FeatureFlag[]) {
|
constructor(featureFlags: FeatureFlag[], pdfPrefetch?: Meta["pdfPrefetch"]) {
|
||||||
super("New feature flags have been discovered: " + featureFlags.join(", "));
|
super("New feature flags have been discovered: " + featureFlags.join(", "));
|
||||||
this.featureFlags = featureFlags;
|
this.featureFlags = featureFlags;
|
||||||
|
this.pdfPrefetch = pdfPrefetch;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,3 +74,9 @@ export class UnsupportedFileError extends Error {
|
|||||||
this.reason = reason;
|
this.reason = reason;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export class PDFAntibotError extends Error {
|
||||||
|
constructor() {
|
||||||
|
super("PDF scrape was prevented by anti-bot")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -16,6 +16,7 @@ import {
|
|||||||
AddFeatureError,
|
AddFeatureError,
|
||||||
EngineError,
|
EngineError,
|
||||||
NoEnginesLeftError,
|
NoEnginesLeftError,
|
||||||
|
PDFAntibotError,
|
||||||
RemoveFeatureError,
|
RemoveFeatureError,
|
||||||
SiteError,
|
SiteError,
|
||||||
TimeoutError,
|
TimeoutError,
|
||||||
@ -49,6 +50,11 @@ export type Meta = {
|
|||||||
logs: any[];
|
logs: any[];
|
||||||
featureFlags: Set<FeatureFlag>;
|
featureFlags: Set<FeatureFlag>;
|
||||||
mock: MockState | null;
|
mock: MockState | null;
|
||||||
|
pdfPrefetch: {
|
||||||
|
filePath: string;
|
||||||
|
url?: string;
|
||||||
|
status: number;
|
||||||
|
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
|
||||||
};
|
};
|
||||||
|
|
||||||
function buildFeatureFlags(
|
function buildFeatureFlags(
|
||||||
@ -151,6 +157,7 @@ async function buildMetaObject(
|
|||||||
options.useMock !== undefined
|
options.useMock !== undefined
|
||||||
? await loadMock(options.useMock, _logger)
|
? await loadMock(options.useMock, _logger)
|
||||||
: null,
|
: null,
|
||||||
|
pdfPrefetch: undefined,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -309,6 +316,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
throw error;
|
throw error;
|
||||||
} else if (error instanceof UnsupportedFileError) {
|
} else if (error instanceof UnsupportedFileError) {
|
||||||
throw error;
|
throw error;
|
||||||
|
} else if (error instanceof PDFAntibotError) {
|
||||||
|
throw error;
|
||||||
} else if (error instanceof TimeoutSignal) {
|
} else if (error instanceof TimeoutSignal) {
|
||||||
throw error;
|
throw error;
|
||||||
} else {
|
} else {
|
||||||
@ -394,6 +403,9 @@ export async function scrapeURL(
|
|||||||
meta.featureFlags = new Set(
|
meta.featureFlags = new Set(
|
||||||
[...meta.featureFlags].concat(error.featureFlags),
|
[...meta.featureFlags].concat(error.featureFlags),
|
||||||
);
|
);
|
||||||
|
if (error.pdfPrefetch) {
|
||||||
|
meta.pdfPrefetch = error.pdfPrefetch;
|
||||||
|
}
|
||||||
} else if (
|
} else if (
|
||||||
error instanceof RemoveFeatureError &&
|
error instanceof RemoveFeatureError &&
|
||||||
meta.internalOptions.forceEngine === undefined
|
meta.internalOptions.forceEngine === undefined
|
||||||
@ -408,6 +420,21 @@ export async function scrapeURL(
|
|||||||
(x) => !error.featureFlags.includes(x),
|
(x) => !error.featureFlags.includes(x),
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
|
} else if (
|
||||||
|
error instanceof PDFAntibotError &&
|
||||||
|
meta.internalOptions.forceEngine === undefined
|
||||||
|
) {
|
||||||
|
if (meta.pdfPrefetch !== undefined) {
|
||||||
|
meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
|
||||||
|
throw error;
|
||||||
|
} else {
|
||||||
|
meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
|
||||||
|
meta.featureFlags = new Set(
|
||||||
|
[...meta.featureFlags].filter(
|
||||||
|
(x) => x !== "pdf",
|
||||||
|
),
|
||||||
|
);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user