feat(scrapeURL): handle PDFs behind anti-bot (#1198)

This commit is contained in:
Gergő Móricz 2025-02-20 04:11:30 +01:00 committed by GitHub
parent bec52bef6c
commit 55d047b6b3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 90 additions and 12 deletions

View File

@ -150,6 +150,16 @@ describe("Scrape tests", () => {
});
}, 15000);
});
// Scrapes a PDF URL that sits behind anti-bot protection and checks that the
// resulting markdown contains the paper's title.
// NOTE(review): "f-e" in the suite name presumably means fire-engine — confirm.
describe("PDF (f-e dependant)", () => {
it.concurrent("works for PDFs behind anti-bot", async () => {
const response = await scrape({
url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
});
expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
}, 60000); // 60 s timeout for this test, versus 15000 ms on the test just above
});
}
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {

View File

@ -44,7 +44,7 @@ export async function scrapeURLWithFetch(
}
}
specialtyScrapeCheck(
await specialtyScrapeCheck(
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
Object.fromEntries(response.headers as any),
);

View File

@ -133,11 +133,12 @@ async function performFireEngineScrape<
await new Promise((resolve) => setTimeout(resolve, 250));
}
specialtyScrapeCheck(
await specialtyScrapeCheck(
logger.child({
method: "performFireEngineScrape/specialtyScrapeCheck",
}),
status.responseHeaders,
status,
);
const contentType = (Object.entries(status.responseHeaders ?? {}).find(

View File

@ -7,9 +7,10 @@ import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { RemoveFeatureError, UnsupportedFileError } from "../../error";
import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
import { readFile, unlink } from "node:fs/promises";
import path from "node:path";
import type { Response } from "undici";
type PDFProcessorResult = { html: string; markdown?: string };
@ -88,9 +89,19 @@ export async function scrapePDF(
};
}
const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
headers: meta.options.headers,
});
const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
: await downloadFile(meta.id, meta.url, {
headers: meta.options.headers,
});
if ((response as any).headers) { // if downloadFile was used
const r: Response = response as any;
const ct = r.headers.get("Content-Type");
if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
throw new PDFAntibotError();
}
}
let result: PDFProcessorResult | null = null;
@ -142,7 +153,7 @@ export async function scrapePDF(
await unlink(tempFilePath);
return {
url: response.url,
url: response.url ?? meta.url,
statusCode: response.status,
html: result?.html ?? "",
markdown: result?.markdown ?? "",

View File

@ -72,7 +72,7 @@ export function scrapeURLWithScrapingBee(
});
}
specialtyScrapeCheck(
await specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
}),

View File

@ -1,9 +1,30 @@
import { Logger } from "winston";
import { AddFeatureError } from "../../error";
import { FireEngineCheckStatusSuccess } from "../fire-engine/checkStatus";
import path from "path";
import os from "os";
import { randomUUID } from "crypto";
import { writeFile } from "fs/promises";
import { Meta } from "../..";
export function specialtyScrapeCheck(
/**
 * Converts a fire-engine result that carries a file payload into a PDF
 * prefetch record by writing the payload to a temporary `.pdf` file.
 *
 * @param feRes - The fire-engine status result, if the caller has one.
 * @returns `null` when `feRes` is missing or has no `file`; otherwise an object
 *   with the page status code, the URL reported by fire-engine, and the path
 *   of the temporary file that was written.
 * @throws Whatever `writeFile` rejects with if the temp file cannot be written.
 */
async function feResToPdfPrefetch(feRes: FireEngineCheckStatusSuccess | undefined): Promise<Meta["pdfPrefetch"]> {
  if (!feRes?.file) {
    return null;
  }

  // Use the explicitly imported randomUUID rather than the ambient `crypto`
  // global, which is not defined by default on every supported Node release.
  const filePath = path.join(os.tmpdir(), `tempFile-${randomUUID()}.pdf`);

  // The file content is decoded as base64 before being written.
  // The temp file is not removed here; cleanup is left to whoever consumes the prefetch.
  await writeFile(filePath, Buffer.from(feRes.file.content, "base64"));

  return {
    status: feRes.pageStatusCode,
    url: feRes.url,
    filePath,
  };
}
export async function specialtyScrapeCheck(
logger: Logger,
headers: Record<string, string> | undefined,
feRes?: FireEngineCheckStatusSuccess,
) {
const contentType = (Object.entries(headers ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
@ -18,7 +39,7 @@ export function specialtyScrapeCheck(
contentType.startsWith("application/pdf;")
) {
// .pdf
throw new AddFeatureError(["pdf"]);
throw new AddFeatureError(["pdf"], await feResToPdfPrefetch(feRes));
} else if (
contentType ===
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||

View File

@ -1,4 +1,4 @@
import { EngineResultsTracker } from ".";
import { EngineResultsTracker, Meta } from ".";
import { Engine, FeatureFlag } from "./engines";
export class EngineError extends Error {
@ -28,10 +28,12 @@ export class NoEnginesLeftError extends Error {
/**
 * Error signalling that additional feature flags were discovered for the page
 * being scraped. Optionally carries a PDF prefetch record alongside the flags.
 */
export class AddFeatureError extends Error {
// Feature flags discovered during the scrape attempt.
public featureFlags: FeatureFlag[];
// Optional prefetched-PDF record; left undefined when the thrower does not pass one.
public pdfPrefetch: Meta["pdfPrefetch"];
constructor(featureFlags: FeatureFlag[]) {
constructor(featureFlags: FeatureFlag[], pdfPrefetch?: Meta["pdfPrefetch"]) {
// The message lists the discovered flags, comma-separated.
super("New feature flags have been discovered: " + featureFlags.join(", "));
this.featureFlags = featureFlags;
this.pdfPrefetch = pdfPrefetch;
}
}
@ -72,3 +74,9 @@ export class UnsupportedFileError extends Error {
this.reason = reason;
}
}
/**
 * Error indicating that a PDF scrape was blocked by anti-bot protection.
 * Takes no arguments; the message is fixed.
 */
export class PDFAntibotError extends Error {
  constructor() {
    const message = "PDF scrape was prevented by anti-bot";
    super(message);
  }
}

View File

@ -16,6 +16,7 @@ import {
AddFeatureError,
EngineError,
NoEnginesLeftError,
PDFAntibotError,
RemoveFeatureError,
SiteError,
TimeoutError,
@ -49,6 +50,11 @@ export type Meta = {
logs: any[];
featureFlags: Set<FeatureFlag>;
mock: MockState | null;
pdfPrefetch: {
filePath: string;
url?: string;
status: number;
} | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
};
function buildFeatureFlags(
@ -151,6 +157,7 @@ async function buildMetaObject(
options.useMock !== undefined
? await loadMock(options.useMock, _logger)
: null,
pdfPrefetch: undefined,
};
}
@ -309,6 +316,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
throw error;
} else if (error instanceof UnsupportedFileError) {
throw error;
} else if (error instanceof PDFAntibotError) {
throw error;
} else if (error instanceof TimeoutSignal) {
throw error;
} else {
@ -394,6 +403,9 @@ export async function scrapeURL(
meta.featureFlags = new Set(
[...meta.featureFlags].concat(error.featureFlags),
);
if (error.pdfPrefetch) {
meta.pdfPrefetch = error.pdfPrefetch;
}
} else if (
error instanceof RemoveFeatureError &&
meta.internalOptions.forceEngine === undefined
@ -408,6 +420,21 @@ export async function scrapeURL(
(x) => !error.featureFlags.includes(x),
),
);
} else if (
error instanceof PDFAntibotError &&
meta.internalOptions.forceEngine === undefined
) {
if (meta.pdfPrefetch !== undefined) {
meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
throw error;
} else {
meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
meta.featureFlags = new Set(
[...meta.featureFlags].filter(
(x) => x !== "pdf",
),
);
}
} else {
throw error;
}