feat(scrapeURL): handle PDFs behind anti-bot (#1198)

Gergő Móricz 2025-02-20 04:11:30 +01:00 committed by GitHub
parent bec52bef6c
commit 55d047b6b3
8 changed files with 90 additions and 12 deletions
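In short: when a PDF URL sits behind anti-bot protection, a plain HTTP download yields a challenge page instead of the PDF. This change detects that case (the downloaded file carries a non-PDF Content-Type), raises a new PDFAntibotError, and retries the scrape through fire-engine's chrome-cdp, which can render past the anti-bot wall and hand the already-fetched file to the PDF processor as a "prefetch". The pieces below: specialtyScrapeCheck becomes async and can persist a fire-engine-fetched file to disk, AddFeatureError carries that prefetch back to the retry loop, scrapePDF consumes it instead of re-downloading, and scrapeURL coordinates the retries.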


@@ -150,6 +150,16 @@ describe("Scrape tests", () => {
       });
     }, 15000);
   });
+
+  describe("PDF (f-e dependant)", () => {
+    it.concurrent("works for PDFs behind anti-bot", async () => {
+      const response = await scrape({
+        url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
+      });
+
+      expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
+    }, 60000);
+  });
 }

 if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {
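A note on the test above: the "(f-e dependant)" label presumably marks it as requiring a configured fire-engine backend, since the anti-bot fallback prefetches the PDF via chrome-cdp; the generous 60000 ms timeout leaves room for the extra retry passes described below.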


@@ -44,7 +44,7 @@ export async function scrapeURLWithFetch(
     }
   }

-  specialtyScrapeCheck(
+  await specialtyScrapeCheck(
     meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
     Object.fromEntries(response.headers as any),
   );


@@ -133,11 +133,12 @@ async function performFireEngineScrape<
     await new Promise((resolve) => setTimeout(resolve, 250));
   }

-  specialtyScrapeCheck(
+  await specialtyScrapeCheck(
     logger.child({
       method: "performFireEngineScrape/specialtyScrapeCheck",
     }),
     status.responseHeaders,
+    status,
   );

   const contentType = (Object.entries(status.responseHeaders ?? {}).find(
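performFireEngineScrape now passes the whole status object to the specialty check so that, when fire-engine has already downloaded the file, the check can persist it rather than forcing a second fetch. For orientation, a sketch of the subset of FireEngineCheckStatusSuccess the handler actually touches (field names inferred from how this diff reads them; the real type lives in fire-engine/checkStatus):

// Assumed subset of FireEngineCheckStatusSuccess used by specialtyScrapeCheck.
type FireEngineStatusSubset = {
  pageStatusCode: number;                   // HTTP status observed by fire-engine
  url: string;                              // final URL after redirects
  responseHeaders?: Record<string, string>; // used for content-type sniffing
  file?: { content: string };               // base64-encoded body, when a file was fetched
};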


@@ -7,9 +7,10 @@ import * as Sentry from "@sentry/node";
 import escapeHtml from "escape-html";
 import PdfParse from "pdf-parse";
 import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
-import { RemoveFeatureError, UnsupportedFileError } from "../../error";
+import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
 import { readFile, unlink } from "node:fs/promises";
 import path from "node:path";
+import type { Response } from "undici";

 type PDFProcessorResult = { html: string; markdown?: string };

@@ -88,10 +89,20 @@ export async function scrapePDF(
     };
   }

-  const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
-    headers: meta.options.headers,
-  });
+  const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
+    ? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
+    : await downloadFile(meta.id, meta.url, {
+        headers: meta.options.headers,
+      });
+
+  if ((response as any).headers) { // if downloadFile was used
+    const r: Response = response as any;
+    const ct = r.headers.get("Content-Type");
+    if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
+      throw new PDFAntibotError();
+    }
+  }

   let result: PDFProcessorResult | null = null;

   const base64Content = (await readFile(tempFilePath)).toString("base64");

@@ -142,7 +153,7 @@ export async function scrapePDF(
   await unlink(tempFilePath);

   return {
-    url: response.url,
+    url: response.url ?? meta.url,
     statusCode: response.status,
     html: result?.html ?? "",
     markdown: result?.markdown ?? "",
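Two things happen above: scrapePDF short-circuits to the prefetched temp file when one exists, and, when it did download, treats a non-PDF Content-Type as an anti-bot interstitial. The (response as any).headers probe works because only undici's Response (returned by downloadFile) exposes a headers object; the prefetch stand-in does not. A condensed sketch of the decision, as a hypothetical helper that is not part of the commit:

// Hypothetical condensation of the prefetch-or-download logic in scrapePDF.
async function resolvePdfTempFile(meta: Meta): Promise<string> {
  if (meta.pdfPrefetch) {
    // fire-engine already fetched the PDF past the anti-bot wall
    return meta.pdfPrefetch.filePath;
  }
  const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
    headers: meta.options.headers,
  });
  const ct = response.headers.get("Content-Type");
  if (ct && !ct.includes("application/pdf")) {
    // asked for a PDF, got (most likely) an HTML challenge page
    throw new PDFAntibotError();
  }
  return tempFilePath;
}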


@@ -72,7 +72,7 @@ export function scrapeURLWithScrapingBee(
       });
     }

-    specialtyScrapeCheck(
+    await specialtyScrapeCheck(
       meta.logger.child({
         method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
       }),


@@ -1,9 +1,30 @@
 import { Logger } from "winston";
 import { AddFeatureError } from "../../error";
+import { FireEngineCheckStatusSuccess } from "../fire-engine/checkStatus";
+import path from "path";
+import os from "os";
+import { writeFile } from "fs/promises";
+import { Meta } from "../..";

-export function specialtyScrapeCheck(
+async function feResToPdfPrefetch(feRes: FireEngineCheckStatusSuccess | undefined): Promise<Meta["pdfPrefetch"]> {
+  if (!feRes?.file) {
+    return null;
+  }
+
+  const filePath = path.join(os.tmpdir(), `tempFile-${crypto.randomUUID()}.pdf`);
+  await writeFile(filePath, Buffer.from(feRes.file.content, "base64"))
+  return {
+    status: feRes.pageStatusCode,
+    url: feRes.url,
+    filePath,
+  };
+}
+
+export async function specialtyScrapeCheck(
   logger: Logger,
   headers: Record<string, string> | undefined,
+  feRes?: FireEngineCheckStatusSuccess,
 ) {
   const contentType = (Object.entries(headers ?? {}).find(
     (x) => x[0].toLowerCase() === "content-type",

@@ -18,7 +39,7 @@ export function specialtyScrapeCheck(
     contentType.startsWith("application/pdf;")
   ) {
     // .pdf
-    throw new AddFeatureError(["pdf"]);
+    throw new AddFeatureError(["pdf"], await feResToPdfPrefetch(feRes));
   } else if (
     contentType ===
       "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||


@@ -1,4 +1,4 @@
-import { EngineResultsTracker } from ".";
+import { EngineResultsTracker, Meta } from ".";
 import { Engine, FeatureFlag } from "./engines";

 export class EngineError extends Error {

@@ -28,10 +28,12 @@ export class NoEnginesLeftError extends Error {

 export class AddFeatureError extends Error {
   public featureFlags: FeatureFlag[];
+  public pdfPrefetch: Meta["pdfPrefetch"];

-  constructor(featureFlags: FeatureFlag[]) {
+  constructor(featureFlags: FeatureFlag[], pdfPrefetch?: Meta["pdfPrefetch"]) {
     super("New feature flags have been discovered: " + featureFlags.join(", "));
     this.featureFlags = featureFlags;
+    this.pdfPrefetch = pdfPrefetch;
   }
 }

@@ -72,3 +74,9 @@ export class UnsupportedFileError extends Error {
     this.reason = reason;
   }
 }
+
+export class PDFAntibotError extends Error {
+  constructor() {
+    super("PDF scrape was prevented by anti-bot")
+  }
+}
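Like AddFeatureError and RemoveFeatureError, PDFAntibotError is a control-flow signal for scrapeURL's retry loop rather than a user-facing failure; it only escapes to the caller when a prefetched PDF was still rejected, as the last hunk below shows.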


@@ -16,6 +16,7 @@ import {
   AddFeatureError,
   EngineError,
   NoEnginesLeftError,
+  PDFAntibotError,
   RemoveFeatureError,
   SiteError,
   TimeoutError,

@@ -49,6 +50,11 @@ export type Meta = {
   logs: any[];
   featureFlags: Set<FeatureFlag>;
   mock: MockState | null;
+  pdfPrefetch: {
+    filePath: string;
+    url?: string;
+    status: number;
+  } | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
 };

 function buildFeatureFlags(

@@ -151,6 +157,7 @@ async function buildMetaObject(
     options.useMock !== undefined
       ? await loadMock(options.useMock, _logger)
       : null,
+    pdfPrefetch: undefined,
   };
 }

@@ -309,6 +316,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
       throw error;
     } else if (error instanceof UnsupportedFileError) {
       throw error;
+    } else if (error instanceof PDFAntibotError) {
+      throw error;
     } else if (error instanceof TimeoutSignal) {
       throw error;
     } else {

@@ -394,6 +403,9 @@ export async function scrapeURL(
         meta.featureFlags = new Set(
           [...meta.featureFlags].concat(error.featureFlags),
         );
+        if (error.pdfPrefetch) {
+          meta.pdfPrefetch = error.pdfPrefetch;
+        }
       } else if (
         error instanceof RemoveFeatureError &&
         meta.internalOptions.forceEngine === undefined

@@ -408,6 +420,21 @@ export async function scrapeURL(
             (x) => !error.featureFlags.includes(x),
           ),
         );
+      } else if (
+        error instanceof PDFAntibotError &&
+        meta.internalOptions.forceEngine === undefined
+      ) {
+        if (meta.pdfPrefetch !== undefined) {
+          meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
+          throw error;
+        } else {
+          meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
+          meta.featureFlags = new Set(
+            [...meta.featureFlags].filter(
+              (x) => x !== "pdf",
+            ),
+          );
+        }
       } else {
         throw error;
       }
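Putting the loop together, an anti-bot PDF now takes up to four passes; a condensed trace, assuming the pdf flag is first discovered from a Content-Type header (e.g. by the fetch engine):

// Pass 1: fetch engine sees Content-Type: application/pdf
//         -> AddFeatureError(["pdf"]) adds the pdf flag; retry.
// Pass 2: pdf engine downloads an HTML challenge page instead of a PDF
//         -> PDFAntibotError while meta.pdfPrefetch === undefined
//         -> the pdf flag is dropped so chrome-cdp runs next; retry.
// Pass 3: chrome-cdp renders past the anti-bot wall and returns the file
//         -> specialtyScrapeCheck writes it to disk, throws
//            AddFeatureError(["pdf"], prefetch); meta.pdfPrefetch is set; retry.
// Pass 4: pdf engine parses the prefetched file from disk; success.
// A PDFAntibotError raised while meta.pdfPrefetch !== undefined is rethrown
// and the scrape fails for good.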