feat(scrapeURL): handle PDFs behind anti-bot (#1198)

Gergő Móricz 2025-02-20 04:11:30 +01:00 committed by GitHub
parent bec52bef6c
commit 55d047b6b3
8 changed files with 90 additions and 12 deletions
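In short: when a PDF URL sits behind anti-bot protection, a plain HTTP download yields a challenge page instead of the PDF. This change detects that case (the downloaded file carries a non-PDF Content-Type), raises a new PDFAntibotError, and retries the scrape through fire-engine's chrome-cdp, which can render past the anti-bot wall and hand the already-fetched file to the PDF processor as a "prefetch". The pieces below: specialtyScrapeCheck becomes async and can persist a fire-engine-fetched file to disk, AddFeatureError carries that prefetch back to the retry loop, scrapePDF consumes it instead of re-downloading, and scrapeURL coordinates the retries.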


@@ -150,6 +150,16 @@ describe("Scrape tests", () => {
       });
     }, 15000);
   });
+
+  describe("PDF (f-e dependant)", () => {
+    it.concurrent("works for PDFs behind anti-bot", async () => {
+      const response = await scrape({
+        url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
+      });
+
+      expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
+    }, 60000);
+  });
 }

 if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {
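A note on the test above: the "(f-e dependant)" label presumably marks it as requiring a configured fire-engine backend, since the anti-bot fallback prefetches the PDF via chrome-cdp; the generous 60000 ms timeout leaves room for the extra retry passes described below.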


@@ -44,7 +44,7 @@ export async function scrapeURLWithFetch(
     }
   }

-  specialtyScrapeCheck(
+  await specialtyScrapeCheck(
     meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
     Object.fromEntries(response.headers as any),
   );


@@ -133,11 +133,12 @@ async function performFireEngineScrape<
     await new Promise((resolve) => setTimeout(resolve, 250));
   }

-  specialtyScrapeCheck(
+  await specialtyScrapeCheck(
     logger.child({
       method: "performFireEngineScrape/specialtyScrapeCheck",
     }),
     status.responseHeaders,
+    status,
   );

   const contentType = (Object.entries(status.responseHeaders ?? {}).find(
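performFireEngineScrape now passes the whole status object to the specialty check so that, when fire-engine has already downloaded the file, the check can persist it rather than forcing a second fetch. For orientation, a sketch of the subset of FireEngineCheckStatusSuccess the handler actually touches (field names inferred from how this diff reads them; the real type lives in fire-engine/checkStatus):

// Assumed subset of FireEngineCheckStatusSuccess used by specialtyScrapeCheck.
type FireEngineStatusSubset = {
  pageStatusCode: number;                   // HTTP status observed by fire-engine
  url: string;                              // final URL after redirects
  responseHeaders?: Record<string, string>; // used for content-type sniffing
  file?: { content: string };               // base64-encoded body, when a file was fetched
};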


@@ -7,9 +7,10 @@ import * as Sentry from "@sentry/node";
 import escapeHtml from "escape-html";
 import PdfParse from "pdf-parse";
 import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
-import { RemoveFeatureError, UnsupportedFileError } from "../../error";
+import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
 import { readFile, unlink } from "node:fs/promises";
 import path from "node:path";
+import type { Response } from "undici";

 type PDFProcessorResult = { html: string; markdown?: string };

@@ -88,10 +89,20 @@ export async function scrapePDF(
     };
   }

-  const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
-    headers: meta.options.headers,
-  });
+  const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
+    ? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
+    : await downloadFile(meta.id, meta.url, {
+        headers: meta.options.headers,
+      });
+
+  if ((response as any).headers) { // if downloadFile was used
+    const r: Response = response as any;
+    const ct = r.headers.get("Content-Type");
+    if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
+      throw new PDFAntibotError();
+    }
+  }

   let result: PDFProcessorResult | null = null;

   const base64Content = (await readFile(tempFilePath)).toString("base64");

@@ -142,7 +153,7 @@ export async function scrapePDF(
   await unlink(tempFilePath);

   return {
-    url: response.url,
+    url: response.url ?? meta.url,
     statusCode: response.status,
     html: result?.html ?? "",
     markdown: result?.markdown ?? "",
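Two things happen above: scrapePDF short-circuits to the prefetched temp file when one exists, and, when it did download, treats a non-PDF Content-Type as an anti-bot interstitial. The (response as any).headers probe works because only undici's Response (returned by downloadFile) exposes a headers object; the prefetch stand-in does not. A condensed sketch of the decision, as a hypothetical helper that is not part of the commit:

// Hypothetical condensation of the prefetch-or-download logic in scrapePDF.
async function resolvePdfTempFile(meta: Meta): Promise<string> {
  if (meta.pdfPrefetch) {
    // fire-engine already fetched the PDF past the anti-bot wall
    return meta.pdfPrefetch.filePath;
  }
  const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
    headers: meta.options.headers,
  });
  const ct = response.headers.get("Content-Type");
  if (ct && !ct.includes("application/pdf")) {
    // asked for a PDF, got (most likely) an HTML challenge page
    throw new PDFAntibotError();
  }
  return tempFilePath;
}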


@@ -72,7 +72,7 @@ export function scrapeURLWithScrapingBee(
       });
     }

-    specialtyScrapeCheck(
+    await specialtyScrapeCheck(
       meta.logger.child({
         method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
       }),


@@ -1,9 +1,30 @@
 import { Logger } from "winston";
 import { AddFeatureError } from "../../error";
+import { FireEngineCheckStatusSuccess } from "../fire-engine/checkStatus";
+import path from "path";
+import os from "os";
+import { writeFile } from "fs/promises";
+import { Meta } from "../..";

-export function specialtyScrapeCheck(
+async function feResToPdfPrefetch(feRes: FireEngineCheckStatusSuccess | undefined): Promise<Meta["pdfPrefetch"]> {
+  if (!feRes?.file) {
+    return null;
+  }
+
+  const filePath = path.join(os.tmpdir(), `tempFile-${crypto.randomUUID()}.pdf`);
+  await writeFile(filePath, Buffer.from(feRes.file.content, "base64"))
+  return {
+    status: feRes.pageStatusCode,
+    url: feRes.url,
+    filePath,
+  };
+}
+
+export async function specialtyScrapeCheck(
   logger: Logger,
   headers: Record<string, string> | undefined,
+  feRes?: FireEngineCheckStatusSuccess,
 ) {
   const contentType = (Object.entries(headers ?? {}).find(
     (x) => x[0].toLowerCase() === "content-type",

@@ -18,7 +39,7 @@ export function specialtyScrapeCheck(
     contentType.startsWith("application/pdf;")
   ) {
     // .pdf
-    throw new AddFeatureError(["pdf"]);
+    throw new AddFeatureError(["pdf"], await feResToPdfPrefetch(feRes));
   } else if (
     contentType ===
       "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||


@@ -1,4 +1,4 @@
-import { EngineResultsTracker } from ".";
+import { EngineResultsTracker, Meta } from ".";
 import { Engine, FeatureFlag } from "./engines";

 export class EngineError extends Error {

@@ -28,10 +28,12 @@ export class NoEnginesLeftError extends Error {

 export class AddFeatureError extends Error {
   public featureFlags: FeatureFlag[];
+  public pdfPrefetch: Meta["pdfPrefetch"];

-  constructor(featureFlags: FeatureFlag[]) {
+  constructor(featureFlags: FeatureFlag[], pdfPrefetch?: Meta["pdfPrefetch"]) {
     super("New feature flags have been discovered: " + featureFlags.join(", "));
     this.featureFlags = featureFlags;
+    this.pdfPrefetch = pdfPrefetch;
   }
 }

@@ -72,3 +74,9 @@ export class UnsupportedFileError extends Error {
     this.reason = reason;
   }
 }
+
+export class PDFAntibotError extends Error {
+  constructor() {
+    super("PDF scrape was prevented by anti-bot")
+  }
+}
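Like AddFeatureError and RemoveFeatureError, PDFAntibotError is a control-flow signal for scrapeURL's retry loop rather than a user-facing failure; it only escapes to the caller when a prefetched PDF was still rejected, as the last hunk below shows.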


@@ -16,6 +16,7 @@ import {
   AddFeatureError,
   EngineError,
   NoEnginesLeftError,
+  PDFAntibotError,
   RemoveFeatureError,
   SiteError,
   TimeoutError,

@@ -49,6 +50,11 @@ export type Meta = {
   logs: any[];
   featureFlags: Set<FeatureFlag>;
   mock: MockState | null;
+  pdfPrefetch: {
+    filePath: string;
+    url?: string;
+    status: number;
+  } | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
 };

 function buildFeatureFlags(

@@ -151,6 +157,7 @@ async function buildMetaObject(
     options.useMock !== undefined
       ? await loadMock(options.useMock, _logger)
       : null,
+    pdfPrefetch: undefined,
   };
 }

@@ -309,6 +316,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
       throw error;
     } else if (error instanceof UnsupportedFileError) {
       throw error;
+    } else if (error instanceof PDFAntibotError) {
+      throw error;
     } else if (error instanceof TimeoutSignal) {
       throw error;
     } else {

@@ -394,6 +403,9 @@ export async function scrapeURL(
         meta.featureFlags = new Set(
           [...meta.featureFlags].concat(error.featureFlags),
         );
+        if (error.pdfPrefetch) {
+          meta.pdfPrefetch = error.pdfPrefetch;
+        }
       } else if (
         error instanceof RemoveFeatureError &&
         meta.internalOptions.forceEngine === undefined

@@ -408,6 +420,21 @@ export async function scrapeURL(
             (x) => !error.featureFlags.includes(x),
           ),
         );
+      } else if (
+        error instanceof PDFAntibotError &&
+        meta.internalOptions.forceEngine === undefined
+      ) {
+        if (meta.pdfPrefetch !== undefined) {
+          meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
+          throw error;
+        } else {
+          meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
+          meta.featureFlags = new Set(
+            [...meta.featureFlags].filter(
+              (x) => x !== "pdf",
+            ),
+          );
+        }
       } else {
         throw error;
       }
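Putting the loop together, an anti-bot PDF now takes up to four passes; a condensed trace, assuming the pdf flag is first discovered from a Content-Type header (e.g. by the fetch engine):

// Pass 1: fetch engine sees Content-Type: application/pdf
//         -> AddFeatureError(["pdf"]) adds the pdf flag; retry.
// Pass 2: pdf engine downloads an HTML challenge page instead of a PDF
//         -> PDFAntibotError while meta.pdfPrefetch === undefined
//         -> the pdf flag is dropped so chrome-cdp runs next; retry.
// Pass 3: chrome-cdp renders past the anti-bot wall and returns the file
//         -> specialtyScrapeCheck writes it to disk, throws
//            AddFeatureError(["pdf"], prefetch); meta.pdfPrefetch is set; retry.
// Pass 4: pdf engine parses the prefetched file from disk; success.
// A PDFAntibotError raised while meta.pdfPrefetch !== undefined is rethrown
// and the scrape fails for good.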