diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index a9fcf47a..69f99d63 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -150,6 +150,16 @@ describe("Scrape tests", () => {
         });
       }, 15000);
     });
+
+    describe("PDF (f-e dependant)", () => {
+      it.concurrent("works for PDFs behind anti-bot", async () => {
+        const response = await scrape({
+          url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
+        });
+
+        expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
+      }, 60000);
+    });
   }
 
   if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {
diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
index 9410b0df..bc9ce5bb 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
@@ -44,7 +44,7 @@ export async function scrapeURLWithFetch(
     }
   }
 
-  specialtyScrapeCheck(
+  await specialtyScrapeCheck(
    meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
     Object.fromEntries(response.headers as any),
   );
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
index 456c11e2..c21d9f90 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@@ -133,11 +133,12 @@ async function performFireEngineScrape<
     await new Promise((resolve) => setTimeout(resolve, 250));
   }
 
-  specialtyScrapeCheck(
+  await specialtyScrapeCheck(
     logger.child({
       method: "performFireEngineScrape/specialtyScrapeCheck",
     }),
     status.responseHeaders,
+    status,
   );
 
   const contentType = (Object.entries(status.responseHeaders ?? {}).find(
diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
index a3678615..77905e78 100644
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -7,9 +7,10 @@ import * as Sentry from "@sentry/node";
 import escapeHtml from "escape-html";
 import PdfParse from "pdf-parse";
 import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
-import { RemoveFeatureError, UnsupportedFileError } from "../../error";
+import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
 import { readFile, unlink } from "node:fs/promises";
 import path from "node:path";
+import type { Response } from "undici";
 
 type PDFProcessorResult = { html: string; markdown?: string };
 
@@ -88,9 +89,19 @@ export async function scrapePDF(
     };
   }
 
-  const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
-    headers: meta.options.headers,
-  });
+  const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
+    ? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
+    : await downloadFile(meta.id, meta.url, {
+        headers: meta.options.headers,
+      });
+
+  if ((response as any).headers) { // if downloadFile was used
+    const r: Response = response as any;
+    const ct = r.headers.get("Content-Type");
+    if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
+      throw new PDFAntibotError();
+    }
+  }
 
   let result: PDFProcessorResult | null = null;
 
@@ -142,7 +153,7 @@ export async function scrapePDF(
   await unlink(tempFilePath);
 
   return {
-    url: response.url,
+    url: response.url ?? meta.url,
     statusCode: response.status,
     html: result?.html ?? "",
     markdown: result?.markdown ?? "",
diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
index 39805a2f..6840c142 100644
--- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
@@ -72,7 +72,7 @@ export function scrapeURLWithScrapingBee(
       });
     }
 
-    specialtyScrapeCheck(
+    await specialtyScrapeCheck(
       meta.logger.child({
         method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
       }),
diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts b/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
index 352f6a7e..f762cfb1 100644
--- a/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
@@ -1,9 +1,30 @@
 import { Logger } from "winston";
 import { AddFeatureError } from "../../error";
+import { FireEngineCheckStatusSuccess } from "../fire-engine/checkStatus";
+import path from "path";
+import os from "os";
+import { writeFile } from "fs/promises";
+import { Meta } from "../..";
 
-export function specialtyScrapeCheck(
+async function feResToPdfPrefetch(feRes: FireEngineCheckStatusSuccess | undefined): Promise<Meta["pdfPrefetch"]> {
+  if (!feRes?.file) {
+    return null;
+  }
+
+  const filePath = path.join(os.tmpdir(), `tempFile-${crypto.randomUUID()}.pdf`);
+  await writeFile(filePath, Buffer.from(feRes.file.content, "base64"))
+
+  return {
+    status: feRes.pageStatusCode,
+    url: feRes.url,
+    filePath,
+  };
+}
+
+export async function specialtyScrapeCheck(
   logger: Logger,
   headers: Record<string, string> | undefined,
+  feRes?: FireEngineCheckStatusSuccess,
 ) {
   const contentType = (Object.entries(headers ?? {}).find(
     (x) => x[0].toLowerCase() === "content-type",
@@ -18,7 +39,7 @@ export function specialtyScrapeCheck(
     contentType.startsWith("application/pdf;")
   ) {
     // .pdf
-    throw new AddFeatureError(["pdf"]);
+    throw new AddFeatureError(["pdf"], await feResToPdfPrefetch(feRes));
   } else if (
     contentType ===
       "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts
index bff3a492..ff445f8d 100644
--- a/apps/api/src/scraper/scrapeURL/error.ts
+++ b/apps/api/src/scraper/scrapeURL/error.ts
@@ -1,4 +1,4 @@
-import { EngineResultsTracker } from ".";
+import { EngineResultsTracker, Meta } from ".";
 import { Engine, FeatureFlag } from "./engines";
 
 export class EngineError extends Error {
@@ -28,10 +28,12 @@ export class NoEnginesLeftError extends Error {
 
 export class AddFeatureError extends Error {
   public featureFlags: FeatureFlag[];
+  public pdfPrefetch: Meta["pdfPrefetch"];
 
-  constructor(featureFlags: FeatureFlag[]) {
+  constructor(featureFlags: FeatureFlag[], pdfPrefetch?: Meta["pdfPrefetch"]) {
     super("New feature flags have been discovered: " + featureFlags.join(", "));
     this.featureFlags = featureFlags;
+    this.pdfPrefetch = pdfPrefetch;
   }
 }
 
@@ -72,3 +74,9 @@ export class UnsupportedFileError extends Error {
     this.reason = reason;
   }
 }
+
+export class PDFAntibotError extends Error {
+  constructor() {
+    super("PDF scrape was prevented by anti-bot")
+  }
+}
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 511d3ffd..eaf5497a 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -16,6 +16,7 @@ import {
   AddFeatureError,
   EngineError,
   NoEnginesLeftError,
+  PDFAntibotError,
   RemoveFeatureError,
   SiteError,
   TimeoutError,
@@ -49,6 +50,11 @@ export type Meta = {
   logs: any[];
   featureFlags: Set<FeatureFlag>;
   mock: MockState | null;
+  pdfPrefetch: {
+    filePath: string;
+    url?: string;
+    status: number;
+  } | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
 };
 
 function buildFeatureFlags(
@@ -151,6 +157,7 @@ async function buildMetaObject(
       options.useMock !== undefined
        ? await loadMock(options.useMock, _logger)
        : null,
+    pdfPrefetch: undefined,
   };
 }
 
@@ -309,6 +316,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
       throw error;
     } else if (error instanceof UnsupportedFileError) {
       throw error;
+    } else if (error instanceof PDFAntibotError) {
+      throw error;
     } else if (error instanceof TimeoutSignal) {
       throw error;
     } else {
@@ -394,6 +403,9 @@ export async function scrapeURL(
       meta.featureFlags = new Set(
         [...meta.featureFlags].concat(error.featureFlags),
       );
+      if (error.pdfPrefetch) {
+        meta.pdfPrefetch = error.pdfPrefetch;
+      }
     } else if (
       error instanceof RemoveFeatureError &&
       meta.internalOptions.forceEngine === undefined
     ) {
@@ -408,6 +420,21 @@
           (x) => !error.featureFlags.includes(x),
         ),
       );
+    } else if (
+      error instanceof PDFAntibotError &&
+      meta.internalOptions.forceEngine === undefined
+    ) {
+      if (meta.pdfPrefetch !== undefined) {
+        meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
+        throw error;
+      } else {
+        meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
+        meta.featureFlags = new Set(
+          [...meta.featureFlags].filter(
+            (x) => x !== "pdf",
+          ),
+        );
+      }
     } else {
       throw error;
     }