diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index a9fcf47a..69f99d63 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -150,6 +150,16 @@ describe("Scrape tests", () => {
});
}, 15000);
});
+
+ describe("PDF (f-e dependant)", () => {
+ it.concurrent("works for PDFs behind anti-bot", async () => {
+ const response = await scrape({
+ url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
+ });
+
+ expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
+ }, 60000);
+ });
}
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY) {
diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
index 9410b0df..bc9ce5bb 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts
@@ -44,7 +44,7 @@ export async function scrapeURLWithFetch(
}
}
- specialtyScrapeCheck(
+ await specialtyScrapeCheck(
meta.logger.child({ method: "scrapeURLWithFetch/specialtyScrapeCheck" }),
Object.fromEntries(response.headers as any),
);
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
index 456c11e2..c21d9f90 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@@ -133,11 +133,12 @@ async function performFireEngineScrape<
await new Promise((resolve) => setTimeout(resolve, 250));
}
- specialtyScrapeCheck(
+ await specialtyScrapeCheck(
logger.child({
method: "performFireEngineScrape/specialtyScrapeCheck",
}),
status.responseHeaders,
+ status,
);
const contentType = (Object.entries(status.responseHeaders ?? {}).find(
diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
index a3678615..77905e78 100644
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -7,9 +7,10 @@ import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
-import { RemoveFeatureError, UnsupportedFileError } from "../../error";
+import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
import { readFile, unlink } from "node:fs/promises";
import path from "node:path";
+import type { Response } from "undici";
type PDFProcessorResult = { html: string; markdown?: string };
@@ -88,9 +89,19 @@ export async function scrapePDF(
};
}
- const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
- headers: meta.options.headers,
- });
+ const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)
+ ? { response: meta.pdfPrefetch, tempFilePath: meta.pdfPrefetch.filePath }
+ : await downloadFile(meta.id, meta.url, {
+ headers: meta.options.headers,
+ });
+
+ if ((response as any).headers) { // if downloadFile was used
+ const r: Response = response as any;
+ const ct = r.headers.get("Content-Type");
+ if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
+ throw new PDFAntibotError();
+ }
+ }
let result: PDFProcessorResult | null = null;
@@ -142,7 +153,7 @@ export async function scrapePDF(
await unlink(tempFilePath);
return {
- url: response.url,
+ url: response.url ?? meta.url,
statusCode: response.status,
html: result?.html ?? "",
markdown: result?.markdown ?? "",
diff --git a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
index 39805a2f..6840c142 100644
--- a/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/scrapingbee/index.ts
@@ -72,7 +72,7 @@ export function scrapeURLWithScrapingBee(
});
}
- specialtyScrapeCheck(
+ await specialtyScrapeCheck(
meta.logger.child({
method: "scrapeURLWithScrapingBee/specialtyScrapeCheck",
}),
diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts b/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
index 352f6a7e..f762cfb1 100644
--- a/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/utils/specialtyHandler.ts
@@ -1,9 +1,30 @@
import { Logger } from "winston";
import { AddFeatureError } from "../../error";
+import { FireEngineCheckStatusSuccess } from "../fire-engine/checkStatus";
+import path from "path";
+import os from "os";
+import { writeFile } from "fs/promises";
+import { Meta } from "../..";
-export function specialtyScrapeCheck(
+async function feResToPdfPrefetch(feRes: FireEngineCheckStatusSuccess | undefined): Promise<Meta["pdfPrefetch"]> {
+ if (!feRes?.file) {
+ return null;
+ }
+
+ const filePath = path.join(os.tmpdir(), `tempFile-${crypto.randomUUID()}.pdf`);
+ await writeFile(filePath, Buffer.from(feRes.file.content, "base64"))
+
+ return {
+ status: feRes.pageStatusCode,
+ url: feRes.url,
+ filePath,
+ };
+}
+
+export async function specialtyScrapeCheck(
logger: Logger,
headers: Record<string, string> | undefined,
+ feRes?: FireEngineCheckStatusSuccess,
) {
const contentType = (Object.entries(headers ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
@@ -18,7 +39,7 @@ export function specialtyScrapeCheck(
contentType.startsWith("application/pdf;")
) {
// .pdf
- throw new AddFeatureError(["pdf"]);
+ throw new AddFeatureError(["pdf"], await feResToPdfPrefetch(feRes));
} else if (
contentType ===
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" ||
diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts
index bff3a492..ff445f8d 100644
--- a/apps/api/src/scraper/scrapeURL/error.ts
+++ b/apps/api/src/scraper/scrapeURL/error.ts
@@ -1,4 +1,4 @@
-import { EngineResultsTracker } from ".";
+import { EngineResultsTracker, Meta } from ".";
import { Engine, FeatureFlag } from "./engines";
export class EngineError extends Error {
@@ -28,10 +28,12 @@ export class NoEnginesLeftError extends Error {
export class AddFeatureError extends Error {
public featureFlags: FeatureFlag[];
+ public pdfPrefetch: Meta["pdfPrefetch"];
- constructor(featureFlags: FeatureFlag[]) {
+ constructor(featureFlags: FeatureFlag[], pdfPrefetch?: Meta["pdfPrefetch"]) {
super("New feature flags have been discovered: " + featureFlags.join(", "));
this.featureFlags = featureFlags;
+ this.pdfPrefetch = pdfPrefetch;
}
}
@@ -72,3 +74,9 @@ export class UnsupportedFileError extends Error {
this.reason = reason;
}
}
+
+export class PDFAntibotError extends Error {
+ constructor() {
+ super("PDF scrape was prevented by anti-bot")
+ }
+}
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 511d3ffd..eaf5497a 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -16,6 +16,7 @@ import {
AddFeatureError,
EngineError,
NoEnginesLeftError,
+ PDFAntibotError,
RemoveFeatureError,
SiteError,
TimeoutError,
@@ -49,6 +50,11 @@ export type Meta = {
logs: any[];
featureFlags: Set<FeatureFlag>;
mock: MockState | null;
+ pdfPrefetch: {
+ filePath: string;
+ url?: string;
+ status: number;
+ } | null | undefined; // undefined: no prefetch yet, null: prefetch came back empty
};
function buildFeatureFlags(
@@ -151,6 +157,7 @@ async function buildMetaObject(
options.useMock !== undefined
? await loadMock(options.useMock, _logger)
: null,
+ pdfPrefetch: undefined,
};
}
@@ -309,6 +316,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
throw error;
} else if (error instanceof UnsupportedFileError) {
throw error;
+ } else if (error instanceof PDFAntibotError) {
+ throw error;
} else if (error instanceof TimeoutSignal) {
throw error;
} else {
@@ -394,6 +403,9 @@ export async function scrapeURL(
meta.featureFlags = new Set(
[...meta.featureFlags].concat(error.featureFlags),
);
+ if (error.pdfPrefetch) {
+ meta.pdfPrefetch = error.pdfPrefetch;
+ }
} else if (
error instanceof RemoveFeatureError &&
meta.internalOptions.forceEngine === undefined
@@ -408,6 +420,21 @@ export async function scrapeURL(
(x) => !error.featureFlags.includes(x),
),
);
+ } else if (
+ error instanceof PDFAntibotError &&
+ meta.internalOptions.forceEngine === undefined
+ ) {
+ if (meta.pdfPrefetch !== undefined) {
+ meta.logger.error("PDF was prefetched and still blocked by antibot, failing");
+ throw error;
+ } else {
+ meta.logger.debug("PDF was blocked by anti-bot, prefetching with chrome-cdp");
+ meta.featureFlags = new Set(
+ [...meta.featureFlags].filter(
+ (x) => x !== "pdf",
+ ),
+ );
+ }
} else {
throw error;
}