feat(scrapeURL/pdf): support PDF prefetch when parsePDF is off

2025-08-11 18:08:59 +08:00 · 2025-02-20 09:28:13 +01:00 · 2025-02-20 09:28:13 +01:00 · 11ed679274
commit 11ed679274
parent 5eb0235ccb
1 changed file with 28 additions and 11 deletions
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -76,17 +76,34 @@ export async function scrapePDF(
  timeToRun: number | undefined,
 ): Promise<EngineScrapeResult> {
  if (!meta.options.parsePDF) {
-    const file = await fetchFileToBuffer(meta.url, {
-      headers: meta.options.headers,
-    });
-    const content = file.buffer.toString("base64");
-    return {
-      url: file.response.url,
-      statusCode: file.response.status,
-
-      html: content,
-      markdown: content,
-    };
+    if (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null) {
+      const content = (await readFile(meta.pdfPrefetch.filePath)).toString("base64");
+      return {
+        url: meta.pdfPrefetch.url ?? meta.url,
+        statusCode: meta.pdfPrefetch.status,
+  
+        html: content,
+        markdown: content,
+      };
+    } else {
+      const file = await fetchFileToBuffer(meta.url, {
+        headers: meta.options.headers,
+      });
+  
+      const ct = file.response.headers.get("Content-Type");
+      if (ct && !ct.includes("application/pdf")) { // if downloaded file wasn't a PDF
+        throw new PDFAntibotError();
+      }
+  
+      const content = file.buffer.toString("base64");
+      return {
+        url: file.response.url,
+        statusCode: file.response.status,
+  
+        html: content,
+        markdown: content,
+      };
+    }
  }

  const { response, tempFilePath } = (meta.pdfPrefetch !== undefined && meta.pdfPrefetch !== null)