From 16e850288cd6dea1e69c769d613271306ed241c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 12 Nov 2024 22:46:58 +0100 Subject: [PATCH] fix(scrapeURL/pdf,docx): ignore SSL when downloading PDF --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 9 +++++++++ .../api/src/scraper/scrapeURL/engines/docx/index.ts | 2 +- apps/api/src/scraper/scrapeURL/engines/pdf/index.ts | 2 +- .../scraper/scrapeURL/engines/utils/downloadFile.ts | 13 +++++++++++-- 5 files changed, 23 insertions(+), 4 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index aebd90a5..0da99459 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -113,6 +113,7 @@ "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", "typesense": "^1.5.4", + "undici": "^6.20.1", "unstructured-client": "^0.11.3", "uuid": "^10.0.0", "winston": "^3.14.2", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index f98055fb..c2a9c8a3 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -197,6 +197,9 @@ importers: typesense: specifier: ^1.5.4 version: 1.8.2(@babel/runtime@7.24.6) + undici: + specifier: ^6.20.1 + version: 6.20.1 unstructured-client: specifier: ^0.11.3 version: 0.11.3(zod@3.23.8) @@ -3957,6 +3960,10 @@ packages: undici-types@5.26.5: resolution: {integrity: sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==} + undici@6.20.1: + resolution: {integrity: sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==} + engines: {node: '>=18.17'} + union@0.5.0: resolution: {integrity: sha512-N6uOhuW6zO95P3Mel2I2zMsbsanvvtgn6jVqJv4vbVcz/JN0OkL9suomjQGmWtxJQXOCqUJvquc1sMeNz/IwlA==} engines: {node: '>= 0.8.0'} @@ -8341,6 +8348,8 @@ snapshots: undici-types@5.26.5: {} + undici@6.20.1: {} + union@0.5.0: dependencies: qs: 6.12.2 diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts index f8196ccd..9881fae7 100644 --- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts @@ -1,6 +1,6 @@ import { Meta } from "../.."; import { EngineScrapeResult } from ".."; -import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; +import { downloadFile } from "../utils/downloadFile"; import mammoth from "mammoth"; export async function scrapeDOCX(meta: Meta): Promise { diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index d0591b57..bdc916e0 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -62,7 +62,7 @@ async function scrapePDFWithLlamaParse(meta: Meta, tempFilePath: string): Promis schema: z.object({ markdown: z.string(), }), - tryCount: 16, + tryCount: 32, tryCooldown: 250, }); diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts index 8db8892b..736faba7 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts @@ -4,6 +4,7 @@ import { createWriteStream, promises as fs } from "node:fs"; import { EngineError } from "../../error"; import { Writable } from "stream"; import { v4 as uuid } from "uuid"; +import * as undici from "undici"; export async function fetchFileToBuffer(url: string): Promise<{ response: Response, @@ -17,13 +18,21 @@ export async function fetchFileToBuffer(url: string): Promise<{ } export async function downloadFile(id: string, url: string): Promise<{ - response: Response + response: undici.Response tempFilePath: string }> { const tempFilePath = path.join(os.tmpdir(), `tempFile-${id}--${uuid()}`); const tempFileWrite = createWriteStream(tempFilePath); - const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying + // TODO: maybe we could use tlsclient for this? for proxying + // use undici to ignore SSL for now + const response = await undici.fetch(url, { + dispatcher: new undici.Agent({ + connect: { + rejectUnauthorized: false, + }, + }) + }); // This should never happen in the current state of JS (2024), but let's check anyways. if (response.body === null) {