diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts index 933d4d74..e7d9efe8 100644 --- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts @@ -4,7 +4,9 @@ import { downloadFile } from "../utils/downloadFile"; import mammoth from "mammoth"; export async function scrapeDOCX(meta: Meta): Promise { - const { response, tempFilePath } = await downloadFile(meta.id, meta.url); + const { response, tempFilePath } = await downloadFile(meta.id, meta.url, { + headers: meta.options.headers, + }); return { url: response.url, diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index b774cd0e..101c9a53 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -74,7 +74,9 @@ export async function scrapePDF( timeToRun: number | undefined, ): Promise { if (!meta.options.parsePDF) { - const file = await fetchFileToBuffer(meta.url); + const file = await fetchFileToBuffer(meta.url, { + headers: meta.options.headers, + }); const content = file.buffer.toString("base64"); return { url: file.response.url, @@ -85,7 +87,9 @@ export async function scrapePDF( }; } - const { response, tempFilePath } = await downloadFile(meta.id, meta.url); + const { response, tempFilePath } = await downloadFile(meta.id, meta.url, { + headers: meta.options.headers, + }); let result: PDFProcessorResult | null = null; diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts index d4932f34..55be08c3 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/downloadFile.ts @@ -7,11 +7,11 @@ import { v4 as uuid } from "uuid"; import * as undici from "undici"; import { makeSecureDispatcher } from "./safeFetch"; -export async function fetchFileToBuffer(url: string): Promise<{ - response: Response; +export async function fetchFileToBuffer(url: string, init?: undici.RequestInit): Promise<{ + response: undici.Response; buffer: Buffer; }> { - const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying + const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) }); return { response, buffer: Buffer.from(await response.arrayBuffer()), @@ -21,6 +21,7 @@ export async function fetchFileToBuffer(url: string): Promise<{ export async function downloadFile( id: string, url: string, + init?: undici.RequestInit, ): Promise<{ response: undici.Response; tempFilePath: string; @@ -29,7 +30,7 @@ export async function downloadFile( const tempFileWrite = createWriteStream(tempFilePath); // TODO: maybe we could use tlsclient for this? for proxying - const response = await undici.fetch(url, { dispatcher: await makeSecureDispatcher(url) }); + const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) }); // This should never happen in the current state of JS/Undici (2024), but let's check anyways. if (response.body === null) {