fix(scrapeURL/engines/pdf,docx): support authorization

This commit is contained in:
Móricz Gergő 2025-01-09 10:03:27 +01:00
parent 49e584f8e1
commit 3c614a2e5c
3 changed files with 14 additions and 7 deletions

View File

@@ -4,7 +4,9 @@ import { downloadFile } from "../utils/downloadFile";
import mammoth from "mammoth"; import mammoth from "mammoth";
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> { export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
const { response, tempFilePath } = await downloadFile(meta.id, meta.url); const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
headers: meta.options.headers,
});
return { return {
url: response.url, url: response.url,

View File

@@ -74,7 +74,9 @@ export async function scrapePDF(
timeToRun: number | undefined, timeToRun: number | undefined,
): Promise<EngineScrapeResult> { ): Promise<EngineScrapeResult> {
if (!meta.options.parsePDF) { if (!meta.options.parsePDF) {
const file = await fetchFileToBuffer(meta.url); const file = await fetchFileToBuffer(meta.url, {
headers: meta.options.headers,
});
const content = file.buffer.toString("base64"); const content = file.buffer.toString("base64");
return { return {
url: file.response.url, url: file.response.url,
@@ -85,7 +87,9 @@ export async function scrapePDF(
}; };
} }
const { response, tempFilePath } = await downloadFile(meta.id, meta.url); const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
headers: meta.options.headers,
});
let result: PDFProcessorResult | null = null; let result: PDFProcessorResult | null = null;

View File

@@ -7,11 +7,11 @@ import { v4 as uuid } from "uuid";
import * as undici from "undici"; import * as undici from "undici";
import { makeSecureDispatcher } from "./safeFetch"; import { makeSecureDispatcher } from "./safeFetch";
export async function fetchFileToBuffer(url: string): Promise<{ export async function fetchFileToBuffer(url: string, init?: undici.RequestInit): Promise<{
response: Response; response: undici.Response;
buffer: Buffer; buffer: Buffer;
}> { }> {
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
return { return {
response, response,
buffer: Buffer.from(await response.arrayBuffer()), buffer: Buffer.from(await response.arrayBuffer()),
@@ -21,6 +21,7 @@ export async function fetchFileToBuffer(url: string): Promise<{
export async function downloadFile( export async function downloadFile(
id: string, id: string,
url: string, url: string,
init?: undici.RequestInit,
): Promise<{ ): Promise<{
response: undici.Response; response: undici.Response;
tempFilePath: string; tempFilePath: string;
@@ -29,7 +30,7 @@ export async function downloadFile(
const tempFileWrite = createWriteStream(tempFilePath); const tempFileWrite = createWriteStream(tempFilePath);
// TODO: maybe we could use tlsclient for this? for proxying // TODO: maybe we could use tlsclient for this? for proxying
const response = await undici.fetch(url, { dispatcher: await makeSecureDispatcher(url) }); const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
// This should never happen in the current state of JS/Undici (2024), but let's check anyways. // This should never happen in the current state of JS/Undici (2024), but let's check anyways.
if (response.body === null) { if (response.body === null) {