mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 03:55:55 +08:00
fix(scrapeURL/engines/pdf,docx): support authorization
This commit is contained in:
parent
49e584f8e1
commit
3c614a2e5c
@ -4,7 +4,9 @@ import { downloadFile } from "../utils/downloadFile";
|
|||||||
import mammoth from "mammoth";
|
import mammoth from "mammoth";
|
||||||
|
|
||||||
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
export async function scrapeDOCX(meta: Meta): Promise<EngineScrapeResult> {
|
||||||
const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
|
const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
|
||||||
|
headers: meta.options.headers,
|
||||||
|
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url: response.url,
|
url: response.url,
|
||||||
|
@ -74,7 +74,9 @@ export async function scrapePDF(
|
|||||||
timeToRun: number | undefined,
|
timeToRun: number | undefined,
|
||||||
): Promise<EngineScrapeResult> {
|
): Promise<EngineScrapeResult> {
|
||||||
if (!meta.options.parsePDF) {
|
if (!meta.options.parsePDF) {
|
||||||
const file = await fetchFileToBuffer(meta.url);
|
const file = await fetchFileToBuffer(meta.url, {
|
||||||
|
headers: meta.options.headers,
|
||||||
|
});
|
||||||
const content = file.buffer.toString("base64");
|
const content = file.buffer.toString("base64");
|
||||||
return {
|
return {
|
||||||
url: file.response.url,
|
url: file.response.url,
|
||||||
@ -85,7 +87,9 @@ export async function scrapePDF(
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
|
const { response, tempFilePath } = await downloadFile(meta.id, meta.url, {
|
||||||
|
headers: meta.options.headers,
|
||||||
|
});
|
||||||
|
|
||||||
let result: PDFProcessorResult | null = null;
|
let result: PDFProcessorResult | null = null;
|
||||||
|
|
||||||
|
@ -7,11 +7,11 @@ import { v4 as uuid } from "uuid";
|
|||||||
import * as undici from "undici";
|
import * as undici from "undici";
|
||||||
import { makeSecureDispatcher } from "./safeFetch";
|
import { makeSecureDispatcher } from "./safeFetch";
|
||||||
|
|
||||||
export async function fetchFileToBuffer(url: string): Promise<{
|
export async function fetchFileToBuffer(url: string, init?: undici.RequestInit): Promise<{
|
||||||
response: Response;
|
response: undici.Response;
|
||||||
buffer: Buffer;
|
buffer: Buffer;
|
||||||
}> {
|
}> {
|
||||||
const response = await fetch(url); // TODO: maybe we could use tlsclient for this? for proxying
|
const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
|
||||||
return {
|
return {
|
||||||
response,
|
response,
|
||||||
buffer: Buffer.from(await response.arrayBuffer()),
|
buffer: Buffer.from(await response.arrayBuffer()),
|
||||||
@ -21,6 +21,7 @@ export async function fetchFileToBuffer(url: string): Promise<{
|
|||||||
export async function downloadFile(
|
export async function downloadFile(
|
||||||
id: string,
|
id: string,
|
||||||
url: string,
|
url: string,
|
||||||
|
init?: undici.RequestInit,
|
||||||
): Promise<{
|
): Promise<{
|
||||||
response: undici.Response;
|
response: undici.Response;
|
||||||
tempFilePath: string;
|
tempFilePath: string;
|
||||||
@ -29,7 +30,7 @@ export async function downloadFile(
|
|||||||
const tempFileWrite = createWriteStream(tempFilePath);
|
const tempFileWrite = createWriteStream(tempFilePath);
|
||||||
|
|
||||||
// TODO: maybe we could use tlsclient for this? for proxying
|
// TODO: maybe we could use tlsclient for this? for proxying
|
||||||
const response = await undici.fetch(url, { dispatcher: await makeSecureDispatcher(url) });
|
const response = await undici.fetch(url, { ...init, redirect: "follow", dispatcher: await makeSecureDispatcher(url) });
|
||||||
|
|
||||||
// This should never happen in the current state of JS/Undici (2024), but let's check anyways.
|
// This should never happen in the current state of JS/Undici (2024), but let's check anyways.
|
||||||
if (response.body === null) {
|
if (response.body === null) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user