mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-05 14:10:42 +08:00
Added the removeBase64Images option (true by default)
This commit is contained in:
parent
45debc9977
commit
4c5bb21a6f
@ -140,6 +140,7 @@ export const scrapeOptions = z.object({
|
|||||||
languages: z.string().array().optional(),
|
languages: z.string().array().optional(),
|
||||||
}).optional(),
|
}).optional(),
|
||||||
skipTlsVerification: z.boolean().default(false),
|
skipTlsVerification: z.boolean().default(false),
|
||||||
|
removeBase64Images: z.boolean().default(true),
|
||||||
}).strict(strictMessage)
|
}).strict(strictMessage)
|
||||||
|
|
||||||
|
|
||||||
@ -468,7 +469,8 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
|||||||
parsePDF: x.parsePDF,
|
parsePDF: x.parsePDF,
|
||||||
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
actions: x.actions as Action[], // no strict null checking grrrr - mogery
|
||||||
geolocation: x.location ?? x.geolocation,
|
geolocation: x.location ?? x.geolocation,
|
||||||
skipTlsVerification: x.skipTlsVerification
|
skipTlsVerification: x.skipTlsVerification,
|
||||||
|
removeBase64Images: x.removeBase64Images,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -58,6 +58,7 @@ export type PageOptions = {
|
|||||||
country?: string;
|
country?: string;
|
||||||
};
|
};
|
||||||
skipTlsVerification?: boolean;
|
skipTlsVerification?: boolean;
|
||||||
|
removeBase64Images?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
|
@ -595,6 +595,7 @@ export class WebScraperDataProvider {
|
|||||||
actions: options.pageOptions?.actions ?? undefined,
|
actions: options.pageOptions?.actions ?? undefined,
|
||||||
geolocation: options.pageOptions?.geolocation ?? undefined,
|
geolocation: options.pageOptions?.geolocation ?? undefined,
|
||||||
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
|
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
|
||||||
|
removeBase64Images: options.pageOptions?.removeBase64Images ?? true,
|
||||||
};
|
};
|
||||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||||
this.replaceAllPathsWithAbsolutePaths =
|
this.replaceAllPathsWithAbsolutePaths =
|
||||||
|
@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
|
|||||||
waitFor = 0,
|
waitFor = 0,
|
||||||
screenshot = false,
|
screenshot = false,
|
||||||
fullPageScreenshot = false,
|
fullPageScreenshot = false,
|
||||||
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false },
|
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, removeBase64Images: true },
|
||||||
fireEngineOptions = {},
|
fireEngineOptions = {},
|
||||||
headers,
|
headers,
|
||||||
options,
|
options,
|
||||||
@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
|
|||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
fullPageScreenshot?: boolean;
|
fullPageScreenshot?: boolean;
|
||||||
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean };
|
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, removeBase64Images?: boolean };
|
||||||
fireEngineOptions?: FireEngineOptions;
|
fireEngineOptions?: FireEngineOptions;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
options?: any;
|
options?: any;
|
||||||
@ -120,6 +120,7 @@ export async function scrapWithFireEngine({
|
|||||||
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
scrollXPaths: pageOptions?.scrollXPaths ?? [],
|
||||||
geolocation: pageOptions?.geolocation,
|
geolocation: pageOptions?.geolocation,
|
||||||
skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
|
skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
|
||||||
|
removeBase64Images: pageOptions?.removeBase64Images ?? true,
|
||||||
actions: actions,
|
actions: actions,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -22,6 +22,7 @@ import { Logger } from "../../lib/logger";
|
|||||||
import { ScrapeEvents } from "../../lib/scrape-events";
|
import { ScrapeEvents } from "../../lib/scrape-events";
|
||||||
import { clientSideError } from "../../strings";
|
import { clientSideError } from "../../strings";
|
||||||
import { ScrapeActionContent } from "../../lib/entities";
|
import { ScrapeActionContent } from "../../lib/entities";
|
||||||
|
import { removeBase64Images } from "./utils/removeBase64Images";
|
||||||
|
|
||||||
dotenv.config();
|
dotenv.config();
|
||||||
|
|
||||||
@ -159,6 +160,7 @@ export async function scrapSingleUrl(
|
|||||||
actions: pageOptions.actions ?? undefined,
|
actions: pageOptions.actions ?? undefined,
|
||||||
geolocation: pageOptions.geolocation ?? undefined,
|
geolocation: pageOptions.geolocation ?? undefined,
|
||||||
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
|
skipTlsVerification: pageOptions.skipTlsVerification ?? false,
|
||||||
|
removeBase64Images: pageOptions.removeBase64Images ?? true,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (extractorOptions) {
|
if (extractorOptions) {
|
||||||
@ -350,7 +352,10 @@ export async function scrapSingleUrl(
|
|||||||
}
|
}
|
||||||
//* TODO: add an optional to return markdown or structured/extracted content
|
//* TODO: add an optional to return markdown or structured/extracted content
|
||||||
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
|
||||||
const text = await parseMarkdown(cleanedHtml);
|
let text = await parseMarkdown(cleanedHtml);
|
||||||
|
if (pageOptions.removeBase64Images) {
|
||||||
|
text = await removeBase64Images(text);
|
||||||
|
}
|
||||||
|
|
||||||
const insertedLogId = await logInsertPromise;
|
const insertedLogId = await logInsertPromise;
|
||||||
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
ScrapeEvents.updateScrapeResult(insertedLogId, {
|
||||||
|
@ -0,0 +1,7 @@
|
|||||||
|
/**
 * Replaces inline base64 image data URIs in Markdown with a short placeholder.
 *
 * For every occurrence shaped like `![alt](data:image/<type>;base64,<payload>)`
 * the `![alt]` part is kept and the parenthesised data URI is replaced with
 * `(<Base64-Image-Removed>)`. Text that does not match this pattern is returned
 * unchanged.
 *
 * The function is declared `async` although it performs no asynchronous work,
 * so callers always receive a promise.
 *
 * @param markdown - Markdown text to scan.
 * @returns A promise resolving to the Markdown with matching image targets replaced.
 */
export const removeBase64Images = async (
  markdown: string,
): Promise<string> => {
  // Group 1 captures the `![alt]` prefix so it can be re-emitted via `$1`.
  // The final lazy quantifier ends the match at the first `)` that follows
  // the `;base64,` marker. `.` does not match line breaks, so a match never
  // spans multiple lines.
  const regex = /(!\[.*?\])\(data:image\/.*?;base64,.*?\)/g;
  return markdown.replace(regex, '$1(<Base64-Image-Removed>)');
};
|
Loading…
x
Reference in New Issue
Block a user