Merge pull request #867 from mendableai/fix/remove-base64-images

[Feat] Added a `removeBase64Images` option (enabled by default)
This commit is contained in:
Nicolas 2024-11-04 13:17:09 -05:00 committed by GitHub
commit 12c0aa6b4b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 20 additions and 3 deletions

View File

@ -141,6 +141,7 @@ export const scrapeOptions = z.object({
languages: z.string().array().optional(), languages: z.string().array().optional(),
}).optional(), }).optional(),
skipTlsVerification: z.boolean().default(false), skipTlsVerification: z.boolean().default(false),
removeBase64Images: z.boolean().default(true),
}).strict(strictMessage) }).strict(strictMessage)
@ -470,6 +471,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
actions: x.actions as Action[], // no strict null checking grrrr - mogery actions: x.actions as Action[], // no strict null checking grrrr - mogery
geolocation: x.location ?? x.geolocation, geolocation: x.location ?? x.geolocation,
skipTlsVerification: x.skipTlsVerification, skipTlsVerification: x.skipTlsVerification,
removeBase64Images: x.removeBase64Images,
mobile: x.mobile, mobile: x.mobile,
}; };
} }

View File

@ -58,6 +58,7 @@ export type PageOptions = {
country?: string; country?: string;
}; };
skipTlsVerification?: boolean; skipTlsVerification?: boolean;
removeBase64Images?: boolean;
mobile?: boolean; mobile?: boolean;
}; };

View File

@ -595,6 +595,7 @@ export class WebScraperDataProvider {
actions: options.pageOptions?.actions ?? undefined, actions: options.pageOptions?.actions ?? undefined,
geolocation: options.pageOptions?.geolocation ?? undefined, geolocation: options.pageOptions?.geolocation ?? undefined,
skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false, skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false,
removeBase64Images: options.pageOptions?.removeBase64Images ?? true,
mobile: options.pageOptions?.mobile ?? false, mobile: options.pageOptions?.mobile ?? false,
}; };
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };

View File

@ -28,7 +28,7 @@ export async function scrapWithFireEngine({
waitFor = 0, waitFor = 0,
screenshot = false, screenshot = false,
fullPageScreenshot = false, fullPageScreenshot = false,
pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, mobile: false }, pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, removeBase64Images: true, mobile: false },
fireEngineOptions = {}, fireEngineOptions = {},
headers, headers,
options, options,
@ -40,7 +40,7 @@ export async function scrapWithFireEngine({
waitFor?: number; waitFor?: number;
screenshot?: boolean; screenshot?: boolean;
fullPageScreenshot?: boolean; fullPageScreenshot?: boolean;
pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, mobile?: boolean }; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, removeBase64Images?: boolean, mobile?: boolean };
fireEngineOptions?: FireEngineOptions; fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>; headers?: Record<string, string>;
options?: any; options?: any;
@ -121,6 +121,7 @@ export async function scrapWithFireEngine({
scrollXPaths: pageOptions?.scrollXPaths ?? [], scrollXPaths: pageOptions?.scrollXPaths ?? [],
geolocation: pageOptions?.geolocation, geolocation: pageOptions?.geolocation,
skipTlsVerification: pageOptions?.skipTlsVerification ?? false, skipTlsVerification: pageOptions?.skipTlsVerification ?? false,
removeBase64Images: pageOptions?.removeBase64Images ?? true,
actions: actions, actions: actions,
}, },
{ {

View File

@ -22,6 +22,7 @@ import { Logger } from "../../lib/logger";
import { ScrapeEvents } from "../../lib/scrape-events"; import { ScrapeEvents } from "../../lib/scrape-events";
import { clientSideError } from "../../strings"; import { clientSideError } from "../../strings";
import { ScrapeActionContent } from "../../lib/entities"; import { ScrapeActionContent } from "../../lib/entities";
import { removeBase64Images } from "./utils/removeBase64Images";
dotenv.config(); dotenv.config();
@ -159,6 +160,7 @@ export async function scrapSingleUrl(
actions: pageOptions.actions ?? undefined, actions: pageOptions.actions ?? undefined,
geolocation: pageOptions.geolocation ?? undefined, geolocation: pageOptions.geolocation ?? undefined,
skipTlsVerification: pageOptions.skipTlsVerification ?? false, skipTlsVerification: pageOptions.skipTlsVerification ?? false,
removeBase64Images: pageOptions.removeBase64Images ?? true,
mobile: pageOptions.mobile ?? false, mobile: pageOptions.mobile ?? false,
} }
@ -351,7 +353,10 @@ export async function scrapSingleUrl(
} }
//* TODO: add an optional to return markdown or structured/extracted content //* TODO: add an optional to return markdown or structured/extracted content
let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
const text = await parseMarkdown(cleanedHtml); let text = await parseMarkdown(cleanedHtml);
if (pageOptions.removeBase64Images) {
text = await removeBase64Images(text);
}
const insertedLogId = await logInsertPromise; const insertedLogId = await logInsertPromise;
ScrapeEvents.updateScrapeResult(insertedLogId, { ScrapeEvents.updateScrapeResult(insertedLogId, {

View File

@ -0,0 +1,7 @@
/**
 * Matches a Markdown image whose target is a base64-encoded `data:image/...` URI.
 *
 * - Group 1 captures the `![alt]` part so it can be kept in the replacement.
 * - The data-URI header (everything between `data:image/` and `;base64,`) may not
 *   contain `,` or `)`. In a data URI the first comma ends the header, so this
 *   stops the match from running out of a non-base64 data URI (for example
 *   `data:image/svg+xml;utf8,...`) and into a later base64 image on the same line.
 * - The payload is consumed lazily up to the first closing parenthesis.
 */
const BASE64_IMAGE_PATTERN = /(!\[.*?\])\(data:image\/[^,)\r\n]*;base64,.*?\)/g;

/**
 * Replaces the target of every base64-embedded Markdown image with the
 * placeholder `<Base64-Image-Removed>`, keeping the `![alt]` text intact.
 *
 * Images that point at regular URLs, and data URIs that are not base64-encoded,
 * are left untouched.
 *
 * The function stays `async` so existing callers that `await` it keep working,
 * even though the work itself is synchronous.
 *
 * @param markdown - Markdown text to clean.
 * @returns The Markdown with base64 image payloads replaced by the placeholder.
 */
export const removeBase64Images = async (
  markdown: string,
): Promise<string> => {
  // String.prototype.replace resets lastIndex for global regexes, so sharing
  // the module-level pattern across calls is safe.
  return markdown.replace(BASE64_IMAGE_PATTERN, '$1(<Base64-Image-Removed>)');
};