diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 45aa3c9b..14d42025 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -140,6 +140,7 @@ export const scrapeOptions = z.object({ languages: z.string().array().optional(), }).optional(), skipTlsVerification: z.boolean().default(false), + removeBase64Images: z.boolean().default(true), }).strict(strictMessage) @@ -468,7 +469,8 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions { parsePDF: x.parsePDF, actions: x.actions as Action[], // no strict null checking grrrr - mogery geolocation: x.location ?? x.geolocation, - skipTlsVerification: x.skipTlsVerification + skipTlsVerification: x.skipTlsVerification, + removeBase64Images: x.removeBase64Images, }; } diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 81bca571..87aa44ac 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -58,6 +58,7 @@ export type PageOptions = { country?: string; }; skipTlsVerification?: boolean; + removeBase64Images?: boolean; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 1817a07b..615e49b4 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -595,6 +595,7 @@ export class WebScraperDataProvider { actions: options.pageOptions?.actions ?? undefined, geolocation: options.pageOptions?.geolocation ?? undefined, skipTlsVerification: options.pageOptions?.skipTlsVerification ?? false, + removeBase64Images: options.pageOptions?.removeBase64Images ?? true, }; this.extractorOptions = options.extractorOptions ?? { mode: "markdown" }; this.replaceAllPathsWithAbsolutePaths = diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 7332874f..8a37457c 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -28,7 +28,7 @@ export async function scrapWithFireEngine({ waitFor = 0, screenshot = false, fullPageScreenshot = false, - pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false }, + pageOptions = { parsePDF: true, atsv: false, useFastMode: false, disableJsDom: false, geolocation: { country: "US" }, skipTlsVerification: false, removeBase64Images: true }, fireEngineOptions = {}, headers, options, @@ -40,7 +40,7 @@ export async function scrapWithFireEngine({ waitFor?: number; screenshot?: boolean; fullPageScreenshot?: boolean; - pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean }; + pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean, atsv?: boolean, useFastMode?: boolean, disableJsDom?: boolean, geolocation?: { country?: string }, skipTlsVerification?: boolean, removeBase64Images?: boolean }; fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; @@ -120,6 +120,7 @@ export async function scrapWithFireEngine({ scrollXPaths: pageOptions?.scrollXPaths ?? [], geolocation: pageOptions?.geolocation, skipTlsVerification: pageOptions?.skipTlsVerification ?? false, + removeBase64Images: pageOptions?.removeBase64Images ?? true, actions: actions, }, { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 611a7b5c..6f5ec541 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -22,6 +22,7 @@ import { Logger } from "../../lib/logger"; import { ScrapeEvents } from "../../lib/scrape-events"; import { clientSideError } from "../../strings"; import { ScrapeActionContent } from "../../lib/entities"; +import { removeBase64Images } from "./utils/removeBase64Images"; dotenv.config(); @@ -159,6 +160,7 @@ export async function scrapSingleUrl( actions: pageOptions.actions ?? undefined, geolocation: pageOptions.geolocation ?? undefined, skipTlsVerification: pageOptions.skipTlsVerification ?? false, + removeBase64Images: pageOptions.removeBase64Images ?? true, } if (extractorOptions) { @@ -350,7 +352,10 @@ export async function scrapSingleUrl( } //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); - const text = await parseMarkdown(cleanedHtml); + let text = await parseMarkdown(cleanedHtml); + if (pageOptions.removeBase64Images) { + text = await removeBase64Images(text); + } const insertedLogId = await logInsertPromise; ScrapeEvents.updateScrapeResult(insertedLogId, { diff --git a/apps/api/src/scraper/WebScraper/utils/removeBase64Images.ts b/apps/api/src/scraper/WebScraper/utils/removeBase64Images.ts new file mode 100644 index 00000000..2845589c --- /dev/null +++ b/apps/api/src/scraper/WebScraper/utils/removeBase64Images.ts @@ -0,0 +1,7 @@ +export const removeBase64Images = async ( + markdown: string, +) => { + const regex = /(!\[.*?\])\(data:image\/.*?;base64,.*?\)/g; + markdown = markdown.replace(regex, '$1()'); + return markdown; +};