From 3edc3a3d1580b7ca10a51dbd852a37def6103c0c Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 5 Aug 2024 18:17:37 -0300 Subject: [PATCH] added fullpagescreenshot capabilities, wip on fire-engine side --- apps/api/openapi.json | 10 ++++++++++ apps/api/src/lib/default-values.ts | 1 + apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 4 ++++ apps/api/src/scraper/WebScraper/single_url.ts | 2 ++ 5 files changed, 18 insertions(+) diff --git a/apps/api/openapi.json b/apps/api/openapi.json index e0b583f0..fb0c4305 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -84,6 +84,11 @@ "description": "Include a screenshot of the top of the page that you are scraping.", "default": false }, + "fullPageScreenshot": { + "type": "boolean", + "description": "Include a full page screenshot of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", @@ -317,6 +322,11 @@ "description": "Include a screenshot of the top of the page that you are scraping.", "default": false }, + "fullPageScreenshot": { + "type": "boolean", + "description": "Include a full page screenshot of the page that you are scraping.", + "default": false + }, "waitFor": { "type": "integer", "description": "Wait x amount of milliseconds for the page to load to fetch content", diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index 3b303781..152f47d7 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -7,6 +7,7 @@ export const defaultPageOptions = { includeHtml: false, waitFor: 0, screenshot: false, + fullPageScreenshot: false, parsePDF: true }; diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 9ffa4810..4dc2050d 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -18,6 +18,7 @@ export type PageOptions = { fetchPageContent?: boolean; waitFor?: number; screenshot?: boolean; + fullPageScreenshot?: boolean; headers?: Record; replaceAllPathsWithAbsolutePaths?: boolean; parsePDF?: boolean; diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index ba67043c..dfe23a89 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger"; * @param url The URL to scrape * @param waitFor The time to wait for the page to load * @param screenshot Whether to take a screenshot + * @param fullPageScreenshot Whether to take a full page screenshot * @param pageOptions The options for the page * @param headers The headers to send with the request * @param options The options for the request @@ -20,6 +21,7 @@ export async function scrapWithFireEngine({ url, waitFor = 0, screenshot = false, + fullPageScreenshot = false, pageOptions = { parsePDF: true }, fireEngineOptions = {}, headers, @@ -28,6 +30,7 @@ export async function scrapWithFireEngine({ url: string; waitFor?: number; screenshot?: boolean; + fullPageScreenshot?: boolean; pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean }; fireEngineOptions?: FireEngineOptions; headers?: Record; @@ -71,6 +74,7 @@ export async function scrapWithFireEngine({ url: url, wait: waitParam, screenshot: screenshotParam, + fullPageScreenshot: fullPageScreenshot, headers: headers, pageOptions: pageOptions, ...fireEngineOptionsParam, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 4a44b23f..0fa2fc8b 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -128,6 +128,7 @@ export async function scrapSingleUrl( includeRawHtml: false, waitFor: 0, screenshot: false, + fullPageScreenshot: false, headers: undefined, }, extractorOptions: ExtractorOptions = { @@ -171,6 +172,7 @@ export async function scrapSingleUrl( url, waitFor: pageOptions.waitFor, screenshot: pageOptions.screenshot, + fullPageScreenshot: pageOptions.fullPageScreenshot, pageOptions: pageOptions, headers: pageOptions.headers, fireEngineOptions: {