From 5c65ec58e594fe56da2d15ad13f148303faf0ace Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Mon, 15 Jul 2024 18:40:43 +0300 Subject: [PATCH] Support chrome-cdp and restructure sitemap fire-engine support. --- .../src/scraper/WebScraper/scrapers/fireEngine.ts | 11 ++++++++++- apps/api/src/scraper/WebScraper/single_url.ts | 13 +++++++++++++ apps/api/src/scraper/WebScraper/sitemap.ts | 2 +- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 30412f40..bbcf374c 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -31,6 +31,7 @@ export async function scrapWithFireEngine({ fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; + engine?: 'playwright' | 'chrome-cdp' | 'tlsclient'; }): Promise { const logParams = { url, @@ -49,7 +50,14 @@ export async function scrapWithFireEngine({ const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; - let endpoint = fireEngineOptionsParam.method === "get" ? "/request" : "/scrape"; + + let endpoint = "/scrape"; + + if(options?.endpoint === "request") { + endpoint = "/request"; + } + + let engine = fireEngineOptions?.engine ?? options?.engine ?? "playwright"; // do we want fireEngineOptions as first choice? console.log( `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` @@ -65,6 +73,7 @@ export async function scrapWithFireEngine({ screenshot: screenshotParam, headers: headers, pageOptions: pageOptions, + engine: engine, ...fireEngineOptionsParam, }, { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index d24e5c2e..4ea46097 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -21,6 +21,7 @@ dotenv.config(); const baseScrapers = [ "fire-engine", + "fire-engine;chrome-cdp", "scrapingBee", "playwright", "scrapingBeeLoad", @@ -70,6 +71,8 @@ function getScrapingFallbackOrder( return !!process.env.SCRAPING_BEE_API_KEY; case "fire-engine": return !!process.env.FIRE_ENGINE_BETA_URL; + case "fire-engine;chrome-cdp": + return !!process.env.FIRE_ENGINE_BETA_URL; case "playwright": return !!process.env.PLAYWRIGHT_MICROSERVICE_URL; default: @@ -80,6 +83,7 @@ function getScrapingFallbackOrder( let defaultOrder = [ "scrapingBee", "fire-engine", + "fire-engine;chrome-cdp", "playwright", "scrapingBeeLoad", "fetch", @@ -136,8 +140,16 @@ export async function scrapSingleUrl( metadata: { pageStatusCode?: number; pageError?: string | null }; } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; + switch (method) { case "fire-engine": + case "fire-engine;chrome-cdp": + + let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright"; + if(method === "fire-engine;chrome-cdp"){ + engine = "chrome-cdp"; + } + if (process.env.FIRE_ENGINE_BETA_URL) { console.log(`Scraping ${url} with Fire Engine`); const response = await scrapWithFireEngine({ @@ -146,6 +158,7 @@ export async function scrapSingleUrl( screenshot: pageOptions.screenshot, pageOptions: pageOptions, headers: pageOptions.headers, + engine: engine, }); scraperResponse.text = response.html; scraperResponse.screenshot = response.screenshot; diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 1184ef27..c9368f41 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -21,7 +21,7 @@ export async function getLinksFromSitemap( const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } else if (mode === 'fire-engine') { - const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } }); + const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true },options:{endpoint:"request"} }); content = response.html; } } catch (error) {