Support chrome-cdp and restructure sitemap fire-engine support.

This commit is contained in:
Thomas Kosmas 2024-07-15 18:40:43 +03:00
parent 8efd444ba2
commit 5c65ec58e5
3 changed files with 24 additions and 2 deletions

View File

@ -31,6 +31,7 @@ export async function scrapWithFireEngine({
fireEngineOptions?: FireEngineOptions;
headers?: Record<string, string>;
options?: any;
engine?: 'playwright' | 'chrome-cdp' | 'tlsclient';
}): Promise<FireEngineResponse> {
const logParams = {
url,
@ -49,7 +50,14 @@ export async function scrapWithFireEngine({
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
let endpoint = fireEngineOptionsParam.method === "get" ? "/request" : "/scrape";
let endpoint = "/scrape";
if(options?.endpoint === "request") {
endpoint = "/request";
}
let engine = fireEngineOptions?.engine ?? options?.engine ?? "playwright"; // TODO: decide whether fireEngineOptions.engine should take precedence over options.engine (it currently does).
console.log(
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
@ -65,6 +73,7 @@ export async function scrapWithFireEngine({
screenshot: screenshotParam,
headers: headers,
pageOptions: pageOptions,
engine: engine,
...fireEngineOptionsParam,
},
{

View File

@ -21,6 +21,7 @@ dotenv.config();
const baseScrapers = [
"fire-engine",
"fire-engine;chrome-cdp",
"scrapingBee",
"playwright",
"scrapingBeeLoad",
@ -70,6 +71,8 @@ function getScrapingFallbackOrder(
return !!process.env.SCRAPING_BEE_API_KEY;
case "fire-engine":
return !!process.env.FIRE_ENGINE_BETA_URL;
case "fire-engine;chrome-cdp":
return !!process.env.FIRE_ENGINE_BETA_URL;
case "playwright":
return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
default:
@ -80,6 +83,7 @@ function getScrapingFallbackOrder(
let defaultOrder = [
"scrapingBee",
"fire-engine",
"fire-engine;chrome-cdp",
"playwright",
"scrapingBeeLoad",
"fetch",
@ -136,8 +140,16 @@ export async function scrapSingleUrl(
metadata: { pageStatusCode?: number; pageError?: string | null };
} = { text: "", screenshot: "", metadata: {} };
let screenshot = "";
switch (method) {
case "fire-engine":
case "fire-engine;chrome-cdp":
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
if(method === "fire-engine;chrome-cdp"){
engine = "chrome-cdp";
}
if (process.env.FIRE_ENGINE_BETA_URL) {
console.log(`Scraping ${url} with Fire Engine`);
const response = await scrapWithFireEngine({
@ -146,6 +158,7 @@ export async function scrapSingleUrl(
screenshot: pageOptions.screenshot,
pageOptions: pageOptions,
headers: pageOptions.headers,
engine: engine,
});
scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot;

View File

@ -21,7 +21,7 @@ export async function getLinksFromSitemap(
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === 'fire-engine') {
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } });
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true },options:{endpoint:"request"} });
content = response.html;
}
} catch (error) {