mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-06 00:56:01 +08:00
Add chrome-cdp engine support and restructure sitemap fetching via fire-engine.
This commit is contained in:
parent
8efd444ba2
commit
5c65ec58e5
@ -31,6 +31,7 @@ export async function scrapWithFireEngine({
|
|||||||
fireEngineOptions?: FireEngineOptions;
|
fireEngineOptions?: FireEngineOptions;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
options?: any;
|
options?: any;
|
||||||
|
engine?: 'playwright' | 'chrome-cdp' | 'tlsclient';
|
||||||
}): Promise<FireEngineResponse> {
|
}): Promise<FireEngineResponse> {
|
||||||
const logParams = {
|
const logParams = {
|
||||||
url,
|
url,
|
||||||
@ -49,7 +50,14 @@ export async function scrapWithFireEngine({
|
|||||||
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
|
||||||
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
|
||||||
|
|
||||||
let endpoint = fireEngineOptionsParam.method === "get" ? "/request" : "/scrape";
|
|
||||||
|
let endpoint = "/scrape";
|
||||||
|
|
||||||
|
if(options?.endpoint === "request") {
|
||||||
|
endpoint = "/request";
|
||||||
|
}
|
||||||
|
|
||||||
|
let engine = fireEngineOptions?.engine ?? options?.engine ?? "playwright"; // do we want fireEngineOptions as first choice?
|
||||||
|
|
||||||
console.log(
|
console.log(
|
||||||
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
|
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
|
||||||
@ -65,6 +73,7 @@ export async function scrapWithFireEngine({
|
|||||||
screenshot: screenshotParam,
|
screenshot: screenshotParam,
|
||||||
headers: headers,
|
headers: headers,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
|
engine: engine,
|
||||||
...fireEngineOptionsParam,
|
...fireEngineOptionsParam,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -21,6 +21,7 @@ dotenv.config();
|
|||||||
|
|
||||||
const baseScrapers = [
|
const baseScrapers = [
|
||||||
"fire-engine",
|
"fire-engine",
|
||||||
|
"fire-engine;chrome-cdp",
|
||||||
"scrapingBee",
|
"scrapingBee",
|
||||||
"playwright",
|
"playwright",
|
||||||
"scrapingBeeLoad",
|
"scrapingBeeLoad",
|
||||||
@ -70,6 +71,8 @@ function getScrapingFallbackOrder(
|
|||||||
return !!process.env.SCRAPING_BEE_API_KEY;
|
return !!process.env.SCRAPING_BEE_API_KEY;
|
||||||
case "fire-engine":
|
case "fire-engine":
|
||||||
return !!process.env.FIRE_ENGINE_BETA_URL;
|
return !!process.env.FIRE_ENGINE_BETA_URL;
|
||||||
|
case "fire-engine;chrome-cdp":
|
||||||
|
return !!process.env.FIRE_ENGINE_BETA_URL;
|
||||||
case "playwright":
|
case "playwright":
|
||||||
return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
|
return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
|
||||||
default:
|
default:
|
||||||
@ -80,6 +83,7 @@ function getScrapingFallbackOrder(
|
|||||||
let defaultOrder = [
|
let defaultOrder = [
|
||||||
"scrapingBee",
|
"scrapingBee",
|
||||||
"fire-engine",
|
"fire-engine",
|
||||||
|
"fire-engine;chrome-cdp",
|
||||||
"playwright",
|
"playwright",
|
||||||
"scrapingBeeLoad",
|
"scrapingBeeLoad",
|
||||||
"fetch",
|
"fetch",
|
||||||
@ -136,8 +140,16 @@ export async function scrapSingleUrl(
|
|||||||
metadata: { pageStatusCode?: number; pageError?: string | null };
|
metadata: { pageStatusCode?: number; pageError?: string | null };
|
||||||
} = { text: "", screenshot: "", metadata: {} };
|
} = { text: "", screenshot: "", metadata: {} };
|
||||||
let screenshot = "";
|
let screenshot = "";
|
||||||
|
|
||||||
switch (method) {
|
switch (method) {
|
||||||
case "fire-engine":
|
case "fire-engine":
|
||||||
|
case "fire-engine;chrome-cdp":
|
||||||
|
|
||||||
|
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
|
||||||
|
if(method === "fire-engine;chrome-cdp"){
|
||||||
|
engine = "chrome-cdp";
|
||||||
|
}
|
||||||
|
|
||||||
if (process.env.FIRE_ENGINE_BETA_URL) {
|
if (process.env.FIRE_ENGINE_BETA_URL) {
|
||||||
console.log(`Scraping ${url} with Fire Engine`);
|
console.log(`Scraping ${url} with Fire Engine`);
|
||||||
const response = await scrapWithFireEngine({
|
const response = await scrapWithFireEngine({
|
||||||
@ -146,6 +158,7 @@ export async function scrapSingleUrl(
|
|||||||
screenshot: pageOptions.screenshot,
|
screenshot: pageOptions.screenshot,
|
||||||
pageOptions: pageOptions,
|
pageOptions: pageOptions,
|
||||||
headers: pageOptions.headers,
|
headers: pageOptions.headers,
|
||||||
|
engine: engine,
|
||||||
});
|
});
|
||||||
scraperResponse.text = response.html;
|
scraperResponse.text = response.html;
|
||||||
scraperResponse.screenshot = response.screenshot;
|
scraperResponse.screenshot = response.screenshot;
|
||||||
|
@ -21,7 +21,7 @@ export async function getLinksFromSitemap(
|
|||||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||||
content = response.data;
|
content = response.data;
|
||||||
} else if (mode === 'fire-engine') {
|
} else if (mode === 'fire-engine') {
|
||||||
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } });
|
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true },options:{endpoint:"request"} });
|
||||||
content = response.html;
|
content = response.html;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user