Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-05 13:00:37 +08:00
Merge pull request #410 from mendableai/feat/fire-engine-chrome-cdp
Support chrome-cdp and restructure sitemap fire-engine support.
Commit: f10f3f886b
@@ -50,6 +50,9 @@ export async function crawlCancelController(req: Request, res: Response) {
   }
 
+  try {
     await getWebScraperQueue().client.del(job.lockKey());
     await job.takeLock();
     await job.discard();
     await job.moveToFailed(Error("Job cancelled by user"), true);
+  } catch (error) {
+    console.error(error);

@@ -58,7 +61,7 @@ export async function crawlCancelController(req: Request, res: Response) {
   const newJobState = await job.getState();
 
   res.json({
-    status: newJobState === "failed" ? "cancelled" : "Cancelling...",
+    status: "cancelled"
   });
 } catch (error) {
   console.error(error);

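For context, a minimal sketch of the cancellation pattern these two hunks introduce, assuming a Bull Job instance; the helper name cancelJob is illustrative and not part of the codebase.

import { Job } from "bull";

// Illustrative helper showing the try/catch wrapping added above: lock handling
// can throw (e.g. a worker still holds the lock), but the controller should
// still be able to respond with { status: "cancelled" } afterwards.
async function cancelJob(job: Job): Promise<void> {
  try {
    await job.takeLock();                                         // take the lock away from the worker
    await job.discard();                                          // prevent further retries
    await job.moveToFailed(Error("Job cancelled by user"), true); // mark failed, ignoring the lock
  } catch (error) {
    console.error(error); // swallow lock errors so the HTTP response is still sent
  }
}
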
@@ -19,7 +19,10 @@ export async function crawlJobStatusPreviewController(req: Request, res: Response) {
     }
   }
 
-  const jobStatus = await job.getState();
+  let jobStatus = await job.getState();
+  if (jobStatus === 'waiting' || jobStatus === 'stuck') {
+    jobStatus = 'active';
+  }
 
   res.json({
     status: jobStatus,

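A minimal sketch of the state normalization above, assuming Bull's job-state strings; toApiStatus is an illustrative name.

// Bull reports states such as "waiting", "active", "completed", "failed",
// "delayed" and "stuck"; the preview endpoint now collapses "waiting" and
// "stuck" into "active" before reporting to API clients.
type BullState = "waiting" | "active" | "completed" | "failed" | "delayed" | "stuck";

function toApiStatus(state: BullState): BullState {
  return state === "waiting" || state === "stuck" ? "active" : state;
}

// toApiStatus("stuck")  -> "active"
// toApiStatus("failed") -> "failed"
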
@@ -46,16 +46,24 @@ export async function scrapWithFireEngine({
   try {
     const reqParams = await generateRequestParams(url);
     const waitParam = reqParams["params"]?.wait ?? waitFor;
+    const engineParam = reqParams["params"]?.engine ?? fireEngineOptions?.engine ?? "playwright";
     const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
     const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
 
-    let endpoint = fireEngineOptionsParam.method === "get" ? "/request" : "/scrape";
+    let endpoint = "/scrape";
+
+    if(options?.endpoint === "request") {
+      endpoint = "/request";
+    }
+
+    let engine = engineParam; // do we want fireEngineOptions as first choice?
 
     console.log(
-      `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
+      `[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
     );
 
-    console.log(fireEngineOptionsParam)
+    // console.log(fireEngineOptionsParam)
 
     const response = await axios.post(
       process.env.FIRE_ENGINE_BETA_URL + endpoint,

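A minimal sketch of the precedence this hunk implements; resolveEngine and resolveEndpoint are illustrative names, not functions in the codebase.

type FireEngine = "playwright" | "chrome-cdp" | "tlsclient";

// Per-URL params (from generateRequestParams / urlSpecificParams) win over the
// caller's fireEngineOptions, which in turn fall back to "playwright".
function resolveEngine(urlParamsEngine?: FireEngine, callerEngine?: FireEngine): FireEngine {
  return urlParamsEngine ?? callerEngine ?? "playwright";
}

// The endpoint is now chosen by an explicit option instead of being inferred
// from the HTTP method ("get" no longer implies /request).
function resolveEndpoint(endpointOption?: "request" | "scrape"): "/request" | "/scrape" {
  return endpointOption === "request" ? "/request" : "/scrape";
}
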
@@ -77,14 +85,14 @@ export async function scrapWithFireEngine({
 
     if (response.status !== 200) {
       console.error(
-        `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
+        `[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`
       );
 
       logParams.error_message = response.data?.pageError;
       logParams.response_code = response.data?.pageStatusCode;
 
       if(response.data && response.data?.pageStatusCode !== 200) {
-        console.error(`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`);
+        console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`);
       }
 
       return {

@@ -22,6 +22,7 @@ dotenv.config();
 
 const baseScrapers = [
   "fire-engine",
+  "fire-engine;chrome-cdp",
   "scrapingBee",
   "playwright",
   "scrapingBeeLoad",

@@ -71,6 +72,8 @@ function getScrapingFallbackOrder(
       return !!process.env.SCRAPING_BEE_API_KEY;
     case "fire-engine":
       return !!process.env.FIRE_ENGINE_BETA_URL;
+    case "fire-engine;chrome-cdp":
+      return !!process.env.FIRE_ENGINE_BETA_URL;
     case "playwright":
       return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
     default:

@@ -81,6 +84,7 @@ function getScrapingFallbackOrder(
   let defaultOrder = [
     "scrapingBee",
     "fire-engine",
+    "fire-engine;chrome-cdp",
     "playwright",
     "scrapingBeeLoad",
     "fetch",

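A minimal sketch combining the two hunks above: the new scraper id behaves like any other entry in the fallback order and survives only when FIRE_ENGINE_BETA_URL is configured; isConfigured and availableScrapers are illustrative names.

const defaultOrder = [
  "scrapingBee",
  "fire-engine",
  "fire-engine;chrome-cdp",
  "playwright",
  "scrapingBeeLoad",
  "fetch",
];

// Mirror of the provider check above, reduced to the sketch's needs.
function isConfigured(scraper: string): boolean {
  switch (scraper) {
    case "scrapingBee":
    case "scrapingBeeLoad":
      return !!process.env.SCRAPING_BEE_API_KEY;
    case "fire-engine":
    case "fire-engine;chrome-cdp":
      return !!process.env.FIRE_ENGINE_BETA_URL;
    case "playwright":
      return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
    default:
      return true; // "fetch" needs no configuration
  }
}

const availableScrapers = defaultOrder.filter(isConfigured);
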
@@ -139,8 +143,16 @@ export async function scrapSingleUrl(
     metadata: { pageStatusCode?: number; pageError?: string | null };
   } = { text: "", screenshot: "", metadata: {} };
   let screenshot = "";
   switch (method) {
     case "fire-engine":
+    case "fire-engine;chrome-cdp":
+      let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
+      if(method === "fire-engine;chrome-cdp"){
+        engine = "chrome-cdp";
+      }
+
       if (process.env.FIRE_ENGINE_BETA_URL) {
         console.log(`Scraping ${url} with Fire Engine`);
         const response = await scrapWithFireEngine({

@@ -149,6 +161,9 @@ export async function scrapSingleUrl(
           screenshot: pageOptions.screenshot,
           pageOptions: pageOptions,
           headers: pageOptions.headers,
+          fireEngineOptions: {
+            engine: engine,
+          }
         });
         scraperResponse.text = response.html;
         scraperResponse.screenshot = response.screenshot;

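A minimal sketch of how the composite scraper id selects an engine that is then forwarded through fireEngineOptions; engineForMethod is an illustrative name.

type FireEngine = "playwright" | "chrome-cdp" | "tlsclient";

// "fire-engine" keeps the default playwright engine, while the new
// "fire-engine;chrome-cdp" id drives Chrome over the DevTools protocol.
function engineForMethod(method: "fire-engine" | "fire-engine;chrome-cdp"): FireEngine {
  return method === "fire-engine;chrome-cdp" ? "chrome-cdp" : "playwright";
}

// The engine is then passed through fireEngineOptions, e.g.:
//   await scrapWithFireEngine({ url, waitFor, screenshot, pageOptions, headers,
//     fireEngineOptions: { engine: engineForMethod(method) } });
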
@@ -21,7 +21,7 @@ export async function getLinksFromSitemap(
       const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       content = response.data;
     } else if (mode === 'fire-engine') {
-      const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } });
+      const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true }, options: { endpoint: "request" } });
       content = response.html;
     }
   } catch (error) {

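A minimal usage sketch of the restructured sitemap fetch; fetchSitemapXml is an illustrative name, and the import path for scrapWithFireEngine is assumed.

// The import path is assumed for this sketch.
import { scrapWithFireEngine } from "./scrapers/fireEngine";

// The GET now goes through fire-engine's /request endpoint, selected via
// options.endpoint, instead of an engine named "request".
async function fetchSitemapXml(sitemapUrl: string): Promise<string> {
  const response = await scrapWithFireEngine({
    url: sitemapUrl,
    fireEngineOptions: { method: "get", mobileProxy: true },
    options: { endpoint: "request" },
  });
  return response.html ?? "";
}
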
@@ -175,6 +175,7 @@ export const urlSpecificParams = {
   "firecrawl.dev":{
     defaultScraper: "fire-engine",
     params: {
+      engine: "playwright",
       headers: {
         "User-Agent":
           "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",

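A minimal sketch of how a per-site engine override like this one reaches scrapWithFireEngine; paramsForUrl is illustrative, the real lookup happens inside generateRequestParams.

// A hostname match supplies params (including the new "engine" field) that
// scrapWithFireEngine reads via reqParams["params"]?.engine, taking precedence
// over whatever the caller passed in fireEngineOptions.
const urlSpecificParams: Record<string, { defaultScraper?: string; params?: { engine?: string } }> = {
  "firecrawl.dev": { defaultScraper: "fire-engine", params: { engine: "playwright" } },
};

function paramsForUrl(url: string) {
  const hostname = new URL(url).hostname.replace(/^www\./, "");
  return urlSpecificParams[hostname] ?? {};
}

// paramsForUrl("https://firecrawl.dev/pricing").params?.engine -> "playwright"
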
@@ -7,11 +7,14 @@ export function getWebScraperQueue() {
   if (!webScraperQueue) {
     webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
       settings: {
-        lockDuration: 2 * 60 * 1000, // 1 minute in milliseconds,
+        lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
         lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
         stalledInterval: 30 * 1000,
         maxStalledCount: 10,
       },
+      defaultJobOptions:{
+        attempts: 5
+      }
     });
     console.log("Web scraper queue created");
   }

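A minimal sketch of the resulting Bull queue construction, with comments on what each setting controls; values follow the hunk above, and the comments reflect standard Bull semantics.

import Queue from "bull";

const webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
  settings: {
    lockDuration: 1 * 60 * 1000, // a worker's lock on a job lasts 1 minute per renewal
    lockRenewTime: 15 * 1000,    // the lock is renewed every 15 seconds while processing
    stalledInterval: 30 * 1000,  // check for stalled (lock-expired) jobs every 30 seconds
    maxStalledCount: 10,         // re-queue a stalled job at most 10 times before failing it
  },
  defaultJobOptions: {
    attempts: 5,                 // each job is tried at most 5 times in total
  },
});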