Merge pull request #410 from mendableai/feat/fire-engine-chrome-cdp

Support chrome-cdp and restructure sitemap fire-engine support.
This commit is contained in:
Nicolas 2024-07-18 13:52:08 -04:00 committed by GitHub
commit f10f3f886b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 42 additions and 9 deletions

View File

@@ -50,6 +50,9 @@ export async function crawlCancelController(req: Request, res: Response) {
} }
try { try {
await getWebScraperQueue().client.del(job.lockKey());
await job.takeLock();
await job.discard();
await job.moveToFailed(Error("Job cancelled by user"), true); await job.moveToFailed(Error("Job cancelled by user"), true);
} catch (error) { } catch (error) {
console.error(error); console.error(error);
@@ -58,7 +61,7 @@ export async function crawlCancelController(req: Request, res: Response) {
const newJobState = await job.getState(); const newJobState = await job.getState();
res.json({ res.json({
status: newJobState === "failed" ? "cancelled" : "Cancelling...", status: "cancelled"
}); });
} catch (error) { } catch (error) {
console.error(error); console.error(error);

View File

@@ -19,7 +19,10 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
} }
} }
const jobStatus = await job.getState(); let jobStatus = await job.getState();
if (jobStatus === 'waiting' || jobStatus === 'stuck') {
jobStatus = 'active';
}
res.json({ res.json({
status: jobStatus, status: jobStatus,

View File

@@ -46,16 +46,24 @@ export async function scrapWithFireEngine({
try { try {
const reqParams = await generateRequestParams(url); const reqParams = await generateRequestParams(url);
const waitParam = reqParams["params"]?.wait ?? waitFor; const waitParam = reqParams["params"]?.wait ?? waitFor;
const engineParam = reqParams["params"]?.engine ?? fireEngineOptions?.engine ?? "playwright";
const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
let endpoint = fireEngineOptionsParam.method === "get" ? "/request" : "/scrape";
let endpoint = "/scrape";
if(options?.endpoint === "request") {
endpoint = "/request";
}
let engine = engineParam; // do we want fireEngineOptions as first choice?
console.log( console.log(
`[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` `[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}`
); );
console.log(fireEngineOptionsParam) // console.log(fireEngineOptionsParam)
const response = await axios.post( const response = await axios.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint, process.env.FIRE_ENGINE_BETA_URL + endpoint,
@@ -77,14 +85,14 @@ export async function scrapWithFireEngine({
if (response.status !== 200) { if (response.status !== 200) {
console.error( console.error(
`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` `[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`
); );
logParams.error_message = response.data?.pageError; logParams.error_message = response.data?.pageError;
logParams.response_code = response.data?.pageStatusCode; logParams.response_code = response.data?.pageStatusCode;
if(response.data && response.data?.pageStatusCode !== 200) { if(response.data && response.data?.pageStatusCode !== 200) {
console.error(`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`); console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`);
} }
return { return {

View File

@@ -22,6 +22,7 @@ dotenv.config();
const baseScrapers = [ const baseScrapers = [
"fire-engine", "fire-engine",
"fire-engine;chrome-cdp",
"scrapingBee", "scrapingBee",
"playwright", "playwright",
"scrapingBeeLoad", "scrapingBeeLoad",
@@ -71,6 +72,8 @@ function getScrapingFallbackOrder(
return !!process.env.SCRAPING_BEE_API_KEY; return !!process.env.SCRAPING_BEE_API_KEY;
case "fire-engine": case "fire-engine":
return !!process.env.FIRE_ENGINE_BETA_URL; return !!process.env.FIRE_ENGINE_BETA_URL;
case "fire-engine;chrome-cdp":
return !!process.env.FIRE_ENGINE_BETA_URL;
case "playwright": case "playwright":
return !!process.env.PLAYWRIGHT_MICROSERVICE_URL; return !!process.env.PLAYWRIGHT_MICROSERVICE_URL;
default: default:
@@ -81,6 +84,7 @@ function getScrapingFallbackOrder(
let defaultOrder = [ let defaultOrder = [
"scrapingBee", "scrapingBee",
"fire-engine", "fire-engine",
"fire-engine;chrome-cdp",
"playwright", "playwright",
"scrapingBeeLoad", "scrapingBeeLoad",
"fetch", "fetch",
@@ -139,8 +143,16 @@ export async function scrapSingleUrl(
metadata: { pageStatusCode?: number; pageError?: string | null }; metadata: { pageStatusCode?: number; pageError?: string | null };
} = { text: "", screenshot: "", metadata: {} }; } = { text: "", screenshot: "", metadata: {} };
let screenshot = ""; let screenshot = "";
switch (method) { switch (method) {
case "fire-engine": case "fire-engine":
case "fire-engine;chrome-cdp":
let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright";
if(method === "fire-engine;chrome-cdp"){
engine = "chrome-cdp";
}
if (process.env.FIRE_ENGINE_BETA_URL) { if (process.env.FIRE_ENGINE_BETA_URL) {
console.log(`Scraping ${url} with Fire Engine`); console.log(`Scraping ${url} with Fire Engine`);
const response = await scrapWithFireEngine({ const response = await scrapWithFireEngine({
@@ -149,6 +161,9 @@ export async function scrapSingleUrl(
screenshot: pageOptions.screenshot, screenshot: pageOptions.screenshot,
pageOptions: pageOptions, pageOptions: pageOptions,
headers: pageOptions.headers, headers: pageOptions.headers,
fireEngineOptions: {
engine: engine,
}
}); });
scraperResponse.text = response.html; scraperResponse.text = response.html;
scraperResponse.screenshot = response.screenshot; scraperResponse.screenshot = response.screenshot;

View File

@@ -21,7 +21,7 @@ export async function getLinksFromSitemap(
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data; content = response.data;
} else if (mode === 'fire-engine') { } else if (mode === 'fire-engine') {
const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } }); const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true },options:{endpoint:"request"} });
content = response.html; content = response.html;
} }
} catch (error) { } catch (error) {

View File

@@ -175,6 +175,7 @@ export const urlSpecificParams = {
"firecrawl.dev":{ "firecrawl.dev":{
defaultScraper: "fire-engine", defaultScraper: "fire-engine",
params: { params: {
engine: "playwright",
headers: { headers: {
"User-Agent": "User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",

View File

@@ -7,11 +7,14 @@ export function getWebScraperQueue() {
if (!webScraperQueue) { if (!webScraperQueue) {
webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, { webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, {
settings: { settings: {
lockDuration: 2 * 60 * 1000, // 1 minute in milliseconds, lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds,
lockRenewTime: 15 * 1000, // 15 seconds in milliseconds lockRenewTime: 15 * 1000, // 15 seconds in milliseconds
stalledInterval: 30 * 1000, stalledInterval: 30 * 1000,
maxStalledCount: 10, maxStalledCount: 10,
}, },
defaultJobOptions:{
attempts: 5
}
}); });
console.log("Web scraper queue created"); console.log("Web scraper queue created");
} }