From 5c65ec58e594fe56da2d15ad13f148303faf0ace Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Mon, 15 Jul 2024 18:40:43 +0300 Subject: [PATCH 1/5] Support chrome-cdp and restructure sitemap fire-engine support. --- .../src/scraper/WebScraper/scrapers/fireEngine.ts | 11 ++++++++++- apps/api/src/scraper/WebScraper/single_url.ts | 13 +++++++++++++ apps/api/src/scraper/WebScraper/sitemap.ts | 2 +- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 30412f40..bbcf374c 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -31,6 +31,7 @@ export async function scrapWithFireEngine({ fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; + engine?: 'playwright' | 'chrome-cdp' | 'tlsclient'; }): Promise { const logParams = { url, @@ -49,7 +50,14 @@ export async function scrapWithFireEngine({ const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; - let endpoint = fireEngineOptionsParam.method === "get" ? "/request" : "/scrape"; + + let endpoint = "/scrape"; + + if(options?.endpoint === "request") { + endpoint = "/request"; + } + + let engine = fireEngineOptions?.engine ?? options?.engine ?? "playwright"; // do we want fireEngineOptions as first choice? console.log( `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` @@ -65,6 +73,7 @@ export async function scrapWithFireEngine({ screenshot: screenshotParam, headers: headers, pageOptions: pageOptions, + engine: engine, ...fireEngineOptionsParam, }, { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index d24e5c2e..4ea46097 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -21,6 +21,7 @@ dotenv.config(); const baseScrapers = [ "fire-engine", + "fire-engine;chrome-cdp", "scrapingBee", "playwright", "scrapingBeeLoad", @@ -70,6 +71,8 @@ function getScrapingFallbackOrder( return !!process.env.SCRAPING_BEE_API_KEY; case "fire-engine": return !!process.env.FIRE_ENGINE_BETA_URL; + case "fire-engine;chrome-cdp": + return !!process.env.FIRE_ENGINE_BETA_URL; case "playwright": return !!process.env.PLAYWRIGHT_MICROSERVICE_URL; default: @@ -80,6 +83,7 @@ function getScrapingFallbackOrder( let defaultOrder = [ "scrapingBee", "fire-engine", + "fire-engine;chrome-cdp", "playwright", "scrapingBeeLoad", "fetch", @@ -136,8 +140,16 @@ export async function scrapSingleUrl( metadata: { pageStatusCode?: number; pageError?: string | null }; } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; + switch (method) { case "fire-engine": + case "fire-engine;chrome-cdp": + + let engine: "playwright" | "chrome-cdp" | "tlsclient" = "playwright"; + if(method === "fire-engine;chrome-cdp"){ + engine = "chrome-cdp"; + } + if (process.env.FIRE_ENGINE_BETA_URL) { console.log(`Scraping ${url} with Fire Engine`); const response = await scrapWithFireEngine({ @@ -146,6 +158,7 @@ export async function scrapSingleUrl( screenshot: pageOptions.screenshot, pageOptions: pageOptions, headers: pageOptions.headers, + engine: engine, }); scraperResponse.text = response.html; scraperResponse.screenshot = response.screenshot; diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 1184ef27..c9368f41 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -21,7 +21,7 @@ export async function getLinksFromSitemap( const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } else if (mode === 'fire-engine') { - const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine: "request", method: "get", mobileProxy: true } }); + const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true },options:{endpoint:"request"} }); content = response.html; } } catch (error) { From d2de01d34236d98205c4420eeb85103da5b8023c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 18 Jul 2024 13:19:44 -0400 Subject: [PATCH 2/5] Nick: fixes --- apps/api/src/controllers/status.ts | 5 ++++- .../src/scraper/WebScraper/scrapers/fireEngine.ts | 13 ++++++------- apps/api/src/scraper/WebScraper/single_url.ts | 4 +++- .../WebScraper/utils/custom/website_params.ts | 1 + apps/api/src/services/queue-service.ts | 5 ++++- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts index ec4ec131..231885f4 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/status.ts @@ -19,7 +19,10 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons } } - const jobStatus = await job.getState(); + let jobStatus = await job.getState(); + if (jobStatus === 'waiting' || jobStatus === 'stuck') { + jobStatus = 'active'; + } res.json({ status: jobStatus, diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index bbcf374c..e547c019 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -31,7 +31,6 @@ export async function scrapWithFireEngine({ fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; - engine?: 'playwright' | 'chrome-cdp' | 'tlsclient'; }): Promise { const logParams = { url, @@ -47,6 +46,7 @@ export async function scrapWithFireEngine({ try { const reqParams = await generateRequestParams(url); const waitParam = reqParams["params"]?.wait ?? waitFor; + const engineParam = reqParams["params"]?.engine ?? fireEngineOptions?.engine ?? "playwright"; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; @@ -57,13 +57,13 @@ export async function scrapWithFireEngine({ endpoint = "/request"; } - let engine = fireEngineOptions?.engine ?? options?.engine ?? "playwright"; // do we want fireEngineOptions as first choice? + let engine = engineParam; // do we want fireEngineOptions as first choice? console.log( - `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` + `[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` ); - console.log(fireEngineOptionsParam) + // console.log(fireEngineOptionsParam) const response = await axios.post( process.env.FIRE_ENGINE_BETA_URL + endpoint, @@ -73,7 +73,6 @@ export async function scrapWithFireEngine({ screenshot: screenshotParam, headers: headers, pageOptions: pageOptions, - engine: engine, ...fireEngineOptionsParam, }, { @@ -86,14 +85,14 @@ export async function scrapWithFireEngine({ if (response.status !== 200) { console.error( - `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` + `[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}` ); logParams.error_message = response.data?.pageError; logParams.response_code = response.data?.pageStatusCode; if(response.data && response.data?.pageStatusCode !== 200) { - console.error(`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`); + console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`); } return { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index ca290a51..8fbd31e4 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -161,7 +161,9 @@ export async function scrapSingleUrl( screenshot: pageOptions.screenshot, pageOptions: pageOptions, headers: pageOptions.headers, - engine: engine, + fireEngineOptions: { + engine: engine, + } }); scraperResponse.text = response.html; scraperResponse.screenshot = response.screenshot; diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index a1c256cc..4da56619 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -175,6 +175,7 @@ export const urlSpecificParams = { "firecrawl.dev":{ defaultScraper: "fire-engine", params: { + engine: "playwright", headers: { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index f93c3504..6c817a4a 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -7,11 +7,14 @@ export function getWebScraperQueue() { if (!webScraperQueue) { webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, { settings: { - lockDuration: 2 * 60 * 1000, // 1 minute in milliseconds, + lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds, lockRenewTime: 15 * 1000, // 15 seconds in milliseconds stalledInterval: 30 * 1000, maxStalledCount: 10, }, + defaultJobOptions:{ + attempts: 5 + } }); console.log("Web scraper queue created"); } From ce804d3c204241c2072e045cdd2305adb08261b9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 18 Jul 2024 13:40:24 -0400 Subject: [PATCH 3/5] Update crawl-cancel.ts --- apps/api/src/controllers/crawl-cancel.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/crawl-cancel.ts index 160824ff..8a2c91f9 100644 --- a/apps/api/src/controllers/crawl-cancel.ts +++ b/apps/api/src/controllers/crawl-cancel.ts @@ -50,6 +50,8 @@ export async function crawlCancelController(req: Request, res: Response) { } try { + await getWebScraperQueue().client.del(job.lockKey()); + await job.takeLock(); await job.moveToFailed(Error("Job cancelled by user"), true); } catch (error) { console.error(error); From 11768571ed9a9df91db34c98057d14f3317186f2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 18 Jul 2024 13:43:03 -0400 Subject: [PATCH 4/5] Update crawl-cancel.ts --- apps/api/src/controllers/crawl-cancel.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/crawl-cancel.ts index 8a2c91f9..a5e28323 100644 --- a/apps/api/src/controllers/crawl-cancel.ts +++ b/apps/api/src/controllers/crawl-cancel.ts @@ -60,7 +60,7 @@ export async function crawlCancelController(req: Request, res: Response) { const newJobState = await job.getState(); res.json({ - status: newJobState === "failed" ? "cancelled" : "Cancelling...", + status: "cancelled" }); } catch (error) { console.error(error); From 9a1a227797c2d0f51d3da36f414702e2cfbab08e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 18 Jul 2024 13:49:51 -0400 Subject: [PATCH 5/5] Update crawl-cancel.ts --- apps/api/src/controllers/crawl-cancel.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/crawl-cancel.ts index a5e28323..ff4b2c58 100644 --- a/apps/api/src/controllers/crawl-cancel.ts +++ b/apps/api/src/controllers/crawl-cancel.ts @@ -52,6 +52,7 @@ export async function crawlCancelController(req: Request, res: Response) { try { await getWebScraperQueue().client.del(job.lockKey()); await job.takeLock(); + await job.discard(); await job.moveToFailed(Error("Job cancelled by user"), true); } catch (error) { console.error(error);