From d2de01d34236d98205c4420eeb85103da5b8023c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 18 Jul 2024 13:19:44 -0400 Subject: [PATCH] Nick: fixes --- apps/api/src/controllers/status.ts | 5 ++++- .../src/scraper/WebScraper/scrapers/fireEngine.ts | 13 ++++++------- apps/api/src/scraper/WebScraper/single_url.ts | 4 +++- .../WebScraper/utils/custom/website_params.ts | 1 + apps/api/src/services/queue-service.ts | 5 ++++- 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts index ec4ec131..231885f4 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/status.ts @@ -19,7 +19,10 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons } } - const jobStatus = await job.getState(); + let jobStatus = await job.getState(); + if (jobStatus === 'waiting' || jobStatus === 'stuck') { + jobStatus = 'active'; + } res.json({ status: jobStatus, diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index bbcf374c..e547c019 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -31,7 +31,6 @@ export async function scrapWithFireEngine({ fireEngineOptions?: FireEngineOptions; headers?: Record; options?: any; - engine?: 'playwright' | 'chrome-cdp' | 'tlsclient'; }): Promise { const logParams = { url, @@ -47,6 +46,7 @@ export async function scrapWithFireEngine({ try { const reqParams = await generateRequestParams(url); const waitParam = reqParams["params"]?.wait ?? waitFor; + const engineParam = reqParams["params"]?.engine ?? fireEngineOptions?.engine ?? "playwright"; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; @@ -57,13 +57,13 @@ export async function scrapWithFireEngine({ endpoint = "/request"; } - let engine = fireEngineOptions?.engine ?? options?.engine ?? "playwright"; // do we want fireEngineOptions as first choice? + let engine = engineParam; // do we want fireEngineOptions as first choice? console.log( - `[Fire-Engine] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` + `[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` ); - console.log(fireEngineOptionsParam) + // console.log(fireEngineOptionsParam) const response = await axios.post( process.env.FIRE_ENGINE_BETA_URL + endpoint, @@ -73,7 +73,6 @@ export async function scrapWithFireEngine({ screenshot: screenshotParam, headers: headers, pageOptions: pageOptions, - engine: engine, ...fireEngineOptionsParam, }, { @@ -86,14 +85,14 @@ export async function scrapWithFireEngine({ if (response.status !== 200) { console.error( - `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}` + `[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}` ); logParams.error_message = response.data?.pageError; logParams.response_code = response.data?.pageStatusCode; if(response.data && response.data?.pageStatusCode !== 200) { - console.error(`[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`); + console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`); } return { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index ca290a51..8fbd31e4 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -161,7 +161,9 @@ export async function scrapSingleUrl( screenshot: pageOptions.screenshot, pageOptions: pageOptions, headers: pageOptions.headers, - engine: engine, + fireEngineOptions: { + engine: engine, + } }); scraperResponse.text = response.html; scraperResponse.screenshot = response.screenshot; diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index a1c256cc..4da56619 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -175,6 +175,7 @@ export const urlSpecificParams = { "firecrawl.dev":{ defaultScraper: "fire-engine", params: { + engine: "playwright", headers: { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index f93c3504..6c817a4a 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -7,11 +7,14 @@ export function getWebScraperQueue() { if (!webScraperQueue) { webScraperQueue = new Queue("web-scraper", process.env.REDIS_URL, { settings: { - lockDuration: 2 * 60 * 1000, // 1 minute in milliseconds, + lockDuration: 1 * 60 * 1000, // 1 minute in milliseconds, lockRenewTime: 15 * 1000, // 15 seconds in milliseconds stalledInterval: 30 * 1000, maxStalledCount: 10, }, + defaultJobOptions:{ + attempts: 5 + } }); console.log("Web scraper queue created"); }