From 6208ecdbc0c357099c057891031e7141e785d5d0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:30:46 -0300 Subject: [PATCH 01/11] added logger --- apps/api/.env.example | 11 ++++ .../__tests__/e2e_full_withAuth/index.test.ts | 1 - apps/api/src/controllers/auth.ts | 7 +-- apps/api/src/controllers/crawl-cancel.ts | 7 +-- apps/api/src/index.ts | 49 ++++++++--------- apps/api/src/lib/logger.ts | 53 +++++++++++++++++++ apps/api/src/main/runWebScraper.ts | 7 ++- apps/api/src/scraper/WebScraper/crawler.ts | 21 +++++--- .../WebScraper/custom/handleCustomScraping.ts | 8 +-- apps/api/src/scraper/WebScraper/index.ts | 9 ++-- .../src/scraper/WebScraper/scrapers/fetch.ts | 9 ++-- .../scraper/WebScraper/scrapers/fireEngine.ts | 17 +++--- .../scraper/WebScraper/scrapers/playwright.ts | 13 ++--- .../WebScraper/scrapers/scrapingBee.ts | 7 +-- apps/api/src/scraper/WebScraper/single_url.ts | 24 +++++---- apps/api/src/scraper/WebScraper/sitemap.ts | 5 +- .../utils/__tests__/socialBlockList.test.ts | 3 +- .../scraper/WebScraper/utils/pdfProcessor.ts | 6 ++- .../src/services/billing/credit_billing.ts | 5 +- apps/api/src/services/logging/log_job.ts | 5 +- apps/api/src/services/logging/scrape_log.ts | 5 +- apps/api/src/services/logtail.ts | 7 +-- apps/api/src/services/queue-service.ts | 3 +- apps/api/src/services/queue-worker.ts | 15 +++--- apps/api/src/services/redis.ts | 13 ++--- 25 files changed, 201 insertions(+), 109 deletions(-) create mode 100644 apps/api/src/lib/logger.ts diff --git a/apps/api/.env.example b/apps/api/.env.example index 5d8e746d..08ff7d7f 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -57,3 +57,14 @@ SELF_HOSTED_WEBHOOK_URL= # Resend API Key for transactional emails RESEND_API_KEY= + +# LOGGING_LEVEL determines the verbosity of logs that the system will output. +# Available levels are: +# NONE - No logs will be output. +# ERROR - For logging error messages that indicate a failure in a specific operation. +# WARN - For logging potentially harmful situations that are not necessarily errors. +# INFO - For logging informational messages that highlight the progress of the application. +# DEBUG - For logging detailed information on the flow through the system, primarily used for debugging. +# TRACE - For logging more detailed information than the DEBUG level. +# Set LOGGING_LEVEL to one of the above options to control logging output. 
+LOGGING_LEVEL=INFO diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index 3e324d39..019bc968 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -858,7 +858,6 @@ describe("E2E Tests for API Routes", () => { await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again } } - console.log(crawlData) expect(crawlData.length).toBeGreaterThan(0); expect(crawlData).toEqual(expect.arrayContaining([ expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }), diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 56a5ec61..aab3d188 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -6,6 +6,7 @@ import { withAuth } from "../../src/lib/withAuth"; import { RateLimiterRedis } from "rate-limiter-flexible"; import { setTraceAttributes } from '@hyperdx/node-opentelemetry'; import { sendNotification } from "../services/notification/email_notification"; +import { Logger } from "../lib/logger"; export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise { return withAuth(supaAuthenticateUser)(req, res, mode); @@ -17,7 +18,7 @@ function setTrace(team_id: string, api_key: string) { api_key }); } catch (error) { - console.error('Error setting trace attributes:', error); + Logger.error(`Error setting trace attributes: ${error.message}`); } } @@ -82,7 +83,7 @@ export async function supaAuthenticateUser( // $$ language plpgsql; if (error) { - console.error('Error fetching key and price_id:', error); + Logger.warn(`Error fetching key and price_id: ${error.message}`); } else { // console.log('Key and Price ID:', data); } @@ -135,7 +136,7 @@ export async function supaAuthenticateUser( try { await rateLimiter.consume(team_endpoint_token); } catch (rateLimiterRes) { - console.error(rateLimiterRes); + Logger.error(`Rate limit exceeded: ${rateLimiterRes}`); const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1; const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext); diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/crawl-cancel.ts index ff4b2c58..d0c109ec 100644 --- a/apps/api/src/controllers/crawl-cancel.ts +++ b/apps/api/src/controllers/crawl-cancel.ts @@ -5,6 +5,7 @@ import { addWebScraperJob } from "../../src/services/queue-jobs"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabase_service } from "../../src/services/supabase"; import { billTeam } from "../../src/services/billing/credit_billing"; +import { Logger } from "../../src/lib/logger"; export async function crawlCancelController(req: Request, res: Response) { try { @@ -43,7 +44,7 @@ export async function crawlCancelController(req: Request, res: Response) { const { partialDocs } = await job.progress(); if (partialDocs && partialDocs.length > 0 && jobState === "active") { - console.log("Billing team for partial docs..."); + Logger.info("Billing team for partial docs..."); // Note: the credits that we will bill them here might be lower than the actual // due to promises that are not yet resolved await billTeam(team_id, partialDocs.length); @@ -55,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) { await job.discard(); await job.moveToFailed(Error("Job cancelled by user"), true); } catch (error) { - 
console.error(error); + Logger.error(error); } const newJobState = await job.getState(); @@ -64,7 +65,7 @@ export async function crawlCancelController(req: Request, res: Response) { status: "cancelled" }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 88ec4418..700cc98e 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -12,16 +12,17 @@ import { sendSlackWebhook } from "./services/alerts/slack"; import { checkAlerts } from "./services/alerts"; import Redis from "ioredis"; import { redisRateLimitClient } from "./services/rate-limiter"; +import { Logger } from "./lib/logger"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; -console.log(`Number of CPUs: ${numCPUs} available`); +Logger.info(`Number of CPUs: ${numCPUs} available`); if (cluster.isMaster) { - console.log(`Master ${process.pid} is running`); + Logger.info(`Master ${process.pid} is running`); // Fork workers. for (let i = 0; i < numCPUs; i++) { @@ -30,8 +31,8 @@ if (cluster.isMaster) { cluster.on("exit", (worker, code, signal) => { if (code !== null) { - console.log(`Worker ${worker.process.pid} exited`); - console.log("Starting a new worker"); + Logger.info(`Worker ${worker.process.pid} exited`); + Logger.info("Starting a new worker"); cluster.fork(); } }); @@ -81,15 +82,10 @@ if (cluster.isMaster) { function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { - console.log(`Worker ${process.pid} listening on port ${port}`); - console.log( - `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` - ); - console.log(""); - console.log("1. Make sure Redis is running on port 6379 by default"); - console.log( - "2. 
If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " - ); + Logger.info(`Worker ${process.pid} listening on port ${port}`); + // Logger.info( + // `For the UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + // ); }); return server; } @@ -114,7 +110,7 @@ if (cluster.isMaster) { noActiveJobs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } }); @@ -132,7 +128,7 @@ if (cluster.isMaster) { waitingJobs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } }); @@ -177,13 +173,13 @@ if (cluster.isMaster) { }); if (!response.ok) { - console.error("Failed to send Slack notification"); + Logger.error("Failed to send Slack notification"); } } }, timeout); } } catch (error) { - console.error(error); + Logger.debug(error); } }; @@ -198,7 +194,7 @@ if (cluster.isMaster) { await checkAlerts(); return res.status(200).send("Alerts initialized"); } catch (error) { - console.error("Failed to initialize alerts:", error); + Logger.debug(`Failed to initialize alerts: ${error}`); return res.status(500).send("Failed to initialize alerts"); } } @@ -207,6 +203,7 @@ if (cluster.isMaster) { app.get( `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => { + Logger.info("🐂 Cleaning jobs older than 24h"); try { const webScraperQueue = getWebScraperQueue(); const batchSize = 10; @@ -241,12 +238,12 @@ if (cluster.isMaster) { await job.remove(); count++; } catch (jobError) { - console.error(`Failed to remove job with ID ${job.id}:`, jobError); + Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}` ); } } return res.status(200).send(`Removed ${count} completed jobs.`); } catch (error) { - console.error("Failed to clean last 24h complete jobs:", error); + Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`); return res.status(500).send("Failed to clean jobs"); } } @@ -272,7 +269,7 @@ if (cluster.isMaster) { queueRedisHealth = await queueRedis.get(testKey); await queueRedis.del(testKey); } catch (error) { - console.error("queueRedis health check failed:", error); + Logger.error(`queueRedis health check failed: ${error}`); queueRedisHealth = null; } @@ -283,7 +280,7 @@ if (cluster.isMaster) { redisRateLimitHealth = await redisRateLimitClient.get(testKey); await redisRateLimitClient.del(testKey); } catch (error) { - console.error("redisRateLimitClient health check failed:", error); + Logger.error(`redisRateLimitClient health check failed: ${error}`); redisRateLimitHealth = null; } @@ -297,12 +294,12 @@ if (cluster.isMaster) { healthStatus.queueRedis === "healthy" && healthStatus.redisRateLimitClient === "healthy" ) { - console.log("Both Redis instances are healthy"); + Logger.info("Both Redis instances are healthy"); return res .status(200) .json({ status: "healthy", details: healthStatus }); } else { - console.log("Redis instances health check:", healthStatus); + Logger.info(`Redis instances health check: ${JSON.stringify(healthStatus)}`); await sendSlackWebhook( `[REDIS DOWN] Redis instances health check: ${JSON.stringify( healthStatus @@ -314,7 +311,7 @@ if (cluster.isMaster) { .json({ status: "unhealthy", details: healthStatus }); } } catch (error) { - console.error("Redis health check failed:", error); + Logger.error(`Redis health check failed: ${error}`); await sendSlackWebhook( `[REDIS DOWN] Redis instances health check: ${error.message}`, true 
@@ -326,5 +323,5 @@ if (cluster.isMaster) { } ); - console.log(`Worker ${process.pid} started`); + Logger.info(`Worker ${process.pid} started`); } diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts new file mode 100644 index 00000000..872dbf51 --- /dev/null +++ b/apps/api/src/lib/logger.ts @@ -0,0 +1,53 @@ +enum LogLevel { + NONE = 'NONE', // No logs will be output. + ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation. + WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors. + INFO = 'INFO', // For logging informational messages that highlight the progress of the application. + DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging. + TRACE = 'TRACE' // For logging more detailed information than the DEBUG level. +} +export class Logger { + static colors = { + ERROR: '\x1b[31m%s\x1b[0m', // Red + WARN: '\x1b[33m%s\x1b[0m', // Yellow + INFO: '\x1b[34m%s\x1b[0m', // Blue + DEBUG: '\x1b[36m%s\x1b[0m', // Cyan + TRACE: '\x1b[35m%s\x1b[0m' // Magenta + }; + + static log (message: string, level: LogLevel) { + const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO; + const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE]; + const currentLevelIndex = levels.indexOf(logLevel); + const messageLevelIndex = levels.indexOf(level); + + if (currentLevelIndex >= messageLevelIndex) { + const color = Logger.colors[level]; + console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`); + + // if (process.env.USE_DB_AUTH) { + // save to supabase? another place? + // supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean }); + // } + } + } + static error(message: string | any) { + Logger.log(message, LogLevel.ERROR); + } + + static warn(message: string) { + Logger.log(message, LogLevel.WARN); + } + + static info(message: string) { + Logger.log(message, LogLevel.INFO); + } + + static debug(message: string) { + Logger.log(message, LogLevel.DEBUG); + } + + static trace(message: string) { + Logger.log(message, LogLevel.TRACE); + } +} diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 3c98e11e..0a7da035 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -10,6 +10,7 @@ import { DocumentUrl, Progress } from "../lib/entities"; import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../lib/entities"; import { supabase_service } from "../services/supabase"; +import { Logger } from "../lib/logger"; export async function startWebScraperPipeline({ job, @@ -23,6 +24,7 @@ export async function startWebScraperPipeline({ crawlerOptions: job.data.crawlerOptions, pageOptions: job.data.pageOptions, inProgress: (progress) => { + Logger.debug(`🐂 Job in progress ${job.id}`); if (progress.currentDocument) { partialDocs.push(progress.currentDocument); if (partialDocs.length > 50) { @@ -32,9 +34,11 @@ export async function startWebScraperPipeline({ } }, onSuccess: (result) => { + Logger.debug(`🐂 Job completed ${job.id}`); saveJob(job, result); }, onError: (error) => { + Logger.error(`🐂 Job failed ${job.id}`); job.moveToFailed(error); }, team_id: job.data.team_id, @@ -108,7 +112,6 @@ export async function runWebScraper({ // this return doesn't matter too much for the job 
completion result return { success: true, message: "", docs: filteredDocs }; } catch (error) { - console.error("Error running web scraper", error); onError(error); return { success: false, message: error.message, docs: [] }; } @@ -136,6 +139,6 @@ const saveJob = async (job: Job, result: any) => { } } } catch (error) { - console.error("Failed to update job status:", error); + Logger.error(`🐂 Failed to update job status: ${error}`); } }; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 59b53642..3d7c267a 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -8,6 +8,7 @@ import { scrapSingleUrl } from "./single_url"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; import { axiosTimeout } from "../../../src/lib/timeout"; +import { Logger } from "../../../src/lib/logger"; export class WebCrawler { private initialUrl: string; @@ -116,7 +117,7 @@ export class WebCrawler { const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? true; // Check if the link is disallowed by robots.txt if (!isAllowed) { - console.log(`Link disallowed by robots.txt: ${link}`); + Logger.debug(`Link disallowed by robots.txt: ${link}`); return false; } @@ -133,15 +134,19 @@ export class WebCrawler { limit: number = 10000, maxDepth: number = 10 ): Promise<{ url: string, html: string }[]> { + + Logger.debug(`Crawler starting with ${this.initialUrl}`); // Fetch and parse robots.txt try { const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout }); this.robots = robotsParser(this.robotsTxtUrl, response.data); + Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`); } catch (error) { - console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); + Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); } - if(!crawlerOptions?.ignoreSitemap){ + if (!crawlerOptions?.ignoreSitemap){ + Logger.debug(`Fetching sitemap links from ${this.initialUrl}`); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); @@ -175,6 +180,7 @@ export class WebCrawler { inProgress?: (progress: Progress) => void, ): Promise<{ url: string, html: string }[]> { const queue = async.queue(async (task: string, callback) => { + Logger.debug(`Crawling ${task}`); if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) { if (callback && typeof callback === "function") { callback(); @@ -216,16 +222,18 @@ export class WebCrawler { } }, concurrencyLimit); + Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`); queue.push( urls.filter( (url) => !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent") ), (err) => { - if (err) console.error(err); + if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`); } ); await queue.drain(); + Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } @@ -282,7 +290,6 @@ export class WebCrawler { const urlObj = new URL(fullUrl); const path = urlObj.pathname; - if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS if (this.isInternalLink(fullUrl) && this.noSections(fullUrl) && @@ -452,7 +459,7 @@ export class WebCrawler { sitemapLinks = await getLinksFromSitemap({ sitemapUrl }); } } catch (error) { - console.error(`Failed to fetch sitemap with axios 
from ${sitemapUrl}: ${error}`); + Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }); if (response) { sitemapLinks = response; @@ -467,7 +474,7 @@ export class WebCrawler { sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap }); } } catch (error) { - console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); + Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); } } diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index f8b2503e..e2f8d8cc 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -1,10 +1,12 @@ +import { Logger } from "../../../lib/logger"; + export async function handleCustomScraping( text: string, url: string ): Promise<{ scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> { // Check for Readme Docs special case if (text.includes(' 0 ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch( (error) => { - console.error("Failed to fetch sitemap data:", error); + Logger.debug(`Failed to fetch sitemap data: ${error}`); return null; } ) @@ -460,7 +461,7 @@ export class WebScraperDataProvider { let documents: Document[] = []; for (const url of urls) { const normalizedUrl = this.normalizeUrl(url); - console.log( + Logger.debug( "Getting cached document for web-scraper-cache:" + normalizedUrl ); const cachedDocumentString = await getValue( diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts index 4c31438c..c9ddf93a 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts @@ -2,6 +2,7 @@ import axios from "axios"; import { logScrape } from "../../../services/logging/scrape_log"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with Axios @@ -34,9 +35,7 @@ export async function scrapWithFetch( }); if (response.status !== 200) { - console.error( - `[Axios] Error fetching url: ${url} with status: ${response.status}` - ); + Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`); logParams.error_message = response.statusText; logParams.response_code = response.status; return { @@ -63,10 +62,10 @@ export async function scrapWithFetch( } catch (error) { if (error.code === "ECONNABORTED") { logParams.error_message = "Request timed out"; - console.log(`[Axios] Request timed out for ${url}`); + Logger.debug(`⛏️ Axios: Request timed out for ${url}`); } else { logParams.error_message = error.message || error; - console.error(`[Axios] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`); } return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index e547c019..0f4c2320 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts 
@@ -4,6 +4,7 @@ import { logScrape } from "../../../services/logging/scrape_log"; import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with Fire-Engine @@ -59,12 +60,10 @@ export async function scrapWithFireEngine({ let engine = engineParam; // do we want fireEngineOptions as first choice? - console.log( - `[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` + Logger.info( + `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }` ); - // console.log(fireEngineOptionsParam) - const response = await axios.post( process.env.FIRE_ENGINE_BETA_URL + endpoint, { @@ -84,15 +83,15 @@ export async function scrapWithFireEngine({ ); if (response.status !== 200) { - console.error( - `[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}` + Logger.debug( + `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}` ); logParams.error_message = response.data?.pageError; logParams.response_code = response.data?.pageStatusCode; if(response.data && response.data?.pageStatusCode !== 200) { - console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`); + Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`); } return { @@ -130,10 +129,10 @@ export async function scrapWithFireEngine({ } } catch (error) { if (error.code === "ECONNABORTED") { - console.log(`[Fire-Engine] Request timed out for ${url}`); + Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`); logParams.error_message = "Request timed out"; } else { - console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`); logParams.error_message = error.message || error; } return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts index 11c3c5ad..4b3180a3 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts @@ -3,6 +3,7 @@ import { logScrape } from "../../../services/logging/scrape_log"; import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with Playwright @@ -51,8 +52,8 @@ export async function scrapWithPlaywright( ); if (response.status !== 200) { - console.error( - `[Playwright] Error fetching url: ${url} with status: ${response.status}` + Logger.debug( + `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}` ); logParams.error_message = response.data?.pageError; logParams.response_code = response.data?.pageStatusCode; @@ -86,8 +87,8 @@ export async function scrapWithPlaywright( }; } catch (jsonError) { logParams.error_message = jsonError.message || jsonError; - console.error( - `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` + Logger.debug( + `⛏️ 
Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}` ); return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } @@ -95,10 +96,10 @@ export async function scrapWithPlaywright( } catch (error) { if (error.code === "ECONNABORTED") { logParams.error_message = "Request timed out"; - console.log(`[Playwright] Request timed out for ${url}`); + Logger.debug(`⛏️ Playwright: Request timed out for ${url}`); } else { logParams.error_message = error.message || error; - console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`); } return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { diff --git a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts index 9a1f0b35..554bfe22 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts @@ -3,6 +3,7 @@ import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; import { ScrapingBeeClient } from "scrapingbee"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with ScrapingBee @@ -56,8 +57,8 @@ export async function scrapWithScrapingBee( text = decoder.decode(response.data); logParams.success = true; } catch (decodeError) { - console.error( - `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` + Logger.debug( + `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}` ); logParams.error_message = decodeError.message || decodeError; } @@ -72,7 +73,7 @@ export async function scrapWithScrapingBee( }; } } catch (error) { - console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`); logParams.error_message = error.message || error; logParams.response_code = error.response?.status; return { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 8fbd31e4..1e7f1fac 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -17,6 +17,7 @@ import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { scrapWithPlaywright } from "./scrapers/playwright"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; import { extractLinks } from "./utils/utils"; +import { Logger } from "../../lib/logger"; dotenv.config(); @@ -48,7 +49,7 @@ export async function generateRequestParams( return defaultParams; } } catch (error) { - console.error(`Error generating URL key: ${error}`); + Logger.error(`Error generating URL key: ${error}`); return defaultParams; } } @@ -154,7 +155,6 @@ export async function scrapSingleUrl( } if (process.env.FIRE_ENGINE_BETA_URL) { - console.log(`Scraping ${url} with Fire Engine`); const response = await scrapWithFireEngine({ url, waitFor: pageOptions.waitFor, @@ -277,7 +277,7 @@ export async function scrapSingleUrl( try { urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); } catch (error) { - console.error(`Invalid URL key, trying: ${urlToScrap}`); + Logger.error(`Invalid URL key, trying: ${urlToScrap}`); } const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? 
""; const scrapersInOrder = getScrapingFallbackOrder( @@ -311,12 +311,18 @@ export async function scrapSingleUrl( pageError = undefined; } - if (text && text.trim().length >= 100) break; - if (pageStatusCode && pageStatusCode == 404) break; - const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; - if (nextScraperIndex < scrapersInOrder.length) { - console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`); + if (text && text.trim().length >= 100) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`); + break; } + if (pageStatusCode && pageStatusCode == 404) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`); + break; + } + // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; + // if (nextScraperIndex < scrapersInOrder.length) { + // Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`); + // } } if (!text) { @@ -372,7 +378,7 @@ export async function scrapSingleUrl( return document; } catch (error) { - console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); + Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); return { content: "", markdown: "", diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 1dfbf3a1..b41f3eeb 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -3,6 +3,7 @@ import { axiosTimeout } from "../../lib/timeout"; import { parseStringPromise } from "xml2js"; import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { WebCrawler } from "./crawler"; +import { Logger } from "../../lib/logger"; export async function getLinksFromSitemap( { @@ -26,7 +27,7 @@ export async function getLinksFromSitemap( content = response.html; } } catch (error) { - console.error(`Request failed for ${sitemapUrl}: ${error}`); + Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`); return allUrls; } @@ -48,7 +49,7 @@ export async function getLinksFromSitemap( } } } catch (error) { - console.error(`Error processing ${sitemapUrl}: ${error}`); + Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`); } return allUrls; diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts index 4449285d..c09cc5b3 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts @@ -1,3 +1,4 @@ +import { Logger } from '../../../../lib/logger'; import { isUrlBlocked } from '../blocklist'; describe('isUrlBlocked', () => { @@ -19,7 +20,7 @@ describe('isUrlBlocked', () => { blockedUrls.forEach(url => { if (!isUrlBlocked(url)) { - console.log(`URL not blocked: ${url}`); + Logger.debug(`URL not blocked: ${url}`); } expect(isUrlBlocked(url)).toBe(true); }); diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 3e01571e..1a9fa11b 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -7,6 +7,7 @@ import pdf from "pdf-parse"; import path from "path"; import os from "os"; import { axiosTimeout } from "../../../lib/timeout"; +import { 
Logger } from "../../../lib/logger"; dotenv.config(); @@ -39,6 +40,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro let content = ""; if (process.env.LLAMAPARSE_API_KEY && parsePDF) { + Logger.debug("Processing pdf document w/ LlamaIndex"); const apiKey = process.env.LLAMAPARSE_API_KEY; const headers = { Authorization: `Bearer ${apiKey}`, @@ -81,7 +83,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds } } catch (error) { - console.error("Error fetching result w/ LlamaIndex"); + Logger.debug("Error fetching result w/ LlamaIndex"); attempt++; await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying // You may want to handle specific errors differently @@ -93,7 +95,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro } content = resultResponse.data[resultType]; } catch (error) { - console.error("Error processing pdf document w/ LlamaIndex(2)"); + Logger.debug("Error processing pdf document w/ LlamaIndex(2)"); content = await processPdf(filePath); } } else if (parsePDF) { diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 82668111..6d0f9a69 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -2,6 +2,7 @@ import { NotificationType } from "../../types"; import { withAuth } from "../../lib/withAuth"; import { sendNotification } from "../notification/email_notification"; import { supabase_service } from "../supabase"; +import { Logger } from "../../lib/logger"; const FREE_CREDITS = 500; @@ -12,7 +13,7 @@ export async function supaBillTeam(team_id: string, credits: number) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } - console.log(`Billing team ${team_id} for ${credits} credits`); + Logger.info(`Billing team ${team_id} for ${credits} credits`); // When the API is used, you can log the credit usage in the credit_usage table: // team_id: The ID of the team using the API. // subscription_id: The ID of the team's active subscription. 
@@ -218,7 +219,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { 0 ); - console.log("totalCreditsUsed", totalCreditsUsed); + Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`); const end = new Date(); end.setDate(end.getDate() + 30); diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 9764493f..93d0b311 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -3,6 +3,7 @@ import { supabase_service } from "../supabase"; import { FirecrawlJob } from "../../types"; import { posthog } from "../posthog"; import "dotenv/config"; +import { Logger } from "../../lib/logger"; export async function logJob(job: FirecrawlJob) { try { @@ -68,9 +69,9 @@ export async function logJob(job: FirecrawlJob) { posthog.capture(phLog); } if (error) { - console.error("Error logging job:\n", error); + Logger.error(`Error logging job: ${error.message}`); } } catch (error) { - console.error("Error logging job:\n", error); + Logger.error(`Error logging job: ${error.message}`); } } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 9f34222f..b99d99b4 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -2,6 +2,7 @@ import "dotenv/config"; import { ScrapeLog } from "../../types"; import { supabase_service } from "../supabase"; import { PageOptions } from "../../lib/entities"; +import { Logger } from "../../lib/logger"; export async function logScrape( scrapeLog: ScrapeLog, @@ -39,9 +40,9 @@ export async function logScrape( ]); if (error) { - console.error("Error logging proxy:\n", error); + Logger.error(`Error logging proxy:\n${error}`); } } catch (error) { - console.error("Error logging proxy:\n", error); + Logger.error(`Error logging proxy:\n${error}`); } } diff --git a/apps/api/src/services/logtail.ts b/apps/api/src/services/logtail.ts index 8b86a6b1..d9af3c7a 100644 --- a/apps/api/src/services/logtail.ts +++ b/apps/api/src/services/logtail.ts @@ -1,19 +1,20 @@ import { Logtail } from "@logtail/node"; import "dotenv/config"; +import { Logger } from "../lib/logger"; // A mock Logtail class to handle cases where LOGTAIL_KEY is not provided class MockLogtail { info(message: string, context?: Record): void { - console.log(message, context); + Logger.debug(`${message} - ${context}`); } error(message: string, context: Record = {}): void { - console.error(message, context); + Logger.error(`${message} - ${context}`); } } // Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class // Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => { - console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more."); + Logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. 
see logtail.ts for more."); return new MockLogtail(); })(); diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index 6c817a4a..d531c2db 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -1,5 +1,6 @@ import Queue from "bull"; import { Queue as BullQueue } from "bull"; +import { Logger } from "../lib/logger"; let webScraperQueue: BullQueue; @@ -16,7 +17,7 @@ export function getWebScraperQueue() { attempts: 5 } }); - console.log("Web scraper queue created"); + Logger.info("Web scraper queue created"); } return webScraperQueue; } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index be2a4c70..532c5bc1 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -7,8 +7,9 @@ import { callWebhook } from "./webhook"; import { logJob } from "./logging/log_job"; import { initSDK } from '@hyperdx/node-opentelemetry'; import { Job } from "bull"; +import { Logger } from "../lib/logger"; -if(process.env.ENV === 'production') { +if (process.env.ENV === 'production') { initSDK({ consoleCapture: true, additionalInstrumentations: [], @@ -18,7 +19,7 @@ if(process.env.ENV === 'production') { const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { - console.log("taking job", job.id); + Logger.debug(`🐂 Worker taking job ${job.id}`); try { job.progress({ current: 1, @@ -58,18 +59,18 @@ async function processJob(job: Job, done) { pageOptions: job.data.pageOptions, origin: job.data.origin, }); - console.log("job done", job.id); + Logger.debug(`🐂 Job done ${job.id}`); done(null, data); } catch (error) { - console.log("job errored", job.id, error); + Logger.error(`🐂 Job errored ${job.id} - ${error}`); if (await getWebScraperQueue().isPaused(false)) { - console.log("queue is paused, ignoring"); + Logger.debug("🐂Queue is paused, ignoring"); return; } if (error instanceof CustomError) { // Here we handle the error, then save the failed job - console.error(error.message); // or any other error handling + Logger.error(error.message); // or any other error handling logtail.error("Custom error while ingesting", { job_id: job.id, @@ -77,7 +78,7 @@ async function processJob(job: Job, done) { dataIngestionJob: error.dataIngestionJob, }); } - console.log(error); + Logger.error(error); logtail.error("Overall error ingesting", { job_id: job.id, diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index b720a330..173d2b78 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -1,14 +1,15 @@ import Redis from "ioredis"; import { redisRateLimitClient } from "./rate-limiter"; +import { Logger } from "../lib/logger"; // Listen to 'error' events to the Redis connection redisRateLimitClient.on("error", (error) => { try { if (error.message === "ECONNRESET") { - console.log("Connection to Redis Session Store timed out."); + Logger.error("Connection to Redis Session Rate Limit Store timed out."); } else if (error.message === "ECONNREFUSED") { - console.log("Connection to Redis Session Store refused!"); - } else console.log(error); + Logger.error("Connection to Redis Session Rate Limit Store refused!"); + } else Logger.error(error); } catch (error) {} }); @@ -16,15 +17,15 @@ redisRateLimitClient.on("error", (error) => { redisRateLimitClient.on("reconnecting", (err) => { try { if (redisRateLimitClient.status === "reconnecting") - console.log("Reconnecting to Redis Session Store..."); - else 
console.log("Error reconnecting to Redis Session Store."); + Logger.info("Reconnecting to Redis Session Rate Limit Store..."); + else Logger.error("Error reconnecting to Redis Session Rate Limit Store."); } catch (error) {} }); // Listen to the 'connect' event to Redis redisRateLimitClient.on("connect", (err) => { try { - if (!err) console.log("Connected to Redis Session Store!"); + if (!err) Logger.info("Connected to Redis Session Rate Limit Store!"); } catch (error) {} }); From 7cd9bf92e33b1079cff7e92dcb28bc548021bee5 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 14:31:25 +0200 Subject: [PATCH 02/11] feat: scrape event logging to DB --- apps/api/src/controllers/crawl.ts | 4 +- apps/api/src/controllers/scrape.ts | 2 + apps/api/src/controllers/search.ts | 7 +++ apps/api/src/example.ts | 1 + apps/api/src/lib/entities.ts | 1 + apps/api/src/lib/scrape-events.ts | 58 +++++++++++++++++++ apps/api/src/main/runWebScraper.ts | 2 + .../WebScraper/__tests__/crawler.test.ts | 6 ++ .../WebScraper/__tests__/single_url.test.ts | 6 +- apps/api/src/scraper/WebScraper/crawler.ts | 6 +- apps/api/src/scraper/WebScraper/index.ts | 4 ++ apps/api/src/scraper/WebScraper/single_url.ts | 28 ++++++++- 12 files changed, 118 insertions(+), 7 deletions(-) create mode 100644 apps/api/src/lib/scrape-events.ts diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 89358fcc..3388f8a7 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -10,6 +10,7 @@ import { logCrawl } from "../../src/services/logging/crawl_log"; import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; import { createIdempotencyKey } from "../../src/services/idempotency/create"; import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values"; +import { v4 as uuidv4 } from "uuid"; export async function crawlController(req: Request, res: Response) { try { @@ -60,10 +61,11 @@ export async function crawlController(req: Request, res: Response) { const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; - if (mode === "single_urls" && !url.includes(",")) { + if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? 
try { const a = new WebScraperDataProvider(); await a.setOptions({ + jobId: uuidv4(), mode: "single_urls", urls: [url], crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index fb57e41d..bfbb1e2a 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -9,6 +9,7 @@ import { Document } from "../lib/entities"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; +import { v4 as uuidv4 } from "uuid"; export async function scrapeHelper( req: Request, @@ -35,6 +36,7 @@ export async function scrapeHelper( const a = new WebScraperDataProvider(); await a.setOptions({ + jobId: uuidv4(), mode: "single_urls", urls: [url], crawlerOptions: { diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 8cb6d55b..e6ab8e9d 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -7,8 +7,10 @@ import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; import { search } from "../search"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { v4 as uuidv4 } from "uuid"; export async function searchHelper( + jobId: string, req: Request, team_id: string, crawlerOptions: any, @@ -75,6 +77,7 @@ export async function searchHelper( const a = new WebScraperDataProvider(); await a.setOptions({ + jobId, mode: "single_urls", urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7), crawlerOptions: { @@ -148,6 +151,8 @@ export async function searchController(req: Request, res: Response) { const searchOptions = req.body.searchOptions ?? { limit: 7 }; + const jobId = uuidv4(); + try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); @@ -160,6 +165,7 @@ export async function searchController(req: Request, res: Response) { } const startTime = new Date().getTime(); const result = await searchHelper( + jobId, req, team_id, crawlerOptions, @@ -169,6 +175,7 @@ export async function searchController(req: Request, res: Response) { const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; logJob({ + job_id: jobId, success: result.success, message: result.error, num_docs: result.data ? 
result.data.length : 0, diff --git a/apps/api/src/example.ts b/apps/api/src/example.ts index 0c30ea33..edf0faef 100644 --- a/apps/api/src/example.ts +++ b/apps/api/src/example.ts @@ -4,6 +4,7 @@ async function example() { const example = new WebScraperDataProvider(); await example.setOptions({ + jobId: "TEST", mode: "crawl", urls: ["https://mendable.ai"], crawlerOptions: {}, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index f60e197f..56cd793a 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -56,6 +56,7 @@ export type CrawlerOptions = { } export type WebScraperOptions = { + jobId: string; urls: string[]; mode: "single_urls" | "sitemap" | "crawl"; crawlerOptions?: CrawlerOptions; diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts new file mode 100644 index 00000000..ea30ffc2 --- /dev/null +++ b/apps/api/src/lib/scrape-events.ts @@ -0,0 +1,58 @@ +import type { baseScrapers } from "../scraper/WebScraper/single_url"; +import { supabase_service as supabase } from "../services/supabase"; + +export type ScrapeErrorEvent = { + type: "error", + message: string, + stack?: string, +} + +export type ScrapeScrapeEvent = { + type: "scrape", + method: (typeof baseScrapers)[number], + result: null | { + success: boolean, + response_code?: number, + error?: string, + // proxy?: string, + time_taken: number, + }, +} + +export type ScrapeQueueEvent = { + type: "queue", + event: "created" | "started" | "interrupted" | "finished", + worker?: string, +} + +export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent; + +export class ScrapeEvents { + static async insert(jobId: string, content: ScrapeEvent) { + if (jobId === "TEST") return null; + + if (process.env.USE_DB_AUTH) { + const result = await supabase.from("scrape_events").insert({ + job_id: jobId, + type: content.type, + content: content, + // created_at + }).single(); + return (result.data as any).id; + } + + return null; + } + + static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) { + if (logId === null) return; + + const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; + await supabase.from("scrape_events").update({ + content: { + ...previousLog.content, + result, + } + }).eq("id", logId); + } +} diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 0a7da035..5c97ef41 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -60,6 +60,7 @@ export async function runWebScraper({ const provider = new WebScraperDataProvider(); if (mode === "crawl") { await provider.setOptions({ + jobId: bull_job_id, mode: mode, urls: [url], crawlerOptions: crawlerOptions, @@ -68,6 +69,7 @@ export async function runWebScraper({ }); } else { await provider.setOptions({ + jobId: bull_job_id, mode: mode, urls: url.split(","), crawlerOptions: crawlerOptions, diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index 32c8b0a0..20419ffa 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -42,6 +42,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -76,6 +77,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, 
includes: [], excludes: [], @@ -104,6 +106,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -133,6 +136,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -161,6 +165,7 @@ describe('WebCrawler', () => { // Setup the crawler with the specific test case options const crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -194,6 +199,7 @@ describe('WebCrawler', () => { const limit = 2; // Set a limit for the number of links crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 8a9df227..4b720835 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => { const pageOptionsWithHtml: PageOptions = { includeHtml: true }; const pageOptionsWithoutHtml: PageOptions = { includeHtml: false }; - const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml); - const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml); + const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml); + const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml); expect(resultWithHtml.html).toBeDefined(); expect(resultWithoutHtml.html).toBeUndefined(); @@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => { const url = 'https://mendable.ai'; const pageOptions: PageOptions = { includeHtml: true }; - const result = await scrapSingleUrl(url, pageOptions); + const result = await scrapSingleUrl("TEST", url, pageOptions); // Check if the result contains a list of links expect(result.linksOnPage).toBeDefined(); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3d7c267a..c592f400 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -11,6 +11,7 @@ import { axiosTimeout } from "../../../src/lib/timeout"; import { Logger } from "../../../src/lib/logger"; export class WebCrawler { + private jobId: string; private initialUrl: string; private baseUrl: string; private includes: string[]; @@ -27,6 +28,7 @@ export class WebCrawler { private allowExternalContentLinks: boolean; constructor({ + jobId, initialUrl, includes, excludes, @@ -37,6 +39,7 @@ export class WebCrawler { allowBackwardCrawling = false, allowExternalContentLinks = false }: { + jobId: string; initialUrl: string; includes?: string[]; excludes?: string[]; @@ -47,6 +50,7 @@ export class WebCrawler { allowBackwardCrawling?: boolean; allowExternalContentLinks?: boolean; }) { + this.jobId = jobId; this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; this.includes = includes ?? []; @@ -261,7 +265,7 @@ export class WebCrawler { // If it is the first link, fetch with single url if (this.visited.size === 1) { - const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); + const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true }); content = page.html ?? 
""; pageStatusCode = page.metadata?.pageStatusCode; pageError = page.metadata?.pageError || undefined; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 6506620e..eff709fa 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -22,6 +22,7 @@ import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils"; import { Logger } from "../../lib/logger"; export class WebScraperDataProvider { + private jobId: string; private bullJobId: string; private urls: string[] = [""]; private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; @@ -66,6 +67,7 @@ export class WebScraperDataProvider { batchUrls.map(async (url, index) => { const existingHTML = allHtmls ? allHtmls[i + index] : ""; const result = await scrapSingleUrl( + this.jobId, url, this.pageOptions, this.extractorOptions, @@ -166,6 +168,7 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { const crawler = new WebCrawler({ + jobId: this.jobId, initialUrl: this.urls[0], includes: this.includes, excludes: this.excludes, @@ -500,6 +503,7 @@ export class WebScraperDataProvider { throw new Error("Urls are required"); } + this.jobId = options.jobId; this.bullJobId = options.bullJobId; this.urls = options.urls; this.mode = options.mode; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 1e7f1fac..68474fed 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -18,10 +18,11 @@ import { scrapWithPlaywright } from "./scrapers/playwright"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; import { extractLinks } from "./utils/utils"; import { Logger } from "../../lib/logger"; +import { ScrapeEvents } from "../../lib/scrape-events"; dotenv.config(); -const baseScrapers = [ +export const baseScrapers = [ "fire-engine", "fire-engine;chrome-cdp", "scrapingBee", @@ -118,6 +119,7 @@ function getScrapingFallbackOrder( export async function scrapSingleUrl( + jobId: string, urlToScrap: string, pageOptions: PageOptions = { onlyMainContent: true, @@ -145,6 +147,13 @@ export async function scrapSingleUrl( } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; + const timer = Date.now(); + const logInsertPromise = ScrapeEvents.insert(jobId, { + type: "scrape", + method, + result: null, + }); + switch (method) { case "fire-engine": case "fire-engine;chrome-cdp": @@ -254,8 +263,18 @@ export async function scrapSingleUrl( } //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); + const text = await parseMarkdown(cleanedHtml); + + const insertedLogId = await logInsertPromise; + ScrapeEvents.updateScrapeResult(insertedLogId, { + success: !!scraperResponse.metadata.pageError && !!text, + error: scraperResponse.metadata.pageError, + response_code: scraperResponse.metadata.pageStatusCode, + time_taken: Date.now() - timer, + }); + return { - text: await parseMarkdown(cleanedHtml), + text, html: cleanedHtml, rawHtml: scraperResponse.text, screenshot: scraperResponse.screenshot, @@ -379,6 +398,11 @@ export async function scrapSingleUrl( return document; } catch (error) { Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); + ScrapeEvents.insert(jobId, { + type: "error", + message: typeof error === "string" ? 
error : typeof error.message === "string" ? error.message : JSON.stringify(error), + stack: error.stack, + }); return { content: "", markdown: "", From 71072fef3b85c22a3f8daf6f667e17bce6efc7d5 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 14:46:41 +0200 Subject: [PATCH 03/11] fix(scrape-events): bad logic --- apps/api/src/lib/scrape-events.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index ea30ffc2..d7958d14 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -13,7 +13,7 @@ export type ScrapeScrapeEvent = { result: null | { success: boolean, response_code?: number, - error?: string, + error?: string | object, // proxy?: string, time_taken: number, }, @@ -31,13 +31,13 @@ export class ScrapeEvents { static async insert(jobId: string, content: ScrapeEvent) { if (jobId === "TEST") return null; - if (process.env.USE_DB_AUTH) { + if (process.env.USE_DB_AUTHENTICATION) { const result = await supabase.from("scrape_events").insert({ job_id: jobId, type: content.type, content: content, // created_at - }).single(); + }).select().single(); return (result.data as any).id; } From d57dbbd0c606f55836200281e981927b1ccfc491 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 15:18:12 +0200 Subject: [PATCH 04/11] fix: add jobId for scrape --- apps/api/src/controllers/scrape.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index bfbb1e2a..c409371d 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -12,6 +12,7 @@ import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOri import { v4 as uuidv4 } from "uuid"; export async function scrapeHelper( + jobId: string, req: Request, team_id: string, crawlerOptions: any, @@ -36,7 +37,7 @@ export async function scrapeHelper( const a = new WebScraperDataProvider(); await a.setOptions({ - jobId: uuidv4(), + jobId, mode: "single_urls", urls: [url], crawlerOptions: { @@ -129,8 +130,11 @@ export async function scrapeController(req: Request, res: Response) { checkCredits(); } + const jobId = uuidv4(); + const startTime = new Date().getTime(); const result = await scrapeHelper( + jobId, req, team_id, crawlerOptions, @@ -171,6 +175,7 @@ export async function scrapeController(req: Request, res: Response) { } logJob({ + job_id: jobId, success: result.success, message: result.error, num_docs: 1, From 64bcedeefc82e753e3939c6f29d473a7238ef8da Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 16:21:59 +0200 Subject: [PATCH 05/11] fix(monitoring): bad success check on scrape --- apps/api/src/scraper/WebScraper/single_url.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 68474fed..61ee89af 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -267,7 +267,7 @@ export async function scrapSingleUrl( const insertedLogId = await logInsertPromise; ScrapeEvents.updateScrapeResult(insertedLogId, { - success: !!scraperResponse.metadata.pageError && !!text, + success: !scraperResponse.metadata.pageError && !!text, error: scraperResponse.metadata.pageError, response_code: scraperResponse.metadata.pageStatusCode, time_taken: Date.now() - timer, From 
4d35ad073c86e74840dc0074cdee1e7a5a05ea90 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 16:43:39 +0200 Subject: [PATCH 06/11] feat(monitoring/scrape): include url, worker, response_size --- apps/api/src/lib/scrape-events.ts | 3 +++ apps/api/src/scraper/WebScraper/single_url.ts | 3 +++ 2 files changed, 6 insertions(+) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index d7958d14..9b050c30 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -9,10 +9,13 @@ export type ScrapeErrorEvent = { export type ScrapeScrapeEvent = { type: "scrape", + url: string, + worker?: string, method: (typeof baseScrapers)[number], result: null | { success: boolean, response_code?: number, + response_size?: number, error?: string | object, // proxy?: string, time_taken: number, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 61ee89af..4b428d35 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -150,6 +150,8 @@ export async function scrapSingleUrl( const timer = Date.now(); const logInsertPromise = ScrapeEvents.insert(jobId, { type: "scrape", + url, + worker: process.env.FLY_MACHINE_ID, method, result: null, }); @@ -267,6 +269,7 @@ export async function scrapSingleUrl( const insertedLogId = await logInsertPromise; ScrapeEvents.updateScrapeResult(insertedLogId, { + response_size: scraperResponse.text.length, success: !scraperResponse.metadata.pageError && !!text, error: scraperResponse.metadata.pageError, response_code: scraperResponse.metadata.pageStatusCode, From 60c74357dfbb133658c2cb73b40f0573c2ed07e8 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 18:44:14 +0200 Subject: [PATCH 07/11] feat(ScrapeEvents): log queue events --- apps/api/src/index.ts | 10 ++++++++++ apps/api/src/lib/scrape-events.ts | 11 ++++++++++- apps/api/src/services/queue-worker.ts | 9 +++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 700cc98e..cbd37eb7 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -13,6 +13,7 @@ import { checkAlerts } from "./services/alerts"; import Redis from "ioredis"; import { redisRateLimitClient } from "./services/rate-limiter"; import { Logger } from "./lib/logger"; +import { ScrapeEvents } from "./lib/scrape-events"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -325,3 +326,12 @@ if (cluster.isMaster) { Logger.info(`Worker ${process.pid} started`); } + +const wsq = getWebScraperQueue(); + +wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); +wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active")); +wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed")); +wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); +wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); +wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 9b050c30..7015c92d 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -1,3 +1,4 @@ +import { Job, JobId } from "bull"; import type { baseScrapers } from "../scraper/WebScraper/single_url"; import { supabase_service as supabase } from "../services/supabase"; @@ -24,7 +25,7 @@ export type ScrapeScrapeEvent = { export type 
ScrapeQueueEvent = { type: "queue", - event: "created" | "started" | "interrupted" | "finished", + event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed", worker?: string, } @@ -58,4 +59,12 @@ export class ScrapeEvents { } }).eq("id", logId); } + + static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) { + await this.insert(((job as any).id ? (job as any).id : job) as string, { + type: "queue", + event, + worker: process.env.FLY_MACHINE_ID, + }); + } } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 532c5bc1..e7767809 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -8,6 +8,7 @@ import { logJob } from "./logging/log_job"; import { initSDK } from '@hyperdx/node-opentelemetry'; import { Job } from "bull"; import { Logger } from "../lib/logger"; +import { ScrapeEvents } from "../lib/scrape-events"; if (process.env.ENV === 'production') { initSDK({ @@ -20,6 +21,7 @@ const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { Logger.debug(`🐂 Worker taking job ${job.id}`); + try { job.progress({ current: 1, @@ -114,3 +116,10 @@ wsq.process( Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)), processJob ); + +wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); +wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active")); +wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed")); +wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); +wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); +wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); From cc98f83fdab20efbc48b1632372a75b866857395 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:25:36 -0300 Subject: [PATCH 08/11] added failed and completed log events --- apps/api/src/lib/scrape-events.ts | 2 +- apps/api/src/main/runWebScraper.ts | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 7015c92d..dda03c38 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -25,7 +25,7 @@ export type ScrapeScrapeEvent = { export type ScrapeQueueEvent = { type: "queue", - event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed", + event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed", worker?: string, } diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 5c97ef41..5e7d2279 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -11,6 +11,7 @@ import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../lib/entities"; import { supabase_service } from "../services/supabase"; import { Logger } from "../lib/logger"; +import { ScrapeEvents } from "../lib/scrape-events"; export async function startWebScraperPipeline({ job, @@ -39,6 +40,7 @@ export async function startWebScraperPipeline({ }, onError: (error) => { Logger.error(`🐂 Job failed ${job.id}`); + ScrapeEvents.logJobEvent(job, "failed"); job.moveToFailed(error); }, team_id: job.data.team_id, @@ -140,6 +142,7 @@ const saveJob = async (job: Job, result: any) => { // I think the job won't exist here anymore } } + ScrapeEvents.logJobEvent(job, "completed"); } catch (error) { Logger.error(`🐂 Failed to update job status: ${error}`); } From 
309728a4820add6542ea6cc6d2c8633114299c97 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:48:06 -0300 Subject: [PATCH 09/11] updated logs --- apps/api/src/controllers/crawl-status.ts | 3 ++- apps/api/src/controllers/crawl.ts | 7 ++++--- apps/api/src/controllers/crawlPreview.ts | 3 ++- apps/api/src/controllers/scrape.ts | 5 +++-- apps/api/src/controllers/search.ts | 5 +++-- apps/api/src/controllers/status.ts | 3 ++- apps/api/src/index.ts | 6 +++--- apps/api/src/lib/LLM-extraction/index.ts | 3 ++- apps/api/src/lib/withAuth.ts | 5 +++-- .../src/scraper/WebScraper/utils/blocklist.ts | 4 +++- .../WebScraper/utils/imageDescription.ts | 3 ++- .../src/scraper/WebScraper/utils/metadata.ts | 4 +++- .../scraper/WebScraper/utils/replacePaths.ts | 5 +++-- apps/api/src/scraper/WebScraper/utils/utils.ts | 5 +++-- apps/api/src/search/googlesearch.ts | 5 +++-- apps/api/src/search/index.ts | 3 ++- apps/api/src/services/alerts/index.ts | 13 +++++++------ apps/api/src/services/alerts/slack.ts | 5 +++-- apps/api/src/services/billing/credit_billing.ts | 4 ++-- apps/api/src/services/idempotency/create.ts | 3 ++- apps/api/src/services/idempotency/validate.ts | 5 +++-- apps/api/src/services/logging/crawl_log.ts | 3 ++- apps/api/src/services/logging/scrape_log.ts | 4 ++++ .../services/notification/email_notification.ts | 11 ++++++----- apps/api/src/services/posthog.ts | 3 ++- apps/api/src/services/supabase.ts | 16 +++++++++++----- apps/api/src/services/webhook.ts | 11 +++-------- apps/test-suite/utils/supabase.ts | 17 ++++++++++++----- 28 files changed, 100 insertions(+), 64 deletions(-) diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts index a55003cc..5aafa433 100644 --- a/apps/api/src/controllers/crawl-status.ts +++ b/apps/api/src/controllers/crawl-status.ts @@ -4,6 +4,7 @@ import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; +import { Logger } from "../../src/lib/logger"; export async function crawlStatusController(req: Request, res: Response) { try { @@ -44,7 +45,7 @@ export async function crawlStatusController(req: Request, res: Response) { partial_data: jobStatus == 'completed' ? 
[] : partialDocs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 89358fcc..614a5928 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -10,6 +10,7 @@ import { logCrawl } from "../../src/services/logging/crawl_log"; import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; import { createIdempotencyKey } from "../../src/services/idempotency/create"; import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values"; +import { Logger } from "../../src/lib/logger"; export async function crawlController(req: Request, res: Response) { try { @@ -30,7 +31,7 @@ export async function crawlController(req: Request, res: Response) { try { createIdempotencyKey(req); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } @@ -83,7 +84,7 @@ export async function crawlController(req: Request, res: Response) { documents: docs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } @@ -101,7 +102,7 @@ export async function crawlController(req: Request, res: Response) { res.json({ jobId: job.id }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 2c3dc4ea..7c5c804d 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -3,6 +3,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; +import { Logger } from "../../src/lib/logger"; export async function crawlPreviewController(req: Request, res: Response) { try { @@ -39,7 +40,7 @@ export async function crawlPreviewController(req: Request, res: Response) { res.json({ jobId: job.id }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index fb57e41d..615f0c0e 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -9,6 +9,7 @@ import { Document } from "../lib/entities"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; +import { Logger } from '../lib/logger'; export async function scrapeHelper( req: Request, @@ -112,7 +113,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(402).json({ error: "Insufficient credits" }); } } catch (error) { - console.error(error); + Logger.error(error); earlyReturn = true; return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." 
}); } @@ -188,7 +189,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(result.returnCode).json(result); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 8cb6d55b..adacb766 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -7,6 +7,7 @@ import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; import { search } from "../search"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { Logger } from "../lib/logger"; export async function searchHelper( req: Request, @@ -155,7 +156,7 @@ export async function searchController(req: Request, res: Response) { return res.status(402).json({ error: "Insufficient credits" }); } } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: "Internal server error" }); } const startTime = new Date().getTime(); @@ -183,7 +184,7 @@ export async function searchController(req: Request, res: Response) { }); return res.status(result.returnCode).json(result); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts index 231885f4..3d7fccbb 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/status.ts @@ -1,6 +1,7 @@ import { Request, Response } from "express"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; +import { Logger } from "../../src/lib/logger"; export async function crawlJobStatusPreviewController(req: Request, res: Response) { try { @@ -35,7 +36,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons partial_data: jobStatus == 'completed' ? 
[] : partialDocs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 700cc98e..98243756 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -83,9 +83,9 @@ if (cluster.isMaster) { function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { Logger.info(`Worker ${process.pid} listening on port ${port}`); - // Logger.info( - // `For the UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` - // ); + Logger.info( + `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + ); }); return server; } diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index 2156fb3c..85a7e995 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -4,6 +4,7 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation import { generateOpenAICompletions } from "./models"; import { Document, ExtractorOptions } from "../entities"; +import { Logger } from "../logger"; // Generate completion using OpenAI export async function generateCompletions( @@ -44,7 +45,7 @@ export async function generateCompletions( return completionResult; } catch (error) { - console.error(`Error generating completions: ${error}`); + Logger.error(`Error generating completions: ${error}`); throw new Error(`Error generating completions: ${error.message}`); } default: diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index ea5aa4d8..353c144b 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -1,4 +1,5 @@ import { AuthResponse } from "../../src/types"; +import { Logger } from "./logger"; let warningCount = 0; @@ -8,7 +9,7 @@ export function withAuth( return async function (...args: U): Promise { if (process.env.USE_DB_AUTHENTICATION === "false") { if (warningCount < 5) { - console.warn("WARNING - You're bypassing authentication"); + Logger.warn("You're bypassing authentication"); warningCount++; } return { success: true } as T; @@ -16,7 +17,7 @@ export function withAuth( try { return await originalFunction(...args); } catch (error) { - console.error("Error in withAuth function: ", error); + Logger.error(`Error in withAuth function: ${error}`); return { success: false, error: error.message } as T; } } diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 633fd5d0..0bdf9876 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -1,3 +1,5 @@ +import { Logger } from "../../../lib/logger"; + const socialMediaBlocklist = [ 'facebook.com', 'x.com', @@ -59,7 +61,7 @@ export function isUrlBlocked(url: string): boolean { return isBlocked; } catch (e) { // If an error occurs (e.g., invalid URL), return false - console.error(`Error parsing the following URL: ${url}`); + Logger.error(`Error parsing the following URL: ${url}`); return false; } } diff --git a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts index fb8f65d9..318c2340 100644 --- a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts +++ b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts @@ -1,5 +1,6 @@ import Anthropic from '@anthropic-ai/sdk'; import axios from 'axios'; +import { Logger 
} from '../../../lib/logger'; export async function getImageDescription( imageUrl: string, @@ -82,7 +83,7 @@ export async function getImageDescription( } } } catch (error) { - console.error("Error generating image alt text:", error?.message); + Logger.error(`Error generating image alt text: ${error}`); return ""; } } diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index 3f2052c0..9496d569 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -1,4 +1,6 @@ import { CheerioAPI } from "cheerio"; +import { Logger } from "../../../lib/logger"; + interface Metadata { title?: string; description?: string; @@ -105,7 +107,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; } catch (error) { - console.error("Error extracting metadata:", error); + Logger.error(`Error extracting metadata: ${error}`); } return { diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts index 25b43f0a..f84db63f 100644 --- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../../lib/logger"; import { Document } from "../../../lib/entities"; export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => { @@ -39,7 +40,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] return documents; } catch (error) { - console.error("Error replacing paths with absolute paths", error); + Logger.debug(`Error replacing paths with absolute paths: ${error}`); return documents; } }; @@ -78,7 +79,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen return documents; } catch (error) { - console.error("Error replacing img paths with absolute paths", error); + Logger.error(`Error replacing img paths with absolute paths: ${error}`); return documents; } }; \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts index 3aa021a6..dd5906b0 100644 --- a/apps/api/src/scraper/WebScraper/utils/utils.ts +++ b/apps/api/src/scraper/WebScraper/utils/utils.ts @@ -1,5 +1,6 @@ import axios from "axios"; import * as cheerio from "cheerio"; +import { Logger } from "../../../lib/logger"; export async function attemptScrapWithRequests( @@ -9,13 +10,13 @@ export async function attemptScrapWithRequests( const response = await axios.get(urlToScrap, { timeout: 15000 }); if (!response.data) { - console.log("Failed normal requests as well"); + Logger.debug("Failed normal requests as well"); return null; } return response.data; } catch (error) { - console.error(`Error in attemptScrapWithRequests: ${error}`); + Logger.debug(`Error in attemptScrapWithRequests: ${error}`); return null; } } diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index 6bfa1a39..060f4bd8 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -2,6 +2,7 @@ import axios from 'axios'; import * as cheerio from 'cheerio'; import * as querystring from 'querystring'; import { SearchResult } from '../../src/lib/entities'; +import { Logger } from '../../src/lib/logger'; const _useragent_list = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 
Firefox/66.0', @@ -96,7 +97,7 @@ export async function google_search(term: string, advanced = false, num_results await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); } catch (error) { if (error.message === 'Too many requests') { - console.warn('Too many requests, breaking the loop'); + Logger.warn('Too many requests, breaking the loop'); break; } throw error; @@ -107,7 +108,7 @@ export async function google_search(term: string, advanced = false, num_results } } if (attempts >= maxAttempts) { - console.warn('Max attempts reached, breaking the loop'); + Logger.warn('Max attempts reached, breaking the loop'); } return results } diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index 88cbf812..f5bc06e3 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../src/lib/logger"; import { SearchResult } from "../../src/lib/entities"; import { google_search } from "./googlesearch"; import { serper_search } from "./serper"; @@ -47,7 +48,7 @@ export async function search({ timeout ); } catch (error) { - console.error("Error in search function: ", error); + Logger.error(`Error in search function: ${error}`); return [] } // if process.env.SERPER_API_KEY is set, use serper diff --git a/apps/api/src/services/alerts/index.ts b/apps/api/src/services/alerts/index.ts index 1cfb5906..88b3c726 100644 --- a/apps/api/src/services/alerts/index.ts +++ b/apps/api/src/services/alerts/index.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../../src/lib/logger"; import { getWebScraperQueue } from "../queue-service"; import { sendSlackWebhook } from "./slack"; @@ -9,13 +10,13 @@ export async function checkAlerts() { process.env.ALERT_NUM_ACTIVE_JOBS && process.env.ALERT_NUM_WAITING_JOBS ) { - console.info("Initializing alerts"); + Logger.info("Initializing alerts"); const checkActiveJobs = async () => { try { const webScraperQueue = getWebScraperQueue(); const activeJobs = await webScraperQueue.getActiveCount(); if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) { - console.warn( + Logger.warn( `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.` ); sendSlackWebhook( @@ -23,12 +24,12 @@ export async function checkAlerts() { true ); } else { - console.info( + Logger.info( `Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}` ); } } catch (error) { - console.error("Failed to check active jobs:", error); + Logger.error(`Failed to check active jobs: ${error}`); } }; @@ -38,7 +39,7 @@ export async function checkAlerts() { const paused = await webScraperQueue.getPausedCount(); if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) { - console.warn( + Logger.warn( `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. 
Current waiting jobs: ${waitingJobs}.` ); sendSlackWebhook( @@ -57,6 +58,6 @@ export async function checkAlerts() { // setInterval(checkAll, 10000); // Run every } } catch (error) { - console.error("Failed to initialize alerts:", error); + Logger.error(`Failed to initialize alerts: ${error}`); } } diff --git a/apps/api/src/services/alerts/slack.ts b/apps/api/src/services/alerts/slack.ts index f65035b1..89629df7 100644 --- a/apps/api/src/services/alerts/slack.ts +++ b/apps/api/src/services/alerts/slack.ts @@ -1,4 +1,5 @@ import axios from "axios"; +import { Logger } from "../../../src/lib/logger"; export async function sendSlackWebhook( message: string, @@ -16,8 +17,8 @@ export async function sendSlackWebhook( "Content-Type": "application/json", }, }); - console.log("Webhook sent successfully:", response.data); + Logger.log("Webhook sent successfully:", response.data); } catch (error) { - console.error("Error sending webhook:", error); + Logger.error(`Error sending webhook: ${error}`); } } diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 6d0f9a69..9369cdbb 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -263,14 +263,14 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { }); if (creditUsageError) { - console.error("Error calculating credit usage:", creditUsageError); + Logger.error(`Error calculating credit usage: ${creditUsageError}`); } if (creditUsages && creditUsages.length > 0) { totalCreditsUsed = creditUsages[0].total_credits_used; } } catch (error) { - console.error("Error calculating credit usage:", error); + Logger.error(`Error calculating credit usage: ${error}`); } // Adjust total credits used by subtracting coupon value diff --git a/apps/api/src/services/idempotency/create.ts b/apps/api/src/services/idempotency/create.ts index ec3e18e7..291e77d9 100644 --- a/apps/api/src/services/idempotency/create.ts +++ b/apps/api/src/services/idempotency/create.ts @@ -1,5 +1,6 @@ import { Request } from "express"; import { supabase_service } from "../supabase"; +import { Logger } from "../../../src/lib/logger"; export async function createIdempotencyKey( req: Request, @@ -14,7 +15,7 @@ export async function createIdempotencyKey( .insert({ key: idempotencyKey }); if (error) { - console.error("Failed to create idempotency key:", error); + Logger.error(`Failed to create idempotency key: ${error}`); throw error; } diff --git a/apps/api/src/services/idempotency/validate.ts b/apps/api/src/services/idempotency/validate.ts index 1ca348bb..4d58a31d 100644 --- a/apps/api/src/services/idempotency/validate.ts +++ b/apps/api/src/services/idempotency/validate.ts @@ -1,6 +1,7 @@ import { Request } from "express"; import { supabase_service } from "../supabase"; import { validate as isUuid } from 'uuid'; +import { Logger } from "../../../src/lib/logger"; export async function validateIdempotencyKey( req: Request, @@ -13,7 +14,7 @@ export async function validateIdempotencyKey( // Ensure idempotencyKey is treated as a string const key = Array.isArray(idempotencyKey) ? 
idempotencyKey[0] : idempotencyKey; if (!isUuid(key)) { - console.error("Invalid idempotency key provided in the request headers."); + Logger.debug("Invalid idempotency key provided in the request headers."); return false; } @@ -23,7 +24,7 @@ export async function validateIdempotencyKey( .eq("key", idempotencyKey); if (error) { - console.error(error); + Logger.error(`Error validating idempotency key: ${error}`); } if (!data || data.length === 0) { diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts index 1224bd44..68008e02 100644 --- a/apps/api/src/services/logging/crawl_log.ts +++ b/apps/api/src/services/logging/crawl_log.ts @@ -1,4 +1,5 @@ import { supabase_service } from "../supabase"; +import { Logger } from "../../../src/lib/logger"; import "dotenv/config"; export async function logCrawl(job_id: string, team_id: string) { @@ -13,7 +14,7 @@ export async function logCrawl(job_id: string, team_id: string) { }, ]); } catch (error) { - console.error("Error logging crawl job:\n", error); + Logger.error(`Error logging crawl job to supabase:\n${error}`); } } } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index b99d99b4..208159da 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -8,6 +8,10 @@ export async function logScrape( scrapeLog: ScrapeLog, pageOptions?: PageOptions ) { + if (process.env.USE_DB_AUTHENTICATION === "false") { + Logger.debug("Skipping logging scrape to Supabase"); + return; + } try { // Only log jobs in production // if (process.env.ENV !== "production") { diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index e5102acd..d7cd3de0 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -2,6 +2,7 @@ import { supabase_service } from "../supabase"; import { withAuth } from "../../lib/withAuth"; import { Resend } from "resend"; import { NotificationType } from "../../types"; +import { Logger } from "../../../src/lib/logger"; const emailTemplates: Record< NotificationType, @@ -52,11 +53,11 @@ async function sendEmailNotification( }); if (error) { - console.error("Error sending email: ", error); + Logger.debug(`Error sending email: ${error}`); return { success: false }; } } catch (error) { - console.error("Error sending email (2): ", error); + Logger.debug(`Error sending email (2): ${error}`); return { success: false }; } } @@ -79,7 +80,7 @@ export async function sendNotificationInternal( .lte("sent_date", endDateString); if (error) { - console.error("Error fetching notifications: ", error); + Logger.debug(`Error fetching notifications: ${error}`); return { success: false }; } @@ -93,7 +94,7 @@ export async function sendNotificationInternal( .eq("team_id", team_id); if (emailsError) { - console.error("Error fetching emails: ", emailsError); + Logger.debug(`Error fetching emails: ${emailsError}`); return { success: false }; } @@ -112,7 +113,7 @@ export async function sendNotificationInternal( ]); if (insertError) { - console.error("Error inserting notification record: ", insertError); + Logger.debug(`Error inserting notification record: ${insertError}`); return { success: false }; } diff --git a/apps/api/src/services/posthog.ts b/apps/api/src/services/posthog.ts index 5ec16e2e..a7419883 100644 --- a/apps/api/src/services/posthog.ts +++ 
b/apps/api/src/services/posthog.ts @@ -1,5 +1,6 @@ import { PostHog } from 'posthog-node'; import "dotenv/config"; +import { Logger } from '../../src/lib/logger'; export default function PostHogClient() { const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, { @@ -19,7 +20,7 @@ class MockPostHog { export const posthog = process.env.POSTHOG_API_KEY ? PostHogClient() : (() => { - console.warn( + Logger.warn( "POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more." ); return new MockPostHog(); diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index fa6404d7..d34f7b52 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -1,4 +1,5 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; +import { Logger } from "../lib/logger"; // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { @@ -10,13 +11,13 @@ class SupabaseService { // Only initialize the Supabase client if both URL and Service Token are provided. if (process.env.USE_DB_AUTHENTICATION === "false") { // Warn the user that Authentication is disabled by setting the client to null - console.warn( - "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" + Logger.warn( + "Authentication is disabled. Supabase client will not be initialized." ); this.client = null; } else if (!supabaseUrl || !supabaseServiceToken) { - console.error( - "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" + Logger.error( + "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable" ); } else { this.client = createClient(supabaseUrl, supabaseServiceToken); @@ -35,10 +36,15 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { + if (process.env.USE_DB_AUTHENTICATION === "false") { + Logger.debug( + "Attempted to access Supabase client when it's not configured." + ); + } const client = target.getClient(); // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. if (client === null) { - console.error( + Logger.error( "Attempted to access Supabase client when it's not configured." 
); return () => { diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 18378546..2fa6cdcd 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../src/lib/logger"; import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, jobId: string,data: any) => { @@ -15,10 +16,7 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { .eq("team_id", teamId) .limit(1); if (error) { - console.error( - `Error fetching webhook URL for team ID: ${teamId}`, - error.message - ); + Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`); return null; } @@ -53,9 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { }), }); } catch (error) { - console.error( - `Error sending webhook for team ID: ${teamId}`, - error.message - ); + Logger.error(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`); } }; diff --git a/apps/test-suite/utils/supabase.ts b/apps/test-suite/utils/supabase.ts index aa19a8c9..dcefa38c 100644 --- a/apps/test-suite/utils/supabase.ts +++ b/apps/test-suite/utils/supabase.ts @@ -1,5 +1,7 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; import "dotenv/config"; +import { Logger } from "../../api/src/lib/logger"; + // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { private client: SupabaseClient | null = null; @@ -10,13 +12,13 @@ class SupabaseService { // Only initialize the Supabase client if both URL and Service Token are provided. if (process.env.USE_DB_AUTHENTICATION === "false") { // Warn the user that Authentication is disabled by setting the client to null - console.warn( - "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" + Logger.warn( + "Authentication is disabled. Supabase client will not be initialized." ); this.client = null; } else if (!supabaseUrl || !supabaseServiceToken) { - console.error( - "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" + Logger.error( + "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable" ); } else { this.client = createClient(supabaseUrl, supabaseServiceToken); @@ -35,10 +37,15 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { + if (process.env.USE_DB_AUTHENTICATION === "false") { + Logger.debug( + "Attempted to access Supabase client when it's not configured." + ); + } const client = target.getClient(); // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. if (client === null) { - console.error( + Logger.error( "Attempted to access Supabase client when it's not configured." 
); return () => { From 1f1c068eeabc4056bedd1ad9612694f00732e7e3 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 25 Jul 2024 10:00:50 -0300 Subject: [PATCH 10/11] changing from error to debug --- apps/api/src/services/alerts/slack.ts | 2 +- apps/api/src/services/webhook.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/alerts/slack.ts b/apps/api/src/services/alerts/slack.ts index 89629df7..96bf1c09 100644 --- a/apps/api/src/services/alerts/slack.ts +++ b/apps/api/src/services/alerts/slack.ts @@ -19,6 +19,6 @@ export async function sendSlackWebhook( }); Logger.log("Webhook sent successfully:", response.data); } catch (error) { - Logger.error(`Error sending webhook: ${error}`); + Logger.debug(`Error sending webhook: ${error}`); } } diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 2fa6cdcd..b0222ea3 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -51,6 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { }), }); } catch (error) { - Logger.error(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`); + Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`); } }; From 50d2426fc496d4feba7a6bd013a790480cc5bd14 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 16:20:29 -0400 Subject: [PATCH 11/11] Update scrape-events.ts --- apps/api/src/lib/scrape-events.ts | 52 ++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index dda03c38..ab4ef681 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -1,6 +1,7 @@ import { Job, JobId } from "bull"; import type { baseScrapers } from "../scraper/WebScraper/single_url"; import { supabase_service as supabase } from "../services/supabase"; +import { Logger } from "./logger"; export type ScrapeErrorEvent = { type: "error", @@ -36,13 +37,18 @@ export class ScrapeEvents { if (jobId === "TEST") return null; if (process.env.USE_DB_AUTHENTICATION) { - const result = await supabase.from("scrape_events").insert({ - job_id: jobId, - type: content.type, - content: content, - // created_at - }).select().single(); - return (result.data as any).id; + try { + const result = await supabase.from("scrape_events").insert({ + job_id: jobId, + type: content.type, + content: content, + // created_at + }).select().single(); + return (result.data as any).id; + } catch (error) { + Logger.error(`Error inserting scrape event: ${error}`); + return null; + } } return null; @@ -51,20 +57,28 @@ export class ScrapeEvents { static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) { if (logId === null) return; - const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; - await supabase.from("scrape_events").update({ - content: { - ...previousLog.content, - result, - } - }).eq("id", logId); + try { + const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; + await supabase.from("scrape_events").update({ + content: { + ...previousLog.content, + result, + } + }).eq("id", logId); + } catch (error) { + Logger.error(`Error updating scrape result: ${error}`); + } } static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) { - await 
this.insert(((job as any).id ? (job as any).id : job) as string, { - type: "queue", - event, - worker: process.env.FLY_MACHINE_ID, - }); + try { + await this.insert(((job as any).id ? (job as any).id : job) as string, { + type: "queue", + event, + worker: process.env.FLY_MACHINE_ID, + }); + } catch (error) { + Logger.error(`Error logging job event: ${error}`); + } } }
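
For reference while reading the call sites above: these patches rely on a static `Logger` exported from `apps/api/src/lib/logger.ts` exposing `error`, `warn`, `info`, and `debug` methods gated by a configured verbosity level. Below is a minimal sketch of such a level-gated logger; the enum ordering, the environment lookup, and the output format are illustrative assumptions, not the project's actual implementation.

import "dotenv/config";

// Verbosity levels, ordered from least to most verbose (assumed ordering).
enum LogLevel {
  NONE = 0,
  ERROR = 1,
  WARN = 2,
  INFO = 3,
  DEBUG = 4,
  TRACE = 5,
}

export class Logger {
  // Assumption for this sketch: the active level is read from LOGGING_LEVEL, defaulting to INFO.
  private static level: LogLevel =
    LogLevel[(process.env.LOGGING_LEVEL ?? "INFO").toUpperCase() as keyof typeof LogLevel] ??
    LogLevel.INFO;

  // Emit only messages at or below the configured verbosity.
  private static emit(message: unknown, level: LogLevel) {
    if (level !== LogLevel.NONE && level <= Logger.level) {
      console.log(`[${new Date().toISOString()}] [${LogLevel[level]}] ${message}`);
    }
  }

  static error(message: unknown) { Logger.emit(message, LogLevel.ERROR); }
  static warn(message: unknown)  { Logger.emit(message, LogLevel.WARN); }
  static info(message: unknown)  { Logger.emit(message, LogLevel.INFO); }
  static debug(message: unknown) { Logger.emit(message, LogLevel.DEBUG); }
  static trace(message: unknown) { Logger.emit(message, LogLevel.TRACE); }
}

Accepting `unknown` and interpolating it keeps call sites like `Logger.error(error)` (passing an Error object directly, as several controllers above do) working alongside the template-string call sites, though the real implementation may type its parameters differently.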