From 1b7ae5457f68f43017f99b70d4ffbfd06a28497c Mon Sep 17 00:00:00 2001 From: rentianyue-jk Date: Tue, 16 Jul 2024 10:22:54 +0800 Subject: [PATCH 01/45] support custom models --- apps/api/src/lib/LLM-extraction/models.ts | 2 +- docker-compose.yaml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index 8de8ee4b..d190c495 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -48,7 +48,7 @@ function prepareOpenAIDoc( export async function generateOpenAICompletions({ client, - model = "gpt-4o", + model = process.env.MODEL_NAME, document, schema, //TODO - add zod dynamic type checking prompt = defaultPrompt, diff --git a/docker-compose.yaml b/docker-compose.yaml index b88f3ed8..b785308a 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -12,6 +12,8 @@ x-common-service: &common-service - PORT=${PORT:-3002} - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} - OPENAI_API_KEY=${OPENAI_API_KEY} + - OPENAI_BASE_URL=${OPENAI_BASE_URL} + - MODEL_NAME=${MODEL_NAME:-gpt-4o} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} - SERPER_API_KEY=${SERPER_API_KEY} - LLAMAPARSE_API_KEY=${LLAMAPARSE_API_KEY} From 4ef47f7765a23318c619fe340c938be7c64fc683 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 15 Jul 2024 22:52:17 -0400 Subject: [PATCH 02/45] Update models.ts --- apps/api/src/lib/LLM-extraction/models.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/LLM-extraction/models.ts b/apps/api/src/lib/LLM-extraction/models.ts index d190c495..e696a8cd 100644 --- a/apps/api/src/lib/LLM-extraction/models.ts +++ b/apps/api/src/lib/LLM-extraction/models.ts @@ -48,7 +48,7 @@ function prepareOpenAIDoc( export async function generateOpenAICompletions({ client, - model = process.env.MODEL_NAME, + model = process.env.MODEL_NAME || "gpt-4o", document, schema, //TODO - add zod dynamic type checking prompt = defaultPrompt, From a684bd3c5d765e263dfec15aae113339bede4991 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 23 Jul 2024 09:07:23 -0300 Subject: [PATCH 03/45] added regex for links in sitemap --- apps/api/src/scraper/WebScraper/crawler.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 59b53642..00d51853 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -64,6 +64,14 @@ export class WebCrawler { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { return sitemapLinks .filter((link) => { + + // if link is not a complete url, add the base url + link = link.trim(); + const isCompleteUrl = new RegExp('^(?:[a-z+]+:)?//', 'i'); + if (!isCompleteUrl.test(link)){ + link = this.baseUrl + link; + } + const url = new URL(link); const path = url.pathname; From 6208ecdbc0c357099c057891031e7141e785d5d0 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:30:46 -0300 Subject: [PATCH 04/45] added logger --- apps/api/.env.example | 11 ++++ .../__tests__/e2e_full_withAuth/index.test.ts | 1 - apps/api/src/controllers/auth.ts | 7 +-- apps/api/src/controllers/crawl-cancel.ts | 7 +-- apps/api/src/index.ts | 49 ++++++++--------- apps/api/src/lib/logger.ts | 53 +++++++++++++++++++ apps/api/src/main/runWebScraper.ts | 7 ++- 
apps/api/src/scraper/WebScraper/crawler.ts | 21 +++++--- .../WebScraper/custom/handleCustomScraping.ts | 8 +-- apps/api/src/scraper/WebScraper/index.ts | 9 ++-- .../src/scraper/WebScraper/scrapers/fetch.ts | 9 ++-- .../scraper/WebScraper/scrapers/fireEngine.ts | 17 +++--- .../scraper/WebScraper/scrapers/playwright.ts | 13 ++--- .../WebScraper/scrapers/scrapingBee.ts | 7 +-- apps/api/src/scraper/WebScraper/single_url.ts | 24 +++++---- apps/api/src/scraper/WebScraper/sitemap.ts | 5 +- .../utils/__tests__/socialBlockList.test.ts | 3 +- .../scraper/WebScraper/utils/pdfProcessor.ts | 6 ++- .../src/services/billing/credit_billing.ts | 5 +- apps/api/src/services/logging/log_job.ts | 5 +- apps/api/src/services/logging/scrape_log.ts | 5 +- apps/api/src/services/logtail.ts | 7 +-- apps/api/src/services/queue-service.ts | 3 +- apps/api/src/services/queue-worker.ts | 15 +++--- apps/api/src/services/redis.ts | 13 ++--- 25 files changed, 201 insertions(+), 109 deletions(-) create mode 100644 apps/api/src/lib/logger.ts diff --git a/apps/api/.env.example b/apps/api/.env.example index 5d8e746d..08ff7d7f 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -57,3 +57,14 @@ SELF_HOSTED_WEBHOOK_URL= # Resend API Key for transactional emails RESEND_API_KEY= + +# LOGGING_LEVEL determines the verbosity of logs that the system will output. +# Available levels are: +# NONE - No logs will be output. +# ERROR - For logging error messages that indicate a failure in a specific operation. +# WARN - For logging potentially harmful situations that are not necessarily errors. +# INFO - For logging informational messages that highlight the progress of the application. +# DEBUG - For logging detailed information on the flow through the system, primarily used for debugging. +# TRACE - For logging more detailed information than the DEBUG level. +# Set LOGGING_LEVEL to one of the above options to control logging output. 
+LOGGING_LEVEL=INFO diff --git a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts index 3e324d39..019bc968 100644 --- a/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_full_withAuth/index.test.ts @@ -858,7 +858,6 @@ describe("E2E Tests for API Routes", () => { await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again } } - console.log(crawlData) expect(crawlData.length).toBeGreaterThan(0); expect(crawlData).toEqual(expect.arrayContaining([ expect.objectContaining({ url: expect.stringContaining("https://firecrawl.dev/?ref=mendable+banner") }), diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 56a5ec61..aab3d188 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -6,6 +6,7 @@ import { withAuth } from "../../src/lib/withAuth"; import { RateLimiterRedis } from "rate-limiter-flexible"; import { setTraceAttributes } from '@hyperdx/node-opentelemetry'; import { sendNotification } from "../services/notification/email_notification"; +import { Logger } from "../lib/logger"; export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise { return withAuth(supaAuthenticateUser)(req, res, mode); @@ -17,7 +18,7 @@ function setTrace(team_id: string, api_key: string) { api_key }); } catch (error) { - console.error('Error setting trace attributes:', error); + Logger.error(`Error setting trace attributes: ${error.message}`); } } @@ -82,7 +83,7 @@ export async function supaAuthenticateUser( // $$ language plpgsql; if (error) { - console.error('Error fetching key and price_id:', error); + Logger.warn(`Error fetching key and price_id: ${error.message}`); } else { // console.log('Key and Price ID:', data); } @@ -135,7 +136,7 @@ export async function supaAuthenticateUser( try { await rateLimiter.consume(team_endpoint_token); } catch (rateLimiterRes) { - console.error(rateLimiterRes); + Logger.error(`Rate limit exceeded: ${rateLimiterRes}`); const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1; const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext); diff --git a/apps/api/src/controllers/crawl-cancel.ts b/apps/api/src/controllers/crawl-cancel.ts index ff4b2c58..d0c109ec 100644 --- a/apps/api/src/controllers/crawl-cancel.ts +++ b/apps/api/src/controllers/crawl-cancel.ts @@ -5,6 +5,7 @@ import { addWebScraperJob } from "../../src/services/queue-jobs"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabase_service } from "../../src/services/supabase"; import { billTeam } from "../../src/services/billing/credit_billing"; +import { Logger } from "../../src/lib/logger"; export async function crawlCancelController(req: Request, res: Response) { try { @@ -43,7 +44,7 @@ export async function crawlCancelController(req: Request, res: Response) { const { partialDocs } = await job.progress(); if (partialDocs && partialDocs.length > 0 && jobState === "active") { - console.log("Billing team for partial docs..."); + Logger.info("Billing team for partial docs..."); // Note: the credits that we will bill them here might be lower than the actual // due to promises that are not yet resolved await billTeam(team_id, partialDocs.length); @@ -55,7 +56,7 @@ export async function crawlCancelController(req: Request, res: Response) { await job.discard(); await job.moveToFailed(Error("Job cancelled by user"), true); } catch (error) { - 
console.error(error); + Logger.error(error); } const newJobState = await job.getState(); @@ -64,7 +65,7 @@ export async function crawlCancelController(req: Request, res: Response) { status: "cancelled" }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 88ec4418..700cc98e 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -12,16 +12,17 @@ import { sendSlackWebhook } from "./services/alerts/slack"; import { checkAlerts } from "./services/alerts"; import Redis from "ioredis"; import { redisRateLimitClient } from "./services/rate-limiter"; +import { Logger } from "./lib/logger"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; -console.log(`Number of CPUs: ${numCPUs} available`); +Logger.info(`Number of CPUs: ${numCPUs} available`); if (cluster.isMaster) { - console.log(`Master ${process.pid} is running`); + Logger.info(`Master ${process.pid} is running`); // Fork workers. for (let i = 0; i < numCPUs; i++) { @@ -30,8 +31,8 @@ if (cluster.isMaster) { cluster.on("exit", (worker, code, signal) => { if (code !== null) { - console.log(`Worker ${worker.process.pid} exited`); - console.log("Starting a new worker"); + Logger.info(`Worker ${worker.process.pid} exited`); + Logger.info("Starting a new worker"); cluster.fork(); } }); @@ -81,15 +82,10 @@ if (cluster.isMaster) { function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { - console.log(`Worker ${process.pid} listening on port ${port}`); - console.log( - `For the UI, open http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` - ); - console.log(""); - console.log("1. Make sure Redis is running on port 6379 by default"); - console.log( - "2. 
If you want to run nango, make sure you do port forwarding in 3002 using ngrok http 3002 " - ); + Logger.info(`Worker ${process.pid} listening on port ${port}`); + // Logger.info( + // `For the UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + // ); }); return server; } @@ -114,7 +110,7 @@ if (cluster.isMaster) { noActiveJobs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } }); @@ -132,7 +128,7 @@ if (cluster.isMaster) { waitingJobs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } }); @@ -177,13 +173,13 @@ if (cluster.isMaster) { }); if (!response.ok) { - console.error("Failed to send Slack notification"); + Logger.error("Failed to send Slack notification"); } } }, timeout); } } catch (error) { - console.error(error); + Logger.debug(error); } }; @@ -198,7 +194,7 @@ if (cluster.isMaster) { await checkAlerts(); return res.status(200).send("Alerts initialized"); } catch (error) { - console.error("Failed to initialize alerts:", error); + Logger.debug(`Failed to initialize alerts: ${error}`); return res.status(500).send("Failed to initialize alerts"); } } @@ -207,6 +203,7 @@ if (cluster.isMaster) { app.get( `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => { + Logger.info("🐂 Cleaning jobs older than 24h"); try { const webScraperQueue = getWebScraperQueue(); const batchSize = 10; @@ -241,12 +238,12 @@ if (cluster.isMaster) { await job.remove(); count++; } catch (jobError) { - console.error(`Failed to remove job with ID ${job.id}:`, jobError); + Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}` ); } } return res.status(200).send(`Removed ${count} completed jobs.`); } catch (error) { - console.error("Failed to clean last 24h complete jobs:", error); + Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`); return res.status(500).send("Failed to clean jobs"); } } @@ -272,7 +269,7 @@ if (cluster.isMaster) { queueRedisHealth = await queueRedis.get(testKey); await queueRedis.del(testKey); } catch (error) { - console.error("queueRedis health check failed:", error); + Logger.error(`queueRedis health check failed: ${error}`); queueRedisHealth = null; } @@ -283,7 +280,7 @@ if (cluster.isMaster) { redisRateLimitHealth = await redisRateLimitClient.get(testKey); await redisRateLimitClient.del(testKey); } catch (error) { - console.error("redisRateLimitClient health check failed:", error); + Logger.error(`redisRateLimitClient health check failed: ${error}`); redisRateLimitHealth = null; } @@ -297,12 +294,12 @@ if (cluster.isMaster) { healthStatus.queueRedis === "healthy" && healthStatus.redisRateLimitClient === "healthy" ) { - console.log("Both Redis instances are healthy"); + Logger.info("Both Redis instances are healthy"); return res .status(200) .json({ status: "healthy", details: healthStatus }); } else { - console.log("Redis instances health check:", healthStatus); + Logger.info(`Redis instances health check: ${JSON.stringify(healthStatus)}`); await sendSlackWebhook( `[REDIS DOWN] Redis instances health check: ${JSON.stringify( healthStatus )}`, true ); return res .status(500) .json({ status: "unhealthy", details: healthStatus }); } } catch (error) { - console.error("Redis health check failed:", error); + Logger.error(`Redis health check failed: ${error}`); await sendSlackWebhook( `[REDIS DOWN] Redis instances health check: 
${error.message}`, true @@ -326,5 +323,5 @@ if (cluster.isMaster) { } ); - console.log(`Worker ${process.pid} started`); + Logger.info(`Worker ${process.pid} started`); } diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts new file mode 100644 index 00000000..872dbf51 --- /dev/null +++ b/apps/api/src/lib/logger.ts @@ -0,0 +1,53 @@ +enum LogLevel { + NONE = 'NONE', // No logs will be output. + ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation. + WARN = 'WARN', // For logging potentially harmful situations that are not necessarily errors. + INFO = 'INFO', // For logging informational messages that highlight the progress of the application. + DEBUG = 'DEBUG', // For logging detailed information on the flow through the system, primarily used for debugging. + TRACE = 'TRACE' // For logging more detailed information than the DEBUG level. +} +export class Logger { + static colors = { + ERROR: '\x1b[31m%s\x1b[0m', // Red + WARN: '\x1b[33m%s\x1b[0m', // Yellow + INFO: '\x1b[34m%s\x1b[0m', // Blue + DEBUG: '\x1b[36m%s\x1b[0m', // Cyan + TRACE: '\x1b[35m%s\x1b[0m' // Magenta + }; + + static log (message: string, level: LogLevel) { + const logLevel: LogLevel = LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || LogLevel.INFO; + const levels = [LogLevel.NONE, LogLevel.ERROR, LogLevel.WARN, LogLevel.INFO, LogLevel.DEBUG, LogLevel.TRACE]; + const currentLevelIndex = levels.indexOf(logLevel); + const messageLevelIndex = levels.indexOf(level); + + if (currentLevelIndex >= messageLevelIndex) { + const color = Logger.colors[level]; + console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`); + + // if (process.env.USE_DB_AUTH) { + // save to supabase? another place? 
+ // supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean }); + // } + } + } + static error(message: string | any) { + Logger.log(message, LogLevel.ERROR); + } + + static warn(message: string) { + Logger.log(message, LogLevel.WARN); + } + + static info(message: string) { + Logger.log(message, LogLevel.INFO); + } + + static debug(message: string) { + Logger.log(message, LogLevel.DEBUG); + } + + static trace(message: string) { + Logger.log(message, LogLevel.TRACE); + } +} diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 3c98e11e..0a7da035 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -10,6 +10,7 @@ import { DocumentUrl, Progress } from "../lib/entities"; import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../lib/entities"; import { supabase_service } from "../services/supabase"; +import { Logger } from "../lib/logger"; export async function startWebScraperPipeline({ job, @@ -23,6 +24,7 @@ export async function startWebScraperPipeline({ crawlerOptions: job.data.crawlerOptions, pageOptions: job.data.pageOptions, inProgress: (progress) => { + Logger.debug(`🐂 Job in progress ${job.id}`); if (progress.currentDocument) { partialDocs.push(progress.currentDocument); if (partialDocs.length > 50) { @@ -32,9 +34,11 @@ export async function startWebScraperPipeline({ } }, onSuccess: (result) => { + Logger.debug(`🐂 Job completed ${job.id}`); saveJob(job, result); }, onError: (error) => { + Logger.error(`🐂 Job failed ${job.id}`); job.moveToFailed(error); }, team_id: job.data.team_id, @@ -108,7 +112,6 @@ export async function runWebScraper({ // this return doesn't matter too much for the job completion result return { success: true, message: "", docs: filteredDocs }; } catch (error) { - console.error("Error running web scraper", error); onError(error); return { success: false, message: error.message, docs: [] }; } @@ -136,6 +139,6 @@ const saveJob = async (job: Job, result: any) => { } } } catch (error) { - console.error("Failed to update job status:", error); + Logger.error(`🐂 Failed to update job status: ${error}`); } }; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 59b53642..3d7c267a 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -8,6 +8,7 @@ import { scrapSingleUrl } from "./single_url"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; import { axiosTimeout } from "../../../src/lib/timeout"; +import { Logger } from "../../../src/lib/logger"; export class WebCrawler { private initialUrl: string; private baseUrl: string; private includes: string[]; @@ -116,7 +117,7 @@ export class WebCrawler { const isAllowed = this.robots.isAllowed(link, "FireCrawlAgent") ?? 
true; // Check if the link is disallowed by robots.txt if (!isAllowed) { - console.log(`Link disallowed by robots.txt: ${link}`); + Logger.debug(`Link disallowed by robots.txt: ${link}`); return false; } @@ -133,15 +134,19 @@ export class WebCrawler { limit: number = 10000, maxDepth: number = 10 ): Promise<{ url: string, html: string }[]> { + + Logger.debug(`Crawler starting with ${this.initialUrl}`); // Fetch and parse robots.txt try { const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout }); this.robots = robotsParser(this.robotsTxtUrl, response.data); + Logger.debug(`Crawler robots.txt fetched with ${this.robotsTxtUrl}`); } catch (error) { - console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); + Logger.debug(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); } - if(!crawlerOptions?.ignoreSitemap){ + if (!crawlerOptions?.ignoreSitemap){ + Logger.debug(`Fetching sitemap links from ${this.initialUrl}`); const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); if (sitemapLinks.length > 0) { let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); @@ -175,6 +180,7 @@ export class WebCrawler { inProgress?: (progress: Progress) => void, ): Promise<{ url: string, html: string }[]> { const queue = async.queue(async (task: string, callback) => { + Logger.debug(`Crawling ${task}`); if (this.crawledUrls.size >= Math.min(this.maxCrawledLinks, this.limit)) { if (callback && typeof callback === "function") { callback(); @@ -216,16 +222,18 @@ export class WebCrawler { } }, concurrencyLimit); + Logger.debug(`🐂 Pushing ${urls.length} URLs to the queue`); queue.push( urls.filter( (url) => !this.visited.has(url) && this.robots.isAllowed(url, "FireCrawlAgent") ), (err) => { - if (err) console.error(err); + if (err) Logger.error(`🐂 Error pushing URLs to the queue: ${err}`); } ); await queue.drain(); + Logger.debug(`🐂 Crawled ${this.crawledUrls.size} URLs, Queue drained.`); return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } @@ -282,7 +290,6 @@ export class WebCrawler { const urlObj = new URL(fullUrl); const path = urlObj.pathname; - if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS if (this.isInternalLink(fullUrl) && this.noSections(fullUrl) && @@ -452,7 +459,7 @@ export class WebCrawler { sitemapLinks = await getLinksFromSitemap({ sitemapUrl }); } } catch (error) { - console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); + Logger.debug(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`); const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' }); if (response) { sitemapLinks = response; } } @@ -467,7 +474,7 @@ export class WebCrawler { sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap }); } } catch (error) { - console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); + Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); } } diff --git a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts index f8b2503e..e2f8d8cc 100644 --- a/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts +++ b/apps/api/src/scraper/WebScraper/custom/handleCustomScraping.ts @@ -1,10 +1,12 @@ +import { Logger } from "../../../lib/logger"; + export async function handleCustomScraping( text: string, url: string ): Promise<{ 
scraper: string; url: string; waitAfterLoad?: number, pageOptions?: { scrollXPaths?: string[] } } | null> { // Check for Readme Docs special case if (text.includes(' 0 ? this.getSitemapDataForSingleUrl(this.urls[0], links[0], 1500).catch( (error) => { - console.error("Failed to fetch sitemap data:", error); + Logger.debug(`Failed to fetch sitemap data: ${error}`); return null; } ) @@ -460,7 +461,7 @@ export class WebScraperDataProvider { let documents: Document[] = []; for (const url of urls) { const normalizedUrl = this.normalizeUrl(url); - console.log( + Logger.debug( "Getting cached document for web-scraper-cache:" + normalizedUrl ); const cachedDocumentString = await getValue( diff --git a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts index 4c31438c..c9ddf93a 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fetch.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fetch.ts @@ -2,6 +2,7 @@ import axios from "axios"; import { logScrape } from "../../../services/logging/scrape_log"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with Axios @@ -34,9 +35,7 @@ export async function scrapWithFetch( }); if (response.status !== 200) { - console.error( - `[Axios] Error fetching url: ${url} with status: ${response.status}` - ); + Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}`); logParams.error_message = response.statusText; logParams.response_code = response.status; return { @@ -63,10 +62,10 @@ export async function scrapWithFetch( } catch (error) { if (error.code === "ECONNABORTED") { logParams.error_message = "Request timed out"; - console.log(`[Axios] Request timed out for ${url}`); + Logger.debug(`⛏️ Axios: Request timed out for ${url}`); } else { logParams.error_message = error.message || error; - console.error(`[Axios] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`); } return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index e547c019..0f4c2320 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -4,6 +4,7 @@ import { logScrape } from "../../../services/logging/scrape_log"; import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with Fire-Engine @@ -59,12 +60,10 @@ export async function scrapWithFireEngine({ let engine = engineParam; // do we want fireEngineOptions as first choice? - console.log( - `[Fire-Engine][${engine}] Scraping ${url} with wait: ${waitParam} and screenshot: ${screenshotParam} and method: ${fireEngineOptionsParam?.method ?? "null"}` + Logger.info( + `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? 
"null"} }` ); - // console.log(fireEngineOptionsParam) - const response = await axios.post( process.env.FIRE_ENGINE_BETA_URL + endpoint, { @@ -84,15 +83,15 @@ export async function scrapWithFireEngine({ ); if (response.status !== 200) { - console.error( - `[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}` + Logger.debug( + `⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}` ); logParams.error_message = response.data?.pageError; logParams.response_code = response.data?.pageStatusCode; if(response.data && response.data?.pageStatusCode !== 200) { - console.error(`[Fire-Engine][${engine}] Error fetching url: ${url} with status: ${response.status}`); + Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`); } return { @@ -130,10 +129,10 @@ export async function scrapWithFireEngine({ } } catch (error) { if (error.code === "ECONNABORTED") { - console.log(`[Fire-Engine] Request timed out for ${url}`); + Logger.debug(`⛏️ Fire-Engine: Request timed out for ${url}`); logParams.error_message = "Request timed out"; } else { - console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ Fire-Engine: Failed to fetch url: ${url} | Error: ${error}`); logParams.error_message = error.message || error; } return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message }; diff --git a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts index 11c3c5ad..4b3180a3 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/playwright.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/playwright.ts @@ -3,6 +3,7 @@ import { logScrape } from "../../../services/logging/scrape_log"; import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with Playwright @@ -51,8 +52,8 @@ export async function scrapWithPlaywright( ); if (response.status !== 200) { - console.error( - `[Playwright] Error fetching url: ${url} with status: ${response.status}` + Logger.debug( + `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}` ); logParams.error_message = response.data?.pageError; logParams.response_code = response.data?.pageStatusCode; @@ -86,8 +87,8 @@ export async function scrapWithPlaywright( }; } catch (jsonError) { logParams.error_message = jsonError.message || jsonError; - console.error( - `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` + Logger.debug( + `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}` ); return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } @@ -95,10 +96,10 @@ export async function scrapWithPlaywright( } catch (error) { if (error.code === "ECONNABORTED") { logParams.error_message = "Request timed out"; - console.log(`[Playwright] Request timed out for ${url}`); + Logger.debug(`⛏️ Playwright: Request timed out for ${url}`); } else { logParams.error_message = error.message || error; - console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}`); } return { content: "", pageStatusCode: null, pageError: logParams.error_message }; } finally { diff --git 
a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts index 9a1f0b35..554bfe22 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/scrapingBee.ts @@ -3,6 +3,7 @@ import { generateRequestParams } from "../single_url"; import { fetchAndProcessPdf } from "../utils/pdfProcessor"; import { universalTimeout } from "../global"; import { ScrapingBeeClient } from "scrapingbee"; +import { Logger } from "../../../lib/logger"; /** * Scrapes a URL with ScrapingBee @@ -56,8 +57,8 @@ export async function scrapWithScrapingBee( text = decoder.decode(response.data); logParams.success = true; } catch (decodeError) { - console.error( - `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` + Logger.debug( + `⛏️ ScrapingBee: Error decoding response data for url: ${url} | Error: ${decodeError}` ); logParams.error_message = decodeError.message || decodeError; } @@ -72,7 +73,7 @@ export async function scrapWithScrapingBee( }; } } catch (error) { - console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); + Logger.debug(`⛏️ ScrapingBee: Error fetching url: ${url} | Error: ${error}`); logParams.error_message = error.message || error; logParams.response_code = error.response?.status; return { diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 8fbd31e4..1e7f1fac 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -17,6 +17,7 @@ import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { scrapWithPlaywright } from "./scrapers/playwright"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; import { extractLinks } from "./utils/utils"; +import { Logger } from "../../lib/logger"; dotenv.config(); @@ -48,7 +49,7 @@ export async function generateRequestParams( return defaultParams; } } catch (error) { - console.error(`Error generating URL key: ${error}`); + Logger.error(`Error generating URL key: ${error}`); return defaultParams; } } @@ -154,7 +155,6 @@ export async function scrapSingleUrl( } if (process.env.FIRE_ENGINE_BETA_URL) { - console.log(`Scraping ${url} with Fire Engine`); const response = await scrapWithFireEngine({ url, waitFor: pageOptions.waitFor, @@ -277,7 +277,7 @@ export async function scrapSingleUrl( try { urlKey = new URL(urlToScrap).hostname.replace(/^www\./, ""); } catch (error) { - console.error(`Invalid URL key, trying: ${urlToScrap}`); + Logger.error(`Invalid URL key, trying: ${urlToScrap}`); } const defaultScraper = urlSpecificParams[urlKey]?.defaultScraper ?? 
""; const scrapersInOrder = getScrapingFallbackOrder( @@ -311,12 +311,18 @@ export async function scrapSingleUrl( pageError = undefined; } - if (text && text.trim().length >= 100) break; - if (pageStatusCode && pageStatusCode == 404) break; - const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; - if (nextScraperIndex < scrapersInOrder.length) { - console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`); + if (text && text.trim().length >= 100) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`); + break; } + if (pageStatusCode && pageStatusCode == 404) { + Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with status code 404, breaking`); + break; + } + // const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1; + // if (nextScraperIndex < scrapersInOrder.length) { + // Logger.debug(`⛏️ ${scraper} Failed to fetch URL: ${urlToScrap} with status: ${pageStatusCode}, error: ${pageError} | Falling back to ${scrapersInOrder[nextScraperIndex]}`); + // } } if (!text) { @@ -372,7 +378,7 @@ export async function scrapSingleUrl( return document; } catch (error) { - console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); + Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); return { content: "", markdown: "", diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 1dfbf3a1..b41f3eeb 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -3,6 +3,7 @@ import { axiosTimeout } from "../../lib/timeout"; import { parseStringPromise } from "xml2js"; import { scrapWithFireEngine } from "./scrapers/fireEngine"; import { WebCrawler } from "./crawler"; +import { Logger } from "../../lib/logger"; export async function getLinksFromSitemap( { @@ -26,7 +27,7 @@ export async function getLinksFromSitemap( content = response.html; } } catch (error) { - console.error(`Request failed for ${sitemapUrl}: ${error}`); + Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`); return allUrls; } @@ -48,7 +49,7 @@ export async function getLinksFromSitemap( } } } catch (error) { - console.error(`Error processing ${sitemapUrl}: ${error}`); + Logger.debug(`Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}`); } return allUrls; diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts index 4449285d..c09cc5b3 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/socialBlockList.test.ts @@ -1,3 +1,4 @@ +import { Logger } from '../../../../lib/logger'; import { isUrlBlocked } from '../blocklist'; describe('isUrlBlocked', () => { @@ -19,7 +20,7 @@ describe('isUrlBlocked', () => { blockedUrls.forEach(url => { if (!isUrlBlocked(url)) { - console.log(`URL not blocked: ${url}`); + Logger.debug(`URL not blocked: ${url}`); } expect(isUrlBlocked(url)).toBe(true); }); diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 3e01571e..1a9fa11b 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -7,6 +7,7 @@ import pdf from "pdf-parse"; import path from "path"; import os from "os"; import { axiosTimeout } from "../../../lib/timeout"; +import { 
Logger } from "../../../lib/logger"; dotenv.config(); @@ -39,6 +40,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro let content = ""; if (process.env.LLAMAPARSE_API_KEY && parsePDF) { + Logger.debug("Processing pdf document w/ LlamaIndex"); const apiKey = process.env.LLAMAPARSE_API_KEY; const headers = { Authorization: `Bearer ${apiKey}`, @@ -81,7 +83,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds } } catch (error) { - console.error("Error fetching result w/ LlamaIndex"); + Logger.debug("Error fetching result w/ LlamaIndex"); attempt++; await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying // You may want to handle specific errors differently @@ -93,7 +95,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro } content = resultResponse.data[resultType]; } catch (error) { - console.error("Error processing pdf document w/ LlamaIndex(2)"); + Logger.debug("Error processing pdf document w/ LlamaIndex(2)"); content = await processPdf(filePath); } } else if (parsePDF) { diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 82668111..6d0f9a69 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -2,6 +2,7 @@ import { NotificationType } from "../../types"; import { withAuth } from "../../lib/withAuth"; import { sendNotification } from "../notification/email_notification"; import { supabase_service } from "../supabase"; +import { Logger } from "../../lib/logger"; const FREE_CREDITS = 500; @@ -12,7 +13,7 @@ export async function supaBillTeam(team_id: string, credits: number) { if (team_id === "preview") { return { success: true, message: "Preview team, no credits used" }; } - console.log(`Billing team ${team_id} for ${credits} credits`); + Logger.info(`Billing team ${team_id} for ${credits} credits`); // When the API is used, you can log the credit usage in the credit_usage table: // team_id: The ID of the team using the API. // subscription_id: The ID of the team's active subscription. 
@@ -218,7 +219,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { 0 ); - console.log("totalCreditsUsed", totalCreditsUsed); + Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`); const end = new Date(); end.setDate(end.getDate() + 30); diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 9764493f..93d0b311 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -3,6 +3,7 @@ import { supabase_service } from "../supabase"; import { FirecrawlJob } from "../../types"; import { posthog } from "../posthog"; import "dotenv/config"; +import { Logger } from "../../lib/logger"; export async function logJob(job: FirecrawlJob) { try { @@ -68,9 +69,9 @@ export async function logJob(job: FirecrawlJob) { posthog.capture(phLog); } if (error) { - console.error("Error logging job:\n", error); + Logger.error(`Error logging job: ${error.message}`); } } catch (error) { - console.error("Error logging job:\n", error); + Logger.error(`Error logging job: ${error.message}`); } } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 9f34222f..b99d99b4 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -2,6 +2,7 @@ import "dotenv/config"; import { ScrapeLog } from "../../types"; import { supabase_service } from "../supabase"; import { PageOptions } from "../../lib/entities"; +import { Logger } from "../../lib/logger"; export async function logScrape( scrapeLog: ScrapeLog, @@ -39,9 +40,9 @@ export async function logScrape( ]); if (error) { - console.error("Error logging proxy:\n", error); + Logger.error(`Error logging proxy:\n${error}`); } } catch (error) { - console.error("Error logging proxy:\n", error); + Logger.error(`Error logging proxy:\n${error}`); } } diff --git a/apps/api/src/services/logtail.ts b/apps/api/src/services/logtail.ts index 8b86a6b1..d9af3c7a 100644 --- a/apps/api/src/services/logtail.ts +++ b/apps/api/src/services/logtail.ts @@ -1,19 +1,20 @@ import { Logtail } from "@logtail/node"; import "dotenv/config"; +import { Logger } from "../lib/logger"; // A mock Logtail class to handle cases where LOGTAIL_KEY is not provided class MockLogtail { info(message: string, context?: Record): void { - console.log(message, context); + Logger.debug(`${message} - ${context}`); } error(message: string, context: Record = {}): void { - console.error(message, context); + Logger.error(`${message} - ${context}`); } } // Using the actual Logtail class if LOGTAIL_KEY exists, otherwise using the mock class // Additionally, print a warning to the terminal if LOGTAIL_KEY is not provided export const logtail = process.env.LOGTAIL_KEY ? new Logtail(process.env.LOGTAIL_KEY) : (() => { - console.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. see logtail.ts for more."); + Logger.warn("LOGTAIL_KEY is not provided - your events will not be logged. Using MockLogtail as a fallback. 
see logtail.ts for more."); return new MockLogtail(); })(); diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index 6c817a4a..d531c2db 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -1,5 +1,6 @@ import Queue from "bull"; import { Queue as BullQueue } from "bull"; +import { Logger } from "../lib/logger"; let webScraperQueue: BullQueue; @@ -16,7 +17,7 @@ export function getWebScraperQueue() { attempts: 5 } }); - console.log("Web scraper queue created"); + Logger.info("Web scraper queue created"); } return webScraperQueue; } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index be2a4c70..532c5bc1 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -7,8 +7,9 @@ import { callWebhook } from "./webhook"; import { logJob } from "./logging/log_job"; import { initSDK } from '@hyperdx/node-opentelemetry'; import { Job } from "bull"; +import { Logger } from "../lib/logger"; -if(process.env.ENV === 'production') { +if (process.env.ENV === 'production') { initSDK({ consoleCapture: true, additionalInstrumentations: [], @@ -18,7 +19,7 @@ if(process.env.ENV === 'production') { const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { - console.log("taking job", job.id); + Logger.debug(`🐂 Worker taking job ${job.id}`); try { job.progress({ current: 1, @@ -58,18 +59,18 @@ async function processJob(job: Job, done) { pageOptions: job.data.pageOptions, origin: job.data.origin, }); - console.log("job done", job.id); + Logger.debug(`🐂 Job done ${job.id}`); done(null, data); } catch (error) { - console.log("job errored", job.id, error); + Logger.error(`🐂 Job errored ${job.id} - ${error}`); if (await getWebScraperQueue().isPaused(false)) { - console.log("queue is paused, ignoring"); + Logger.debug("🐂Queue is paused, ignoring"); return; } if (error instanceof CustomError) { // Here we handle the error, then save the failed job - console.error(error.message); // or any other error handling + Logger.error(error.message); // or any other error handling logtail.error("Custom error while ingesting", { job_id: job.id, @@ -77,7 +78,7 @@ async function processJob(job: Job, done) { dataIngestionJob: error.dataIngestionJob, }); } - console.log(error); + Logger.error(error); logtail.error("Overall error ingesting", { job_id: job.id, diff --git a/apps/api/src/services/redis.ts b/apps/api/src/services/redis.ts index b720a330..173d2b78 100644 --- a/apps/api/src/services/redis.ts +++ b/apps/api/src/services/redis.ts @@ -1,14 +1,15 @@ import Redis from "ioredis"; import { redisRateLimitClient } from "./rate-limiter"; +import { Logger } from "../lib/logger"; // Listen to 'error' events to the Redis connection redisRateLimitClient.on("error", (error) => { try { if (error.message === "ECONNRESET") { - console.log("Connection to Redis Session Store timed out."); + Logger.error("Connection to Redis Session Rate Limit Store timed out."); } else if (error.message === "ECONNREFUSED") { - console.log("Connection to Redis Session Store refused!"); - } else console.log(error); + Logger.error("Connection to Redis Session Rate Limit Store refused!"); + } else Logger.error(error); } catch (error) {} }); @@ -16,15 +17,15 @@ redisRateLimitClient.on("error", (error) => { redisRateLimitClient.on("reconnecting", (err) => { try { if (redisRateLimitClient.status === "reconnecting") - console.log("Reconnecting to Redis Session Store..."); - else console.log("Error reconnecting to Redis Session Store."); + Logger.info("Reconnecting to Redis Session Rate Limit Store..."); + else Logger.error("Error reconnecting to Redis Session Rate Limit Store."); } catch (error) {} }); // Listen to the 'connect' event to Redis redisRateLimitClient.on("connect", (err) => { try { - if (!err) console.log("Connected to Redis Session Store!"); + if (!err) Logger.info("Connected to Redis Session Rate Limit Store!"); } catch (error) {} }); From 5e728c1a4d1286fb4d2beb6d302d98f731f8ec24 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 24 Jul 2024 08:33:00 -0300 Subject: [PATCH 05/45] Update apps/api/src/scraper/WebScraper/crawler.ts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit no need for regex Co-authored-by: Gergő Móricz --- apps/api/src/scraper/WebScraper/crawler.ts | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 00d51853..640eada0 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -64,15 +64,7 @@ export class WebCrawler { private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] { return sitemapLinks .filter((link) => { - - // if link is not a complete url, add the base url - link = link.trim(); - const isCompleteUrl = new RegExp('^(?:[a-z+]+:)?//', 'i'); - if (!isCompleteUrl.test(link)){ - link = this.baseUrl + link; - } - - const url = new URL(link); + const url = new URL(link.trim(), this.baseUrl); const path = url.pathname; const depth = getURLDepth(url.toString()); From 7cd9bf92e33b1079cff7e92dcb28bc548021bee5 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 14:31:25 +0200 Subject: [PATCH 06/45] feat: scrape event logging to DB --- apps/api/src/controllers/crawl.ts | 4 +- apps/api/src/controllers/scrape.ts | 2 + apps/api/src/controllers/search.ts | 7 +++ apps/api/src/example.ts | 1 + apps/api/src/lib/entities.ts | 1 + apps/api/src/lib/scrape-events.ts | 58 +++++++++++++++++++ apps/api/src/main/runWebScraper.ts | 2 + .../WebScraper/__tests__/crawler.test.ts | 6 ++ .../WebScraper/__tests__/single_url.test.ts | 6 +- apps/api/src/scraper/WebScraper/crawler.ts | 6 +- apps/api/src/scraper/WebScraper/index.ts | 4 ++ apps/api/src/scraper/WebScraper/single_url.ts | 28 ++++++++- 12 files changed, 118 insertions(+), 7 deletions(-) create mode 100644 apps/api/src/lib/scrape-events.ts diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 89358fcc..3388f8a7 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -10,6 +10,7 @@ import { logCrawl } from "../../src/services/logging/crawl_log"; import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; import { createIdempotencyKey } from "../../src/services/idempotency/create"; import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values"; +import { v4 as uuidv4 } from "uuid"; export async function crawlController(req: Request, res: Response) { try { @@ -60,10 +61,11 @@ export async function crawlController(req: Request, res: Response) { const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions }; const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions }; - if (mode === "single_urls" && !url.includes(",")) { 
+ if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this? try { const a = new WebScraperDataProvider(); await a.setOptions({ + jobId: uuidv4(), mode: "single_urls", urls: [url], crawlerOptions: { ...crawlerOptions, returnOnlyUrls: true }, diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index fb57e41d..bfbb1e2a 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -9,6 +9,7 @@ import { Document } from "../lib/entities"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; +import { v4 as uuidv4 } from "uuid"; export async function scrapeHelper( req: Request, @@ -35,6 +36,7 @@ export async function scrapeHelper( const a = new WebScraperDataProvider(); await a.setOptions({ + jobId: uuidv4(), mode: "single_urls", urls: [url], crawlerOptions: { diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 8cb6d55b..e6ab8e9d 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -7,8 +7,10 @@ import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; import { search } from "../search"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { v4 as uuidv4 } from "uuid"; export async function searchHelper( + jobId: string, req: Request, team_id: string, crawlerOptions: any, @@ -75,6 +77,7 @@ export async function searchHelper( const a = new WebScraperDataProvider(); await a.setOptions({ + jobId, mode: "single_urls", urls: res.map((r) => r.url).slice(0, searchOptions.limit ?? 7), crawlerOptions: { @@ -148,6 +151,8 @@ export async function searchController(req: Request, res: Response) { const searchOptions = req.body.searchOptions ?? { limit: 7 }; + const jobId = uuidv4(); + try { const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1); @@ -160,6 +165,7 @@ export async function searchController(req: Request, res: Response) { } const startTime = new Date().getTime(); const result = await searchHelper( + jobId, req, team_id, crawlerOptions, @@ -169,6 +175,7 @@ export async function searchController(req: Request, res: Response) { const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; logJob({ + job_id: jobId, success: result.success, message: result.error, num_docs: result.data ? 
result.data.length : 0, diff --git a/apps/api/src/example.ts b/apps/api/src/example.ts index 0c30ea33..edf0faef 100644 --- a/apps/api/src/example.ts +++ b/apps/api/src/example.ts @@ -4,6 +4,7 @@ async function example() { const example = new WebScraperDataProvider(); await example.setOptions({ + jobId: "TEST", mode: "crawl", urls: ["https://mendable.ai"], crawlerOptions: {}, diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index f60e197f..56cd793a 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -56,6 +56,7 @@ export type CrawlerOptions = { } export type WebScraperOptions = { + jobId: string; urls: string[]; mode: "single_urls" | "sitemap" | "crawl"; crawlerOptions?: CrawlerOptions; diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts new file mode 100644 index 00000000..ea30ffc2 --- /dev/null +++ b/apps/api/src/lib/scrape-events.ts @@ -0,0 +1,58 @@ +import type { baseScrapers } from "../scraper/WebScraper/single_url"; +import { supabase_service as supabase } from "../services/supabase"; + +export type ScrapeErrorEvent = { + type: "error", + message: string, + stack?: string, +} + +export type ScrapeScrapeEvent = { + type: "scrape", + method: (typeof baseScrapers)[number], + result: null | { + success: boolean, + response_code?: number, + error?: string, + // proxy?: string, + time_taken: number, + }, +} + +export type ScrapeQueueEvent = { + type: "queue", + event: "created" | "started" | "interrupted" | "finished", + worker?: string, +} + +export type ScrapeEvent = ScrapeErrorEvent | ScrapeScrapeEvent | ScrapeQueueEvent; + +export class ScrapeEvents { + static async insert(jobId: string, content: ScrapeEvent) { + if (jobId === "TEST") return null; + + if (process.env.USE_DB_AUTH) { + const result = await supabase.from("scrape_events").insert({ + job_id: jobId, + type: content.type, + content: content, + // created_at + }).single(); + return (result.data as any).id; + } + + return null; + } + + static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) { + if (logId === null) return; + + const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; + await supabase.from("scrape_events").update({ + content: { + ...previousLog.content, + result, + } + }).eq("id", logId); + } +} diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 0a7da035..5c97ef41 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -60,6 +60,7 @@ export async function runWebScraper({ const provider = new WebScraperDataProvider(); if (mode === "crawl") { await provider.setOptions({ + jobId: bull_job_id, mode: mode, urls: [url], crawlerOptions: crawlerOptions, @@ -68,6 +69,7 @@ export async function runWebScraper({ }); } else { await provider.setOptions({ + jobId: bull_job_id, mode: mode, urls: url.split(","), crawlerOptions: crawlerOptions, diff --git a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts index 32c8b0a0..20419ffa 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts @@ -42,6 +42,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -76,6 +77,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, 
includes: [], excludes: [], @@ -104,6 +106,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -133,6 +136,7 @@ describe('WebCrawler', () => { crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -161,6 +165,7 @@ describe('WebCrawler', () => { // Setup the crawler with the specific test case options const crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], @@ -194,6 +199,7 @@ describe('WebCrawler', () => { const limit = 2; // Set a limit for the number of links crawler = new WebCrawler({ + jobId: "TEST", initialUrl: initialUrl, includes: [], excludes: [], diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 8a9df227..4b720835 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -15,8 +15,8 @@ describe('scrapSingleUrl', () => { const pageOptionsWithHtml: PageOptions = { includeHtml: true }; const pageOptionsWithoutHtml: PageOptions = { includeHtml: false }; - const resultWithHtml = await scrapSingleUrl(url, pageOptionsWithHtml); - const resultWithoutHtml = await scrapSingleUrl(url, pageOptionsWithoutHtml); + const resultWithHtml = await scrapSingleUrl("TEST", url, pageOptionsWithHtml); + const resultWithoutHtml = await scrapSingleUrl("TEST", url, pageOptionsWithoutHtml); expect(resultWithHtml.html).toBeDefined(); expect(resultWithoutHtml.html).toBeUndefined(); @@ -27,7 +27,7 @@ it('should return a list of links on the mendable.ai page', async () => { const url = 'https://mendable.ai'; const pageOptions: PageOptions = { includeHtml: true }; - const result = await scrapSingleUrl(url, pageOptions); + const result = await scrapSingleUrl("TEST", url, pageOptions); // Check if the result contains a list of links expect(result.linksOnPage).toBeDefined(); diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3d7c267a..c592f400 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -11,6 +11,7 @@ import { axiosTimeout } from "../../../src/lib/timeout"; import { Logger } from "../../../src/lib/logger"; export class WebCrawler { + private jobId: string; private initialUrl: string; private baseUrl: string; private includes: string[]; @@ -27,6 +28,7 @@ export class WebCrawler { private allowExternalContentLinks: boolean; constructor({ + jobId, initialUrl, includes, excludes, @@ -37,6 +39,7 @@ export class WebCrawler { allowBackwardCrawling = false, allowExternalContentLinks = false }: { + jobId: string; initialUrl: string; includes?: string[]; excludes?: string[]; @@ -47,6 +50,7 @@ export class WebCrawler { allowBackwardCrawling?: boolean; allowExternalContentLinks?: boolean; }) { + this.jobId = jobId; this.initialUrl = initialUrl; this.baseUrl = new URL(initialUrl).origin; this.includes = includes ?? []; @@ -261,7 +265,7 @@ export class WebCrawler { // If it is the first link, fetch with single url if (this.visited.size === 1) { - const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); + const page = await scrapSingleUrl(this.jobId, url, { ...pageOptions, includeHtml: true }); content = page.html ?? 
""; pageStatusCode = page.metadata?.pageStatusCode; pageError = page.metadata?.pageError || undefined; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 6506620e..eff709fa 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -22,6 +22,7 @@ import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils"; import { Logger } from "../../lib/logger"; export class WebScraperDataProvider { + private jobId: string; private bullJobId: string; private urls: string[] = [""]; private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; @@ -66,6 +67,7 @@ export class WebScraperDataProvider { batchUrls.map(async (url, index) => { const existingHTML = allHtmls ? allHtmls[i + index] : ""; const result = await scrapSingleUrl( + this.jobId, url, this.pageOptions, this.extractorOptions, @@ -166,6 +168,7 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { const crawler = new WebCrawler({ + jobId: this.jobId, initialUrl: this.urls[0], includes: this.includes, excludes: this.excludes, @@ -500,6 +503,7 @@ export class WebScraperDataProvider { throw new Error("Urls are required"); } + this.jobId = options.jobId; this.bullJobId = options.bullJobId; this.urls = options.urls; this.mode = options.mode; diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 1e7f1fac..68474fed 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -18,10 +18,11 @@ import { scrapWithPlaywright } from "./scrapers/playwright"; import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; import { extractLinks } from "./utils/utils"; import { Logger } from "../../lib/logger"; +import { ScrapeEvents } from "../../lib/scrape-events"; dotenv.config(); -const baseScrapers = [ +export const baseScrapers = [ "fire-engine", "fire-engine;chrome-cdp", "scrapingBee", @@ -118,6 +119,7 @@ function getScrapingFallbackOrder( export async function scrapSingleUrl( + jobId: string, urlToScrap: string, pageOptions: PageOptions = { onlyMainContent: true, @@ -145,6 +147,13 @@ export async function scrapSingleUrl( } = { text: "", screenshot: "", metadata: {} }; let screenshot = ""; + const timer = Date.now(); + const logInsertPromise = ScrapeEvents.insert(jobId, { + type: "scrape", + method, + result: null, + }); + switch (method) { case "fire-engine": case "fire-engine;chrome-cdp": @@ -254,8 +263,18 @@ export async function scrapSingleUrl( } //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); + const text = await parseMarkdown(cleanedHtml); + + const insertedLogId = await logInsertPromise; + ScrapeEvents.updateScrapeResult(insertedLogId, { + success: !!scraperResponse.metadata.pageError && !!text, + error: scraperResponse.metadata.pageError, + response_code: scraperResponse.metadata.pageStatusCode, + time_taken: Date.now() - timer, + }); + return { - text: await parseMarkdown(cleanedHtml), + text, html: cleanedHtml, rawHtml: scraperResponse.text, screenshot: scraperResponse.screenshot, @@ -379,6 +398,11 @@ export async function scrapSingleUrl( return document; } catch (error) { Logger.debug(`⛏️ Error: ${error.message} - Failed to fetch URL: ${urlToScrap}`); + ScrapeEvents.insert(jobId, { + type: "error", + message: typeof error === "string" ? 
error : typeof error.message === "string" ? error.message : JSON.stringify(error), + stack: error.stack, + }); return { content: "", markdown: "", From 71072fef3b85c22a3f8daf6f667e17bce6efc7d5 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 14:46:41 +0200 Subject: [PATCH 07/45] fix(scrape-events): bad logic --- apps/api/src/lib/scrape-events.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index ea30ffc2..d7958d14 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -13,7 +13,7 @@ export type ScrapeScrapeEvent = { result: null | { success: boolean, response_code?: number, - error?: string, + error?: string | object, // proxy?: string, time_taken: number, }, @@ -31,13 +31,13 @@ export class ScrapeEvents { static async insert(jobId: string, content: ScrapeEvent) { if (jobId === "TEST") return null; - if (process.env.USE_DB_AUTH) { + if (process.env.USE_DB_AUTHENTICATION) { const result = await supabase.from("scrape_events").insert({ job_id: jobId, type: content.type, content: content, // created_at - }).single(); + }).select().single(); return (result.data as any).id; } From d57dbbd0c606f55836200281e981927b1ccfc491 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 15:18:12 +0200 Subject: [PATCH 08/45] fix: add jobId for scrape --- apps/api/src/controllers/scrape.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index bfbb1e2a..c409371d 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -12,6 +12,7 @@ import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOri import { v4 as uuidv4 } from "uuid"; export async function scrapeHelper( + jobId: string, req: Request, team_id: string, crawlerOptions: any, @@ -36,7 +37,7 @@ export async function scrapeHelper( const a = new WebScraperDataProvider(); await a.setOptions({ - jobId: uuidv4(), + jobId, mode: "single_urls", urls: [url], crawlerOptions: { @@ -129,8 +130,11 @@ export async function scrapeController(req: Request, res: Response) { checkCredits(); } + const jobId = uuidv4(); + const startTime = new Date().getTime(); const result = await scrapeHelper( + jobId, req, team_id, crawlerOptions, @@ -171,6 +175,7 @@ export async function scrapeController(req: Request, res: Response) { } logJob({ + job_id: jobId, success: result.success, message: result.error, num_docs: 1, From 64bcedeefc82e753e3939c6f29d473a7238ef8da Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 16:21:59 +0200 Subject: [PATCH 09/45] fix(monitoring): bad success check on scrape --- apps/api/src/scraper/WebScraper/single_url.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 68474fed..61ee89af 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -267,7 +267,7 @@ export async function scrapSingleUrl( const insertedLogId = await logInsertPromise; ScrapeEvents.updateScrapeResult(insertedLogId, { - success: !!scraperResponse.metadata.pageError && !!text, + success: !scraperResponse.metadata.pageError && !!text, error: scraperResponse.metadata.pageError, response_code: scraperResponse.metadata.pageStatusCode, time_taken: Date.now() - timer, From 
4d35ad073c86e74840dc0074cdee1e7a5a05ea90 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 16:43:39 +0200 Subject: [PATCH 10/45] feat(monitoring/scrape): include url, worker, response_size --- apps/api/src/lib/scrape-events.ts | 3 +++ apps/api/src/scraper/WebScraper/single_url.ts | 3 +++ 2 files changed, 6 insertions(+) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index d7958d14..9b050c30 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -9,10 +9,13 @@ export type ScrapeErrorEvent = { export type ScrapeScrapeEvent = { type: "scrape", + url: string, + worker?: string, method: (typeof baseScrapers)[number], result: null | { success: boolean, response_code?: number, + response_size?: number, error?: string | object, // proxy?: string, time_taken: number, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 61ee89af..4b428d35 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -150,6 +150,8 @@ export async function scrapSingleUrl( const timer = Date.now(); const logInsertPromise = ScrapeEvents.insert(jobId, { type: "scrape", + url, + worker: process.env.FLY_MACHINE_ID, method, result: null, }); @@ -267,6 +269,7 @@ export async function scrapSingleUrl( const insertedLogId = await logInsertPromise; ScrapeEvents.updateScrapeResult(insertedLogId, { + response_size: scraperResponse.text.length, success: !scraperResponse.metadata.pageError && !!text, error: scraperResponse.metadata.pageError, response_code: scraperResponse.metadata.pageStatusCode, From 497aa5d25ef0c2d4ed5421689c57bae64b759dd5 Mon Sep 17 00:00:00 2001 From: Jakob Stadlhuber Date: Wed, 24 Jul 2024 17:55:45 +0200 Subject: [PATCH 11/45] Update Kubernetes configs for playwright-service, api, and worker Added new ConfigMap for playwright-service and adjusted existing references. Applied imagePullPolicy: Always to ensure all images are updated promptly. Updated README to include --no-cache for Docker build instructions. --- examples/kubernetes/cluster-install/README.md | 4 ++-- examples/kubernetes/cluster-install/api.yaml | 5 +++-- examples/kubernetes/cluster-install/configmap.yaml | 6 ++---- .../kubernetes/cluster-install/playwright-service.yaml | 10 +++++++++- examples/kubernetes/cluster-install/worker.yaml | 5 +++-- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/examples/kubernetes/cluster-install/README.md b/examples/kubernetes/cluster-install/README.md index f874d829..2ae39893 100644 --- a/examples/kubernetes/cluster-install/README.md +++ b/examples/kubernetes/cluster-install/README.md @@ -4,12 +4,12 @@ 2. Build Docker images, and host it in your Docker Registry (replace the target registry with your own) 1. API (which is also used as a worker image) 1. ```bash - docker build -t ghcr.io/winkk-dev/firecrawl:latest ../../apps/api + docker build --no-cache -t ghcr.io/winkk-dev/firecrawl:latest ../../apps/api docker push ghcr.io/winkk-dev/firecrawl:latest ``` 2. Playwright 1. ```bash - docker build -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../apps/playwright-service + docker build --no-cache -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../apps/playwright-service docker push ghcr.io/winkk-dev/firecrawl-playwright:latest ``` 3. 
Replace the image in [worker.yaml](worker.yaml), [api.yaml](api.yaml) and [playwright-service.yaml](playwright-service.yaml) diff --git a/examples/kubernetes/cluster-install/api.yaml b/examples/kubernetes/cluster-install/api.yaml index cdc69c3d..81e61839 100644 --- a/examples/kubernetes/cluster-install/api.yaml +++ b/examples/kubernetes/cluster-install/api.yaml @@ -17,14 +17,15 @@ spec: containers: - name: api image: ghcr.io/winkk-dev/firecrawl:latest + imagePullPolicy: Always args: [ "pnpm", "run", "start:production" ] ports: - containerPort: 3002 envFrom: - configMapRef: name: firecrawl-config - - secretRef: - name: firecrawl-secret + #- secretRef: + # name: firecrawl-secret --- apiVersion: v1 kind: Service diff --git a/examples/kubernetes/cluster-install/configmap.yaml b/examples/kubernetes/cluster-install/configmap.yaml index b415d562..32559de7 100644 --- a/examples/kubernetes/cluster-install/configmap.yaml +++ b/examples/kubernetes/cluster-install/configmap.yaml @@ -7,8 +7,6 @@ data: PORT: "3002" HOST: "0.0.0.0" REDIS_URL: "redis://redis:6379" - PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000" + PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000/html" USE_DB_AUTHENTICATION: "false" - SUPABASE_ANON_TOKEN: "" - SUPABASE_URL: "" - SUPABASE_SERVICE_TOKEN: "" + HDX_NODE_BETA_MODE: "1" diff --git a/examples/kubernetes/cluster-install/playwright-service.yaml b/examples/kubernetes/cluster-install/playwright-service.yaml index ce794253..e916d914 100644 --- a/examples/kubernetes/cluster-install/playwright-service.yaml +++ b/examples/kubernetes/cluster-install/playwright-service.yaml @@ -1,3 +1,10 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: playwright-service-config +data: + PORT: "3000" +--- apiVersion: apps/v1 kind: Deployment metadata: @@ -17,11 +24,12 @@ spec: containers: - name: playwright-service image: ghcr.io/winkk-dev/firecrawl-playwright:latest + imagePullPolicy: Always ports: - containerPort: 3000 envFrom: - configMapRef: - name: firecrawl-config + name: playwright-service-config --- apiVersion: v1 kind: Service diff --git a/examples/kubernetes/cluster-install/worker.yaml b/examples/kubernetes/cluster-install/worker.yaml index 2b3b2e79..545beaa3 100644 --- a/examples/kubernetes/cluster-install/worker.yaml +++ b/examples/kubernetes/cluster-install/worker.yaml @@ -17,8 +17,9 @@ spec: containers: - name: worker image: ghcr.io/winkk-dev/firecrawl:latest + imagePullPolicy: Always envFrom: - configMapRef: name: firecrawl-config - - secretRef: - name: firecrawl-secret + #- secretRef: + # name: firecrawl-secret From 60c74357dfbb133658c2cb73b40f0573c2ed07e8 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 24 Jul 2024 18:44:14 +0200 Subject: [PATCH 12/45] feat(ScrapeEvents): log queue events --- apps/api/src/index.ts | 10 ++++++++++ apps/api/src/lib/scrape-events.ts | 11 ++++++++++- apps/api/src/services/queue-worker.ts | 9 +++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 700cc98e..cbd37eb7 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -13,6 +13,7 @@ import { checkAlerts } from "./services/alerts"; import Redis from "ioredis"; import { redisRateLimitClient } from "./services/rate-limiter"; import { Logger } from "./lib/logger"; +import { ScrapeEvents } from "./lib/scrape-events"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -325,3 +326,12 @@ if (cluster.isMaster) { Logger.info(`Worker 
${process.pid} started`); } + +const wsq = getWebScraperQueue(); + +wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); +wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active")); +wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed")); +wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); +wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); +wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 9b050c30..7015c92d 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -1,3 +1,4 @@ +import { Job, JobId } from "bull"; import type { baseScrapers } from "../scraper/WebScraper/single_url"; import { supabase_service as supabase } from "../services/supabase"; @@ -24,7 +25,7 @@ export type ScrapeScrapeEvent = { export type ScrapeQueueEvent = { type: "queue", - event: "created" | "started" | "interrupted" | "finished", + event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed", worker?: string, } @@ -58,4 +59,12 @@ export class ScrapeEvents { } }).eq("id", logId); } + + static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) { + await this.insert(((job as any).id ? (job as any).id : job) as string, { + type: "queue", + event, + worker: process.env.FLY_MACHINE_ID, + }); + } } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 532c5bc1..e7767809 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -8,6 +8,7 @@ import { logJob } from "./logging/log_job"; import { initSDK } from '@hyperdx/node-opentelemetry'; import { Job } from "bull"; import { Logger } from "../lib/logger"; +import { ScrapeEvents } from "../lib/scrape-events"; if (process.env.ENV === 'production') { initSDK({ @@ -20,6 +21,7 @@ const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { Logger.debug(`πŸ‚ Worker taking job ${job.id}`); + try { job.progress({ current: 1, @@ -114,3 +116,10 @@ wsq.process( Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)), processJob ); + +wsq.on("waiting", j => ScrapeEvents.logJobEvent(j, "waiting")); +wsq.on("active", j => ScrapeEvents.logJobEvent(j, "active")); +wsq.on("completed", j => ScrapeEvents.logJobEvent(j, "completed")); +wsq.on("paused", j => ScrapeEvents.logJobEvent(j, "paused")); +wsq.on("resumed", j => ScrapeEvents.logJobEvent(j, "resumed")); +wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); From be9e7f9edf039b523a53a6ea1e38e5975ae1433b Mon Sep 17 00:00:00 2001 From: Jakob Stadlhuber Date: Wed, 24 Jul 2024 18:54:16 +0200 Subject: [PATCH 13/45] Update Kubernetes configs for playwright-service, api, and worker Added new ConfigMap for playwright-service and adjusted existing references. Applied imagePullPolicy: Always to ensure all images are updated promptly. Updated README to include --no-cache for Docker build instructions. 
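The liveness and readiness controllers introduced in the diff below are deliberate stubs: their TODO comments note that real checks, such as verifying the Redis connection, are still missing. As a rough sketch only, assuming the `redisRateLimitClient` ioredis instance imported elsewhere in this series is reachable from the controllers directory, a readiness check could look like the following; this is illustrative, not the project's actual implementation:

```typescript
import { Request, Response } from "express";
// Assumption: the ioredis client exported by services/rate-limiter can be reused here.
import { redisRateLimitClient } from "../services/rate-limiter";

export async function readinessController(req: Request, res: Response) {
  try {
    // ioredis answers PING with "PONG" when the connection is healthy.
    const pong = await redisRateLimitClient.ping();
    if (pong === "PONG") {
      return res.status(200).json({ status: "ok" });
    }
    return res.status(503).json({ status: "Service Unavailable" });
  } catch (err) {
    // Any Redis failure means the API is not ready to take scrape/crawl traffic.
    return res.status(503).json({ status: "Service Unavailable" });
  }
}
```

With a check like this in place, the Kubernetes readinessProbe added later in this series would only route traffic to pods that can actually reach Redis.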
--- apps/api/src/controllers/liveness.ts | 6 ++++++ apps/api/src/controllers/readiness.ts | 6 ++++++ apps/api/src/routes/v0.ts | 5 +++++ apps/playwright-service/main.py | 20 +++++++++++++++++--- 4 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 apps/api/src/controllers/liveness.ts create mode 100644 apps/api/src/controllers/readiness.ts diff --git a/apps/api/src/controllers/liveness.ts b/apps/api/src/controllers/liveness.ts new file mode 100644 index 00000000..8ff1a96f --- /dev/null +++ b/apps/api/src/controllers/liveness.ts @@ -0,0 +1,6 @@ +import { Request, Response } from "express"; + +export async function livenessController(req: Request, res: Response) { + //TODO: add checks if the application is live and healthy like checking the redis connection + res.status(200).json({ status: "ok" }); +} diff --git a/apps/api/src/controllers/readiness.ts b/apps/api/src/controllers/readiness.ts new file mode 100644 index 00000000..cdb1f02c --- /dev/null +++ b/apps/api/src/controllers/readiness.ts @@ -0,0 +1,6 @@ +import { Request, Response } from "express"; + +export async function readinessController(req: Request, res: Response) { + // TODO: add checks when the application is ready to serve traffic + res.status(200).json({ status: "ok" }); +} diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index a9a3a9bf..4284b77c 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -7,6 +7,8 @@ import { crawlJobStatusPreviewController } from "../../src/controllers/status"; import { searchController } from "../../src/controllers/search"; import { crawlCancelController } from "../../src/controllers/crawl-cancel"; import { keyAuthController } from "../../src/controllers/keyAuth"; +import {livenessController} from "../controllers/liveness"; +import {readinessController} from "../controllers/readiness"; export const v0Router = express.Router(); @@ -23,3 +25,6 @@ v0Router.get("/v0/keyAuth", keyAuthController); // Search routes v0Router.post("/v0/search", searchController); +// Health/Probe routes +v0Router.get("/v0/health/liveness", livenessController); +v0Router.get("/v0/health/readiness", readinessController); diff --git a/apps/playwright-service/main.py b/apps/playwright-service/main.py index bd6b14e3..c9099d3b 100644 --- a/apps/playwright-service/main.py +++ b/apps/playwright-service/main.py @@ -5,7 +5,7 @@ the HTML content of a specified URL. It supports optional proxy settings and med from os import environ -from fastapi import FastAPI +from fastapi import FastAPI, Response from fastapi.responses import JSONResponse from playwright.async_api import Browser, async_playwright from pydantic import BaseModel @@ -39,14 +39,28 @@ async def shutdown_event(): """Event handler for application shutdown to close the browser.""" await browser.close() +@app.get("/health/liveness") +def liveness_probe(): + """Endpoint for liveness probe.""" + return JSONResponse(content={"status": "ok"}, status_code=200) + + +@app.get("/health/readiness") +async def readiness_probe(): + """Endpoint for readiness probe. Checks if the browser instance is ready.""" + if browser: + return JSONResponse(content={"status": "ok"}, status_code=200) + return JSONResponse(content={"status": "Service Unavailable"}, status_code=503) + + @app.post("/html") async def root(body: UrlModel): """ Endpoint to fetch and return HTML content of a given URL. - + Args: body (UrlModel): The URL model containing the target URL, wait time, and timeout. - + Returns: JSONResponse: The HTML content of the page. 
""" From 895e80caa421e07f14f7c7e7496e2a86f72db198 Mon Sep 17 00:00:00 2001 From: Jakob Stadlhuber Date: Wed, 24 Jul 2024 19:00:23 +0200 Subject: [PATCH 14/45] Add liveness and readiness probes to Kubernetes configs Introduced liveness and readiness probes for the Playwright service, API, and worker components. This ensures that Kubernetes can better manage the health and availability of these services by periodically checking their endpoints. This enhancement will improve the robustness and reliability of the deployed applications. --- examples/kubernetes/cluster-install/api.yaml | 20 +++++++++++++++++++ .../cluster-install/playwright-service.yaml | 20 +++++++++++++++++++ .../kubernetes/cluster-install/worker.yaml | 20 +++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/examples/kubernetes/cluster-install/api.yaml b/examples/kubernetes/cluster-install/api.yaml index 81e61839..c709857c 100644 --- a/examples/kubernetes/cluster-install/api.yaml +++ b/examples/kubernetes/cluster-install/api.yaml @@ -26,6 +26,26 @@ spec: name: firecrawl-config #- secretRef: # name: firecrawl-secret + - livenessProbe: + httpGet: + path: /v0/health/liveness + port: 3002 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + name: api-container + - readinessProbe: + httpGet: + path: /v0/health/readiness + port: 3002 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + name: api-container --- apiVersion: v1 kind: Service diff --git a/examples/kubernetes/cluster-install/playwright-service.yaml b/examples/kubernetes/cluster-install/playwright-service.yaml index e916d914..a9c24d42 100644 --- a/examples/kubernetes/cluster-install/playwright-service.yaml +++ b/examples/kubernetes/cluster-install/playwright-service.yaml @@ -30,6 +30,26 @@ spec: envFrom: - configMapRef: name: playwright-service-config + - livenessProbe: + httpGet: + path: /health/liveness + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + name: playwright-service-container + - readinessProbe: + httpGet: + path: /health/readiness + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + name: playwright-service-container --- apiVersion: v1 kind: Service diff --git a/examples/kubernetes/cluster-install/worker.yaml b/examples/kubernetes/cluster-install/worker.yaml index 545beaa3..b904bd78 100644 --- a/examples/kubernetes/cluster-install/worker.yaml +++ b/examples/kubernetes/cluster-install/worker.yaml @@ -23,3 +23,23 @@ spec: name: firecrawl-config #- secretRef: # name: firecrawl-secret + - livenessProbe: + httpGet: + path: /v0/health/liveness + port: 3002 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + name: worker-container + - readinessProbe: + httpGet: + path: /v0/health/readiness + port: 3002 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + name: worker-container \ No newline at end of file From f26bda2477b84e728a742a21268af863fea8218a Mon Sep 17 00:00:00 2001 From: Jakob Stadlhuber Date: Wed, 24 Jul 2024 19:06:19 +0200 Subject: [PATCH 15/45] Update Docker build paths in Kubernetes setup README Corrected relative paths for Docker build commands to ensure the appropriate directories are targeted. 
This fix is crucial for successful image builds and deployment consistency in the Kubernetes cluster setup. --- examples/kubernetes/cluster-install/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/kubernetes/cluster-install/README.md b/examples/kubernetes/cluster-install/README.md index 2ae39893..736ae038 100644 --- a/examples/kubernetes/cluster-install/README.md +++ b/examples/kubernetes/cluster-install/README.md @@ -4,12 +4,12 @@ 2. Build Docker images, and host it in your Docker Registry (replace the target registry with your own) 1. API (which is also used as a worker image) 1. ```bash - docker build --no-cache -t ghcr.io/winkk-dev/firecrawl:latest ../../apps/api + docker build --no-cache -t ghcr.io/winkk-dev/firecrawl:latest ../../../apps/api docker push ghcr.io/winkk-dev/firecrawl:latest ``` 2. Playwright 1. ```bash - docker build --no-cache -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../apps/playwright-service + docker build --no-cache -t ghcr.io/winkk-dev/firecrawl-playwright:latest ../../../apps/playwright-service docker push ghcr.io/winkk-dev/firecrawl-playwright:latest ``` 3. Replace the image in [worker.yaml](worker.yaml), [api.yaml](api.yaml) and [playwright-service.yaml](playwright-service.yaml) From d68f3491099cfe3d20af700e352200cf6808a365 Mon Sep 17 00:00:00 2001 From: Jakob Stadlhuber Date: Wed, 24 Jul 2024 19:31:37 +0200 Subject: [PATCH 16/45] Update Kubernetes YAMLs and add worker service Refactored container configurations in worker, api, and playwright-service YAMLs to streamline syntax and add missing fields. Added a service definition for the worker component and included a new environment variable in the configmap for rate-limiting. These changes enhance configuration clarity and ensure proper resource definitions. 
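The ConfigMap change below adds `REDIS_RATE_LIMIT_URL` alongside the existing `REDIS_URL`, so queue traffic and rate limiting can point at separate Redis instances if needed. A minimal sketch of how such a split is typically consumed; the variable names match this ConfigMap, but the client construction is illustrative rather than Firecrawl's actual rate-limiter module:

```typescript
import Redis from "ioredis";

// Bull queue connection keeps using REDIS_URL...
const queueRedis = new Redis(process.env.REDIS_URL ?? "redis://redis:6379");

// ...while rate limiting reads the dedicated URL, falling back to the shared instance.
const rateLimitRedis = new Redis(
  process.env.REDIS_RATE_LIMIT_URL ?? process.env.REDIS_URL ?? "redis://redis:6379"
);

export { queueRedis, rateLimitRedis };
```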
--- examples/kubernetes/cluster-install/api.yaml | 60 ++++++++--------- .../kubernetes/cluster-install/configmap.yaml | 1 + .../cluster-install/playwright-service.yaml | 54 +++++++-------- .../kubernetes/cluster-install/worker.yaml | 67 +++++++++++-------- 4 files changed, 95 insertions(+), 87 deletions(-) diff --git a/examples/kubernetes/cluster-install/api.yaml b/examples/kubernetes/cluster-install/api.yaml index c709857c..54ecfbf6 100644 --- a/examples/kubernetes/cluster-install/api.yaml +++ b/examples/kubernetes/cluster-install/api.yaml @@ -15,37 +15,35 @@ spec: imagePullSecrets: - name: docker-registry-secret containers: - - name: api - image: ghcr.io/winkk-dev/firecrawl:latest - imagePullPolicy: Always - args: [ "pnpm", "run", "start:production" ] - ports: - - containerPort: 3002 - envFrom: - - configMapRef: - name: firecrawl-config - #- secretRef: - # name: firecrawl-secret - - livenessProbe: - httpGet: - path: /v0/health/liveness - port: 3002 - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - successThreshold: 1 - failureThreshold: 3 - name: api-container - - readinessProbe: - httpGet: - path: /v0/health/readiness - port: 3002 - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - successThreshold: 1 - failureThreshold: 3 - name: api-container + - name: api + image: ghcr.io/winkk-dev/firecrawl:latest + imagePullPolicy: Always + args: [ "pnpm", "run", "start:production" ] + ports: + - containerPort: 3002 + envFrom: + - configMapRef: + name: firecrawl-config + #- secretRef: + # name: firecrawl-secret + livenessProbe: + httpGet: + path: /v0/health/liveness + port: 3002 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /v0/health/readiness + port: 3002 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 --- apiVersion: v1 kind: Service diff --git a/examples/kubernetes/cluster-install/configmap.yaml b/examples/kubernetes/cluster-install/configmap.yaml index 32559de7..b56cfbcd 100644 --- a/examples/kubernetes/cluster-install/configmap.yaml +++ b/examples/kubernetes/cluster-install/configmap.yaml @@ -7,6 +7,7 @@ data: PORT: "3002" HOST: "0.0.0.0" REDIS_URL: "redis://redis:6379" + REDIS_RATE_LIMIT_URL: "redis://redis:6379" PLAYWRIGHT_MICROSERVICE_URL: "http://playwright-service:3000/html" USE_DB_AUTHENTICATION: "false" HDX_NODE_BETA_MODE: "1" diff --git a/examples/kubernetes/cluster-install/playwright-service.yaml b/examples/kubernetes/cluster-install/playwright-service.yaml index a9c24d42..43cf15f0 100644 --- a/examples/kubernetes/cluster-install/playwright-service.yaml +++ b/examples/kubernetes/cluster-install/playwright-service.yaml @@ -22,34 +22,32 @@ spec: imagePullSecrets: - name: docker-registry-secret containers: - - name: playwright-service - image: ghcr.io/winkk-dev/firecrawl-playwright:latest - imagePullPolicy: Always - ports: - - containerPort: 3000 - envFrom: - - configMapRef: - name: playwright-service-config - - livenessProbe: - httpGet: - path: /health/liveness - port: 3000 - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - successThreshold: 1 - failureThreshold: 3 - name: playwright-service-container - - readinessProbe: - httpGet: - path: /health/readiness - port: 3000 - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - successThreshold: 1 - failureThreshold: 3 - name: playwright-service-container + - name: playwright-service + image: 
ghcr.io/winkk-dev/firecrawl-playwright:latest + imagePullPolicy: Always + ports: + - containerPort: 3000 + envFrom: + - configMapRef: + name: playwright-service-config + livenessProbe: + httpGet: + path: /health/liveness + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /health/readiness + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 --- apiVersion: v1 kind: Service diff --git a/examples/kubernetes/cluster-install/worker.yaml b/examples/kubernetes/cluster-install/worker.yaml index b904bd78..6185f991 100644 --- a/examples/kubernetes/cluster-install/worker.yaml +++ b/examples/kubernetes/cluster-install/worker.yaml @@ -15,31 +15,42 @@ spec: imagePullSecrets: - name: docker-registry-secret containers: - - name: worker - image: ghcr.io/winkk-dev/firecrawl:latest - imagePullPolicy: Always - envFrom: - - configMapRef: - name: firecrawl-config - #- secretRef: - # name: firecrawl-secret - - livenessProbe: - httpGet: - path: /v0/health/liveness - port: 3002 - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - successThreshold: 1 - failureThreshold: 3 - name: worker-container - - readinessProbe: - httpGet: - path: /v0/health/readiness - port: 3002 - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - successThreshold: 1 - failureThreshold: 3 - name: worker-container \ No newline at end of file + - name: worker + image: ghcr.io/winkk-dev/firecrawl:latest + imagePullPolicy: Always + args: [ "pnpm", "run", "workers" ] + envFrom: + - configMapRef: + name: firecrawl-config + #- secretRef: + # name: firecrawl-secret + livenessProbe: + httpGet: + path: /v0/health/liveness + port: 3003 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /v0/health/readiness + port: 3003 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: worker +spec: + selector: + app: worker + ports: + - protocol: TCP + port: 3003 + targetPort: 3003 From 2dc7be38693310484b7eaf61f0b4e85b167bceef Mon Sep 17 00:00:00 2001 From: Jakob Stadlhuber Date: Wed, 24 Jul 2024 19:38:54 +0200 Subject: [PATCH 17/45] Remove liveness and readiness probes from worker.yaml This commit removes the liveness and readiness probes configuration from the Kubernetes worker manifest. Additionally, a Service definition for the worker application has been removed. These changes might be necessary to update the deployment strategy or simplify the configuration. 
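The likely reason the probes removed below never worked: nothing in this series starts an HTTP listener inside the worker process (its entrypoint only runs the Bull queue consumer), so HTTP probes against port 3003 have nothing to hit, and dropping them is the simplest fix. Purely as an illustration of the alternative, a worker could expose a tiny health endpoint of its own; `WORKER_HEALTH_PORT` is a hypothetical variable, not one defined in this repository:

```typescript
import * as http from "http";

// Hypothetical port; 3003 mirrors the value the removed probes pointed at.
const port = Number(process.env.WORKER_HEALTH_PORT ?? 3003);

http
  .createServer((req, res) => {
    if (req.url === "/health/liveness") {
      res.writeHead(200, { "Content-Type": "application/json" });
      res.end(JSON.stringify({ status: "ok" }));
    } else {
      res.writeHead(404);
      res.end();
    }
  })
  .listen(port);
```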
--- .../kubernetes/cluster-install/worker.yaml | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/examples/kubernetes/cluster-install/worker.yaml b/examples/kubernetes/cluster-install/worker.yaml index 6185f991..8e992cf1 100644 --- a/examples/kubernetes/cluster-install/worker.yaml +++ b/examples/kubernetes/cluster-install/worker.yaml @@ -24,33 +24,3 @@ spec: name: firecrawl-config #- secretRef: # name: firecrawl-secret - livenessProbe: - httpGet: - path: /v0/health/liveness - port: 3003 - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - successThreshold: 1 - failureThreshold: 3 - readinessProbe: - httpGet: - path: /v0/health/readiness - port: 3003 - initialDelaySeconds: 30 - periodSeconds: 30 - timeoutSeconds: 5 - successThreshold: 1 - failureThreshold: 3 ---- -apiVersion: v1 -kind: Service -metadata: - name: worker -spec: - selector: - app: worker - ports: - - protocol: TCP - port: 3003 - targetPort: 3003 From cc98f83fdab20efbc48b1632372a75b866857395 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:25:36 -0300 Subject: [PATCH 18/45] added failed and completed log events --- apps/api/src/lib/scrape-events.ts | 2 +- apps/api/src/main/runWebScraper.ts | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 7015c92d..dda03c38 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -25,7 +25,7 @@ export type ScrapeScrapeEvent = { export type ScrapeQueueEvent = { type: "queue", - event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed", + event: "waiting" | "active" | "completed" | "paused" | "resumed" | "removed" | "failed", worker?: string, } diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 5c97ef41..5e7d2279 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -11,6 +11,7 @@ import { billTeam } from "../services/billing/credit_billing"; import { Document } from "../lib/entities"; import { supabase_service } from "../services/supabase"; import { Logger } from "../lib/logger"; +import { ScrapeEvents } from "../lib/scrape-events"; export async function startWebScraperPipeline({ job, @@ -39,6 +40,7 @@ export async function startWebScraperPipeline({ }, onError: (error) => { Logger.error(`πŸ‚ Job failed ${job.id}`); + ScrapeEvents.logJobEvent(job, "failed"); job.moveToFailed(error); }, team_id: job.data.team_id, @@ -140,6 +142,7 @@ const saveJob = async (job: Job, result: any) => { // I think the job won't exist here anymore } } + ScrapeEvents.logJobEvent(job, "completed"); } catch (error) { Logger.error(`πŸ‚ Failed to update job status: ${error}`); } From 623b5472922f06472c23606a1699d6f22a5a62bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 24 Jul 2024 23:39:00 +0200 Subject: [PATCH 19/45] fix(fly.toml): scale up memory limit --- apps/api/fly.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/fly.toml b/apps/api/fly.toml index 286bc086..a9b39228 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -11,8 +11,8 @@ kill_timeout = '30s' [build] [processes] - app = 'node --max-old-space-size=4096 dist/src/index.js' - worker = 'node --max-old-space-size=4096 dist/src/services/queue-worker.js' + app = 'node --max-old-space-size=8192 dist/src/index.js' + worker = 'node --max-old-space-size=8192 
dist/src/services/queue-worker.js' [http_service] internal_port = 8080 From 1e13ddbe8e3ccd5875e43f9fe92f0c3d96edbb30 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 24 Jul 2024 18:13:34 -0400 Subject: [PATCH 20/45] Nick: changes to the ui component --- apps/api/.gitignore | 4 +- apps/ui/ingestion-ui/src/App.css | 42 ---- apps/ui/ingestion-ui/src/App.tsx | 1 - apps/ui/ingestion-ui/src/assets/react.svg | 1 - .../ingestion-ui/src/components/ingestion.tsx | 200 +++++++++++------- 5 files changed, 132 insertions(+), 116 deletions(-) delete mode 100644 apps/ui/ingestion-ui/src/App.css delete mode 100644 apps/ui/ingestion-ui/src/assets/react.svg diff --git a/apps/api/.gitignore b/apps/api/.gitignore index 66bccfed..edc2faf4 100644 --- a/apps/api/.gitignore +++ b/apps/api/.gitignore @@ -3,4 +3,6 @@ .env *.csv dump.rdb -/mongo-data \ No newline at end of file +/mongo-data + +/.next/ diff --git a/apps/ui/ingestion-ui/src/App.css b/apps/ui/ingestion-ui/src/App.css deleted file mode 100644 index b9d355df..00000000 --- a/apps/ui/ingestion-ui/src/App.css +++ /dev/null @@ -1,42 +0,0 @@ -#root { - max-width: 1280px; - margin: 0 auto; - padding: 2rem; - text-align: center; -} - -.logo { - height: 6em; - padding: 1.5em; - will-change: filter; - transition: filter 300ms; -} -.logo:hover { - filter: drop-shadow(0 0 2em #646cffaa); -} -.logo.react:hover { - filter: drop-shadow(0 0 2em #61dafbaa); -} - -@keyframes logo-spin { - from { - transform: rotate(0deg); - } - to { - transform: rotate(360deg); - } -} - -@media (prefers-reduced-motion: no-preference) { - a:nth-of-type(2) .logo { - animation: logo-spin infinite 20s linear; - } -} - -.card { - padding: 2em; -} - -.read-the-docs { - color: #888; -} diff --git a/apps/ui/ingestion-ui/src/App.tsx b/apps/ui/ingestion-ui/src/App.tsx index f0d98a3a..eb0e6954 100644 --- a/apps/ui/ingestion-ui/src/App.tsx +++ b/apps/ui/ingestion-ui/src/App.tsx @@ -1,4 +1,3 @@ -import "./App.css"; import FirecrawlComponent from "./components/ingestion"; function App() { diff --git a/apps/ui/ingestion-ui/src/assets/react.svg b/apps/ui/ingestion-ui/src/assets/react.svg deleted file mode 100644 index 6c87de9b..00000000 --- a/apps/ui/ingestion-ui/src/assets/react.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/apps/ui/ingestion-ui/src/components/ingestion.tsx b/apps/ui/ingestion-ui/src/components/ingestion.tsx index a17ee43a..a7cd9722 100644 --- a/apps/ui/ingestion-ui/src/components/ingestion.tsx +++ b/apps/ui/ingestion-ui/src/components/ingestion.tsx @@ -15,11 +15,12 @@ import { CollapsibleContent, CollapsibleTrigger, } from "@/components/ui/collapsible"; -import { ChevronDown } from "lucide-react"; +import { ChevronDown, ChevronLeft, ChevronRight } from "lucide-react"; -// Hardcoded values (not recommended for production) +//! Hardcoded values (not recommended for production) +//! Highly recommended to move all Firecrawl API calls to the backend (e.g. 
Next.js API route) const FIRECRAWL_API_URL = "https://api.firecrawl.dev"; // Replace with your actual API URL whether it is local or using Firecrawl Cloud -const FIRECRAWL_API_KEY = ""; // Replace with your actual API key +const FIRECRAWL_API_KEY = "fc-YOUR_API_KEY"; // Replace with your actual API key interface FormData { url: string; @@ -100,7 +101,8 @@ export default function FirecrawlComponent() { const [elapsedTime, setElapsedTime] = useState(0); const [showCrawlStatus, setShowCrawlStatus] = useState(false); const [isScraping, setIsScraping] = useState(false); - const [showAllUrls, setShowAllUrls] = useState(false); + const [currentPage, setCurrentPage] = useState(1); + const urlsPerPage = 10; useEffect(() => { let timer: NodeJS.Timeout; @@ -289,6 +291,7 @@ export default function FirecrawlComponent() { const data: ScrapeResult = await response.json(); newScrapeResults[url] = data; setCrawlStatus((prev) => ({ ...prev, current: index + 1 })); + setScrapeResults({ ...scrapeResults, ...newScrapeResults }); } catch (error) { console.error(`Error scraping ${url}:`, error); newScrapeResults[url] = { @@ -312,22 +315,35 @@ export default function FirecrawlComponent() { } } - setScrapeResults(newScrapeResults); setLoading(false); setIsScraping(false); }; + const handlePageChange = (newPage: number) => { + setCurrentPage(newPage); + }; + + const paginatedUrls = crawledUrls.slice( + (currentPage - 1) * urlsPerPage, + currentPage * urlsPerPage + ); + return (
- - - Extract web content with Firecrawl πŸ”₯ + + + Extract web content + + Powered by Firecrawl πŸ”₯ +
- Use this component to quickly build your own UI for Firecrawl. Plug - in your API key and the component will handle the rest. Learn more - on the{" "} + Use this component to quickly give your users the ability to connect + their AI apps to web data with Firecrawl. Learn more on the{" "}
- +
{selectedUrls.length === crawledUrls.length - ? "Unselect All" - : "Select All"} + ? `Unselect All (${selectedUrls.length})` + : `Select All (${selectedUrls.length})`} )} @@ -503,41 +548,57 @@ export default function FirecrawlComponent() { !isScraping && ( <>
    - {(showAllUrls ? crawledUrls : crawledUrls.slice(0, 10)).map( - (url, index) => ( -
  • - - setSelectedUrls((prev) => - prev.includes(url) - ? prev.filter((u) => u !== url) - : [...prev, url] - ) - } - /> - {url} -
  • - ) - )} -
- {crawledUrls.length > 10 && ( -
- -
- )} + + setSelectedUrls((prev) => + prev.includes(url) + ? prev.filter((u) => u !== url) + : [...prev, url] + ) + } + /> + + {url.length > 70 ? `${url.slice(0, 70)}...` : url} + + + ))} + +
+ + + Page {currentPage} of{" "} + {Math.ceil(crawledUrls.length / urlsPerPage)} + + +
)} - + {crawledUrls.length > 0 && !scrapingSelectedLoading && ( - ))}
From 6ad7e244037bed9280ba284fb39858aecc109bb7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 24 Jul 2024 18:15:51 -0400 Subject: [PATCH 21/45] Update ingestion.tsx --- apps/ui/ingestion-ui/src/components/ingestion.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/ui/ingestion-ui/src/components/ingestion.tsx b/apps/ui/ingestion-ui/src/components/ingestion.tsx index a7cd9722..6ca839e5 100644 --- a/apps/ui/ingestion-ui/src/components/ingestion.tsx +++ b/apps/ui/ingestion-ui/src/components/ingestion.tsx @@ -627,11 +627,11 @@ export default function FirecrawlComponent() {
-
+
{result.success ? ( <>
-                          {result.data.markdown.trim().slice(0, 200)}...
+                          {result.data.markdown.trim()}
                         
) : ( From 309728a4820add6542ea6cc6d2c8633114299c97 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:48:06 -0300 Subject: [PATCH 22/45] updated logs --- apps/api/src/controllers/crawl-status.ts | 3 ++- apps/api/src/controllers/crawl.ts | 7 ++++--- apps/api/src/controllers/crawlPreview.ts | 3 ++- apps/api/src/controllers/scrape.ts | 5 +++-- apps/api/src/controllers/search.ts | 5 +++-- apps/api/src/controllers/status.ts | 3 ++- apps/api/src/index.ts | 6 +++--- apps/api/src/lib/LLM-extraction/index.ts | 3 ++- apps/api/src/lib/withAuth.ts | 5 +++-- .../src/scraper/WebScraper/utils/blocklist.ts | 4 +++- .../WebScraper/utils/imageDescription.ts | 3 ++- .../src/scraper/WebScraper/utils/metadata.ts | 4 +++- .../scraper/WebScraper/utils/replacePaths.ts | 5 +++-- apps/api/src/scraper/WebScraper/utils/utils.ts | 5 +++-- apps/api/src/search/googlesearch.ts | 5 +++-- apps/api/src/search/index.ts | 3 ++- apps/api/src/services/alerts/index.ts | 13 +++++++------ apps/api/src/services/alerts/slack.ts | 5 +++-- apps/api/src/services/billing/credit_billing.ts | 4 ++-- apps/api/src/services/idempotency/create.ts | 3 ++- apps/api/src/services/idempotency/validate.ts | 5 +++-- apps/api/src/services/logging/crawl_log.ts | 3 ++- apps/api/src/services/logging/scrape_log.ts | 4 ++++ .../services/notification/email_notification.ts | 11 ++++++----- apps/api/src/services/posthog.ts | 3 ++- apps/api/src/services/supabase.ts | 16 +++++++++++----- apps/api/src/services/webhook.ts | 11 +++-------- apps/test-suite/utils/supabase.ts | 17 ++++++++++++----- 28 files changed, 100 insertions(+), 64 deletions(-) diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts index a55003cc..5aafa433 100644 --- a/apps/api/src/controllers/crawl-status.ts +++ b/apps/api/src/controllers/crawl-status.ts @@ -4,6 +4,7 @@ import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; +import { Logger } from "../../src/lib/logger"; export async function crawlStatusController(req: Request, res: Response) { try { @@ -44,7 +45,7 @@ export async function crawlStatusController(req: Request, res: Response) { partial_data: jobStatus == 'completed' ? 
[] : partialDocs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 89358fcc..614a5928 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -10,6 +10,7 @@ import { logCrawl } from "../../src/services/logging/crawl_log"; import { validateIdempotencyKey } from "../../src/services/idempotency/validate"; import { createIdempotencyKey } from "../../src/services/idempotency/create"; import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values"; +import { Logger } from "../../src/lib/logger"; export async function crawlController(req: Request, res: Response) { try { @@ -30,7 +31,7 @@ export async function crawlController(req: Request, res: Response) { try { createIdempotencyKey(req); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } @@ -83,7 +84,7 @@ export async function crawlController(req: Request, res: Response) { documents: docs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } @@ -101,7 +102,7 @@ export async function crawlController(req: Request, res: Response) { res.json({ jobId: job.id }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/crawlPreview.ts b/apps/api/src/controllers/crawlPreview.ts index 2c3dc4ea..7c5c804d 100644 --- a/apps/api/src/controllers/crawlPreview.ts +++ b/apps/api/src/controllers/crawlPreview.ts @@ -3,6 +3,7 @@ import { authenticateUser } from "./auth"; import { RateLimiterMode } from "../../src/types"; import { addWebScraperJob } from "../../src/services/queue-jobs"; import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist"; +import { Logger } from "../../src/lib/logger"; export async function crawlPreviewController(req: Request, res: Response) { try { @@ -39,7 +40,7 @@ export async function crawlPreviewController(req: Request, res: Response) { res.json({ jobId: job.id }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index fb57e41d..615f0c0e 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -9,6 +9,7 @@ import { Document } from "../lib/entities"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function import { numTokensFromString } from '../lib/LLM-extraction/helpers'; import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values'; +import { Logger } from '../lib/logger'; export async function scrapeHelper( req: Request, @@ -112,7 +113,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(402).json({ error: "Insufficient credits" }); } } catch (error) { - console.error(error); + Logger.error(error); earlyReturn = true; return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." 
}); } @@ -188,7 +189,7 @@ export async function scrapeController(req: Request, res: Response) { return res.status(result.returnCode).json(result); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts index 8cb6d55b..adacb766 100644 --- a/apps/api/src/controllers/search.ts +++ b/apps/api/src/controllers/search.ts @@ -7,6 +7,7 @@ import { logJob } from "../services/logging/log_job"; import { PageOptions, SearchOptions } from "../lib/entities"; import { search } from "../search"; import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; +import { Logger } from "../lib/logger"; export async function searchHelper( req: Request, @@ -155,7 +156,7 @@ export async function searchController(req: Request, res: Response) { return res.status(402).json({ error: "Insufficient credits" }); } } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: "Internal server error" }); } const startTime = new Date().getTime(); @@ -183,7 +184,7 @@ export async function searchController(req: Request, res: Response) { }); return res.status(result.returnCode).json(result); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts index 231885f4..3d7fccbb 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/status.ts @@ -1,6 +1,7 @@ import { Request, Response } from "express"; import { getWebScraperQueue } from "../../src/services/queue-service"; import { supabaseGetJobById } from "../../src/lib/supabase-jobs"; +import { Logger } from "../../src/lib/logger"; export async function crawlJobStatusPreviewController(req: Request, res: Response) { try { @@ -35,7 +36,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons partial_data: jobStatus == 'completed' ? 
[] : partialDocs, }); } catch (error) { - console.error(error); + Logger.error(error); return res.status(500).json({ error: error.message }); } } diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 700cc98e..98243756 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -83,9 +83,9 @@ if (cluster.isMaster) { function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { Logger.info(`Worker ${process.pid} listening on port ${port}`); - // Logger.info( - // `For the UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` - // ); + Logger.info( + `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues` + ); }); return server; } diff --git a/apps/api/src/lib/LLM-extraction/index.ts b/apps/api/src/lib/LLM-extraction/index.ts index 2156fb3c..85a7e995 100644 --- a/apps/api/src/lib/LLM-extraction/index.ts +++ b/apps/api/src/lib/LLM-extraction/index.ts @@ -4,6 +4,7 @@ const ajv = new Ajv(); // Initialize AJV for JSON schema validation import { generateOpenAICompletions } from "./models"; import { Document, ExtractorOptions } from "../entities"; +import { Logger } from "../logger"; // Generate completion using OpenAI export async function generateCompletions( @@ -44,7 +45,7 @@ export async function generateCompletions( return completionResult; } catch (error) { - console.error(`Error generating completions: ${error}`); + Logger.error(`Error generating completions: ${error}`); throw new Error(`Error generating completions: ${error.message}`); } default: diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index ea5aa4d8..353c144b 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -1,4 +1,5 @@ import { AuthResponse } from "../../src/types"; +import { Logger } from "./logger"; let warningCount = 0; @@ -8,7 +9,7 @@ export function withAuth( return async function (...args: U): Promise { if (process.env.USE_DB_AUTHENTICATION === "false") { if (warningCount < 5) { - console.warn("WARNING - You're bypassing authentication"); + Logger.warn("You're bypassing authentication"); warningCount++; } return { success: true } as T; @@ -16,7 +17,7 @@ export function withAuth( try { return await originalFunction(...args); } catch (error) { - console.error("Error in withAuth function: ", error); + Logger.error(`Error in withAuth function: ${error}`); return { success: false, error: error.message } as T; } } diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 633fd5d0..0bdf9876 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -1,3 +1,5 @@ +import { Logger } from "../../../lib/logger"; + const socialMediaBlocklist = [ 'facebook.com', 'x.com', @@ -59,7 +61,7 @@ export function isUrlBlocked(url: string): boolean { return isBlocked; } catch (e) { // If an error occurs (e.g., invalid URL), return false - console.error(`Error parsing the following URL: ${url}`); + Logger.error(`Error parsing the following URL: ${url}`); return false; } } diff --git a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts index fb8f65d9..318c2340 100644 --- a/apps/api/src/scraper/WebScraper/utils/imageDescription.ts +++ b/apps/api/src/scraper/WebScraper/utils/imageDescription.ts @@ -1,5 +1,6 @@ import Anthropic from '@anthropic-ai/sdk'; import axios from 'axios'; +import { Logger 
} from '../../../lib/logger'; export async function getImageDescription( imageUrl: string, @@ -82,7 +83,7 @@ export async function getImageDescription( } } } catch (error) { - console.error("Error generating image alt text:", error?.message); + Logger.error(`Error generating image alt text: ${error}`); return ""; } } diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index 3f2052c0..9496d569 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -1,4 +1,6 @@ import { CheerioAPI } from "cheerio"; +import { Logger } from "../../../lib/logger"; + interface Metadata { title?: string; description?: string; @@ -105,7 +107,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; } catch (error) { - console.error("Error extracting metadata:", error); + Logger.error(`Error extracting metadata: ${error}`); } return { diff --git a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts index 25b43f0a..f84db63f 100644 --- a/apps/api/src/scraper/WebScraper/utils/replacePaths.ts +++ b/apps/api/src/scraper/WebScraper/utils/replacePaths.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../../lib/logger"; import { Document } from "../../../lib/entities"; export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => { @@ -39,7 +40,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] return documents; } catch (error) { - console.error("Error replacing paths with absolute paths", error); + Logger.debug(`Error replacing paths with absolute paths: ${error}`); return documents; } }; @@ -78,7 +79,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen return documents; } catch (error) { - console.error("Error replacing img paths with absolute paths", error); + Logger.error(`Error replacing img paths with absolute paths: ${error}`); return documents; } }; \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts index 3aa021a6..dd5906b0 100644 --- a/apps/api/src/scraper/WebScraper/utils/utils.ts +++ b/apps/api/src/scraper/WebScraper/utils/utils.ts @@ -1,5 +1,6 @@ import axios from "axios"; import * as cheerio from "cheerio"; +import { Logger } from "../../../lib/logger"; export async function attemptScrapWithRequests( @@ -9,13 +10,13 @@ export async function attemptScrapWithRequests( const response = await axios.get(urlToScrap, { timeout: 15000 }); if (!response.data) { - console.log("Failed normal requests as well"); + Logger.debug("Failed normal requests as well"); return null; } return response.data; } catch (error) { - console.error(`Error in attemptScrapWithRequests: ${error}`); + Logger.debug(`Error in attemptScrapWithRequests: ${error}`); return null; } } diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts index 6bfa1a39..060f4bd8 100644 --- a/apps/api/src/search/googlesearch.ts +++ b/apps/api/src/search/googlesearch.ts @@ -2,6 +2,7 @@ import axios from 'axios'; import * as cheerio from 'cheerio'; import * as querystring from 'querystring'; import { SearchResult } from '../../src/lib/entities'; +import { Logger } from '../../src/lib/logger'; const _useragent_list = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 
Firefox/66.0', @@ -96,7 +97,7 @@ export async function google_search(term: string, advanced = false, num_results await new Promise(resolve => setTimeout(resolve, sleep_interval * 1000)); } catch (error) { if (error.message === 'Too many requests') { - console.warn('Too many requests, breaking the loop'); + Logger.warn('Too many requests, breaking the loop'); break; } throw error; @@ -107,7 +108,7 @@ export async function google_search(term: string, advanced = false, num_results } } if (attempts >= maxAttempts) { - console.warn('Max attempts reached, breaking the loop'); + Logger.warn('Max attempts reached, breaking the loop'); } return results } diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index 88cbf812..f5bc06e3 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../src/lib/logger"; import { SearchResult } from "../../src/lib/entities"; import { google_search } from "./googlesearch"; import { serper_search } from "./serper"; @@ -47,7 +48,7 @@ export async function search({ timeout ); } catch (error) { - console.error("Error in search function: ", error); + Logger.error(`Error in search function: ${error}`); return [] } // if process.env.SERPER_API_KEY is set, use serper diff --git a/apps/api/src/services/alerts/index.ts b/apps/api/src/services/alerts/index.ts index 1cfb5906..88b3c726 100644 --- a/apps/api/src/services/alerts/index.ts +++ b/apps/api/src/services/alerts/index.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../../src/lib/logger"; import { getWebScraperQueue } from "../queue-service"; import { sendSlackWebhook } from "./slack"; @@ -9,13 +10,13 @@ export async function checkAlerts() { process.env.ALERT_NUM_ACTIVE_JOBS && process.env.ALERT_NUM_WAITING_JOBS ) { - console.info("Initializing alerts"); + Logger.info("Initializing alerts"); const checkActiveJobs = async () => { try { const webScraperQueue = getWebScraperQueue(); const activeJobs = await webScraperQueue.getActiveCount(); if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) { - console.warn( + Logger.warn( `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.` ); sendSlackWebhook( @@ -23,12 +24,12 @@ export async function checkAlerts() { true ); } else { - console.info( + Logger.info( `Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}` ); } } catch (error) { - console.error("Failed to check active jobs:", error); + Logger.error(`Failed to check active jobs: ${error}`); } }; @@ -38,7 +39,7 @@ export async function checkAlerts() { const paused = await webScraperQueue.getPausedCount(); if (waitingJobs !== paused && waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) { - console.warn( + Logger.warn( `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. 
Current waiting jobs: ${waitingJobs}.` ); sendSlackWebhook( @@ -57,6 +58,6 @@ export async function checkAlerts() { // setInterval(checkAll, 10000); // Run every } } catch (error) { - console.error("Failed to initialize alerts:", error); + Logger.error(`Failed to initialize alerts: ${error}`); } } diff --git a/apps/api/src/services/alerts/slack.ts b/apps/api/src/services/alerts/slack.ts index f65035b1..89629df7 100644 --- a/apps/api/src/services/alerts/slack.ts +++ b/apps/api/src/services/alerts/slack.ts @@ -1,4 +1,5 @@ import axios from "axios"; +import { Logger } from "../../../src/lib/logger"; export async function sendSlackWebhook( message: string, @@ -16,8 +17,8 @@ export async function sendSlackWebhook( "Content-Type": "application/json", }, }); - console.log("Webhook sent successfully:", response.data); + Logger.log("Webhook sent successfully:", response.data); } catch (error) { - console.error("Error sending webhook:", error); + Logger.error(`Error sending webhook: ${error}`); } } diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 6d0f9a69..9369cdbb 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -263,14 +263,14 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { }); if (creditUsageError) { - console.error("Error calculating credit usage:", creditUsageError); + Logger.error(`Error calculating credit usage: ${creditUsageError}`); } if (creditUsages && creditUsages.length > 0) { totalCreditsUsed = creditUsages[0].total_credits_used; } } catch (error) { - console.error("Error calculating credit usage:", error); + Logger.error(`Error calculating credit usage: ${error}`); } // Adjust total credits used by subtracting coupon value diff --git a/apps/api/src/services/idempotency/create.ts b/apps/api/src/services/idempotency/create.ts index ec3e18e7..291e77d9 100644 --- a/apps/api/src/services/idempotency/create.ts +++ b/apps/api/src/services/idempotency/create.ts @@ -1,5 +1,6 @@ import { Request } from "express"; import { supabase_service } from "../supabase"; +import { Logger } from "../../../src/lib/logger"; export async function createIdempotencyKey( req: Request, @@ -14,7 +15,7 @@ export async function createIdempotencyKey( .insert({ key: idempotencyKey }); if (error) { - console.error("Failed to create idempotency key:", error); + Logger.error(`Failed to create idempotency key: ${error}`); throw error; } diff --git a/apps/api/src/services/idempotency/validate.ts b/apps/api/src/services/idempotency/validate.ts index 1ca348bb..4d58a31d 100644 --- a/apps/api/src/services/idempotency/validate.ts +++ b/apps/api/src/services/idempotency/validate.ts @@ -1,6 +1,7 @@ import { Request } from "express"; import { supabase_service } from "../supabase"; import { validate as isUuid } from 'uuid'; +import { Logger } from "../../../src/lib/logger"; export async function validateIdempotencyKey( req: Request, @@ -13,7 +14,7 @@ export async function validateIdempotencyKey( // Ensure idempotencyKey is treated as a string const key = Array.isArray(idempotencyKey) ? 
idempotencyKey[0] : idempotencyKey; if (!isUuid(key)) { - console.error("Invalid idempotency key provided in the request headers."); + Logger.debug("Invalid idempotency key provided in the request headers."); return false; } @@ -23,7 +24,7 @@ export async function validateIdempotencyKey( .eq("key", idempotencyKey); if (error) { - console.error(error); + Logger.error(`Error validating idempotency key: ${error}`); } if (!data || data.length === 0) { diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts index 1224bd44..68008e02 100644 --- a/apps/api/src/services/logging/crawl_log.ts +++ b/apps/api/src/services/logging/crawl_log.ts @@ -1,4 +1,5 @@ import { supabase_service } from "../supabase"; +import { Logger } from "../../../src/lib/logger"; import "dotenv/config"; export async function logCrawl(job_id: string, team_id: string) { @@ -13,7 +14,7 @@ export async function logCrawl(job_id: string, team_id: string) { }, ]); } catch (error) { - console.error("Error logging crawl job:\n", error); + Logger.error(`Error logging crawl job to supabase:\n${error}`); } } } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index b99d99b4..208159da 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -8,6 +8,10 @@ export async function logScrape( scrapeLog: ScrapeLog, pageOptions?: PageOptions ) { + if (process.env.USE_DB_AUTHENTICATION === "false") { + Logger.debug("Skipping logging scrape to Supabase"); + return; + } try { // Only log jobs in production // if (process.env.ENV !== "production") { diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index e5102acd..d7cd3de0 100644 --- a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -2,6 +2,7 @@ import { supabase_service } from "../supabase"; import { withAuth } from "../../lib/withAuth"; import { Resend } from "resend"; import { NotificationType } from "../../types"; +import { Logger } from "../../../src/lib/logger"; const emailTemplates: Record< NotificationType, @@ -52,11 +53,11 @@ async function sendEmailNotification( }); if (error) { - console.error("Error sending email: ", error); + Logger.debug(`Error sending email: ${error}`); return { success: false }; } } catch (error) { - console.error("Error sending email (2): ", error); + Logger.debug(`Error sending email (2): ${error}`); return { success: false }; } } @@ -79,7 +80,7 @@ export async function sendNotificationInternal( .lte("sent_date", endDateString); if (error) { - console.error("Error fetching notifications: ", error); + Logger.debug(`Error fetching notifications: ${error}`); return { success: false }; } @@ -93,7 +94,7 @@ export async function sendNotificationInternal( .eq("team_id", team_id); if (emailsError) { - console.error("Error fetching emails: ", emailsError); + Logger.debug(`Error fetching emails: ${emailsError}`); return { success: false }; } @@ -112,7 +113,7 @@ export async function sendNotificationInternal( ]); if (insertError) { - console.error("Error inserting notification record: ", insertError); + Logger.debug(`Error inserting notification record: ${insertError}`); return { success: false }; } diff --git a/apps/api/src/services/posthog.ts b/apps/api/src/services/posthog.ts index 5ec16e2e..a7419883 100644 --- a/apps/api/src/services/posthog.ts +++ 
b/apps/api/src/services/posthog.ts @@ -1,5 +1,6 @@ import { PostHog } from 'posthog-node'; import "dotenv/config"; +import { Logger } from '../../src/lib/logger'; export default function PostHogClient() { const posthogClient = new PostHog(process.env.POSTHOG_API_KEY, { @@ -19,7 +20,7 @@ class MockPostHog { export const posthog = process.env.POSTHOG_API_KEY ? PostHogClient() : (() => { - console.warn( + Logger.warn( "POSTHOG_API_KEY is not provided - your events will not be logged. Using MockPostHog as a fallback. See posthog.ts for more." ); return new MockPostHog(); diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index fa6404d7..d34f7b52 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -1,4 +1,5 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; +import { Logger } from "../lib/logger"; // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { @@ -10,13 +11,13 @@ class SupabaseService { // Only initialize the Supabase client if both URL and Service Token are provided. if (process.env.USE_DB_AUTHENTICATION === "false") { // Warn the user that Authentication is disabled by setting the client to null - console.warn( - "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" + Logger.warn( + "Authentication is disabled. Supabase client will not be initialized." ); this.client = null; } else if (!supabaseUrl || !supabaseServiceToken) { - console.error( - "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" + Logger.error( + "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable" ); } else { this.client = createClient(supabaseUrl, supabaseServiceToken); @@ -35,10 +36,15 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { + if (process.env.USE_DB_AUTHENTICATION === "false") { + Logger.debug( + "Attempted to access Supabase client when it's not configured." + ); + } const client = target.getClient(); // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. if (client === null) { - console.error( + Logger.error( "Attempted to access Supabase client when it's not configured." 
); return () => { diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 18378546..2fa6cdcd 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -1,3 +1,4 @@ +import { Logger } from "../../src/lib/logger"; import { supabase_service } from "./supabase"; export const callWebhook = async (teamId: string, jobId: string,data: any) => { @@ -15,10 +16,7 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { .eq("team_id", teamId) .limit(1); if (error) { - console.error( - `Error fetching webhook URL for team ID: ${teamId}`, - error.message - ); + Logger.error(`Error fetching webhook URL for team ID: ${teamId}, error: ${error.message}`); return null; } @@ -53,9 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { }), }); } catch (error) { - console.error( - `Error sending webhook for team ID: ${teamId}`, - error.message - ); + Logger.error(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`); } }; diff --git a/apps/test-suite/utils/supabase.ts b/apps/test-suite/utils/supabase.ts index aa19a8c9..dcefa38c 100644 --- a/apps/test-suite/utils/supabase.ts +++ b/apps/test-suite/utils/supabase.ts @@ -1,5 +1,7 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; import "dotenv/config"; +import { Logger } from "../../api/src/lib/logger"; + // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { private client: SupabaseClient | null = null; @@ -10,13 +12,13 @@ class SupabaseService { // Only initialize the Supabase client if both URL and Service Token are provided. if (process.env.USE_DB_AUTHENTICATION === "false") { // Warn the user that Authentication is disabled by setting the client to null - console.warn( - "\x1b[33mAuthentication is disabled. Supabase client will not be initialized.\x1b[0m" + Logger.warn( + "Authentication is disabled. Supabase client will not be initialized." ); this.client = null; } else if (!supabaseUrl || !supabaseServiceToken) { - console.error( - "\x1b[31mSupabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable\x1b[0m" + Logger.error( + "Supabase environment variables aren't configured correctly. Supabase client will not be initialized. Fix ENV configuration or disable DB authentication with USE_DB_AUTHENTICATION env variable" ); } else { this.client = createClient(supabaseUrl, supabaseServiceToken); @@ -35,10 +37,15 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { + if (process.env.USE_DB_AUTHENTICATION === "false") { + Logger.debug( + "Attempted to access Supabase client when it's not configured." + ); + } const client = target.getClient(); // If the Supabase client is not initialized, intercept property access to provide meaningful error feedback. if (client === null) { - console.error( + Logger.error( "Attempted to access Supabase client when it's not configured." 
); return () => { From 1f1c068eeabc4056bedd1ad9612694f00732e7e3 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 25 Jul 2024 10:00:50 -0300 Subject: [PATCH 23/45] changing from error to debug --- apps/api/src/services/alerts/slack.ts | 2 +- apps/api/src/services/webhook.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/alerts/slack.ts b/apps/api/src/services/alerts/slack.ts index 89629df7..96bf1c09 100644 --- a/apps/api/src/services/alerts/slack.ts +++ b/apps/api/src/services/alerts/slack.ts @@ -19,6 +19,6 @@ export async function sendSlackWebhook( }); Logger.log("Webhook sent successfully:", response.data); } catch (error) { - Logger.error(`Error sending webhook: ${error}`); + Logger.debug(`Error sending webhook: ${error}`); } } diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 2fa6cdcd..b0222ea3 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -51,6 +51,6 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => { }), }); } catch (error) { - Logger.error(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`); + Logger.debug(`Error sending webhook for team ID: ${teamId}, error: ${error.message}`); } }; From 2014d9dd2e18a1f57a23bb6f81eba395b7e5d360 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 14:54:20 -0400 Subject: [PATCH 24/45] Nick: admin router --- apps/api/src/controllers/admin/queue.ts | 87 ++++++++++ .../api/src/controllers/admin/redis-health.ts | 86 ++++++++++ apps/api/src/index.ts | 161 +----------------- apps/api/src/routes/admin.ts | 29 ++++ 4 files changed, 204 insertions(+), 159 deletions(-) create mode 100644 apps/api/src/controllers/admin/queue.ts create mode 100644 apps/api/src/controllers/admin/redis-health.ts create mode 100644 apps/api/src/routes/admin.ts diff --git a/apps/api/src/controllers/admin/queue.ts b/apps/api/src/controllers/admin/queue.ts new file mode 100644 index 00000000..cb5f99ed --- /dev/null +++ b/apps/api/src/controllers/admin/queue.ts @@ -0,0 +1,87 @@ +import { Request, Response } from "express"; + +import { Job } from "bull"; +import { Logger } from "../../lib/logger"; +import { getWebScraperQueue } from "../../services/queue-service"; +import { checkAlerts } from "../../services/alerts"; + +export async function cleanBefore24hCompleteJobsController( + req: Request, + res: Response +) { + Logger.info("πŸ‚ Cleaning jobs older than 24h"); + try { + const webScraperQueue = getWebScraperQueue(); + const batchSize = 10; + const numberOfBatches = 9; // Adjust based on your needs + const completedJobsPromises: Promise[] = []; + for (let i = 0; i < numberOfBatches; i++) { + completedJobsPromises.push( + webScraperQueue.getJobs( + ["completed"], + i * batchSize, + i * batchSize + batchSize, + true + ) + ); + } + const completedJobs: Job[] = ( + await Promise.all(completedJobsPromises) + ).flat(); + const before24hJobs = + completedJobs.filter( + (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 + ) || []; + + let count = 0; + + if (!before24hJobs) { + return res.status(200).send(`No jobs to remove.`); + } + + for (const job of before24hJobs) { + try { + await job.remove(); + count++; + } catch (jobError) { + Logger.error(`πŸ‚ Failed to remove job with ID ${job.id}: ${jobError}`); + } + } + return res.status(200).send(`Removed ${count} completed jobs.`); + } catch (error) { + Logger.error(`πŸ‚ Failed to clean last 
24h complete jobs: ${error}`); + return res.status(500).send("Failed to clean jobs"); + } +} + + +export async function checkQueuesController(req: Request, res: Response) { + try { + await checkAlerts(); + return res.status(200).send("Alerts initialized"); + } catch (error) { + Logger.debug(`Failed to initialize alerts: ${error}`); + return res.status(500).send("Failed to initialize alerts"); + } + } + + // Use this as a "health check" that way we dont destroy the server +export async function queuesController(req: Request, res: Response) { + try { + const webScraperQueue = getWebScraperQueue(); + + const [webScraperActive] = await Promise.all([ + webScraperQueue.getActiveCount(), + ]); + + const noActiveJobs = webScraperActive === 0; + // 200 if no active jobs, 503 if there are active jobs + return res.status(noActiveJobs ? 200 : 500).json({ + webScraperActive, + noActiveJobs, + }); + } catch (error) { + Logger.error(error); + return res.status(500).json({ error: error.message }); + } + } \ No newline at end of file diff --git a/apps/api/src/controllers/admin/redis-health.ts b/apps/api/src/controllers/admin/redis-health.ts new file mode 100644 index 00000000..e35d6db9 --- /dev/null +++ b/apps/api/src/controllers/admin/redis-health.ts @@ -0,0 +1,86 @@ +import { Request, Response } from "express"; +import Redis from "ioredis"; +import { Logger } from "../../lib/logger"; +import { sendSlackWebhook } from "../../services/alerts/slack"; +import { redisRateLimitClient } from "../../services/rate-limiter"; + +export async function redisHealthController(req: Request, res: Response) { + const retryOperation = async (operation, retries = 3) => { + for (let attempt = 1; attempt <= retries; attempt++) { + try { + return await operation(); + } catch (error) { + if (attempt === retries) throw error; + Logger.warn(`Attempt ${attempt} failed: ${error.message}. Retrying...`); + await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying + } + } + }; + + try { + const queueRedis = new Redis(process.env.REDIS_URL); + + const testKey = "test"; + const testValue = "test"; + + // Test queueRedis + let queueRedisHealth; + try { + await retryOperation(() => queueRedis.set(testKey, testValue)); + queueRedisHealth = await retryOperation(() => queueRedis.get(testKey)); + await retryOperation(() => queueRedis.del(testKey)); + } catch (error) { + Logger.error(`queueRedis health check failed: ${error}`); + queueRedisHealth = null; + } + + // Test redisRateLimitClient + let redisRateLimitHealth; + try { + await retryOperation(() => redisRateLimitClient.set(testKey, testValue)); + redisRateLimitHealth = await retryOperation(() => + redisRateLimitClient.get(testKey) + ); + await retryOperation(() => redisRateLimitClient.del(testKey)); + } catch (error) { + Logger.error(`redisRateLimitClient health check failed: ${error}`); + redisRateLimitHealth = null; + } + + const healthStatus = { + queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy", + redisRateLimitClient: + redisRateLimitHealth === testValue ? 
"healthy" : "unhealthy", + }; + + if ( + healthStatus.queueRedis === "healthy" && + healthStatus.redisRateLimitClient === "healthy" + ) { + Logger.info("Both Redis instances are healthy"); + return res.status(200).json({ status: "healthy", details: healthStatus }); + } else { + Logger.info( + `Redis instances health check: ${JSON.stringify(healthStatus)}` + ); + await sendSlackWebhook( + `[REDIS DOWN] Redis instances health check: ${JSON.stringify( + healthStatus + )}`, + true + ); + return res + .status(500) + .json({ status: "unhealthy", details: healthStatus }); + } + } catch (error) { + Logger.error(`Redis health check failed: ${error}`); + await sendSlackWebhook( + `[REDIS DOWN] Redis instances health check: ${error.message}`, + true + ); + return res + .status(500) + .json({ status: "unhealthy", message: error.message }); + } +} diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 98243756..69433d6b 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -7,12 +7,8 @@ import { v0Router } from "./routes/v0"; import { initSDK } from "@hyperdx/node-opentelemetry"; import cluster from "cluster"; import os from "os"; -import { Job } from "bull"; -import { sendSlackWebhook } from "./services/alerts/slack"; -import { checkAlerts } from "./services/alerts"; -import Redis from "ioredis"; -import { redisRateLimitClient } from "./services/rate-limiter"; import { Logger } from "./lib/logger"; +import { adminRouter } from "./routes/admin"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -46,7 +42,6 @@ if (cluster.isMaster) { app.use(cors()); // Add this line to enable CORS - const serverAdapter = new ExpressAdapter(); serverAdapter.setBasePath(`/admin/${process.env.BULL_AUTH_KEY}/queues`); @@ -71,6 +66,7 @@ if (cluster.isMaster) { // register router app.use(v0Router); + app.use(adminRouter); const DEFAULT_PORT = process.env.PORT ?? 3002; const HOST = process.env.HOST ?? "localhost"; @@ -94,27 +90,6 @@ if (cluster.isMaster) { startServer(); } - // Use this as a "health check" that way we dont destroy the server - app.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, async (req, res) => { - try { - const webScraperQueue = getWebScraperQueue(); - - const [webScraperActive] = await Promise.all([ - webScraperQueue.getActiveCount(), - ]); - - const noActiveJobs = webScraperActive === 0; - // 200 if no active jobs, 503 if there are active jobs - return res.status(noActiveJobs ? 
200 : 500).json({ - webScraperActive, - noActiveJobs, - }); - } catch (error) { - Logger.error(error); - return res.status(500).json({ error: error.message }); - } - }); - app.get(`/serverHealthCheck`, async (req, res) => { try { const webScraperQueue = getWebScraperQueue(); @@ -187,141 +162,9 @@ if (cluster.isMaster) { } }); - app.get( - `/admin/${process.env.BULL_AUTH_KEY}/check-queues`, - async (req, res) => { - try { - await checkAlerts(); - return res.status(200).send("Alerts initialized"); - } catch (error) { - Logger.debug(`Failed to initialize alerts: ${error}`); - return res.status(500).send("Failed to initialize alerts"); - } - } - ); - - app.get( - `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, - async (req, res) => { - Logger.info("πŸ‚ Cleaning jobs older than 24h"); - try { - const webScraperQueue = getWebScraperQueue(); - const batchSize = 10; - const numberOfBatches = 9; // Adjust based on your needs - const completedJobsPromises: Promise[] = []; - for (let i = 0; i < numberOfBatches; i++) { - completedJobsPromises.push( - webScraperQueue.getJobs( - ["completed"], - i * batchSize, - i * batchSize + batchSize, - true - ) - ); - } - const completedJobs: Job[] = ( - await Promise.all(completedJobsPromises) - ).flat(); - const before24hJobs = - completedJobs.filter( - (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 - ) || []; - - let count = 0; - - if (!before24hJobs) { - return res.status(200).send(`No jobs to remove.`); - } - - for (const job of before24hJobs) { - try { - await job.remove(); - count++; - } catch (jobError) { - Logger.error(`πŸ‚ Failed to remove job with ID ${job.id}: ${jobError}` ); - } - } - return res.status(200).send(`Removed ${count} completed jobs.`); - } catch (error) { - Logger.error(`πŸ‚ Failed to clean last 24h complete jobs: ${error}`); - return res.status(500).send("Failed to clean jobs"); - } - } - ); - app.get("/is-production", (req, res) => { res.send({ isProduction: global.isProduction }); }); - app.get( - `/admin/${process.env.BULL_AUTH_KEY}/redis-health`, - async (req, res) => { - try { - const queueRedis = new Redis(process.env.REDIS_URL); - - const testKey = "test"; - const testValue = "test"; - - // Test queueRedis - let queueRedisHealth; - try { - await queueRedis.set(testKey, testValue); - queueRedisHealth = await queueRedis.get(testKey); - await queueRedis.del(testKey); - } catch (error) { - Logger.error(`queueRedis health check failed: ${error}`); - queueRedisHealth = null; - } - - // Test redisRateLimitClient - let redisRateLimitHealth; - try { - await redisRateLimitClient.set(testKey, testValue); - redisRateLimitHealth = await redisRateLimitClient.get(testKey); - await redisRateLimitClient.del(testKey); - } catch (error) { - Logger.error(`redisRateLimitClient health check failed: ${error}`); - redisRateLimitHealth = null; - } - - const healthStatus = { - queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy", - redisRateLimitClient: - redisRateLimitHealth === testValue ? 
"healthy" : "unhealthy", - }; - - if ( - healthStatus.queueRedis === "healthy" && - healthStatus.redisRateLimitClient === "healthy" - ) { - Logger.info("Both Redis instances are healthy"); - return res - .status(200) - .json({ status: "healthy", details: healthStatus }); - } else { - Logger.info(`Redis instances health check: ${JSON.stringify(healthStatus)}`); - await sendSlackWebhook( - `[REDIS DOWN] Redis instances health check: ${JSON.stringify( - healthStatus - )}`, - true - ); - return res - .status(500) - .json({ status: "unhealthy", details: healthStatus }); - } - } catch (error) { - Logger.error(`Redis health check failed: ${error}`); - await sendSlackWebhook( - `[REDIS DOWN] Redis instances health check: ${error.message}`, - true - ); - return res - .status(500) - .json({ status: "unhealthy", message: error.message }); - } - } - ); - Logger.info(`Worker ${process.pid} started`); } diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts new file mode 100644 index 00000000..81766184 --- /dev/null +++ b/apps/api/src/routes/admin.ts @@ -0,0 +1,29 @@ +import express from "express"; +import { redisHealthController } from "../controllers/admin/redis-health"; +import { + checkQueuesController, + cleanBefore24hCompleteJobsController, + queuesController, +} from "../controllers/admin/queue"; + +export const adminRouter = express.Router(); + +adminRouter.post( + `/admin/${process.env.BULL_AUTH_KEY}/redis-health`, + redisHealthController +); + +adminRouter.post( + `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, + cleanBefore24hCompleteJobsController +); + +adminRouter.post( + `/admin/${process.env.BULL_AUTH_KEY}/check-queues`, + checkQueuesController +); + +adminRouter.post( + `/admin/${process.env.BULL_AUTH_KEY}/queues`, + queuesController +); From 28a8a9849154ca57497b3725f696847132f7a853 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 14:58:14 -0400 Subject: [PATCH 25/45] Update admin.ts --- apps/api/src/routes/admin.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/routes/admin.ts b/apps/api/src/routes/admin.ts index 81766184..77d1bf46 100644 --- a/apps/api/src/routes/admin.ts +++ b/apps/api/src/routes/admin.ts @@ -8,22 +8,22 @@ import { export const adminRouter = express.Router(); -adminRouter.post( +adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/redis-health`, redisHealthController ); -adminRouter.post( +adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, cleanBefore24hCompleteJobsController ); -adminRouter.post( +adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/check-queues`, checkQueuesController ); -adminRouter.post( +adminRouter.get( `/admin/${process.env.BULL_AUTH_KEY}/queues`, queuesController ); From 50d2426fc496d4feba7a6bd013a790480cc5bd14 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 16:20:29 -0400 Subject: [PATCH 26/45] Update scrape-events.ts --- apps/api/src/lib/scrape-events.ts | 52 ++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index dda03c38..ab4ef681 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -1,6 +1,7 @@ import { Job, JobId } from "bull"; import type { baseScrapers } from "../scraper/WebScraper/single_url"; import { supabase_service as supabase } from "../services/supabase"; +import { Logger } from "./logger"; export type ScrapeErrorEvent = { type: 
"error", @@ -36,13 +37,18 @@ export class ScrapeEvents { if (jobId === "TEST") return null; if (process.env.USE_DB_AUTHENTICATION) { - const result = await supabase.from("scrape_events").insert({ - job_id: jobId, - type: content.type, - content: content, - // created_at - }).select().single(); - return (result.data as any).id; + try { + const result = await supabase.from("scrape_events").insert({ + job_id: jobId, + type: content.type, + content: content, + // created_at + }).select().single(); + return (result.data as any).id; + } catch (error) { + Logger.error(`Error inserting scrape event: ${error}`); + return null; + } } return null; @@ -51,20 +57,28 @@ export class ScrapeEvents { static async updateScrapeResult(logId: number | null, result: ScrapeScrapeEvent["result"]) { if (logId === null) return; - const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; - await supabase.from("scrape_events").update({ - content: { - ...previousLog.content, - result, - } - }).eq("id", logId); + try { + const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; + await supabase.from("scrape_events").update({ + content: { + ...previousLog.content, + result, + } + }).eq("id", logId); + } catch (error) { + Logger.error(`Error updating scrape result: ${error}`); + } } static async logJobEvent(job: Job | JobId, event: ScrapeQueueEvent["event"]) { - await this.insert(((job as any).id ? (job as any).id : job) as string, { - type: "queue", - event, - worker: process.env.FLY_MACHINE_ID, - }); + try { + await this.insert(((job as any).id ? (job as any).id : job) as string, { + type: "queue", + event, + worker: process.env.FLY_MACHINE_ID, + }); + } catch (error) { + Logger.error(`Error logging job event: ${error}`); + } } } From 81aa919262bad218f6a8f337e9a4b951ccc8e966 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 25 Jul 2024 17:47:43 -0300 Subject: [PATCH 27/45] fix --- apps/test-suite/utils/supabase.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/test-suite/utils/supabase.ts b/apps/test-suite/utils/supabase.ts index aa19a8c9..642a9b7c 100644 --- a/apps/test-suite/utils/supabase.ts +++ b/apps/test-suite/utils/supabase.ts @@ -1,5 +1,6 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; import "dotenv/config"; + // SupabaseService class initializes the Supabase client conditionally based on environment variables. 
class SupabaseService { private client: SupabaseClient | null = null; From 7129d7993ee1d51ab171d6d747a84ef111e1146c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 17:19:45 -0400 Subject: [PATCH 28/45] Update v0.ts --- apps/api/src/routes/v0.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index 4284b77c..9c68d9bb 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -7,8 +7,8 @@ import { crawlJobStatusPreviewController } from "../../src/controllers/status"; import { searchController } from "../../src/controllers/search"; import { crawlCancelController } from "../../src/controllers/crawl-cancel"; import { keyAuthController } from "../../src/controllers/keyAuth"; -import {livenessController} from "../controllers/liveness"; -import {readinessController} from "../controllers/readiness"; +import { livenessController } from "../controllers/liveness"; +import { readinessController } from "../controllers/readiness"; export const v0Router = express.Router(); From 3242872503ea22fe86b7c0ecfd4eb02c968c80a1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 17:43:55 -0400 Subject: [PATCH 29/45] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 4b428d35..3854c737 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -270,7 +270,7 @@ export async function scrapSingleUrl( const insertedLogId = await logInsertPromise; ScrapeEvents.updateScrapeResult(insertedLogId, { response_size: scraperResponse.text.length, - success: !scraperResponse.metadata.pageError && !!text, + success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text, error: scraperResponse.metadata.pageError, response_code: scraperResponse.metadata.pageStatusCode, time_taken: Date.now() - timer, From 56042d090c66543bc82ae21c08ce38145d4b5e26 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 17:48:44 -0400 Subject: [PATCH 30/45] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 3854c737..a7f57368 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -87,10 +87,10 @@ function getScrapingFallbackOrder( "scrapingBee", "fire-engine", "fire-engine;chrome-cdp", - "playwright", + process.env.USE_DB_AUTHENTICATION ? 
undefined : "playwright", "scrapingBeeLoad", "fetch", - ]; + ].filter(Boolean); if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { defaultOrder = [ From 01fab6e036f44406204cc7ade38116654120a19f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 17:51:41 -0400 Subject: [PATCH 31/45] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index a7f57368..dd4d9f71 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -26,10 +26,10 @@ export const baseScrapers = [ "fire-engine", "fire-engine;chrome-cdp", "scrapingBee", - "playwright", + process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", "scrapingBeeLoad", "fetch", -] as const; +].filter(Boolean); export async function generateRequestParams( url: string, @@ -95,11 +95,11 @@ function getScrapingFallbackOrder( if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { defaultOrder = [ "fire-engine", - "playwright", + process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", ...defaultOrder.filter( (scraper) => scraper !== "fire-engine" && scraper !== "playwright" ), - ]; + ].filter(Boolean); } const filteredDefaultOrder = defaultOrder.filter( From f82ca3be17c01bf3209e382756434fe6eeecf1b3 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 19:53:29 -0400 Subject: [PATCH 32/45] Nick: --- apps/api/src/scraper/WebScraper/single_url.ts | 3 ++- apps/api/src/strings.ts | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index dd4d9f71..0fce76b6 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -19,6 +19,7 @@ import { scrapWithScrapingBee } from "./scrapers/scrapingBee"; import { extractLinks } from "./utils/utils"; import { Logger } from "../../lib/logger"; import { ScrapeEvents } from "../../lib/scrape-events"; +import { clientSideError } from "../../strings"; dotenv.config(); @@ -311,7 +312,7 @@ export async function scrapSingleUrl( for (const scraper of scrapersInOrder) { // If exists text coming from crawler, use it - if (existingHtml && existingHtml.trim().length >= 100) { + if (existingHtml && existingHtml.trim().length >= 100 && !existingHtml.includes(clientSideError)) { let cleanedHtml = removeUnwantedElements(existingHtml, pageOptions); text = await parseMarkdown(cleanedHtml); html = cleanedHtml; diff --git a/apps/api/src/strings.ts b/apps/api/src/strings.ts index e7a6f21e..8edc57f1 100644 --- a/apps/api/src/strings.ts +++ b/apps/api/src/strings.ts @@ -1,2 +1,4 @@ export const errorNoResults = "No results found, please check the URL or contact us at help@mendable.ai to file a ticket."; + +export const clientSideError = "client-side exception has occurred" \ No newline at end of file From dc6f825270e481186c66913480c57894fa356f56 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 25 Jul 2024 20:43:50 -0400 Subject: [PATCH 33/45] Update email_notification.ts --- .../notification/email_notification.ts | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/notification/email_notification.ts b/apps/api/src/services/notification/email_notification.ts index d7cd3de0..a63d78ff 100644 --- 
a/apps/api/src/services/notification/email_notification.ts +++ b/apps/api/src/services/notification/email_notification.ts @@ -71,13 +71,16 @@ export async function sendNotificationInternal( if (team_id === "preview") { return { success: true }; } + + const fifteenDaysAgo = new Date(); + fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15); + const { data, error } = await supabase_service .from("user_notifications") .select("*") .eq("team_id", team_id) .eq("notification_type", notificationType) - .gte("sent_date", startDateString) - .lte("sent_date", endDateString); + .gte("sent_date", fifteenDaysAgo.toISOString()); if (error) { Logger.debug(`Error fetching notifications: ${error}`); @@ -85,8 +88,28 @@ export async function sendNotificationInternal( } if (data.length !== 0) { + // Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} in the last 15 days`); + return { success: false }; + } + + const { data: recentData, error: recentError } = await supabase_service + .from("user_notifications") + .select("*") + .eq("team_id", team_id) + .eq("notification_type", notificationType) + .gte("sent_date", startDateString) + .lte("sent_date", endDateString); + + if (recentError) { + Logger.debug(`Error fetching recent notifications: ${recentError}`); + return { success: false }; + } + + if (recentData.length !== 0) { + // Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} within the specified date range`); return { success: false }; } else { + console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`); // get the emails from the user with the team_id const { data: emails, error: emailsError } = await supabase_service .from("users") From 96cec2a673e86694377f18597f28d4bb99602821 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 26 Jul 2024 12:00:52 -0300 Subject: [PATCH 34/45] fix checking scrape log success content length --- apps/api/src/scraper/WebScraper/single_url.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0fce76b6..12482ec9 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -271,7 +271,7 @@ export async function scrapSingleUrl( const insertedLogId = await logInsertPromise; ScrapeEvents.updateScrapeResult(insertedLogId, { response_size: scraperResponse.text.length, - success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text, + success: !(scraperResponse.metadata.pageStatusCode && scraperResponse.metadata.pageStatusCode >= 400) && !!text && (text.trim().length >= 100), error: scraperResponse.metadata.pageError, response_code: scraperResponse.metadata.pageStatusCode, time_taken: Date.now() - timer, From ff4266f09ea51ff232d77e72cfcb6a8a4a5df7a6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 26 Jul 2024 17:21:09 -0400 Subject: [PATCH 35/45] Update pdfProcessor.ts --- .../src/scraper/WebScraper/utils/pdfProcessor.ts | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 1a9fa11b..660d27eb 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts 
@@ -12,10 +12,15 @@ import { Logger } from "../../../lib/logger"; dotenv.config(); export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { - const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url); - const content = await processPdfToText(tempFilePath, parsePDF); - fs.unlinkSync(tempFilePath); // Clean up the temporary file - return { content, pageStatusCode, pageError }; + try { + const { tempFilePath, pageStatusCode, pageError } = await downloadPdf(url); + const content = await processPdfToText(tempFilePath, parsePDF); + fs.unlinkSync(tempFilePath); // Clean up the temporary file + return { content, pageStatusCode, pageError }; + } catch (error) { + Logger.error(`Failed to fetch and process PDF: ${error.message}`); + return { content: "", pageStatusCode: 500, pageError: error.message }; + } } async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> { From 091924a63678eb95cd50acfd06ee001e95b71211 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 26 Jul 2024 17:37:46 -0400 Subject: [PATCH 36/45] Nick: moving machines from mia to virginia --- apps/api/fly.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/fly.toml b/apps/api/fly.toml index a9b39228..e248e59b 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -4,7 +4,7 @@ # app = 'firecrawl-scraper-js' -primary_region = 'mia' +primary_region = 'iad' kill_signal = 'SIGINT' kill_timeout = '30s' From 4c9d62f6d3c6cb7bd13590a8149e70dd81d8e282 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 26 Jul 2024 18:25:44 -0400 Subject: [PATCH 37/45] Nick: fixing sitemap fallback --- apps/api/src/lib/entities.ts | 1 + apps/api/src/scraper/WebScraper/sitemap.ts | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 56cd793a..9ffa4810 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -139,4 +139,5 @@ export interface FireEngineOptions{ engine?: string; blockMedia?: boolean; blockAds?: boolean; + disableJsDom?: boolean; } diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index b41f3eeb..3dfc9a1c 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -23,7 +23,7 @@ export async function getLinksFromSitemap( const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } else if (mode === 'fire-engine') { - const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { method: "get", mobileProxy: true },options:{endpoint:"request"} }); + const response = await scrapWithFireEngine({ url: sitemapUrl, fireEngineOptions: { engine:"tlsclient", disableJsDom: true, mobileProxy: true } }); content = response.html; } } catch (error) { From b4833c1694cfec974bcee335cc6efd77bcfd7980 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 29 Jul 2024 17:21:11 -0400 Subject: [PATCH 38/45] Nick: increasing default timeout to 45s --- apps/api/src/lib/default-values.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts index 6ae5f99f..3b303781 100644 --- a/apps/api/src/lib/default-values.ts +++ b/apps/api/src/lib/default-values.ts @@ -1,6 +1,6 @@ export const defaultOrigin = "api"; -export const defaultTimeout = 30000; // 30 
seconds +export const defaultTimeout = 45000; // 45 seconds export const defaultPageOptions = { onlyMainContent: false, From 04942bb9dea14a43ac26bc434ef8efa3eadbabbd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 29 Jul 2024 18:31:43 -0400 Subject: [PATCH 39/45] Nick: --- apps/api/fly.toml | 4 ++-- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 17 +++++++++++++---- apps/api/src/controllers/auth.ts | 21 +++++++++++++++++++-- apps/api/src/index.ts | 11 +++++++++++ 5 files changed, 46 insertions(+), 8 deletions(-) diff --git a/apps/api/fly.toml b/apps/api/fly.toml index e248e59b..94108e5f 100644 --- a/apps/api/fly.toml +++ b/apps/api/fly.toml @@ -24,8 +24,8 @@ kill_timeout = '30s' [http_service.concurrency] type = "requests" - hard_limit = 100 - soft_limit = 50 + hard_limit = 200 + soft_limit = 75 [[http_service.checks]] grace_period = "20s" diff --git a/apps/api/package.json b/apps/api/package.json index da1b2b33..153ac841 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -63,6 +63,7 @@ "axios": "^1.3.4", "bottleneck": "^2.19.5", "bull": "^4.15.0", + "cacheable-lookup": "^6.1.0", "cheerio": "^1.0.0-rc.12", "cohere": "^1.1.1", "cors": "^2.8.5", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 02d8363b..9c8dbbc9 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -59,6 +59,9 @@ importers: bull: specifier: ^4.15.0 version: 4.15.0 + cacheable-lookup: + specifier: ^6.1.0 + version: 6.1.0 cheerio: specifier: ^1.0.0-rc.12 version: 1.0.0-rc.12 @@ -1937,6 +1940,10 @@ packages: resolution: {integrity: sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==} engines: {node: '>= 0.8'} + cacheable-lookup@6.1.0: + resolution: {integrity: sha512-KJ/Dmo1lDDhmW2XDPMo+9oiy/CeqosPguPCrgcVzKyZrL6pM1gU2GmPY/xo6OQPTUaA/c0kwHuywB4E6nmT9ww==} + engines: {node: '>=10.6.0'} + call-bind@1.0.7: resolution: {integrity: sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==} engines: {node: '>= 0.4'} @@ -4369,8 +4376,8 @@ packages: engines: {node: '>=14.17'} hasBin: true - typescript@5.5.3: - resolution: {integrity: sha512-/hreyEujaB0w76zKo6717l3L0o/qEUtRgdvUBvlkhoWeOVMjMuHNHk0BRBzikzuGDqNmPQbg5ifMEqsHLiIUcQ==} + typescript@5.5.4: + resolution: {integrity: sha512-Mtq29sKDAEYP7aljRgtPOpTvOfbwRWlS6dPRzwjdE+C0R4brX/GUyhHSecbHMFLNBLcJIPt9nl9yG5TZ1weH+Q==} engines: {node: '>=14.17'} hasBin: true @@ -6917,6 +6924,8 @@ snapshots: bytes@3.1.2: {} + cacheable-lookup@6.1.0: {} + call-bind@1.0.7: dependencies: es-define-property: 1.0.0 @@ -8927,7 +8936,7 @@ snapshots: csv-parse: 5.5.6 gpt3-tokenizer: 1.1.5 openai: 3.3.0 - typescript: 5.5.3 + typescript: 5.5.4 uuid: 9.0.1 zod: 3.23.8 transitivePeerDependencies: @@ -9519,7 +9528,7 @@ snapshots: typescript@5.4.5: {} - typescript@5.5.3: {} + typescript@5.5.4: {} typesense@1.8.2(@babel/runtime@7.24.6): dependencies: diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index aab3d188..7154c694 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -88,7 +88,15 @@ export async function supaAuthenticateUser( // console.log('Key and Price ID:', data); } - if (error || !data || data.length === 0) { + if(error) { + return { + success: false, + error: "Unauthorized: Invalid token or server error.", + status: 500, + }; + } + + if (!data || data.length === 0) { return { success: false, error: "Unauthorized: Invalid token", @@ -178,7 +186,16 @@ export async function supaAuthenticateUser( 
.select("*") .eq("key", normalizedApi); - if (error || !data || data.length === 0) { + if (error) { + Logger.warn(`Error fetching api key: ${error.message}`); + return { + success: false, + error: "Unauthorized: Invalid token or server error.", + status: 500, + }; + } + + if (!data || data.length === 0) { return { success: false, error: "Unauthorized: Invalid token", diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 3fba86f8..ebe6ef38 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -10,6 +10,9 @@ import os from "os"; import { Logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import { ScrapeEvents } from "./lib/scrape-events"; +import http from 'node:http'; +import https from 'node:https'; +import CacheableLookup from 'cacheable-lookup'; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -18,6 +21,14 @@ const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; Logger.info(`Number of CPUs: ${numCPUs} available`); +const cacheable = new CacheableLookup({ + // this is important to avoid querying local hostnames see https://github.com/szmarczak/cacheable-lookup readme + lookup:false +}); + +cacheable.install(http.globalAgent); +cacheable.install(https.globalAgent) + if (cluster.isMaster) { Logger.info(`Master ${process.pid} is running`); From 968a2dc75330968953569dd1856e4da27953f8e7 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 29 Jul 2024 18:37:09 -0400 Subject: [PATCH 40/45] Nick: --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 3 +++ .../src/scraper/WebScraper/__tests__/dns.test.ts | 15 +++++++++++++++ 3 files changed, 19 insertions(+) create mode 100644 apps/api/src/scraper/WebScraper/__tests__/dns.test.ts diff --git a/apps/api/package.json b/apps/api/package.json index 153ac841..15e97377 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -26,6 +26,7 @@ "license": "ISC", "devDependencies": { "@flydotio/dockerfile": "^0.4.10", + "@jest/globals": "^29.7.0", "@tsconfig/recommended": "^1.0.3", "@types/body-parser": "^1.19.2", "@types/bull": "^4.10.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 9c8dbbc9..ec83e18b 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -192,6 +192,9 @@ importers: '@flydotio/dockerfile': specifier: ^0.4.10 version: 0.4.11 + '@jest/globals': + specifier: ^29.7.0 + version: 29.7.0 '@tsconfig/recommended': specifier: ^1.0.3 version: 1.0.6 diff --git a/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts b/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts new file mode 100644 index 00000000..968ed121 --- /dev/null +++ b/apps/api/src/scraper/WebScraper/__tests__/dns.test.ts @@ -0,0 +1,15 @@ +import CacheableLookup from 'cacheable-lookup'; +import https from 'node:https'; +import axios from "axios"; + +describe("DNS", () => { + it("cached dns", async () => { + const cachedDns = new CacheableLookup(); + cachedDns.install(https.globalAgent); + jest.spyOn(cachedDns, "lookupAsync"); + + const res = await axios.get("https://example.com"); + expect(res.status).toBe(200); + expect(cachedDns.lookupAsync).toHaveBeenCalled(); + }); +}); From e99c2568f4401634d43d6c10aa495d68370c0ed1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 29 Jul 2024 18:44:18 -0400 Subject: [PATCH 41/45] Update auth.ts --- apps/api/src/controllers/auth.ts | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff 
--git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 7154c694..035d847d 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -88,15 +88,10 @@ export async function supaAuthenticateUser( // console.log('Key and Price ID:', data); } - if(error) { - return { - success: false, - error: "Unauthorized: Invalid token or server error.", - status: 500, - }; - } + - if (!data || data.length === 0) { + if (error || !data || data.length === 0) { + Logger.warn(`Error fetching api key: ${error.message} or data is empty`); return { success: false, error: "Unauthorized: Invalid token", @@ -186,16 +181,10 @@ export async function supaAuthenticateUser( .select("*") .eq("key", normalizedApi); - if (error) { - Logger.warn(`Error fetching api key: ${error.message}`); - return { - success: false, - error: "Unauthorized: Invalid token or server error.", - status: 500, - }; - } + - if (!data || data.length === 0) { + if (error || !data || data.length === 0) { + Logger.warn(`Error fetching api key: ${error.message} or data is empty`); return { success: false, error: "Unauthorized: Invalid token", From 7b813883efe8182da71b1064d27d05d609e4241e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 29 Jul 2024 20:31:51 -0400 Subject: [PATCH 42/45] Nick: first layer --- apps/api/src/scraper/WebScraper/single_url.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 12482ec9..4a44b23f 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -85,9 +85,9 @@ function getScrapingFallbackOrder( }); let defaultOrder = [ + !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine", + !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp", "scrapingBee", - "fire-engine", - "fire-engine;chrome-cdp", process.env.USE_DB_AUTHENTICATION ? 
undefined : "playwright", "scrapingBeeLoad", "fetch", From 5e8ffcf50562fced88fa01fdad70edbf8b433678 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 29 Jul 2024 20:43:47 -0400 Subject: [PATCH 43/45] Update website_params.ts --- .../scraper/WebScraper/utils/custom/website_params.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index 89836b4a..4b82b075 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -212,4 +212,14 @@ export const urlSpecificParams = { engine: "playwright", } }, + "mendable.ai":{ + defaultScraper: "fire-engine", + params:{ + fireEngineOptions:{ + mobileProxy: true, + method: "get", + engine: "chrome-cdp", + }, + }, + }, }; From d25d7e72443c5c38b296c482275b23f93518be28 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:13:09 -0300 Subject: [PATCH 44/45] special case: developer.apple.com --- .../scraper/WebScraper/utils/custom/website_params.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index 4b82b075..c688061d 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -222,4 +222,14 @@ export const urlSpecificParams = { }, }, }, + "developer.apple.com":{ + defaultScraper: "fire-engine", + params:{ + engine: "playwright", + wait: 2000, + fireEngineOptions: { + blockMedia: false, + } + }, + }, }; From 8f5174ffc72fe2fb730356668108ff2bc3a1847e Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 30 Jul 2024 10:37:33 -0300 Subject: [PATCH 45/45] Update auth.ts --- apps/api/src/controllers/auth.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts index 035d847d..5dff80b8 100644 --- a/apps/api/src/controllers/auth.ts +++ b/apps/api/src/controllers/auth.ts @@ -197,7 +197,6 @@ export async function supaAuthenticateUser( return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""}; } - function getPlanByPriceId(price_id: string) { switch (price_id) { case process.env.STRIPE_PRICE_ID_STARTER: @@ -206,11 +205,14 @@ function getPlanByPriceId(price_id: string) { return 'standard'; case process.env.STRIPE_PRICE_ID_SCALE: return 'scale'; - case process.env.STRIPE_PRICE_ID_HOBBY || process.env.STRIPE_PRICE_ID_HOBBY_YEARLY: + case process.env.STRIPE_PRICE_ID_HOBBY: + case process.env.STRIPE_PRICE_ID_HOBBY_YEARLY: return 'hobby'; - case process.env.STRIPE_PRICE_ID_STANDARD_NEW || process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY: + case process.env.STRIPE_PRICE_ID_STANDARD_NEW: + case process.env.STRIPE_PRICE_ID_STANDARD_NEW_YEARLY: return 'standardnew'; - case process.env.STRIPE_PRICE_ID_GROWTH || process.env.STRIPE_PRICE_ID_GROWTH_YEARLY: + case process.env.STRIPE_PRICE_ID_GROWTH: + case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY: return 'growth'; default: return 'free';