From ba2af74adf6f174c5bb7bc8fe32fd128e7c16e6e Mon Sep 17 00:00:00 2001 From: Kevin Swiber Date: Fri, 9 Aug 2024 15:29:18 -0700 Subject: [PATCH 1/5] Ensuring USE_DB_AUTHENTICATION is true in single URL scraper. --- apps/api/src/scraper/WebScraper/single_url.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 12e075fd..96adcf49 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -23,11 +23,13 @@ import { clientSideError } from "../../strings"; dotenv.config(); +const useDatabaseAuth = process.env.USE_DB_AUTHENTICATION === "true"; + export const baseScrapers = [ "fire-engine", "fire-engine;chrome-cdp", "scrapingBee", - process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", + useDatabaseAuth ? undefined : "playwright", "scrapingBeeLoad", "fetch", ].filter(Boolean); @@ -85,10 +87,10 @@ function getScrapingFallbackOrder( }); let defaultOrder = [ - !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine", - !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp", + !useDatabaseAuth ? undefined : "fire-engine", + !useDatabaseAuth ? undefined : "fire-engine;chrome-cdp", "scrapingBee", - process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", + useDatabaseAuth ? undefined : "playwright", "scrapingBeeLoad", "fetch", ].filter(Boolean); @@ -96,7 +98,7 @@ function getScrapingFallbackOrder( if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { defaultOrder = [ "fire-engine", - process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", + useDatabaseAuth ? undefined : "playwright", ...defaultOrder.filter( (scraper) => scraper !== "fire-engine" && scraper !== "playwright" ), From bbed6ef23d325d4c7fba2774bd760513aa2a75ce Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:20:41 -0300 Subject: [PATCH 2/5] added validation on every USE_DB_AUTHENTICATION call --- apps/api/src/controllers/crawl-status.ts | 3 ++- apps/api/src/controllers/status.ts | 3 ++- apps/api/src/lib/logger.ts | 3 ++- apps/api/src/lib/scrape-events.ts | 3 ++- apps/api/src/lib/withAuth.ts | 3 ++- apps/api/src/main/runWebScraper.ts | 3 ++- apps/api/src/services/logging/crawl_log.ts | 3 ++- apps/api/src/services/logging/log_job.ts | 3 ++- apps/api/src/services/logging/scrape_log.ts | 3 ++- apps/api/src/services/supabase.ts | 3 ++- apps/test-suite/utils/supabase.ts | 6 ++++-- 11 files changed, 24 insertions(+), 12 deletions(-) diff --git a/apps/api/src/controllers/crawl-status.ts b/apps/api/src/controllers/crawl-status.ts index 5aafa433..82a49db8 100644 --- a/apps/api/src/controllers/crawl-status.ts +++ b/apps/api/src/controllers/crawl-status.ts @@ -24,7 +24,8 @@ export async function crawlStatusController(req: Request, res: Response) { const { current, current_url, total, current_step, partialDocs } = await job.progress(); let data = job.returnvalue; - if (process.env.USE_DB_AUTHENTICATION === "true") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { const supabaseData = await supabaseGetJobById(req.params.jobId); if (supabaseData) { diff --git a/apps/api/src/controllers/status.ts b/apps/api/src/controllers/status.ts index 3d7fccbb..935338bd 100644 --- a/apps/api/src/controllers/status.ts +++ b/apps/api/src/controllers/status.ts @@ -12,7 +12,8 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons const { current, current_url, total, current_step, partialDocs } = await job.progress(); let data = job.returnvalue; - if (process.env.USE_DB_AUTHENTICATION === "true") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { const supabaseData = await supabaseGetJobById(req.params.jobId); if (supabaseData) { diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts index 872dbf51..fb0468c2 100644 --- a/apps/api/src/lib/logger.ts +++ b/apps/api/src/lib/logger.ts @@ -25,7 +25,8 @@ export class Logger { const color = Logger.colors[level]; console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`); - // if (process.env.USE_DB_AUTH) { + // const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + // if (useDbAuthentication) { // save to supabase? another place? // supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean }); // } diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 8d677279..02ef670b 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -36,7 +36,8 @@ export class ScrapeEvents { static async insert(jobId: string, content: ScrapeEvent) { if (jobId === "TEST") return null; - if (process.env.USE_DB_AUTHENTICATION) { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { try { const result = await supabase.from("scrape_events").insert({ job_id: jobId, diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index 353c144b..1979907e 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -7,7 +7,8 @@ export function withAuth( originalFunction: (...args: U) => Promise ) { return async function (...args: U): Promise { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { if (warningCount < 5) { Logger.warn("You're bypassing authentication"); warningCount++; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 76665aa2..3f3293b2 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -123,7 +123,8 @@ export async function runWebScraper({ const saveJob = async (job: Job, result: any) => { try { - if (process.env.USE_DB_AUTHENTICATION === "true") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { const { data, error } = await supabase_service .from("firecrawl_jobs") .update({ docs: result }) diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts index 68008e02..f19b0297 100644 --- a/apps/api/src/services/logging/crawl_log.ts +++ b/apps/api/src/services/logging/crawl_log.ts @@ -3,7 +3,8 @@ import { Logger } from "../../../src/lib/logger"; import "dotenv/config"; export async function logCrawl(job_id: string, team_id: string) { - if (process.env.USE_DB_AUTHENTICATION === 'true') { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { try { const { data, error } = await supabase_service .from("bulljobs_teams") diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 93d0b311..2525917c 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -7,7 +7,8 @@ import { Logger } from "../../lib/logger"; export async function logJob(job: FirecrawlJob) { try { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { return; } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 099e4a0b..30d8fd1e 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -8,7 +8,8 @@ export async function logScrape( scrapeLog: ScrapeLog, pageOptions?: PageOptions ) { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { Logger.debug("Skipping logging scrape to Supabase"); return; } diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index 70ada12b..414d1925 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -8,8 +8,9 @@ class SupabaseService { constructor() { const supabaseUrl = process.env.SUPABASE_URL; const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; // Only initialize the Supabase client if both URL and Service Token are provided. - if (process.env.USE_DB_AUTHENTICATION === "false") { + if (!useDbAuthentication) { // Warn the user that Authentication is disabled by setting the client to null Logger.warn( "Authentication is disabled. Supabase client will not be initialized." diff --git a/apps/test-suite/utils/supabase.ts b/apps/test-suite/utils/supabase.ts index abf7fd78..3e66a991 100644 --- a/apps/test-suite/utils/supabase.ts +++ b/apps/test-suite/utils/supabase.ts @@ -9,7 +9,8 @@ class SupabaseService { const supabaseUrl = process.env.SUPABASE_URL; const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; // Only initialize the Supabase client if both URL and Service Token are provided. - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { // Warn the user that Authentication is disabled by setting the client to null console.warn( "Authentication is disabled. Supabase client will not be initialized." @@ -36,7 +37,8 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { console.debug( "Attempted to access Supabase client when it's not configured." ); From c3aeed510b10793f93c723bacf365bb27f2a57b3 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:40:31 -0300 Subject: [PATCH 3/5] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 96adcf49..0c0de938 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -24,13 +24,15 @@ import { clientSideError } from "../../strings"; dotenv.config(); const useDatabaseAuth = process.env.USE_DB_AUTHENTICATION === "true"; +const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined; +const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined; export const baseScrapers = [ - "fire-engine", - "fire-engine;chrome-cdp", - "scrapingBee", + useFireEngine ? "fire-engine" : undefined, + useFireEngine ? "fire-engine;chrome-cdp" : undefined, + useScrapingBee ? "scrapingBee" : undefined, useDatabaseAuth ? undefined : "playwright", - "scrapingBeeLoad", + useScrapingBee ? "scrapingBeeLoad" : undefined, "fetch", ].filter(Boolean); @@ -87,11 +89,11 @@ function getScrapingFallbackOrder( }); let defaultOrder = [ - !useDatabaseAuth ? undefined : "fire-engine", - !useDatabaseAuth ? undefined : "fire-engine;chrome-cdp", - "scrapingBee", + useFireEngine ? "fire-engine" : undefined, + useFireEngine ? "fire-engine;chrome-cdp" : undefined, + useScrapingBee ? "scrapingBee" : undefined, + useScrapingBee ? "scrapingBeeLoad" : undefined, useDatabaseAuth ? undefined : "playwright", - "scrapingBeeLoad", "fetch", ].filter(Boolean); From 7c339ea125df178be880841bc34bab6a2cedf5ee Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:55:10 -0300 Subject: [PATCH 4/5] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0c0de938..d844535b 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -93,14 +93,14 @@ function getScrapingFallbackOrder( useFireEngine ? "fire-engine;chrome-cdp" : undefined, useScrapingBee ? "scrapingBee" : undefined, useScrapingBee ? "scrapingBeeLoad" : undefined, - useDatabaseAuth ? undefined : "playwright", + useFireEngine ? undefined : "playwright", "fetch", ].filter(Boolean); if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { defaultOrder = [ "fire-engine", - useDatabaseAuth ? undefined : "playwright", + useFireEngine ? undefined : "playwright", ...defaultOrder.filter( (scraper) => scraper !== "fire-engine" && scraper !== "playwright" ), From 76160a38db8f556261506eeb3ce25f28086d94fc Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:57:00 -0300 Subject: [PATCH 5/5] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index d844535b..d5bdcffe 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -23,7 +23,6 @@ import { clientSideError } from "../../strings"; dotenv.config(); -const useDatabaseAuth = process.env.USE_DB_AUTHENTICATION === "true"; const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined; const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined; @@ -31,7 +30,7 @@ export const baseScrapers = [ useFireEngine ? "fire-engine" : undefined, useFireEngine ? "fire-engine;chrome-cdp" : undefined, useScrapingBee ? "scrapingBee" : undefined, - useDatabaseAuth ? undefined : "playwright", + useFireEngine ? undefined : "playwright", useScrapingBee ? "scrapingBeeLoad" : undefined, "fetch", ].filter(Boolean);