diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2e42e4a..ff22858b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,6 +28,7 @@ env: HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} HDX_NODE_BETA_MODE: 1 FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }} + USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} jobs: diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml index ba4a099e..9209309f 100644 --- a/.github/workflows/fly.yml +++ b/.github/workflows/fly.yml @@ -28,6 +28,7 @@ env: NPM_TOKEN: ${{ secrets.NPM_TOKEN }} CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} + USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }} jobs: pre-deploy-e2e-tests: @@ -57,6 +58,9 @@ jobs: run: npm run workers & working-directory: ./apps/api id: start_workers + - name: Wait for the application to be ready + run: | + sleep 10 - name: Run E2E tests run: | npm run test:prod @@ -338,6 +342,7 @@ jobs: build-and-publish-rust-sdk: name: Build and publish Rust SDK runs-on: ubuntu-latest + needs: deploy steps: - name: Checkout repository diff --git a/README.md b/README.md index 89ed0127..63dd6ea5 100644 --- a/README.md +++ b/README.md @@ -14,10 +14,9 @@ GitHub Contributors - - Open Source + + Visit firecrawl.dev -

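The README hunks that follow drop the explicit v0 pin from both the Python and Node SDK examples, since v1 is now the default API version. A minimal sketch of the resulting client setup, assuming the current @mendable/firecrawl-js constructor and scrapeUrl signature:

import FirecrawlApp from "@mendable/firecrawl-js";

// No `version` field anymore; the SDK targets the v1 API by default.
const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

async function main() {
  // `formats` mirrors the v1 scrape options exercised in the tests below.
  const doc = await app.scrapeUrl("https://firecrawl.dev", { formats: ["markdown"] });
  console.log(doc);
}

main();
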
@@ -391,7 +390,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup from firecrawl.firecrawl import FirecrawlApp -app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0") +app = FirecrawlApp(api_key="fc-YOUR_API_KEY") class ArticleSchema(BaseModel): title: str @@ -466,8 +465,7 @@ import FirecrawlApp from "@mendable/firecrawl-js"; import { z } from "zod"; const app = new FirecrawlApp({ - apiKey: "fc-YOUR_API_KEY", - version: "v0" + apiKey: "fc-YOUR_API_KEY" }); // Define schema to extract contents into diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index dd7d4f16..8aabf748 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -1,11 +1,11 @@ import request from "supertest"; -import dotenv from "dotenv"; +import { configDotenv } from "dotenv"; import { ScrapeRequest, ScrapeResponseRequestTest, } from "../../controllers/v1/types"; -dotenv.config(); +configDotenv(); const TEST_URL = "http://127.0.0.1:3002"; describe("E2E Tests for v1 API Routes", () => { @@ -22,6 +22,13 @@ describe("E2E Tests for v1 API Routes", () => { const response: ScrapeResponseRequestTest = await request(TEST_URL).get( "/is-production" ); + + console.log('process.env.USE_DB_AUTHENTICATION', process.env.USE_DB_AUTHENTICATION); + console.log('?', process.env.USE_DB_AUTHENTICATION === 'true'); + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + console.log('!!useDbAuthentication', !!useDbAuthentication); + console.log('!useDbAuthentication', !useDbAuthentication); + expect(response.statusCode).toBe(200); expect(response.body).toHaveProperty("isProduction"); }); @@ -29,9 +36,10 @@ describe("E2E Tests for v1 API Routes", () => { describe("POST /v1/scrape", () => { it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL).post( - "/v1/scrape" - ); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .send({ url: "https://firecrawl.dev"}) + expect(response.statusCode).toBe(401); }); @@ -389,7 +397,7 @@ describe("E2E Tests for v1 API Routes", () => { const scrapeRequest: ScrapeRequest = { url: "https://ycombinator.com/companies", formats: ["markdown"], - waitFor: 5000 + waitFor: 8000 }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -451,9 +459,9 @@ describe("E2E Tests for v1 API Routes", () => { describe("POST /v1/map", () => { it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL).post( - "/v1/map" - ); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); }); @@ -534,7 +542,9 @@ describe("POST /v1/map", () => { const links = response.body.links as unknown[]; expect(Array.isArray(links)).toBe(true); expect(links.length).toBeGreaterThan(0); - expect(links[0]).toContain("docs.firecrawl.dev"); + + const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); + expect(containsDocsFirecrawlDev).toBe(true); }); it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => { @@ -559,7 +569,9 @@ describe("POST /v1/map", () => { const links = response.body.links as unknown[]; 
expect(Array.isArray(links)).toBe(true); expect(links.length).toBeGreaterThan(0); - expect(links[0]).toContain("docs.firecrawl.dev"); + + const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev")); + expect(containsDocsFirecrawlDev).toBe(true); }, 10000) it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => { @@ -609,9 +621,9 @@ describe("POST /v1/map", () => { describe("POST /v1/crawl", () => { it.concurrent("should require authorization", async () => { - const response: ScrapeResponseRequestTest = await request(TEST_URL).post( - "/v1/crawl" - ); + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/crawl") + .send({ url: "https://firecrawl.dev" }); expect(response.statusCode).toBe(401); }); @@ -863,7 +875,7 @@ describe("GET /v1/crawl/:jobId", () => { .post("/v1/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://docs.mendable.ai" }); + .send({ url: "https://docs.firecrawl.dev" }); expect(crawlResponse.statusCode).toBe(200); let isCompleted = false; @@ -893,9 +905,7 @@ describe("GET /v1/crawl/:jobId", () => { expect(completedResponse.body.data[0]).not.toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].metadata.statusCode).toBe( - 200 - ); + expect(completedResponse.body.data[0].metadata.statusCode).toBe(200); expect( completedResponse.body.data[0].metadata.error ).toBeUndefined(); diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts index 330f8130..26caf63e 100644 --- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts @@ -659,7 +659,7 @@ describe("E2E Tests for v0 API Routes", () => { .post("/v0/crawl") .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) .set("Content-Type", "application/json") - .send({ url: "https://mendable.ai/blog" }); + .send({ url: "https://firecrawl.dev/blog" }); expect(crawlResponse.statusCode).toBe(200); let isCompleted = false; @@ -689,10 +689,8 @@ describe("E2E Tests for v0 API Routes", () => { expect(completedResponse.body.data[0]).toHaveProperty("content"); expect(completedResponse.body.data[0]).toHaveProperty("markdown"); expect(completedResponse.body.data[0]).toHaveProperty("metadata"); - expect(completedResponse.body.data[0].content).toContain("Mendable"); - expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe( - 200 - ); + expect(completedResponse.body.data[0].content).toContain("Firecrawl"); + expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200); expect( completedResponse.body.data[0].metadata.pageError ).toBeUndefined(); @@ -701,7 +699,7 @@ describe("E2E Tests for v0 API Routes", () => { (doc) => doc.metadata && doc.metadata.sourceURL && - doc.metadata.sourceURL.includes("mendable.ai/blog") + doc.metadata.sourceURL.includes("firecrawl.dev/blog") ); expect(childrenLinks.length).toBe(completedResponse.body.data.length); diff --git a/apps/api/src/controllers/v0/crawl-cancel.ts b/apps/api/src/controllers/v0/crawl-cancel.ts index bf1c2d0a..efcd454a 100644 --- a/apps/api/src/controllers/v0/crawl-cancel.ts +++ b/apps/api/src/controllers/v0/crawl-cancel.ts @@ -5,6 +5,8 @@ import { supabase_service } from 
"../../../src/services/supabase"; import { Logger } from "../../../src/lib/logger"; import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis"; import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function crawlCancelController(req: Request, res: Response) { try { diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts index b0649cd0..a3f3f16f 100644 --- a/apps/api/src/controllers/v0/crawl-status.ts +++ b/apps/api/src/controllers/v0/crawl-status.ts @@ -6,6 +6,8 @@ import { Logger } from "../../../src/lib/logger"; import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs"; import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function getJobs(ids: string[]) { const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 40df5021..bc91da18 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -244,14 +244,10 @@ export async function scrapeController(req: Request, res: Response) { } if (creditsToBeBilled > 0) { // billing for doc done on queue end, bill only for llm extraction - const billingResult = await billTeam(team_id, creditsToBeBilled); - if (!billingResult.success) { - return res.status(402).json({ - success: false, - error: - "Failed to bill team. Insufficient credits or subscription not found.", - }); - } + billTeam(team_id, creditsToBeBilled).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); } } diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts index 825abbe1..5ef2b767 100644 --- a/apps/api/src/controllers/v0/search.ts +++ b/apps/api/src/controllers/v0/search.ts @@ -54,18 +54,10 @@ export async function searchHelper( if (justSearch) { - const billingResult = await billTeam( - team_id, - res.length - ); - if (!billingResult.success) { - return { - success: false, - error: - "Failed to bill team. 
Insufficient credits or subscription not found.", - returnCode: 402, - }; - } + billTeam(team_id, res.length).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); return { success: true, data: res, returnCode: 200 }; } diff --git a/apps/api/src/controllers/v1/crawl-cancel.ts b/apps/api/src/controllers/v1/crawl-cancel.ts index 06a5b26e..21fc7cf9 100644 --- a/apps/api/src/controllers/v1/crawl-cancel.ts +++ b/apps/api/src/controllers/v1/crawl-cancel.ts @@ -5,6 +5,8 @@ import { supabase_service } from "../../services/supabase"; import { Logger } from "../../lib/logger"; import { getCrawl, saveCrawl } from "../../lib/crawl-redis"; import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function crawlCancelController(req: Request, res: Response) { try { diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 845f616c..05144a9b 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -3,6 +3,8 @@ import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentCo import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis"; import { getScrapeQueue } from "../../services/queue-service"; import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function getJob(id: string) { const job = await getScrapeQueue().getJob(id); diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 32294a0f..e6abd9ae 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -18,6 +18,7 @@ import { fireEngineMap } from "../../search/fireEngine"; import { billTeam } from "../../services/billing/credit_billing"; import { logJob } from "../../services/logging/log_job"; import { performCosineSimilarity } from "../../lib/map-cosine"; +import { Logger } from "../../lib/logger"; configDotenv(); @@ -61,8 +62,8 @@ export async function mapController( : `site:${req.body.url}`; // www. seems to exclude subdomains in some cases const mapResults = await fireEngineMap(mapUrl, { - // limit to 50 results (beta) - numResults: Math.min(limit, 50), + // limit to 100 results (beta) + numResults: Math.min(limit, 100), }); if (mapResults.length > 0) { @@ -100,7 +101,10 @@ export async function mapController( // remove duplicates that could be due to http/https or www links = removeDuplicateUrls(links); - await billTeam(req.auth.team_id, 1); + billTeam(req.auth.team_id, 1).catch(error => { + Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; @@ -127,5 +131,6 @@ export async function mapController( return res.status(200).json({ success: true, links: linksToReturn, + scrape_id: req.body.origin?.includes("website") ? 
id : undefined, }); } diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index c573e100..0835cc2a 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -106,14 +106,10 @@ export async function scrapeController( creditsToBeBilled = 50; } - const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled); - if (!billingResult.success) { - return res.status(402).json({ - success: false, - error: - "Failed to bill team. Insufficient credits or subscription not found.", - }); - } + billTeam(req.auth.team_id, creditsToBeBilled).catch(error => { + Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); if (!pageOptions || !pageOptions.includeRawHtml) { if (doc && doc.rawHtml) { @@ -147,5 +143,6 @@ export async function scrapeController( return res.status(200).json({ success: true, data: legacyDocumentConverter(doc), + scrape_id: origin?.includes("website") ? jobId : undefined, }); } diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 85bd625f..c4e0cf84 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -225,6 +225,7 @@ export type ScrapeResponse = success: true; warning?: string; data: Document; + scrape_id?: string; }; export interface ScrapeResponseRequestTest { @@ -246,6 +247,7 @@ export type MapResponse = | { success: true; links: string[]; + scrape_id?: string; }; export type CrawlStatusParams = { diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts index 872dbf51..cb8b4119 100644 --- a/apps/api/src/lib/logger.ts +++ b/apps/api/src/lib/logger.ts @@ -1,3 +1,6 @@ +import { configDotenv } from "dotenv"; +configDotenv(); + enum LogLevel { NONE = 'NONE', // No logs will be output. ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation. @@ -25,7 +28,8 @@ export class Logger { const color = Logger.colors[level]; console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`); - // if (process.env.USE_DB_AUTH) { + // const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + // if (useDbAuthentication) { // save to supabase? another place? 
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean }); // } diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 04850b4e..ad70dfef 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -2,6 +2,8 @@ import { Job } from "bullmq"; import type { baseScrapers } from "../scraper/WebScraper/single_url"; import { supabase_service as supabase } from "../services/supabase"; import { Logger } from "./logger"; +import { configDotenv } from "dotenv"; +configDotenv(); export type ScrapeErrorEvent = { type: "error", @@ -36,7 +38,8 @@ export class ScrapeEvents { static async insert(jobId: string, content: ScrapeEvent) { if (jobId === "TEST") return null; - if (process.env.USE_DB_AUTHENTICATION) { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { try { const result = await supabase.from("scrape_events").insert({ job_id: jobId, diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts index 353c144b..b45b8973 100644 --- a/apps/api/src/lib/withAuth.ts +++ b/apps/api/src/lib/withAuth.ts @@ -1,5 +1,8 @@ import { AuthResponse } from "../../src/types"; import { Logger } from "./logger"; +import * as Sentry from "@sentry/node"; +import { configDotenv } from "dotenv"; +configDotenv(); let warningCount = 0; @@ -7,7 +10,8 @@ export function withAuth( originalFunction: (...args: U) => Promise ) { return async function (...args: U): Promise { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { if (warningCount < 5) { Logger.warn("You're bypassing authentication"); warningCount++; @@ -17,6 +21,7 @@ export function withAuth( try { return await originalFunction(...args); } catch (error) { + Sentry.captureException(error); Logger.error(`Error in withAuth function: ${error}`); return { success: false, error: error.message } as T; } diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 2b5388c1..f67a1cd0 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -12,6 +12,8 @@ import { Document } from "../lib/entities"; import { supabase_service } from "../services/supabase"; import { Logger } from "../lib/logger"; import { ScrapeEvents } from "../lib/scrape-events"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function startWebScraperPipeline({ job, @@ -118,15 +120,10 @@ export async function runWebScraper({ : docs; if(is_scrape === false) { - const billingResult = await billTeam(team_id, filteredDocs.length); - if (!billingResult.success) { - // throw new Error("Failed to bill team, no subscription was found"); - return { - success: false, - message: "Failed to bill team, no subscription was found", - docs: [], - }; - } + billTeam(team_id, filteredDocs.length).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`); + // Optionally, you could notify an admin or add to a retry queue here + }); } @@ -144,7 +141,8 @@ export async function runWebScraper({ const saveJob = async (job: Job, result: any, token: string, mode: string) => { try { - if (process.env.USE_DB_AUTHENTICATION === "true") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (useDbAuthentication) { const { data, error } = await supabase_service 
.from("firecrawl_jobs") .update({ docs: result }) diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 9dcbf111..daa9bf43 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -33,7 +33,9 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum); if (!success) { Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`); - return res.status(402).json({ success: false, error: "Insufficient credits" }); + if (!res.headersSent) { + return res.status(402).json({ success: false, error: "Insufficient credits" }); + } } req.account = { remainingCredits } next(); @@ -52,7 +54,9 @@ export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestW ); if (!success) { - return res.status(status).json({ success: false, error }); + if (!res.headersSent) { + return res.status(status).json({ success: false, error }); + } } req.auth = { team_id, plan }; @@ -67,7 +71,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) if (req.headers["x-idempotency-key"]) { const isIdempotencyValid = await validateIdempotencyKey(req); if (!isIdempotencyValid) { - return res.status(409).json({ success: false, error: "Idempotency key already used" }); + if (!res.headersSent) { + return res.status(409).json({ success: false, error: "Idempotency key already used" }); + } } createIdempotencyKey(req); } @@ -78,7 +84,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { if (req.body.url && isUrlBlocked(req.body.url)) { - return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." }); + if (!res.headersSent) { + return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." }); + } } next(); } @@ -96,26 +104,26 @@ export const v1Router = express.Router(); v1Router.post( "/scrape", - blocklistMiddleware, authMiddleware(RateLimiterMode.Scrape), checkCreditsMiddleware(1), + blocklistMiddleware, wrap(scrapeController) ); v1Router.post( "/crawl", - blocklistMiddleware, authMiddleware(RateLimiterMode.Crawl), - idempotencyMiddleware, checkCreditsMiddleware(), + blocklistMiddleware, + idempotencyMiddleware, wrap(crawlController) ); v1Router.post( "/map", - blocklistMiddleware, authMiddleware(RateLimiterMode.Map), checkCreditsMiddleware(1), + blocklistMiddleware, wrap(mapController) ); diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index adf7e53c..11e1fe37 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -23,12 +23,15 @@ import { clientSideError } from "../../strings"; dotenv.config(); +const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined; +const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined; + export const baseScrapers = [ - "fire-engine;chrome-cdp", - "fire-engine", - "scrapingBee", - process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", - "scrapingBeeLoad", + useFireEngine ? "fire-engine;chrome-cdp" : undefined, + useFireEngine ? 
"fire-engine" : undefined, + useScrapingBee ? "scrapingBee" : undefined, + useFireEngine ? undefined : "playwright", + useScrapingBee ? "scrapingBeeLoad" : undefined, "fetch", ].filter(Boolean); @@ -85,18 +88,18 @@ function getScrapingFallbackOrder( }); let defaultOrder = [ - !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp", - !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine", - "scrapingBee", - process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", - "scrapingBeeLoad", + useFireEngine ? "fire-engine;chrome-cdp" : undefined, + useFireEngine ? "fire-engine" : undefined, + useScrapingBee ? "scrapingBee" : undefined, + useScrapingBee ? "scrapingBeeLoad" : undefined, + useFireEngine ? undefined : "playwright", "fetch", ].filter(Boolean); if (isWaitPresent || isScreenshotPresent || isHeadersPresent) { defaultOrder = [ "fire-engine", - process.env.USE_DB_AUTHENTICATION ? undefined : "playwright", + useFireEngine ? undefined : "playwright", ...defaultOrder.filter( (scraper) => scraper !== "fire-engine" && scraper !== "playwright" ), diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 22dc72df..53031de9 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -5,7 +5,7 @@ import { supabase_service } from "../supabase"; import { Logger } from "../../lib/logger"; import { getValue, setValue } from "../redis"; import { redlock } from "../redlock"; - +import * as Sentry from "@sentry/node"; const FREE_CREDITS = 500; @@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) { ]); let couponCredits = 0; + let sortedCoupons = []; + if (coupons && coupons.length > 0) { couponCredits = coupons.reduce( (total, coupon) => total + coupon.credits, 0 ); + sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits); } - - let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits); // using coupon credits: if (couponCredits > 0) { // if there is no subscription and they have enough coupon credits @@ -175,9 +176,24 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity }; } - // Retrieve the team's active subscription and check for available coupons concurrently - const [{ data: subscription, error: subscriptionError }, { data: coupons }] = - await Promise.all([ + + let cacheKeySubscription = `subscription_${team_id}`; + let cacheKeyCoupons = `coupons_${team_id}`; + + // Try to get data from cache first + const [cachedSubscription, cachedCoupons] = await Promise.all([ + getValue(cacheKeySubscription), + getValue(cacheKeyCoupons) + ]); + + let subscription, subscriptionError, coupons; + + if (cachedSubscription && cachedCoupons) { + subscription = JSON.parse(cachedSubscription); + coupons = JSON.parse(cachedCoupons); + } else { + // If not in cache, retrieve from database + const [subscriptionResult, couponsResult] = await Promise.all([ supabase_service .from("subscriptions") .select("id, price_id, current_period_start, current_period_end") @@ -191,6 +207,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { .eq("status", "active"), ]); + subscription = subscriptionResult.data; + subscriptionError = subscriptionResult.error; + coupons = couponsResult.data; + + // Cache the results for a minute, sub can be null and that's fine + await 
setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null + await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute + + } + let couponCredits = 0; if (coupons && coupons.length > 0) { couponCredits = coupons.reduce( @@ -211,41 +237,54 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { let creditUsages; let creditUsageError; - let retries = 0; - const maxRetries = 3; - const retryInterval = 2000; // 2 seconds + let totalCreditsUsed = 0; + const cacheKeyCreditUsage = `credit_usage_${team_id}`; - while (retries < maxRetries) { - const result = await supabase_service - .from("credit_usage") - .select("credits_used") - .is("subscription_id", null) - .eq("team_id", team_id); + // Try to get credit usage from cache + const cachedCreditUsage = await getValue(cacheKeyCreditUsage); - creditUsages = result.data; - creditUsageError = result.error; + if (cachedCreditUsage) { + totalCreditsUsed = parseInt(cachedCreditUsage); + } else { + let retries = 0; + const maxRetries = 3; + const retryInterval = 2000; // 2 seconds - if (!creditUsageError) { - break; + while (retries < maxRetries) { + const result = await supabase_service + .from("credit_usage") + .select("credits_used") + .is("subscription_id", null) + .eq("team_id", team_id); + + creditUsages = result.data; + creditUsageError = result.error; + + if (!creditUsageError) { + break; + } + + retries++; + if (retries < maxRetries) { + await new Promise(resolve => setTimeout(resolve, retryInterval)); + } } - retries++; - if (retries < maxRetries) { - await new Promise(resolve => setTimeout(resolve, retryInterval)); + if (creditUsageError) { + Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`); + throw new Error( + `Failed to retrieve credit usage for team_id: ${team_id}` + ); } - } - if (creditUsageError) { - Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`); - throw new Error( - `Failed to retrieve credit usage for team_id: ${team_id}` + totalCreditsUsed = creditUsages.reduce( + (acc, usage) => acc + usage.credits_used, + 0 ); - } - const totalCreditsUsed = creditUsages.reduce( - (acc, usage) => acc + usage.credits_used, - 0 - ); + // Cache the result for 30 seconds + await setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30); + } Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`); @@ -255,7 +294,9 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { const creditLimit = FREE_CREDITS; const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit; - if (creditUsagePercentage >= 0.8) { + // Add a check to ensure totalCreditsUsed is greater than 0 + if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) { + Logger.info(`Sending notification for team ${team_id}. 
Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`);
      await sendNotification(
        team_id,
        NotificationType.APPROACHING_LIMIT,
@@ -309,7 +350,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
    if (creditUsages && creditUsages.length > 0) {
      totalCreditsUsed = creditUsages[0].total_credits_used;
-      await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
+      await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for ~8 minutes
      // Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
    }
  }
@@ -322,17 +363,38 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
  // Adjust total credits used by subtracting coupon value
  const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);

-  // Get the price details
-  const { data: price, error: priceError } = await supabase_service
-    .from("prices")
-    .select("credits")
-    .eq("id", subscription.price_id)
-    .single();
-  if (priceError) {
-    throw new Error(
-      `Failed to retrieve price for price_id: ${subscription.price_id}`
-    );
+  // Get the price details from cache or database
+  const priceCacheKey = `price_${subscription.price_id}`;
+  let price;
+
+  try {
+    const cachedPrice = await getValue(priceCacheKey);
+    if (cachedPrice) {
+      price = JSON.parse(cachedPrice);
+    } else {
+      const { data, error: priceError } = await supabase_service
+        .from("prices")
+        .select("credits")
+        .eq("id", subscription.price_id)
+        .single();
+
+      if (priceError) {
+        throw new Error(
+          `Failed to retrieve price for price_id: ${subscription.price_id}`
+        );
+      }
+
+      price = data;
+      // There are only 21 records, so this is super fine
+      // Cache the price for a long time (e.g., 1 day)
+      await setValue(priceCacheKey, JSON.stringify(price), 86400);
+    }
+  } catch (error) {
+    Logger.error(`Error retrieving or caching price: ${error}`);
+    Sentry.captureException(error);
+    // If this errors, just assume it's a big number so the user doesn't get an error
+    price = { credits: 1000000 };
  }

  const creditLimit = price.credits;
@@ -462,8 +524,8 @@ async function createCreditUsage({
  subscription_id?: string;
  credits: number;
}) {
-  const { data: credit_usage } = await supabase_service
-    .from("credit_usage")
+  await supabase_service
+    .from("credit_usage")
    .insert([
      {
        team_id,
@@ -471,8 +533,7 @@
        subscription_id: subscription_id || null,
        created_at: new Date(),
      },
-    ])
-    .select();
+    ]);

-  return { success: true, credit_usage };
+  return { success: true };
 }
diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts
index 68008e02..3850e05b 100644
--- a/apps/api/src/services/logging/crawl_log.ts
+++ b/apps/api/src/services/logging/crawl_log.ts
@@ -1,9 +1,11 @@
 import { supabase_service } from "../supabase";
 import { Logger } from "../../../src/lib/logger";
-import "dotenv/config";
+import { configDotenv } from "dotenv";
+configDotenv();

 export async function logCrawl(job_id: string, team_id: string) {
-  if (process.env.USE_DB_AUTHENTICATION === 'true') {
+  const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+  if (useDbAuthentication) {
    try {
      const { data, error } = await supabase_service
        .from("bulljobs_teams")
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index 61983be0..4d8ee014 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -4,10 +4,13 @@ import { 
FirecrawlJob } from "../../types"; import { posthog } from "../posthog"; import "dotenv/config"; import { Logger } from "../../lib/logger"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function logJob(job: FirecrawlJob) { try { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { return; } diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 099e4a0b..fbe41653 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -3,12 +3,15 @@ import { ScrapeLog } from "../../types"; import { supabase_service } from "../supabase"; import { PageOptions } from "../../lib/entities"; import { Logger } from "../../lib/logger"; +import { configDotenv } from "dotenv"; +configDotenv(); export async function logScrape( scrapeLog: ScrapeLog, pageOptions?: PageOptions ) { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { Logger.debug("Skipping logging scrape to Supabase"); return; } diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 941b571d..7a698772 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -67,6 +67,6 @@ export function waitForJob(jobId: string, timeout: number) { reject((await getScrapeQueue().getJob(jobId)).failedReason); } } - }, 1000); + }, 500); }) } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 6488759f..ad0e4ad5 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -36,6 +36,8 @@ import { } from "../../src/lib/job-priority"; import { PlanType } from "../types"; import { getJobs } from "../../src/controllers/v1/crawl-status"; +import { configDotenv } from "dotenv"; +configDotenv(); if (process.env.ENV === "production") { initSDK({ diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts index 70ada12b..7636717e 100644 --- a/apps/api/src/services/supabase.ts +++ b/apps/api/src/services/supabase.ts @@ -1,5 +1,7 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; import { Logger } from "../lib/logger"; +import { configDotenv } from "dotenv"; +configDotenv(); // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { @@ -8,8 +10,9 @@ class SupabaseService { constructor() { const supabaseUrl = process.env.SUPABASE_URL; const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; // Only initialize the Supabase client if both URL and Service Token are provided. - if (process.env.USE_DB_AUTHENTICATION === "false") { + if (!useDbAuthentication) { // Warn the user that Authentication is disabled by setting the client to null Logger.warn( "Authentication is disabled. Supabase client will not be initialized." 
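Every USE_DB_AUTHENTICATION read above now goes through the same strict comparison: process.env values are strings, so only the literal "true" enables DB authentication, and an unset variable or "false" both take the disabled path. A hypothetical helper capturing the idiom (illustrative only, not part of this diff):

import { configDotenv } from "dotenv";
configDotenv();

// Env flags are strings: compare against the literal "true" so that
// unset, "false", and anything else all resolve to a real boolean false.
function envFlag(name: string): boolean {
  return process.env[name] === "true";
}

const useDbAuthentication = envFlag("USE_DB_AUTHENTICATION");
if (!useDbAuthentication) {
  // skip Supabase-backed auth, warn once, etc.
}
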
diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts index 56dd5c58..06e5649d 100644 --- a/apps/api/src/services/webhook.ts +++ b/apps/api/src/services/webhook.ts @@ -3,6 +3,8 @@ import { legacyDocumentConverter } from "../../src/controllers/v1/types"; import { Logger } from "../../src/lib/logger"; import { supabase_service } from "./supabase"; import { WebhookEventType } from "../types"; +import { configDotenv } from "dotenv"; +configDotenv(); export const callWebhook = async ( teamId: string, diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e68b3014..7114a625 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.2.1", + "version": "1.2.2", "description": "JavaScript SDK for Firecrawl API", "main": "build/cjs/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 1d1bb4ee..8b16adfb 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -454,20 +454,27 @@ export default class FirecrawlApp { checkInterval: number ): Promise { while (true) { - const statusResponse: AxiosResponse = await this.getRequest( + let statusResponse: AxiosResponse = await this.getRequest( `${this.apiUrl}/v1/crawl/${id}`, headers ); if (statusResponse.status === 200) { - const statusData = statusResponse.data; + let statusData = statusResponse.data; if (statusData.status === "completed") { if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusResponse = await this.getRequest(statusData.next, headers); + statusData = statusResponse.data; + data = data.concat(statusData.data); + } + statusData.data = data; return statusData; } else { throw new Error("Crawl job completed but no data was returned"); } } else if ( - ["active", "paused", "pending", "queued", "scraping"].includes(statusData.status) + ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status) ) { checkInterval = Math.max(checkInterval, 2); await new Promise((resolve) => diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 4b3807be..f178cd61 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp -__version__ = "1.2.1" +__version__ = "1.2.3" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 75245e8d..254f4c70 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -238,7 +238,6 @@ class FirecrawlApp: ) if response.status_code == 200: response = response.json() - print(response) if response['success'] and 'links' in response: return response['links'] else: @@ -346,6 +345,12 @@ class FirecrawlApp: status_data = status_response.json() if status_data['status'] == 'completed': if 'data' in status_data: + data = status_data['data'] + while 'next' in status_data: + status_response = self._get_request(status_data['next'], headers) + status_data = status_response.json() + data.extend(status_data['data']) + status_data['data'] = data return status_data else: raise Exception('Crawl job completed but no data was returned') diff --git a/apps/test-suite/utils/supabase.ts 
b/apps/test-suite/utils/supabase.ts index abf7fd78..a1549e24 100644 --- a/apps/test-suite/utils/supabase.ts +++ b/apps/test-suite/utils/supabase.ts @@ -1,5 +1,6 @@ import { createClient, SupabaseClient } from "@supabase/supabase-js"; -import "dotenv/config"; +import { configDotenv } from "dotenv"; +configDotenv(); // SupabaseService class initializes the Supabase client conditionally based on environment variables. class SupabaseService { @@ -9,7 +10,8 @@ class SupabaseService { const supabaseUrl = process.env.SUPABASE_URL; const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN; // Only initialize the Supabase client if both URL and Service Token are provided. - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { // Warn the user that Authentication is disabled by setting the client to null console.warn( "Authentication is disabled. Supabase client will not be initialized." @@ -36,7 +38,8 @@ export const supabase_service: SupabaseClient = new Proxy( new SupabaseService(), { get: function (target, prop, receiver) { - if (process.env.USE_DB_AUTHENTICATION === "false") { + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; + if (!useDbAuthentication) { console.debug( "Attempted to access Supabase client when it's not configured." );
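
The billing hunks in scrape.ts, search.ts, map.ts, and runWebScraper.ts above all make the same move: billTeam is no longer awaited on the request path, so a billing failure can no longer turn an otherwise successful response into a 402. The pattern, as it appears in the scrape controller, reduced to its essentials (billTeam and Logger as defined in this codebase):

// Fire-and-forget: intentionally not awaited. The .catch is required,
// since a rejected promise would otherwise become an unhandled rejection.
billTeam(team_id, creditsToBeBilled).catch((error) => {
  Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
  // Optionally, notify an admin or add to a retry queue here
});

The trade-off is that a failed charge is only logged, which is why each call site keeps the note about an admin notification or retry queue.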