diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b2e42e4a..ff22858b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -28,6 +28,7 @@ env:
HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }}
HDX_NODE_BETA_MODE: 1
FIRE_ENGINE_BETA_URL: ${{ secrets.FIRE_ENGINE_BETA_URL }}
+ USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
jobs:
diff --git a/.github/workflows/fly.yml b/.github/workflows/fly.yml
index ba4a099e..9209309f 100644
--- a/.github/workflows/fly.yml
+++ b/.github/workflows/fly.yml
@@ -28,6 +28,7 @@ env:
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }}
SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }}
+ USE_DB_AUTHENTICATION: ${{ secrets.USE_DB_AUTHENTICATION }}
jobs:
pre-deploy-e2e-tests:
@@ -57,6 +58,9 @@ jobs:
run: npm run workers &
working-directory: ./apps/api
id: start_workers
+ - name: Wait for the application to be ready
+ run: |
+ sleep 10
- name: Run E2E tests
run: |
npm run test:prod
@@ -338,6 +342,7 @@ jobs:
build-and-publish-rust-sdk:
name: Build and publish Rust SDK
runs-on: ubuntu-latest
+ needs: deploy
steps:
- name: Checkout repository
diff --git a/README.md b/README.md
index 89ed0127..63dd6ea5 100644
--- a/README.md
+++ b/README.md
@@ -391,7 +390,7 @@ With LLM extraction, you can easily extract structured data from any URL. We sup
from firecrawl.firecrawl import FirecrawlApp
-app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
class ArticleSchema(BaseModel):
title: str
@@ -466,8 +465,7 @@ import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";
const app = new FirecrawlApp({
- apiKey: "fc-YOUR_API_KEY",
- version: "v0"
+ apiKey: "fc-YOUR_API_KEY"
});
// Define schema to extract contents into
diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
index dd7d4f16..8aabf748 100644
--- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
@@ -1,11 +1,11 @@
import request from "supertest";
-import dotenv from "dotenv";
+import { configDotenv } from "dotenv";
import {
ScrapeRequest,
ScrapeResponseRequestTest,
} from "../../controllers/v1/types";
-dotenv.config();
+configDotenv();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for v1 API Routes", () => {
@@ -22,6 +22,10 @@ describe("E2E Tests for v1 API Routes", () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
"/is-production"
);
+
+      const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+      console.log('USE_DB_AUTHENTICATION:', process.env.USE_DB_AUTHENTICATION, '->', useDbAuthentication);
+
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("isProduction");
});
@@ -29,9 +36,10 @@ describe("E2E Tests for v1 API Routes", () => {
describe("POST /v1/scrape", () => {
it.concurrent("should require authorization", async () => {
- const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
- "/v1/scrape"
- );
+ const response: ScrapeResponseRequestTest = await request(TEST_URL)
+ .post("/v1/scrape")
+        .send({ url: "https://firecrawl.dev" });
+
expect(response.statusCode).toBe(401);
});
@@ -389,7 +397,7 @@ describe("E2E Tests for v1 API Routes", () => {
const scrapeRequest: ScrapeRequest = {
url: "https://ycombinator.com/companies",
formats: ["markdown"],
- waitFor: 5000
+ waitFor: 8000
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
@@ -451,9 +459,9 @@ describe("E2E Tests for v1 API Routes", () => {
describe("POST /v1/map", () => {
it.concurrent("should require authorization", async () => {
- const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
- "/v1/map"
- );
+ const response: ScrapeResponseRequestTest = await request(TEST_URL)
+ .post("/v1/map")
+ .send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
@@ -534,7 +542,9 @@ describe("POST /v1/map", () => {
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
- expect(links[0]).toContain("docs.firecrawl.dev");
+
+ const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
+ expect(containsDocsFirecrawlDev).toBe(true);
});
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
@@ -559,7 +569,9 @@ describe("POST /v1/map", () => {
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
- expect(links[0]).toContain("docs.firecrawl.dev");
+
+ const containsDocsFirecrawlDev = links.some((link: string) => link.includes("docs.firecrawl.dev"));
+ expect(containsDocsFirecrawlDev).toBe(true);
}, 10000)
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
@@ -609,9 +621,9 @@ describe("POST /v1/map", () => {
describe("POST /v1/crawl", () => {
it.concurrent("should require authorization", async () => {
- const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
- "/v1/crawl"
- );
+ const response: ScrapeResponseRequestTest = await request(TEST_URL)
+ .post("/v1/crawl")
+ .send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
@@ -863,7 +875,7 @@ describe("GET /v1/crawl/:jobId", () => {
.post("/v1/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
- .send({ url: "https://docs.mendable.ai" });
+ .send({ url: "https://docs.firecrawl.dev" });
expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false;
@@ -893,9 +905,7 @@ describe("GET /v1/crawl/:jobId", () => {
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
- expect(completedResponse.body.data[0].metadata.statusCode).toBe(
- 200
- );
+ expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
expect(
completedResponse.body.data[0].metadata.error
).toBeUndefined();
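Two test-hardening patterns recur in this file: the authorization tests now `.send()` a valid body, so the 401 is asserted against a request that would otherwise pass validation, and the map assertions switch from inspecting `links[0]` to `links.some(...)`, which is robust to result ordering. A minimal sketch of the second pattern, with hypothetical data:

```typescript
// Order-independent assertion: pass if any returned link matches, no matter
// how the search backend happens to rank results.
const links: string[] = ["https://firecrawl.dev", "https://docs.firecrawl.dev/sdks"];
const containsDocsFirecrawlDev = links.some((link) => link.includes("docs.firecrawl.dev"));
expect(containsDocsFirecrawlDev).toBe(true); // expect(links[0]).toContain(...) could fail here
```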
diff --git a/apps/api/src/__tests__/e2e_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
index 330f8130..26caf63e 100644
--- a/apps/api/src/__tests__/e2e_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_withAuth/index.test.ts
@@ -659,7 +659,7 @@ describe("E2E Tests for v0 API Routes", () => {
.post("/v0/crawl")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
- .send({ url: "https://mendable.ai/blog" });
+ .send({ url: "https://firecrawl.dev/blog" });
expect(crawlResponse.statusCode).toBe(200);
let isCompleted = false;
@@ -689,10 +689,8 @@ describe("E2E Tests for v0 API Routes", () => {
expect(completedResponse.body.data[0]).toHaveProperty("content");
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
- expect(completedResponse.body.data[0].content).toContain("Mendable");
- expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(
- 200
- );
+ expect(completedResponse.body.data[0].content).toContain("Firecrawl");
+ expect(completedResponse.body.data[0].metadata.pageStatusCode).toBe(200);
expect(
completedResponse.body.data[0].metadata.pageError
).toBeUndefined();
@@ -701,7 +699,7 @@ describe("E2E Tests for v0 API Routes", () => {
(doc) =>
doc.metadata &&
doc.metadata.sourceURL &&
- doc.metadata.sourceURL.includes("mendable.ai/blog")
+ doc.metadata.sourceURL.includes("firecrawl.dev/blog")
);
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
diff --git a/apps/api/src/controllers/v0/crawl-cancel.ts b/apps/api/src/controllers/v0/crawl-cancel.ts
index bf1c2d0a..efcd454a 100644
--- a/apps/api/src/controllers/v0/crawl-cancel.ts
+++ b/apps/api/src/controllers/v0/crawl-cancel.ts
@@ -5,6 +5,8 @@ import { supabase_service } from "../../../src/services/supabase";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
import * as Sentry from "@sentry/node";
+import { configDotenv } from "dotenv";
+configDotenv();
export async function crawlCancelController(req: Request, res: Response) {
try {
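This is the first of many identical migrations in this PR: the side-effect import `import "dotenv/config"` (or a default `dotenv.config()`) becomes an explicit named-import call at module top. A minimal sketch of the normalized form:

```typescript
// Explicitly load .env before any env-dependent reads, so the module behaves
// the same regardless of which entry point (worker, test, controller) imports it first.
import { configDotenv } from "dotenv";
configDotenv();

const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
```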
diff --git a/apps/api/src/controllers/v0/crawl-status.ts b/apps/api/src/controllers/v0/crawl-status.ts
index b0649cd0..a3f3f16f 100644
--- a/apps/api/src/controllers/v0/crawl-status.ts
+++ b/apps/api/src/controllers/v0/crawl-status.ts
@@ -6,6 +6,8 @@ import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node";
+import { configDotenv } from "dotenv";
+configDotenv();
export async function getJobs(ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts
index 40df5021..bc91da18 100644
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@@ -244,14 +244,10 @@ export async function scrapeController(req: Request, res: Response) {
}
if (creditsToBeBilled > 0) {
// billing for doc done on queue end, bill only for llm extraction
- const billingResult = await billTeam(team_id, creditsToBeBilled);
- if (!billingResult.success) {
- return res.status(402).json({
- success: false,
- error:
- "Failed to bill team. Insufficient credits or subscription not found.",
- });
- }
+ billTeam(team_id, creditsToBeBilled).catch(error => {
+ Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`);
+ // Optionally, you could notify an admin or add to a retry queue here
+ });
}
}
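This billing change repeats in search.ts, map.ts, v1/scrape.ts, and runWebScraper.ts below: instead of awaiting `billTeam` and failing the request with a 402 when billing fails, the call becomes fire-and-forget. A minimal sketch of the pattern; `billTeam` and `Logger` are declared here as stand-ins for the real services/billing and lib/logger exports:

```typescript
// Stand-ins for the repo's billing service and logger.
declare function billTeam(teamId: string, credits: number): Promise<{ success: boolean }>;
declare const Logger: { error(msg: string): void };

function billInBackground(teamId: string, credits: number): void {
  // Fire-and-forget: the response is no longer gated on the billing write,
  // and a billing failure no longer becomes a 402 for the caller.
  billTeam(teamId, credits).catch((error) => {
    // The .catch is essential: a rejected, un-awaited promise would otherwise
    // surface as an unhandled rejection and could take down the process.
    Logger.error(`Failed to bill team ${teamId} for ${credits} credits: ${error}`);
  });
}
```

The trade-off is spelled out in the comments the PR leaves behind: failures are only logged, so under-billing is possible until an admin notification or retry queue exists.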
diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts
index 825abbe1..5ef2b767 100644
--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@@ -54,18 +54,10 @@ export async function searchHelper(
if (justSearch) {
- const billingResult = await billTeam(
- team_id,
- res.length
- );
- if (!billingResult.success) {
- return {
- success: false,
- error:
- "Failed to bill team. Insufficient credits or subscription not found.",
- returnCode: 402,
- };
- }
+ billTeam(team_id, res.length).catch(error => {
+ Logger.error(`Failed to bill team ${team_id} for ${res.length} credits: ${error}`);
+ // Optionally, you could notify an admin or add to a retry queue here
+ });
return { success: true, data: res, returnCode: 200 };
}
diff --git a/apps/api/src/controllers/v1/crawl-cancel.ts b/apps/api/src/controllers/v1/crawl-cancel.ts
index 06a5b26e..21fc7cf9 100644
--- a/apps/api/src/controllers/v1/crawl-cancel.ts
+++ b/apps/api/src/controllers/v1/crawl-cancel.ts
@@ -5,6 +5,8 @@ import { supabase_service } from "../../services/supabase";
import { Logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node";
+import { configDotenv } from "dotenv";
+configDotenv();
export async function crawlCancelController(req: Request, res: Response) {
try {
diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts
index 845f616c..05144a9b 100644
--- a/apps/api/src/controllers/v1/crawl-status.ts
+++ b/apps/api/src/controllers/v1/crawl-status.ts
@@ -3,6 +3,8 @@ import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentCo
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
+import { configDotenv } from "dotenv";
+configDotenv();
export async function getJob(id: string) {
const job = await getScrapeQueue().getJob(id);
diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 32294a0f..e6abd9ae 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -18,6 +18,7 @@ import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { performCosineSimilarity } from "../../lib/map-cosine";
+import { Logger } from "../../lib/logger";
configDotenv();
@@ -61,8 +62,8 @@ export async function mapController(
: `site:${req.body.url}`;
// www. seems to exclude subdomains in some cases
const mapResults = await fireEngineMap(mapUrl, {
- // limit to 50 results (beta)
- numResults: Math.min(limit, 50),
+ // limit to 100 results (beta)
+ numResults: Math.min(limit, 100),
});
if (mapResults.length > 0) {
@@ -100,7 +101,10 @@ export async function mapController(
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
- await billTeam(req.auth.team_id, 1);
+ billTeam(req.auth.team_id, 1).catch(error => {
+ Logger.error(`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`);
+ // Optionally, you could notify an admin or add to a retry queue here
+ });
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -127,5 +131,6 @@ export async function mapController(
return res.status(200).json({
success: true,
links: linksToReturn,
+ scrape_id: req.body.origin?.includes("website") ? id : undefined,
});
}
diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts
index c573e100..0835cc2a 100644
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -106,14 +106,10 @@ export async function scrapeController(
creditsToBeBilled = 50;
}
- const billingResult = await billTeam(req.auth.team_id, creditsToBeBilled);
- if (!billingResult.success) {
- return res.status(402).json({
- success: false,
- error:
- "Failed to bill team. Insufficient credits or subscription not found.",
- });
- }
+ billTeam(req.auth.team_id, creditsToBeBilled).catch(error => {
+ Logger.error(`Failed to bill team ${req.auth.team_id} for ${creditsToBeBilled} credits: ${error}`);
+ // Optionally, you could notify an admin or add to a retry queue here
+ });
if (!pageOptions || !pageOptions.includeRawHtml) {
if (doc && doc.rawHtml) {
@@ -147,5 +143,6 @@ export async function scrapeController(
return res.status(200).json({
success: true,
data: legacyDocumentConverter(doc),
+ scrape_id: origin?.includes("website") ? jobId : undefined,
});
}
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 85bd625f..c4e0cf84 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -225,6 +225,7 @@ export type ScrapeResponse =
success: true;
warning?: string;
data: Document;
+ scrape_id?: string;
};
export interface ScrapeResponseRequestTest {
@@ -246,6 +247,7 @@ export type MapResponse =
| {
success: true;
links: string[];
+ scrape_id?: string;
};
export type CrawlStatusParams = {
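These new optional `scrape_id` fields pair with the controller changes above: the value is set conditionally and simply omitted from the JSON body otherwise. A small sketch of why `undefined` works as an "omit this key" signal:

```typescript
// JSON.stringify drops keys whose value is undefined, so scrape_id only
// appears in the payload when the request origin mentions "website".
const origin: string | undefined = "website-playground"; // hypothetical origin value
const body = {
  success: true,
  links: ["https://docs.firecrawl.dev"],
  scrape_id: origin?.includes("website") ? "job-1234" : undefined,
};
console.log(JSON.stringify(body));
// {"success":true,"links":["https://docs.firecrawl.dev"],"scrape_id":"job-1234"}
```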
diff --git a/apps/api/src/lib/logger.ts b/apps/api/src/lib/logger.ts
index 872dbf51..cb8b4119 100644
--- a/apps/api/src/lib/logger.ts
+++ b/apps/api/src/lib/logger.ts
@@ -1,3 +1,6 @@
+import { configDotenv } from "dotenv";
+configDotenv();
+
enum LogLevel {
NONE = 'NONE', // No logs will be output.
ERROR = 'ERROR', // For logging error messages that indicate a failure in a specific operation.
@@ -25,7 +28,8 @@ export class Logger {
const color = Logger.colors[level];
console[level.toLowerCase()](color, `[${new Date().toISOString()}]${level} - ${message}`);
- // if (process.env.USE_DB_AUTH) {
+ // const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+ // if (useDbAuthentication) {
// save to supabase? another place?
// supabase.from('logs').insert({ level: level, message: message, timestamp: new Date().toISOString(), success: boolean });
// }
diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts
index 04850b4e..ad70dfef 100644
--- a/apps/api/src/lib/scrape-events.ts
+++ b/apps/api/src/lib/scrape-events.ts
@@ -2,6 +2,8 @@ import { Job } from "bullmq";
import type { baseScrapers } from "../scraper/WebScraper/single_url";
import { supabase_service as supabase } from "../services/supabase";
import { Logger } from "./logger";
+import { configDotenv } from "dotenv";
+configDotenv();
export type ScrapeErrorEvent = {
type: "error",
@@ -36,7 +38,8 @@ export class ScrapeEvents {
static async insert(jobId: string, content: ScrapeEvent) {
if (jobId === "TEST") return null;
- if (process.env.USE_DB_AUTHENTICATION) {
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+ if (useDbAuthentication) {
try {
const result = await supabase.from("scrape_events").insert({
job_id: jobId,
diff --git a/apps/api/src/lib/withAuth.ts b/apps/api/src/lib/withAuth.ts
index 353c144b..b45b8973 100644
--- a/apps/api/src/lib/withAuth.ts
+++ b/apps/api/src/lib/withAuth.ts
@@ -1,5 +1,8 @@
import { AuthResponse } from "../../src/types";
import { Logger } from "./logger";
+import * as Sentry from "@sentry/node";
+import { configDotenv } from "dotenv";
+configDotenv();
let warningCount = 0;
@@ -7,7 +10,8 @@ export function withAuth<T extends AuthResponse, U extends any[]>(
  originalFunction: (...args: U) => Promise<T>
) {
  return async function (...args: U): Promise<T> {
- if (process.env.USE_DB_AUTHENTICATION === "false") {
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+ if (!useDbAuthentication) {
if (warningCount < 5) {
Logger.warn("You're bypassing authentication");
warningCount++;
@@ -17,6 +21,7 @@ export function withAuth(
try {
return await originalFunction(...args);
} catch (error) {
+ Sentry.captureException(error);
Logger.error(`Error in withAuth function: ${error}`);
return { success: false, error: error.message } as T;
}
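withAuth also flips the flag test: the old code checked `=== "false"`, which treats an unset variable as "authentication on", while the new `=== 'true'` check defaults to "off" unless explicitly enabled. Since environment variables are always strings, this normalization matters; a short sketch:

```typescript
// Environment variables are strings (or undefined), so truthiness is a trap:
process.env.USE_DB_AUTHENTICATION = "false";
console.log(Boolean(process.env.USE_DB_AUTHENTICATION));   // true  ("false" is a non-empty string)
console.log(process.env.USE_DB_AUTHENTICATION === "true"); // false (the normalized check)

// Unset behaves safely under the new convention:
delete process.env.USE_DB_AUTHENTICATION;
console.log(process.env.USE_DB_AUTHENTICATION === "true"); // false -> takes the auth-bypass warning path
```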
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 2b5388c1..f67a1cd0 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -12,6 +12,8 @@ import { Document } from "../lib/entities";
import { supabase_service } from "../services/supabase";
import { Logger } from "../lib/logger";
import { ScrapeEvents } from "../lib/scrape-events";
+import { configDotenv } from "dotenv";
+configDotenv();
export async function startWebScraperPipeline({
job,
@@ -118,15 +120,10 @@ export async function runWebScraper({
: docs;
if(is_scrape === false) {
- const billingResult = await billTeam(team_id, filteredDocs.length);
- if (!billingResult.success) {
- // throw new Error("Failed to bill team, no subscription was found");
- return {
- success: false,
- message: "Failed to bill team, no subscription was found",
- docs: [],
- };
- }
+ billTeam(team_id, filteredDocs.length).catch(error => {
+ Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
+ // Optionally, you could notify an admin or add to a retry queue here
+ });
}
@@ -144,7 +141,8 @@ export async function runWebScraper({
const saveJob = async (job: Job, result: any, token: string, mode: string) => {
try {
- if (process.env.USE_DB_AUTHENTICATION === "true") {
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+ if (useDbAuthentication) {
const { data, error } = await supabase_service
.from("firecrawl_jobs")
.update({ docs: result })
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index 9dcbf111..daa9bf43 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -33,7 +33,9 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
if (!success) {
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
- return res.status(402).json({ success: false, error: "Insufficient credits" });
+ if (!res.headersSent) {
+ return res.status(402).json({ success: false, error: "Insufficient credits" });
+ }
}
req.account = { remainingCredits }
next();
@@ -52,7 +54,9 @@ export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestW
);
if (!success) {
- return res.status(status).json({ success: false, error });
+ if (!res.headersSent) {
+ return res.status(status).json({ success: false, error });
+ }
}
req.auth = { team_id, plan };
@@ -67,7 +71,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
if (req.headers["x-idempotency-key"]) {
const isIdempotencyValid = await validateIdempotencyKey(req);
if (!isIdempotencyValid) {
- return res.status(409).json({ success: false, error: "Idempotency key already used" });
+ if (!res.headersSent) {
+ return res.status(409).json({ success: false, error: "Idempotency key already used" });
+ }
}
createIdempotencyKey(req);
}
@@ -78,7 +84,9 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (req.body.url && isUrlBlocked(req.body.url)) {
- return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
+ if (!res.headersSent) {
+ return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
+ }
}
next();
}
@@ -96,26 +104,26 @@ export const v1Router = express.Router();
v1Router.post(
"/scrape",
- blocklistMiddleware,
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
+ blocklistMiddleware,
wrap(scrapeController)
);
v1Router.post(
"/crawl",
- blocklistMiddleware,
authMiddleware(RateLimiterMode.Crawl),
- idempotencyMiddleware,
checkCreditsMiddleware(),
+ blocklistMiddleware,
+ idempotencyMiddleware,
wrap(crawlController)
);
v1Router.post(
"/map",
- blocklistMiddleware,
authMiddleware(RateLimiterMode.Map),
checkCreditsMiddleware(1),
+ blocklistMiddleware,
wrap(mapController)
);
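Two route-level fixes land here. The `res.headersSent` guards keep a middleware from attempting a second response when an earlier one has already replied (which would throw Express's "Cannot set headers after they are sent"), and the reordering runs authentication and credit checks before the blocklist, so unauthenticated callers get a 401 rather than a URL-dependent 403. A sketch of the guard pattern, with `isUrlBlocked` declared as a stand-in for the real helper:

```typescript
import { Request, Response, NextFunction } from "express";

declare function isUrlBlocked(url: string): boolean; // stand-in for the real helper

function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
  if (req.body.url && isUrlBlocked(req.body.url)) {
    if (!res.headersSent) {
      // Only respond if nothing upstream already has; responding twice throws.
      return res.status(403).json({ success: false, error: "URL is blocked." });
    }
  }
  next();
}
```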
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index adf7e53c..11e1fe37 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -23,12 +23,15 @@ import { clientSideError } from "../../strings";
dotenv.config();
+const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
+const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
+
export const baseScrapers = [
- "fire-engine;chrome-cdp",
- "fire-engine",
- "scrapingBee",
- process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
- "scrapingBeeLoad",
+ useFireEngine ? "fire-engine;chrome-cdp" : undefined,
+ useFireEngine ? "fire-engine" : undefined,
+ useScrapingBee ? "scrapingBee" : undefined,
+ useFireEngine ? undefined : "playwright",
+ useScrapingBee ? "scrapingBeeLoad" : undefined,
"fetch",
].filter(Boolean);
@@ -85,18 +88,18 @@ function getScrapingFallbackOrder(
});
let defaultOrder = [
- !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine;chrome-cdp",
- !process.env.USE_DB_AUTHENTICATION ? undefined : "fire-engine",
- "scrapingBee",
- process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
- "scrapingBeeLoad",
+ useFireEngine ? "fire-engine;chrome-cdp" : undefined,
+ useFireEngine ? "fire-engine" : undefined,
+ useScrapingBee ? "scrapingBee" : undefined,
+ useScrapingBee ? "scrapingBeeLoad" : undefined,
+ useFireEngine ? undefined : "playwright",
"fetch",
].filter(Boolean);
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
defaultOrder = [
"fire-engine",
- process.env.USE_DB_AUTHENTICATION ? undefined : "playwright",
+ useFireEngine ? undefined : "playwright",
...defaultOrder.filter(
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
),
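The scraper lists now key off which providers are actually configured rather than off USE_DB_AUTHENTICATION, and both rely on the same idiom: build the ordered list with `undefined` placeholders, then `filter(Boolean)`. One TypeScript nuance about that idiom, sketched below: `filter(Boolean)` does not narrow the element type by itself.

```typescript
const useFireEngine = true; // assume the env checks above resolved this way

const order = [
  useFireEngine ? "fire-engine" : undefined,
  "fetch",
].filter(Boolean);
// Inferred type is still (string | undefined)[]; a type-guard predicate narrows it:
const narrowed = [
  useFireEngine ? "fire-engine" : undefined,
  "fetch",
].filter((s): s is string => Boolean(s)); // string[], priority order preserved
```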
diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts
index 22dc72df..53031de9 100644
--- a/apps/api/src/services/billing/credit_billing.ts
+++ b/apps/api/src/services/billing/credit_billing.ts
@@ -5,7 +5,7 @@ import { supabase_service } from "../supabase";
import { Logger } from "../../lib/logger";
import { getValue, setValue } from "../redis";
import { redlock } from "../redlock";
-
+import * as Sentry from "@sentry/node";
const FREE_CREDITS = 500;
@@ -40,14 +40,15 @@ export async function supaBillTeam(team_id: string, credits: number) {
]);
let couponCredits = 0;
+ let sortedCoupons = [];
+
if (coupons && coupons.length > 0) {
couponCredits = coupons.reduce(
(total, coupon) => total + coupon.credits,
0
);
+ sortedCoupons = [...coupons].sort((a, b) => b.credits - a.credits);
}
-
- let sortedCoupons = coupons.sort((a, b) => b.credits - a.credits);
// using coupon credits:
if (couponCredits > 0) {
// if there is no subscription and they have enough coupon credits
@@ -175,9 +176,24 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
}
- // Retrieve the team's active subscription and check for available coupons concurrently
- const [{ data: subscription, error: subscriptionError }, { data: coupons }] =
- await Promise.all([
+
+ let cacheKeySubscription = `subscription_${team_id}`;
+ let cacheKeyCoupons = `coupons_${team_id}`;
+
+ // Try to get data from cache first
+ const [cachedSubscription, cachedCoupons] = await Promise.all([
+ getValue(cacheKeySubscription),
+ getValue(cacheKeyCoupons)
+ ]);
+
+ let subscription, subscriptionError, coupons;
+
+ if (cachedSubscription && cachedCoupons) {
+ subscription = JSON.parse(cachedSubscription);
+ coupons = JSON.parse(cachedCoupons);
+ } else {
+ // If not in cache, retrieve from database
+ const [subscriptionResult, couponsResult] = await Promise.all([
supabase_service
.from("subscriptions")
.select("id, price_id, current_period_start, current_period_end")
@@ -191,6 +207,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
.eq("status", "active"),
]);
+ subscription = subscriptionResult.data;
+ subscriptionError = subscriptionResult.error;
+ coupons = couponsResult.data;
+
+ // Cache the results for a minute, sub can be null and that's fine
+ await setValue(cacheKeySubscription, JSON.stringify(subscription), 60); // Cache for 1 minute, even if null
+ await setValue(cacheKeyCoupons, JSON.stringify(coupons), 60); // Cache for 1 minute
+
+ }
+
let couponCredits = 0;
if (coupons && coupons.length > 0) {
couponCredits = coupons.reduce(
@@ -211,41 +237,54 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
let creditUsages;
let creditUsageError;
- let retries = 0;
- const maxRetries = 3;
- const retryInterval = 2000; // 2 seconds
+ let totalCreditsUsed = 0;
+ const cacheKeyCreditUsage = `credit_usage_${team_id}`;
- while (retries < maxRetries) {
- const result = await supabase_service
- .from("credit_usage")
- .select("credits_used")
- .is("subscription_id", null)
- .eq("team_id", team_id);
+ // Try to get credit usage from cache
+ const cachedCreditUsage = await getValue(cacheKeyCreditUsage);
- creditUsages = result.data;
- creditUsageError = result.error;
+ if (cachedCreditUsage) {
+ totalCreditsUsed = parseInt(cachedCreditUsage);
+ } else {
+ let retries = 0;
+ const maxRetries = 3;
+ const retryInterval = 2000; // 2 seconds
- if (!creditUsageError) {
- break;
+ while (retries < maxRetries) {
+ const result = await supabase_service
+ .from("credit_usage")
+ .select("credits_used")
+ .is("subscription_id", null)
+ .eq("team_id", team_id);
+
+ creditUsages = result.data;
+ creditUsageError = result.error;
+
+ if (!creditUsageError) {
+ break;
+ }
+
+ retries++;
+ if (retries < maxRetries) {
+ await new Promise(resolve => setTimeout(resolve, retryInterval));
+ }
}
- retries++;
- if (retries < maxRetries) {
- await new Promise(resolve => setTimeout(resolve, retryInterval));
+ if (creditUsageError) {
+ Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
+ throw new Error(
+ `Failed to retrieve credit usage for team_id: ${team_id}`
+ );
}
- }
- if (creditUsageError) {
- Logger.error(`Credit usage error after ${maxRetries} attempts: ${creditUsageError}`);
- throw new Error(
- `Failed to retrieve credit usage for team_id: ${team_id}`
+ totalCreditsUsed = creditUsages.reduce(
+ (acc, usage) => acc + usage.credits_used,
+ 0
);
- }
- const totalCreditsUsed = creditUsages.reduce(
- (acc, usage) => acc + usage.credits_used,
- 0
- );
+ // Cache the result for 30 seconds
+ await setValue(cacheKeyCreditUsage, totalCreditsUsed.toString(), 30);
+ }
Logger.info(`totalCreditsUsed: ${totalCreditsUsed}`);
@@ -255,7 +294,9 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
const creditLimit = FREE_CREDITS;
const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit;
- if (creditUsagePercentage >= 0.8) {
+ // Add a check to ensure totalCreditsUsed is greater than 0
+ if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
+ Logger.info(`Sending notification for team ${team_id}. Total credits used: ${totalCreditsUsed}, Credit usage percentage: ${creditUsagePercentage}`);
await sendNotification(
team_id,
NotificationType.APPROACHING_LIMIT,
@@ -309,7 +350,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (creditUsages && creditUsages.length > 0) {
totalCreditsUsed = creditUsages[0].total_credits_used;
- await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes
+    await setValue(cacheKey, totalCreditsUsed.toString(), 500); // Cache for 500 seconds (~8 minutes)
// Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`);
}
}
@@ -322,17 +363,38 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
// Adjust total credits used by subtracting coupon value
const adjustedCreditsUsed = Math.max(0, totalCreditsUsed - couponCredits);
- // Get the price details
- const { data: price, error: priceError } = await supabase_service
- .from("prices")
- .select("credits")
- .eq("id", subscription.price_id)
- .single();
- if (priceError) {
- throw new Error(
- `Failed to retrieve price for price_id: ${subscription.price_id}`
- );
+ // Get the price details from cache or database
+ const priceCacheKey = `price_${subscription.price_id}`;
+ let price;
+
+ try {
+ const cachedPrice = await getValue(priceCacheKey);
+ if (cachedPrice) {
+ price = JSON.parse(cachedPrice);
+ } else {
+ const { data, error: priceError } = await supabase_service
+ .from("prices")
+ .select("credits")
+ .eq("id", subscription.price_id)
+ .single();
+
+ if (priceError) {
+ throw new Error(
+ `Failed to retrieve price for price_id: ${subscription.price_id}`
+ );
+ }
+
+ price = data;
+ // There are only 21 records, so this is super fine
+ // Cache the price for a long time (e.g., 1 day)
+ await setValue(priceCacheKey, JSON.stringify(price), 86400);
+ }
+ } catch (error) {
+ Logger.error(`Error retrieving or caching price: ${error}`);
+ Sentry.captureException(error);
+    // If this errors, just assume a big number so the user doesn't get an error
+ price = { credits: 1000000 };
}
const creditLimit = price.credits;
@@ -462,8 +524,8 @@ async function createCreditUsage({
subscription_id?: string;
credits: number;
}) {
- const { data: credit_usage } = await supabase_service
- .from("credit_usage")
+ await supabase_service
+ .from("credit_usage")
.insert([
{
team_id,
@@ -471,8 +533,7 @@ async function createCreditUsage({
subscription_id: subscription_id || null,
created_at: new Date(),
},
- ])
- .select();
+ ]);
- return { success: true, credit_usage };
+ return { success: true };
}
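The subscription, coupon, credit-usage, and price lookups above all follow the same cache-aside shape over the repo's existing `getValue`/`setValue` Redis helpers, with TTLs scaled to volatility: 60 s for subscriptions and coupons, 30 s for usage, a day for the near-static price table. A generic sketch of the shape, with `load` as a hypothetical database fetch:

```typescript
import { getValue, setValue } from "../redis";

// Cache-aside: serve from Redis when possible, fall back to the database,
// then populate the cache for the next caller.
async function cached<T>(key: string, ttlSeconds: number, load: () => Promise<T>): Promise<T> {
  const hit = await getValue(key);
  if (hit) {
    return JSON.parse(hit) as T; // note: cached "null" parses back to null, which is fine here
  }
  const fresh = await load();
  await setValue(key, JSON.stringify(fresh), ttlSeconds);
  return fresh;
}

// e.g. price rows change rarely, so a long TTL is safe:
// const price = await cached(`price_${priceId}`, 86400, fetchPriceFromDb);
```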
diff --git a/apps/api/src/services/logging/crawl_log.ts b/apps/api/src/services/logging/crawl_log.ts
index 68008e02..3850e05b 100644
--- a/apps/api/src/services/logging/crawl_log.ts
+++ b/apps/api/src/services/logging/crawl_log.ts
@@ -1,9 +1,11 @@
import { supabase_service } from "../supabase";
import { Logger } from "../../../src/lib/logger";
-import "dotenv/config";
+import { configDotenv } from "dotenv";
+configDotenv();
export async function logCrawl(job_id: string, team_id: string) {
- if (process.env.USE_DB_AUTHENTICATION === 'true') {
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+ if (useDbAuthentication) {
try {
const { data, error } = await supabase_service
.from("bulljobs_teams")
diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts
index 61983be0..4d8ee014 100644
--- a/apps/api/src/services/logging/log_job.ts
+++ b/apps/api/src/services/logging/log_job.ts
@@ -4,10 +4,13 @@ import { FirecrawlJob } from "../../types";
import { posthog } from "../posthog";
import "dotenv/config";
import { Logger } from "../../lib/logger";
+import { configDotenv } from "dotenv";
+configDotenv();
export async function logJob(job: FirecrawlJob) {
try {
- if (process.env.USE_DB_AUTHENTICATION === "false") {
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+ if (!useDbAuthentication) {
return;
}
diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts
index 099e4a0b..fbe41653 100644
--- a/apps/api/src/services/logging/scrape_log.ts
+++ b/apps/api/src/services/logging/scrape_log.ts
@@ -3,12 +3,15 @@ import { ScrapeLog } from "../../types";
import { supabase_service } from "../supabase";
import { PageOptions } from "../../lib/entities";
import { Logger } from "../../lib/logger";
+import { configDotenv } from "dotenv";
+configDotenv();
export async function logScrape(
scrapeLog: ScrapeLog,
pageOptions?: PageOptions
) {
- if (process.env.USE_DB_AUTHENTICATION === "false") {
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+ if (!useDbAuthentication) {
Logger.debug("Skipping logging scrape to Supabase");
return;
}
diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts
index 941b571d..7a698772 100644
--- a/apps/api/src/services/queue-jobs.ts
+++ b/apps/api/src/services/queue-jobs.ts
@@ -67,6 +67,6 @@ export function waitForJob(jobId: string, timeout: number) {
reject((await getScrapeQueue().getJob(jobId)).failedReason);
}
}
- }, 1000);
+ }, 500);
})
}
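For context, the changed line sits inside a promise-wrapped polling loop; halving the interval from 1000 ms to 500 ms roughly halves the average latency between job completion and API response, at the cost of more Redis round-trips. A simplified sketch of the surrounding shape (abbreviated, not the verbatim waitForJob):

```typescript
import { getScrapeQueue } from "./queue-service"; // BullMQ queue accessor, as above

function waitForJob(jobId: string, timeout: number): Promise<unknown> {
  return new Promise((resolve, reject) => {
    const start = Date.now();
    const int = setInterval(async () => {
      if (Date.now() >= start + timeout) {
        clearInterval(int);
        reject(new Error("Job wait timed out"));
        return;
      }
      const state = await getScrapeQueue().getJobState(jobId);
      if (state === "completed") {
        clearInterval(int);
        resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
      } else if (state === "failed") {
        clearInterval(int);
        reject((await getScrapeQueue().getJob(jobId)).failedReason);
      }
    }, 500); // was 1000 ms
  });
}
```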
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 6488759f..ad0e4ad5 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -36,6 +36,8 @@ import {
} from "../../src/lib/job-priority";
import { PlanType } from "../types";
import { getJobs } from "../../src/controllers/v1/crawl-status";
+import { configDotenv } from "dotenv";
+configDotenv();
if (process.env.ENV === "production") {
initSDK({
diff --git a/apps/api/src/services/supabase.ts b/apps/api/src/services/supabase.ts
index 70ada12b..7636717e 100644
--- a/apps/api/src/services/supabase.ts
+++ b/apps/api/src/services/supabase.ts
@@ -1,5 +1,7 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js";
import { Logger } from "../lib/logger";
+import { configDotenv } from "dotenv";
+configDotenv();
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService {
@@ -8,8 +10,9 @@ class SupabaseService {
constructor() {
const supabaseUrl = process.env.SUPABASE_URL;
const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
// Only initialize the Supabase client if both URL and Service Token are provided.
- if (process.env.USE_DB_AUTHENTICATION === "false") {
+ if (!useDbAuthentication) {
// Warn the user that Authentication is disabled by setting the client to null
Logger.warn(
"Authentication is disabled. Supabase client will not be initialized."
diff --git a/apps/api/src/services/webhook.ts b/apps/api/src/services/webhook.ts
index 56dd5c58..06e5649d 100644
--- a/apps/api/src/services/webhook.ts
+++ b/apps/api/src/services/webhook.ts
@@ -3,6 +3,8 @@ import { legacyDocumentConverter } from "../../src/controllers/v1/types";
import { Logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase";
import { WebhookEventType } from "../types";
+import { configDotenv } from "dotenv";
+configDotenv();
export const callWebhook = async (
teamId: string,
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index e68b3014..7114a625 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
- "version": "1.2.1",
+ "version": "1.2.2",
"description": "JavaScript SDK for Firecrawl API",
"main": "build/cjs/index.js",
"types": "types/index.d.ts",
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 1d1bb4ee..8b16adfb 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -454,20 +454,27 @@ export default class FirecrawlApp {
checkInterval: number
  ): Promise<CrawlStatusResponse> {
while (true) {
- const statusResponse: AxiosResponse = await this.getRequest(
+ let statusResponse: AxiosResponse = await this.getRequest(
`${this.apiUrl}/v1/crawl/${id}`,
headers
);
if (statusResponse.status === 200) {
- const statusData = statusResponse.data;
+ let statusData = statusResponse.data;
if (statusData.status === "completed") {
if ("data" in statusData) {
+ let data = statusData.data;
+ while ('next' in statusData) {
+ statusResponse = await this.getRequest(statusData.next, headers);
+ statusData = statusResponse.data;
+ data = data.concat(statusData.data);
+ }
+ statusData.data = data;
return statusData;
} else {
throw new Error("Crawl job completed but no data was returned");
}
} else if (
- ["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
+ ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
) {
checkInterval = Math.max(checkInterval, 2);
await new Promise((resolve) =>
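The v1 crawl status API returns results in pages, with a `next` URL when more remain; the loop above (and its Python twin below) drains those pages so SDK callers receive the complete document set, and the accepted in-progress statuses gain `"waiting"` so jobs parked in that queue state keep polling instead of erroring. A hedged usage sketch, assuming the public `crawlUrl` wrapper that drives this poll loop:

```typescript
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// crawlUrl polls the job via the loop patched above; once the job completes,
// statusData.data has been concatenated across every `next` page, so large
// crawls no longer appear truncated at the first page of results.
const result = await app.crawlUrl("https://firecrawl.dev", { limit: 100 });
console.log(result); // full, drained result set
```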
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index 4b3807be..f178cd61 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp
-__version__ = "1.2.1"
+__version__ = "1.2.3"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 75245e8d..254f4c70 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -238,7 +238,6 @@ class FirecrawlApp:
)
if response.status_code == 200:
response = response.json()
- print(response)
if response['success'] and 'links' in response:
return response['links']
else:
@@ -346,6 +345,12 @@ class FirecrawlApp:
status_data = status_response.json()
if status_data['status'] == 'completed':
if 'data' in status_data:
+ data = status_data['data']
+ while 'next' in status_data:
+ status_response = self._get_request(status_data['next'], headers)
+ status_data = status_response.json()
+ data.extend(status_data['data'])
+ status_data['data'] = data
return status_data
else:
raise Exception('Crawl job completed but no data was returned')
diff --git a/apps/test-suite/utils/supabase.ts b/apps/test-suite/utils/supabase.ts
index abf7fd78..a1549e24 100644
--- a/apps/test-suite/utils/supabase.ts
+++ b/apps/test-suite/utils/supabase.ts
@@ -1,5 +1,6 @@
import { createClient, SupabaseClient } from "@supabase/supabase-js";
-import "dotenv/config";
+import { configDotenv } from "dotenv";
+configDotenv();
// SupabaseService class initializes the Supabase client conditionally based on environment variables.
class SupabaseService {
@@ -9,7 +10,8 @@ class SupabaseService {
const supabaseUrl = process.env.SUPABASE_URL;
const supabaseServiceToken = process.env.SUPABASE_SERVICE_TOKEN;
// Only initialize the Supabase client if both URL and Service Token are provided.
- if (process.env.USE_DB_AUTHENTICATION === "false") {
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+ if (!useDbAuthentication) {
// Warn the user that Authentication is disabled by setting the client to null
console.warn(
"Authentication is disabled. Supabase client will not be initialized."
@@ -36,7 +38,8 @@ export const supabase_service: SupabaseClient = new Proxy(
new SupabaseService(),
{
get: function (target, prop, receiver) {
- if (process.env.USE_DB_AUTHENTICATION === "false") {
+ const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
+ if (!useDbAuthentication) {
console.debug(
"Attempted to access Supabase client when it's not configured."
);
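Both supabase.ts modules share the guarded-client design that the flag normalization feeds into: the exported client is a Proxy whose property access warns (instead of crashing at import time) when DB authentication is disabled. A condensed sketch of that pattern:

```typescript
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";

// Export a Proxy so modules can import the client unconditionally; misuse is
// caught lazily, at property access, rather than eagerly at import.
const supabaseClient = new Proxy({} as Record<string | symbol, unknown>, {
  get(target, prop, receiver) {
    if (!useDbAuthentication) {
      console.debug("Attempted to access Supabase client when it's not configured.");
    }
    return Reflect.get(target, prop, receiver);
  },
});
```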