Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-10 04:48:59 +08:00)

Merge branch 'main' into mog/concurrency-limit-2
Commit: ea85b1d602
README.md (16 changed lines)
@@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge
- **Media parsing**: pdfs, docx, images.
- **Reliability first**: designed to get the data you need - no matter how hard it is.
- **Actions**: click, scroll, input, wait and more before extracting data
- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint

You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)

@@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
  }'
```

### Batch Scraping Multiple URLs (New)

You can now batch scrape multiple URLs at the same time. It works much like the /crawl endpoint: it submits a batch scrape job and returns a job ID you can use to check the status of the batch scrape.

```bash
curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -d '{
      "urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"],
      "formats" : ["markdown", "html"]
    }'
```
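The response to this request includes a job `id`. As a rough TypeScript sketch (not part of this commit), you could poll the job until it finishes, assuming the status route is `GET /v1/batch/scrape/{id}` and the response carries `status` and `data` fields:

```ts
// Hypothetical status-polling sketch; endpoint path and response shape are assumptions.
const API_KEY = process.env.FIRECRAWL_API_KEY;

async function waitForBatchScrape(jobId: string): Promise<unknown[]> {
  while (true) {
    const res = await fetch(`https://api.firecrawl.dev/v1/batch/scrape/${jobId}`, {
      headers: { Authorization: `Bearer ${API_KEY}` },
    });
    const body = await res.json();
    if (body.status === "completed") return body.data; // scraped documents
    if (body.status === "failed") throw new Error("Batch scrape failed");
    await new Promise((r) => setTimeout(r, 2000)); // poll every 2 seconds
  }
}
```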
### Search (v0) (Beta)

@@ -483,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
  scrapeOptions: {
    formats: ['markdown', 'html'],
  }
} as CrawlParams, true, 30) as CrawlStatusResponse;
} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse;

if (crawlResponse) {
  console.log(crawlResponse)
@@ -1,5 +1,5 @@
# ===== Required ENVS ======
NUM_WORKERS_PER_QUEUE=8
PORT=3002
HOST=0.0.0.0
REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
@@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true

# ===== Optional ENVS ======

# SearchApi key. Head to https://searchapi.com/ to get your API key
SEARCHAPI_API_KEY=
# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
SEARCHAPI_ENGINE=

# Supabase Setup (used to support DB authentication, advanced logging, etc.)
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=

# Other Optionals

@@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
BULL_AUTH_KEY=
LOGTAIL_KEY=
PLAYWRIGHT_MICROSERVICE_URL=

SEARCHAPI_API_KEY=
@@ -75,15 +75,19 @@ export async function setCachedACUC(

export async function getACUC(
  api_key: string,
  cacheOnly = false
  cacheOnly = false,
  useCache = true
): Promise<AuthCreditUsageChunk | null> {
  const cacheKeyACUC = `acuc_${api_key}`;

  const cachedACUC = await getValue(cacheKeyACUC);
  if (useCache) {
    const cachedACUC = await getValue(cacheKeyACUC);
    if (cachedACUC !== null) {
      return JSON.parse(cachedACUC);
    }
  }

  if (cachedACUC !== null) {
    return JSON.parse(cachedACUC);
  } else if (!cacheOnly) {
  if (!cacheOnly) {
    let data;
    let error;
    let retries = 0;
@@ -91,7 +95,7 @@ export async function getACUC(

    while (retries < maxRetries) {
      ({ data, error } = await supabase_service.rpc(
        "auth_credit_usage_chunk_test_3",
        "auth_credit_usage_chunk_test_21_credit_pack",
        { input_key: api_key }
      ));

@@ -118,9 +122,11 @@ export async function getACUC(
      data.length === 0 ? null : data[0].team_id === null ? null : data[0];

    // NOTE: Should we cache null chunks? - mogery
    if (chunk !== null) {
    if (chunk !== null && useCache) {
      setCachedACUC(api_key, chunk);
    }

    // console.log(chunk);

    return chunk;
  } else {
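The new `useCache` flag lets callers force a fresh database read; the auto-recharge flow added later in this commit calls `getACUC(chunk.api_key, false, false)` for exactly this reason. A minimal usage sketch (the import path is assumed):

```ts
// Minimal sketch, not from this commit: check credits with a fresh read, skipping Redis.
import { getACUC } from "./auth";

async function hasCreditsFresh(api_key: string): Promise<boolean> {
  // getACUC(api_key, cacheOnly, useCache) — useCache = false bypasses the cached chunk.
  const chunk = await getACUC(api_key, false, false);
  return chunk !== null && chunk.remaining_credits > 0;
}
```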
@@ -348,9 +354,12 @@ function getPlanByPriceId(price_id: string): PlanType {
      return "standardnew";
    case process.env.STRIPE_PRICE_ID_GROWTH:
    case process.env.STRIPE_PRICE_ID_GROWTH_YEARLY:
    case process.env.STRIPE_PRICE_ID_SCALE_2M:
      return "growth";
    case process.env.STRIPE_PRICE_ID_GROWTH_DOUBLE_MONTHLY:
      return "growthdouble";
    case process.env.STRIPE_PRICE_ID_ETIER2C:
      return "etier2c";
    default:
      return "free";
  }
@@ -4,6 +4,7 @@ import {
  BatchScrapeRequest,
  batchScrapeRequestSchema,
  CrawlResponse,
  legacyExtractorOptions,
  legacyScrapeOptions,
  RequestWithAuth,
} from "./types";
@@ -35,6 +36,8 @@ export async function batchScrapeController(
  }

  const pageOptions = legacyScrapeOptions(req.body);
  const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;

  const sc: StoredCrawl = {
    crawlerOptions: null,
@@ -64,6 +67,7 @@ export async function batchScrapeController(
    plan: req.auth.plan,
    crawlerOptions: null,
    pageOptions,
    extractorOptions,
    origin: "api",
    crawl_id: id,
    sitemapped: true,
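With `extractorOptions` now threaded through the batch scrape controller, a batch scrape request can also carry an `extract` block. A hypothetical request sketch (the exact body fields for extraction are assumptions, not taken from this diff):

```ts
// Hypothetical sketch: batch scrape with LLM extraction enabled.
const res = await fetch("https://api.firecrawl.dev/v1/batch/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    urls: ["https://docs.firecrawl.dev", "https://firecrawl.dev"],
    formats: ["markdown", "extract"],          // assumed format name
    extract: { prompt: "Summarize the page in one sentence." }, // assumed field shape
  }),
});
const { id } = await res.json(); // job ID for status polling
```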
@@ -109,13 +109,26 @@ export const scrapeOptions = z.object({
  extract: extractOptions.optional(),
  parsePDF: z.boolean().default(true),
  actions: actionsSchema.optional(),
  // New
  location: z.object({
    country: z.string().optional().refine(
      (val) => !val || Object.keys(countries).includes(val.toUpperCase()),
      {
        message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
      }
    ).transform(val => val ? val.toUpperCase() : 'US'),
    languages: z.string().array().optional(),
  }).optional(),

  // Deprecated
  geolocation: z.object({
    country: z.string().optional().refine(
      (val) => !val || Object.keys(countries).includes(val.toUpperCase()),
      {
        message: "Invalid country code. Please use a valid ISO 3166-1 alpha-2 country code.",
      }
    ).transform(val => val ? val.toUpperCase() : 'US')
    ).transform(val => val ? val.toUpperCase() : 'US'),
    languages: z.string().array().optional(),
  }).optional(),
  skipTlsVerification: z.boolean().default(false),
}).strict(strictMessage)
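Per the schema above, `location.country` must be a valid ISO 3166-1 alpha-2 code (upper-cased, defaulting to US) and `languages` is an optional string array. A hedged request sketch using the new field (endpoint usage otherwise assumed):

```ts
// Sketch of a v1 scrape request using the new `location` option.
const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    url: "https://firecrawl.dev",
    formats: ["markdown"],
    // "de" is upper-cased to "DE" by the zod transform shown above
    location: { country: "de", languages: ["de", "en"] },
  }),
});
```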
@@ -362,6 +375,8 @@ export type AuthCreditUsageChunk = {
  coupons: any[];
  adjusted_credits_used: number; // credits this period minus coupons used
  remaining_credits: number;
  sub_user_id: string | null;
  total_credits_sum: number;
};

export interface RequestWithMaybeACUC<
@@ -443,7 +458,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
    fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
    parsePDF: x.parsePDF,
    actions: x.actions as Action[], // no strict null checking grrrr - mogery
    geolocation: x.geolocation,
    geolocation: x.location ?? x.geolocation,
    skipTlsVerification: x.skipTlsVerification
  };
}
@@ -70,6 +70,10 @@ export async function getJobPriority({
      bucketLimit = 400;
      planModifier = 0.1;
      break;
    case "etier2c":
      bucketLimit = 1000;
      planModifier = 0.05;
      break;

    default:
      bucketLimit = 25;
@@ -121,8 +121,13 @@ export async function runWebScraper({
      : docs;

  if(is_scrape === false) {
    billTeam(team_id, undefined, filteredDocs.length).catch(error => {
      Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
    let creditsToBeBilled = 1; // Assuming 1 credit per document
    if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
      creditsToBeBilled = 5;
    }

    billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
      Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
      // Optionally, you could notify an admin or add to a retry queue here
    });
  }
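The billing rule is now: 1 credit per scraped document, or 5 credits per document when LLM extraction is used. A quick worked sketch of the arithmetic (values are illustrative only):

```ts
// Worked example of the billing rule above, with made-up numbers.
const filteredDocsLength = 20;
const usesExtraction = true; // e.g. extractorOptions.mode === "extract"
const creditsToBeBilled = usesExtraction ? 5 : 1;
const totalCredits = creditsToBeBilled * filteredDocsLength; // 100 credits billed
```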
@@ -209,14 +209,15 @@ export async function scrapSingleUrl(
      if (action.type === "click" || action.type === "write" || action.type === "press") {
        const result: Action[] = [];
        // Don't add a wait if the previous action is a wait
        if (index === 0 || array[index - 1].type !== "wait") {
          result.push({ type: "wait", milliseconds: 1200 } as Action);
        }
        // if (index === 0 || array[index - 1].type !== "wait") {
        //   result.push({ type: "wait", milliseconds: 1200 } as Action);
        // }
        // Fire-engine now handles wait times automatically, leaving the code here for now
        result.push(action);
        // Don't add a wait if the next action is a wait
        if (index === array.length - 1 || array[index + 1].type !== "wait") {
          result.push({ type: "wait", milliseconds: 1200 } as Action);
        }
        // if (index === array.length - 1 || array[index + 1].type !== "wait") {
        //   result.push({ type: "wait", milliseconds: 1200 } as Action);
        // }
        return result;
      }
      return [action as Action];
@@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { searchapi_search } from "./searchapi";
import { serper_search } from "./serper";

export async function search({
@@ -30,7 +31,16 @@ export async function search({
  timeout?: number;
}): Promise<SearchResult[]> {
  try {

    if (process.env.SEARCHAPI_API_KEY) {
      return await searchapi_search(query, {
        num_results,
        tbs,
        filter,
        lang,
        country,
        location
      });
    }
    if (process.env.SERPER_API_KEY) {
      return await serper_search(query, {
        num_results,
apps/api/src/search/searchapi.ts (new file, 60 lines)

@@ -0,0 +1,60 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";

dotenv.config();

interface SearchOptions {
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  num_results: number;
  page?: number;
}

export async function searchapi_search(q: string, options: SearchOptions): Promise<SearchResult[]> {
  const params = {
    q: q,
    hl: options.lang,
    gl: options.country,
    location: options.location,
    num: options.num_results,
    page: options.page ?? 1,
    engine: process.env.SEARCHAPI_ENGINE || "google",
  };

  const url = `https://www.searchapi.io/api/v1/search`;

  try {
    const response = await axios.get(url, {
      headers: {
        "Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`,
        "Content-Type": "application/json",
        "X-SearchApi-Source": "Firecrawl",
      },
      params: params,
    });

    if (response.status === 401) {
      throw new Error("Unauthorized. Please check your API key.");
    }

    const data = response.data;

    if (data && Array.isArray(data.organic_results)) {
      return data.organic_results.map((a: any) => ({
        url: a.link,
        title: a.title,
        description: a.snippet,
      }));
    } else {
      return [];
    }
  } catch (error) {
    console.error(`There was an error searching for content: ${error.message}`);
    return [];
  }
}
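With `SEARCHAPI_API_KEY` set (and optionally `SEARCHAPI_ENGINE`), the `search()` dispatcher shown earlier prefers SearchApi. A small direct-call sketch of the new helper, assuming it is imported from this module:

```ts
// Direct usage sketch; assumes SEARCHAPI_API_KEY is set in the environment.
import { searchapi_search } from "./searchapi";

const results = await searchapi_search("firecrawl web scraping", {
  num_results: 5,
  lang: "en",
  country: "us",
});
for (const r of results) {
  // SearchResult fields are mapped from SearchApi's organic_results
  console.log(r.title, r.url);
}
```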
apps/api/src/services/billing/auto_charge.ts (new file, 176 lines)

@@ -0,0 +1,176 @@
// Import necessary dependencies and types
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
import { getACUC, setCachedACUC } from "../../controllers/auth";
import { redlock } from "../redlock";
import { supabase_service } from "../supabase";
import { createPaymentIntent } from "./stripe";
import { issueCredits } from "./issue_credits";
import { sendNotification } from "../notification/email_notification";
import { NotificationType } from "../../types";
import { deleteKey, getValue, setValue } from "../redis";
import { sendSlackWebhook } from "../alerts/slack";
import { Logger } from "../../lib/logger";

// Define the number of credits to be added during auto-recharge
const AUTO_RECHARGE_CREDITS = 1000;
const AUTO_RECHARGE_COOLDOWN = 300; // 5 minutes in seconds

/**
 * Attempt to automatically charge a user's account when their credit balance falls below a threshold
 * @param chunk The user's current usage data
 * @param autoRechargeThreshold The credit threshold that triggers auto-recharge
 */
export async function autoCharge(
  chunk: AuthCreditUsageChunk,
  autoRechargeThreshold: number
): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> {
  const resource = `auto-recharge:${chunk.team_id}`;
  const cooldownKey = `auto-recharge-cooldown:${chunk.team_id}`;

  try {
    // Check if the team is in the cooldown period
    // Another check to prevent race conditions, double charging - cool down of 5 minutes
    const cooldownValue = await getValue(cooldownKey);
    if (cooldownValue) {
      Logger.info(`Auto-recharge for team ${chunk.team_id} is in cooldown period`);
      return {
        success: false,
        message: "Auto-recharge is in cooldown period",
        remainingCredits: chunk.remaining_credits,
        chunk,
      };
    }

    // Use a distributed lock to prevent concurrent auto-charge attempts
    return await redlock.using([resource], 5000, async (signal) : Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> => {
      // Recheck the condition inside the lock to prevent race conditions
      const updatedChunk = await getACUC(chunk.api_key, false, false);
      if (
        updatedChunk &&
        updatedChunk.remaining_credits < autoRechargeThreshold
      ) {
        if (chunk.sub_user_id) {
          // Fetch the customer's Stripe information
          const { data: customer, error: customersError } =
            await supabase_service
              .from("customers")
              .select("id, stripe_customer_id")
              .eq("id", chunk.sub_user_id)
              .single();

          if (customersError) {
            Logger.error(`Error fetching customer data: ${customersError}`);
            return {
              success: false,
              message: "Error fetching customer data",
              remainingCredits: chunk.remaining_credits,
              chunk,
            };
          }

          if (customer && customer.stripe_customer_id) {
            let issueCreditsSuccess = false;
            // Attempt to create a payment intent
            const paymentStatus = await createPaymentIntent(
              chunk.team_id,
              customer.stripe_customer_id
            );

            // If payment is successful or requires further action, issue credits
            if (
              paymentStatus.return_status === "succeeded" ||
              paymentStatus.return_status === "requires_action"
            ) {
              issueCreditsSuccess = await issueCredits(
                chunk.team_id,
                AUTO_RECHARGE_CREDITS
              );
            }

            // Record the auto-recharge transaction
            await supabase_service.from("auto_recharge_transactions").insert({
              team_id: chunk.team_id,
              initial_payment_status: paymentStatus.return_status,
              credits_issued: issueCreditsSuccess ? AUTO_RECHARGE_CREDITS : 0,
              stripe_charge_id: paymentStatus.charge_id,
            });

            // Send a notification if credits were successfully issued
            if (issueCreditsSuccess) {
              await sendNotification(
                chunk.team_id,
                NotificationType.AUTO_RECHARGE_SUCCESS,
                chunk.sub_current_period_start,
                chunk.sub_current_period_end,
                chunk,
                true
              );

              // Set cooldown period
              await setValue(cooldownKey, 'true', AUTO_RECHARGE_COOLDOWN);
            }

            // Reset ACUC cache to reflect the new credit balance
            const cacheKeyACUC = `acuc_${chunk.api_key}`;
            await deleteKey(cacheKeyACUC);

            if (process.env.SLACK_ADMIN_WEBHOOK_URL) {
              const webhookCooldownKey = `webhook_cooldown_${chunk.team_id}`;
              const isInCooldown = await getValue(webhookCooldownKey);

              if (!isInCooldown) {
                sendSlackWebhook(
                  `Auto-recharge: Team ${chunk.team_id}. ${AUTO_RECHARGE_CREDITS} credits added. Payment status: ${paymentStatus.return_status}.`,
                  false,
                  process.env.SLACK_ADMIN_WEBHOOK_URL
                ).catch((error) => {
                  Logger.debug(`Error sending slack notification: ${error}`);
                });

                // Set cooldown for 1 hour
                await setValue(webhookCooldownKey, 'true', 60 * 60);
              }
            }
            return {
              success: true,
              message: "Auto-recharge successful",
              remainingCredits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS,
              chunk: {...chunk, remaining_credits: chunk.remaining_credits + AUTO_RECHARGE_CREDITS},
            };
          } else {
            Logger.error("No Stripe customer ID found for user");
            return {
              success: false,
              message: "No Stripe customer ID found for user",
              remainingCredits: chunk.remaining_credits,
              chunk,
            };
          }
        } else {
          Logger.error("No sub_user_id found in chunk");
          return {
            success: false,
            message: "No sub_user_id found in chunk",
            remainingCredits: chunk.remaining_credits,
            chunk,
          };
        }
      }
      return {
        success: false,
        message: "No need to auto-recharge",
        remainingCredits: chunk.remaining_credits,
        chunk,
      };

    });
  } catch (error) {
    Logger.error(`Failed to acquire lock for auto-recharge: ${error}`);
    return {
      success: false,
      message: "Failed to acquire lock for auto-recharge",
      remainingCredits: chunk.remaining_credits,
      chunk,
    };
  }
}
@@ -6,24 +6,40 @@ import { Logger } from "../../lib/logger";
import * as Sentry from "@sentry/node";
import { AuthCreditUsageChunk } from "../../controllers/v1/types";
import { getACUC, setCachedACUC } from "../../controllers/auth";
import { issueCredits } from "./issue_credits";
import { redlock } from "../redlock";
import { autoCharge } from "./auto_charge";
import { getValue, setValue } from "../redis";

const FREE_CREDITS = 500;

/**
 * If you do not know the subscription_id in the current context, pass subscription_id as undefined.
 */
export async function billTeam(team_id: string, subscription_id: string | null | undefined, credits: number) {
export async function billTeam(
  team_id: string,
  subscription_id: string | null | undefined,
  credits: number
) {
  return withAuth(supaBillTeam)(team_id, subscription_id, credits);
}
export async function supaBillTeam(team_id: string, subscription_id: string, credits: number) {
export async function supaBillTeam(
  team_id: string,
  subscription_id: string,
  credits: number
) {
  if (team_id === "preview") {
    return { success: true, message: "Preview team, no credits used" };
  }
  Logger.info(`Billing team ${team_id} for ${credits} credits`);

  const { data, error } =
    await supabase_service.rpc("bill_team", { _team_id: team_id, sub_id: subscription_id ?? null, fetch_subscription: subscription_id === undefined, credits });

  const { data, error } = await supabase_service.rpc("bill_team", {
    _team_id: team_id,
    sub_id: subscription_id ?? null,
    fetch_subscription: subscription_id === undefined,
    credits,
  });

  if (error) {
    Sentry.captureException(error);
    Logger.error("Failed to bill team: " + JSON.stringify(error));
@@ -31,48 +47,109 @@ export async function supaBillTeam(team_id: string, subscription_id: string, cre
  }

  (async () => {
    for (const apiKey of (data ?? []).map(x => x.api_key)) {
      await setCachedACUC(apiKey, acuc => (acuc ? {
        ...acuc,
        credits_used: acuc.credits_used + credits,
        adjusted_credits_used: acuc.adjusted_credits_used + credits,
        remaining_credits: acuc.remaining_credits - credits,
      } : null));
    for (const apiKey of (data ?? []).map((x) => x.api_key)) {
      await setCachedACUC(apiKey, (acuc) =>
        acuc
          ? {
              ...acuc,
              credits_used: acuc.credits_used + credits,
              adjusted_credits_used: acuc.adjusted_credits_used + credits,
              remaining_credits: acuc.remaining_credits - credits,
            }
          : null
      );
    }
  })();
}

export async function checkTeamCredits(chunk: AuthCreditUsageChunk, team_id: string, credits: number) {
  return withAuth(supaCheckTeamCredits)(chunk, team_id, credits);
export async function checkTeamCredits(
  chunk: AuthCreditUsageChunk,
  team_id: string,
  credits: number
): Promise<{ success: boolean; message: string; remainingCredits: number; chunk: AuthCreditUsageChunk }> {
  const result = await withAuth(supaCheckTeamCredits)(chunk, team_id, credits);
  return {
    success: result.success,
    message: result.message,
    remainingCredits: result.remainingCredits,
    chunk: chunk // Ensure chunk is always returned
  };
}

// if team has enough credits for the operation, return true, else return false
export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id: string, credits: number) {
export async function supaCheckTeamCredits(
  chunk: AuthCreditUsageChunk,
  team_id: string,
  credits: number
) {
  // WARNING: chunk will be null if team_id is preview -- do not perform operations on it under ANY circumstances - mogery
  if (team_id === "preview") {
    return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
    return {
      success: true,
      message: "Preview team, no credits used",
      remainingCredits: Infinity,
    };
  }

  const creditsWillBeUsed = chunk.adjusted_credits_used + credits;

  // In case chunk.price_credits is undefined, set it to a large number to avoid mistakes
  const totalPriceCredits = chunk.price_credits ?? 100000000;
  const totalPriceCredits = chunk.total_credits_sum ?? 100000000;
  // Removal of + credits
  const creditUsagePercentage = chunk.adjusted_credits_used / totalPriceCredits;

  let isAutoRechargeEnabled = false, autoRechargeThreshold = 1000;
  const cacheKey = `team_auto_recharge_${team_id}`;
  let cachedData = await getValue(cacheKey);
  if (cachedData) {
    const parsedData = JSON.parse(cachedData);
    isAutoRechargeEnabled = parsedData.auto_recharge;
    autoRechargeThreshold = parsedData.auto_recharge_threshold;
  } else {
    const { data, error } = await supabase_service
      .from("teams")
      .select("auto_recharge, auto_recharge_threshold")
      .eq("id", team_id)
      .single();

    if (data) {
      isAutoRechargeEnabled = data.auto_recharge;
      autoRechargeThreshold = data.auto_recharge_threshold;
      await setValue(cacheKey, JSON.stringify(data), 300); // Cache for 5 minutes (300 seconds)
    }
  }

  if (isAutoRechargeEnabled && chunk.remaining_credits < autoRechargeThreshold) {
    const autoChargeResult = await autoCharge(chunk, autoRechargeThreshold);
    if (autoChargeResult.success) {
      return {
        success: true,
        message: autoChargeResult.message,
        remainingCredits: autoChargeResult.remainingCredits,
        chunk: autoChargeResult.chunk,
      };
    }
  }

  // Compare the adjusted total credits used with the credits allowed by the plan
  if (creditsWillBeUsed > totalPriceCredits) {
    // Only notify if their actual credits (not what they will use) used is greater than the total price credits
    if(chunk.adjusted_credits_used > totalPriceCredits) {
    if (chunk.adjusted_credits_used > totalPriceCredits) {
      sendNotification(
        team_id,
        NotificationType.LIMIT_REACHED,
        chunk.sub_current_period_start,
        chunk.sub_current_period_end,
        chunk
      );
    }
    return { success: false, message: "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.", remainingCredits: chunk.remaining_credits, chunk };
        NotificationType.LIMIT_REACHED,
        chunk.sub_current_period_start,
        chunk.sub_current_period_end,
        chunk
      );
    }
    return {
      success: false,
      message:
        "Insufficient credits to perform this request. For more credits, you can upgrade your plan at https://firecrawl.dev/pricing.",
      remainingCredits: chunk.remaining_credits,
      chunk,
    };
  } else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
    // Send email notification for approaching credit limit
    sendNotification(
@@ -84,7 +161,12 @@ export async function supaCheckTeamCredits(chunk: AuthCreditUsageChunk, team_id:
    );
  }

  return { success: true, message: "Sufficient credits available", remainingCredits: chunk.remaining_credits, chunk };
  return {
    success: true,
    message: "Sufficient credits available",
    remainingCredits: chunk.remaining_credits,
    chunk,
  };
}

// Count the total credits used by a team within the current billing period and return the remaining credits.
apps/api/src/services/billing/issue_credits.ts (new file, 20 lines)

@@ -0,0 +1,20 @@
import { Logger } from "../../lib/logger";
import { supabase_service } from "../supabase";

export async function issueCredits(team_id: string, credits: number) {
  // Add an entry to supabase coupons
  const { data, error } = await supabase_service.from("coupons").insert({
    team_id: team_id,
    credits: credits,
    status: "active",
    // indicates that this coupon was issued from auto recharge
    from_auto_recharge: true,
  });

  if (error) {
    Logger.error(`Error adding coupon: ${error}`);
    return false;
  }

  return true;
}
apps/api/src/services/billing/stripe.ts (new file, 56 lines)

@@ -0,0 +1,56 @@
import { Logger } from "../../lib/logger";
import Stripe from "stripe";

const stripe = new Stripe(process.env.STRIPE_SECRET_KEY ?? "");

async function getCustomerDefaultPaymentMethod(customerId: string) {
  const paymentMethods = await stripe.customers.listPaymentMethods(customerId, {
    limit: 3,
  });
  return paymentMethods.data[0] ?? null;
}

type ReturnStatus = "succeeded" | "requires_action" | "failed";
export async function createPaymentIntent(
  team_id: string,
  customer_id: string
): Promise<{ return_status: ReturnStatus; charge_id: string }> {
  try {
    const defaultPaymentMethod = await getCustomerDefaultPaymentMethod(customer_id);
    if (!defaultPaymentMethod) {
      Logger.error(`No default payment method found for customer: ${customer_id}`);
      return { return_status: "failed", charge_id: "" };
    }
    const paymentIntent = await stripe.paymentIntents.create({
      amount: 1100,
      currency: "usd",
      customer: customer_id,
      description: "Firecrawl: Auto re-charge of 1000 credits",
      payment_method_types: [defaultPaymentMethod?.type ?? "card"],
      payment_method: defaultPaymentMethod?.id,
      off_session: true,
      confirm: true,
    });

    if (paymentIntent.status === "succeeded") {
      Logger.info(`Payment succeeded for team: ${team_id}`);
      return { return_status: "succeeded", charge_id: paymentIntent.id };
    } else if (
      paymentIntent.status === "requires_action" ||
      paymentIntent.status === "processing" ||
      paymentIntent.status === "requires_capture"
    ) {
      Logger.warn(`Payment requires further action for team: ${team_id}`);
      return { return_status: "requires_action", charge_id: paymentIntent.id };
    } else {
      Logger.error(`Payment failed for team: ${team_id}`);
      return { return_status: "failed", charge_id: paymentIntent.id };
    }
  } catch (error) {
    Logger.error(
      `Failed to create or confirm PaymentIntent for team: ${team_id}`
    );
    console.error(error);
    return { return_status: "failed", charge_id: "" };
  }
}
@@ -70,7 +70,9 @@ export async function logJob(job: FirecrawlJob) {
        retry: job.retry,
      },
    };
    posthog.capture(phLog);
    if(job.mode !== "single_urls") {
      posthog.capture(phLog);
    }
  }
  if (error) {
    Logger.error(`Error logging job: ${error.message}`);
@@ -24,6 +24,14 @@ const emailTemplates: Record<
    subject: "Rate Limit Reached - Firecrawl",
    html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
  },
  [NotificationType.AUTO_RECHARGE_SUCCESS]: {
    subject: "Auto recharge successful - Firecrawl",
    html: "Hey there,<br/><p>Your account was successfully recharged with 1000 credits because your remaining credits were below the threshold. Consider upgrading your plan at <a href='https://firecrawl.dev/pricing'>firecrawl.dev/pricing</a> to avoid hitting the limit.</p><br/>Thanks,<br/>Firecrawl Team<br/>",
  },
  [NotificationType.AUTO_RECHARGE_FAILED]: {
    subject: "Auto recharge failed - Firecrawl",
    html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
  },
};

export async function sendNotification(
@@ -31,18 +39,20 @@ export async function sendNotification(
  notificationType: NotificationType,
  startDateString: string,
  endDateString: string,
  chunk: AuthCreditUsageChunk
  chunk: AuthCreditUsageChunk,
  bypassRecentChecks: boolean = false
) {
  return withAuth(sendNotificationInternal)(
    team_id,
    notificationType,
    startDateString,
    endDateString,
    chunk
    chunk,
    bypassRecentChecks
  );
}

async function sendEmailNotification(
export async function sendEmailNotification(
  email: string,
  notificationType: NotificationType,
) {
@@ -72,90 +82,94 @@ export async function sendNotificationInternal(
  notificationType: NotificationType,
  startDateString: string,
  endDateString: string,
  chunk: AuthCreditUsageChunk
  chunk: AuthCreditUsageChunk,
  bypassRecentChecks: boolean = false
): Promise<{ success: boolean }> {
  if (team_id === "preview") {
    return { success: true };
  }

  const fifteenDaysAgo = new Date();
  fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15);
  if (!bypassRecentChecks) {
    const fifteenDaysAgo = new Date();
    fifteenDaysAgo.setDate(fifteenDaysAgo.getDate() - 15);

    const { data, error } = await supabase_service
      .from("user_notifications")
      .select("*")
      .eq("team_id", team_id)
      .eq("notification_type", notificationType)
      .gte("sent_date", fifteenDaysAgo.toISOString());

    if (error) {
      Logger.debug(`Error fetching notifications: ${error}`);
      return { success: false };
    }

    if (data.length !== 0) {
      // Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} in the last 15 days`);
      return { success: false };
    }

    const { data: recentData, error: recentError } = await supabase_service
      .from("user_notifications")
      .select("*")
      .eq("team_id", team_id)
      .eq("notification_type", notificationType)
      .gte("sent_date", startDateString)
      .lte("sent_date", endDateString);

    if (recentError) {
      Logger.debug(`Error fetching recent notifications: ${recentError}`);
      return { success: false };
    }

    if (recentData.length !== 0) {
      // Logger.debug(`Notification already sent for team_id: ${team_id} and notificationType: ${notificationType} within the specified date range`);
      return { success: false };
    } else {
      console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`);
      // get the emails from the user with the team_id
      const { data: emails, error: emailsError } = await supabase_service
        .from("users")
        .select("email")
        .eq("team_id", team_id);

      if (emailsError) {
        Logger.debug(`Error fetching emails: ${emailsError}`);
        return { success: false };
      }

      for (const email of emails) {
        await sendEmailNotification(email.email, notificationType);
      }

      const { error: insertError } = await supabase_service
    const { data, error } = await supabase_service
        .from("user_notifications")
        .insert([
          {
            team_id: team_id,
            notification_type: notificationType,
            sent_date: new Date().toISOString(),
          },
        ]);
      .select("*")
      .eq("team_id", team_id)
      .eq("notification_type", notificationType)
      .gte("sent_date", fifteenDaysAgo.toISOString());

      if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) {
        sendSlackWebhook(
          `${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`,
          false,
          process.env.SLACK_ADMIN_WEBHOOK_URL
        ).catch((error) => {
          Logger.debug(`Error sending slack notification: ${error}`);
        });
      }

      if (insertError) {
        Logger.debug(`Error inserting notification record: ${insertError}`);
    if (error) {
      Logger.debug(`Error fetching notifications: ${error}`);
      return { success: false };
    }

      return { success: true };
    if (data.length !== 0) {
      return { success: false };
    }

    // TODO: observation: Free credits people are not receiving notifications

    const { data: recentData, error: recentError } = await supabase_service
      .from("user_notifications")
      .select("*")
      .eq("team_id", team_id)
      .eq("notification_type", notificationType)
      .gte("sent_date", startDateString)
      .lte("sent_date", endDateString);

    if (recentError) {
      Logger.debug(`Error fetching recent notifications: ${recentError.message}`);
      return { success: false };
    }

    if (recentData.length !== 0) {
      return { success: false };
    }

  }

  console.log(`Sending notification for team_id: ${team_id} and notificationType: ${notificationType}`);
  // get the emails from the user with the team_id
  const { data: emails, error: emailsError } = await supabase_service
    .from("users")
    .select("email")
    .eq("team_id", team_id);

  if (emailsError) {
    Logger.debug(`Error fetching emails: ${emailsError}`);
    return { success: false };
  }

  for (const email of emails) {
    await sendEmailNotification(email.email, notificationType);
  }

  const { error: insertError } = await supabase_service
    .from("user_notifications")
    .insert([
      {
        team_id: team_id,
        notification_type: notificationType,
        sent_date: new Date().toISOString(),
      },
    ]);

  if (process.env.SLACK_ADMIN_WEBHOOK_URL && emails.length > 0) {
    sendSlackWebhook(
      `${getNotificationString(notificationType)}: Team ${team_id}, with email ${emails[0].email}. Number of credits used: ${chunk.adjusted_credits_used} | Number of credits in the plan: ${chunk.price_credits}`,
      false,
      process.env.SLACK_ADMIN_WEBHOOK_URL
    ).catch((error) => {
      Logger.debug(`Error sending slack notification: ${error}`);
    });
  }

  if (insertError) {
    Logger.debug(`Error inserting notification record: ${insertError}`);
    return { success: false };
  }

  return { success: true };
}
@@ -11,6 +11,10 @@ export function getNotificationString(
      return "Limit reached (100%)";
    case NotificationType.RATE_LIMIT_REACHED:
      return "Rate limit reached";
    case NotificationType.AUTO_RECHARGE_SUCCESS:
      return "Auto-recharge successful";
    case NotificationType.AUTO_RECHARGE_FAILED:
      return "Auto-recharge failed";
    default:
      return "Unknown notification type";
  }
@@ -15,6 +15,7 @@ const RATE_LIMITS = {
    standardnew: 10,
    growth: 50,
    growthdouble: 50,
    etier2c: 300,
  },
  scrape: {
    default: 20,
@@ -28,6 +29,7 @@ const RATE_LIMITS = {
    standardnew: 100,
    growth: 1000,
    growthdouble: 1000,
    etier2c: 2500,
  },
  search: {
    default: 20,
@@ -41,6 +43,7 @@ const RATE_LIMITS = {
    standardnew: 50,
    growth: 500,
    growthdouble: 500,
    etier2c: 2500,
  },
  map:{
    default: 20,
@@ -54,6 +57,7 @@ const RATE_LIMITS = {
    standardnew: 50,
    growth: 500,
    growthdouble: 500,
    etier2c: 2500,
  },
  preview: {
    free: 5,
@@ -130,6 +130,8 @@ export enum NotificationType {
  APPROACHING_LIMIT = "approachingLimit",
  LIMIT_REACHED = "limitReached",
  RATE_LIMIT_REACHED = "rateLimitReached",
  AUTO_RECHARGE_SUCCESS = "autoRechargeSuccess",
  AUTO_RECHARGE_FAILED = "autoRechargeFailed",
}

export type ScrapeLog = {
@@ -155,6 +157,7 @@ export type PlanType =
  | "standardnew"
  | "growth"
  | "growthdouble"
  | "etier2c"
  | "free"
  | "";
@@ -147,7 +147,7 @@ watch.addEventListener("done", state => {

### Batch scraping multiple URLs

To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the output formats.
To batch scrape multiple URLs with error handling, use the `batchScrapeUrls` method. It takes the starting URLs and optional parameters as arguments. The `params` argument allows you to specify additional options for the batch scrape job, such as the output formats.

```js
const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], {
@@ -158,10 +158,10 @@ const batchScrapeResponse = await app.batchScrapeUrls(['https://firecrawl.dev',

#### Asynchronous batch scrape

To initiate an asynchronous batch scrape, utilize the `asyncBulkScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the batch scrape.
To initiate an asynchronous batch scrape, utilize the `asyncBatchScrapeUrls` method. This method requires the starting URLs and optional parameters as inputs. The params argument enables you to define various settings for the scrape, such as the output formats. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the batch scrape.

```js
const asyncBulkScrapeResult = await app.asyncBulkScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
const asyncBatchScrapeResult = await app.asyncBatchScrapeUrls(['https://firecrawl.dev', 'https://mendable.ai'], { formats: ['markdown', 'html'] });
```

#### Batch scrape with WebSockets
@@ -1,6 +1,6 @@
{
  "name": "@mendable/firecrawl-js",
  "version": "1.7.1",
  "version": "1.7.2",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
@@ -82,6 +82,10 @@ export interface CrawlScrapeOptions {
  onlyMainContent?: boolean;
  waitFor?: number;
  timeout?: number;
  location?: {
    country?: string;
    languages?: string[];
  };
}

export type Action = {
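A rough usage sketch of the new `location` option through the JS SDK; the option shape comes from the interface above, while the `scrapeUrl` call and response handling are assumptions rather than part of this diff:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

// Scrape as if browsing from Germany, preferring German content.
const doc = await app.scrapeUrl("https://firecrawl.dev", {
  formats: ["markdown"],
  location: { country: "DE", languages: ["de", "en"] },
});
```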
@@ -170,11 +170,11 @@ print(batch_scrape_result)

### Checking batch scrape status

To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_job` method. It takes the job ID as a parameter and returns the current status of the batch scrape job.
To check the status of an asynchronous batch scrape job, use the `check_batch_scrape_status` method. It takes the job ID as a parameter and returns the current status of the batch scrape job.

```python
id = batch_scrape_result['id']
status = app.check_batch_scrape_job(id)
status = app.check_batch_scrape_status(id)
```

### Batch scrape with WebSockets
examples/claude_stock_analyzer/claude_stock_analyzer.py (new file, 180 lines)

@@ -0,0 +1,180 @@
import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import anthropic
from e2b_code_interpreter import Sandbox
import base64

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
e2b_api_key = os.getenv("E2B_API_KEY")

# Initialize the FirecrawlApp and Anthropic client
app = FirecrawlApp(api_key=firecrawl_api_key)
client = anthropic.Anthropic(api_key=anthropic_api_key)
sandbox = Sandbox(api_key=e2b_api_key)

# Find the relevant stock pages via map
def find_relevant_page_via_map(stock_search_term, url, app):
    try:
        print(f"{Colors.CYAN}Searching for stock: {stock_search_term}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")

        map_search_parameter = stock_search_term

        print(f"{Colors.GREEN}Search parameter: {map_search_parameter}{Colors.RESET}")

        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        map_website = app.map_url(url, params={"search": map_search_parameter})
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
        print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}")
        return map_website['links']
    except Exception as e:
        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None

# Function to plot the scores using e2b
def plot_scores(stock_names, stock_scores):
    print(f"{Colors.YELLOW}Plotting scores...{Colors.RESET}")
    code_to_run = f"""
import matplotlib.pyplot as plt

stock_names = {stock_names}
stock_scores = {stock_scores}

plt.figure(figsize=(10, 5))
plt.bar(stock_names, stock_scores, color='blue')
plt.xlabel('Stock Names')
plt.ylabel('Scores')
plt.title('Stock Investment Scores')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('chart.png')
plt.show()
"""
    # Run the code inside the sandbox
    execution = sandbox.run_code(code_to_run)

    # Check if there are any results
    if execution.results and execution.results[0].png:
        first_result = execution.results[0]

        # Get the directory where the current python file is located
        current_dir = os.path.dirname(os.path.abspath(__file__))
        # Save the png to a file in the examples directory. The png is in base64 format.
        with open(os.path.join(current_dir, 'chart.png'), 'wb') as f:
            f.write(base64.b64decode(first_result.png))
        print('Chart saved as examples/chart.png')
    else:
        print(f"{Colors.RED}No results returned from the sandbox execution.{Colors.RESET}")

# Analyze the top stocks and provide investment recommendation
def analyze_top_stocks(map_website, app, client):
    try:
        # Get top 5 links from the map result
        top_links = map_website[:10]
        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")

        # Scrape the pages in batch
        batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})
        print(f"{Colors.GREEN}Batch page scraping completed successfully.{Colors.RESET}")

        # Prepare content for LLM
        stock_contents = []
        for scrape_result in batch_scrape_result['data']:
            stock_contents.append({
                'content': scrape_result['markdown']
            })

        # Pass all the content to the LLM to analyze and decide which stock to invest in
        analyze_prompt = f"""
Based on the following information about different stocks from their Robinhood pages, analyze and determine which stock is the best investment opportunity. DO NOT include any other text, just the JSON.

Return the result in the following JSON format. Only return the JSON, nothing else. Do not include backticks or any other formatting, just the JSON.
{{
  "scores": [
    {{
      "stock_name": "<stock_name>",
      "score": <score-out-of-100>
    }},
    ...
  ]
}}

Stock Information:
"""

        for stock in stock_contents:
            analyze_prompt += f"Content:\n{stock['content']}\n"

        print(f"{Colors.YELLOW}Analyzing stock information with LLM...{Colors.RESET}")
        analyze_prompt += f"\n\nStart JSON:\n"
        completion = client.messages.create(
            model="claude-3-5-sonnet-20240620",
            max_tokens=1000,
            temperature=0,
            system="You are a financial analyst. Only return the JSON, nothing else.",
            messages=[
                {
                    "role": "user",
                    "content": analyze_prompt
                }
            ]
        )

        result = completion.content[0].text
        print(f"{Colors.GREEN}Analysis completed. Here is the recommendation:{Colors.RESET}")
        print(f"{Colors.MAGENTA}{result}{Colors.RESET}")

        # Plot the scores using e2b
        try:
            result_json = json.loads(result)
            scores = result_json['scores']
            stock_names = [score['stock_name'] for score in scores]
            stock_scores = [score['score'] for score in scores]

            plot_scores(stock_names, stock_scores)
        except json.JSONDecodeError as json_err:
            print(f"{Colors.RED}Error decoding JSON response: {str(json_err)}{Colors.RESET}")

    except Exception as e:
        print(f"{Colors.RED}Error encountered during stock analysis: {str(e)}{Colors.RESET}")

# Main function to execute the process
def main():
    # Get user input
    stock_search_term = input(f"{Colors.BLUE}Enter the stock you're interested in: {Colors.RESET}")
    if not stock_search_term.strip():
        print(f"{Colors.RED}No stock entered. Exiting.{Colors.RESET}")
        return

    url = "https://robinhood.com/stocks"

    print(f"{Colors.YELLOW}Initiating stock analysis process...{Colors.RESET}")
    # Find the relevant pages
    map_website = find_relevant_page_via_map(stock_search_term, url, app)

    if map_website:
        print(f"{Colors.GREEN}Relevant stock pages identified. Proceeding with detailed analysis...{Colors.RESET}")
        # Analyze top stocks
        analyze_top_stocks(map_website, app, client)
    else:
        print(f"{Colors.RED}No relevant stock pages identified. Consider refining the search term or trying a different stock.{Colors.RESET}")

if __name__ == "__main__":
    main()
@@ -3,6 +3,7 @@ from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
import anthropic
import agentops

# ANSI color codes
class Colors:
@@ -161,4 +162,5 @@ def main():
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")

if __name__ == "__main__":
    agentops.init(os.getenv("AGENTOPS_API_KEY"))
    main()
@@ -98,7 +98,7 @@
   "source": [
    "# Create a cache with a 5 minute TTL\n",
    "cache = caching.CachedContent.create(\n",
    "    model=\"models/gemini-1.5-pro-001\",\n",
    "    model=\"models/gemini-1.5-pro-002\",\n",
    "    display_name=\"website crawl testing again\", # used to identify the cache\n",
    "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
    "    contents=[text_file],\n",
@@ -0,0 +1,166 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import datetime\n",
    "import time\n",
    "import google.generativeai as genai\n",
    "from google.generativeai import caching\n",
    "from dotenv import load_dotenv\n",
    "from firecrawl import FirecrawlApp\n",
    "import json\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()\n",
    "\n",
    "# Retrieve API keys from environment variables\n",
    "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "\n",
    "# Configure the Google Generative AI module with the API key\n",
    "genai.configure(api_key=google_api_key)\n",
    "\n",
    "# Initialize the FirecrawlApp with your API key\n",
    "app = FirecrawlApp(api_key=firecrawl_api_key)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No data returned from crawl.\n"
     ]
    }
   ],
   "source": [
    "# Crawl a website\n",
    "crawl_url = 'https://dify.ai/'\n",
    "params = {\n",
    "  \n",
    "  'crawlOptions': {\n",
    "    'limit': 100\n",
    "  }\n",
    "}\n",
    "crawl_result = app.crawl_url(crawl_url, params=params)\n",
    "\n",
    "if crawl_result is not None:\n",
    "  # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
    "  cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
    "\n",
    "  # Save the modified results as a text file containing JSON data\n",
    "  with open('crawl_result.txt', 'w') as file:\n",
    "    file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
    "else:\n",
    "  print(\"No data returned from crawl.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the video using the Files API\n",
    "text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
    "\n",
    "# Wait for the file to finish processing\n",
    "while text_file.state.name == \"PROCESSING\":\n",
    "  print('Waiting for file to be processed.')\n",
    "  time.sleep(2)\n",
    "  text_file = genai.get_file(text_file.name)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a cache with a 5 minute TTL\n",
    "cache = caching.CachedContent.create(\n",
    "    model=\"models/gemini-1.5-flash-002\",\n",
    "    display_name=\"website crawl testing again\", # used to identify the cache\n",
    "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
    "    contents=[text_file],\n",
    "    ttl=datetime.timedelta(minutes=15),\n",
    ")\n",
    "# Construct a GenerativeModel which uses the created cache.\n",
    "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
      "\n",
      "Here's how Firecrawl helps:\n",
      "\n",
      "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
      "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
      "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
      "\n",
      "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
      "\n",
      "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query the model\n",
    "response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
    "response_dict = response.to_dict()\n",
    "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
    "print(response_text)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}