Merge branch 'main' into nsc/map-pagination

Nicolas 2024-09-16 11:04:44 -04:00
commit af4804e13b
44 changed files with 2194 additions and 1308 deletions

View File

@ -1,7 +1,7 @@
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
title: "[Bug] "
labels: bug
assignees: ''

View File

@ -1,7 +1,7 @@
---
name: Feature request
about: Suggest an idea for this project
title: "[Feat]"
title: "[Feat] "
labels: ''
assignees: ''

View File

@ -0,0 +1,40 @@
---
name: Self-host issue
about: Report an issue with self-hosting Firecrawl
title: "[Self-Host] "
labels: self-host
assignees: ''
---
**Describe the Issue**
Provide a clear and concise description of the self-hosting issue you're experiencing.

**To Reproduce**
Steps to reproduce the issue:
1. Configure the environment or settings with '...'
2. Run the command '...'
3. Observe the error or unexpected output at '...'
4. Log output/error message

**Expected Behavior**
A clear and concise description of what you expected to happen when self-hosting.

**Screenshots**
If applicable, add screenshots or copies of the command line output to help explain the self-hosting issue.

**Environment (please complete the following information):**
- OS: [e.g. macOS, Linux, Windows]
- Firecrawl Version: [e.g. 1.2.3]
- Node.js Version: [e.g. 14.x]
- Docker Version (if applicable): [e.g. 20.10.14]
- Database Type and Version: [e.g. PostgreSQL 13.4]

**Logs**
If applicable, include detailed logs to help understand the self-hosting problem.

**Configuration**
Provide relevant parts of your configuration files (with sensitive information redacted).

**Additional Context**
Add any other context about the self-hosting issue here, such as specific infrastructure details, network setup, or any modifications made to the original Firecrawl setup.

.gitignore
View File

@ -21,3 +21,5 @@ apps/playwright-service-ts/package-lock.json
*.pyc
.rdb
apps/js-sdk/firecrawl/dist

View File

@ -106,7 +106,7 @@ You should be able to see the Bull Queue Manager UI on `http://localhost:3002/ad
If you'd like to test the crawl endpoint, you can run this:
```bash
curl -X POST http://localhost:3002/v0/crawl \
curl -X POST http://localhost:3002/v1/crawl \
-H 'Content-Type: application/json' \
-d '{
"url": "https://mendable.ai"

View File

@ -4,16 +4,16 @@ import { RateLimiterMode } from "../../../src/types";
import { getScrapeQueue } from "../../../src/services/queue-service";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
import { supabaseGetJobsByCrawlId } from "../../../src/lib/supabase-jobs";
import * as Sentry from "@sentry/node";
import { configDotenv } from "dotenv";
configDotenv();
export async function getJobs(ids: string[]) {
export async function getJobs(crawlId: string, ids: string[]) {
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
if (process.env.USE_DB_AUTHENTICATION === "true") {
const supabaseData = await supabaseGetJobsById(ids);
const supabaseData = await supabaseGetJobsByCrawlId(crawlId);
supabaseData.forEach(x => {
const job = jobs.find(y => y.id === x.job_id);
@ -52,7 +52,7 @@ export async function crawlStatusController(req: Request, res: Response) {
const jobIDs = await getCrawlJobs(req.params.jobId);
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";
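The hunks above switch crawl status from per-job Supabase lookups to one crawl-scoped query, after which each persisted result is merged back onto the in-memory queue jobs. A minimal sketch of that merge step, using simplified placeholder types rather than the real firecrawl definitions:

```ts
// Simplified stand-ins for a BullMQ job and a firecrawl_jobs row.
type QueueJob = { id?: string; returnvalue?: unknown };
type JobRow = { job_id: string; docs?: unknown };

// One query by crawl_id replaces N lookups by job_id; afterwards each
// persisted result is re-attached to its in-memory job, as in the diff.
function mergeDbRowsIntoJobs(jobs: QueueJob[], rows: JobRow[]): QueueJob[] {
  for (const row of rows) {
    const job = jobs.find((j) => j.id === row.job_id);
    if (job) {
      job.returnvalue = row.docs; // "docs" is a hypothetical column name here
    }
  }
  return jobs;
}
```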

View File

@ -39,7 +39,7 @@ export async function scrapeHelper(
returnCode: number;
}> {
const url = req.body.url;
if (!url) {
if (typeof url !== "string") {
return { success: false, error: "Url is required", returnCode: 400 };
}
@ -229,7 +229,7 @@ export async function scrapeController(req: Request, res: Response) {
if (result.success) {
let creditsToBeBilled = 1;
const creditsPerLLMExtract = 49;
const creditsPerLLMExtract = 4;
if (extractorOptions.mode.includes("llm-extraction")) {
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);

View File

@ -22,7 +22,7 @@ export async function crawlJobStatusPreviewController(req: Request, res: Respons
// }
// }
const jobs = (await getJobs(jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp);
const jobStatuses = await Promise.all(jobs.map(x => x.getState()));
const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active";

View File

@ -103,6 +103,7 @@ async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusPara
send(ws, {
type: "catchup",
data: {
success: true,
status,
total: jobIDs.length,
completed: doneJobIDs.length,

View File

@ -114,6 +114,7 @@ export async function crawlStatusController(req: RequestWithAuth<CrawlStatusPara
}
res.status(200).json({
success: true,
status,
completed: doneJobsLength,
total: jobIDs.length,

View File

@ -106,7 +106,13 @@ export async function mapController(
links = performCosineSimilarity(links, searchQuery);
}
links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
links = links.map((x) => {
try {
return checkAndUpdateURLForMap(x).url.trim()
} catch (_) {
return null;
}
}).filter(x => x !== null);
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.url));
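The map controller change makes URL normalization tolerant: a single malformed link is now dropped instead of throwing and failing the whole request. A sketch of the pattern, with a hypothetical `normalizeUrl` standing in for `checkAndUpdateURLForMap`, plus a type guard so TypeScript narrows the filtered array back to `string[]`:

```ts
// Hypothetical normalizer; new URL() throws a TypeError on malformed input.
function normalizeUrl(raw: string): string {
  return new URL(raw).href.trim();
}

function normalizeAll(links: string[]): string[] {
  return links
    .map((x) => {
      try {
        return normalizeUrl(x);
      } catch {
        return null; // one bad URL no longer aborts the whole map request
      }
    })
    .filter((x): x is string => x !== null); // type guard drops the nulls
}
```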

View File

@ -103,7 +103,7 @@ export async function scrapeController(
return;
}
if(req.body.extract && req.body.formats.includes("extract")) {
creditsToBeBilled = 50;
creditsToBeBilled = 5;
}
billTeam(req.auth.team_id, creditsToBeBilled).catch(error => {

View File

@ -30,7 +30,14 @@ export const url = z.preprocess(
"URL must have a valid top-level domain or be a valid path"
)
.refine(
(x) => checkUrl(x as string),
(x) => {
try {
checkUrl(x as string)
return true;
} catch (_) {
return false;
}
},
"Invalid URL"
)
.refine(
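The schema change above wraps a throwing validator in a try/catch so it can serve as a boolean zod `.refine`. A self-contained sketch, with a hypothetical `checkUrl` standing in for the real one:

```ts
import { z } from "zod";

// Hypothetical validator that throws on bad input, as checkUrl does in the diff.
function checkUrl(x: string): void {
  if (!/^https?:\/\//.test(x)) throw new Error("Invalid URL");
}

const url = z.string().refine(
  (x) => {
    try {
      checkUrl(x); // refine must return a boolean, so map throw -> false
      return true;
    } catch {
      return false;
    }
  },
  { message: "Invalid URL" }
);

url.parse("https://example.com"); // ok
// url.parse("not-a-url");        // throws a ZodError instead of a raw Error
```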
@ -257,6 +264,7 @@ export type CrawlStatusParams = {
export type CrawlStatusResponse =
| ErrorResponse
| {
success: true;
status: "scraping" | "completed" | "failed" | "cancelled";
completed: number;
total: number;
@ -322,6 +330,7 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
removeTags: x.excludeTags,
onlyMainContent: x.onlyMainContent,
waitFor: x.waitFor,
headers: x.headers,
includeLinks: x.formats.includes("links"),
screenshot: x.formats.includes("screenshot"),
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
@ -339,7 +348,7 @@ export function legacyExtractorOptions(x: ExtractOptions): ExtractorOptions {
}
export function legacyDocumentConverter(doc: any): Document {
if (doc === null || doc === undefined) return doc;
if (doc === null || doc === undefined) return null;
if (doc.metadata) {
if (doc.metadata.screenshot) {

View File

@ -201,16 +201,20 @@ if (cluster.isMaster) {
Sentry.setupExpressErrorHandler(app);
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry<ErrorResponse>, next: NextFunction) => {
if (err instanceof SyntaxError && 'status' in err && err.status === 400 && 'body' in err) {
return res.status(400).json({ success: false, error: 'Bad request, malformed JSON' });
}
const id = res.sentry ?? uuidv4();
let verbose = JSON.stringify(err);
if (verbose === "{}") {
if (err instanceof Error) {
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack,
});
}
if (err instanceof Error) {
verbose = JSON.stringify({
message: err.message,
name: err.name,
stack: err.stack,
});
}
}
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);

View File

@ -2,6 +2,11 @@ import { supabase_service } from "../services/supabase";
import { Logger } from "./logger";
import * as Sentry from "@sentry/node";
/**
* Get a single firecrawl_job by ID
* @param jobId ID of Job
* @returns {any | null} Job
*/
export const supabaseGetJobById = async (jobId: string) => {
const { data, error } = await supabase_service
.from("firecrawl_jobs")
@ -20,13 +25,43 @@ export const supabaseGetJobById = async (jobId: string) => {
return data;
};
/**
* Get multiple firecrawl_jobs by ID. Use this if you're not requesting a lot (50+) of jobs at once.
* @param jobIds IDs of Jobs
* @returns {any[]} Jobs
*/
export const supabaseGetJobsById = async (jobIds: string[]) => {
const { data, error } = await supabase_service.rpc("get_jobs_by_ids", {
job_ids: jobIds,
});
const { data, error } = await supabase_service
.from("firecrawl_jobs")
.select()
.in("job_id", jobIds);
if (error) {
Logger.error(`Error in get_jobs_by_ids: ${error}`);
Logger.error(`Error in supabaseGetJobsById: ${error}`);
Sentry.captureException(error);
return [];
}
if (!data) {
return [];
}
return data;
};
/**
* Get multiple firecrawl_jobs by crawl ID. Use this if you need a lot of jobs at once.
* @param crawlId ID of crawl
* @returns {any[]} Jobs
*/
export const supabaseGetJobsByCrawlId = async (crawlId: string) => {
const { data, error } = await supabase_service
.from("firecrawl_jobs")
.select()
.eq("crawl_id", crawlId)
if (error) {
Logger.error(`Error in supabaseGetJobsByCrawlId: ${error}`);
Sentry.captureException(error);
return [];
}
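With both helpers in place, callers can pick by workload. The only guidance in the diff is the docstrings' 50-job rule of thumb, so the threshold below is an assumption for illustration, not firecrawl policy:

```ts
// Signatures mirror the diff; bodies live in supabase-jobs.ts.
declare function supabaseGetJobsById(jobIds: string[]): Promise<any[]>;
declare function supabaseGetJobsByCrawlId(crawlId: string): Promise<any[]>;

// Small batches go through the id list; large crawls use the crawl_id index.
async function fetchJobs(crawlId: string, jobIds: string[]): Promise<any[]> {
  return jobIds.length < 50
    ? supabaseGetJobsById(jobIds)
    : supabaseGetJobsByCrawlId(crawlId);
}
```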

View File

@ -83,7 +83,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
}
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (req.body.url && isUrlBlocked(req.body.url)) {
if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
if (!res.headersSent) {
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
}

View File

@ -55,7 +55,7 @@ export async function scrapWithFireEngine({
try {
const reqParams = await generateRequestParams(url);
let waitParam = reqParams["params"]?.wait ?? waitFor;
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
let engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "chrome-cdp";
let screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
let fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
let fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
@ -127,7 +127,7 @@ export async function scrapWithFireEngine({
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
await new Promise(resolve => setTimeout(resolve, 250)); // wait 0.25 seconds
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
}
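The fire-engine change tightens the status poll from 1000 ms to 250 ms while keeping the overall deadline. A generic sketch of that loop; `baseUrl`, `jobId`, and the `processing` flag mirror the diff, everything else is placeholder:

```ts
import axios from "axios";

// Poll a scrape job until it stops processing or the deadline passes.
async function pollScrapeJob(baseUrl: string, jobId: string, timeoutMs: number) {
  const startTime = Date.now();
  let status = await axios.get(`${baseUrl}/scrape/${jobId}`);
  while (status.data.processing && Date.now() - startTime < timeoutMs) {
    await new Promise((resolve) => setTimeout(resolve, 250)); // was 1000 ms
    status = await axios.get(`${baseUrl}/scrape/${jobId}`);
  }
  return status.data;
}
```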

View File

@ -96,15 +96,15 @@ function getScrapingFallbackOrder(
"fetch",
].filter(Boolean);
if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
defaultOrder = [
"fire-engine",
useFireEngine ? undefined : "playwright",
...defaultOrder.filter(
(scraper) => scraper !== "fire-engine" && scraper !== "playwright"
),
].filter(Boolean);
}
// if (isWaitPresent || isScreenshotPresent || isHeadersPresent) {
// defaultOrder = [
// "fire-engine",
// useFireEngine ? undefined : "playwright",
// ...defaultOrder.filter(
// (scraper) => scraper !== "fire-engine" && scraper !== "playwright"
// ),
// ].filter(Boolean);
// }
const filteredDefaultOrder = defaultOrder.filter(
(scraper: (typeof baseScrapers)[number]) =>

View File

@ -242,5 +242,13 @@ export const urlSpecificParams = {
engine: "chrome-cdp",
},
},
},
"lorealparis.hu":{
defaultScraper: "fire-engine",
params:{
fireEngineOptions:{
engine: "tlsclient",
},
},
}
};

View File

@ -39,16 +39,8 @@ export const excludeNonMainTags = [
"#search",
".share",
"#share",
".pagination",
"#pagination",
".widget",
"#widget",
".related",
"#related",
".tag",
"#tag",
".category",
"#category",
".cookie",
"#cookie"
];

View File

@ -186,7 +186,8 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
getValue(cacheKeyCoupons)
]);
let subscription, subscriptionError, coupons;
let subscription, subscriptionError;
let coupons : {credits: number}[];
if (cachedSubscription && cachedCoupons) {
subscription = JSON.parse(cachedSubscription);
@ -225,16 +226,16 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
);
}
// If there are available coupons and they are enough for the operation
if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
}
// Free credits, no coupons
if (!subscription || subscriptionError) {
// If there is no active subscription but there are available coupons
if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
}
let creditUsages;
let creditUsageError;
let totalCreditsUsed = 0;
@ -251,6 +252,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
const retryInterval = 2000; // 2 seconds
while (retries < maxRetries) {
// Reminder, this has an 1000 limit.
const result = await supabase_service
.from("credit_usage")
.select("credits_used")
@ -292,7 +294,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
end.setDate(end.getDate() + 30);
// check if usage is within 80% of the limit
const creditLimit = FREE_CREDITS;
const creditUsagePercentage = (totalCreditsUsed + credits) / creditLimit;
const creditUsagePercentage = totalCreditsUsed / creditLimit;
// Add a check to ensure totalCreditsUsed is greater than 0
if (totalCreditsUsed > 0 && creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
@ -306,7 +308,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
}
// 5. Compare the total credits used with the credits allowed by the plan.
if (totalCreditsUsed + credits > FREE_CREDITS) {
if (totalCreditsUsed >= FREE_CREDITS) {
// Send email notification for insufficient credits
await sendNotification(
team_id,
@ -366,7 +368,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
// Get the price details from cache or database
const priceCacheKey = `price_${subscription.price_id}`;
let price;
let price : {credits: number};
try {
const cachedPrice = await getValue(priceCacheKey);
@ -394,29 +396,31 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
Logger.error(`Error retrieving or caching price: ${error}`);
Sentry.captureException(error);
// If errors, just assume it's a big number so user don't get an error
price = { credits: 1000000 };
price = { credits: 10000000 };
}
const creditLimit = price.credits;
const creditUsagePercentage = (adjustedCreditsUsed + credits) / creditLimit;
// Removal of + credits
const creditUsagePercentage = adjustedCreditsUsed / creditLimit;
// Compare the adjusted total credits used with the credits allowed by the plan
if (adjustedCreditsUsed + credits > price.credits) {
// await sendNotification(
// team_id,
// NotificationType.LIMIT_REACHED,
// subscription.current_period_start,
// subscription.current_period_end
// );
if (adjustedCreditsUsed >= price.credits) {
await sendNotification(
team_id,
NotificationType.LIMIT_REACHED,
subscription.current_period_start,
subscription.current_period_end
);
return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
} else if (creditUsagePercentage >= 0.8) {
} else if (creditUsagePercentage >= 0.8 && creditUsagePercentage < 1) {
// Send email notification for approaching credit limit
// await sendNotification(
// team_id,
// NotificationType.APPROACHING_LIMIT,
// subscription.current_period_start,
// subscription.current_period_end
// );
await sendNotification(
team_id,
NotificationType.APPROACHING_LIMIT,
subscription.current_period_start,
subscription.current_period_end
);
}
return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };
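The credit-check edits move two thresholds: the hard stop now compares usage alone, without adding the pending request's credits, and the approaching-limit warning fires only while usage is strictly under 100%. A sketch of the resulting decision logic:

```ts
// Condensed view of the new thresholds; emails and Supabase reads omitted.
function creditCheck(used: number, limit: number): "insufficient" | "approaching" | "ok" {
  const pct = used / limit;
  if (used >= limit) return "insufficient";        // hard stop + LIMIT_REACHED email
  if (pct >= 0.8 && pct < 1) return "approaching"; // APPROACHING_LIMIT email
  return "ok";
}
```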

View File

@ -16,6 +16,14 @@ export function getScrapeQueue() {
scrapeQueueName,
{
connection: redisConnection,
defaultJobOptions: {
removeOnComplete: {
age: 90000, // 25 hours
},
removeOnFail: {
age: 90000, // 25 hours
},
},
}
// {
// settings: {
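The queue now sets default retention so finished and failed jobs are evicted after 90000 seconds (about 25 hours). A minimal BullMQ sketch with a placeholder queue name and Redis connection:

```ts
import { Queue } from "bullmq";
import IORedis from "ioredis";

const connection = new IORedis(process.env.REDIS_URL ?? "redis://localhost:6379", {
  maxRetriesPerRequest: null, // BullMQ requires this on its connections
});

const scrapeQueue = new Queue("scrapeQueue", {
  connection,
  defaultJobOptions: {
    removeOnComplete: { age: 90000 }, // seconds, ~25 hours
    removeOnFail: { age: 90000 },
  },
});
```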

View File

@ -448,11 +448,13 @@ async function processJob(job: Job, token: string) {
} catch (error) {
Logger.error(`🐂 Job errored ${job.id} - ${error}`);
Sentry.captureException(error, {
data: {
job: job.id,
},
});
if (!(error instanceof Error && error.message.includes("JSON parsing error(s): "))) {
Sentry.captureException(error, {
data: {
job: job.id,
},
});
}
if (error instanceof CustomError) {
// Here we handle the error, then save the failed job
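The worker change keeps expected, user-caused failures out of Sentry so alerting stays meaningful. A sketch of the filter, assuming only the LLM JSON-parsing message counts as expected, as in the diff:

```ts
import * as Sentry from "@sentry/node";

function reportJobError(jobId: string | undefined, error: unknown): void {
  const isExpected =
    error instanceof Error && error.message.includes("JSON parsing error(s): ");
  if (!isExpected) {
    // Same hint shape as the diff, so the job id travels with the event.
    Sentry.captureException(error, { data: { job: jobId } });
  }
}
```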

View File

@ -6,7 +6,7 @@ const RATE_LIMITS = {
crawl: {
default: 3,
free: 2,
starter: 3,
starter: 10,
standard: 5,
standardOld: 40,
scale: 50,
@ -19,9 +19,9 @@ const RATE_LIMITS = {
scrape: {
default: 20,
free: 10,
starter: 20,
starter: 100,
standard: 100,
standardOld: 40,
standardOld: 100,
scale: 500,
hobby: 20,
standardNew: 100,
@ -32,8 +32,8 @@ const RATE_LIMITS = {
search: {
default: 20,
free: 5,
starter: 20,
standard: 40,
starter: 50,
standard: 50,
standardOld: 40,
scale: 500,
hobby: 10,
@ -45,9 +45,9 @@ const RATE_LIMITS = {
map:{
default: 20,
free: 5,
starter: 20,
standard: 40,
standardOld: 40,
starter: 50,
standard: 50,
standardOld: 50,
scale: 500,
hobby: 10,
standardNew: 50,
@ -104,6 +104,13 @@ export const devBRateLimiter = new RateLimiterRedis({
duration: 60, // Duration in seconds
});
export const manualRateLimiter = new RateLimiterRedis({
storeClient: redisRateLimitClient,
keyPrefix: "manual",
points: 2000,
duration: 60, // Duration in seconds
});
export const scrapeStatusRateLimiter = new RateLimiterRedis({
storeClient: redisRateLimitClient,
@ -112,14 +119,18 @@ export const scrapeStatusRateLimiter = new RateLimiterRedis({
duration: 60, // Duration in seconds
});
const testSuiteTokens = ["a01ccae", "6254cf9", "0f96e673", "23befa1b", "69141c4"];
const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];
export function getRateLimiter(
mode: RateLimiterMode,
token: string,
plan?: string,
teamId?: string
) {
if (token.includes("a01ccae") || token.includes("6254cf9") || token.includes("0f96e673") || token.includes("23befa1b")) {
if (testSuiteTokens.some(testToken => token.includes(testToken))) {
return testSuiteRateLimiter;
}
@ -127,6 +138,10 @@ export function getRateLimiter(
return devBRateLimiter;
}
if(teamId && manual.includes(teamId)) {
return manualRateLimiter;
}
const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5}
if (!rateLimitConfig) return serverRateLimiter;
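The rate-limiter hunks add a manual override tier: a dedicated 2000 requests/minute limiter selected by team ID before the per-plan table is consulted. A sketch with rate-limiter-flexible, using a placeholder team ID and connection:

```ts
import { RateLimiterRedis } from "rate-limiter-flexible";
import Redis from "ioredis";

const redisRateLimitClient = new Redis(
  process.env.REDIS_RATE_LIMIT_URL ?? "redis://localhost:6379"
);

const manualRateLimiter = new RateLimiterRedis({
  storeClient: redisRateLimitClient,
  keyPrefix: "manual",
  points: 2000, // requests
  duration: 60, // per 60 seconds
});

// Placeholder allow-list; the real one holds specific team UUIDs.
const manualTeamIds = new Set(["team-id-with-manual-override"]);

function pickLimiter(teamId: string | undefined): RateLimiterRedis | undefined {
  return teamId && manualTeamIds.has(teamId) ? manualRateLimiter : undefined;
}
```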

View File

@ -1,4 +1,4 @@
import FirecrawlApp from '@mendable/firecrawl-js';
import FirecrawlApp from 'firecrawl';
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});


View File

@ -1,4 +1,4 @@
import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from '@mendable/firecrawl-js';
import FirecrawlApp, { CrawlStatusResponse, ErrorResponse } from 'firecrawl';
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

View File

@ -1,347 +0,0 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.CrawlWatcher = void 0;
const axios_1 = __importDefault(require("axios"));
const zod_to_json_schema_1 = require("zod-to-json-schema");
const isows_1 = require("isows");
const typescript_event_target_1 = require("typescript-event-target");
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
*/
class FirecrawlApp {
/**
* Initializes a new instance of the FirecrawlApp class.
* @param config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey = null, apiUrl = null }) {
this.apiKey = apiKey || "";
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
}
/**
* Scrapes a URL using the Firecrawl API.
* @param url - The URL to scrape.
* @param params - Additional parameters for the scrape request.
* @returns The response from the scrape operation.
*/
async scrapeUrl(url, params) {
const headers = {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
};
let jsonData = { url, ...params };
if (jsonData?.extract?.schema) {
let schema = jsonData.extract.schema;
// Try parsing the schema as a Zod schema
try {
schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema);
}
catch (error) {
}
jsonData = {
...jsonData,
extract: {
...jsonData.extract,
schema: schema,
},
};
}
try {
const response = await axios_1.default.post(this.apiUrl + `/v1/scrape`, jsonData, { headers });
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return {
success: true,
warning: responseData.warning,
error: responseData.error,
...responseData.data
};
}
else {
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
}
}
else {
this.handleError(response, "scrape URL");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: "Internal server error." };
}
/**
* This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
* @param query - The search query string.
* @param params - Additional parameters for the search.
* @returns Throws an error advising to use version 0 of the API.
*/
async search(query, params) {
throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.");
}
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param url - The URL to crawl.
* @param params - Additional parameters for the crawl request.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns The response from the crawl operation.
*/
async crawlUrl(url, params, pollInterval = 2, idempotencyKey) {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData = { url, ...params };
try {
const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers);
if (response.status === 200) {
const id = response.data.id;
return this.monitorJobStatus(id, headers, pollInterval);
}
else {
this.handleError(response, "start crawl job");
}
}
catch (error) {
if (error.response?.data?.error) {
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
}
else {
throw new Error(error.message);
}
}
return { success: false, error: "Internal server error." };
}
async asyncCrawlUrl(url, params, idempotencyKey) {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData = { url, ...params };
try {
const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers);
if (response.status === 200) {
return response.data;
}
else {
this.handleError(response, "start crawl job");
}
}
catch (error) {
if (error.response?.data?.error) {
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
}
else {
throw new Error(error.message);
}
}
return { success: false, error: "Internal server error." };
}
/**
* Checks the status of a crawl job using the Firecrawl API.
* @param id - The ID of the crawl operation.
* @returns The response containing the job status.
*/
async checkCrawlStatus(id) {
if (!id) {
throw new Error("No crawl ID provided");
}
const headers = this.prepareHeaders();
try {
const response = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers);
if (response.status === 200) {
return ({
success: true,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: response.data.data,
error: response.data.error
});
}
else {
this.handleError(response, "check crawl status");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: "Internal server error." };
}
async crawlUrlAndWatch(url, params, idempotencyKey) {
const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey);
if (crawl.success && crawl.id) {
const id = crawl.id;
return new CrawlWatcher(id, this);
}
throw new Error("Crawl job failed to start");
}
async mapUrl(url, params) {
const headers = this.prepareHeaders();
let jsonData = { url, ...params };
try {
const response = await this.postRequest(this.apiUrl + `/v1/map`, jsonData, headers);
if (response.status === 200) {
return response.data;
}
else {
this.handleError(response, "map");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: "Internal server error." };
}
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.
* @returns The prepared headers.
*/
prepareHeaders(idempotencyKey) {
return {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
};
}
/**
* Sends a POST request to the specified URL.
* @param url - The URL to send the request to.
* @param data - The data to send in the request.
* @param headers - The headers for the request.
* @returns The response from the POST request.
*/
postRequest(url, data, headers) {
return axios_1.default.post(url, data, { headers });
}
/**
* Sends a GET request to the specified URL.
* @param url - The URL to send the request to.
* @param headers - The headers for the request.
* @returns The response from the GET request.
*/
getRequest(url, headers) {
return axios_1.default.get(url, { headers });
}
/**
* Monitors the status of a crawl job until completion or failure.
* @param id - The ID of the crawl operation.
* @param headers - The headers for the request.
* @param checkInterval - Interval in seconds for job status checks.
* @param checkUrl - Optional URL to check the status (used for v1 API)
* @returns The final job status or data.
*/
async monitorJobStatus(id, headers, checkInterval) {
while (true) {
const statusResponse = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers);
if (statusResponse.status === 200) {
const statusData = statusResponse.data;
if (statusData.status === "completed") {
if ("data" in statusData) {
return statusData;
}
else {
throw new Error("Crawl job completed but no data was returned");
}
}
else if (["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)) {
checkInterval = Math.max(checkInterval, 2);
await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000));
}
else {
throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
}
}
else {
this.handleError(statusResponse, "check crawl status");
}
}
}
/**
* Handles errors from API responses.
* @param {AxiosResponse} response - The response from the API.
* @param {string} action - The action being performed when the error occurred.
*/
handleError(response, action) {
if ([402, 408, 409, 500].includes(response.status)) {
const errorMessage = response.data.error || "Unknown error occurred";
throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
}
else {
throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`);
}
}
}
exports.default = FirecrawlApp;
class CrawlWatcher extends typescript_event_target_1.TypedEventTarget {
constructor(id, app) {
super();
this.ws = new isows_1.WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
this.status = "scraping";
this.data = [];
const messageHandler = (msg) => {
if (msg.type === "done") {
this.status = "completed";
this.dispatchTypedEvent("done", new CustomEvent("done", {
detail: {
status: this.status,
data: this.data,
},
}));
}
else if (msg.type === "error") {
this.status = "failed";
this.dispatchTypedEvent("error", new CustomEvent("error", {
detail: {
status: this.status,
data: this.data,
error: msg.error,
},
}));
}
else if (msg.type === "catchup") {
this.status = msg.data.status;
this.data.push(...(msg.data.data ?? []));
for (const doc of this.data) {
this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: doc,
}));
}
}
else if (msg.type === "document") {
this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: msg.data,
}));
}
};
this.ws.onmessage = ((ev) => {
if (typeof ev.data !== "string") {
this.ws.close();
return;
}
const msg = JSON.parse(ev.data);
messageHandler(msg);
}).bind(this);
this.ws.onclose = ((ev) => {
const msg = JSON.parse(ev.reason);
messageHandler(msg);
}).bind(this);
this.ws.onerror = ((_) => {
this.status = "failed";
this.dispatchTypedEvent("error", new CustomEvent("error", {
detail: {
status: this.status,
data: this.data,
error: "WebSocket error",
},
}));
}).bind(this);
}
close() {
this.ws.close();
}
}
exports.CrawlWatcher = CrawlWatcher;

View File

@ -1 +0,0 @@
{"type": "commonjs"}

View File

@ -1,339 +0,0 @@
import axios from "axios";
import { zodToJsonSchema } from "zod-to-json-schema";
import { WebSocket } from "isows";
import { TypedEventTarget } from "typescript-event-target";
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
*/
export default class FirecrawlApp {
/**
* Initializes a new instance of the FirecrawlApp class.
* @param config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey = null, apiUrl = null }) {
this.apiKey = apiKey || "";
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
}
/**
* Scrapes a URL using the Firecrawl API.
* @param url - The URL to scrape.
* @param params - Additional parameters for the scrape request.
* @returns The response from the scrape operation.
*/
async scrapeUrl(url, params) {
const headers = {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
};
let jsonData = { url, ...params };
if (jsonData?.extract?.schema) {
let schema = jsonData.extract.schema;
// Try parsing the schema as a Zod schema
try {
schema = zodToJsonSchema(schema);
}
catch (error) {
}
jsonData = {
...jsonData,
extract: {
...jsonData.extract,
schema: schema,
},
};
}
try {
const response = await axios.post(this.apiUrl + `/v1/scrape`, jsonData, { headers });
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return {
success: true,
warning: responseData.warning,
error: responseData.error,
...responseData.data
};
}
else {
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
}
}
else {
this.handleError(response, "scrape URL");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: "Internal server error." };
}
/**
* This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
* @param query - The search query string.
* @param params - Additional parameters for the search.
* @returns Throws an error advising to use version 0 of the API.
*/
async search(query, params) {
throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.");
}
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param url - The URL to crawl.
* @param params - Additional parameters for the crawl request.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns The response from the crawl operation.
*/
async crawlUrl(url, params, pollInterval = 2, idempotencyKey) {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData = { url, ...params };
try {
const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers);
if (response.status === 200) {
const id = response.data.id;
return this.monitorJobStatus(id, headers, pollInterval);
}
else {
this.handleError(response, "start crawl job");
}
}
catch (error) {
if (error.response?.data?.error) {
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
}
else {
throw new Error(error.message);
}
}
return { success: false, error: "Internal server error." };
}
async asyncCrawlUrl(url, params, idempotencyKey) {
const headers = this.prepareHeaders(idempotencyKey);
let jsonData = { url, ...params };
try {
const response = await this.postRequest(this.apiUrl + `/v1/crawl`, jsonData, headers);
if (response.status === 200) {
return response.data;
}
else {
this.handleError(response, "start crawl job");
}
}
catch (error) {
if (error.response?.data?.error) {
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
}
else {
throw new Error(error.message);
}
}
return { success: false, error: "Internal server error." };
}
/**
* Checks the status of a crawl job using the Firecrawl API.
* @param id - The ID of the crawl operation.
* @returns The response containing the job status.
*/
async checkCrawlStatus(id) {
if (!id) {
throw new Error("No crawl ID provided");
}
const headers = this.prepareHeaders();
try {
const response = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers);
if (response.status === 200) {
return ({
success: true,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: response.data.data,
error: response.data.error
});
}
else {
this.handleError(response, "check crawl status");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: "Internal server error." };
}
async crawlUrlAndWatch(url, params, idempotencyKey) {
const crawl = await this.asyncCrawlUrl(url, params, idempotencyKey);
if (crawl.success && crawl.id) {
const id = crawl.id;
return new CrawlWatcher(id, this);
}
throw new Error("Crawl job failed to start");
}
async mapUrl(url, params) {
const headers = this.prepareHeaders();
let jsonData = { url, ...params };
try {
const response = await this.postRequest(this.apiUrl + `/v1/map`, jsonData, headers);
if (response.status === 200) {
return response.data;
}
else {
this.handleError(response, "map");
}
}
catch (error) {
throw new Error(error.message);
}
return { success: false, error: "Internal server error." };
}
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.
* @returns The prepared headers.
*/
prepareHeaders(idempotencyKey) {
return {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
...(idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {}),
};
}
/**
* Sends a POST request to the specified URL.
* @param url - The URL to send the request to.
* @param data - The data to send in the request.
* @param headers - The headers for the request.
* @returns The response from the POST request.
*/
postRequest(url, data, headers) {
return axios.post(url, data, { headers });
}
/**
* Sends a GET request to the specified URL.
* @param url - The URL to send the request to.
* @param headers - The headers for the request.
* @returns The response from the GET request.
*/
getRequest(url, headers) {
return axios.get(url, { headers });
}
/**
* Monitors the status of a crawl job until completion or failure.
* @param id - The ID of the crawl operation.
* @param headers - The headers for the request.
* @param checkInterval - Interval in seconds for job status checks.
* @param checkUrl - Optional URL to check the status (used for v1 API)
* @returns The final job status or data.
*/
async monitorJobStatus(id, headers, checkInterval) {
while (true) {
const statusResponse = await this.getRequest(`${this.apiUrl}/v1/crawl/${id}`, headers);
if (statusResponse.status === 200) {
const statusData = statusResponse.data;
if (statusData.status === "completed") {
if ("data" in statusData) {
return statusData;
}
else {
throw new Error("Crawl job completed but no data was returned");
}
}
else if (["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)) {
checkInterval = Math.max(checkInterval, 2);
await new Promise((resolve) => setTimeout(resolve, checkInterval * 1000));
}
else {
throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`);
}
}
else {
this.handleError(statusResponse, "check crawl status");
}
}
}
/**
* Handles errors from API responses.
* @param {AxiosResponse} response - The response from the API.
* @param {string} action - The action being performed when the error occurred.
*/
handleError(response, action) {
if ([402, 408, 409, 500].includes(response.status)) {
const errorMessage = response.data.error || "Unknown error occurred";
throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`);
}
else {
throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`);
}
}
}
export class CrawlWatcher extends TypedEventTarget {
constructor(id, app) {
super();
this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
this.status = "scraping";
this.data = [];
const messageHandler = (msg) => {
if (msg.type === "done") {
this.status = "completed";
this.dispatchTypedEvent("done", new CustomEvent("done", {
detail: {
status: this.status,
data: this.data,
},
}));
}
else if (msg.type === "error") {
this.status = "failed";
this.dispatchTypedEvent("error", new CustomEvent("error", {
detail: {
status: this.status,
data: this.data,
error: msg.error,
},
}));
}
else if (msg.type === "catchup") {
this.status = msg.data.status;
this.data.push(...(msg.data.data ?? []));
for (const doc of this.data) {
this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: doc,
}));
}
}
else if (msg.type === "document") {
this.dispatchTypedEvent("document", new CustomEvent("document", {
detail: msg.data,
}));
}
};
this.ws.onmessage = ((ev) => {
if (typeof ev.data !== "string") {
this.ws.close();
return;
}
const msg = JSON.parse(ev.data);
messageHandler(msg);
}).bind(this);
this.ws.onclose = ((ev) => {
const msg = JSON.parse(ev.reason);
messageHandler(msg);
}).bind(this);
this.ws.onerror = ((_) => {
this.status = "failed";
this.dispatchTypedEvent("error", new CustomEvent("error", {
detail: {
status: this.status,
data: this.data,
error: "WebSocket error",
},
}));
}).bind(this);
}
close() {
this.ws.close();
}
}

View File

@ -1 +0,0 @@
{"type": "module"}

File diff suppressed because it is too large.

View File

@ -1,22 +1,19 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.2.2",
"version": "1.4.2",
"description": "JavaScript SDK for Firecrawl API",
"main": "build/cjs/index.js",
"types": "types/index.d.ts",
"type": "module",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"exports": {
"require": {
"types": "./types/index.d.ts",
"default": "./build/cjs/index.js"
},
"import": {
"types": "./types/index.d.ts",
"default": "./build/esm/index.js"
"./package.json": "./package.json",
".": {
"import": "./dist/index.js",
"default": "./dist/index.cjs"
}
},
"type": "module",
"scripts": {
"build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json",
"build": "tsup",
"build-and-publish": "npm run build && npm publish --access public",
"publish-beta": "npm run build && npm publish --access public --tag beta",
"test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/v1/**/*.test.ts"
@ -29,10 +26,8 @@
"license": "MIT",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
},
@ -41,6 +36,8 @@
},
"homepage": "https://github.com/mendableai/firecrawl#readme",
"devDependencies": {
"uuid": "^9.0.1",
"dotenv": "^16.4.5",
"@jest/globals": "^29.7.0",
"@types/axios": "^0.14.0",
"@types/dotenv": "^8.2.0",
@ -50,6 +47,7 @@
"@types/uuid": "^9.0.8",
"jest": "^29.7.0",
"ts-jest": "^29.2.2",
"tsup": "^8.2.4",
"typescript": "^5.4.5"
},
"keywords": [

View File

@ -1,4 +1,4 @@
import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
import FirecrawlApp, { type CrawlParams, type CrawlResponse, type CrawlStatusResponse, type MapResponse, type ScrapeResponse } from '../../../index';
import { v4 as uuidv4 } from 'uuid';
import dotenv from 'dotenv';
import { describe, test, expect } from '@jest/globals';
@ -6,7 +6,7 @@ import { describe, test, expect } from '@jest/globals';
dotenv.config();
const TEST_API_KEY = process.env.TEST_API_KEY;
const API_URL = "http://127.0.0.1:3002";
const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
describe('FirecrawlApp E2E Tests', () => {
test.concurrent('should throw error for no API key', async () => {
@ -71,6 +71,7 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response.links?.length).toBeGreaterThan(0);
expect(response.links?.[0]).toContain("https://");
expect(response.metadata).not.toBeNull();
expect(response.metadata).not.toBeUndefined();
expect(response.metadata).toHaveProperty("title");
expect(response.metadata).toHaveProperty("description");
expect(response.metadata).toHaveProperty("keywords");
@ -85,19 +86,21 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response.metadata).not.toHaveProperty("pageStatusCode");
expect(response.metadata).toHaveProperty("statusCode");
expect(response.metadata).not.toHaveProperty("pageError");
expect(response.metadata.error).toBeUndefined();
expect(response.metadata.title).toBe("Roast My Website");
expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
expect(response.metadata.robots).toBe("follow, index");
expect(response.metadata.ogTitle).toBe("Roast My Website");
expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
expect(response.metadata.ogLocaleAlternate).toStrictEqual([]);
expect(response.metadata.ogSiteName).toBe("Roast My Website");
expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai");
expect(response.metadata.statusCode).toBe(200);
if (response.metadata !== undefined) {
expect(response.metadata.error).toBeUndefined();
expect(response.metadata.title).toBe("Roast My Website");
expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
expect(response.metadata.robots).toBe("follow, index");
expect(response.metadata.ogTitle).toBe("Roast My Website");
expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
expect(response.metadata.ogLocaleAlternate).toStrictEqual([]);
expect(response.metadata.ogSiteName).toBe("Roast My Website");
expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai");
expect(response.metadata.statusCode).toBe(200);
}
}, 30000); // 30 seconds timeout
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
@ -127,7 +130,7 @@ describe('FirecrawlApp E2E Tests', () => {
test.concurrent('should return successful response for crawl and wait for completion', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse;
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, 30) as CrawlStatusResponse;
expect(response).not.toBeNull();
expect(response).toHaveProperty("total");
expect(response.total).toBeGreaterThan(0);
@ -138,21 +141,25 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response).toHaveProperty("status");
expect(response.status).toBe("completed");
expect(response).not.toHaveProperty("next"); // wait until done
expect(response.data?.length).toBeGreaterThan(0);
expect(response.data?.[0]).toHaveProperty("markdown");
expect(response.data?.[0].markdown).toContain("_Roast_");
expect(response.data?.[0]).not.toHaveProperty('content'); // v0
expect(response.data?.[0]).not.toHaveProperty("html");
expect(response.data?.[0]).not.toHaveProperty("rawHtml");
expect(response.data?.[0]).not.toHaveProperty("screenshot");
expect(response.data?.[0]).not.toHaveProperty("links");
expect(response.data?.[0]).toHaveProperty("metadata");
expect(response.data?.[0].metadata).toHaveProperty("title");
expect(response.data?.[0].metadata).toHaveProperty("description");
expect(response.data?.[0].metadata).toHaveProperty("language");
expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
expect(response.data?.[0].metadata).toHaveProperty("statusCode");
expect(response.data?.[0].metadata).not.toHaveProperty("error");
expect(response.data.length).toBeGreaterThan(0);
expect(response.data[0]).not.toBeNull();
expect(response.data[0]).not.toBeUndefined();
if (response.data[0]) {
expect(response.data[0]).toHaveProperty("markdown");
expect(response.data[0].markdown).toContain("_Roast_");
expect(response.data[0]).not.toHaveProperty('content'); // v0
expect(response.data[0]).not.toHaveProperty("html");
expect(response.data[0]).not.toHaveProperty("rawHtml");
expect(response.data[0]).not.toHaveProperty("screenshot");
expect(response.data[0]).not.toHaveProperty("links");
expect(response.data[0]).toHaveProperty("metadata");
expect(response.data[0].metadata).toHaveProperty("title");
expect(response.data[0].metadata).toHaveProperty("description");
expect(response.data[0].metadata).toHaveProperty("language");
expect(response.data[0].metadata).toHaveProperty("sourceURL");
expect(response.data[0].metadata).toHaveProperty("statusCode");
expect(response.data[0].metadata).not.toHaveProperty("error");
}
}, 60000); // 60 seconds timeout
test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
@ -173,7 +180,7 @@ describe('FirecrawlApp E2E Tests', () => {
onlyMainContent: true,
waitFor: 1000
}
} as CrawlParams, true, 30) as CrawlStatusResponse;
} as CrawlParams, 30) as CrawlStatusResponse;
expect(response).not.toBeNull();
expect(response).toHaveProperty("total");
expect(response.total).toBeGreaterThan(0);
@ -184,41 +191,45 @@ describe('FirecrawlApp E2E Tests', () => {
expect(response).toHaveProperty("status");
expect(response.status).toBe("completed");
expect(response).not.toHaveProperty("next");
expect(response.data?.length).toBeGreaterThan(0);
expect(response.data?.[0]).toHaveProperty("markdown");
expect(response.data?.[0].markdown).toContain("_Roast_");
expect(response.data?.[0]).not.toHaveProperty('content'); // v0
expect(response.data?.[0]).toHaveProperty("html");
expect(response.data?.[0].html).toContain("<h1");
expect(response.data?.[0]).toHaveProperty("rawHtml");
expect(response.data?.[0].rawHtml).toContain("<h1");
expect(response.data?.[0]).toHaveProperty("screenshot");
expect(response.data?.[0].screenshot).toContain("https://");
expect(response.data?.[0]).toHaveProperty("links");
expect(response.data?.[0].links).not.toBeNull();
expect(response.data?.[0].links?.length).toBeGreaterThan(0);
expect(response.data?.[0]).toHaveProperty("metadata");
expect(response.data?.[0].metadata).toHaveProperty("title");
expect(response.data?.[0].metadata).toHaveProperty("description");
expect(response.data?.[0].metadata).toHaveProperty("language");
expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
expect(response.data?.[0].metadata).toHaveProperty("statusCode");
expect(response.data?.[0].metadata).not.toHaveProperty("error");
expect(response.data.length).toBeGreaterThan(0);
expect(response.data[0]).not.toBeNull();
expect(response.data[0]).not.toBeUndefined();
if (response.data[0]) {
expect(response.data[0]).toHaveProperty("markdown");
expect(response.data[0].markdown).toContain("_Roast_");
expect(response.data[0]).not.toHaveProperty('content'); // v0
expect(response.data[0]).toHaveProperty("html");
expect(response.data[0].html).toContain("<h1");
expect(response.data[0]).toHaveProperty("rawHtml");
expect(response.data[0].rawHtml).toContain("<h1");
expect(response.data[0]).toHaveProperty("screenshot");
expect(response.data[0].screenshot).toContain("https://");
expect(response.data[0]).toHaveProperty("links");
expect(response.data[0].links).not.toBeNull();
expect(response.data[0].links?.length).toBeGreaterThan(0);
expect(response.data[0]).toHaveProperty("metadata");
expect(response.data[0].metadata).toHaveProperty("title");
expect(response.data[0].metadata).toHaveProperty("description");
expect(response.data[0].metadata).toHaveProperty("language");
expect(response.data[0].metadata).toHaveProperty("sourceURL");
expect(response.data[0].metadata).toHaveProperty("statusCode");
expect(response.data[0].metadata).not.toHaveProperty("error");
}
}, 60000); // 60 seconds timeout
test.concurrent('should handle idempotency key for crawl', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const uniqueIdempotencyKey = uuidv4();
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
const response = await app.asyncCrawlUrl('https://roastmywebsite.ai', {}, uniqueIdempotencyKey) as CrawlResponse;
expect(response).not.toBeNull();
expect(response.id).toBeDefined();
await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
await expect(app.crawlUrl('https://roastmywebsite.ai', {}, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
});
test.concurrent('should check crawl status', async () => {
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
const response = await app.asyncCrawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams) as CrawlResponse;
expect(response).not.toBeNull();
expect(response.id).toBeDefined();
@ -226,7 +237,8 @@ describe('FirecrawlApp E2E Tests', () => {
const maxChecks = 15;
let checks = 0;
while (statusResponse.status === 'scraping' && checks < maxChecks) {
expect(statusResponse.success).toBe(true);
while ((statusResponse as any).status === 'scraping' && checks < maxChecks) {
await new Promise(resolve => setTimeout(resolve, 5000));
expect(statusResponse).not.toHaveProperty("partial_data"); // v0
expect(statusResponse).not.toHaveProperty("current"); // v0
@ -236,44 +248,55 @@ describe('FirecrawlApp E2E Tests', () => {
expect(statusResponse).toHaveProperty("expiresAt");
expect(statusResponse).toHaveProperty("status");
expect(statusResponse).toHaveProperty("next");
expect(statusResponse.total).toBeGreaterThan(0);
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
expect(statusResponse.status).toBe("scraping");
expect(statusResponse.next).toContain("/v1/crawl/");
expect(statusResponse.success).toBe(true);
if (statusResponse.success === true) {
expect(statusResponse.total).toBeGreaterThan(0);
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
expect(statusResponse.status).toBe("scraping");
expect(statusResponse.next).toContain("/v1/crawl/");
}
statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
expect(statusResponse.success).toBe(true);
checks++;
}
expect(statusResponse).not.toBeNull();
expect(statusResponse).toHaveProperty("total");
expect(statusResponse.total).toBeGreaterThan(0);
expect(statusResponse).toHaveProperty("creditsUsed");
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
expect(statusResponse).toHaveProperty("expiresAt");
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
expect(statusResponse).toHaveProperty("status");
expect(statusResponse.status).toBe("completed");
expect(statusResponse.data?.length).toBeGreaterThan(0);
expect(statusResponse.data?.[0]).toHaveProperty("markdown");
expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
expect(statusResponse.data?.[0]).toHaveProperty("html");
expect(statusResponse.data?.[0].html).toContain("<div");
expect(statusResponse.data?.[0]).toHaveProperty("rawHtml");
expect(statusResponse.data?.[0].rawHtml).toContain("<div");
expect(statusResponse.data?.[0]).toHaveProperty("screenshot");
expect(statusResponse.data?.[0].screenshot).toContain("https://");
expect(statusResponse.data?.[0]).toHaveProperty("links");
expect(statusResponse.data?.[0].links).not.toBeNull();
expect(statusResponse.data?.[0].links?.length).toBeGreaterThan(0);
expect(statusResponse.data?.[0]).toHaveProperty("metadata");
expect(statusResponse.data?.[0].metadata).toHaveProperty("title");
expect(statusResponse.data?.[0].metadata).toHaveProperty("description");
expect(statusResponse.data?.[0].metadata).toHaveProperty("language");
expect(statusResponse.data?.[0].metadata).toHaveProperty("sourceURL");
expect(statusResponse.data?.[0].metadata).toHaveProperty("statusCode");
expect(statusResponse.data?.[0].metadata).not.toHaveProperty("error");
expect(statusResponse.success).toBe(true);
if (statusResponse.success === true) {
expect(statusResponse.total).toBeGreaterThan(0);
expect(statusResponse).toHaveProperty("creditsUsed");
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
expect(statusResponse).toHaveProperty("expiresAt");
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
expect(statusResponse).toHaveProperty("status");
expect(statusResponse.status).toBe("completed");
expect(statusResponse.data.length).toBeGreaterThan(0);
expect(statusResponse.data[0]).not.toBeNull();
expect(statusResponse.data[0]).not.toBeUndefined();
if (statusResponse.data[0]) {
expect(statusResponse.data[0]).toHaveProperty("markdown");
expect(statusResponse.data[0].markdown?.length).toBeGreaterThan(10);
expect(statusResponse.data[0]).not.toHaveProperty('content'); // v0
expect(statusResponse.data[0]).toHaveProperty("html");
expect(statusResponse.data[0].html).toContain("<div");
expect(statusResponse.data[0]).toHaveProperty("rawHtml");
expect(statusResponse.data[0].rawHtml).toContain("<div");
expect(statusResponse.data[0]).toHaveProperty("screenshot");
expect(statusResponse.data[0].screenshot).toContain("https://");
expect(statusResponse.data[0]).toHaveProperty("links");
expect(statusResponse.data[0].links).not.toBeNull();
expect(statusResponse.data[0].links?.length).toBeGreaterThan(0);
expect(statusResponse.data[0]).toHaveProperty("metadata");
expect(statusResponse.data[0].metadata).toHaveProperty("title");
expect(statusResponse.data[0].metadata).toHaveProperty("description");
expect(statusResponse.data[0].metadata).toHaveProperty("language");
expect(statusResponse.data[0].metadata).toHaveProperty("sourceURL");
expect(statusResponse.data[0].metadata).toHaveProperty("statusCode");
expect(statusResponse.data[0].metadata).not.toHaveProperty("error");
}
}
}, 60000); // 60 seconds timeout
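For orientation, these assertions exercise a start-then-poll flow: kick off a crawl, poll `checkCrawlStatus`, then read the completed payload. A minimal sketch (the URL, limit, and poll interval are illustrative placeholders, not values from the suite):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

// Start a crawl without blocking, then poll its status by ID.
const started = await app.asyncCrawlUrl("https://example.com", { limit: 5 });
if (started.success && started.id) {
  let status = await app.checkCrawlStatus(started.id);
  while (status.success && status.status === "scraping") {
    await new Promise((r) => setTimeout(r, 2000)); // back off between polls
    status = await app.checkCrawlStatus(started.id);
  }
  if (status.success && status.status === "completed") {
    // Fields asserted above: total, creditsUsed, expiresAt, data[n].markdown, ...
    console.log(status.total, status.creditsUsed, status.data[0]?.markdown);
  }
}
```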
test.concurrent('should throw error for invalid API key on map', async () => {

View File

@@ -1,5 +1,5 @@
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod";
import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios";
import type * as zt from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
import { WebSocket } from "isows";
import { TypedEventTarget } from "typescript-event-target";
@@ -58,13 +58,13 @@ export interface FirecrawlDocumentMetadata {
* Document interface for Firecrawl.
* Represents a document retrieved or processed by Firecrawl.
*/
export interface FirecrawlDocument {
export interface FirecrawlDocument<T> {
url?: string;
markdown?: string;
html?: string;
rawHtml?: string;
links?: string[];
extract?: Record<any, any>;
extract?: T;
screenshot?: string;
metadata?: FirecrawlDocumentMetadata;
}
@@ -73,26 +73,29 @@ export interface FirecrawlDocument {
* Parameters for scraping operations.
* Defines the options and configurations available for scraping web content.
*/
export interface ScrapeParams {
export interface CrawlScrapeOptions {
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@screenshot")[];
headers?: Record<string, string>;
includeTags?: string[];
excludeTags?: string[];
onlyMainContent?: boolean;
extract?: {
prompt?: string;
schema?: z.ZodSchema | any;
systemPrompt?: string;
};
waitFor?: number;
timeout?: number;
}
export interface ScrapeParams<LLMSchema extends zt.ZodSchema> extends CrawlScrapeOptions {
extract?: {
prompt?: string;
schema?: LLMSchema;
systemPrompt?: string;
};
}
/**
* Response interface for scraping operations.
* Defines the structure of the response received after a scraping operation.
*/
export interface ScrapeResponse extends FirecrawlDocument {
export interface ScrapeResponse<LLMResult> extends FirecrawlDocument<LLMResult> {
success: true;
warning?: string;
error?: string;
@@ -110,7 +113,7 @@ export interface CrawlParams {
allowBackwardLinks?: boolean;
allowExternalLinks?: boolean;
ignoreSitemap?: boolean;
scrapeOptions?: ScrapeParams;
scrapeOptions?: CrawlScrapeOptions;
webhook?: string;
}
@@ -131,15 +134,14 @@
*/
export interface CrawlStatusResponse {
success: true;
total: number;
status: "scraping" | "completed" | "failed" | "cancelled";
completed: number;
total: number;
creditsUsed: number;
expiresAt: Date;
status: "scraping" | "completed" | "failed";
next: string;
data?: FirecrawlDocument[];
error?: string;
}
next?: string;
data: FirecrawlDocument<undefined>[];
};
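Worth noting in this hunk, compared against the deleted declaration file further down: `data` becomes required, `next` becomes optional (it is simply absent once the last page has been fetched), a `cancelled` status joins the union, and the `error` field is dropped from the success shape.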
/**
* Parameters for mapping operations.
@@ -184,7 +186,11 @@ export default class FirecrawlApp {
* @param config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
this.apiKey = apiKey || "";
if (typeof apiKey !== "string") {
throw new Error("No API key provided");
}
this.apiKey = apiKey;
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
}
@@ -194,10 +200,10 @@ export default class FirecrawlApp {
* @param params - Additional parameters for the scrape request.
* @returns The response from the scrape operation.
*/
async scrapeUrl(
async scrapeUrl<T extends zt.ZodSchema>(
url: string,
params?: ScrapeParams
): Promise<ScrapeResponse | ErrorResponse> {
params?: ScrapeParams<T>
): Promise<ScrapeResponse<zt.infer<T>> | ErrorResponse> {
const headers: AxiosRequestHeaders = {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
@@ -329,9 +335,10 @@ export default class FirecrawlApp {
/**
* Checks the status of a crawl job using the Firecrawl API.
* @param id - The ID of the crawl operation.
* @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`)
* @returns The response containing the job status.
*/
async checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse> {
async checkCrawlStatus(id?: string, getAllData = false): Promise<CrawlStatusResponse | ErrorResponse> {
if (!id) {
throw new Error("No crawl ID provided");
}
@@ -343,16 +350,28 @@
headers
);
if (response.status === 200) {
let allData = response.data.data;
if (getAllData && response.data.status === "completed") {
let statusData = response.data;
if ("data" in statusData) {
let data = statusData.data;
while ('next' in statusData) {
statusData = (await this.getRequest(statusData.next, headers)).data;
data = data.concat(statusData.data);
}
allData = data;
}
}
return ({
success: true,
success: response.data.success,
status: response.data.status,
total: response.data.total,
completed: response.data.completed,
creditsUsed: response.data.creditsUsed,
expiresAt: new Date(response.data.expiresAt),
next: response.data.next,
data: response.data.data,
error: response.data.error
data: allData,
error: response.data.error,
})
} else {
this.handleError(response, "check crawl status");
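The new `getAllData` flag is easiest to see in use. A minimal sketch, assuming `crawlId` came from an earlier `asyncCrawlUrl` call:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });
const crawlId = "..."; // placeholder: ID returned by asyncCrawlUrl

// With getAllData = true, a completed crawl is paginated through
// automatically: each `next` URL is followed and its documents concatenated.
const full = await app.checkCrawlStatus(crawlId, true);
if (full.success && full.status === "completed") {
  console.log(`fetched ${full.data.length} of ${full.total} documents`);
}
```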
@@ -433,11 +452,19 @@ export default class FirecrawlApp {
* @param headers - The headers for the request.
* @returns The response from the GET request.
*/
getRequest(
async getRequest(
url: string,
headers: AxiosRequestHeaders
): Promise<AxiosResponse> {
return axios.get(url, { headers });
try {
return await axios.get(url, { headers });
} catch (error) {
if (error instanceof AxiosError && error.response) {
return error.response as AxiosResponse;
} else {
throw error;
}
}
}
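This `getRequest` change complements the pagination logic above: non-2xx responses now come back as ordinary `AxiosResponse` objects instead of thrown `AxiosError`s, so callers can route every failure through `handleError` rather than wrapping each request in its own try/catch.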
/**
@@ -452,7 +479,7 @@
id: string,
headers: AxiosRequestHeaders,
checkInterval: number
): Promise<CrawlStatusResponse> {
): Promise<CrawlStatusResponse | ErrorResponse> {
while (true) {
let statusResponse: AxiosResponse = await this.getRequest(
`${this.apiUrl}/v1/crawl/${id}`,
@@ -460,20 +487,20 @@
);
if (statusResponse.status === 200) {
let statusData = statusResponse.data;
if (statusData.status === "completed") {
if ("data" in statusData) {
let data = statusData.data;
while ('next' in statusData) {
statusResponse = await this.getRequest(statusData.next, headers);
statusData = statusResponse.data;
data = data.concat(statusData.data);
if (statusData.status === "completed") {
if ("data" in statusData) {
let data = statusData.data;
while ('next' in statusData) {
statusResponse = await this.getRequest(statusData.next, headers);
statusData = statusResponse.data;
data = data.concat(statusData.data);
}
statusData.data = data;
return statusData;
} else {
throw new Error("Crawl job completed but no data was returned");
}
statusData.data = data;
return statusData;
} else {
throw new Error("Crawl job completed but no data was returned");
}
} else if (
} else if (
["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
) {
checkInterval = Math.max(checkInterval, 2);
@@ -512,21 +539,21 @@
}
interface CrawlWatcherEvents {
document: CustomEvent<FirecrawlDocument>,
document: CustomEvent<FirecrawlDocument<undefined>>,
done: CustomEvent<{
status: CrawlStatusResponse["status"];
data: FirecrawlDocument[];
data: FirecrawlDocument<undefined>[];
}>,
error: CustomEvent<{
status: CrawlStatusResponse["status"],
data: FirecrawlDocument[],
data: FirecrawlDocument<undefined>[],
error: string,
}>,
}
export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
private ws: WebSocket;
public data: FirecrawlDocument[];
public data: FirecrawlDocument<undefined>[];
public status: CrawlStatusResponse["status"];
constructor(id: string, app: FirecrawlApp) {
@@ -547,7 +574,7 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
type DocumentMessage = {
type: "document",
data: FirecrawlDocument,
data: FirecrawlDocument<undefined>,
}
type DoneMessage = { type: "done" }
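The headline change in this file is that `scrapeUrl` is now generic over a zod schema, so `extract` results are statically typed instead of `Record<any, any>`. A minimal sketch of the new call shape (the schema, prompt, and URL are illustrative placeholders):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

// The schema doubles as the extraction spec sent to the API and as the
// compile-time type of `scraped.extract`.
const articleSchema = z.object({
  title: z.string(),
  summary: z.string(),
});

const scraped = await app.scrapeUrl("https://example.com/post", {
  formats: ["markdown", "extract"],
  extract: { schema: articleSchema, prompt: "Summarize the article" },
});

if (scraped.success) {
  // Typed as { title: string; summary: string } | undefined
  console.log(scraped.extract?.title);
}
```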

View File

@@ -1,110 +1,24 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig to read more about this file */
// See https://www.totaltypescript.com/tsconfig-cheat-sheet
/* Base Options: */
"esModuleInterop": true,
"skipLibCheck": true,
"target": "es2022",
"allowJs": true,
"resolveJsonModule": true,
"moduleDetection": "force",
"isolatedModules": true,
"verbatimModuleSyntax": true,
/* Projects */
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
/* Strictness */
"strict": true,
"noUncheckedIndexedAccess": true,
"noImplicitOverride": true,
/* Language and Environment */
"target": "es2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
/* Modules */
"module": "commonjs", /* Specify what module code is generated. */
"rootDir": "./src", /* Specify the root folder within your source files. */
"moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
/* JavaScript Support */
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
/* Emit */
"declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
"outDir": "./build", /* Specify an output folder for all emitted files. */
// "removeComments": true, /* Disable emitting comments. */
// "noEmit": true, /* Disable emitting files from a compilation. */
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
// "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
// "newLine": "crlf", /* Set the newline character for emitting files. */
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
"declarationDir": "./types", /* Specify the output directory for generated declaration files. */
// "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */
/* Interop Constraints */
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
/* Type Checking */
"strict": true, /* Enable all strict type-checking options. */
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
/* Completeness */
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
/* If NOT transpiling with TypeScript: */
"module": "NodeNext",
"noEmit": true,
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist", "**/__tests__/*"]

View File

@@ -0,0 +1,9 @@
import { defineConfig } from "tsup";
export default defineConfig({
entryPoints: ["src/index.ts"],
format: ["cjs", "esm"],
dts: true,
outDir: "dist",
clean: true,
});
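A minimal tsup setup: `format: ["cjs", "esm"]` produces dual CommonJS/ESM bundles, `dts: true` generates declaration files at build time (presumably why the handwritten `types/index.d.ts` is deleted in the next diff), and `clean: true` wipes `dist/` before each build.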

View File

@@ -1,260 +0,0 @@
import { AxiosResponse, AxiosRequestHeaders } from "axios";
import { z } from "zod";
import { TypedEventTarget } from "typescript-event-target";
/**
* Configuration interface for FirecrawlApp.
* @param apiKey - Optional API key for authentication.
* @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
*/
export interface FirecrawlAppConfig {
apiKey?: string | null;
apiUrl?: string | null;
}
/**
* Metadata for a Firecrawl document.
* Includes various optional properties for document metadata.
*/
export interface FirecrawlDocumentMetadata {
title?: string;
description?: string;
language?: string;
keywords?: string;
robots?: string;
ogTitle?: string;
ogDescription?: string;
ogUrl?: string;
ogImage?: string;
ogAudio?: string;
ogDeterminer?: string;
ogLocale?: string;
ogLocaleAlternate?: string[];
ogSiteName?: string;
ogVideo?: string;
dctermsCreated?: string;
dcDateCreated?: string;
dcDate?: string;
dctermsType?: string;
dcType?: string;
dctermsAudience?: string;
dctermsSubject?: string;
dcSubject?: string;
dcDescription?: string;
dctermsKeywords?: string;
modifiedTime?: string;
publishedTime?: string;
articleTag?: string;
articleSection?: string;
sourceURL?: string;
statusCode?: number;
error?: string;
[key: string]: any;
}
/**
* Document interface for Firecrawl.
* Represents a document retrieved or processed by Firecrawl.
*/
export interface FirecrawlDocument {
url?: string;
markdown?: string;
html?: string;
rawHtml?: string;
links?: string[];
extract?: Record<any, any>;
screenshot?: string;
metadata?: FirecrawlDocumentMetadata;
}
/**
* Parameters for scraping operations.
* Defines the options and configurations available for scraping web content.
*/
export interface ScrapeParams {
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract" | "full@scrennshot")[];
headers?: Record<string, string>;
includeTags?: string[];
excludeTags?: string[];
onlyMainContent?: boolean;
extract?: {
prompt?: string;
schema?: z.ZodSchema | any;
systemPrompt?: string;
};
waitFor?: number;
timeout?: number;
}
/**
* Response interface for scraping operations.
* Defines the structure of the response received after a scraping operation.
*/
export interface ScrapeResponse extends FirecrawlDocument {
success: true;
warning?: string;
error?: string;
}
/**
* Parameters for crawling operations.
* Includes options for both scraping and mapping during a crawl.
*/
export interface CrawlParams {
includePaths?: string[];
excludePaths?: string[];
maxDepth?: number;
limit?: number;
allowBackwardLinks?: boolean;
allowExternalLinks?: boolean;
ignoreSitemap?: boolean;
scrapeOptions?: ScrapeParams;
webhook?: string;
}
/**
* Response interface for crawling operations.
* Defines the structure of the response received after initiating a crawl.
*/
export interface CrawlResponse {
id?: string;
url?: string;
success: true;
error?: string;
}
/**
* Response interface for job status checks.
* Provides detailed status of a crawl job including progress and results.
*/
export interface CrawlStatusResponse {
success: true;
total: number;
completed: number;
creditsUsed: number;
expiresAt: Date;
status: "scraping" | "completed" | "failed";
next: string;
data?: FirecrawlDocument[];
error?: string;
}
/**
* Parameters for mapping operations.
* Defines options for mapping URLs during a crawl.
*/
export interface MapParams {
search?: string;
ignoreSitemap?: boolean;
includeSubdomains?: boolean;
limit?: number;
}
/**
* Response interface for mapping operations.
* Defines the structure of the response received after a mapping operation.
*/
export interface MapResponse {
success: true;
links?: string[];
error?: string;
}
/**
* Error response interface.
* Defines the structure of the response received when an error occurs.
*/
export interface ErrorResponse {
success: false;
error: string;
}
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
*/
export default class FirecrawlApp {
apiKey: string;
apiUrl: string;
/**
* Initializes a new instance of the FirecrawlApp class.
* @param config - Configuration options for the FirecrawlApp instance.
*/
constructor({ apiKey, apiUrl }: FirecrawlAppConfig);
/**
* Scrapes a URL using the Firecrawl API.
* @param url - The URL to scrape.
* @param params - Additional parameters for the scrape request.
* @returns The response from the scrape operation.
*/
scrapeUrl(url: string, params?: ScrapeParams): Promise<ScrapeResponse | ErrorResponse>;
/**
* This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
* @param query - The search query string.
* @param params - Additional parameters for the search.
* @returns Throws an error advising to use version 0 of the API.
*/
search(query: string, params?: any): Promise<any>;
/**
* Initiates a crawl job for a URL using the Firecrawl API.
* @param url - The URL to crawl.
* @param params - Additional parameters for the crawl request.
* @param pollInterval - Time in seconds for job status checks.
* @param idempotencyKey - Optional idempotency key for the request.
* @returns The response from the crawl operation.
*/
crawlUrl(url: string, params?: CrawlParams, pollInterval?: number, idempotencyKey?: string): Promise<CrawlStatusResponse | ErrorResponse>;
asyncCrawlUrl(url: string, params?: CrawlParams, idempotencyKey?: string): Promise<CrawlResponse | ErrorResponse>;
/**
* Checks the status of a crawl job using the Firecrawl API.
* @param id - The ID of the crawl operation.
* @returns The response containing the job status.
*/
checkCrawlStatus(id?: string): Promise<CrawlStatusResponse | ErrorResponse>;
crawlUrlAndWatch(url: string, params?: CrawlParams, idempotencyKey?: string): Promise<CrawlWatcher>;
mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse>;
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.
* @returns The prepared headers.
*/
prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders;
/**
* Sends a POST request to the specified URL.
* @param url - The URL to send the request to.
* @param data - The data to send in the request.
* @param headers - The headers for the request.
* @returns The response from the POST request.
*/
postRequest(url: string, data: any, headers: AxiosRequestHeaders): Promise<AxiosResponse>;
/**
* Sends a GET request to the specified URL.
* @param url - The URL to send the request to.
* @param headers - The headers for the request.
* @returns The response from the GET request.
*/
getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse>;
/**
* Monitors the status of a crawl job until completion or failure.
* @param id - The ID of the crawl operation.
* @param headers - The headers for the request.
* @param checkInterval - Interval in seconds for job status checks.
* @param checkUrl - Optional URL to check the status (used for v1 API)
* @returns The final job status or data.
*/
monitorJobStatus(id: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<CrawlStatusResponse>;
/**
* Handles errors from API responses.
* @param {AxiosResponse} response - The response from the API.
* @param {string} action - The action being performed when the error occurred.
*/
handleError(response: AxiosResponse, action: string): void;
}
interface CrawlWatcherEvents {
document: CustomEvent<FirecrawlDocument>;
done: CustomEvent<{
status: CrawlStatusResponse["status"];
data: FirecrawlDocument[];
}>;
error: CustomEvent<{
status: CrawlStatusResponse["status"];
data: FirecrawlDocument[];
error: string;
}>;
}
export declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
private ws;
data: FirecrawlDocument[];
status: CrawlStatusResponse["status"];
constructor(id: string, app: FirecrawlApp);
close(): void;
}
export {};

View File

@@ -9,8 +9,8 @@
"version": "1.0.0",
"license": "ISC",
"dependencies": {
"@mendable/firecrawl-js": "^0.0.36",
"axios": "^1.6.8",
"firecrawl": "^1.2.0",
"ts-node": "^10.9.2",
"typescript": "^5.4.5",
"uuid": "^10.0.0",
@@ -422,12 +422,14 @@
}
},
"node_modules/@mendable/firecrawl-js": {
"version": "0.0.36",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.36.tgz",
"integrity": "sha512-5zQMWUD49r6Q7cxj+QBthQ964Bm9fMooW4E8E4nIca3BMXCeEuQFVf5C3OEWwZf0SjJvR+5Yx2wUbXJWd1wCOA==",
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-1.2.2.tgz",
"integrity": "sha512-2A1GzLD0bczlFIlcjxHcm/x8i76ndtV4EUzOfc81oOJ/HbycE2mbT6EUthoL+r4s5A8yO3bKr9o/GxmEn456VA==",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
@@ -594,6 +596,32 @@
"@esbuild/win32-x64": "0.20.2"
}
},
"node_modules/firecrawl": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/firecrawl/-/firecrawl-1.2.0.tgz",
"integrity": "sha512-Sy1BCCvs5FhGc4yxPP7NG9iWnK8RXdvA1ZS/K1Gj+LrEN3iAT2WRzhYET7x8G2bif25F6rHJg57vdVb5sr6RyQ==",
"dependencies": {
"axios": "^1.6.8",
"dotenv": "^16.4.5",
"isows": "^1.0.4",
"typescript-event-target": "^1.1.1",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zod-to-json-schema": "^3.23.0"
}
},
"node_modules/firecrawl/node_modules/uuid": {
"version": "9.0.1",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
"funding": [
"https://github.com/sponsors/broofa",
"https://github.com/sponsors/ctavan"
],
"bin": {
"uuid": "dist/bin/uuid"
}
},
"node_modules/follow-redirects": {
"version": "1.15.6",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz",
@@ -652,6 +680,20 @@
"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
}
},
"node_modules/isows": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/isows/-/isows-1.0.4.tgz",
"integrity": "sha512-hEzjY+x9u9hPmBom9IIAqdJCwNLax+xrPb51vEPpERoFlIxgmZcHzsT5jKG06nvInKOBGvReAVz80Umed5CczQ==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/wagmi-dev"
}
],
"peerDependencies": {
"ws": "*"
}
},
"node_modules/make-error": {
"version": "1.3.6",
"resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz",
@@ -763,6 +805,11 @@
"node": ">=14.17"
}
},
"node_modules/typescript-event-target": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/typescript-event-target/-/typescript-event-target-1.1.1.tgz",
"integrity": "sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg=="
},
"node_modules/undici-types": {
"version": "5.26.5",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
@@ -786,6 +833,27 @@
"resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz",
"integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg=="
},
"node_modules/ws": {
"version": "8.18.0",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz",
"integrity": "sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==",
"peer": true,
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/yn": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz",

View File

@@ -13,6 +13,7 @@
"dependencies": {
"@mendable/firecrawl-js": "^1.0.3",
"axios": "^1.6.8",
"firecrawl": "^1.2.0",
"ts-node": "^10.9.2",
"typescript": "^5.4.5",
"uuid": "^10.0.0",

View File

@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp
__version__ = "1.2.3"
__version__ = "1.2.4"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -13,7 +13,6 @@ import logging
import os
import time
from typing import Any, Dict, Optional, List
import asyncio
import json
import requests

View File

@@ -12,8 +12,7 @@ dependencies = [
"requests",
"python-dotenv",
"websockets",
"asyncio",
"nest-asyncio"
"nest-asyncio"
]
authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]

View File

@@ -2,5 +2,4 @@ requests
pytest
python-dotenv
websockets
asyncio
nest-asyncio
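Dropping `asyncio` here and in `pyproject.toml` above is a correctness fix rather than a feature change: `asyncio` has shipped with the Python standard library since 3.4, and the PyPI package of the same name is a stale snapshot that can shadow the real module.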

View File

@@ -0,0 +1,137 @@
# %%
import os
import datetime
import time
from firecrawl import FirecrawlApp
import json
import google.generativeai as genai
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Retrieve API keys from environment variables
google_api_key = os.getenv("GOOGLE_API_KEY")
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
# Configure the Google Generative AI module with the API key
genai.configure(api_key=google_api_key)
model = genai.GenerativeModel("gemini-1.5-pro-001")
# Set the docs URL
docs_url = "https://docs.firecrawl.dev/api-reference"
# Initialize the FirecrawlApp with your API key
app = FirecrawlApp(api_key=firecrawl_api_key)
# %%
# Crawl all pages on docs
crawl_result = app.crawl_url(docs_url)
print(f"Total pages crawled: {len(crawl_result['data'])}")
# %%
# Define the prompt instructions for generating OpenAPI specs
prompt_instructions = """
Given the following API documentation content, generate an OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident and clear about all details. Focus on extracting the main endpoints, their HTTP methods, parameters, request bodies, and responses. The specification should follow OpenAPI 3.0 structure and conventions. Include only the 200 response for each endpoint. Limit all descriptions to 5 words or less.
If there is ANY uncertainty, lack of complete information, or if you are not 100% confident about ANY part of the specification, return an empty JSON object {}.
Do not make anything up. Only include information that is explicitly provided in the documentation. If any detail is unclear or missing, do not attempt to fill it in.
API Documentation Content:
{content}
Generate the OpenAPI 3.0 specification in JSON format ONLY if you are 100% confident about every single detail. Include only the JSON object, no additional text, and ensure it has no errors in the JSON format so it can be parsed. Remember to include only the 200 response for each endpoint and keep all descriptions to 5 words maximum.
Once again, if there is ANY doubt, uncertainty, or lack of complete information, return an empty JSON object {}.
To reiterate: accuracy is paramount. Do not make anything up. If you are not 100% clear or confident about the entire OpenAPI spec, return an empty JSON object {}.
"""
# %%
# Initialize a list to store all API specs
all_api_specs = []
# Process each page in crawl_result
for index, page in enumerate(crawl_result['data']):
if 'markdown' in page:
# Update prompt_instructions with the current page's content
current_prompt = prompt_instructions.replace("{content}", page['markdown'])
try:
# Query the model
response = model.generate_content([current_prompt])
response_dict = response.to_dict()
response_text = response_dict['candidates'][0]['content']['parts'][0]['text']
# Remove the ```json code wrap if present
response_text = response_text.strip().removeprefix('```json').removesuffix('```').strip()
# Parse JSON
json_data = json.loads(response_text)
# Add non-empty API specs to the list
if json_data != {}:
all_api_specs.append(json_data)
print(f"API specification generated for page {index}")
else:
print(f"No API specification found for page {index}")
except json.JSONDecodeError:
print(f"Error parsing JSON response for page {index}")
except Exception as e:
print(f"An error occurred for page {index}: {str(e)}")
# Print the total number of API specs collected
print(f"Total API specifications collected: {len(all_api_specs)}")
# %%
# Combine all API specs and keep the most filled out spec for each path and method
combined_spec = {
"openapi": "3.0.0",
"info": {
"title": f"{docs_url} API Specification",
"version": "1.0.0"
},
"paths": {},
"components": {
"schemas": {}
}
}
# Helper function to count properties in an object
def count_properties(obj):
if isinstance(obj, dict):
return sum(count_properties(v) for v in obj.values()) + len(obj)
elif isinstance(obj, list):
return sum(count_properties(item) for item in obj)
else:
return 1
# Combine specs, keeping the most detailed version of each path and schema
for spec in all_api_specs:
# Combine paths
if "paths" in spec:
for path, methods in spec["paths"].items():
if path not in combined_spec["paths"]:
combined_spec["paths"][path] = {}
for method, details in methods.items():
if method not in combined_spec["paths"][path] or count_properties(details) > count_properties(combined_spec["paths"][path][method]):
combined_spec["paths"][path][method] = details
# Combine schemas
if "components" in spec and "schemas" in spec["components"]:
for schema_name, schema in spec["components"]["schemas"].items():
if schema_name not in combined_spec["components"]["schemas"] or count_properties(schema) > count_properties(combined_spec["components"]["schemas"][schema_name]):
combined_spec["components"]["schemas"][schema_name] = schema
# Print summary of combined spec
print(f"Combined API specification generated")
print(f"Total paths in combined spec: {len(combined_spec['paths'])}")
print(f"Total schemas in combined spec: {len(combined_spec['components']['schemas'])}")
# Save the combined spec to a JSON file in the same directory as the Python file
output_file = os.path.join(os.path.dirname(__file__), "combined_api_spec.json")
with open(output_file, "w") as f:
json.dump(combined_spec, f, indent=2)
print(f"Combined API specification saved to {output_file}")