mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 17:19:00 +08:00
feat: rust sdk initial commit
This commit is contained in:
parent
e779dbbe26
commit
e24bbcc6eb
1
apps/rust-sdk/.gitignore
vendored
Normal file
1
apps/rust-sdk/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
target/
|
1223
apps/rust-sdk/Cargo.lock
generated
Normal file
1223
apps/rust-sdk/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
22
apps/rust-sdk/Cargo.toml
Normal file
22
apps/rust-sdk/Cargo.toml
Normal file
@ -0,0 +1,22 @@
|
||||
[package]
|
||||
name = "firecrawl-rs"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
[dependencies]
|
||||
reqwest = { version = "0.11", features = ["json", "blocking"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
log = "0.4"
|
||||
thiserror = "1.0"
|
||||
|
||||
[dependencies.tokio]
|
||||
version = "1"
|
||||
features = ["full"]
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
|
||||
[build-dependencies]
|
||||
tokio = { version = "1", features = ["full"] }
|
10
apps/rust-sdk/Makefile
Normal file
10
apps/rust-sdk/Makefile
Normal file
@ -0,0 +1,10 @@
|
||||
.PHONY: build run format
|
||||
|
||||
build:
|
||||
cargo build
|
||||
|
||||
run: build
|
||||
cargo run
|
||||
|
||||
format:
|
||||
cargo fmt
|
1
apps/rust-sdk/README.md
Normal file
1
apps/rust-sdk/README.md
Normal file
@ -0,0 +1 @@
|
||||
# Firecrawl Rust SDK
|
373
apps/rust-sdk/src/lib.rs
Normal file
373
apps/rust-sdk/src/lib.rs
Normal file
@ -0,0 +1,373 @@
|
||||
/*
|
||||
*
|
||||
* - Structs and Enums:
|
||||
* FirecrawlError: Custom error enum for handling various errors.
|
||||
* FirecrawlApp: Main struct for the application, holding API key, URL, and HTTP client.
|
||||
*
|
||||
* - Initialization:
|
||||
*
|
||||
* FirecrawlApp::new initializes the struct, fetching the API key and URL from environment variables if not provided.
|
||||
*
|
||||
* - API Methods:
|
||||
* scrape_url, search, crawl_url, check_crawl_status:
|
||||
* Methods for interacting with the Firecrawl API, similar to the Python methods.
|
||||
* monitor_job_status: Polls the API to monitor the status of a crawl job until completion.
|
||||
*/
|
||||
|
||||
use std::env;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use log::debug;
|
||||
use reqwest::{Client, Response};
|
||||
use serde_json::json;
|
||||
use serde_json::Value;
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum FirecrawlError {
|
||||
#[error("HTTP request failed: {0}")]
|
||||
HttpRequestFailed(String),
|
||||
#[error("API key not provided")]
|
||||
ApiKeyNotProvided,
|
||||
#[error("Failed to parse response: {0}")]
|
||||
ResponseParseError(String),
|
||||
#[error("Crawl job failed or stopped: {0}")]
|
||||
CrawlJobFailed(String),
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FirecrawlApp {
|
||||
api_key: String,
|
||||
api_url: String,
|
||||
client: Client,
|
||||
}
|
||||
// the api verstion of firecrawl
|
||||
const API_VERSION: &str = "/v0";
|
||||
|
||||
impl FirecrawlApp {
|
||||
/// Initialize the FirecrawlApp instance.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `api_key` (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||
/// * `api_url` (Optional[str]): Base URL for the Firecrawl API.
|
||||
pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
|
||||
let api_key = api_key
|
||||
.or_else(|| env::var("FIRECRAWL_API_KEY").ok())
|
||||
.ok_or(FirecrawlError::ApiKeyNotProvided)?;
|
||||
let api_url = api_url.unwrap_or_else(|| {
|
||||
env::var("FIRECRAWL_API_URL")
|
||||
.unwrap_or_else(|_| "https://api.firecrawl.dev".to_string())
|
||||
});
|
||||
|
||||
debug!("Initialized FirecrawlApp with API key: {}", api_key);
|
||||
debug!("Initialized FirecrawlApp with API URL: {}", api_url);
|
||||
|
||||
Ok(FirecrawlApp {
|
||||
api_key,
|
||||
api_url,
|
||||
client: Client::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Scrape the specified URL using the Firecrawl API.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `url` (str): The URL to scrape.
|
||||
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The scraped data if the request is successful.
|
||||
///
|
||||
/// # Raises:
|
||||
/// * `Exception`: If the scrape request fails.
|
||||
pub async fn scrape_url(
|
||||
&self,
|
||||
url: &str,
|
||||
params: Option<Value>,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
let headers = self.prepare_headers(None);
|
||||
let mut scrape_params = json!({"url": url});
|
||||
|
||||
if let Some(mut params) = params {
|
||||
if let Some(extractor_options) = params.get_mut("extractorOptions") {
|
||||
if let Some(extraction_schema) = extractor_options.get_mut("extractionSchema") {
|
||||
if extraction_schema.is_object() && extraction_schema.get("schema").is_some() {
|
||||
extractor_options["extractionSchema"] = extraction_schema["schema"].clone();
|
||||
}
|
||||
extractor_options["mode"] = extractor_options
|
||||
.get("mode")
|
||||
.cloned()
|
||||
.unwrap_or_else(|| json!("llm-extraction"));
|
||||
}
|
||||
scrape_params["extractorOptions"] = extractor_options.clone();
|
||||
}
|
||||
for (key, value) in params.as_object().unwrap() {
|
||||
if key != "extractorOptions" {
|
||||
scrape_params[key] = value.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&format!("{}{}/scrape", self.api_url, API_VERSION))
|
||||
.headers(headers)
|
||||
.json(&scrape_params)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
self.handle_response(response, "scrape URL").await
|
||||
}
|
||||
|
||||
/// Perform a search using the Firecrawl API.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `query` (str): The search query.
|
||||
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the search request.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The search results if the request is successful.
|
||||
///
|
||||
/// # Raises:
|
||||
/// * `Exception`: If the search request fails.
|
||||
pub async fn search(
|
||||
&self,
|
||||
query: &str,
|
||||
params: Option<Value>,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
let headers = self.prepare_headers(None);
|
||||
let mut json_data = json!({"query": query});
|
||||
if let Some(params) = params {
|
||||
for (key, value) in params.as_object().unwrap() {
|
||||
json_data[key] = value.clone();
|
||||
}
|
||||
}
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&format!("{}{}/search", self.api_url, API_VERSION))
|
||||
.headers(headers)
|
||||
.json(&json_data)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
self.handle_response(response, "search").await
|
||||
}
|
||||
|
||||
/// Initiate a crawl job for the specified URL using the Firecrawl API.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `url` (str): The URL to crawl.
|
||||
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
|
||||
/// * `wait_until_done` (bool): Whether to wait until the crawl job is completed.
|
||||
/// * `poll_interval` (int): Time in seconds between status checks when waiting for job completion.
|
||||
/// * `idempotency_key` (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The crawl job ID or the crawl results if waiting until completion.
|
||||
///
|
||||
/// # `Raises`:
|
||||
/// * `Exception`: If the crawl job initiation or monitoring fails.
|
||||
pub async fn crawl_url(
|
||||
&self,
|
||||
url: &str,
|
||||
params: Option<Value>,
|
||||
wait_until_done: bool,
|
||||
poll_interval: u64,
|
||||
idempotency_key: Option<String>,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
let headers = self.prepare_headers(idempotency_key);
|
||||
let mut json_data = json!({"url": url});
|
||||
if let Some(params) = params {
|
||||
for (key, value) in params.as_object().unwrap() {
|
||||
json_data[key] = value.clone();
|
||||
}
|
||||
}
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.post(&format!("{}{}/crawl", self.api_url, API_VERSION))
|
||||
.headers(headers.clone())
|
||||
.json(&json_data)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
let response_json = self.handle_response(response, "start crawl job").await?;
|
||||
let job_id = response_json["jobId"].as_str().unwrap().to_string();
|
||||
|
||||
if wait_until_done {
|
||||
self.monitor_job_status(&job_id, headers, poll_interval)
|
||||
.await
|
||||
} else {
|
||||
Ok(json!({"jobId": job_id}))
|
||||
}
|
||||
}
|
||||
|
||||
/// Check the status of a crawl job using the Firecrawl API.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `job_id` (str): The ID of the crawl job.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The status of the crawl job.
|
||||
///
|
||||
/// # Raises:
|
||||
/// * `Exception`: If the status check request fails.
|
||||
pub async fn check_crawl_status(&self, job_id: &str) -> Result<Value, FirecrawlError> {
|
||||
let headers = self.prepare_headers(None);
|
||||
let response = self
|
||||
.client
|
||||
.get(&format!(
|
||||
"{}{}/crawl/status/{}",
|
||||
self.api_url, API_VERSION, job_id
|
||||
))
|
||||
.headers(headers)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
self.handle_response(response, "check crawl status").await
|
||||
}
|
||||
|
||||
/// Monitor the status of a crawl job until completion.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `job_id` (str): The ID of the crawl job.
|
||||
/// * `headers` (Dict[str, str]): The headers to include in the status check requests.
|
||||
/// * `poll_interval` (int): Secounds between status checks.
|
||||
///
|
||||
/// # Returns:
|
||||
/// * `Any`: The crawl results if the job is completed successfully.
|
||||
///
|
||||
/// # Raises:
|
||||
/// Exception: If the job fails or an error occurs during status checks.
|
||||
async fn monitor_job_status(
|
||||
&self,
|
||||
job_id: &str,
|
||||
headers: reqwest::header::HeaderMap,
|
||||
poll_interval: u64,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
loop {
|
||||
let response = self
|
||||
.client
|
||||
.get(&format!(
|
||||
"{}{}/crawl/status/{}",
|
||||
self.api_url, API_VERSION, job_id
|
||||
))
|
||||
.headers(headers.clone())
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||
|
||||
let status_data = self.handle_response(response, "check crawl status").await?;
|
||||
match status_data["status"].as_str() {
|
||||
Some("completed") => {
|
||||
if status_data["data"].is_object() {
|
||||
return Ok(status_data["data"].clone());
|
||||
} else {
|
||||
return Err(FirecrawlError::CrawlJobFailed(
|
||||
"Crawl job completed but no data was returned".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
Some("active") | Some("paused") | Some("pending") | Some("queued")
|
||||
| Some("waiting") => {
|
||||
thread::sleep(Duration::from_secs(poll_interval));
|
||||
}
|
||||
Some(status) => {
|
||||
return Err(FirecrawlError::CrawlJobFailed(format!(
|
||||
"Crawl job failed or was stopped. Status: {}",
|
||||
status
|
||||
)));
|
||||
}
|
||||
None => {
|
||||
return Err(FirecrawlError::CrawlJobFailed(
|
||||
"Unexpected response: no status field".to_string(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Prepare the headers for API requests.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// `idempotency_key` (Optional[str]): A unique key to ensure idempotency of requests.
|
||||
///
|
||||
/// # Returns:
|
||||
/// Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
|
||||
fn prepare_headers(&self, idempotency_key: Option<String>) -> reqwest::header::HeaderMap {
|
||||
let mut headers = reqwest::header::HeaderMap::new();
|
||||
headers.insert("Content-Type", "application/json".parse().unwrap());
|
||||
headers.insert(
|
||||
"Authorization",
|
||||
format!("Bearer {}", self.api_key).parse().unwrap(),
|
||||
);
|
||||
if let Some(key) = idempotency_key {
|
||||
headers.insert("x-idempotency-key", key.parse().unwrap());
|
||||
}
|
||||
headers
|
||||
}
|
||||
|
||||
/// Handle errors from API responses.
|
||||
///
|
||||
/// # Arguments:
|
||||
/// * `response` (requests.Response): The response object from the API request.
|
||||
/// * `action` (str): Description of the action that was being performed.
|
||||
///
|
||||
/// # Raises:
|
||||
/// Exception: An exception with a message containing the status code and error details from the response.
|
||||
async fn handle_response(
|
||||
&self,
|
||||
response: Response,
|
||||
action: &str,
|
||||
) -> Result<Value, FirecrawlError> {
|
||||
if response.status().is_success() {
|
||||
let response_json: Value = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
|
||||
if response_json["success"].as_bool().unwrap_or(false) {
|
||||
Ok(response_json["data"].clone())
|
||||
} else {
|
||||
Err(FirecrawlError::HttpRequestFailed(format!(
|
||||
"Failed to {}: {}",
|
||||
action, response_json["error"]
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
let status_code = response.status().as_u16();
|
||||
let error_message = response
|
||||
.json::<Value>()
|
||||
.await
|
||||
.unwrap_or_else(|_| json!({"error": "No additional error details provided."}));
|
||||
let message = match status_code {
|
||||
402 => format!(
|
||||
"Payment Required: Failed to {}. {}",
|
||||
action, error_message["error"]
|
||||
),
|
||||
408 => format!(
|
||||
"Request Timeout: Failed to {} as the request timed out. {}",
|
||||
action, error_message["error"]
|
||||
),
|
||||
409 => format!(
|
||||
"Conflict: Failed to {} due to a conflict. {}",
|
||||
action, error_message["error"]
|
||||
),
|
||||
500 => format!(
|
||||
"Internal Server Error: Failed to {}. {}",
|
||||
action, error_message["error"]
|
||||
),
|
||||
_ => format!(
|
||||
"Unexpected error during {}: Status code {}. {}",
|
||||
action, status_code, error_message["error"]
|
||||
),
|
||||
};
|
||||
Err(FirecrawlError::HttpRequestFailed(message))
|
||||
}
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user