mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-13 00:58:59 +08:00
feat: rust sdk initial commit
This commit is contained in:
parent
e779dbbe26
commit
e24bbcc6eb
1
apps/rust-sdk/.gitignore
vendored
Normal file
1
apps/rust-sdk/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
target/
|
1223
apps/rust-sdk/Cargo.lock
generated
Normal file
1223
apps/rust-sdk/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
22
apps/rust-sdk/Cargo.toml
Normal file
22
apps/rust-sdk/Cargo.toml
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
[package]
|
||||||
|
name = "firecrawl-rs"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
[dependencies]
|
||||||
|
reqwest = { version = "0.11", features = ["json", "blocking"] }
|
||||||
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
|
serde_json = "1.0"
|
||||||
|
log = "0.4"
|
||||||
|
thiserror = "1.0"
|
||||||
|
|
||||||
|
[dependencies.tokio]
|
||||||
|
version = "1"
|
||||||
|
features = ["full"]
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
tokio = { version = "1", features = ["full"] }
|
||||||
|
|
||||||
|
[build-dependencies]
|
||||||
|
tokio = { version = "1", features = ["full"] }
|
10
apps/rust-sdk/Makefile
Normal file
10
apps/rust-sdk/Makefile
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
.PHONY: build run format
|
||||||
|
|
||||||
|
build:
|
||||||
|
cargo build
|
||||||
|
|
||||||
|
run: build
|
||||||
|
cargo run
|
||||||
|
|
||||||
|
format:
|
||||||
|
cargo fmt
|
1
apps/rust-sdk/README.md
Normal file
1
apps/rust-sdk/README.md
Normal file
@ -0,0 +1 @@
|
|||||||
|
# Firecrawl Rust SDK
|
373
apps/rust-sdk/src/lib.rs
Normal file
373
apps/rust-sdk/src/lib.rs
Normal file
@ -0,0 +1,373 @@
|
|||||||
|
/*
|
||||||
|
*
|
||||||
|
* - Structs and Enums:
|
||||||
|
* FirecrawlError: Custom error enum for handling various errors.
|
||||||
|
* FirecrawlApp: Main struct for the application, holding API key, URL, and HTTP client.
|
||||||
|
*
|
||||||
|
* - Initialization:
|
||||||
|
*
|
||||||
|
* FirecrawlApp::new initializes the struct, fetching the API key and URL from environment variables if not provided.
|
||||||
|
*
|
||||||
|
* - API Methods:
|
||||||
|
* scrape_url, search, crawl_url, check_crawl_status:
|
||||||
|
* Methods for interacting with the Firecrawl API, similar to the Python methods.
|
||||||
|
* monitor_job_status: Polls the API to monitor the status of a crawl job until completion.
|
||||||
|
*/
|
||||||
|
|
||||||
|
use std::env;
|
||||||
|
use std::thread;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use log::debug;
|
||||||
|
use reqwest::{Client, Response};
|
||||||
|
use serde_json::json;
|
||||||
|
use serde_json::Value;
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
#[derive(Error, Debug)]
|
||||||
|
pub enum FirecrawlError {
|
||||||
|
#[error("HTTP request failed: {0}")]
|
||||||
|
HttpRequestFailed(String),
|
||||||
|
#[error("API key not provided")]
|
||||||
|
ApiKeyNotProvided,
|
||||||
|
#[error("Failed to parse response: {0}")]
|
||||||
|
ResponseParseError(String),
|
||||||
|
#[error("Crawl job failed or stopped: {0}")]
|
||||||
|
CrawlJobFailed(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct FirecrawlApp {
|
||||||
|
api_key: String,
|
||||||
|
api_url: String,
|
||||||
|
client: Client,
|
||||||
|
}
|
||||||
|
// The Firecrawl API version prefix, appended to the base URL of every request.
|
||||||
|
const API_VERSION: &str = "/v0";
|
||||||
|
|
||||||
|
impl FirecrawlApp {
|
||||||
|
/// Initialize the FirecrawlApp instance.
|
||||||
|
///
|
||||||
|
/// # Arguments:
|
||||||
|
/// * `api_key` (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||||
|
/// * `api_url` (Optional[str]): Base URL for the Firecrawl API.
|
||||||
|
pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
|
||||||
|
let api_key = api_key
|
||||||
|
.or_else(|| env::var("FIRECRAWL_API_KEY").ok())
|
||||||
|
.ok_or(FirecrawlError::ApiKeyNotProvided)?;
|
||||||
|
let api_url = api_url.unwrap_or_else(|| {
|
||||||
|
env::var("FIRECRAWL_API_URL")
|
||||||
|
.unwrap_or_else(|_| "https://api.firecrawl.dev".to_string())
|
||||||
|
});
|
||||||
|
|
||||||
|
debug!("Initialized FirecrawlApp with API key: {}", api_key);
|
||||||
|
debug!("Initialized FirecrawlApp with API URL: {}", api_url);
|
||||||
|
|
||||||
|
Ok(FirecrawlApp {
|
||||||
|
api_key,
|
||||||
|
api_url,
|
||||||
|
client: Client::new(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scrape the specified URL using the Firecrawl API.
|
||||||
|
///
|
||||||
|
/// # Arguments:
|
||||||
|
/// * `url` (str): The URL to scrape.
|
||||||
|
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
|
||||||
|
///
|
||||||
|
/// # Returns:
|
||||||
|
/// * `Any`: The scraped data if the request is successful.
|
||||||
|
///
|
||||||
|
/// # Raises:
|
||||||
|
/// * `Exception`: If the scrape request fails.
|
||||||
|
pub async fn scrape_url(
|
||||||
|
&self,
|
||||||
|
url: &str,
|
||||||
|
params: Option<Value>,
|
||||||
|
) -> Result<Value, FirecrawlError> {
|
||||||
|
let headers = self.prepare_headers(None);
|
||||||
|
let mut scrape_params = json!({"url": url});
|
||||||
|
|
||||||
|
if let Some(mut params) = params {
|
||||||
|
if let Some(extractor_options) = params.get_mut("extractorOptions") {
|
||||||
|
if let Some(extraction_schema) = extractor_options.get_mut("extractionSchema") {
|
||||||
|
if extraction_schema.is_object() && extraction_schema.get("schema").is_some() {
|
||||||
|
extractor_options["extractionSchema"] = extraction_schema["schema"].clone();
|
||||||
|
}
|
||||||
|
extractor_options["mode"] = extractor_options
|
||||||
|
.get("mode")
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_else(|| json!("llm-extraction"));
|
||||||
|
}
|
||||||
|
scrape_params["extractorOptions"] = extractor_options.clone();
|
||||||
|
}
|
||||||
|
for (key, value) in params.as_object().unwrap() {
|
||||||
|
if key != "extractorOptions" {
|
||||||
|
scrape_params[key] = value.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.post(&format!("{}{}/scrape", self.api_url, API_VERSION))
|
||||||
|
.headers(headers)
|
||||||
|
.json(&scrape_params)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||||
|
|
||||||
|
self.handle_response(response, "scrape URL").await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Perform a search using the Firecrawl API.
|
||||||
|
///
|
||||||
|
/// # Arguments:
|
||||||
|
/// * `query` (str): The search query.
|
||||||
|
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the search request.
|
||||||
|
///
|
||||||
|
/// # Returns:
|
||||||
|
/// * `Any`: The search results if the request is successful.
|
||||||
|
///
|
||||||
|
/// # Raises:
|
||||||
|
/// * `Exception`: If the search request fails.
|
||||||
|
pub async fn search(
|
||||||
|
&self,
|
||||||
|
query: &str,
|
||||||
|
params: Option<Value>,
|
||||||
|
) -> Result<Value, FirecrawlError> {
|
||||||
|
let headers = self.prepare_headers(None);
|
||||||
|
let mut json_data = json!({"query": query});
|
||||||
|
if let Some(params) = params {
|
||||||
|
for (key, value) in params.as_object().unwrap() {
|
||||||
|
json_data[key] = value.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.post(&format!("{}{}/search", self.api_url, API_VERSION))
|
||||||
|
.headers(headers)
|
||||||
|
.json(&json_data)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||||
|
|
||||||
|
self.handle_response(response, "search").await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initiate a crawl job for the specified URL using the Firecrawl API.
|
||||||
|
///
|
||||||
|
/// # Arguments:
|
||||||
|
/// * `url` (str): The URL to crawl.
|
||||||
|
/// * `params` (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
|
||||||
|
/// * `wait_until_done` (bool): Whether to wait until the crawl job is completed.
|
||||||
|
/// * `poll_interval` (int): Time in seconds between status checks when waiting for job completion.
|
||||||
|
/// * `idempotency_key` (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||||
|
///
|
||||||
|
/// # Returns:
|
||||||
|
/// * `Any`: The crawl job ID or the crawl results if waiting until completion.
|
||||||
|
///
|
||||||
|
/// # `Raises`:
|
||||||
|
/// * `Exception`: If the crawl job initiation or monitoring fails.
|
||||||
|
pub async fn crawl_url(
|
||||||
|
&self,
|
||||||
|
url: &str,
|
||||||
|
params: Option<Value>,
|
||||||
|
wait_until_done: bool,
|
||||||
|
poll_interval: u64,
|
||||||
|
idempotency_key: Option<String>,
|
||||||
|
) -> Result<Value, FirecrawlError> {
|
||||||
|
let headers = self.prepare_headers(idempotency_key);
|
||||||
|
let mut json_data = json!({"url": url});
|
||||||
|
if let Some(params) = params {
|
||||||
|
for (key, value) in params.as_object().unwrap() {
|
||||||
|
json_data[key] = value.clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.post(&format!("{}{}/crawl", self.api_url, API_VERSION))
|
||||||
|
.headers(headers.clone())
|
||||||
|
.json(&json_data)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||||
|
|
||||||
|
let response_json = self.handle_response(response, "start crawl job").await?;
|
||||||
|
let job_id = response_json["jobId"].as_str().unwrap().to_string();
|
||||||
|
|
||||||
|
if wait_until_done {
|
||||||
|
self.monitor_job_status(&job_id, headers, poll_interval)
|
||||||
|
.await
|
||||||
|
} else {
|
||||||
|
Ok(json!({"jobId": job_id}))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check the status of a crawl job using the Firecrawl API.
|
||||||
|
///
|
||||||
|
/// # Arguments:
|
||||||
|
/// * `job_id` (str): The ID of the crawl job.
|
||||||
|
///
|
||||||
|
/// # Returns:
|
||||||
|
/// * `Any`: The status of the crawl job.
|
||||||
|
///
|
||||||
|
/// # Raises:
|
||||||
|
/// * `Exception`: If the status check request fails.
|
||||||
|
pub async fn check_crawl_status(&self, job_id: &str) -> Result<Value, FirecrawlError> {
|
||||||
|
let headers = self.prepare_headers(None);
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.get(&format!(
|
||||||
|
"{}{}/crawl/status/{}",
|
||||||
|
self.api_url, API_VERSION, job_id
|
||||||
|
))
|
||||||
|
.headers(headers)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||||
|
|
||||||
|
self.handle_response(response, "check crawl status").await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Monitor the status of a crawl job until completion.
|
||||||
|
///
|
||||||
|
/// # Arguments:
|
||||||
|
/// * `job_id` (str): The ID of the crawl job.
|
||||||
|
/// * `headers` (Dict[str, str]): The headers to include in the status check requests.
|
||||||
|
/// * `poll_interval` (int): Secounds between status checks.
|
||||||
|
///
|
||||||
|
/// # Returns:
|
||||||
|
/// * `Any`: The crawl results if the job is completed successfully.
|
||||||
|
///
|
||||||
|
/// # Raises:
|
||||||
|
/// Exception: If the job fails or an error occurs during status checks.
|
||||||
|
async fn monitor_job_status(
|
||||||
|
&self,
|
||||||
|
job_id: &str,
|
||||||
|
headers: reqwest::header::HeaderMap,
|
||||||
|
poll_interval: u64,
|
||||||
|
) -> Result<Value, FirecrawlError> {
|
||||||
|
loop {
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.get(&format!(
|
||||||
|
"{}{}/crawl/status/{}",
|
||||||
|
self.api_url, API_VERSION, job_id
|
||||||
|
))
|
||||||
|
.headers(headers.clone())
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
|
||||||
|
|
||||||
|
let status_data = self.handle_response(response, "check crawl status").await?;
|
||||||
|
match status_data["status"].as_str() {
|
||||||
|
Some("completed") => {
|
||||||
|
if status_data["data"].is_object() {
|
||||||
|
return Ok(status_data["data"].clone());
|
||||||
|
} else {
|
||||||
|
return Err(FirecrawlError::CrawlJobFailed(
|
||||||
|
"Crawl job completed but no data was returned".to_string(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some("active") | Some("paused") | Some("pending") | Some("queued")
|
||||||
|
| Some("waiting") => {
|
||||||
|
thread::sleep(Duration::from_secs(poll_interval));
|
||||||
|
}
|
||||||
|
Some(status) => {
|
||||||
|
return Err(FirecrawlError::CrawlJobFailed(format!(
|
||||||
|
"Crawl job failed or was stopped. Status: {}",
|
||||||
|
status
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
return Err(FirecrawlError::CrawlJobFailed(
|
||||||
|
"Unexpected response: no status field".to_string(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Prepare the headers for API requests.
|
||||||
|
///
|
||||||
|
/// # Arguments:
|
||||||
|
/// `idempotency_key` (Optional[str]): A unique key to ensure idempotency of requests.
|
||||||
|
///
|
||||||
|
/// # Returns:
|
||||||
|
/// Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
|
||||||
|
fn prepare_headers(&self, idempotency_key: Option<String>) -> reqwest::header::HeaderMap {
|
||||||
|
let mut headers = reqwest::header::HeaderMap::new();
|
||||||
|
headers.insert("Content-Type", "application/json".parse().unwrap());
|
||||||
|
headers.insert(
|
||||||
|
"Authorization",
|
||||||
|
format!("Bearer {}", self.api_key).parse().unwrap(),
|
||||||
|
);
|
||||||
|
if let Some(key) = idempotency_key {
|
||||||
|
headers.insert("x-idempotency-key", key.parse().unwrap());
|
||||||
|
}
|
||||||
|
headers
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handle errors from API responses.
|
||||||
|
///
|
||||||
|
/// # Arguments:
|
||||||
|
/// * `response` (requests.Response): The response object from the API request.
|
||||||
|
/// * `action` (str): Description of the action that was being performed.
|
||||||
|
///
|
||||||
|
/// # Raises:
|
||||||
|
/// Exception: An exception with a message containing the status code and error details from the response.
|
||||||
|
async fn handle_response(
|
||||||
|
&self,
|
||||||
|
response: Response,
|
||||||
|
action: &str,
|
||||||
|
) -> Result<Value, FirecrawlError> {
|
||||||
|
if response.status().is_success() {
|
||||||
|
let response_json: Value = response
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
|
||||||
|
if response_json["success"].as_bool().unwrap_or(false) {
|
||||||
|
Ok(response_json["data"].clone())
|
||||||
|
} else {
|
||||||
|
Err(FirecrawlError::HttpRequestFailed(format!(
|
||||||
|
"Failed to {}: {}",
|
||||||
|
action, response_json["error"]
|
||||||
|
)))
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let status_code = response.status().as_u16();
|
||||||
|
let error_message = response
|
||||||
|
.json::<Value>()
|
||||||
|
.await
|
||||||
|
.unwrap_or_else(|_| json!({"error": "No additional error details provided."}));
|
||||||
|
let message = match status_code {
|
||||||
|
402 => format!(
|
||||||
|
"Payment Required: Failed to {}. {}",
|
||||||
|
action, error_message["error"]
|
||||||
|
),
|
||||||
|
408 => format!(
|
||||||
|
"Request Timeout: Failed to {} as the request timed out. {}",
|
||||||
|
action, error_message["error"]
|
||||||
|
),
|
||||||
|
409 => format!(
|
||||||
|
"Conflict: Failed to {} due to a conflict. {}",
|
||||||
|
action, error_message["error"]
|
||||||
|
),
|
||||||
|
500 => format!(
|
||||||
|
"Internal Server Error: Failed to {}. {}",
|
||||||
|
action, error_message["error"]
|
||||||
|
),
|
||||||
|
_ => format!(
|
||||||
|
"Unexpected error during {}: Status code {}. {}",
|
||||||
|
action, status_code, error_message["error"]
|
||||||
|
),
|
||||||
|
};
|
||||||
|
Err(FirecrawlError::HttpRequestFailed(message))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user