Rust SDK 1.0.0

Gergő Móricz 2024-09-20 19:36:07 +02:00
parent 93a20442e3
commit a078cdbd9d
8 changed files with 242 additions and 195 deletions

View File

@@ -1,40 +1,38 @@
-use firecrawl::FirecrawlApp;
+use firecrawl::{crawl::CrawlOptions, scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}, FirecrawlApp};
 use serde_json::json;
 use uuid::Uuid;
 
 #[tokio::main]
 async fn main() {
     // Initialize the FirecrawlApp with the API key
-    let api_key = Some("fc-YOUR_API_KEY".to_string());
-    let api_url = Some("http://0.0.0.0:3002".to_string());
-    let app = FirecrawlApp::new(api_key, api_url).expect("Failed to initialize FirecrawlApp");
+    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");
+
+    // or, connect to a self-hosted instance:
+    // let app = FirecrawlApp::new_selfhosted("http://localhost:3002", None).expect("Failed to initialize FirecrawlApp");
 
     // Scrape a website
     let scrape_result = app.scrape_url("https://firecrawl.dev", None).await;
     match scrape_result {
-        Ok(data) => println!("Scrape Result:\n{}", data["markdown"]),
-        Err(e) => eprintln!("Scrape failed: {}", e),
+        Ok(data) => println!("Scrape Result:\n{}", data.markdown.unwrap()),
+        Err(e) => eprintln!("Scrape failed: {:#?}", e),
     }
 
     // Crawl a website
-    let random_uuid = String::from(Uuid::new_v4());
-    let idempotency_key = Some(random_uuid); // optional idempotency key
-    let crawl_params = json!({
-        "crawlerOptions": {
-            "excludes": ["blog/*"]
-        }
-    });
+    let idempotency_key = String::from(Uuid::new_v4());
+    let crawl_options = CrawlOptions {
+        exclude_paths: Some(vec![ "blog/*".to_string() ]),
+        poll_interval: Some(2000),
+        idempotency_key: Some(idempotency_key),
+        ..Default::default()
+    };
     let crawl_result = app
         .crawl_url(
             "https://mendable.ai",
-            Some(crawl_params),
-            true,
-            2,
-            idempotency_key,
+            crawl_options,
         )
         .await;
     match crawl_result {
-        Ok(data) => println!("Crawl Result:\n{}", data),
+        Ok(data) => println!("Crawl Result (used {} credits):\n{:#?}", data.credits_used, data.data),
         Err(e) => eprintln!("Crawl failed: {}", e),
     }
@@ -62,21 +60,20 @@ async fn main() {
         "required": ["top"]
     });
-    let llm_extraction_params = json!({
-        "extractorOptions": {
-            "extractionSchema": json_schema,
-            "mode": "llm-extraction"
-        },
-        "pageOptions": {
-            "onlyMainContent": true
-        }
-    });
+    let llm_extraction_options = ScrapeOptions {
+        formats: Some(vec![ ScrapeFormats::Extract ]),
+        extract: Some(ExtractOptions {
+            schema: Some(json_schema),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
     let llm_extraction_result = app
-        .scrape_url("https://news.ycombinator.com", Some(llm_extraction_params))
+        .scrape_url("https://news.ycombinator.com", llm_extraction_options)
         .await;
     match llm_extraction_result {
-        Ok(data) => println!("LLM Extraction Result:\n{}", data["llm_extraction"]),
+        Ok(data) => println!("LLM Extraction Result:\n{:#?}", data.extract.unwrap()),
         Err(e) => eprintln!("LLM Extraction failed: {}", e),
     }
 }

View File

@@ -48,8 +48,8 @@ impl From<CrawlScrapeFormats> for ScrapeFormats {
     }
 }
 
-#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct CrawlScrapeOptions {
     /// Formats to extract from the page. (default: `[ Markdown ]`)
@@ -93,8 +93,8 @@ impl From<CrawlScrapeOptions> for ScrapeOptions {
     }
 }
 
-#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct CrawlOptions {
     /// Options to pass through to the scraper.
@@ -103,12 +103,12 @@ pub struct CrawlOptions {
     /// URL RegEx patterns to (exclusively) include.
     ///
     /// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled.
-    pub include_paths: Option<String>,
+    pub include_paths: Option<Vec<String>>,
 
     /// URL RegEx patterns to exclude.
     ///
     /// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled.
-    pub exclude_paths: Option<String>,
+    pub exclude_paths: Option<Vec<String>>,
 
     /// Maximum URL depth to crawl, relative to the base URL. (default: `2`)
     pub max_depth: Option<u32>,
@@ -138,7 +138,6 @@ pub struct CrawlOptions {
 }
 
 #[derive(Deserialize, Serialize, Debug, Default)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 struct CrawlRequestBody {
     url: String,
@@ -148,7 +147,6 @@ struct CrawlRequestBody {
 }
 
 #[derive(Deserialize, Serialize, Debug, Default)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 struct CrawlResponse {
     /// This will always be `true` due to `FirecrawlApp::handle_response`.
@@ -175,8 +173,8 @@ pub enum CrawlStatusTypes {
     Cancelled,
 }
 
-#[derive(Deserialize, Serialize, Debug, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct CrawlStatus {
     /// The status of the crawl.
@@ -203,7 +201,6 @@ pub struct CrawlStatus {
 }
 
 #[derive(Deserialize, Serialize, Debug, Clone)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 pub struct CrawlAsyncResponse {
     success: bool,
@@ -216,6 +213,7 @@ pub struct CrawlAsyncResponse {
 }
 
 impl FirecrawlApp {
+    /// Initiates a crawl job for a URL using the Firecrawl API.
     pub async fn crawl_url_async(
         &self,
         url: impl AsRef<str>,
@@ -235,61 +233,63 @@ impl FirecrawlApp {
             .json(&body)
             .send()
             .await
-            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
 
         self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
     }
 
+    /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
     pub async fn crawl_url(
         &self,
         url: impl AsRef<str>,
-        options: Option<CrawlOptions>,
-    ) -> Result<Vec<Document>, FirecrawlError> {
+        options: impl Into<Option<CrawlOptions>>,
+    ) -> Result<CrawlStatus, FirecrawlError> {
+        let options = options.into();
         let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);
         let res = self.crawl_url_async(url, options).await?;
 
         self.monitor_job_status(&res.id, poll_interval).await
     }
 
-    pub async fn check_crawl_status(&self, id: &str) -> Result<CrawlStatus, FirecrawlError> {
+    /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
+    pub async fn check_crawl_status(&self, id: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
         let response = self
             .client
             .get(&format!(
                 "{}{}/crawl/{}",
-                self.api_url, API_VERSION, id
+                self.api_url, API_VERSION, id.as_ref()
             ))
             .headers(self.prepare_headers(None))
             .send()
             .await
-            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+            .map_err(|e| FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e))?;
 
-        self.handle_response(response, "check crawl status").await
+        self.handle_response(response, format!("Checking status of crawl {}", id.as_ref())).await
     }
 
     async fn monitor_job_status(
         &self,
         id: &str,
         poll_interval: u64,
-    ) -> Result<Vec<Document>, FirecrawlError> {
+    ) -> Result<CrawlStatus, FirecrawlError> {
         loop {
             let status_data = self.check_crawl_status(id).await?;
             match status_data.status {
                 CrawlStatusTypes::Completed => {
-                    return Ok(status_data.data);
+                    return Ok(status_data);
                 }
                 CrawlStatusTypes::Scraping => {
-                    tokio::time::sleep(tokio::time::Duration::from_secs(poll_interval)).await;
+                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                 }
                 CrawlStatusTypes::Failed => {
                     return Err(FirecrawlError::CrawlJobFailed(format!(
                         "Crawl job failed."
-                    )));
+                    ), status_data));
                 }
                 CrawlStatusTypes::Cancelled => {
                     return Err(FirecrawlError::CrawlJobFailed(format!(
                         "Crawl job was cancelled."
-                    )));
+                    ), status_data));
                }
            }
        }
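
For reference, a minimal sketch (not part of this commit) of calling the reworked crawl API above; the API key and URL are placeholders. It relies on two things visible in this diff: `crawl_url` now resolves to the full `CrawlStatus`, and `CrawlJobFailed` carries that status, so documents scraped before a failure remain available.

use firecrawl::{crawl::CrawlOptions, FirecrawlApp, FirecrawlError};

#[tokio::main]
async fn main() {
    // Placeholder key; crawl_url accepts anything convertible into Option<CrawlOptions>
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    let options = CrawlOptions {
        poll_interval: Some(500), // check_crawl_status is polled every 500 ms instead of the 2000 ms default
        ..Default::default()
    };

    match app.crawl_url("https://firecrawl.dev", options).await {
        // The whole CrawlStatus is returned, not just the documents
        Ok(status) => println!("Crawled {} pages", status.data.len()),
        // The failing status is attached, so partially crawled documents are still usable
        Err(FirecrawlError::CrawlJobFailed(reason, status)) => {
            eprintln!("Crawl failed ({}); salvaged {} pages", reason, status.data.len());
        }
        Err(e) => eprintln!("Crawl failed: {}", e),
    }
}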

View File

@@ -1,8 +1,8 @@
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 
-#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct DocumentMetadata {
     // firecrawl specific
@@ -12,8 +12,8 @@ pub struct DocumentMetadata {
     pub error: Option<String>,
 
     // basic meta tags
-    pub title: String,
-    pub description: String,
+    pub title: Option<String>,
+    pub description: Option<String>,
     pub language: Option<String>,
     pub keywords: Option<String>,
     pub robots: Option<String>,
@@ -26,7 +26,7 @@ pub struct DocumentMetadata {
     pub og_audio: Option<String>,
     pub og_determiner: Option<String>,
     pub og_locale: Option<String>,
-    pub og_locale_alternate: Option<String>,
+    pub og_locale_alternate: Option<Vec<String>>,
     pub og_site_name: Option<String>,
     pub og_video: Option<String>,
@@ -49,8 +49,8 @@ pub struct DocumentMetadata {
     pub dcterms_created: Option<String>,
 }
 
-#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct Document {
     /// A list of the links on the page, present if `ScrapeFormats::Markdown` is present in `ScrapeOptions.formats`. (default)

View File

@@ -1,7 +1,11 @@
+use std::fmt::Display;
+
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use thiserror::Error;
 
+use crate::crawl::CrawlStatus;
+
 #[derive(Debug, Deserialize, Serialize, Clone)]
 pub struct FirecrawlAPIError {
     /// Always false.
@@ -14,16 +18,28 @@ pub struct FirecrawlAPIError {
     pub details: Option<Value>,
 }
 
+impl Display for FirecrawlAPIError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if let Some(details) = self.details.as_ref() {
+            write!(f, "{} ({})", self.error, details)
+        } else {
+            write!(f, "{}", self.error)
+        }
+    }
+}
+
 #[derive(Error, Debug)]
 pub enum FirecrawlError {
-    #[error("HTTP request failed: {0}")]
-    HttpRequestFailed(String),
-    #[error("API key not provided")]
-    APIKeyNotProvided,
+    #[error("{0} failed: HTTP error {1}: {2}")]
+    HttpRequestFailed(String, u16, String),
+    #[error("{0} failed: HTTP error: {1}")]
+    HttpError(String, reqwest::Error),
+    #[error("Failed to parse response as text: {0}")]
+    ResponseParseErrorText(reqwest::Error),
     #[error("Failed to parse response: {0}")]
-    ResponseParseError(String),
-    #[error("API error")]
-    APIError(FirecrawlAPIError),
-    #[error("Crawl job failed or stopped: {0}")]
-    CrawlJobFailed(String),
+    ResponseParseError(serde_json::Error),
+    #[error("{0} failed: {1}")]
+    APIError(String, FirecrawlAPIError),
+    #[error("Crawl job failed: {0}")]
+    CrawlJobFailed(String, CrawlStatus),
 }
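
For reference, a minimal sketch (not part of this commit) of telling the reworked error variants apart at a call site; the helper name and URL are placeholders.

use firecrawl::{FirecrawlApp, FirecrawlError};

async fn print_page(app: &FirecrawlApp) {
    match app.scrape_url("https://firecrawl.dev", None).await {
        Ok(document) => println!("{}", document.markdown.unwrap_or_default()),
        // The request never produced a response (connection, DNS, TLS, ...)
        Err(FirecrawlError::HttpError(action, error)) => eprintln!("{} failed to send: {}", action, error),
        // The API answered with success: false; the payload is rendered by the new Display impl
        Err(FirecrawlError::APIError(action, api_error)) => eprintln!("{} was rejected: {}", action, api_error),
        // Remaining cases: unexpected status codes, unparseable bodies, failed crawl jobs
        Err(other) => eprintln!("Unexpected error: {}", other),
    }
}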

View File

@@ -1,18 +1,18 @@
 use reqwest::{Client, Response};
 use serde::de::DeserializeOwned;
-use serde_json::json;
 use serde_json::Value;
 
 pub mod crawl;
 pub mod document;
 mod error;
+pub mod map;
 pub mod scrape;
 
 pub use error::FirecrawlError;
 
 #[derive(Clone, Debug)]
 pub struct FirecrawlApp {
-    api_key: String,
+    api_key: Option<String>,
     api_url: String,
     client: Client,
 }
@@ -20,15 +20,14 @@ pub struct FirecrawlApp {
 pub(crate) const API_VERSION: &str = "/v1";
 
 impl FirecrawlApp {
-    pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
-        let api_key = api_key
-            .ok_or(FirecrawlError::APIKeyNotProvided)?;
-        let api_url = api_url
-            .unwrap_or_else(|| "https://api.firecrawl.dev".to_string());
+    pub fn new(api_key: impl AsRef<str>) -> Result<Self, FirecrawlError> {
+        FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key))
+    }
 
+    pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
         Ok(FirecrawlApp {
-            api_key,
-            api_url,
+            api_key: api_key.map(|x| x.as_ref().to_string()),
+            api_url: api_url.as_ref().to_string(),
             client: Client::new(),
         })
     }
@@ -36,10 +35,12 @@ impl FirecrawlApp {
     fn prepare_headers(&self, idempotency_key: Option<&String>) -> reqwest::header::HeaderMap {
         let mut headers = reqwest::header::HeaderMap::new();
         headers.insert("Content-Type", "application/json".parse().unwrap());
-        headers.insert(
-            "Authorization",
-            format!("Bearer {}", self.api_key).parse().unwrap(),
-        );
+        if let Some(api_key) = self.api_key.as_ref() {
+            headers.insert(
+                "Authorization",
+                format!("Bearer {}", api_key).parse().unwrap(),
+            );
+        }
         if let Some(key) = idempotency_key {
             headers.insert("x-idempotency-key", key.parse().unwrap());
         }
@@ -51,48 +52,34 @@ impl FirecrawlApp {
         response: Response,
         action: impl AsRef<str>,
     ) -> Result<T, FirecrawlError> {
-        if response.status().is_success() {
-            let response_json: Value = response
-                .json()
-                .await
-                .map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
-            if response_json["success"].as_bool().unwrap_or(false) {
-                Ok(serde_json::from_value(response_json).map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?)
-            } else {
-                Err(FirecrawlError::HttpRequestFailed(format!(
-                    "Failed to {}: {}",
-                    action.as_ref(), response_json["error"]
-                )))
-            }
-        } else {
-            let status_code = response.status().as_u16();
-            let error_message = response
-                .json::<Value>()
-                .await
-                .unwrap_or_else(|_| json!({"error": "No additional error details provided."}));
-            let message = match status_code {
-                402 => format!(
-                    "Payment Required: Failed to {}. {}",
-                    action.as_ref(), error_message["error"]
-                ),
-                408 => format!(
-                    "Request Timeout: Failed to {} as the request timed out. {}",
-                    action.as_ref(), error_message["error"]
-                ),
-                409 => format!(
-                    "Conflict: Failed to {} due to a conflict. {}",
-                    action.as_ref(), error_message["error"]
-                ),
-                500 => format!(
-                    "Internal Server Error: Failed to {}. {}",
-                    action.as_ref(), error_message["error"]
-                ),
-                _ => format!(
-                    "Unexpected error during {}: Status code {}. {}",
-                    action.as_ref(), status_code, error_message["error"]
-                ),
-            };
-            Err(FirecrawlError::HttpRequestFailed(message))
-        }
+        let (is_success, status) = (response.status().is_success(), response.status());
+
+        let response = response
+            .text()
+            .await
+            .map_err(|e| FirecrawlError::ResponseParseErrorText(e))
+            .and_then(|response_json| serde_json::from_str::<Value>(&response_json).map_err(|e| FirecrawlError::ResponseParseError(e)))
+            .and_then(|response_value| {
+                if response_value["success"].as_bool().unwrap_or(false) {
+                    Ok(serde_json::from_value::<T>(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?)
+                } else {
+                    Err(FirecrawlError::APIError(
+                        action.as_ref().to_string(),
+                        serde_json::from_value(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?
+                    ))
+                }
+            });
+
+        match &response {
+            Ok(_) => response,
+            Err(FirecrawlError::ResponseParseError(_)) | Err(FirecrawlError::ResponseParseErrorText(_)) => {
+                if is_success {
+                    response
+                } else {
+                    Err(FirecrawlError::HttpRequestFailed(action.as_ref().to_string(), status.as_u16(), status.as_str().to_string()))
+                }
+            },
+            Err(_) => response,
+        }
     }
 }

apps/rust-sdk/src/map.rs (new file, 66 lines)
View File

@@ -0,0 +1,66 @@
+use serde::{Deserialize, Serialize};
+
+use crate::{FirecrawlApp, FirecrawlError, API_VERSION};
+
+#[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct MapOptions {
+    /// Search query to use for mapping
+    pub search: Option<String>,
+
+    /// Ignore the website sitemap when crawling (default: `true`)
+    pub ignore_sitemap: Option<bool>,
+
+    /// Include subdomains of the website (default: `true`)
+    pub include_subdomains: Option<bool>,
+
+    /// Maximum number of links to return (default: `5000`)
+    pub exclude_tags: Option<u32>,
+}
+
+#[derive(Deserialize, Serialize, Debug, Default)]
+#[serde(rename_all = "camelCase")]
+struct MapRequestBody {
+    url: String,
+
+    #[serde(flatten)]
+    options: MapOptions,
+}
+
+#[derive(Deserialize, Serialize, Debug, Default)]
+#[serde(rename_all = "camelCase")]
+struct MapResponse {
+    success: bool,
+
+    links: Vec<String>,
+}
+
+impl FirecrawlApp {
+    /// Returns links from a URL using the Firecrawl API.
+    pub async fn map_url(
+        &self,
+        url: impl AsRef<str>,
+        options: impl Into<Option<MapOptions>>,
+    ) -> Result<Vec<String>, FirecrawlError> {
+        let body = MapRequestBody {
+            url: url.as_ref().to_string(),
+            options: options.into().unwrap_or_default(),
+        };
+
+        let headers = self.prepare_headers(None);
+
+        let response = self
+            .client
+            .post(&format!("{}{}/map", self.api_url, API_VERSION))
+            .headers(headers)
+            .json(&body)
+            .send()
+            .await
+            .map_err(|e| FirecrawlError::HttpError(format!("Mapping {:?}", url.as_ref()), e))?;
+
+        let response = self.handle_response::<MapResponse>(response, "scrape URL").await?;
+
+        Ok(response.links)
+    }
+}
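
For reference, a minimal sketch (not part of this commit) of calling the new map endpoint; the API key, URL, and search query are placeholders.

use firecrawl::{map::MapOptions, FirecrawlApp};

#[tokio::main]
async fn main() {
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // Optionally narrow the returned links with a search query
    let options = MapOptions {
        search: Some("docs".to_string()),
        ..Default::default()
    };

    match app.map_url("https://firecrawl.dev", options).await {
        Ok(links) => {
            println!("Found {} links", links.len());
            for link in links.iter().take(10) {
                println!("- {}", link);
            }
        }
        Err(e) => eprintln!("Map failed: {}", e),
    }
}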

View File

@@ -42,21 +42,21 @@ pub enum ScrapeFormats {
     Extract,
 }
 
-#[derive(Deserialize, Serialize, Debug, Default)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default)]
 #[serde(rename_all = "camelCase")]
 pub struct ExtractOptions {
     /// Schema the output should adhere to, provided in JSON Schema format.
     pub schema: Option<Value>,
 
-    pub system_prompt: Option<Value>,
+    pub system_prompt: Option<String>,
 
     /// Extraction prompt to send to the LLM agent along with the page content.
-    pub prompt: Option<Value>,
+    pub prompt: Option<String>,
 }
 
-#[derive(Deserialize, Serialize, Debug, Default)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default)]
 #[serde(rename_all = "camelCase")]
 pub struct ScrapeOptions {
     /// Formats to extract from the page. (default: `[ Markdown ]`)
@@ -89,7 +89,6 @@ pub struct ScrapeOptions {
 }
 
 #[derive(Deserialize, Serialize, Debug, Default)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 struct ScrapeRequestBody {
     url: String,
@@ -99,7 +98,6 @@ struct ScrapeRequestBody {
 }
 
 #[derive(Deserialize, Serialize, Debug, Default)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 struct ScrapeResponse {
     /// This will always be `true` due to `FirecrawlApp::handle_response`.
@@ -111,14 +109,15 @@ struct ScrapeResponse {
 }
 
 impl FirecrawlApp {
+    /// Scrapes a URL using the Firecrawl API.
     pub async fn scrape_url(
         &self,
         url: impl AsRef<str>,
-        options: Option<ScrapeOptions>,
+        options: impl Into<Option<ScrapeOptions>>,
     ) -> Result<Document, FirecrawlError> {
         let body = ScrapeRequestBody {
             url: url.as_ref().to_string(),
-            options: options.unwrap_or_default(),
+            options: options.into().unwrap_or_default(),
         };
 
         let headers = self.prepare_headers(None);
@@ -130,7 +129,7 @@ impl FirecrawlApp {
             .json(&body)
             .send()
             .await
-            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+            .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
 
         let response = self.handle_response::<ScrapeResponse>(response, "scrape URL").await?;

View File

@@ -1,24 +1,16 @@
 use assert_matches::assert_matches;
 use dotenvy::dotenv;
+use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
 use firecrawl::FirecrawlApp;
 use serde_json::json;
 use std::env;
-use std::time::Duration;
-use tokio::time::sleep;
-
-#[tokio::test]
-async fn test_no_api_key() {
-    dotenv().ok();
-    let api_url = env::var("API_URL").expect("API_URL environment variable is not set");
-    assert_matches!(FirecrawlApp::new(None, Some(api_url)), Err(e) if e.to_string() == "API key not provided");
-}
 
 #[tokio::test]
 async fn test_blocklisted_url() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
     let blocklisted_url = "https://facebook.com/fake-test";
     let result = app.scrape_url(blocklisted_url, None).await;
@@ -32,74 +24,65 @@ async fn test_blocklisted_url() {
 async fn test_successful_response_with_valid_preview_token() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let app = FirecrawlApp::new(
-        Some("this_is_just_a_preview_token".to_string()),
-        Some(api_url),
+    let app = FirecrawlApp::new_selfhosted(
+        api_url,
+        Some("this_is_just_a_preview_token"),
     )
     .unwrap();
     let result = app
         .scrape_url("https://roastmywebsite.ai", None)
         .await
         .unwrap();
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result["content"].as_str().unwrap().contains("_Roast_"));
+    assert!(result.markdown.is_some());
+    assert!(result.markdown.unwrap().contains("_Roast_"));
 }
 
 #[tokio::test]
 async fn test_scrape_url_e2e() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
     let result = app
         .scrape_url("https://roastmywebsite.ai", None)
         .await
         .unwrap();
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result.as_object().unwrap().contains_key("markdown"));
-    assert!(result.as_object().unwrap().contains_key("metadata"));
-    assert!(!result.as_object().unwrap().contains_key("html"));
-    assert!(result["content"].as_str().unwrap().contains("_Roast_"));
+    assert!(result.markdown.is_some());
+    assert!(result.markdown.unwrap().contains("_Roast_"));
 }
 
 #[tokio::test]
 async fn test_successful_response_with_valid_api_key_and_include_html() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
-    let params = json!({
-        "pageOptions": {
-            "includeHtml": true
-        }
-    });
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
+    let params = ScrapeOptions {
+        formats: vec! [ ScrapeFormats::Markdown, ScrapeFormats::HTML ].into(),
+        ..Default::default()
+    };
     let result = app
-        .scrape_url("https://roastmywebsite.ai", Some(params))
+        .scrape_url("https://roastmywebsite.ai", params)
         .await
         .unwrap();
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result.as_object().unwrap().contains_key("markdown"));
-    assert!(result.as_object().unwrap().contains_key("html"));
-    assert!(result.as_object().unwrap().contains_key("metadata"));
-    assert!(result["content"].as_str().unwrap().contains("_Roast_"));
-    assert!(result["markdown"].as_str().unwrap().contains("_Roast_"));
-    assert!(result["html"].as_str().unwrap().contains("<h1"));
+    assert!(result.markdown.is_some());
+    assert!(result.html.is_some());
+    assert!(result.markdown.unwrap().contains("_Roast_"));
+    assert!(result.html.unwrap().contains("<h1"));
 }
 
 #[tokio::test]
 async fn test_successful_response_for_valid_scrape_with_pdf_file() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
    let result = app
        .scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", None)
        .await
        .unwrap();
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result.as_object().unwrap().contains_key("metadata"));
-    assert!(result["content"]
-        .as_str()
+    assert!(result.markdown.is_some());
+    assert!(result.markdown
        .unwrap()
        .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
 }
@@ -108,17 +91,14 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file() {
 async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
    let result = app
        .scrape_url("https://arxiv.org/pdf/astro-ph/9301001", None)
        .await
        .unwrap();
-    sleep(Duration::from_secs(6)).await; // wait for 6 seconds
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result.as_object().unwrap().contains_key("metadata"));
-    assert!(result["content"]
-        .as_str()
+    assert!(result.markdown.is_some());
+    assert!(result.markdown
        .unwrap()
        .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
 }
@@ -127,10 +107,10 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension() {
 async fn test_should_return_error_for_blocklisted_url() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
     let blocklisted_url = "https://twitter.com/fake-test";
-    let result = app.crawl_url(blocklisted_url, None, true, 1, None).await;
+    let result = app.crawl_url(blocklisted_url, None).await;
 
     assert_matches!(
         result,
@@ -142,13 +122,13 @@ async fn test_should_return_error_for_blocklisted_url() {
 async fn test_llm_extraction() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
-    let params = json!({
-        "extractorOptions": {
-            "mode": "llm-extraction",
-            "extractionPrompt": "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-            "extractionSchema": {
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
+    let options = ScrapeOptions {
+        formats: vec! [ ScrapeFormats::Extract ].into(),
+        extract: ExtractOptions {
+            prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source".to_string().into(),
+            schema: json!({
                 "type": "object",
                 "properties": {
                     "company_mission": {"type": "string"},
@@ -156,15 +136,17 @@ async fn test_llm_extraction() {
                     "is_open_source": {"type": "boolean"}
                 },
                 "required": ["company_mission", "supports_sso", "is_open_source"]
-            }
-        }
-    });
+            }).into(),
+            ..Default::default()
+        }.into(),
+        ..Default::default()
+    };
     let result = app
-        .scrape_url("https://mendable.ai", Some(params))
+        .scrape_url("https://mendable.ai", options)
         .await
         .unwrap();
-    assert!(result.as_object().unwrap().contains_key("llm_extraction"));
-    let llm_extraction = &result["llm_extraction"];
+    assert!(result.extract.is_some());
+    let llm_extraction = &result.extract.unwrap();
     assert!(llm_extraction
         .as_object()
         .unwrap()