Rust SDK 1.0.0
This commit is contained in:
parent 93a20442e3
commit a078cdbd9d
@@ -1,40 +1,38 @@
use firecrawl::FirecrawlApp;
use firecrawl::{crawl::CrawlOptions, scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}, FirecrawlApp};
use serde_json::json;
use uuid::Uuid;

#[tokio::main]
async fn main() {
    // Initialize the FirecrawlApp with the API key
    let api_key = Some("fc-YOUR_API_KEY".to_string());
    let api_url = Some("http://0.0.0.0:3002".to_string());
    let app = FirecrawlApp::new(api_key, api_url).expect("Failed to initialize FirecrawlApp");
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // or, connect to a self-hosted instance:
    // let app = FirecrawlApp::new_selfhosted("http://localhost:3002", None).expect("Failed to initialize FirecrawlApp");

    // Scrape a website
    let scrape_result = app.scrape_url("https://firecrawl.dev", None).await;
    match scrape_result {
        Ok(data) => println!("Scrape Result:\n{}", data["markdown"]),
        Err(e) => eprintln!("Scrape failed: {}", e),
        Ok(data) => println!("Scrape Result:\n{}", data.markdown.unwrap()),
        Err(e) => eprintln!("Scrape failed: {:#?}", e),
    }

    // Crawl a website
    let random_uuid = String::from(Uuid::new_v4());
    let idempotency_key = Some(random_uuid); // optional idempotency key
    let crawl_params = json!({
        "crawlerOptions": {
            "excludes": ["blog/*"]
        }
    });
    let idempotency_key = String::from(Uuid::new_v4());
    let crawl_options = CrawlOptions {
        exclude_paths: Some(vec![ "blog/*".to_string() ]),
        poll_interval: Some(2000),
        idempotency_key: Some(idempotency_key),
        ..Default::default()
    };
    let crawl_result = app
        .crawl_url(
            "https://mendable.ai",
            Some(crawl_params),
            true,
            2,
            idempotency_key,
            crawl_options,
        )
        .await;
    match crawl_result {
        Ok(data) => println!("Crawl Result:\n{}", data),
        Ok(data) => println!("Crawl Result (used {} credits):\n{:#?}", data.credits_used, data.data),
        Err(e) => eprintln!("Crawl failed: {}", e),
    }

@@ -62,21 +60,20 @@ async fn main() {
        "required": ["top"]
    });

    let llm_extraction_params = json!({
        "extractorOptions": {
            "extractionSchema": json_schema,
            "mode": "llm-extraction"
        },
        "pageOptions": {
            "onlyMainContent": true
        }
    });
    let llm_extraction_options = ScrapeOptions {
        formats: Some(vec![ ScrapeFormats::Extract ]),
        extract: Some(ExtractOptions {
            schema: Some(json_schema),
            ..Default::default()
        }),
        ..Default::default()
    };

    let llm_extraction_result = app
        .scrape_url("https://news.ycombinator.com", Some(llm_extraction_params))
        .scrape_url("https://news.ycombinator.com", llm_extraction_options)
        .await;
    match llm_extraction_result {
        Ok(data) => println!("LLM Extraction Result:\n{}", data["llm_extraction"]),
        Ok(data) => println!("LLM Extraction Result:\n{:#?}", data.extract.unwrap()),
        Err(e) => eprintln!("LLM Extraction failed: {}", e),
    }
}
@@ -48,8 +48,8 @@ impl From<CrawlScrapeFormats> for ScrapeFormats {
    }
}

#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlScrapeOptions {
    /// Formats to extract from the page. (default: `[ Markdown ]`)
@@ -93,8 +93,8 @@ impl From<CrawlScrapeOptions> for ScrapeOptions {
    }
}

#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
    /// Options to pass through to the scraper.
@@ -103,12 +103,12 @@ pub struct CrawlOptions {
    /// URL RegEx patterns to (exclusively) include.
    ///
    /// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled.
    pub include_paths: Option<String>,
    pub include_paths: Option<Vec<String>>,

    /// URL RegEx patterns to exclude.
    ///
    /// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled.
    pub exclude_paths: Option<String>,
    pub exclude_paths: Option<Vec<String>>,

    /// Maximum URL depth to crawl, relative to the base URL. (default: `2`)
    pub max_depth: Option<u32>,
@@ -138,7 +138,6 @@ pub struct CrawlOptions {
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct CrawlRequestBody {
    url: String,
@@ -148,7 +147,6 @@ struct CrawlRequestBody {
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct CrawlResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
@@ -175,8 +173,8 @@ pub enum CrawlStatusTypes {
    Cancelled,
}

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlStatus {
    /// The status of the crawl.
@@ -203,7 +201,6 @@ pub struct CrawlStatus {
}

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
    success: bool,
@@ -216,6 +213,7 @@ pub struct CrawlAsyncResponse {
}

impl FirecrawlApp {
    /// Initiates a crawl job for a URL using the Firecrawl API.
    pub async fn crawl_url_async(
        &self,
        url: impl AsRef<str>,
@@ -235,61 +233,63 @@ impl FirecrawlApp {
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;

        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
    }

    /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
    pub async fn crawl_url(
        &self,
        url: impl AsRef<str>,
        options: Option<CrawlOptions>,
    ) -> Result<Vec<Document>, FirecrawlError> {
        options: impl Into<Option<CrawlOptions>>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let options = options.into();
        let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);

        let res = self.crawl_url_async(url, options).await?;

        self.monitor_job_status(&res.id, poll_interval).await
    }

    pub async fn check_crawl_status(&self, id: &str) -> Result<CrawlStatus, FirecrawlError> {
    /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
    pub async fn check_crawl_status(&self, id: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(&format!(
                "{}{}/crawl/{}",
                self.api_url, API_VERSION, id
                self.api_url, API_VERSION, id.as_ref()
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
            .map_err(|e| FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e))?;

        self.handle_response(response, "check crawl status").await
        self.handle_response(response, format!("Checking status of crawl {}", id.as_ref())).await
    }

    async fn monitor_job_status(
        &self,
        id: &str,
        poll_interval: u64,
    ) -> Result<Vec<Document>, FirecrawlError> {
    ) -> Result<CrawlStatus, FirecrawlError> {
        loop {
            let status_data = self.check_crawl_status(id).await?;
            match status_data.status {
                CrawlStatusTypes::Completed => {
                    return Ok(status_data.data);
                    return Ok(status_data);
                }
                CrawlStatusTypes::Scraping => {
                    tokio::time::sleep(tokio::time::Duration::from_secs(poll_interval)).await;
                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                }
                CrawlStatusTypes::Failed => {
                    return Err(FirecrawlError::CrawlJobFailed(format!(
                        "Crawl job failed."
                    )));
                    ), status_data));
                }
                CrawlStatusTypes::Cancelled => {
                    return Err(FirecrawlError::CrawlJobFailed(format!(
                        "Crawl job was cancelled."
                    )));
                    ), status_data));
                }
            }
        }
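Taken together, the changes above mean a crawl is now driven through `CrawlOptions` and returns a `CrawlStatus` rather than a bare `Vec<Document>`. A minimal sketch of how that is intended to be used, assuming only what is visible in this diff (`poll_interval` is in milliseconds, `status.data` holds the scraped `Document`s); it is an illustration, not part of the commit:

use firecrawl::{crawl::CrawlOptions, FirecrawlApp};

#[tokio::main]
async fn main() {
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // poll_interval is in milliseconds; crawl_url waits until the job finishes.
    let options = CrawlOptions {
        exclude_paths: Some(vec!["blog/*".to_string()]),
        poll_interval: Some(1000),
        ..Default::default()
    };

    match app.crawl_url("https://firecrawl.dev", options).await {
        Ok(status) => {
            // CrawlStatus::data holds one Document per crawled page.
            for doc in status.data {
                println!("{}", doc.markdown.unwrap_or_default());
            }
        }
        Err(e) => eprintln!("Crawl failed: {:#?}", e),
    }
}

For larger jobs, `crawl_url_async` together with `check_crawl_status` (both above) lets the caller poll on its own schedule instead of blocking on `crawl_url`.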
@@ -1,8 +1,8 @@
use serde::{Deserialize, Serialize};
use serde_json::Value;

#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct DocumentMetadata {
    // firecrawl specific
@@ -12,8 +12,8 @@ pub struct DocumentMetadata {
    pub error: Option<String>,

    // basic meta tags
    pub title: String,
    pub description: String,
    pub title: Option<String>,
    pub description: Option<String>,
    pub language: Option<String>,
    pub keywords: Option<String>,
    pub robots: Option<String>,
@@ -26,7 +26,7 @@ pub struct DocumentMetadata {
    pub og_audio: Option<String>,
    pub og_determiner: Option<String>,
    pub og_locale: Option<String>,
    pub og_locale_alternate: Option<String>,
    pub og_locale_alternate: Option<Vec<String>>,
    pub og_site_name: Option<String>,
    pub og_video: Option<String>,

@@ -49,8 +49,8 @@ pub struct DocumentMetadata {
    pub dcterms_created: Option<String>,
}

#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Document {
    /// A list of the links on the page, present if `ScrapeFormats::Markdown` is present in `ScrapeOptions.formats`. (default)
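Because `title` and `description` are now `Option<String>`, callers must handle pages with missing metadata. A small sketch under the assumption that `Document` exposes a public `metadata: DocumentMetadata` field (implied by the tests elsewhere in this commit); illustrative only:

use firecrawl::FirecrawlApp;

#[tokio::main]
async fn main() {
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");
    let doc = app.scrape_url("https://firecrawl.dev", None).await.unwrap();

    // Metadata fields are optional in 1.0.0, so fall back explicitly.
    let title = doc.metadata.title.as_deref().unwrap_or("(no title)");
    let description = doc.metadata.description.as_deref().unwrap_or("(no description)");
    println!("{}: {}", title, description);
}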
@@ -1,7 +1,11 @@
use std::fmt::Display;

use serde::{Deserialize, Serialize};
use serde_json::Value;
use thiserror::Error;

use crate::crawl::CrawlStatus;

#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct FirecrawlAPIError {
    /// Always false.
@@ -14,16 +18,28 @@ pub struct FirecrawlAPIError {
    pub details: Option<Value>,
}

impl Display for FirecrawlAPIError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if let Some(details) = self.details.as_ref() {
            write!(f, "{} ({})", self.error, details)
        } else {
            write!(f, "{}", self.error)
        }
    }
}

#[derive(Error, Debug)]
pub enum FirecrawlError {
    #[error("HTTP request failed: {0}")]
    HttpRequestFailed(String),
    #[error("API key not provided")]
    APIKeyNotProvided,
    #[error("{0} failed: HTTP error {1}: {2}")]
    HttpRequestFailed(String, u16, String),
    #[error("{0} failed: HTTP error: {1}")]
    HttpError(String, reqwest::Error),
    #[error("Failed to parse response as text: {0}")]
    ResponseParseErrorText(reqwest::Error),
    #[error("Failed to parse response: {0}")]
    ResponseParseError(String),
    #[error("API error")]
    APIError(FirecrawlAPIError),
    #[error("Crawl job failed or stopped: {0}")]
    CrawlJobFailed(String),
    ResponseParseError(serde_json::Error),
    #[error("{0} failed: {1}")]
    APIError(String, FirecrawlAPIError),
    #[error("Crawl job failed: {0}")]
    CrawlJobFailed(String, CrawlStatus),
}
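The error enum now carries the failing action and typed sources, so callers can match on it instead of parsing strings. A hedged sketch of that pattern; the variant shapes come from the definition above, everything else is illustrative:

use firecrawl::{FirecrawlApp, FirecrawlError};

#[tokio::main]
async fn main() {
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    match app.scrape_url("https://example.com", None).await {
        Ok(doc) => println!("{}", doc.markdown.unwrap_or_default()),
        // The API rejected the request; the first field names the action that failed.
        Err(FirecrawlError::APIError(action, api_error)) => {
            eprintln!("{} failed: {}", action, api_error)
        }
        // Transport-level failure from reqwest.
        Err(FirecrawlError::HttpError(action, source)) => eprintln!("{} failed: {}", action, source),
        // Anything else (parse errors, non-2xx statuses, ...).
        Err(other) => eprintln!("scrape failed: {}", other),
    }
}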
@@ -1,18 +1,18 @@
use reqwest::{Client, Response};
use serde::de::DeserializeOwned;
use serde_json::json;
use serde_json::Value;

pub mod crawl;
pub mod document;
mod error;
pub mod map;
pub mod scrape;

pub use error::FirecrawlError;

#[derive(Clone, Debug)]
pub struct FirecrawlApp {
    api_key: String,
    api_key: Option<String>,
    api_url: String,
    client: Client,
}
@@ -20,15 +20,14 @@ pub struct FirecrawlApp {
pub(crate) const API_VERSION: &str = "/v1";

impl FirecrawlApp {
    pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
        let api_key = api_key
            .ok_or(FirecrawlError::APIKeyNotProvided)?;
        let api_url = api_url
            .unwrap_or_else(|| "https://api.firecrawl.dev".to_string());
    pub fn new(api_key: impl AsRef<str>) -> Result<Self, FirecrawlError> {
        FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key))
    }

    pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
        Ok(FirecrawlApp {
            api_key,
            api_url,
            api_key: api_key.map(|x| x.as_ref().to_string()),
            api_url: api_url.as_ref().to_string(),
            client: Client::new(),
        })
    }
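A brief sketch of the two constructors above, illustrative only. Note that with `api_key: Option<impl AsRef<str>>`, a keyless self-hosted connection is written here as `None::<String>` so the generic parameter can be inferred:

use firecrawl::FirecrawlApp;

fn main() {
    // Cloud API: the key is now a plain string, no Option wrapper.
    let cloud = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // Self-hosted instance with no API key: annotate None so the
    // Option<impl AsRef<str>> parameter can be inferred.
    let selfhosted = FirecrawlApp::new_selfhosted("http://localhost:3002", None::<String>)
        .expect("Failed to initialize FirecrawlApp");

    // Self-hosted instance that still requires a key.
    let secured = FirecrawlApp::new_selfhosted("http://localhost:3002", Some("fc-YOUR-API-KEY"))
        .expect("Failed to initialize FirecrawlApp");

    let _ = (cloud, selfhosted, secured);
}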
@@ -36,10 +35,12 @@ impl FirecrawlApp {
    fn prepare_headers(&self, idempotency_key: Option<&String>) -> reqwest::header::HeaderMap {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert("Content-Type", "application/json".parse().unwrap());
        headers.insert(
            "Authorization",
            format!("Bearer {}", self.api_key).parse().unwrap(),
        );
        if let Some(api_key) = self.api_key.as_ref() {
            headers.insert(
                "Authorization",
                format!("Bearer {}", api_key).parse().unwrap(),
            );
        }
        if let Some(key) = idempotency_key {
            headers.insert("x-idempotency-key", key.parse().unwrap());
        }
@@ -51,48 +52,34 @@ impl FirecrawlApp {
        response: Response,
        action: impl AsRef<str>,
    ) -> Result<T, FirecrawlError> {
        if response.status().is_success() {
            let response_json: Value = response
                .json()
                .await
                .map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
            if response_json["success"].as_bool().unwrap_or(false) {
                Ok(serde_json::from_value(response_json).map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?)
            } else {
                Err(FirecrawlError::HttpRequestFailed(format!(
                    "Failed to {}: {}",
                    action.as_ref(), response_json["error"]
                )))
            }
        } else {
            let status_code = response.status().as_u16();
            let error_message = response
                .json::<Value>()
                .await
                .unwrap_or_else(|_| json!({"error": "No additional error details provided."}));
            let message = match status_code {
                402 => format!(
                    "Payment Required: Failed to {}. {}",
                    action.as_ref(), error_message["error"]
                ),
                408 => format!(
                    "Request Timeout: Failed to {} as the request timed out. {}",
                    action.as_ref(), error_message["error"]
                ),
                409 => format!(
                    "Conflict: Failed to {} due to a conflict. {}",
                    action.as_ref(), error_message["error"]
                ),
                500 => format!(
                    "Internal Server Error: Failed to {}. {}",
                    action.as_ref(), error_message["error"]
                ),
                _ => format!(
                    "Unexpected error during {}: Status code {}. {}",
                    action.as_ref(), status_code, error_message["error"]
                ),
            };
            Err(FirecrawlError::HttpRequestFailed(message))
        let (is_success, status) = (response.status().is_success(), response.status());

        let response = response
            .text()
            .await
            .map_err(|e| FirecrawlError::ResponseParseErrorText(e))
            .and_then(|response_json| serde_json::from_str::<Value>(&response_json).map_err(|e| FirecrawlError::ResponseParseError(e)))
            .and_then(|response_value| {
                if response_value["success"].as_bool().unwrap_or(false) {
                    Ok(serde_json::from_value::<T>(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?)
                } else {
                    Err(FirecrawlError::APIError(
                        action.as_ref().to_string(),
                        serde_json::from_value(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?
                    ))
                }
            });

        match &response {
            Ok(_) => response,
            Err(FirecrawlError::ResponseParseError(_)) | Err(FirecrawlError::ResponseParseErrorText(_)) => {
                if is_success {
                    response
                } else {
                    Err(FirecrawlError::HttpRequestFailed(action.as_ref().to_string(), status.as_u16(), status.as_str().to_string()))
                }
            },
            Err(_) => response,
        }
    }
}
apps/rust-sdk/src/map.rs (new file, 66 lines)
@@ -0,0 +1,66 @@
use serde::{Deserialize, Serialize};

use crate::{FirecrawlApp, FirecrawlError, API_VERSION};

#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct MapOptions {
    /// Search query to use for mapping
    pub search: Option<String>,

    /// Ignore the website sitemap when crawling (default: `true`)
    pub ignore_sitemap: Option<bool>,

    /// Include subdomains of the website (default: `true`)
    pub include_subdomains: Option<bool>,

    /// Maximum number of links to return (default: `5000`)
    pub exclude_tags: Option<u32>,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct MapRequestBody {
    url: String,

    #[serde(flatten)]
    options: MapOptions,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct MapResponse {
    success: bool,

    links: Vec<String>,
}

impl FirecrawlApp {
    /// Returns links from a URL using the Firecrawl API.
    pub async fn map_url(
        &self,
        url: impl AsRef<str>,
        options: impl Into<Option<MapOptions>>,
    ) -> Result<Vec<String>, FirecrawlError> {
        let body = MapRequestBody {
            url: url.as_ref().to_string(),
            options: options.into().unwrap_or_default(),
        };

        let headers = self.prepare_headers(None);

        let response = self
            .client
            .post(&format!("{}{}/map", self.api_url, API_VERSION))
            .headers(headers)
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpError(format!("Mapping {:?}", url.as_ref()), e))?;

        let response = self.handle_response::<MapResponse>(response, "scrape URL").await?;

        Ok(response.links)
    }
}
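A short usage sketch for the new `/map` endpoint, using only the `MapOptions` fields defined above; illustrative, not part of the commit:

use firecrawl::{map::MapOptions, FirecrawlApp};

#[tokio::main]
async fn main() {
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // Optionally narrow the map to URLs matching a search term.
    let options = MapOptions {
        search: Some("docs".to_string()),
        ..Default::default()
    };

    match app.map_url("https://firecrawl.dev", options).await {
        Ok(links) => {
            for link in links {
                println!("{}", link);
            }
        }
        Err(e) => eprintln!("Map failed: {:#?}", e),
    }
}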
@@ -42,21 +42,21 @@ pub enum ScrapeFormats {
    Extract,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ExtractOptions {
    /// Schema the output should adhere to, provided in JSON Schema format.
    pub schema: Option<Value>,

    pub system_prompt: Option<Value>,
    pub system_prompt: Option<String>,

    /// Extraction prompt to send to the LLM agent along with the page content.
    pub prompt: Option<Value>,
    pub prompt: Option<String>,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeOptions {
    /// Formats to extract from the page. (default: `[ Markdown ]`)
@@ -89,7 +89,6 @@ pub struct ScrapeOptions {
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct ScrapeRequestBody {
    url: String,
@@ -99,7 +98,6 @@ struct ScrapeRequestBody {
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct ScrapeResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
@@ -111,14 +109,15 @@ struct ScrapeResponse {
}

impl FirecrawlApp {
    /// Scrapes a URL using the Firecrawl API.
    pub async fn scrape_url(
        &self,
        url: impl AsRef<str>,
        options: Option<ScrapeOptions>,
        options: impl Into<Option<ScrapeOptions>>,
    ) -> Result<Document, FirecrawlError> {
        let body = ScrapeRequestBody {
            url: url.as_ref().to_string(),
            options: options.unwrap_or_default(),
            options: options.into().unwrap_or_default(),
        };

        let headers = self.prepare_headers(None);
@@ -130,7 +129,7 @@ impl FirecrawlApp {
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
            .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;

        let response = self.handle_response::<ScrapeResponse>(response, "scrape URL").await?;

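A sketch of the reworked `scrape_url`, requesting Markdown and HTML through `ScrapeOptions.formats`; field and variant names follow the definitions above, the rest is illustrative:

use firecrawl::{scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp};

#[tokio::main]
async fn main() {
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // Request more than the default Markdown format.
    let options = ScrapeOptions {
        formats: Some(vec![ScrapeFormats::Markdown, ScrapeFormats::HTML]),
        ..Default::default()
    };

    match app.scrape_url("https://firecrawl.dev", options).await {
        Ok(doc) => {
            println!("markdown:\n{}", doc.markdown.unwrap_or_default());
            println!("html:\n{}", doc.html.unwrap_or_default());
        }
        Err(e) => eprintln!("Scrape failed: {:#?}", e),
    }
}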
@@ -1,24 +1,16 @@
use assert_matches::assert_matches;
use dotenvy::dotenv;
use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
use firecrawl::FirecrawlApp;
use serde_json::json;
use std::env;
use std::time::Duration;
use tokio::time::sleep;

#[tokio::test]
async fn test_no_api_key() {
    dotenv().ok();
    let api_url = env::var("API_URL").expect("API_URL environment variable is not set");
    assert_matches!(FirecrawlApp::new(None, Some(api_url)), Err(e) if e.to_string() == "API key not provided");
}

#[tokio::test]
async fn test_blocklisted_url() {
    dotenv().ok();
    let api_url = env::var("API_URL").unwrap();
    let api_key = env::var("TEST_API_KEY").unwrap();
    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
    let api_key = env::var("TEST_API_KEY").ok();
    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
    let blocklisted_url = "https://facebook.com/fake-test";
    let result = app.scrape_url(blocklisted_url, None).await;

@@ -32,74 +24,65 @@ async fn test_blocklisted_url() {
async fn test_successful_response_with_valid_preview_token() {
    dotenv().ok();
    let api_url = env::var("API_URL").unwrap();
    let app = FirecrawlApp::new(
        Some("this_is_just_a_preview_token".to_string()),
        Some(api_url),
    let app = FirecrawlApp::new_selfhosted(
        api_url,
        Some("this_is_just_a_preview_token"),
    )
    .unwrap();
    let result = app
        .scrape_url("https://roastmywebsite.ai", None)
        .await
        .unwrap();
    assert!(result.as_object().unwrap().contains_key("content"));
    assert!(result["content"].as_str().unwrap().contains("_Roast_"));
    assert!(result.markdown.is_some());
    assert!(result.markdown.unwrap().contains("_Roast_"));
}

#[tokio::test]
async fn test_scrape_url_e2e() {
    dotenv().ok();
    let api_url = env::var("API_URL").unwrap();
    let api_key = env::var("TEST_API_KEY").unwrap();
    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
    let api_key = env::var("TEST_API_KEY").ok();
    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
    let result = app
        .scrape_url("https://roastmywebsite.ai", None)
        .await
        .unwrap();
    assert!(result.as_object().unwrap().contains_key("content"));
    assert!(result.as_object().unwrap().contains_key("markdown"));
    assert!(result.as_object().unwrap().contains_key("metadata"));
    assert!(!result.as_object().unwrap().contains_key("html"));
    assert!(result["content"].as_str().unwrap().contains("_Roast_"));
    assert!(result.markdown.is_some());
    assert!(result.markdown.unwrap().contains("_Roast_"));
}

#[tokio::test]
async fn test_successful_response_with_valid_api_key_and_include_html() {
    dotenv().ok();
    let api_url = env::var("API_URL").unwrap();
    let api_key = env::var("TEST_API_KEY").unwrap();
    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
    let params = json!({
        "pageOptions": {
            "includeHtml": true
        }
    });
    let api_key = env::var("TEST_API_KEY").ok();
    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
    let params = ScrapeOptions {
        formats: vec! [ ScrapeFormats::Markdown, ScrapeFormats::HTML ].into(),
        ..Default::default()
    };
    let result = app
        .scrape_url("https://roastmywebsite.ai", Some(params))
        .scrape_url("https://roastmywebsite.ai", params)
        .await
        .unwrap();
    assert!(result.as_object().unwrap().contains_key("content"));
    assert!(result.as_object().unwrap().contains_key("markdown"));
    assert!(result.as_object().unwrap().contains_key("html"));
    assert!(result.as_object().unwrap().contains_key("metadata"));
    assert!(result["content"].as_str().unwrap().contains("_Roast_"));
    assert!(result["markdown"].as_str().unwrap().contains("_Roast_"));
    assert!(result["html"].as_str().unwrap().contains("<h1"));
    assert!(result.markdown.is_some());
    assert!(result.html.is_some());
    assert!(result.markdown.unwrap().contains("_Roast_"));
    assert!(result.html.unwrap().contains("<h1"));
}

#[tokio::test]
async fn test_successful_response_for_valid_scrape_with_pdf_file() {
    dotenv().ok();
    let api_url = env::var("API_URL").unwrap();
    let api_key = env::var("TEST_API_KEY").unwrap();
    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
    let api_key = env::var("TEST_API_KEY").ok();
    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
    let result = app
        .scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", None)
        .await
        .unwrap();
    assert!(result.as_object().unwrap().contains_key("content"));
    assert!(result.as_object().unwrap().contains_key("metadata"));
    assert!(result["content"]
        .as_str()
    assert!(result.markdown.is_some());
    assert!(result.markdown
        .unwrap()
        .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
}
@@ -108,17 +91,14 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file() {
async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension() {
    dotenv().ok();
    let api_url = env::var("API_URL").unwrap();
    let api_key = env::var("TEST_API_KEY").unwrap();
    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
    let api_key = env::var("TEST_API_KEY").ok();
    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
    let result = app
        .scrape_url("https://arxiv.org/pdf/astro-ph/9301001", None)
        .await
        .unwrap();
    sleep(Duration::from_secs(6)).await; // wait for 6 seconds
    assert!(result.as_object().unwrap().contains_key("content"));
    assert!(result.as_object().unwrap().contains_key("metadata"));
    assert!(result["content"]
        .as_str()
    assert!(result.markdown.is_some());
    assert!(result.markdown
        .unwrap()
        .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
}
@@ -127,10 +107,10 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension() {
async fn test_should_return_error_for_blocklisted_url() {
    dotenv().ok();
    let api_url = env::var("API_URL").unwrap();
    let api_key = env::var("TEST_API_KEY").unwrap();
    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
    let api_key = env::var("TEST_API_KEY").ok();
    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
    let blocklisted_url = "https://twitter.com/fake-test";
    let result = app.crawl_url(blocklisted_url, None, true, 1, None).await;
    let result = app.crawl_url(blocklisted_url, None).await;

    assert_matches!(
        result,
@@ -142,13 +122,13 @@ async fn test_should_return_error_for_blocklisted_url() {
async fn test_llm_extraction() {
    dotenv().ok();
    let api_url = env::var("API_URL").unwrap();
    let api_key = env::var("TEST_API_KEY").unwrap();
    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
    let params = json!({
        "extractorOptions": {
            "mode": "llm-extraction",
            "extractionPrompt": "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
            "extractionSchema": {
    let api_key = env::var("TEST_API_KEY").ok();
    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
    let options = ScrapeOptions {
        formats: vec! [ ScrapeFormats::Extract ].into(),
        extract: ExtractOptions {
            prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source".to_string().into(),
            schema: json!({
                "type": "object",
                "properties": {
                    "company_mission": {"type": "string"},
@@ -156,15 +136,17 @@ async fn test_llm_extraction() {
                    "is_open_source": {"type": "boolean"}
                },
                "required": ["company_mission", "supports_sso", "is_open_source"]
            }
        }
    });
            }).into(),
            ..Default::default()
        }.into(),
        ..Default::default()
    };
    let result = app
        .scrape_url("https://mendable.ai", Some(params))
        .scrape_url("https://mendable.ai", options)
        .await
        .unwrap();
    assert!(result.as_object().unwrap().contains_key("llm_extraction"));
    let llm_extraction = &result["llm_extraction"];
    assert!(result.extract.is_some());
    let llm_extraction = &result.extract.unwrap();
    assert!(llm_extraction
        .as_object()
        .unwrap()