Rust SDK 1.0.0

Gergő Móricz 2024-09-20 19:36:07 +02:00
parent 93a20442e3
commit a078cdbd9d
8 changed files with 242 additions and 195 deletions


@@ -1,40 +1,38 @@
use firecrawl::FirecrawlApp;
use firecrawl::{crawl::CrawlOptions, scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}, FirecrawlApp};
use serde_json::json;
use uuid::Uuid;
#[tokio::main]
async fn main() {
// Initialize the FirecrawlApp with the API key
let api_key = Some("fc-YOUR_API_KEY".to_string());
let api_url = Some("http://0.0.0.0:3002".to_string());
let app = FirecrawlApp::new(api_key, api_url).expect("Failed to initialize FirecrawlApp");
let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");
// or, connect to a self-hosted instance:
// let app = FirecrawlApp::new_selfhosted("http://localhost:3002", None).expect("Failed to initialize FirecrawlApp");
// Scrape a website
let scrape_result = app.scrape_url("https://firecrawl.dev", None).await;
match scrape_result {
Ok(data) => println!("Scrape Result:\n{}", data["markdown"]),
Err(e) => eprintln!("Scrape failed: {}", e),
Ok(data) => println!("Scrape Result:\n{}", data.markdown.unwrap()),
Err(e) => eprintln!("Scrape failed: {:#?}", e),
}
// Crawl a website
let random_uuid = String::from(Uuid::new_v4());
let idempotency_key = Some(random_uuid); // optional idempotency key
let crawl_params = json!({
"crawlerOptions": {
"excludes": ["blog/*"]
}
});
let idempotency_key = String::from(Uuid::new_v4());
let crawl_options = CrawlOptions {
exclude_paths: Some(vec![ "blog/*".to_string() ]),
poll_interval: Some(2000),
idempotency_key: Some(idempotency_key),
..Default::default()
};
let crawl_result = app
.crawl_url(
"https://mendable.ai",
Some(crawl_params),
true,
2,
idempotency_key,
crawl_options,
)
.await;
match crawl_result {
Ok(data) => println!("Crawl Result:\n{}", data),
Ok(data) => println!("Crawl Result (used {} credits):\n{:#?}", data.credits_used, data.data),
Err(e) => eprintln!("Crawl failed: {}", e),
}
@@ -62,21 +60,20 @@ async fn main() {
"required": ["top"]
});
let llm_extraction_params = json!({
"extractorOptions": {
"extractionSchema": json_schema,
"mode": "llm-extraction"
},
"pageOptions": {
"onlyMainContent": true
}
});
let llm_extraction_options = ScrapeOptions {
formats: Some(vec![ ScrapeFormats::Extract ]),
extract: Some(ExtractOptions {
schema: Some(json_schema),
..Default::default()
}),
..Default::default()
};
let llm_extraction_result = app
.scrape_url("https://news.ycombinator.com", Some(llm_extraction_params))
.scrape_url("https://news.ycombinator.com", llm_extraction_options)
.await;
match llm_extraction_result {
Ok(data) => println!("LLM Extraction Result:\n{}", data["llm_extraction"]),
Ok(data) => println!("LLM Extraction Result:\n{:#?}", data.extract.unwrap()),
Err(e) => eprintln!("LLM Extraction failed: {}", e),
}
}


@@ -48,8 +48,8 @@ impl From<CrawlScrapeFormats> for ScrapeFormats {
}
}
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlScrapeOptions {
/// Formats to extract from the page. (default: `[ Markdown ]`)
@@ -93,8 +93,8 @@ impl From<CrawlScrapeOptions> for ScrapeOptions {
}
}
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlOptions {
/// Options to pass through to the scraper.
@@ -103,12 +103,12 @@ pub struct CrawlOptions {
/// URL RegEx patterns to (exclusively) include.
///
/// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled.
pub include_paths: Option<String>,
pub include_paths: Option<Vec<String>>,
/// URL RegEx patterns to exclude.
///
/// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled.
pub exclude_paths: Option<String>,
pub exclude_paths: Option<Vec<String>>,
/// Maximum URL depth to crawl, relative to the base URL. (default: `2`)
pub max_depth: Option<u32>,
@@ -138,7 +138,6 @@ pub struct CrawlOptions {
}
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct CrawlRequestBody {
url: String,
@@ -148,7 +147,6 @@ struct CrawlRequestBody {
}
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct CrawlResponse {
/// This will always be `true` due to `FirecrawlApp::handle_response`.
@@ -175,8 +173,8 @@ pub enum CrawlStatusTypes {
Cancelled,
}
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlStatus {
/// The status of the crawl.
@@ -203,7 +201,6 @@ pub struct CrawlStatus {
}
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
success: bool,
@@ -216,6 +213,7 @@ pub struct CrawlAsyncResponse {
}
impl FirecrawlApp {
/// Initiates a crawl job for a URL using the Firecrawl API.
pub async fn crawl_url_async(
&self,
url: impl AsRef<str>,
@@ -235,61 +233,63 @@ impl FirecrawlApp {
.json(&body)
.send()
.await
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
.map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
}
/// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
pub async fn crawl_url(
&self,
url: impl AsRef<str>,
options: Option<CrawlOptions>,
) -> Result<Vec<Document>, FirecrawlError> {
options: impl Into<Option<CrawlOptions>>,
) -> Result<CrawlStatus, FirecrawlError> {
let options = options.into();
let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);
let res = self.crawl_url_async(url, options).await?;
self.monitor_job_status(&res.id, poll_interval).await
}
pub async fn check_crawl_status(&self, id: &str) -> Result<CrawlStatus, FirecrawlError> {
/// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
pub async fn check_crawl_status(&self, id: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
let response = self
.client
.get(&format!(
"{}{}/crawl/{}",
self.api_url, API_VERSION, id
self.api_url, API_VERSION, id.as_ref()
))
.headers(self.prepare_headers(None))
.send()
.await
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
.map_err(|e| FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e))?;
self.handle_response(response, "check crawl status").await
self.handle_response(response, format!("Checking status of crawl {}", id.as_ref())).await
}
async fn monitor_job_status(
&self,
id: &str,
poll_interval: u64,
) -> Result<Vec<Document>, FirecrawlError> {
) -> Result<CrawlStatus, FirecrawlError> {
loop {
let status_data = self.check_crawl_status(id).await?;
match status_data.status {
CrawlStatusTypes::Completed => {
return Ok(status_data.data);
return Ok(status_data);
}
CrawlStatusTypes::Scraping => {
tokio::time::sleep(tokio::time::Duration::from_secs(poll_interval)).await;
tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
}
CrawlStatusTypes::Failed => {
return Err(FirecrawlError::CrawlJobFailed(format!(
"Crawl job failed."
)));
), status_data));
}
CrawlStatusTypes::Cancelled => {
return Err(FirecrawlError::CrawlJobFailed(format!(
"Crawl job was cancelled."
)));
), status_data));
}
}
}

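A minimal sketch of the async flow introduced above, starting a crawl with crawl_url_async and polling it with check_crawl_status; it assumes the CrawlAsyncResponse returned by crawl_url_async exposes the crawl ID as a public id field and that the CrawlStatus fields used (status, credits_used, data) are public, with a placeholder API key:

use firecrawl::{crawl::CrawlStatusTypes, FirecrawlApp};

#[tokio::main]
async fn main() {
    // Placeholder API key for illustration.
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // Start the crawl without waiting for it to finish.
    let job = app
        .crawl_url_async("https://firecrawl.dev", None)
        .await
        .expect("Failed to start crawl job");

    // Poll the job ourselves instead of letting crawl_url block until completion.
    loop {
        // Assumes `id` is the public crawl ID field of CrawlAsyncResponse.
        let status = app.check_crawl_status(&job.id).await.expect("Failed to check crawl status");
        match status.status {
            CrawlStatusTypes::Completed => {
                println!("Crawl done (used {} credits):\n{:#?}", status.credits_used, status.data);
                break;
            }
            CrawlStatusTypes::Scraping => {
                tokio::time::sleep(tokio::time::Duration::from_millis(2000)).await;
            }
            CrawlStatusTypes::Failed | CrawlStatusTypes::Cancelled => {
                eprintln!("Crawl did not complete");
                break;
            }
        }
    }
}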

@@ -1,8 +1,8 @@
use serde::{Deserialize, Serialize};
use serde_json::Value;
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct DocumentMetadata {
// firecrawl specific
@@ -12,8 +12,8 @@ pub struct DocumentMetadata {
pub error: Option<String>,
// basic meta tags
pub title: String,
pub description: String,
pub title: Option<String>,
pub description: Option<String>,
pub language: Option<String>,
pub keywords: Option<String>,
pub robots: Option<String>,
@@ -26,7 +26,7 @@ pub struct DocumentMetadata {
pub og_audio: Option<String>,
pub og_determiner: Option<String>,
pub og_locale: Option<String>,
pub og_locale_alternate: Option<String>,
pub og_locale_alternate: Option<Vec<String>>,
pub og_site_name: Option<String>,
pub og_video: Option<String>,
@@ -49,8 +49,8 @@ pub struct DocumentMetadata {
pub dcterms_created: Option<String>,
}
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Document {
/// A list of the links on the page, present if `ScrapeFormats::Markdown` is present in `ScrapeOptions.formats`. (default)

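A minimal sketch of reading the now-optional metadata fields after a scrape; it assumes Document exposes a public metadata: DocumentMetadata field, which is not visible in this hunk, and uses a placeholder API key:

use firecrawl::FirecrawlApp;

#[tokio::main]
async fn main() {
    // Placeholder API key for illustration.
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    let document = app.scrape_url("https://firecrawl.dev", None).await.expect("Scrape failed");

    // Basic meta tags are now Option<String>, so pages missing them no longer break deserialization.
    // Assumes `metadata` is a public DocumentMetadata field on Document.
    println!("title: {:?}", document.metadata.title);
    println!("description: {:?}", document.metadata.description);
    println!("og:locale:alternate: {:?}", document.metadata.og_locale_alternate);
}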

@@ -1,7 +1,11 @@
use std::fmt::Display;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use thiserror::Error;
use crate::crawl::CrawlStatus;
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct FirecrawlAPIError {
/// Always false.
@@ -14,16 +18,28 @@ pub struct FirecrawlAPIError {
pub details: Option<Value>,
}
impl Display for FirecrawlAPIError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if let Some(details) = self.details.as_ref() {
write!(f, "{} ({})", self.error, details)
} else {
write!(f, "{}", self.error)
}
}
}
#[derive(Error, Debug)]
pub enum FirecrawlError {
#[error("HTTP request failed: {0}")]
HttpRequestFailed(String),
#[error("API key not provided")]
APIKeyNotProvided,
#[error("{0} failed: HTTP error {1}: {2}")]
HttpRequestFailed(String, u16, String),
#[error("{0} failed: HTTP error: {1}")]
HttpError(String, reqwest::Error),
#[error("Failed to parse response as text: {0}")]
ResponseParseErrorText(reqwest::Error),
#[error("Failed to parse response: {0}")]
ResponseParseError(String),
#[error("API error")]
APIError(FirecrawlAPIError),
#[error("Crawl job failed or stopped: {0}")]
CrawlJobFailed(String),
ResponseParseError(serde_json::Error),
#[error("{0} failed: {1}")]
APIError(String, FirecrawlAPIError),
#[error("Crawl job failed: {0}")]
CrawlJobFailed(String, CrawlStatus),
}

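A minimal sketch of consuming the richer 1.0.0 error enum above; the variant shapes follow the definitions in this hunk, and the API key is a placeholder:

use firecrawl::{FirecrawlApp, FirecrawlError};

#[tokio::main]
async fn main() {
    // Placeholder API key for illustration.
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    match app.scrape_url("https://firecrawl.dev", None).await {
        Ok(document) => println!("{}", document.markdown.unwrap_or_default()),
        // The API rejected the request; the first field names the action that failed.
        Err(FirecrawlError::APIError(action, api_error)) => {
            eprintln!("{} was rejected by the API: {}", action, api_error);
        }
        // The request never produced a usable response (network failure, etc.).
        Err(FirecrawlError::HttpError(action, source)) => {
            eprintln!("{} failed to reach the API: {}", action, source);
        }
        Err(other) => eprintln!("Scrape failed: {}", other),
    }
}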

@@ -1,18 +1,18 @@
use reqwest::{Client, Response};
use serde::de::DeserializeOwned;
use serde_json::json;
use serde_json::Value;
pub mod crawl;
pub mod document;
mod error;
pub mod map;
pub mod scrape;
pub use error::FirecrawlError;
#[derive(Clone, Debug)]
pub struct FirecrawlApp {
api_key: String,
api_key: Option<String>,
api_url: String,
client: Client,
}
@@ -20,15 +20,14 @@ pub struct FirecrawlApp {
pub(crate) const API_VERSION: &str = "/v1";
impl FirecrawlApp {
pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
let api_key = api_key
.ok_or(FirecrawlError::APIKeyNotProvided)?;
let api_url = api_url
.unwrap_or_else(|| "https://api.firecrawl.dev".to_string());
pub fn new(api_key: impl AsRef<str>) -> Result<Self, FirecrawlError> {
FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key))
}
pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
Ok(FirecrawlApp {
api_key,
api_url,
api_key: api_key.map(|x| x.as_ref().to_string()),
api_url: api_url.as_ref().to_string(),
client: Client::new(),
})
}
@@ -36,10 +35,12 @@ impl FirecrawlApp {
fn prepare_headers(&self, idempotency_key: Option<&String>) -> reqwest::header::HeaderMap {
let mut headers = reqwest::header::HeaderMap::new();
headers.insert("Content-Type", "application/json".parse().unwrap());
if let Some(api_key) = self.api_key.as_ref() {
headers.insert(
"Authorization",
format!("Bearer {}", self.api_key).parse().unwrap(),
format!("Bearer {}", api_key).parse().unwrap(),
);
}
if let Some(key) = idempotency_key {
headers.insert("x-idempotency-key", key.parse().unwrap());
}
@@ -51,48 +52,34 @@ impl FirecrawlApp {
response: Response,
action: impl AsRef<str>,
) -> Result<T, FirecrawlError> {
if response.status().is_success() {
let response_json: Value = response
.json()
let (is_success, status) = (response.status().is_success(), response.status());
let response = response
.text()
.await
.map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
if response_json["success"].as_bool().unwrap_or(false) {
Ok(serde_json::from_value(response_json).map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?)
.map_err(|e| FirecrawlError::ResponseParseErrorText(e))
.and_then(|response_json| serde_json::from_str::<Value>(&response_json).map_err(|e| FirecrawlError::ResponseParseError(e)))
.and_then(|response_value| {
if response_value["success"].as_bool().unwrap_or(false) {
Ok(serde_json::from_value::<T>(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?)
} else {
Err(FirecrawlError::HttpRequestFailed(format!(
"Failed to {}: {}",
action.as_ref(), response_json["error"]
)))
Err(FirecrawlError::APIError(
action.as_ref().to_string(),
serde_json::from_value(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?
))
}
});
match &response {
Ok(_) => response,
Err(FirecrawlError::ResponseParseError(_)) | Err(FirecrawlError::ResponseParseErrorText(_)) => {
if is_success {
response
} else {
let status_code = response.status().as_u16();
let error_message = response
.json::<Value>()
.await
.unwrap_or_else(|_| json!({"error": "No additional error details provided."}));
let message = match status_code {
402 => format!(
"Payment Required: Failed to {}. {}",
action.as_ref(), error_message["error"]
),
408 => format!(
"Request Timeout: Failed to {} as the request timed out. {}",
action.as_ref(), error_message["error"]
),
409 => format!(
"Conflict: Failed to {} due to a conflict. {}",
action.as_ref(), error_message["error"]
),
500 => format!(
"Internal Server Error: Failed to {}. {}",
action.as_ref(), error_message["error"]
),
_ => format!(
"Unexpected error during {}: Status code {}. {}",
action.as_ref(), status_code, error_message["error"]
),
};
Err(FirecrawlError::HttpRequestFailed(message))
Err(FirecrawlError::HttpRequestFailed(action.as_ref().to_string(), status.as_u16(), status.as_str().to_string()))
}
},
Err(_) => response,
}
}
}

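A minimal sketch of the two constructors that replace the old new(api_key, api_url); the key and self-hosted URL are placeholders:

use firecrawl::FirecrawlApp;

fn main() {
    // Cloud API: only the key is required; the URL defaults to https://api.firecrawl.dev.
    let _cloud = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // Self-hosted instance without authentication (placeholder URL).
    let _local = FirecrawlApp::new_selfhosted("http://localhost:3002", None::<String>)
        .expect("Failed to initialize FirecrawlApp");

    // Self-hosted instance that still requires an API key.
    let _secured = FirecrawlApp::new_selfhosted("http://localhost:3002", Some("fc-YOUR-API-KEY"))
        .expect("Failed to initialize FirecrawlApp");
}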
apps/rust-sdk/src/map.rs (new file, 66 lines)

@@ -0,0 +1,66 @@
use serde::{Deserialize, Serialize};
use crate::{FirecrawlApp, FirecrawlError, API_VERSION};
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct MapOptions {
/// Search query to use for mapping
pub search: Option<String>,
/// Ignore the website sitemap when crawling (default: `true`)
pub ignore_sitemap: Option<bool>,
/// Include subdomains of the website (default: `true`)
pub include_subdomains: Option<bool>,
/// Maximum number of links to return (default: `5000`)
pub exclude_tags: Option<u32>,
}
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct MapRequestBody {
url: String,
#[serde(flatten)]
options: MapOptions,
}
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct MapResponse {
success: bool,
links: Vec<String>,
}
impl FirecrawlApp {
/// Returns links from a URL using the Firecrawl API.
pub async fn map_url(
&self,
url: impl AsRef<str>,
options: impl Into<Option<MapOptions>>,
) -> Result<Vec<String>, FirecrawlError> {
let body = MapRequestBody {
url: url.as_ref().to_string(),
options: options.into().unwrap_or_default(),
};
let headers = self.prepare_headers(None);
let response = self
.client
.post(&format!("{}{}/map", self.api_url, API_VERSION))
.headers(headers)
.json(&body)
.send()
.await
.map_err(|e| FirecrawlError::HttpError(format!("Mapping {:?}", url.as_ref()), e))?;
let response = self.handle_response::<MapResponse>(response, "scrape URL").await?;
Ok(response.links)
}
}

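A minimal sketch of the new map endpoint defined above; the API key, target URL, and search term are placeholders:

use firecrawl::{map::MapOptions, FirecrawlApp};

#[tokio::main]
async fn main() {
    // Placeholder API key for illustration.
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // Only the search query is set; the remaining options keep their defaults.
    let options = MapOptions {
        search: Some("blog".to_string()),
        ..Default::default()
    };

    match app.map_url("https://firecrawl.dev", options).await {
        Ok(links) => println!("Mapped {} links:\n{:#?}", links.len(), links),
        Err(e) => eprintln!("Map failed: {:#?}", e),
    }
}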

@@ -42,21 +42,21 @@ pub enum ScrapeFormats {
Extract,
}
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ExtractOptions {
/// Schema the output should adhere to, provided in JSON Schema format.
pub schema: Option<Value>,
pub system_prompt: Option<Value>,
pub system_prompt: Option<String>,
/// Extraction prompt to send to the LLM agent along with the page content.
pub prompt: Option<Value>,
pub prompt: Option<String>,
}
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeOptions {
/// Formats to extract from the page. (default: `[ Markdown ]`)
@@ -89,7 +89,6 @@ pub struct ScrapeOptions {
}
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct ScrapeRequestBody {
url: String,
@@ -99,7 +98,6 @@ struct ScrapeRequestBody {
}
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde_with::skip_serializing_none]
#[serde(rename_all = "camelCase")]
struct ScrapeResponse {
/// This will always be `true` due to `FirecrawlApp::handle_response`.
@@ -111,14 +109,15 @@ struct ScrapeResponse {
}
impl FirecrawlApp {
/// Scrapes a URL using the Firecrawl API.
pub async fn scrape_url(
&self,
url: impl AsRef<str>,
options: Option<ScrapeOptions>,
options: impl Into<Option<ScrapeOptions>>,
) -> Result<Document, FirecrawlError> {
let body = ScrapeRequestBody {
url: url.as_ref().to_string(),
options: options.unwrap_or_default(),
options: options.into().unwrap_or_default(),
};
let headers = self.prepare_headers(None);
@@ -130,7 +129,7 @@ impl FirecrawlApp {
.json(&body)
.send()
.await
.map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
.map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
let response = self.handle_response::<ScrapeResponse>(response, "scrape URL").await?;

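A minimal sketch requesting both Markdown and HTML in one scrape, mirroring the options used in the test suite below; the API key and URL are placeholders:

use firecrawl::{scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp};

#[tokio::main]
async fn main() {
    // Placeholder API key for illustration.
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // Request two formats at once; formats not requested stay None on the returned Document.
    let options = ScrapeOptions {
        formats: Some(vec![ScrapeFormats::Markdown, ScrapeFormats::HTML]),
        ..Default::default()
    };

    let document = app.scrape_url("https://firecrawl.dev", options).await.expect("Scrape failed");
    println!("{}", document.markdown.unwrap_or_default());
    println!("{}", document.html.unwrap_or_default());
}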

@@ -1,24 +1,16 @@
use assert_matches::assert_matches;
use dotenvy::dotenv;
use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
use firecrawl::FirecrawlApp;
use serde_json::json;
use std::env;
use std::time::Duration;
use tokio::time::sleep;
#[tokio::test]
async fn test_no_api_key() {
dotenv().ok();
let api_url = env::var("API_URL").expect("API_URL environment variable is not set");
assert_matches!(FirecrawlApp::new(None, Some(api_url)), Err(e) if e.to_string() == "API key not provided");
}
#[tokio::test]
async fn test_blocklisted_url() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").unwrap();
let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let blocklisted_url = "https://facebook.com/fake-test";
let result = app.scrape_url(blocklisted_url, None).await;
@@ -32,74 +24,65 @@ async fn test_blocklisted_url() {
async fn test_successful_response_with_valid_preview_token() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let app = FirecrawlApp::new(
Some("this_is_just_a_preview_token".to_string()),
Some(api_url),
let app = FirecrawlApp::new_selfhosted(
api_url,
Some("this_is_just_a_preview_token"),
)
.unwrap();
let result = app
.scrape_url("https://roastmywebsite.ai", None)
.await
.unwrap();
assert!(result.as_object().unwrap().contains_key("content"));
assert!(result["content"].as_str().unwrap().contains("_Roast_"));
assert!(result.markdown.is_some());
assert!(result.markdown.unwrap().contains("_Roast_"));
}
#[tokio::test]
async fn test_scrape_url_e2e() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").unwrap();
let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let result = app
.scrape_url("https://roastmywebsite.ai", None)
.await
.unwrap();
assert!(result.as_object().unwrap().contains_key("content"));
assert!(result.as_object().unwrap().contains_key("markdown"));
assert!(result.as_object().unwrap().contains_key("metadata"));
assert!(!result.as_object().unwrap().contains_key("html"));
assert!(result["content"].as_str().unwrap().contains("_Roast_"));
assert!(result.markdown.is_some());
assert!(result.markdown.unwrap().contains("_Roast_"));
}
#[tokio::test]
async fn test_successful_response_with_valid_api_key_and_include_html() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").unwrap();
let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
let params = json!({
"pageOptions": {
"includeHtml": true
}
});
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let params = ScrapeOptions {
formats: vec! [ ScrapeFormats::Markdown, ScrapeFormats::HTML ].into(),
..Default::default()
};
let result = app
.scrape_url("https://roastmywebsite.ai", Some(params))
.scrape_url("https://roastmywebsite.ai", params)
.await
.unwrap();
assert!(result.as_object().unwrap().contains_key("content"));
assert!(result.as_object().unwrap().contains_key("markdown"));
assert!(result.as_object().unwrap().contains_key("html"));
assert!(result.as_object().unwrap().contains_key("metadata"));
assert!(result["content"].as_str().unwrap().contains("_Roast_"));
assert!(result["markdown"].as_str().unwrap().contains("_Roast_"));
assert!(result["html"].as_str().unwrap().contains("<h1"));
assert!(result.markdown.is_some());
assert!(result.html.is_some());
assert!(result.markdown.unwrap().contains("_Roast_"));
assert!(result.html.unwrap().contains("<h1"));
}
#[tokio::test]
async fn test_successful_response_for_valid_scrape_with_pdf_file() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").unwrap();
let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let result = app
.scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", None)
.await
.unwrap();
assert!(result.as_object().unwrap().contains_key("content"));
assert!(result.as_object().unwrap().contains_key("metadata"));
assert!(result["content"]
.as_str()
assert!(result.markdown.is_some());
assert!(result.markdown
.unwrap()
.contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
}
@@ -108,17 +91,14 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file() {
async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").unwrap();
let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let result = app
.scrape_url("https://arxiv.org/pdf/astro-ph/9301001", None)
.await
.unwrap();
sleep(Duration::from_secs(6)).await; // wait for 6 seconds
assert!(result.as_object().unwrap().contains_key("content"));
assert!(result.as_object().unwrap().contains_key("metadata"));
assert!(result["content"]
.as_str()
assert!(result.markdown.is_some());
assert!(result.markdown
.unwrap()
.contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
}
@@ -127,10 +107,10 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension() {
async fn test_should_return_error_for_blocklisted_url() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").unwrap();
let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let blocklisted_url = "https://twitter.com/fake-test";
let result = app.crawl_url(blocklisted_url, None, true, 1, None).await;
let result = app.crawl_url(blocklisted_url, None).await;
assert_matches!(
result,
@@ -142,13 +122,13 @@ async fn test_should_return_error_for_blocklisted_url() {
async fn test_llm_extraction() {
dotenv().ok();
let api_url = env::var("API_URL").unwrap();
let api_key = env::var("TEST_API_KEY").unwrap();
let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
let params = json!({
"extractorOptions": {
"mode": "llm-extraction",
"extractionPrompt": "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
"extractionSchema": {
let api_key = env::var("TEST_API_KEY").ok();
let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
let options = ScrapeOptions {
formats: vec! [ ScrapeFormats::Extract ].into(),
extract: ExtractOptions {
prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source".to_string().into(),
schema: json!({
"type": "object",
"properties": {
"company_mission": {"type": "string"},
@@ -156,15 +136,17 @@ async fn test_llm_extraction() {
"is_open_source": {"type": "boolean"}
},
"required": ["company_mission", "supports_sso", "is_open_source"]
}
}
});
}).into(),
..Default::default()
}.into(),
..Default::default()
};
let result = app
.scrape_url("https://mendable.ai", Some(params))
.scrape_url("https://mendable.ai", options)
.await
.unwrap();
assert!(result.as_object().unwrap().contains_key("llm_extraction"));
let llm_extraction = &result["llm_extraction"];
assert!(result.extract.is_some());
let llm_extraction = &result.extract.unwrap();
assert!(llm_extraction
.as_object()
.unwrap()