mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-11 11:29:02 +08:00)

Rust SDK 1.0.0

This commit is contained in:
parent 93a20442e3
commit a078cdbd9d
@@ -1,40 +1,38 @@
-use firecrawl::FirecrawlApp;
+use firecrawl::{crawl::CrawlOptions, scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}, FirecrawlApp};
 use serde_json::json;
 use uuid::Uuid;
 
 #[tokio::main]
 async fn main() {
     // Initialize the FirecrawlApp with the API key
-    let api_key = Some("fc-YOUR_API_KEY".to_string());
-    let api_url = Some("http://0.0.0.0:3002".to_string());
-    let app = FirecrawlApp::new(api_key, api_url).expect("Failed to initialize FirecrawlApp");
+    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");
+    // or, connect to a self-hosted instance:
+    // let app = FirecrawlApp::new_selfhosted("http://localhost:3002", None).expect("Failed to initialize FirecrawlApp");
 
     // Scrape a website
     let scrape_result = app.scrape_url("https://firecrawl.dev", None).await;
     match scrape_result {
-        Ok(data) => println!("Scrape Result:\n{}", data["markdown"]),
-        Err(e) => eprintln!("Scrape failed: {}", e),
+        Ok(data) => println!("Scrape Result:\n{}", data.markdown.unwrap()),
+        Err(e) => eprintln!("Scrape failed: {:#?}", e),
     }
 
     // Crawl a website
-    let random_uuid = String::from(Uuid::new_v4());
-    let idempotency_key = Some(random_uuid); // optional idempotency key
-    let crawl_params = json!({
-        "crawlerOptions": {
-            "excludes": ["blog/*"]
-        }
-    });
+    let idempotency_key = String::from(Uuid::new_v4());
+    let crawl_options = CrawlOptions {
+        exclude_paths: Some(vec![ "blog/*".to_string() ]),
+        poll_interval: Some(2000),
+        idempotency_key: Some(idempotency_key),
+        ..Default::default()
+    };
     let crawl_result = app
         .crawl_url(
             "https://mendable.ai",
-            Some(crawl_params),
-            true,
-            2,
-            idempotency_key,
+            crawl_options,
         )
         .await;
     match crawl_result {
-        Ok(data) => println!("Crawl Result:\n{}", data),
+        Ok(data) => println!("Crawl Result (used {} credits):\n{:#?}", data.credits_used, data.data),
         Err(e) => eprintln!("Crawl failed: {}", e),
     }
 
@@ -62,21 +60,20 @@ async fn main() {
         "required": ["top"]
     });
 
-    let llm_extraction_params = json!({
-        "extractorOptions": {
-            "extractionSchema": json_schema,
-            "mode": "llm-extraction"
-        },
-        "pageOptions": {
-            "onlyMainContent": true
-        }
-    });
+    let llm_extraction_options = ScrapeOptions {
+        formats: Some(vec![ ScrapeFormats::Extract ]),
+        extract: Some(ExtractOptions {
+            schema: Some(json_schema),
+            ..Default::default()
+        }),
+        ..Default::default()
+    };
 
     let llm_extraction_result = app
-        .scrape_url("https://news.ycombinator.com", Some(llm_extraction_params))
+        .scrape_url("https://news.ycombinator.com", llm_extraction_options)
         .await;
     match llm_extraction_result {
-        Ok(data) => println!("LLM Extraction Result:\n{}", data["llm_extraction"]),
+        Ok(data) => println!("LLM Extraction Result:\n{:#?}", data.extract.unwrap()),
         Err(e) => eprintln!("LLM Extraction failed: {}", e),
     }
 }
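For reference, the 1.0.0 API also supports a non-blocking crawl flow (see the crawl module changes below). The following is a minimal sketch, not part of the commit: it assumes a valid API key, that crawl_url_async accepts the same optional CrawlOptions as crawl_url, and that the returned job exposes a public id field.

use firecrawl::FirecrawlApp;

#[tokio::main]
async fn main() {
    let app = FirecrawlApp::new("fc-YOUR-API-KEY").expect("Failed to initialize FirecrawlApp");

    // Start the crawl and return immediately with a job handle.
    let job = app
        .crawl_url_async("https://mendable.ai", None)
        .await
        .expect("Failed to start crawl job");

    // Poll the job later (or from another task) using its ID.
    let status = app
        .check_crawl_status(&job.id)
        .await
        .expect("Failed to check crawl status");
    println!("Crawl status so far:\n{:#?}", status);
}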
@@ -48,8 +48,8 @@ impl From<CrawlScrapeFormats> for ScrapeFormats {
     }
 }
 
-#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct CrawlScrapeOptions {
     /// Formats to extract from the page. (default: `[ Markdown ]`)
@@ -93,8 +93,8 @@ impl From<CrawlScrapeOptions> for ScrapeOptions {
     }
 }
 
-#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct CrawlOptions {
     /// Options to pass through to the scraper.
@@ -103,12 +103,12 @@ pub struct CrawlOptions {
     /// URL RegEx patterns to (exclusively) include.
     ///
     /// For example, if you specified `"blog"`, only pages that have `blog` somewhere in the URL would be crawled.
-    pub include_paths: Option<String>,
+    pub include_paths: Option<Vec<String>>,
 
     /// URL RegEx patterns to exclude.
     ///
     /// For example, if you specified `"blog"`, pages that have `blog` somewhere in the URL would not be crawled.
-    pub exclude_paths: Option<String>,
+    pub exclude_paths: Option<Vec<String>>,
 
     /// Maximum URL depth to crawl, relative to the base URL. (default: `2`)
     pub max_depth: Option<u32>,
@@ -138,7 +138,6 @@ pub struct CrawlOptions {
 }
 
 #[derive(Deserialize, Serialize, Debug, Default)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 struct CrawlRequestBody {
     url: String,
@@ -148,7 +147,6 @@ struct CrawlRequestBody {
 }
 
 #[derive(Deserialize, Serialize, Debug, Default)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 struct CrawlResponse {
     /// This will always be `true` due to `FirecrawlApp::handle_response`.
@@ -175,8 +173,8 @@ pub enum CrawlStatusTypes {
     Cancelled,
 }
 
-#[derive(Deserialize, Serialize, Debug, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct CrawlStatus {
     /// The status of the crawl.
@@ -203,7 +201,6 @@ pub struct CrawlStatus {
 }
 
 #[derive(Deserialize, Serialize, Debug, Clone)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 pub struct CrawlAsyncResponse {
     success: bool,
@@ -216,6 +213,7 @@ pub struct CrawlAsyncResponse {
 }
 
 impl FirecrawlApp {
+    /// Initiates a crawl job for a URL using the Firecrawl API.
     pub async fn crawl_url_async(
         &self,
         url: impl AsRef<str>,
@@ -235,61 +233,63 @@ impl FirecrawlApp {
             .json(&body)
             .send()
             .await
-            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;
 
         self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
     }
 
+    /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
     pub async fn crawl_url(
         &self,
         url: impl AsRef<str>,
-        options: Option<CrawlOptions>,
-    ) -> Result<Vec<Document>, FirecrawlError> {
+        options: impl Into<Option<CrawlOptions>>,
+    ) -> Result<CrawlStatus, FirecrawlError> {
+        let options = options.into();
         let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);
 
         let res = self.crawl_url_async(url, options).await?;
 
         self.monitor_job_status(&res.id, poll_interval).await
     }
 
-    pub async fn check_crawl_status(&self, id: &str) -> Result<CrawlStatus, FirecrawlError> {
+    /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
+    pub async fn check_crawl_status(&self, id: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
         let response = self
             .client
             .get(&format!(
                 "{}{}/crawl/{}",
-                self.api_url, API_VERSION, id
+                self.api_url, API_VERSION, id.as_ref()
             ))
             .headers(self.prepare_headers(None))
             .send()
             .await
-            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+            .map_err(|e| FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e))?;
 
-        self.handle_response(response, "check crawl status").await
+        self.handle_response(response, format!("Checking status of crawl {}", id.as_ref())).await
     }
 
     async fn monitor_job_status(
         &self,
         id: &str,
         poll_interval: u64,
-    ) -> Result<Vec<Document>, FirecrawlError> {
+    ) -> Result<CrawlStatus, FirecrawlError> {
         loop {
             let status_data = self.check_crawl_status(id).await?;
             match status_data.status {
                 CrawlStatusTypes::Completed => {
-                    return Ok(status_data.data);
+                    return Ok(status_data);
                 }
                 CrawlStatusTypes::Scraping => {
-                    tokio::time::sleep(tokio::time::Duration::from_secs(poll_interval)).await;
+                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                 }
                 CrawlStatusTypes::Failed => {
                     return Err(FirecrawlError::CrawlJobFailed(format!(
                         "Crawl job failed."
-                    )));
+                    ), status_data));
                 }
                 CrawlStatusTypes::Cancelled => {
                     return Err(FirecrawlError::CrawlJobFailed(format!(
                         "Crawl job was cancelled."
-                    )));
+                    ), status_data));
                 }
             }
         }
     }
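Two behavioral notes fall out of these hunks: include_paths and exclude_paths now take a list of patterns, and poll_interval is interpreted in milliseconds (from_millis above). A hedged usage sketch, not part of the commit; the URL and path patterns are placeholders:

use firecrawl::{crawl::CrawlOptions, FirecrawlApp, FirecrawlError};

async fn crawl_docs(app: &FirecrawlApp) {
    let options = CrawlOptions {
        include_paths: Some(vec!["docs/*".to_string()]),
        exclude_paths: Some(vec!["blog/*".to_string()]),
        max_depth: Some(2),
        poll_interval: Some(500), // poll the job every 500 ms
        ..Default::default()
    };

    match app.crawl_url("https://example.com", options).await {
        // crawl_url now resolves to the full CrawlStatus instead of Vec<Document>.
        Ok(status) => println!("Crawl finished:\n{:#?}", status),
        // CrawlJobFailed now carries the final CrawlStatus alongside the message.
        Err(FirecrawlError::CrawlJobFailed(message, status)) => {
            eprintln!("{}\nlast status: {:#?}", message, status)
        }
        Err(other) => eprintln!("Request error: {}", other),
    }
}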
@@ -1,8 +1,8 @@
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 
-#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct DocumentMetadata {
     // firecrawl specific
@@ -12,8 +12,8 @@ pub struct DocumentMetadata {
     pub error: Option<String>,
 
     // basic meta tags
-    pub title: String,
-    pub description: String,
+    pub title: Option<String>,
+    pub description: Option<String>,
     pub language: Option<String>,
     pub keywords: Option<String>,
     pub robots: Option<String>,
@@ -26,7 +26,7 @@ pub struct DocumentMetadata {
     pub og_audio: Option<String>,
     pub og_determiner: Option<String>,
     pub og_locale: Option<String>,
-    pub og_locale_alternate: Option<String>,
+    pub og_locale_alternate: Option<Vec<String>>,
     pub og_site_name: Option<String>,
     pub og_video: Option<String>,
 
@@ -49,8 +49,8 @@ pub struct DocumentMetadata {
     pub dcterms_created: Option<String>,
 }
 
-#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct Document {
     /// A list of the links on the page, present if `ScrapeFormats::Markdown` is present in `ScrapeOptions.formats`. (default)
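With title and description becoming optional and og_locale_alternate becoming a list, downstream code has to cope with absent metadata. A small sketch, not part of the commit, working only with the DocumentMetadata fields shown above:

use firecrawl::document::DocumentMetadata;

fn describe(metadata: &DocumentMetadata) {
    // title and description are Option<String> after this change.
    let title = metadata.title.as_deref().unwrap_or("(no title)");
    let description = metadata.description.as_deref().unwrap_or("(no description)");
    println!("{}: {}", title, description);

    // og_locale_alternate is now a list rather than a single string.
    if let Some(locales) = metadata.og_locale_alternate.as_ref() {
        println!("alternate locales: {}", locales.join(", "));
    }
}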
@@ -1,7 +1,11 @@
+use std::fmt::Display;
+
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use thiserror::Error;
 
+use crate::crawl::CrawlStatus;
+
 #[derive(Debug, Deserialize, Serialize, Clone)]
 pub struct FirecrawlAPIError {
     /// Always false.
@@ -14,16 +18,28 @@ pub struct FirecrawlAPIError {
     pub details: Option<Value>,
 }
 
+impl Display for FirecrawlAPIError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if let Some(details) = self.details.as_ref() {
+            write!(f, "{} ({})", self.error, details)
+        } else {
+            write!(f, "{}", self.error)
+        }
+    }
+}
+
 #[derive(Error, Debug)]
 pub enum FirecrawlError {
-    #[error("HTTP request failed: {0}")]
-    HttpRequestFailed(String),
-    #[error("API key not provided")]
-    APIKeyNotProvided,
+    #[error("{0} failed: HTTP error {1}: {2}")]
+    HttpRequestFailed(String, u16, String),
+    #[error("{0} failed: HTTP error: {1}")]
+    HttpError(String, reqwest::Error),
+    #[error("Failed to parse response as text: {0}")]
+    ResponseParseErrorText(reqwest::Error),
     #[error("Failed to parse response: {0}")]
-    ResponseParseError(String),
-    #[error("API error")]
-    APIError(FirecrawlAPIError),
-    #[error("Crawl job failed or stopped: {0}")]
-    CrawlJobFailed(String),
+    ResponseParseError(serde_json::Error),
+    #[error("{0} failed: {1}")]
+    APIError(String, FirecrawlAPIError),
+    #[error("Crawl job failed: {0}")]
+    CrawlJobFailed(String, CrawlStatus),
 }
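The reworked enum separates transport failures, non-2xx responses, body-parse failures, and structured API errors, and attaches the last CrawlStatus to CrawlJobFailed. A sketch of how a caller might branch on the variants above, assuming the enum stays exhaustive as shown; not part of the commit:

use firecrawl::FirecrawlError;

fn report(err: &FirecrawlError) {
    match err {
        // The request never produced a usable response.
        FirecrawlError::HttpError(action, source) => {
            eprintln!("{} failed before a response arrived: {}", action, source)
        }
        // The server answered with a non-success status code.
        FirecrawlError::HttpRequestFailed(action, code, status) => {
            eprintln!("{} failed with HTTP {} {}", action, code, status)
        }
        // The body could not be read or decoded as JSON.
        FirecrawlError::ResponseParseErrorText(e) => eprintln!("could not read response body: {}", e),
        FirecrawlError::ResponseParseError(e) => eprintln!("could not decode response JSON: {}", e),
        // The API replied with success: false and an error payload.
        FirecrawlError::APIError(action, api_error) => eprintln!("{} rejected by API: {}", action, api_error),
        // A crawl ended as Failed or Cancelled; the final status is attached.
        FirecrawlError::CrawlJobFailed(message, status) => {
            eprintln!("{}\nlast status: {:#?}", message, status)
        }
    }
}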
@@ -1,18 +1,18 @@
 use reqwest::{Client, Response};
 use serde::de::DeserializeOwned;
-use serde_json::json;
 use serde_json::Value;
 
 pub mod crawl;
 pub mod document;
 mod error;
+pub mod map;
 pub mod scrape;
 
 pub use error::FirecrawlError;
 
 #[derive(Clone, Debug)]
 pub struct FirecrawlApp {
-    api_key: String,
+    api_key: Option<String>,
     api_url: String,
     client: Client,
 }
@@ -20,15 +20,14 @@ pub struct FirecrawlApp {
 pub(crate) const API_VERSION: &str = "/v1";
 
 impl FirecrawlApp {
-    pub fn new(api_key: Option<String>, api_url: Option<String>) -> Result<Self, FirecrawlError> {
-        let api_key = api_key
-            .ok_or(FirecrawlError::APIKeyNotProvided)?;
-        let api_url = api_url
-            .unwrap_or_else(|| "https://api.firecrawl.dev".to_string());
+    pub fn new(api_key: impl AsRef<str>) -> Result<Self, FirecrawlError> {
+        FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key))
+    }
 
+    pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
         Ok(FirecrawlApp {
-            api_key,
-            api_url,
+            api_key: api_key.map(|x| x.as_ref().to_string()),
+            api_url: api_url.as_ref().to_string(),
             client: Client::new(),
         })
     }
@@ -36,10 +35,12 @@ impl FirecrawlApp {
     fn prepare_headers(&self, idempotency_key: Option<&String>) -> reqwest::header::HeaderMap {
         let mut headers = reqwest::header::HeaderMap::new();
         headers.insert("Content-Type", "application/json".parse().unwrap());
-        headers.insert(
-            "Authorization",
-            format!("Bearer {}", self.api_key).parse().unwrap(),
-        );
+        if let Some(api_key) = self.api_key.as_ref() {
+            headers.insert(
+                "Authorization",
+                format!("Bearer {}", api_key).parse().unwrap(),
+            );
+        }
         if let Some(key) = idempotency_key {
             headers.insert("x-idempotency-key", key.parse().unwrap());
         }
@@ -51,48 +52,34 @@ impl FirecrawlApp {
         response: Response,
         action: impl AsRef<str>,
     ) -> Result<T, FirecrawlError> {
-        if response.status().is_success() {
-            let response_json: Value = response
-                .json()
-                .await
-                .map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?;
-            if response_json["success"].as_bool().unwrap_or(false) {
-                Ok(serde_json::from_value(response_json).map_err(|e| FirecrawlError::ResponseParseError(e.to_string()))?)
-            } else {
-                Err(FirecrawlError::HttpRequestFailed(format!(
-                    "Failed to {}: {}",
-                    action.as_ref(), response_json["error"]
-                )))
-            }
-        } else {
-            let status_code = response.status().as_u16();
-            let error_message = response
-                .json::<Value>()
-                .await
-                .unwrap_or_else(|_| json!({"error": "No additional error details provided."}));
-            let message = match status_code {
-                402 => format!(
-                    "Payment Required: Failed to {}. {}",
-                    action.as_ref(), error_message["error"]
-                ),
-                408 => format!(
-                    "Request Timeout: Failed to {} as the request timed out. {}",
-                    action.as_ref(), error_message["error"]
-                ),
-                409 => format!(
-                    "Conflict: Failed to {} due to a conflict. {}",
-                    action.as_ref(), error_message["error"]
-                ),
-                500 => format!(
-                    "Internal Server Error: Failed to {}. {}",
-                    action.as_ref(), error_message["error"]
-                ),
-                _ => format!(
-                    "Unexpected error during {}: Status code {}. {}",
-                    action.as_ref(), status_code, error_message["error"]
-                ),
-            };
-            Err(FirecrawlError::HttpRequestFailed(message))
-        }
+        let (is_success, status) = (response.status().is_success(), response.status());
+
+        let response = response
+            .text()
+            .await
+            .map_err(|e| FirecrawlError::ResponseParseErrorText(e))
+            .and_then(|response_json| serde_json::from_str::<Value>(&response_json).map_err(|e| FirecrawlError::ResponseParseError(e)))
+            .and_then(|response_value| {
+                if response_value["success"].as_bool().unwrap_or(false) {
+                    Ok(serde_json::from_value::<T>(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?)
+                } else {
+                    Err(FirecrawlError::APIError(
+                        action.as_ref().to_string(),
+                        serde_json::from_value(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?
+                    ))
+                }
+            });
+
+        match &response {
+            Ok(_) => response,
+            Err(FirecrawlError::ResponseParseError(_)) | Err(FirecrawlError::ResponseParseErrorText(_)) => {
+                if is_success {
+                    response
+                } else {
+                    Err(FirecrawlError::HttpRequestFailed(action.as_ref().to_string(), status.as_u16(), status.as_str().to_string()))
+                }
+            },
+            Err(_) => response,
         }
     }
 }
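The constructor split means the hosted endpoint is implied by new(), while new_selfhosted() makes the key optional, and prepare_headers only attaches the Authorization header when a key is present. A minimal construction sketch, not part of the commit; the turbofish on None is only there to pin the generic key type:

use firecrawl::FirecrawlApp;

fn build_clients() -> Result<(), firecrawl::FirecrawlError> {
    // Hosted API: the key is required and https://api.firecrawl.dev is implied.
    let cloud = FirecrawlApp::new("fc-YOUR-API-KEY")?;

    // Self-hosted instance: the key is optional; with None no Authorization header is sent.
    let local = FirecrawlApp::new_selfhosted("http://localhost:3002", None::<&str>)?;

    let _ = (cloud, local);
    Ok(())
}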
apps/rust-sdk/src/map.rs (new file, 66 lines)
@@ -0,0 +1,66 @@
+use serde::{Deserialize, Serialize};
+
+use crate::{FirecrawlApp, FirecrawlError, API_VERSION};
+
+#[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct MapOptions {
+    /// Search query to use for mapping
+    pub search: Option<String>,
+
+    /// Ignore the website sitemap when crawling (default: `true`)
+    pub ignore_sitemap: Option<bool>,
+
+    /// Include subdomains of the website (default: `true`)
+    pub include_subdomains: Option<bool>,
+
+    /// Maximum number of links to return (default: `5000`)
+    pub exclude_tags: Option<u32>,
+}
+
+#[derive(Deserialize, Serialize, Debug, Default)]
+#[serde(rename_all = "camelCase")]
+struct MapRequestBody {
+    url: String,
+
+    #[serde(flatten)]
+    options: MapOptions,
+}
+
+#[derive(Deserialize, Serialize, Debug, Default)]
+#[serde(rename_all = "camelCase")]
+struct MapResponse {
+    success: bool,
+
+    links: Vec<String>,
+}
+
+impl FirecrawlApp {
+    /// Returns links from a URL using the Firecrawl API.
+    pub async fn map_url(
+        &self,
+        url: impl AsRef<str>,
+        options: impl Into<Option<MapOptions>>,
+    ) -> Result<Vec<String>, FirecrawlError> {
+        let body = MapRequestBody {
+            url: url.as_ref().to_string(),
+            options: options.into().unwrap_or_default(),
+        };
+
+        let headers = self.prepare_headers(None);
+
+        let response = self
+            .client
+            .post(&format!("{}{}/map", self.api_url, API_VERSION))
+            .headers(headers)
+            .json(&body)
+            .send()
+            .await
+            .map_err(|e| FirecrawlError::HttpError(format!("Mapping {:?}", url.as_ref()), e))?;
+
+        let response = self.handle_response::<MapResponse>(response, "scrape URL").await?;
+
+        Ok(response.links)
+    }
+}
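A usage sketch for the new /map endpoint, not part of the commit; it assumes a client built as above, and the search term and URL are placeholders:

use firecrawl::{map::MapOptions, FirecrawlApp, FirecrawlError};

async fn list_links(app: &FirecrawlApp) -> Result<Vec<String>, FirecrawlError> {
    let options = MapOptions {
        search: Some("docs".to_string()),
        include_subdomains: Some(false),
        ..Default::default()
    };

    // map_url resolves to the discovered links as plain strings.
    let links = app.map_url("https://firecrawl.dev", options).await?;
    for link in &links {
        println!("{}", link);
    }
    Ok(links)
}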
@@ -42,21 +42,21 @@ pub enum ScrapeFormats {
     Extract,
 }
 
-#[derive(Deserialize, Serialize, Debug, Default)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default)]
 #[serde(rename_all = "camelCase")]
 pub struct ExtractOptions {
     /// Schema the output should adhere to, provided in JSON Schema format.
     pub schema: Option<Value>,
 
-    pub system_prompt: Option<Value>,
+    pub system_prompt: Option<String>,
 
     /// Extraction prompt to send to the LLM agent along with the page content.
-    pub prompt: Option<Value>,
+    pub prompt: Option<String>,
 }
 
-#[derive(Deserialize, Serialize, Debug, Default)]
 #[serde_with::skip_serializing_none]
+#[derive(Deserialize, Serialize, Debug, Default)]
 #[serde(rename_all = "camelCase")]
 pub struct ScrapeOptions {
     /// Formats to extract from the page. (default: `[ Markdown ]`)
@@ -89,7 +89,6 @@ pub struct ScrapeOptions {
 }
 
 #[derive(Deserialize, Serialize, Debug, Default)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 struct ScrapeRequestBody {
     url: String,
@@ -99,7 +98,6 @@ struct ScrapeRequestBody {
 }
 
 #[derive(Deserialize, Serialize, Debug, Default)]
-#[serde_with::skip_serializing_none]
 #[serde(rename_all = "camelCase")]
 struct ScrapeResponse {
     /// This will always be `true` due to `FirecrawlApp::handle_response`.
@@ -111,14 +109,15 @@ struct ScrapeResponse {
 }
 
 impl FirecrawlApp {
+    /// Scrapes a URL using the Firecrawl API.
     pub async fn scrape_url(
         &self,
         url: impl AsRef<str>,
-        options: Option<ScrapeOptions>,
+        options: impl Into<Option<ScrapeOptions>>,
     ) -> Result<Document, FirecrawlError> {
         let body = ScrapeRequestBody {
             url: url.as_ref().to_string(),
-            options: options.unwrap_or_default(),
+            options: options.into().unwrap_or_default(),
         };
 
         let headers = self.prepare_headers(None);
@@ -130,7 +129,7 @@ impl FirecrawlApp {
             .json(&body)
             .send()
             .await
-            .map_err(|e| FirecrawlError::HttpRequestFailed(e.to_string()))?;
+            .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;
 
         let response = self.handle_response::<ScrapeResponse>(response, "scrape URL").await?;
 
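Because scrape_url now takes impl Into<Option<ScrapeOptions>>, callers can pass None or pass an options struct directly without wrapping it in Some. A short sketch, not part of the commit, using only fields visible in this diff:

use firecrawl::scrape::{ScrapeFormats, ScrapeOptions};
use firecrawl::{FirecrawlApp, FirecrawlError};

async fn scrape_with_html(app: &FirecrawlApp) -> Result<(), FirecrawlError> {
    // Default formats: markdown only.
    let doc = app.scrape_url("https://firecrawl.dev", None).await?;
    println!("{}", doc.markdown.unwrap_or_default());

    // Options can be passed without Some(...) thanks to Into<Option<_>>.
    let options = ScrapeOptions {
        formats: Some(vec![ScrapeFormats::Markdown, ScrapeFormats::HTML]),
        ..Default::default()
    };
    let doc = app.scrape_url("https://firecrawl.dev", options).await?;
    println!("HTML length: {}", doc.html.map(|h| h.len()).unwrap_or(0));
    Ok(())
}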
@@ -1,24 +1,16 @@
 use assert_matches::assert_matches;
 use dotenvy::dotenv;
+use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
 use firecrawl::FirecrawlApp;
 use serde_json::json;
 use std::env;
-use std::time::Duration;
-use tokio::time::sleep;
 
-#[tokio::test]
-async fn test_no_api_key() {
-    dotenv().ok();
-    let api_url = env::var("API_URL").expect("API_URL environment variable is not set");
-    assert_matches!(FirecrawlApp::new(None, Some(api_url)), Err(e) if e.to_string() == "API key not provided");
-}
-
 #[tokio::test]
 async fn test_blocklisted_url() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
     let blocklisted_url = "https://facebook.com/fake-test";
     let result = app.scrape_url(blocklisted_url, None).await;
 
@@ -32,74 +24,65 @@ async fn test_blocklisted_url() {
 async fn test_successful_response_with_valid_preview_token() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let app = FirecrawlApp::new(
-        Some("this_is_just_a_preview_token".to_string()),
-        Some(api_url),
+    let app = FirecrawlApp::new_selfhosted(
+        api_url,
+        Some("this_is_just_a_preview_token"),
     )
     .unwrap();
     let result = app
         .scrape_url("https://roastmywebsite.ai", None)
         .await
         .unwrap();
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result["content"].as_str().unwrap().contains("_Roast_"));
+    assert!(result.markdown.is_some());
+    assert!(result.markdown.unwrap().contains("_Roast_"));
 }
 
 #[tokio::test]
 async fn test_scrape_url_e2e() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
     let result = app
         .scrape_url("https://roastmywebsite.ai", None)
         .await
         .unwrap();
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result.as_object().unwrap().contains_key("markdown"));
-    assert!(result.as_object().unwrap().contains_key("metadata"));
-    assert!(!result.as_object().unwrap().contains_key("html"));
-    assert!(result["content"].as_str().unwrap().contains("_Roast_"));
+    assert!(result.markdown.is_some());
+    assert!(result.markdown.unwrap().contains("_Roast_"));
 }
 
 #[tokio::test]
 async fn test_successful_response_with_valid_api_key_and_include_html() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
-    let params = json!({
-        "pageOptions": {
-            "includeHtml": true
-        }
-    });
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
+    let params = ScrapeOptions {
+        formats: vec! [ ScrapeFormats::Markdown, ScrapeFormats::HTML ].into(),
+        ..Default::default()
+    };
     let result = app
-        .scrape_url("https://roastmywebsite.ai", Some(params))
+        .scrape_url("https://roastmywebsite.ai", params)
         .await
         .unwrap();
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result.as_object().unwrap().contains_key("markdown"));
-    assert!(result.as_object().unwrap().contains_key("html"));
-    assert!(result.as_object().unwrap().contains_key("metadata"));
-    assert!(result["content"].as_str().unwrap().contains("_Roast_"));
-    assert!(result["markdown"].as_str().unwrap().contains("_Roast_"));
-    assert!(result["html"].as_str().unwrap().contains("<h1"));
+    assert!(result.markdown.is_some());
+    assert!(result.html.is_some());
+    assert!(result.markdown.unwrap().contains("_Roast_"));
+    assert!(result.html.unwrap().contains("<h1"));
 }
 
 #[tokio::test]
 async fn test_successful_response_for_valid_scrape_with_pdf_file() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
     let result = app
         .scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", None)
         .await
         .unwrap();
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result.as_object().unwrap().contains_key("metadata"));
-    assert!(result["content"]
-        .as_str()
+    assert!(result.markdown.is_some());
+    assert!(result.markdown
         .unwrap()
         .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
 }
@@ -108,17 +91,14 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file() {
 async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
     let result = app
         .scrape_url("https://arxiv.org/pdf/astro-ph/9301001", None)
         .await
         .unwrap();
-    sleep(Duration::from_secs(6)).await; // wait for 6 seconds
-    assert!(result.as_object().unwrap().contains_key("content"));
-    assert!(result.as_object().unwrap().contains_key("metadata"));
-    assert!(result["content"]
-        .as_str()
+    assert!(result.markdown.is_some());
+    assert!(result.markdown
         .unwrap()
         .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
 }
@@ -127,10 +107,10 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension() {
 async fn test_should_return_error_for_blocklisted_url() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
     let blocklisted_url = "https://twitter.com/fake-test";
-    let result = app.crawl_url(blocklisted_url, None, true, 1, None).await;
+    let result = app.crawl_url(blocklisted_url, None).await;
 
     assert_matches!(
         result,
@@ -142,13 +122,13 @@ async fn test_should_return_error_for_blocklisted_url() {
 async fn test_llm_extraction() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let api_key = env::var("TEST_API_KEY").unwrap();
-    let app = FirecrawlApp::new(Some(api_key), Some(api_url)).unwrap();
-    let params = json!({
-        "extractorOptions": {
-            "mode": "llm-extraction",
-            "extractionPrompt": "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-            "extractionSchema": {
+    let api_key = env::var("TEST_API_KEY").ok();
+    let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
+    let options = ScrapeOptions {
+        formats: vec! [ ScrapeFormats::Extract ].into(),
+        extract: ExtractOptions {
+            prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source".to_string().into(),
+            schema: json!({
                 "type": "object",
                 "properties": {
                     "company_mission": {"type": "string"},
@@ -156,15 +136,17 @@ async fn test_llm_extraction() {
                     "is_open_source": {"type": "boolean"}
                 },
                 "required": ["company_mission", "supports_sso", "is_open_source"]
-            }
-        }
-    });
+            }).into(),
+            ..Default::default()
+        }.into(),
+        ..Default::default()
+    };
     let result = app
-        .scrape_url("https://mendable.ai", Some(params))
+        .scrape_url("https://mendable.ai", options)
         .await
         .unwrap();
-    assert!(result.as_object().unwrap().contains_key("llm_extraction"));
-    let llm_extraction = &result["llm_extraction"];
+    assert!(result.extract.is_some());
+    let llm_extraction = &result.extract.unwrap();
     assert!(llm_extraction
         .as_object()
         .unwrap()