Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-12 02:09:03 +08:00)
feat(rust): update rust sdk to support new features (#1446)
* chore(rust-sdk): cargo fmt
* feat(rust-sdk): implement search api + example + test
* feat(rust-sdk): implement crawl cancel api + example + test
* feat(rust-sdk): implement crawl check errors api + example + test
* feat(rust-sdk): implement batch crawl + test + example + Fix MapOptions
* feat(rust-sdk): implement extract api + test + example
* feat(rust-sdk): implement llmtxt api + test + example
* chore(rust-sdk): correct mock tests
* chore(rust-sdk): prep for cargo distribution
This commit is contained in:
parent
33aece8e96
commit
f2c01340d1
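
Before the file-by-file diff, here is a minimal sketch of what the new search API looks like from caller code, condensed from the search_example.rs added below. It assumes a self-hosted instance reachable through the FIRECRAWL_API_URL environment variable, the same convention the bundled examples use.

use firecrawl::FirecrawlApp;
use std::error::Error;

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // Same convention as the examples in this commit: point the SDK at a self-hosted instance.
    let api_url = std::env::var("FIRECRAWL_API_URL")
        .expect("Please set the FIRECRAWL_API_URL environment variable");
    let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?;

    // New in this release: search. `None` uses the default search options;
    // search_with_params (see search_example.rs below) accepts a full SearchParams.
    let results = firecrawl.search("rust programming language", None).await?;
    for doc in &results.data {
        println!("{} - {}", doc.title, doc.url);
    }
    Ok(())
}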
1377 apps/rust-sdk/Cargo.lock (generated): file diff suppressed because it is too large.
apps/rust-sdk/Cargo.toml

@@ -1,13 +1,13 @@
 [package]
 name = "firecrawl"
 author= "Mendable.ai"
-version = "1.0.0"
+version = "1.1.0"
 edition = "2021"
 license = "MIT"
 homepage = "https://www.firecrawl.dev/"
 repository ="https://github.com/mendableai/firecrawl"
 description = "Rust SDK for Firecrawl API."
-authors = ["Gergő Móricz <mogery@firecrawl.dev>", "sanix-darker <sanixdk@gmail.com>"]
+authors = ["Gergő Móricz <mogery@firecrawl.dev>", "sanix-darker <sanixdk@gmail.com>", "kkharji <kkharji@protonmail.com>"]

 [lib]
 path = "src/lib.rs"
@@ -23,12 +23,18 @@ log = "^0.4"
 thiserror = "^1.0"
 uuid = { version = "^1.10", features = ["v4"] }
 tokio = { version = "^1", features = ["full"] }
+futures = "0.3.31"
+schemars = "0.8.22"

 [dev-dependencies]
 clippy = "^0.0.302"
 assert_matches = "^1.5"
 dotenvy = "^0.15"
 tokio = { version = "1", features = ["full"] }
+mockito = "1.7.0"
+clap = { version ="4.5.35", features = ["derive"] }
+axum = { version = "0.8.3", features = ["tokio", "macros"] }
+bat = "0.25.0"

 [build-dependencies]
 tokio = { version = "1", features = ["full"] }
175 apps/rust-sdk/examples/batch_scrape_example.rs (new file)

@@ -0,0 +1,175 @@
use clap::{Parser, Subcommand};
use firecrawl::{
    batch_scrape::{BatchScrapeParams, WebhookOptions},
    map::MapOptions,
    scrape::{ScrapeFormats, ScrapeOptions},
    FirecrawlApp,
};
use serde_json::Value;
use std::error::Error;
use std::net::SocketAddr;
use std::sync::Arc;
use tokio::sync::Mutex;

// Store webhook responses
struct WebhookState {
    responses: Vec<Value>,
}

#[derive(Parser)]
#[command(version, about, long_about = None)]
struct Cli {
    #[command(subcommand)]
    command: Commands,
}

#[derive(Subcommand)]
enum Commands {
    /// Multiple URL scraping with webhook monitoring
    Basic,
}

async fn create_firecrawl_app() -> Result<FirecrawlApp, Box<dyn Error>> {
    let api_url = std::env::var("FIRECRAWL_API_URL")
        .expect("Please set the FIRECRAWL_API_URL environment variable");
    FirecrawlApp::new_selfhosted(api_url, None::<&str>).map_err(|e| e.into())
}

// Start webhook server and return its address
async fn start_webhook_server(
    port: u16,
    state: Arc<Mutex<WebhookState>>,
) -> Result<String, Box<dyn Error>> {
    let state = state.clone();
    use axum::routing::post;
    use axum::Json;

    let app = axum::Router::new().route(
        "/",
        post(move |body: Json<Value>| {
            let state = state.clone();
            async move {
                state.lock().await.responses.push(body.0.clone());
                match serde_json::to_string_pretty(&body.0) {
                    Ok(data) => println!(
                        "Received webhook: {}",
                        serde_json::to_string_pretty(&data).unwrap()
                    ),
                    Err(_) => println!("Received webhook: {}", body.0),
                }
                "OK"
            }
        }),
    );

    let addr = SocketAddr::from(([0, 0, 0, 0], port));
    let webhook_url = format!("http://host.docker.internal:{}", port);

    tokio::spawn(async move {
        let listener = tokio::net::TcpListener::bind(addr)
            .await
            .inspect_err(|err| println!("{err:?}"))
            .unwrap();

        if let Err(e) = axum::serve(listener, app).await {
            eprintln!("Webhook server error: {}", e);
        }
    });

    println!("Webhook server running at {}", webhook_url);

    Ok(webhook_url)
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let cli = Cli::parse();
    let firecrawl = create_firecrawl_app().await?;

    let state = Arc::new(Mutex::new(WebhookState { responses: vec![] }));
    let webhook_url = start_webhook_server(39120, state.clone()).await?;

    match cli.command {
        Commands::Basic => {
            let mut urls = Vec::new();

            let url_one = "https://invalid-url.url/";
            println!("Mapping: {}", url_one);
            match firecrawl.map_url(url_one, None).await {
                Ok(mapped_urls) => urls.extend(mapped_urls),
                Err(e) => println!("Error mapping {}: {}", url_one, e),
            }

            let url_two = "https://www.devjobsscanner.com";
            println!("Mapping: {}", url_two);
            match firecrawl
                .map_url(
                    url_two,
                    Some(MapOptions {
                        search: Some("rust".into()),
                        limit: Some(20),
                        ..Default::default()
                    }),
                )
                .await
            {
                Ok(mapped_urls) => urls.extend(mapped_urls),
                Err(e) => println!("Error mapping {}: {}", url_two, e),
            }

            test_multiple_urls(&firecrawl, urls, &webhook_url).await?;

            // Give time for webhooks to arrive
            tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
            println!(
                "Received {} webhook responses",
                state.lock().await.responses.len()
            );
        }
    }

    Ok(())
}

async fn test_multiple_urls(
    app: &FirecrawlApp,
    urls: Vec<String>,
    webhook_url: &str,
) -> Result<(), Box<dyn Error>> {
    println!("Testing batch scraping of {} URLs", urls.len());

    let webhook = WebhookOptions {
        url: webhook_url.to_string(),
        headers: None,
        auth_token: None,
    };

    let params = BatchScrapeParams {
        urls,
        webhook: Some(webhook),
        ignore_invalid_urls: true,
        options: Some(ScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown, ScrapeFormats::Links]),
            ..Default::default()
        }),
        ..Default::default()
    };

    let batch = app.async_batch_scrape_urls(params).await?;
    println!("Batch job started: {}", batch.id);

    // Poll status periodically
    loop {
        let status = app.check_batch_scrape_status(&batch.id).await?;
        println!("Progress: {}/{} pages", status.completed, status.total);

        if status.completed >= status.total {
            println!("Batch job completed!");
            break;
        }

        tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
    }

    Ok(())
}
33 apps/rust-sdk/examples/cancel_crawl_example.rs (new file)

@@ -0,0 +1,33 @@
use firecrawl::FirecrawlApp;
use std::error::Error;
use std::time::Duration;

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // Get API URL from environment
    let api_url = std::env::var("FIRECRAWL_API_URL")
        .expect("Please set the FIRECRAWL_API_URL environment variable");

    // Create the FirecrawlApp instance
    let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?;

    // Start a crawl job
    println!("Starting a crawl job...");
    let crawl_response = firecrawl
        .crawl_url_async("https://example.com", None)
        .await?;
    println!("Crawl job started with ID: {}", crawl_response.id);

    // Wait for a moment to let the crawl job start
    println!("Waiting for a moment...");
    tokio::time::sleep(Duration::from_secs(2)).await;

    // Cancel the crawl job
    println!("Cancelling the crawl job...");
    let cancel_response = firecrawl.cancel_crawl(&crawl_response.id).await?;

    println!("Cancellation result:");
    println!(" Status: {:?}", cancel_response.status);

    Ok(())
}
59 apps/rust-sdk/examples/check_crawl_errors_example.rs (new file)

@@ -0,0 +1,59 @@
use firecrawl::FirecrawlApp;
use std::error::Error;

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // Get API URL from environment
    let api_url = std::env::var("FIRECRAWL_API_URL")
        .expect("Please set the FIRECRAWL_API_URL environment variable");

    // Create the FirecrawlApp instance
    let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?;

    // Start a crawl job that will likely have some errors (invalid URL format)
    println!("Starting a crawl job...");
    let crawl_response = firecrawl
        .crawl_url_async("https://no-wer-agg.invalid", None)
        .await?;
    println!("Crawl job started with ID: {}", crawl_response.id);

    println!("Let it do it's thing...");
    tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;

    // Check the crawl errors
    println!("Checking for crawl errors...");
    match firecrawl.check_crawl_errors(&crawl_response.id).await {
        Ok(error_response) => {
            println!("Crawl errors response:");
            println!(" Number of errors: {}", error_response.errors.len());

            if !error_response.errors.is_empty() {
                println!("\nDetailed errors:");
                for (i, error) in error_response.errors.iter().enumerate() {
                    println!("Error #{}", i + 1);
                    println!(" ID: {}", error.id);
                    if let Some(timestamp) = &error.timestamp {
                        println!(" Timestamp: {}", timestamp);
                    }
                    println!(" URL: {}", error.url);
                    println!(" Error: {}", error.error);
                }
            }

            println!(
                "\nRobots.txt blocked URLs: {}",
                error_response.robots_blocked.len()
            );
            for (i, url) in error_response.robots_blocked.iter().enumerate() {
                println!(" {}. {}", i + 1, url);
            }
        }
        Err(e) => {
            println!("Failed to check crawl errors: {}", e);
        }
    }
    let cancel = firecrawl.cancel_crawl(&crawl_response.id).await?;
    println!("Cancel: {}", cancel.status);

    Ok(())
}
@@ -1,4 +1,8 @@
-use firecrawl::{crawl::CrawlOptions, scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions}, FirecrawlApp};
+use firecrawl::{
+    crawl::CrawlOptions,
+    scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions},
+    FirecrawlApp,
+};
 use serde_json::json;

 #[tokio::main]
@@ -19,16 +23,17 @@ async fn main() {

     // Crawl a website
     let crawl_options = CrawlOptions {
-        exclude_paths: vec![ "blog/*".into() ].into(),
+        exclude_paths: vec!["blog/*".into()].into(),
         ..Default::default()
     };

-    let crawl_result = app
-        .crawl_url("https://mendable.ai", crawl_options)
-        .await;
+    let crawl_result = app.crawl_url("https://mendable.ai", crawl_options).await;

     match crawl_result {
-        Ok(data) => println!("Crawl Result (used {} credits):\n{:#?}", data.credits_used, data.data),
+        Ok(data) => println!(
+            "Crawl Result (used {} credits):\n{:#?}",
+            data.credits_used, data.data
+        ),
         Err(e) => eprintln!("Crawl failed: {}", e),
     }

@@ -57,11 +62,12 @@ async fn main() {
     });

     let llm_extraction_options = ScrapeOptions {
-        formats: vec![ ScrapeFormats::Extract ].into(),
+        formats: vec![ScrapeFormats::Extract].into(),
         extract: ExtractOptions {
             schema: json_schema.into(),
             ..Default::default()
-        }.into(),
+        }
+        .into(),
         ..Default::default()
     };

@@ -75,9 +81,7 @@ async fn main() {
     }

     // Map a website (Alpha)
-    let map_result = app
-        .map_url("https://firecrawl.dev", None)
-        .await;
+    let map_result = app.map_url("https://firecrawl.dev", None).await;

     match map_result {
         Ok(data) => println!("Mapped URLs: {:#?}", data),
237 apps/rust-sdk/examples/extract_example.rs (new file)

@@ -0,0 +1,237 @@
use firecrawl::{extract::ExtractParams, FirecrawlApp};
use serde_json::json;
use std::error::Error;

use clap::{Parser, ValueEnum};

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Args {
    #[arg(value_enum)]
    command: Examples,
}

#[derive(Copy, Clone, PartialEq, Eq, ValueEnum)]
enum Examples {
    Basic,
    Schema,
    JsonSchema,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let args = Args::parse();

    let api_url = std::env::var("FIRECRAWL_API_URL")
        .expect("Please set the FIRECRAWL_API_URL environment variable");
    let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?;
    let urls = vec![
        "https://www.firecrawl.dev/".to_string(),
        "https://betteruptime.com".to_string(),
    ];

    match args.command {
        Examples::Basic => {
            println!("Example 1: Extracting with URLs and prompt");

            let extract_params = ExtractParams {
                prompt: Some(
                    "Extract Product promise, consice descirption and category".to_string(),
                ),
                url_trace: Some(true),
                ..Default::default()
            };

            println!("Starting asynchronous extraction job...");
            let response = firecrawl
                .async_extract(ExtractParams {
                    urls: Some(urls.iter().map(|u| u.to_string()).collect()),
                    prompt: extract_params.prompt.clone(),
                    url_trace: extract_params.url_trace,
                    ..Default::default()
                })
                .await?;

            println!("Extract job initiated:");
            println!(" Job ID: {}", response.id);

            println!("\nChecking extract status...");
            for _ in 0..5 {
                let response = firecrawl.get_extract_status(&response.id).await?;

                println!("Extract status: {}", response.status);
                if let Some(url_trace) = &response.url_trace {
                    println!("URL traces:");
                    for trace in url_trace {
                        println!(" URL: {}", trace.url);
                        println!(" Status: {}", trace.status);
                    }
                }
                println!("Extract data: {:#?}", response.data);
                if response.status == "completed" {
                    break;
                }

                tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
            }
        }
        Examples::Schema => {
            println!("Example 2: Extracting with schema");

            let schema = json!({
                "type": "object",
                "properties": {
                    "category": { "type": "string" },
                    "promise": { "type": "string" },
                    "descirption": { "type": "string" }
                },
                "required": ["category", "promise", "description"]
            });

            println!("Starting synchronous extraction job...");

            match firecrawl
                .extract(ExtractParams {
                    urls: urls.into(),
                    schema: Some(schema),
                    ..Default::default()
                })
                .await
            {
                Ok(result) => {
                    println!("Extraction completed successfully!");
                    println!("Status: {}", result.status);

                    if let Some(data) = result.data {
                        println!("\nExtracted data:");
                        println!(" Title: {}", data["title"]);
                        if let Some(desc) = data.get("description") {
                            println!(" Description: {}", desc);
                        }
                        println!(
                            " Content (preview): {:.100}...",
                            data["content"].as_str().unwrap_or("N/A")
                        );
                    }

                    if let Some(sources) = result.sources {
                        println!("\nSources:");
                        for (field, urls) in sources {
                            println!(" {}: {}", field, urls.join(", "));
                        }
                    }
                }
                Err(e) => {
                    println!("Extraction failed: {}", e);
                }
            }
        }
        Examples::JsonSchema => {
            println!("Example 3: Using JsonSchema derive");

            /// A comprehensive analysis of given product
            #[derive(serde::Serialize, serde::Deserialize, schemars::JsonSchema)]
            struct ProductAnalysis {
                /// The full name of the product
                product_name: String,
                /// The company/brand behind the product
                brand: String,
                /// The general price range (e.g. "Premium", "$10-50", "Enterprise")
                price_range: String,
                /// The main customer segments this product targets
                target_audience: Vec<String>,
                /// Primary benefits and value propositions of the product
                key_benefits: Vec<String>,
                /// Distinctive features that set this product apart from competitors
                unique_selling_points: Vec<String>,
                /// Direct comparisons with competing products/services
                competitor_comparison: Vec<String>,
                /// Technologies, frameworks, or platforms used (if applicable)
                tech_stack: Option<Vec<String>>,
                /// Aggregated review data and sentiment analysis
                reviews_summary: ReviewsSummary,
                // /// Score from 0-10 indicating product-market fit based on analysis
                // market_fit_score: f32, // NOTE: Breaks
                /// Assessment of future growth prospects (e.g. "High", "Moderate", "Limited")
                growth_potential: String,
                /// Relevant compliance standards and certifications
                regulatory_compliance: Option<Vec<String>>,
            }

            /// Aggregated analysis of product reviews from multiple sources
            #[derive(serde::Serialize, serde::Deserialize, schemars::JsonSchema)]
            struct ReviewsSummary {
                /// Overall sentiment from review analysis (e.g. "Highly Positive", "Mixed", "Negative")
                sentiment_analysis: String,
                /// Most frequently mentioned positive aspects
                common_praises: Vec<String>,
                /// Most frequently mentioned criticisms or issues
                common_complaints: Vec<String>,
                /// Platforms or websites where reviews were sourced from
                review_sources: Vec<String>,
            }
            println!("Starting extraction with derived schema...");
            match firecrawl
                .extract_with_schemars::<ProductAnalysis>(ExtractParams {
                    urls: urls.into(),
                    ..Default::default()
                })
                .await
            {
                Ok(result) => {
                    println!("Extraction completed!");
                    println!("Status: {}", result.status);

                    if let Some(data) = result.data {
                        if let Ok(analysis) = serde_json::from_value::<ProductAnalysis>(data) {
                            println!("\nExtracted Product Analysis:");
                            println!(" Product: {}", analysis.product_name);
                            println!(" Brand: {}", analysis.brand);
                            println!(" Price Range: {}", analysis.price_range);
                            println!(" Target Audience:");
                            for audience in analysis.target_audience {
                                println!(" - {}", audience);
                            }
                            println!(" Key Benefits:");
                            for benefit in analysis.key_benefits {
                                println!(" - {}", benefit);
                            }
                            println!(" USPs:");
                            for usp in analysis.unique_selling_points {
                                println!(" - {}", usp);
                            }

                            println!("\n Reviews Summary:");
                            println!(
                                " Sentiment: {}",
                                analysis.reviews_summary.sentiment_analysis
                            );
                            println!(" Common Praises:");
                            for praise in analysis.reviews_summary.common_praises {
                                println!(" - {}", praise);
                            }
                            println!(" Common Complaints:");
                            for complaint in analysis.reviews_summary.common_complaints {
                                println!(" - {}", complaint);
                            }
                        } else {
                            println!("Failed to parse extracted data");
                        }
                    }

                    if let Some(sources) = result.sources {
                        println!("\nSources:");
                        for (field, urls) in sources {
                            println!(" {}: {}", field, urls.join(", "));
                        }
                    }
                }
                Err(e) => {
                    println!("Extraction failed: {}", e);
                }
            }
        }
    }

    Ok(())
}
173 apps/rust-sdk/examples/llmstxt_example.rs (new file)

@@ -0,0 +1,173 @@
#![allow(clippy::option_map_unit_fn)]
use bat::{Input, PrettyPrinter};
use firecrawl::{llmstxt::GenerateLLMsTextParams, FirecrawlApp};
use std::error::Error;

use clap::{Parser, ValueEnum};

#[derive(Copy, Clone, PartialEq, Eq, ValueEnum)]
enum Mode {
    Basic,
    Pool,
    Fulltext,
}

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// URL for which to generate LLMs.txt
    #[arg(default_value = "https://www.firecrawl.dev/")]
    url: String,

    #[arg(long, short = 'm', value_enum, default_value = "Mode::Basic")]
    mode: Mode,

    /// Maximum number of URLs to process
    #[arg(long, short = 'd', default_value = "1")]
    max_urls: u32,

    /// Whether to show the full LLMs-full.txt in the response
    #[arg(long, short = 'f', default_value = "false")]
    full_text: bool,

    /// Experimental streaming option
    #[arg(long, short = 's', default_value = "false")]
    stream: bool,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let args = Args::parse();

    let api_url = std::env::var("FIRECRAWL_API_URL")
        .expect("Please set the FIRECRAWL_API_URL environment variable");
    let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?;

    let params = GenerateLLMsTextParams {
        url: args.url.clone(),
        max_urls: args.max_urls,
        show_full_text: args.full_text,
        experimental_stream: args.stream,
    };

    match args.mode {
        Mode::Basic => {
            println!("Example 1: Basic LLMs.txt generation (synchronous)");
            println!("Generating LLMs.txt for {}...", args.url);
            firecrawl
                .generate_llms_text(params)
                .await
                .inspect(|result| {
                    println!("Expires at: {}", result.expires_at);
                    let text = (if args.full_text {
                        result.data.full.as_ref()
                    } else {
                        result.data.compact.as_ref()
                    })
                    .expect("LLM Text");

                    pretty_print_content("Firecrawl Result", text).expect("Print");
                })?;
        }
        Mode::Pool => {
            println!("Example 2: Asynchronous LLMs.txt generation with manual polling");

            println!("Starting asynchronous LLMs.txt generation job...");
            let response = firecrawl.async_generate_llms_text(params).await?;

            println!("LLMs.txt generation job initiated:");
            println!(" Job ID: {}", response.id);
            println!("\nManually polling for status...");
            for _ in 0..10 {
                let status = firecrawl
                    .check_generate_llms_text_status(&response.id)
                    .await?;

                match status.status.as_str() {
                    "completed" => {
                        println!("LLMs.txt generation completed!");
                        let text = (if args.full_text {
                            status.data.full.as_ref()
                        } else {
                            status.data.compact.as_ref()
                        })
                        .expect("LLM Text");

                        pretty_print_content("Pool Result", text).expect("Print");

                        break;
                    }
                    "failed" => {
                        println!(
                            "LLMs.txt generation failed: {}",
                            status.error.unwrap_or_default()
                        );
                        break;
                    }
                    status => println!("Generation status: {}", status),
                }

                println!("Waiting 2 seconds before checking again...");
                tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
            }
        }
        Mode::Fulltext => {
            println!("Example 3: LLMs.txt generation with full text");

            println!("Generating LLMs.txt with full text...");
            match firecrawl.generate_llms_text(params).await {
                Ok(result) => {
                    println!("LLMs.txt generation completed successfully!");
                    let llmstxt = result.data.compact.expect("LLMs Text Expected");
                    let fulltxt = result.data.full.expect("Full LLMs Text Expected");

                    pretty_print_contents(&[
                        ("LLMs.txt (compact)", llmstxt),
                        ("LLMs.txt (full text)", fulltxt),
                    ])
                    .expect("Print")
                }
                Err(e) => {
                    println!("LLMs.txt generation failed: {}", e);
                }
            }
        }
    }

    Ok(())
}

/// Pretty prints the provided content with syntax highlighting
fn pretty_print_content(title: &str, content: &str) -> Result<(), Box<dyn Error>> {
    PrettyPrinter::new()
        .header(true)
        .grid(true)
        .input(
            Input::from_bytes(content.as_bytes())
                .title(title)
                .name("file.md"),
        )
        .print()?;

    Ok(())
}

/// Pretty prints multiple contents with syntax highlighting
fn pretty_print_contents(title_contents: &[(&'static str, String)]) -> Result<(), Box<dyn Error>> {
    let mut inputs = Vec::new();
    for (title, content) in title_contents {
        inputs.push(
            Input::from_bytes(content.as_bytes())
                .title(*title)
                .name("file.md"),
        );
    }

    PrettyPrinter::new()
        .header(true)
        .grid(true)
        .inputs(inputs)
        .print()?;

    Ok(())
}
186 apps/rust-sdk/examples/search_example.rs (new file)

@@ -0,0 +1,186 @@
use clap::{Parser, ValueEnum};
use firecrawl::{
    search::{SearchParams, SearchResponse},
    FirecrawlApp,
};
use std::error::Error;

#[derive(Debug, Parser)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Which example to run
    #[arg(value_enum, default_value_t = Examples::All)]
    example: Examples,
}

#[derive(Debug, Clone, ValueEnum)]
enum Examples {
    All,
    Basic,
    Advanced,
    Geo,
    Temporal,
    Social,
    News,
    Academic,
    Commercial,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let args = Args::parse();

    let api_url = std::env::var("FIRECRAWL_API_URL")
        .expect("Please set the FIRECRAWL_API_URL environment variable");
    let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?;

    match args.example {
        Examples::All => {
            run_basic_example(&firecrawl).await?;
            run_advanced_example(&firecrawl).await?;
            run_geographic_example(&firecrawl).await?;
            run_temporal_example(&firecrawl).await?;
            run_social_example(&firecrawl).await?;
            run_news_example(&firecrawl).await?;
            run_academic_example(&firecrawl).await?;
            run_commercial_example(&firecrawl).await?;
        }
        Examples::Basic => run_basic_example(&firecrawl).await?,
        Examples::Advanced => run_advanced_example(&firecrawl).await?,
        Examples::Geo => run_geographic_example(&firecrawl).await?,
        Examples::Temporal => run_temporal_example(&firecrawl).await?,
        Examples::Social => run_social_example(&firecrawl).await?,
        Examples::News => run_news_example(&firecrawl).await?,
        Examples::Academic => run_academic_example(&firecrawl).await?,
        Examples::Commercial => run_commercial_example(&firecrawl).await?,
    }

    Ok(())
}
async fn run_basic_example(firecrawl: &FirecrawlApp) -> Result<(), Box<dyn Error>> {
    let query = "rust programming language";
    let results = firecrawl.search(query, None).await?;
    print_results("Basic Search", query, &results);
    Ok(())
}

async fn run_advanced_example(firecrawl: &FirecrawlApp) -> Result<(), Box<dyn Error>> {
    let query = "rust web framework site:github.com OR site:gitlab.com";
    let params = SearchParams {
        query: query.to_string(),
        limit: Some(5),
        ..Default::default()
    };
    let results = firecrawl.search_with_params(params).await?;
    print_results("Advanced Repository Search", query, &results);
    Ok(())
}

async fn run_geographic_example(firecrawl: &FirecrawlApp) -> Result<(), Box<dyn Error>> {
    let query = "coworking space startup hub";
    let params = SearchParams {
        query: query.to_string(),
        // WARN: Doesn't work with searxng
        location: Some("Silicon Valley, CA".to_string()),
        // WARN: Doesn't work with searxng
        country: Some("us".to_string()),
        limit: Some(5),
        ..Default::default()
    };
    let results = firecrawl.search_with_params(params).await?;
    print_results("Geographic-Specific Search", query, &results);
    Ok(())
}

async fn run_temporal_example(firecrawl: &FirecrawlApp) -> Result<(), Box<dyn Error>> {
    let query = "artificial intelligence breakthroughs";
    let params = SearchParams {
        query: query.to_string(),
        // WARN: Doesn't work with searxng
        tbs: Some("qdr:m1".to_string()),
        limit: Some(5),
        ..Default::default()
    };
    let results = firecrawl.search_with_params(params).await?;
    print_results("Recent AI News", query, &results);
    Ok(())
}

async fn run_social_example(firecrawl: &FirecrawlApp) -> Result<(), Box<dyn Error>> {
    let query = "viral tech trends site:twitter.com";
    let params = SearchParams {
        query: query.to_string(),
        // WARN: Doesn't work. Maybe searxng related
        filter: Some("site:twitter.com OR site:linkedin.com".to_string()),
        // WARN: Doesn't work with searxng
        tbs: Some("qdr:w".to_string()), // Last week
        limit: Some(5),
        ..Default::default()
    };
    let results = firecrawl.search_with_params(params).await?;
    print_results("Social Media Tech Trends", query, &results);
    Ok(())
}

async fn run_news_example(firecrawl: &FirecrawlApp) -> Result<(), Box<dyn Error>> {
    let query =
        "cryptocurrency market analysis site:reuters.com OR site:bloomberg.com OR site:ft.com";
    let params = SearchParams {
        query: query.to_string(),
        // WARN: Doesn't work with searxng
        tbs: Some("qdr:d".to_string()), // Last 24 hours
        limit: Some(5),
        ..Default::default()
    };
    let results = firecrawl.search_with_params(params).await?;
    print_results("Financial News Search", query, &results);
    Ok(())
}

async fn run_academic_example(firecrawl: &FirecrawlApp) -> Result<(), Box<dyn Error>> {
    let query = "quantum computing research papers site:arxiv.org OR site:scholar.google.com";
    let params = SearchParams {
        query: query.to_string(),
        // WARN: Doesn't work. Maybe searxng related
        // filter: Some("site:arxiv.org OR site:scholar.google.com".to_string()),
        // WARN: Doesn't work with searxng
        tbs: Some("qdr:y".to_string()), // Last year
        limit: Some(5),
        ..Default::default()
    };
    let results = firecrawl.search_with_params(params).await?;
    print_results("Academic Research Search", query, &results);
    Ok(())
}

async fn run_commercial_example(firecrawl: &FirecrawlApp) -> Result<(), Box<dyn Error>> {
    let query = "enterprise cloud solutions reviews site:g2.com";
    let params = SearchParams {
        query: query.to_string(),
        limit: Some(5),
        ..Default::default()
    };
    let results = firecrawl.search_with_params(params).await?;
    print_results("Commercial Product Search", query, &results);
    Ok(())
}

fn print_results(name: &str, query: &str, results: &SearchResponse) {
    let sec = "=".repeat(70);

    println!("\n{sec}");
    println!("🔍 {name}");
    println!("🔎 Query: \"{query}\"");
    println!("{sec}");

    for (i, doc) in results.data.iter().enumerate() {
        println!("{}. 📌 Title: {}", i + 1, doc.title);
        println!(" - 🔗 URL: {}", doc.url);
        println!(" - 📝 Description: \"{:.40}\"...", doc.description);
    }

    if let Some(warning) = &results.warning {
        println!("\n⚠️ Warning: {warning}");
    }
    println!("{sec}\n");
}
494 apps/rust-sdk/src/batch_scrape.rs (new file)

@@ -0,0 +1,494 @@
use std::collections::HashMap;

use serde::{Deserialize, Serialize};

use crate::{
    crawl::{CrawlErrorsResponse, CrawlStatus, CrawlStatusTypes},
    scrape::ScrapeOptions,
    FirecrawlApp, FirecrawlError, API_VERSION,
};

#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeParams {
    /// List of URLs to scrape
    pub urls: Vec<String>,
    /// Scrape options to apply to all URLs
    #[serde(flatten)]
    pub options: Option<ScrapeOptions>,
    /// Whether to ignore invalid URLs
    #[serde(rename = "ignoreInvalidURLs")]
    pub ignore_invalid_urls: bool,
    /// ID of an existing job to append these URLs to
    pub append_to_id: Option<String>,
    /// Webhook configuration
    pub webhook: Option<WebhookOptions>,

    /// Idempotency key to send to the crawl endpoint.
    #[serde(skip)]
    pub idempotency_key: Option<String>,
}

/// Options for webhook notifications
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
#[serde(rename_all = "camelCase")]
pub struct WebhookOptions {
    /// URL to send webhook notifications to
    pub url: String,
    /// Custom headers to include in webhook requests
    pub headers: Option<HashMap<String, String>>,
    /// Authentication token for the webhook
    pub auth_token: Option<String>,
}

impl From<&str> for WebhookOptions {
    fn from(url: &str) -> Self {
        Self {
            url: url.to_string(),
            headers: None,
            auth_token: None,
        }
    }
}

/// Response from initiating a batch scrape job
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct BatchScrapeResponse {
    /// Whether the request was successful
    pub success: bool,
    /// The ID of the batch scrape job
    pub id: String,
    /// URL to get the status of the batch scrape job
    pub url: String,
    /// List of URLs that were invalid and could not be processed
    pub invalid_urls: Option<Vec<String>>,
}

impl From<String> for WebhookOptions {
    fn from(url: String) -> Self {
        Self {
            url,
            headers: None,
            auth_token: None,
        }
    }
}

impl FirecrawlApp {
    /// Initiates an asynchronous batch scrape job
    pub async fn async_batch_scrape_urls(
        &self,
        params: BatchScrapeParams,
    ) -> Result<BatchScrapeResponse, FirecrawlError> {
        let headers = self.prepare_headers(params.idempotency_key.as_ref());

        let response = self
            .client
            .post(format!("{}{}/batch/scrape", self.api_url, API_VERSION))
            .headers(headers)
            .json(&params)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpError("Initiating batch scrape job".to_string(), e))?;

        self.handle_response(response, "initiate batch scrape job")
            .await
    }

    /// Initiates a batch scrape job and waits for completion
    pub async fn batch_scrape_urls(
        &self,
        params: BatchScrapeParams,
        poll_interval: Option<u64>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let poll_interval_ms = poll_interval.unwrap_or(2000);

        let response = self.async_batch_scrape_urls(params).await?;

        self.monitor_batch_job_status(&response.id, poll_interval_ms)
            .await
    }

    /// Checks the status of a batch scrape job
    pub async fn check_batch_scrape_status(
        &self,
        id: impl AsRef<str>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(format!(
                "{}{}/batch/scrape/{}",
                self.api_url,
                API_VERSION,
                id.as_ref()
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(
                    format!("Checking status of batch scrape {}", id.as_ref()),
                    e,
                )
            })?;

        let mut status: CrawlStatus = self
            .handle_response(
                response,
                format!("Checking status of batch scrape {}", id.as_ref()),
            )
            .await?;

        if status.status == CrawlStatusTypes::Completed {
            while let Some(next) = status.next.clone() {
                let new_status = self.check_batch_scrape_status_next(next).await?;
                status.data.extend_from_slice(&new_status.data);
                status.next = new_status.next;
            }
        }

        Ok(status)
    }

    /// Helper function to paginate through batch scrape status results
    async fn check_batch_scrape_status_next(
        &self,
        next: impl AsRef<str>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(next.as_ref())
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(
                    format!("Paginating batch scrape using URL {:?}", next.as_ref()),
                    e,
                )
            })?;

        self.handle_response(
            response,
            format!("Paginating batch scrape using URL {:?}", next.as_ref()),
        )
        .await
    }

    /// Check for errors in a batch scrape job
    pub async fn check_batch_scrape_errors(
        &self,
        id: impl AsRef<str>,
    ) -> Result<CrawlErrorsResponse, FirecrawlError> {
        let response = self
            .client
            .get(format!(
                "{}{}/batch/scrape/{}/errors",
                self.api_url,
                API_VERSION,
                id.as_ref()
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(
                    format!("Checking errors for batch scrape {}", id.as_ref()),
                    e,
                )
            })?;

        self.handle_response(
            response,
            format!("Checking errors for batch scrape {}", id.as_ref()),
        )
        .await
    }

    /// Helper function to poll for batch job status until completion
    async fn monitor_batch_job_status(
        &self,
        id: &str,
        poll_interval: u64,
    ) -> Result<CrawlStatus, FirecrawlError> {
        loop {
            let status_data = self.check_batch_scrape_status(id).await?;
            match status_data.status {
                CrawlStatusTypes::Completed => {
                    break Ok(status_data);
                }
                CrawlStatusTypes::Scraping => {
                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                }
                CrawlStatusTypes::Failed => {
                    break Err(FirecrawlError::CrawlJobFailed(
                        "Batch scrape job failed".into(),
                        status_data,
                    ));
                }
                CrawlStatusTypes::Cancelled => {
                    break Err(FirecrawlError::CrawlJobFailed(
                        "Batch scrape job was cancelled".into(),
                        status_data,
                    ));
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    #[tokio::test]
    #[ignore = "Makes real network request"]
    async fn test_real_batch_scrape() {
        let api_url = std::env::var("FIRECRAWL_API_URL")
            .expect("Please set the FIRECRAWL_API_URL environment variable");
        let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();

        // Start a batch scrape job
        let params = BatchScrapeParams {
            urls: vec![
                "https://example.com".to_string(),
                "https://example.org".to_string(),
            ],
            ignore_invalid_urls: true,
            ..Default::default()
        };

        let response = app.async_batch_scrape_urls(params).await.unwrap();

        assert!(response.success);
        assert!(!response.id.is_empty());
        assert!(!response.url.is_empty());
    }

    #[tokio::test]
    async fn test_async_batch_scrape_with_mock() {
        let mut server = mockito::Server::new_async().await;

        // Set up the mock
        let mock = server
            .mock("POST", "/v1/batch/scrape")
            // Remove the match_body expectation which might be causing issues
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": true,
                    "id": "batch-123",
                    "url": "https://api.example.com/v1/batch/batch-123",
                    "invalidUrls": []
                })
                .to_string(),
            )
            .create();

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();

        let params = BatchScrapeParams {
            urls: vec![
                "https://example.com".to_string(),
                "https://example.org".to_string(),
            ],
            ignore_invalid_urls: true,
            ..Default::default()
        };

        let response = app.async_batch_scrape_urls(params).await.unwrap();

        assert!(response.success);
        assert_eq!(response.id, "batch-123");
        assert_eq!(response.url, "https://api.example.com/v1/batch/batch-123");
        assert!(response.invalid_urls.unwrap_or_default().is_empty());
        mock.assert();
    }

    #[tokio::test]
    async fn test_batch_scrape_with_webhook() {
        let mut server = mockito::Server::new_async().await;

        let mock = server
            .mock("POST", "/v1/batch/scrape")
            // Remove the match_body expectation to simplify
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": true,
                    "id": "batch-123",
                    "url": "https://api.example.com/v1/batch/batch-123"
                })
                .to_string(),
            )
            .create();

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();

        let params = BatchScrapeParams {
            urls: vec!["https://example.com".to_string()],
            webhook: Some("https://webhook.example.com/notify".into()),
            ..Default::default()
        };

        let response = app.async_batch_scrape_urls(params).await.unwrap();

        assert!(response.success);
        assert_eq!(response.id, "batch-123");
        mock.assert();
    }

    #[tokio::test]
    async fn test_check_batch_scrape_status_with_mock() {
        let mut server = mockito::Server::new_async().await;

        let mock = server
            .mock("GET", "/v1/batch/scrape/batch-123")
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": true,
                    "status": "completed",
                    "total": 2,
                    "completed": 2,
                    "creditsUsed": 2,
                    "expiresAt": "2023-12-31T23:59:59Z",
                    "data": [
                        {
                            "metadata": {
                                "sourceURL": "https://example.com",
                                "statusCode": 200
                            },
                            "markdown": "Example Domain content"
                        },
                        {
                            "metadata": {
                                "sourceURL": "https://example.org",
                                "statusCode": 200
                            },
                            "markdown": "Another example content"
                        }
                    ]
                })
                .to_string(),
            )
            .create();

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
        let status = app.check_batch_scrape_status("batch-123").await.unwrap();

        assert_eq!(status.total, 2);
        assert_eq!(status.completed, 2);
        assert_eq!(status.data.len(), 2);
        assert_eq!(status.data[0].metadata.source_url, "https://example.com");
        assert_eq!(status.data[1].metadata.source_url, "https://example.org");
        mock.assert();
    }

    #[tokio::test]
    async fn test_check_batch_scrape_errors_with_mock() {
        let mut server = mockito::Server::new_async().await;

        let mock = server
            .mock("GET", "/v1/batch/scrape/batch-123/errors")
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": true,
                    "errors": [
                        {
                            "id": "error1",
                            "timestamp": "2023-01-01T00:00:00Z",
                            "url": "https://invalid.example.com",
                            "error": "Failed to load page"
                        }
                    ],
                    "robotsBlocked": [
                        "https://example.com/admin"
                    ]
                })
                .to_string(),
            )
            .create_async()
            .await;

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
        let errors = app.check_batch_scrape_errors("batch-123").await.unwrap();

        assert_eq!(errors.errors.len(), 1);
        assert_eq!(errors.errors[0].url, "https://invalid.example.com");
        assert_eq!(errors.robots_blocked.len(), 1);
        assert_eq!(errors.robots_blocked[0], "https://example.com/admin");
        mock.assert();
    }

    #[tokio::test]
    async fn test_batch_scrape_with_invalid_urls() {
        let mut server = mockito::Server::new_async().await;

        let mock = server
            .mock("POST", "/v1/batch/scrape")
            // Remove the match_body expectation
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": true,
                    "id": "batch-123",
                    "url": "https://api.example.com/v1/batch/batch-123",
                    "invalidUrls": ["invalid-url"]
                })
                .to_string(),
            )
            .create();

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();

        let params = BatchScrapeParams {
            urls: vec!["https://example.com".to_string(), "invalid-url".to_string()],
            ignore_invalid_urls: true,
            ..Default::default()
        };

        let response = app.async_batch_scrape_urls(params).await.unwrap();

        assert!(response.success);
        assert_eq!(response.id, "batch-123");
        assert_eq!(response.invalid_urls, Some(vec!["invalid-url".to_string()]));
        mock.assert();
    }

    #[tokio::test]
    async fn test_batch_scrape_error_response() {
        let mut server = mockito::Server::new_async().await;

        let mock = server
            .mock("POST", "/v1/batch/scrape")
            .with_status(400)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": false,
                    "error": "No valid URLs provided"
                })
                .to_string(),
            )
            .create();

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();

        let params = BatchScrapeParams::default();
        let result = app.async_batch_scrape_urls(params).await;

        assert!(result.is_err());
        mock.assert();
    }
}
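
The examples in this commit only call the asynchronous entry point; the module above also adds a blocking variant, batch_scrape_urls, that polls the job until it finishes. A minimal sketch, assuming the same FIRECRAWL_API_URL convention the examples use:

use firecrawl::{batch_scrape::BatchScrapeParams, FirecrawlApp};
use std::error::Error;

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let api_url = std::env::var("FIRECRAWL_API_URL")
        .expect("Please set the FIRECRAWL_API_URL environment variable");
    let firecrawl = FirecrawlApp::new_selfhosted(api_url, None::<&str>)?;

    let params = BatchScrapeParams {
        urls: vec!["https://example.com".to_string()],
        ..Default::default()
    };

    // batch_scrape_urls polls check_batch_scrape_status (every 2s by default) until the
    // job completes, fails, or is cancelled, and returns the final CrawlStatus.
    let status = firecrawl.batch_scrape_urls(params, None).await?;
    println!("Scraped {} of {} pages", status.completed, status.total);
    Ok(())
}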
@ -2,7 +2,11 @@ use std::collections::HashMap;
|
|||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::{document::Document, scrape::{ScrapeFormats, ScrapeOptions}, FirecrawlApp, FirecrawlError, API_VERSION};
|
use crate::{
|
||||||
|
document::Document,
|
||||||
|
scrape::{ScrapeFormats, ScrapeOptions},
|
||||||
|
FirecrawlApp, FirecrawlError, API_VERSION,
|
||||||
|
};
|
||||||
|
|
||||||
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
|
#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
|
||||||
pub enum CrawlScrapeFormats {
|
pub enum CrawlScrapeFormats {
|
||||||
@ -81,7 +85,9 @@ pub struct CrawlScrapeOptions {
|
|||||||
impl From<CrawlScrapeOptions> for ScrapeOptions {
|
impl From<CrawlScrapeOptions> for ScrapeOptions {
|
||||||
fn from(value: CrawlScrapeOptions) -> Self {
|
fn from(value: CrawlScrapeOptions) -> Self {
|
||||||
ScrapeOptions {
|
ScrapeOptions {
|
||||||
formats: value.formats.map(|formats| formats.into_iter().map(|x| x.into()).collect()),
|
formats: value
|
||||||
|
.formats
|
||||||
|
.map(|formats| formats.into_iter().map(|x| x.into()).collect()),
|
||||||
only_main_content: value.only_main_content,
|
only_main_content: value.only_main_content,
|
||||||
include_tags: value.include_tags,
|
include_tags: value.include_tags,
|
||||||
exclude_tags: value.exclude_tags,
|
exclude_tags: value.exclude_tags,
|
||||||
@ -200,6 +206,29 @@ pub struct CrawlStatus {
    pub data: Vec<Document>,
}

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlError {
    pub id: String,
    pub timestamp: Option<String>,
    pub url: String,
    pub error: String,
}

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlErrorsResponse {
    pub errors: Vec<CrawlError>,
    #[serde(rename = "robotsBlocked")]
    pub robots_blocked: Vec<String>,
}

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CancelCrawlResponse {
    pub status: String,
}

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct CrawlAsyncResponse {
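
As a quick illustration (not part of the diff), the new response types map onto the API's camelCase JSON like this; the payload below is a made-up example shaped like the mock used in the tests further down:

use firecrawl::crawl::CrawlErrorsResponse;

fn parse_errors_payload() -> serde_json::Result<()> {
    let body = r#"{
        "errors": [
            { "id": "err-1", "timestamp": "2024-01-01T00:00:00Z",
              "url": "https://example.com/broken", "error": "Failed to load page" }
        ],
        "robotsBlocked": ["https://example.com/private"]
    }"#;
    // robots_blocked picks up the "robotsBlocked" key via the serde rename above.
    let parsed: CrawlErrorsResponse = serde_json::from_str(body)?;
    assert_eq!(parsed.errors.len(), 1);
    assert_eq!(parsed.robots_blocked[0], "https://example.com/private");
    Ok(())
}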
@ -228,14 +257,15 @@ impl FirecrawlApp {
        let response = self
            .client
            .post(&format!("{}{}/crawl", self.api_url, API_VERSION))
            .post(format!("{}{}/crawl", self.api_url, API_VERSION))
            .headers(headers.clone())
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpError(format!("Crawling {:?}", url.as_ref()), e))?;

        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job").await
        self.handle_response::<CrawlAsyncResponse>(response, "start crawl job")
            .await
    }

    /// Performs a crawl job for a URL using the Firecrawl API, waiting for the end result. This may take a long time depending on the size of the target page and your options (namely `CrawlOptions.limit`).
@ -245,38 +275,65 @@ impl FirecrawlApp {
        options: impl Into<Option<CrawlOptions>>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let options = options.into();
        let poll_interval = options.as_ref().and_then(|x| x.poll_interval).unwrap_or(2000);
        let poll_interval = options
            .as_ref()
            .and_then(|x| x.poll_interval)
            .unwrap_or(2000);
        let res = self.crawl_url_async(url, options).await?;

        self.monitor_job_status(&res.id, poll_interval).await
    }

    async fn check_crawl_status_next(&self, next: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
    async fn check_crawl_status_next(
        &self,
        next: impl AsRef<str>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(next.as_ref())
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpError(format!("Paginating crawl using URL {:?}", next.as_ref()), e))?;
            .map_err(|e| {
                FirecrawlError::HttpError(
                    format!("Paginating crawl using URL {:?}", next.as_ref()),
                    e,
                )
            })?;

        self.handle_response(response, format!("Paginating crawl using URL {:?}", next.as_ref())).await
        self.handle_response(
            response,
            format!("Paginating crawl using URL {:?}", next.as_ref()),
        )
        .await
    }

    /// Checks for the status of a crawl, based on the crawl's ID. To be used in conjunction with `FirecrawlApp::crawl_url_async`.
    pub async fn check_crawl_status(&self, id: impl AsRef<str>) -> Result<CrawlStatus, FirecrawlError> {
    pub async fn check_crawl_status(
        &self,
        id: impl AsRef<str>,
    ) -> Result<CrawlStatus, FirecrawlError> {
        let response = self
            .client
            .get(&format!(
            .get(format!(
                "{}{}/crawl/{}",
                self.api_url, API_VERSION, id.as_ref()
                self.api_url,
                API_VERSION,
                id.as_ref()
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e))?;
            .map_err(|e| {
                FirecrawlError::HttpError(format!("Checking status of crawl {}", id.as_ref()), e)
            })?;

        let mut status: CrawlStatus = self.handle_response(response, format!("Checking status of crawl {}", id.as_ref())).await?;
        let mut status: CrawlStatus = self
            .handle_response(
                response,
                format!("Checking status of crawl {}", id.as_ref()),
            )
            .await?;

        if status.status == CrawlStatusTypes::Completed {
            while let Some(next) = status.next {
@ -304,16 +361,240 @@ impl FirecrawlApp {
                    tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
                }
                CrawlStatusTypes::Failed => {
                    break Err(FirecrawlError::CrawlJobFailed(format!(
                    break Err(FirecrawlError::CrawlJobFailed(
                        "Crawl job failed."
                        "Crawl job failed".into(),
                    ), status_data));
                        status_data,
                    ));
                }
                CrawlStatusTypes::Cancelled => {
                    break Err(FirecrawlError::CrawlJobFailed(format!(
                    break Err(FirecrawlError::CrawlJobFailed(
                        "Crawl job was cancelled."
                        "Crawl job was cancelled.".into(),
                    ), status_data));
                        status_data,
                    ));
                }
            }
        }
    }

    /// Cancel an asynchronous crawl job using the Firecrawl API.
    ///
    /// # Returns
    ///
    /// A response indicating whether the cancellation was successful, or a FirecrawlError if the request fails.
    pub async fn cancel_crawl(
        &self,
        id: impl AsRef<str>,
    ) -> Result<CancelCrawlResponse, FirecrawlError> {
        let response = self
            .client
            .delete(format!(
                "{}{}/crawl/{}",
                self.api_url,
                API_VERSION,
                id.as_ref()
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(format!("Cancelling crawl {}", id.as_ref()), e)
            })?;

        self.handle_response(response, "crawl_cancel").await
    }

    /// Returns information about crawl errors.
    ///
    /// # Returns
    ///
    /// A response containing information about crawl errors, or a FirecrawlError if the request fails.
    pub async fn check_crawl_errors(
        &self,
        id: impl AsRef<str>,
    ) -> Result<CrawlErrorsResponse, FirecrawlError> {
        let response = self
            .client
            .get(format!(
                "{}{}/crawl/{}/errors",
                self.api_url,
                API_VERSION,
                id.as_ref()
            ))
            .headers(self.prepare_headers(None))
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(format!("Checking errors for crawl {}", id.as_ref()), e)
            })?;

        self.handle_response(response, "crawl_check").await
    }
}
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use serde_json::json;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore = "Makes real network request"]
|
||||||
|
async fn test_real_cancel_crawl() {
|
||||||
|
let api_url = std::env::var("FIRECRAWL_API_URL")
|
||||||
|
.expect("Please set the FIRECRAWL_API_URL environment variable");
|
||||||
|
let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();
|
||||||
|
|
||||||
|
// First start a crawl job
|
||||||
|
let crawl_response = app
|
||||||
|
.crawl_url_async("https://example.com", None)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Then cancel it
|
||||||
|
let cancel_response = app.cancel_crawl(crawl_response.id).await.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(cancel_response.status, "cancelled");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_cancel_crawl_with_mock() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for the cancel request
|
||||||
|
let mock = server
|
||||||
|
.mock("DELETE", "/v1/crawl/test-crawl-id")
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": null,
|
||||||
|
"status": "cancelled"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
let response = app.cancel_crawl("test-crawl-id").await.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(response.status, "cancelled");
|
||||||
|
mock.assert();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_cancel_crawl_error_response() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for an error response
|
||||||
|
let mock = server
|
||||||
|
.mock("DELETE", "/v1/crawl/invalid-id")
|
||||||
|
.with_status(404)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": false,
|
||||||
|
"error": "Crawl job not found"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
let result = app.cancel_crawl("invalid-id").await;
|
||||||
|
|
||||||
|
assert!(result.is_err());
|
||||||
|
mock.assert();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore = "Makes real network request"]
|
||||||
|
async fn test_real_check_crawl_errors() {
|
||||||
|
let api_url = std::env::var("FIRECRAWL_API_URL")
|
||||||
|
.expect("Please set the FIRECRAWL_API_URL environment variable");
|
||||||
|
let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();
|
||||||
|
|
||||||
|
// First start a crawl job
|
||||||
|
let crawl_response = app
|
||||||
|
.crawl_url_async("https://no-wer-agg.invalid", None)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Check for errors
|
||||||
|
let errors_response = app.check_crawl_errors(crawl_response.id).await.unwrap();
|
||||||
|
println!("{errors_response:?}");
|
||||||
|
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
|
||||||
|
|
||||||
|
assert!(
|
||||||
|
!errors_response.errors.is_empty(),
|
||||||
|
"WARN: Error returned related to Supabase not in my environment. It may fail"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_check_crawl_errors_with_mock() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for the check errors request
|
||||||
|
let mock = server
|
||||||
|
.mock("GET", "/v1/crawl/test-crawl-id/errors")
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": true,
|
||||||
|
"errors": [
|
||||||
|
{
|
||||||
|
"id": "error1",
|
||||||
|
"timestamp": "2023-01-01T00:00:00Z",
|
||||||
|
"url": "https://example.com/error-page",
|
||||||
|
"error": "Failed to load page"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"robotsBlocked": [
|
||||||
|
"https://example.com/blocked-by-robots"
|
||||||
|
]
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
let response = app.check_crawl_errors("test-crawl-id").await.unwrap();
|
||||||
|
|
||||||
|
assert_eq!(response.errors.len(), 1);
|
||||||
|
assert_eq!(response.errors[0].id, "error1");
|
||||||
|
assert_eq!(response.errors[0].url, "https://example.com/error-page");
|
||||||
|
assert_eq!(response.errors[0].error, "Failed to load page");
|
||||||
|
assert_eq!(response.robots_blocked.len(), 1);
|
||||||
|
assert_eq!(
|
||||||
|
response.robots_blocked[0],
|
||||||
|
"https://example.com/blocked-by-robots"
|
||||||
|
);
|
||||||
|
mock.assert();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_check_crawl_errors_error_response() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for an error response
|
||||||
|
let mock = server
|
||||||
|
.mock("GET", "/v1/crawl/invalid-id/errors")
|
||||||
|
.with_status(404)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": false,
|
||||||
|
"error": "Crawl job not found"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
let result = app.check_crawl_errors("invalid-id").await;
|
||||||
|
|
||||||
|
        assert!(result.is_err());
        mock.assert();
    }
}
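
Taken together, the crawl management surface can be driven like this; a minimal sketch that combines the pre-existing `crawl_url_async` job handle with the new `cancel_crawl` and `check_crawl_errors` methods (the target URL is a placeholder):

use firecrawl::FirecrawlApp;

async fn manage_crawl(app: &FirecrawlApp) -> Result<(), firecrawl::FirecrawlError> {
    // Start a crawl without waiting for it to finish.
    let job = app.crawl_url_async("https://example.com", None).await?;

    // Inspect failed URLs and robots.txt blocks while the job runs.
    let errors = app.check_crawl_errors(&job.id).await?;
    println!(
        "{} page errors, {} robots-blocked URLs",
        errors.errors.len(),
        errors.robots_blocked.len()
    );

    // Abort the job; the API reports "cancelled" on success.
    let cancelled = app.cancel_crawl(&job.id).await?;
    println!("status: {}", cancelled.status);
    Ok(())
}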
@ -83,4 +83,3 @@ pub struct Document {
    /// The warning message will contain any errors encountered during the extraction.
    pub warning: Option<String>,
}
@ -42,4 +42,6 @@ pub enum FirecrawlError {
    APIError(String, FirecrawlAPIError),
    #[error("Crawl job failed: {0}")]
    CrawlJobFailed(String, CrawlStatus),
    #[error("Missuse: {0}")]
    Missuse(String),
}
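
One practical consequence of `CrawlJobFailed(String, CrawlStatus)` is that the partial crawl status survives the error. A small illustrative sketch (not part of the diff) of matching on that variant:

use firecrawl::{crawl::CrawlStatus, FirecrawlError};

fn report(result: Result<CrawlStatus, FirecrawlError>) {
    match result {
        Ok(status) => println!("crawl finished with {} documents", status.data.len()),
        // CrawlJobFailed carries the last status snapshot alongside the message.
        Err(FirecrawlError::CrawlJobFailed(msg, status)) => {
            eprintln!("crawl failed ({msg}); {} documents were collected", status.data.len());
        }
        Err(other) => eprintln!("request error: {other}"),
    }
}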
596
apps/rust-sdk/src/extract.rs
Normal file
@ -0,0 +1,596 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use schemars::schema_for;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::Value;
|
||||||
|
|
||||||
|
use crate::{FirecrawlApp, FirecrawlError, API_VERSION};
|
||||||
|
|
||||||
|
/// Parameters for extract requests
|
||||||
|
#[serde_with::skip_serializing_none]
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Default, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct ExtractParams {
|
||||||
|
/// URLs to extract information from
|
||||||
|
pub urls: Option<Vec<String>>,
|
||||||
|
|
||||||
|
/// Extraction prompt
|
||||||
|
pub prompt: Option<String>,
|
||||||
|
|
||||||
|
/// Schema for structured output
|
||||||
|
pub schema: Option<Value>,
|
||||||
|
|
||||||
|
/// System prompt for the LLM
|
||||||
|
pub system_prompt: Option<String>,
|
||||||
|
|
||||||
|
/// Allow following external links
|
||||||
|
pub allow_external_links: Option<bool>,
|
||||||
|
|
||||||
|
/// Enable web search for additional information
|
||||||
|
pub enable_web_search: Option<bool>,
|
||||||
|
|
||||||
|
/// Show sources in the response
|
||||||
|
pub show_sources: Option<bool>,
|
||||||
|
|
||||||
|
/// Origin information, defaults to "api-sdk"
|
||||||
|
pub origin: Option<String>,
|
||||||
|
|
||||||
|
/// Timeout in milliseconds, defaults to 60000
|
||||||
|
pub timeout: Option<u32>,
|
||||||
|
|
||||||
|
/// Whether to include URL trace information, defaults to false
|
||||||
|
pub url_trace: Option<bool>,
|
||||||
|
|
||||||
|
/// Whether to ignore sitemap, defaults to false
|
||||||
|
pub ignore_sitemap: Option<bool>,
|
||||||
|
|
||||||
|
/// Whether to include subdomains, defaults to true
|
||||||
|
pub include_subdomains: Option<bool>,
|
||||||
|
|
||||||
|
/// Maximum number of URLs to process
|
||||||
|
pub limit: Option<u32>,
|
||||||
|
|
||||||
|
/// Experimental: Stream steps information
|
||||||
|
#[serde(rename = "__experimental_streamSteps")]
|
||||||
|
pub experimental_stream_steps: Option<bool>,
|
||||||
|
|
||||||
|
/// Experimental: Include LLM usage information
|
||||||
|
#[serde(rename = "__experimental_llmUsage")]
|
||||||
|
pub experimental_llm_usage: Option<bool>,
|
||||||
|
|
||||||
|
/// Experimental: Show sources information
|
||||||
|
#[serde(rename = "__experimental_showSources")]
|
||||||
|
pub experimental_show_sources: Option<bool>,
|
||||||
|
|
||||||
|
/// Experimental: Cache key
|
||||||
|
#[serde(rename = "__experimental_cacheKey")]
|
||||||
|
pub experimental_cache_key: Option<String>,
|
||||||
|
|
||||||
|
/// Experimental: Cache mode, defaults to "direct"
|
||||||
|
#[serde(rename = "__experimental_cacheMode")]
|
||||||
|
pub experimental_cache_mode: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Response from initiating an extract operation
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct ExtractResponse {
|
||||||
|
/// Whether the request was successful
|
||||||
|
pub success: bool,
|
||||||
|
|
||||||
|
/// The ID of the extract job
|
||||||
|
pub id: String,
|
||||||
|
|
||||||
|
/// URL trace information if requested
|
||||||
|
pub url_trace: Option<Vec<URLTrace>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Information about URL processing during extraction
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct URLTrace {
|
||||||
|
/// The URL being processed
|
||||||
|
pub url: String,
|
||||||
|
|
||||||
|
/// Status of processing this URL
|
||||||
|
pub status: String,
|
||||||
|
|
||||||
|
/// Timing information for URL processing
|
||||||
|
pub timing: URLTraceTiming,
|
||||||
|
|
||||||
|
/// Error message if processing failed
|
||||||
|
pub error: Option<String>,
|
||||||
|
|
||||||
|
/// Warning message if there were issues
|
||||||
|
pub warning: Option<String>,
|
||||||
|
|
||||||
|
/// Content statistics
|
||||||
|
pub content_stats: Option<ContentStats>,
|
||||||
|
|
||||||
|
/// Relevance score for this URL (0-1)
|
||||||
|
pub relevance_score: Option<f64>,
|
||||||
|
|
||||||
|
/// Whether this URL was used in the final completion
|
||||||
|
pub used_in_completion: Option<bool>,
|
||||||
|
|
||||||
|
/// Fields extracted from this URL
|
||||||
|
pub extracted_fields: Option<Vec<String>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Timing information for URL processing
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct URLTraceTiming {
|
||||||
|
/// When the URL was discovered
|
||||||
|
pub discovered_at: String,
|
||||||
|
|
||||||
|
/// When scraping began for this URL
|
||||||
|
pub scraped_at: Option<String>,
|
||||||
|
|
||||||
|
/// When processing was completed for this URL
|
||||||
|
pub completed_at: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Statistics about processed content
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct ContentStats {
|
||||||
|
/// Length of the raw content in characters
|
||||||
|
pub raw_content_length: u32,
|
||||||
|
|
||||||
|
/// Length of the processed content in characters
|
||||||
|
pub processed_content_length: u32,
|
||||||
|
|
||||||
|
/// Number of tokens used for this content
|
||||||
|
pub tokens_used: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Response for extract status check
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct ExtractStatusResponse {
|
||||||
|
/// Whether the request was successful
|
||||||
|
pub success: bool,
|
||||||
|
|
||||||
|
/// Status of the extract job: "pending", "processing", "completed", "failed"
|
||||||
|
pub status: String,
|
||||||
|
|
||||||
|
/// Extracted data, present when status is "completed"
|
||||||
|
pub data: Option<Value>,
|
||||||
|
|
||||||
|
/// Error message if the job failed
|
||||||
|
pub error: Option<String>,
|
||||||
|
|
||||||
|
/// URL trace information if requested
|
||||||
|
pub url_trace: Option<Vec<URLTrace>>,
|
||||||
|
|
||||||
|
/// Sources information if requested
|
||||||
|
pub sources: Option<HashMap<String, Vec<String>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FirecrawlApp {
|
||||||
|
/// Extracts information from URLs using the Firecrawl API.
|
||||||
|
///
|
||||||
|
/// This is the synchronous version that polls until completion.
|
||||||
|
///
|
||||||
|
/// Either `params.prompt` or `params.schema` must be provided.
|
||||||
|
pub async fn extract(
|
||||||
|
&self,
|
||||||
|
params: impl Into<ExtractParams>,
|
||||||
|
) -> Result<ExtractStatusResponse, FirecrawlError> {
|
||||||
|
let mut params = params.into();
|
||||||
|
// Validation: Either prompt or schema must be provided
|
||||||
|
if params.prompt.is_none() && params.schema.is_none() {
|
||||||
|
return Err(FirecrawlError::APIError(
|
||||||
|
"Extract validation".to_string(),
|
||||||
|
crate::error::FirecrawlAPIError {
|
||||||
|
success: false,
|
||||||
|
error: "Either prompt or schema must be provided".to_string(),
|
||||||
|
details: None,
|
||||||
|
},
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set default origin if not provided
|
||||||
|
if params.origin.is_none() {
|
||||||
|
params.origin = Some("api-sdk".to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initiate the extract job asynchronously
|
||||||
|
let response = self.async_extract(params).await?;
|
||||||
|
|
||||||
|
// Poll for the result
|
||||||
|
let poll_interval = 2000; // Default to 2 seconds
|
||||||
|
self.monitor_extract_job_status(&response.id, poll_interval)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn extract_with_schemars<T>(
|
||||||
|
&self,
|
||||||
|
params: impl Into<ExtractParams>,
|
||||||
|
) -> Result<ExtractStatusResponse, FirecrawlError>
|
||||||
|
where
|
||||||
|
T: schemars::JsonSchema,
|
||||||
|
{
|
||||||
|
let mut params = params.into();
|
||||||
|
let schema = schema_for!(T);
|
||||||
|
let schema_json = serde_json::to_value(schema).map_err(|e| {
|
||||||
|
FirecrawlError::APIError(
|
||||||
|
"Schema serialization".to_string(),
|
||||||
|
crate::error::FirecrawlAPIError {
|
||||||
|
success: false,
|
||||||
|
error: e.to_string(),
|
||||||
|
details: None,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
params.schema = Some(schema_json);
|
||||||
|
self.extract(params).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initiates an asynchronous extract operation.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `params` - Parameters for the extract request
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// A response containing the extract job ID, or a FirecrawlError if the request fails.
|
||||||
|
///
|
||||||
|
/// # Notes
|
||||||
|
///
|
||||||
|
/// Either `params.urls` or `params.prompt` must be provided.
|
||||||
|
/// Either `params.prompt` or `params.schema` must be provided.
|
||||||
|
pub async fn async_extract(
|
||||||
|
&self,
|
||||||
|
params: impl Into<ExtractParams>,
|
||||||
|
) -> Result<ExtractResponse, FirecrawlError> {
|
||||||
|
let params = params.into();
|
||||||
|
// Validation: Either URLs or prompt must be provided
|
||||||
|
if params.urls.is_none() && params.prompt.is_none() {
|
||||||
|
return Err(FirecrawlError::APIError(
|
||||||
|
"Extract validation".to_string(),
|
||||||
|
crate::error::FirecrawlAPIError {
|
||||||
|
success: false,
|
||||||
|
error: "Either URLs or prompt must be provided".to_string(),
|
||||||
|
details: None,
|
||||||
|
},
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validation: Either prompt or schema must be provided
|
||||||
|
if params.prompt.is_none() && params.schema.is_none() {
|
||||||
|
return Err(FirecrawlError::APIError(
|
||||||
|
"Extract validation".to_string(),
|
||||||
|
crate::error::FirecrawlAPIError {
|
||||||
|
success: false,
|
||||||
|
error: "Either prompt or schema must be provided".to_string(),
|
||||||
|
details: None,
|
||||||
|
},
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let headers = self.prepare_headers(None);
|
||||||
|
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.post(format!("{}{}/extract", self.api_url, API_VERSION))
|
||||||
|
.headers(headers)
|
||||||
|
.json(¶ms)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| FirecrawlError::HttpError("Initiating extract job".to_string(), e))?;
|
||||||
|
|
||||||
|
self.handle_response(response, "initiate extract job").await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks the status of an extract job.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `id` - The ID of the extract job
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// A response containing the status of the extract job, or a FirecrawlError if the request fails.
|
||||||
|
pub async fn get_extract_status(
|
||||||
|
&self,
|
||||||
|
id: impl AsRef<str>,
|
||||||
|
) -> Result<ExtractStatusResponse, FirecrawlError> {
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.get(format!(
|
||||||
|
"{}{}/extract/{}",
|
||||||
|
self.api_url,
|
||||||
|
API_VERSION,
|
||||||
|
id.as_ref()
|
||||||
|
))
|
||||||
|
.headers(self.prepare_headers(None))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
FirecrawlError::HttpError(format!("Checking status of extract {}", id.as_ref()), e)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
self.handle_response(
|
||||||
|
response,
|
||||||
|
format!("Checking status of extract {}", id.as_ref()),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper function to poll for extract job status until completion
|
||||||
|
async fn monitor_extract_job_status(
|
||||||
|
&self,
|
||||||
|
id: &str,
|
||||||
|
poll_interval: u64,
|
||||||
|
) -> Result<ExtractStatusResponse, FirecrawlError> {
|
||||||
|
loop {
|
||||||
|
let status_data = self.get_extract_status(id).await?;
|
||||||
|
|
||||||
|
match status_data.status.as_str() {
|
||||||
|
"completed" => {
|
||||||
|
break Ok(status_data);
|
||||||
|
}
|
||||||
|
"pending" | "processing" => {
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
|
||||||
|
}
|
||||||
|
"failed" => {
|
||||||
|
let error_msg = status_data
|
||||||
|
.error
|
||||||
|
.clone()
|
||||||
|
.unwrap_or_else(|| "Extract job failed".to_string());
|
||||||
|
break Err(FirecrawlError::APIError(
|
||||||
|
"Extract job failed".to_string(),
|
||||||
|
crate::error::FirecrawlAPIError {
|
||||||
|
success: false,
|
||||||
|
error: error_msg,
|
||||||
|
details: None,
|
||||||
|
},
|
||||||
|
));
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
break Err(FirecrawlError::APIError(
|
||||||
|
"Extract job status".to_string(),
|
||||||
|
crate::error::FirecrawlAPIError {
|
||||||
|
success: false,
|
||||||
|
error: format!("Unexpected status: {}", status_data.status),
|
||||||
|
details: None,
|
||||||
|
},
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use serde_json::json;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore = "Makes real network request"]
|
||||||
|
async fn test_real_extract() {
|
||||||
|
let api_url = std::env::var("FIRECRAWL_API_URL")
|
||||||
|
.expect("Please set the FIRECRAWL_API_URL environment variable");
|
||||||
|
let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();
|
||||||
|
|
||||||
|
// Create extract params
|
||||||
|
let params = ExtractParams {
|
||||||
|
urls: Some(vec!["https://example.com".to_string()]),
|
||||||
|
prompt: Some("Extract the title and main content from this page".to_string()),
|
||||||
|
schema: None,
|
||||||
|
origin: Some("test".to_string()),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Start an extract job
|
||||||
|
let response = app.async_extract(params).await.unwrap();
|
||||||
|
|
||||||
|
assert!(response.success);
|
||||||
|
assert!(!response.id.is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_async_extract_with_mock() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for the extract request
|
||||||
|
let mock = server
|
||||||
|
.mock("POST", "/v1/extract")
|
||||||
|
.match_body(mockito::Matcher::PartialJson(json!({
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"prompt": "Extract the title and main content"
|
||||||
|
})))
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": true,
|
||||||
|
"id": "extract-123",
|
||||||
|
"urlTrace": []
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
|
||||||
|
let params = ExtractParams {
|
||||||
|
urls: Some(vec!["https://example.com".to_string()]),
|
||||||
|
prompt: Some("Extract the title and main content".to_string()),
|
||||||
|
schema: None,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let response = app.async_extract(params).await.unwrap();
|
||||||
|
|
||||||
|
assert!(response.success);
|
||||||
|
assert_eq!(response.id, "extract-123");
|
||||||
|
assert!(response.url_trace.unwrap_or_default().is_empty());
|
||||||
|
mock.assert();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_extract_with_schema() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for the extract request with schema
|
||||||
|
let mock = server
|
||||||
|
.mock("POST", "/v1/extract")
|
||||||
|
.match_body(mockito::Matcher::PartialJson(json!({
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"title": { "type": "string" },
|
||||||
|
"content": { "type": "string" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})))
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": true,
|
||||||
|
"id": "extract-123"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
// Set up the mock for the status request
|
||||||
|
let status_mock = server
|
||||||
|
.mock("GET", "/v1/extract/extract-123")
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": true,
|
||||||
|
"status": "completed",
|
||||||
|
"data": {
|
||||||
|
"title": "Example Domain",
|
||||||
|
"content": "This domain is for use in illustrative examples in documents."
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
|
||||||
|
let urls = Some(vec!["https://example.com".to_string()]);
|
||||||
|
let params = ExtractParams {
|
||||||
|
urls,
|
||||||
|
schema: Some(json!({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"title": { "type": "string" },
|
||||||
|
"content": { "type": "string" }
|
||||||
|
}
|
||||||
|
})),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let response = app.extract(params).await.unwrap();
|
||||||
|
|
||||||
|
assert!(response.success);
|
||||||
|
assert_eq!(response.status, "completed");
|
||||||
|
|
||||||
|
let data = response.data.unwrap();
|
||||||
|
assert_eq!(data["title"], "Example Domain");
|
||||||
|
assert_eq!(
|
||||||
|
data["content"],
|
||||||
|
"This domain is for use in illustrative examples in documents."
|
||||||
|
);
|
||||||
|
|
||||||
|
mock.assert();
|
||||||
|
status_mock.assert();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_extract_status_with_mock() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for the status check
|
||||||
|
let mock = server
|
||||||
|
.mock("GET", "/v1/extract/extract-123")
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": true,
|
||||||
|
"status": "processing",
|
||||||
|
"urlTrace": [
|
||||||
|
{
|
||||||
|
"url": "https://example.com",
|
||||||
|
"status": "scraping",
|
||||||
|
"timing": {
|
||||||
|
"discoveredAt": "2023-01-01T00:00:00Z"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
let status = app.get_extract_status("extract-123").await.unwrap();
|
||||||
|
|
||||||
|
assert!(status.success);
|
||||||
|
assert_eq!(status.status, "processing");
|
||||||
|
assert_eq!(status.url_trace.unwrap()[0].url, "https://example.com");
|
||||||
|
mock.assert();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_extract_validation_errors() {
|
||||||
|
let app = FirecrawlApp::new_selfhosted("https://example.com", Some("test_key")).unwrap();
|
||||||
|
|
||||||
|
// Test missing both URLs and prompt
|
||||||
|
let result = app.async_extract(ExtractParams::default()).await;
|
||||||
|
assert!(result.is_err());
|
||||||
|
|
||||||
|
// Test having URLs but missing both prompt and schema
|
||||||
|
let params = ExtractParams {
|
||||||
|
urls: Some(vec!["https://example.com".to_string()]),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
let result = app.async_extract(params).await;
|
||||||
|
assert!(result.is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_extract_api_error() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for an error response
|
||||||
|
let mock = server
|
||||||
|
.mock("POST", "/v1/extract")
|
||||||
|
.with_status(400)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": false,
|
||||||
|
"error": "Invalid schema format"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
|
||||||
|
let params = ExtractParams {
|
||||||
|
urls: Some(vec!["https://example.com".to_string()]),
|
||||||
|
schema: Some(json!("invalid")), // Invalid schema format
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let result = app.async_extract(params).await;
|
||||||
|
assert!(result.is_err());
|
||||||
|
mock.assert();
|
||||||
|
}
|
||||||
|
}
|
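
As a usage sketch for the extract module above (prompt-driven variant; the schema-driven path and `extract_with_schemars` work the same way), with placeholder prompt and URL values:

use firecrawl::{extract::ExtractParams, FirecrawlApp};

async fn extract_title(app: &FirecrawlApp) -> Result<(), firecrawl::FirecrawlError> {
    let params = ExtractParams {
        urls: Some(vec!["https://example.com".to_string()]),
        prompt: Some("Extract the page title and a one-sentence summary".to_string()),
        ..Default::default()
    };

    // Polls until the job reaches "completed" or "failed".
    let result = app.extract(params).await?;
    println!("status: {}, data: {:?}", result.status, result.data);
    Ok(())
}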
@ -2,14 +2,18 @@ use reqwest::{Client, Response};
use serde::de::DeserializeOwned;
use serde_json::Value;

pub mod batch_scrape;
pub mod crawl;
pub mod document;
mod error;
pub mod extract;
pub mod llmstxt;
pub mod map;
pub mod scrape;
pub mod search;

pub use error::FirecrawlError;
use error::FirecrawlAPIError;
pub use error::FirecrawlError;

#[derive(Clone, Debug)]
pub struct FirecrawlApp {
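
For orientation, the two constructors touched below differ only in the endpoint they target; a brief sketch, assuming the existing `new(api_key)` convenience constructor that delegates to `new_selfhosted` (key and URL are placeholders):

use firecrawl::FirecrawlApp;

fn build_clients() -> Result<(), firecrawl::FirecrawlError> {
    // Hosted API: an API key is mandatory (constructor name assumed from the delegation below).
    let _cloud = FirecrawlApp::new("fc-YOUR-KEY")?;
    // Self-hosted deployment: the key is optional.
    let _local = FirecrawlApp::new_selfhosted("http://localhost:3002", None::<&str>)?;
    Ok(())
}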
@ -26,7 +30,10 @@ impl FirecrawlApp {
|
|||||||
FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key))
|
FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
|
pub fn new_selfhosted(
|
||||||
|
api_url: impl AsRef<str>,
|
||||||
|
api_key: Option<impl AsRef<str>>,
|
||||||
|
) -> Result<Self, FirecrawlError> {
|
||||||
let url = api_url.as_ref().to_string();
|
let url = api_url.as_ref().to_string();
|
||||||
|
|
||||||
if url == CLOUD_API_URL && api_key.is_none() {
|
if url == CLOUD_API_URL && api_key.is_none() {
|
||||||
@ -36,7 +43,7 @@ impl FirecrawlApp {
|
|||||||
success: false,
|
success: false,
|
||||||
error: "API key is required for cloud service".to_string(),
|
error: "API key is required for cloud service".to_string(),
|
||||||
details: None,
|
details: None,
|
||||||
}
|
},
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -73,27 +80,43 @@ impl FirecrawlApp {
|
|||||||
.text()
|
.text()
|
||||||
.await
|
.await
|
||||||
.map_err(|e| FirecrawlError::ResponseParseErrorText(e))
|
.map_err(|e| FirecrawlError::ResponseParseErrorText(e))
|
||||||
.and_then(|response_json| serde_json::from_str::<Value>(&response_json).map_err(|e| FirecrawlError::ResponseParseError(e)))
|
.and_then(|response_json| {
|
||||||
|
serde_json::from_str::<Value>(&response_json)
|
||||||
|
.map_err(|e| FirecrawlError::ResponseParseError(e))
|
||||||
|
.inspect(|data| {
|
||||||
|
#[cfg(debug_assertions)]
|
||||||
|
println!("Response JSON: {:#?}", data);
|
||||||
|
})
|
||||||
|
})
|
||||||
.and_then(|response_value| {
|
.and_then(|response_value| {
|
||||||
if response_value["success"].as_bool().unwrap_or(false) {
|
if action.as_ref().starts_with("crawl_") // no success in check/cancel crawl responses
|
||||||
Ok(serde_json::from_value::<T>(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?)
|
|| response_value["success"].as_bool().unwrap_or(false)
|
||||||
|
{
|
||||||
|
Ok(serde_json::from_value::<T>(response_value)
|
||||||
|
.map_err(|e| FirecrawlError::ResponseParseError(e))?)
|
||||||
} else {
|
} else {
|
||||||
Err(FirecrawlError::APIError(
|
Err(FirecrawlError::APIError(
|
||||||
action.as_ref().to_string(),
|
action.as_ref().to_string(),
|
||||||
serde_json::from_value(response_value).map_err(|e| FirecrawlError::ResponseParseError(e))?
|
serde_json::from_value(response_value)
|
||||||
|
.map_err(|e| FirecrawlError::ResponseParseError(e))?,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
match &response {
|
match &response {
|
||||||
Ok(_) => response,
|
Ok(_) => response,
|
||||||
Err(FirecrawlError::ResponseParseError(_)) | Err(FirecrawlError::ResponseParseErrorText(_)) => {
|
Err(FirecrawlError::ResponseParseError(_))
|
||||||
|
| Err(FirecrawlError::ResponseParseErrorText(_)) => {
|
||||||
if is_success {
|
if is_success {
|
||||||
response
|
response
|
||||||
} else {
|
} else {
|
||||||
Err(FirecrawlError::HttpRequestFailed(action.as_ref().to_string(), status.as_u16(), status.as_str().to_string()))
|
Err(FirecrawlError::HttpRequestFailed(
|
||||||
|
action.as_ref().to_string(),
|
||||||
|
status.as_u16(),
|
||||||
|
status.as_str().to_string(),
|
||||||
|
))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
|
||||||
Err(_) => response,
|
Err(_) => response,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
426
apps/rust-sdk/src/llmstxt.rs
Normal file
@ -0,0 +1,426 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::{FirecrawlApp, FirecrawlError, API_VERSION};
|
||||||
|
|
||||||
|
/// Parameters for generating LLMs.txt
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct GenerateLLMsTextParams {
|
||||||
|
/// URL for which to generate LLMs.txt
|
||||||
|
pub url: String,
|
||||||
|
|
||||||
|
/// Maximum number of URLs to process. Default: 10
|
||||||
|
pub max_urls: u32,
|
||||||
|
|
||||||
|
/// Whether to show the full LLMs-full.txt in the response. Default: false
|
||||||
|
pub show_full_text: bool,
|
||||||
|
|
||||||
|
/// Experimental streaming option
|
||||||
|
#[serde(rename = "__experimental_stream")]
|
||||||
|
pub experimental_stream: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for GenerateLLMsTextParams {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
url: String::new(),
|
||||||
|
max_urls: 1,
|
||||||
|
show_full_text: false,
|
||||||
|
experimental_stream: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Response from initiating a LLMs.txt generation job
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct GenerateLLMsTextResponse {
|
||||||
|
/// Whether the request was successful
|
||||||
|
pub success: bool,
|
||||||
|
|
||||||
|
/// Job ID for the LLMs.txt generation
|
||||||
|
pub id: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone, Default)]
|
||||||
|
pub struct LLMTextData {
|
||||||
|
#[serde(rename = "llmstxt")]
|
||||||
|
pub compact: Option<String>,
|
||||||
|
#[serde(rename = "llmsfulltxt")]
|
||||||
|
pub full: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Response from checking the status of a LLMs.txt generation job
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct GenerateLLMsTextStatusResponse {
|
||||||
|
/// Whether the request was successful
|
||||||
|
pub success: bool,
|
||||||
|
|
||||||
|
/// Status of the job: "pending", "processing", "completed", "failed"
|
||||||
|
pub status: String,
|
||||||
|
|
||||||
|
/// Generated LLMs.txt data, present when status is "completed"
|
||||||
|
#[serde(default)]
|
||||||
|
pub data: LLMTextData,
|
||||||
|
|
||||||
|
/// Error message if the job failed
|
||||||
|
pub error: Option<String>,
|
||||||
|
|
||||||
|
/// Expiration timestamp for the data
|
||||||
|
pub expires_at: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FirecrawlApp {
|
||||||
|
/// Generates LLMs.txt for a given URL and polls until completion.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `params` - Parameters for the LLMs.txt generation
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// A response containing the generation results, or a FirecrawlError if the request fails.
|
||||||
|
pub async fn generate_llms_text(
|
||||||
|
&self,
|
||||||
|
params: impl Into<GenerateLLMsTextParams>,
|
||||||
|
) -> Result<GenerateLLMsTextStatusResponse, FirecrawlError> {
|
||||||
|
// Initiate the LLMs.txt generation job asynchronously
|
||||||
|
let response = self.async_generate_llms_text(params).await?;
|
||||||
|
|
||||||
|
// Poll for the result
|
||||||
|
let poll_interval = 2000; // Default to 2 seconds
|
||||||
|
self.monitor_llms_text_job_status(&response.id, poll_interval)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initiates an asynchronous LLMs.txt generation operation.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `params` - Parameters for the LLMs.txt generation
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// A response containing the generation job ID, or a FirecrawlError if the request fails.
|
||||||
|
pub async fn async_generate_llms_text(
|
||||||
|
&self,
|
||||||
|
params: impl Into<GenerateLLMsTextParams>,
|
||||||
|
) -> Result<GenerateLLMsTextResponse, FirecrawlError> {
|
||||||
|
let params = params.into();
|
||||||
|
|
||||||
|
// Validation: URL must be provided
|
||||||
|
if params.url.is_empty() {
|
||||||
|
return Err(FirecrawlError::APIError(
|
||||||
|
"Generate LLMs.txt validation".to_string(),
|
||||||
|
crate::error::FirecrawlAPIError {
|
||||||
|
success: false,
|
||||||
|
error: "URL must be provided".to_string(),
|
||||||
|
details: None,
|
||||||
|
},
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let headers = self.prepare_headers(None);
|
||||||
|
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.post(format!("{}{}/llmstxt", self.api_url, API_VERSION))
|
||||||
|
.headers(headers)
|
||||||
|
.json(¶ms)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
FirecrawlError::HttpError("Initiating LLMs.txt generation".to_string(), e)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
self.handle_response(response, "initiate LLMs.txt generation")
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Checks the status of a LLMs.txt generation operation.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `id` - The ID of the LLMs.txt generation operation
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
///
|
||||||
|
/// A response containing the current status and results of the generation operation,
|
||||||
|
/// or a FirecrawlError if the request fails.
|
||||||
|
pub async fn check_generate_llms_text_status(
|
||||||
|
&self,
|
||||||
|
id: impl AsRef<str>,
|
||||||
|
) -> Result<GenerateLLMsTextStatusResponse, FirecrawlError> {
|
||||||
|
let response = self
|
||||||
|
.client
|
||||||
|
.get(format!(
|
||||||
|
"{}{}/llmstxt/{}",
|
||||||
|
self.api_url,
|
||||||
|
API_VERSION,
|
||||||
|
id.as_ref()
|
||||||
|
))
|
||||||
|
.headers(self.prepare_headers(None))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
FirecrawlError::HttpError(
|
||||||
|
format!("Checking status of LLMs.txt generation {}", id.as_ref()),
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
self.handle_response(
|
||||||
|
response,
|
||||||
|
format!("Checking status of LLMs.txt generation {}", id.as_ref()),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper function to poll for LLMs.txt generation job status until completion
|
||||||
|
async fn monitor_llms_text_job_status(
|
||||||
|
&self,
|
||||||
|
id: &str,
|
||||||
|
poll_interval: u64,
|
||||||
|
) -> Result<GenerateLLMsTextStatusResponse, FirecrawlError> {
|
||||||
|
loop {
|
||||||
|
let status_data = self.check_generate_llms_text_status(id).await?;
|
||||||
|
|
||||||
|
match status_data.status.as_str() {
|
||||||
|
"completed" => {
|
||||||
|
break Ok(status_data);
|
||||||
|
}
|
||||||
|
"pending" | "processing" => {
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_millis(poll_interval)).await;
|
||||||
|
}
|
||||||
|
"failed" => {
|
||||||
|
let error_msg = status_data
|
||||||
|
.error
|
||||||
|
.clone()
|
||||||
|
.unwrap_or_else(|| "LLMs.txt generation failed".to_string());
|
||||||
|
break Err(FirecrawlError::APIError(
|
||||||
|
"LLMs.txt generation failed".to_string(),
|
||||||
|
crate::error::FirecrawlAPIError {
|
||||||
|
success: false,
|
||||||
|
error: error_msg,
|
||||||
|
details: None,
|
||||||
|
},
|
||||||
|
));
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
break Err(FirecrawlError::APIError(
|
||||||
|
"LLMs.txt generation status".to_string(),
|
||||||
|
crate::error::FirecrawlAPIError {
|
||||||
|
success: false,
|
||||||
|
error: format!("Unexpected status: {}", status_data.status),
|
||||||
|
details: None,
|
||||||
|
},
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use serde_json::json;
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
#[ignore = "Makes real network request"]
|
||||||
|
async fn test_real_generate_llms_text() {
|
||||||
|
let api_url = std::env::var("FIRECRAWL_API_URL")
|
||||||
|
.expect("Please set the FIRECRAWL_API_URL environment variable");
|
||||||
|
let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();
|
||||||
|
|
||||||
|
let params = GenerateLLMsTextParams {
|
||||||
|
url: "https://example.com".to_string(),
|
||||||
|
max_urls: 5,
|
||||||
|
show_full_text: true,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let response = app.async_generate_llms_text(params).await.unwrap();
|
||||||
|
|
||||||
|
assert!(response.success);
|
||||||
|
assert!(!response.id.is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_async_generate_llms_text_with_mock() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
let mock = server
|
||||||
|
.mock("POST", "/v1/llmstxt")
|
||||||
|
.match_body(mockito::Matcher::PartialJson(json!({
|
||||||
|
"url": "https://example.com",
|
||||||
|
"maxUrls": 5
|
||||||
|
})))
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": true,
|
||||||
|
"id": "llmstxt-123"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
|
||||||
|
let params = GenerateLLMsTextParams {
|
||||||
|
url: "https://example.com".to_string(),
|
||||||
|
max_urls: 5,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let response = app.async_generate_llms_text(params).await.unwrap();
|
||||||
|
|
||||||
|
assert!(response.success);
|
||||||
|
assert_eq!(response.id, "llmstxt-123");
|
||||||
|
mock.assert();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_check_generate_llms_text_status_with_mock() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
let mock = server
|
||||||
|
.mock("GET", "/v1/llmstxt/llmstxt-123")
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": true,
|
||||||
|
"status": "processing",
|
||||||
|
"expiresAt": "2023-01-01T00:00:00Z"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
let status = app
|
||||||
|
.check_generate_llms_text_status("llmstxt-123")
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert!(status.success);
|
||||||
|
assert_eq!(status.status, "processing");
|
||||||
|
assert_eq!(status.expires_at, "2023-01-01T00:00:00Z");
|
||||||
|
mock.assert();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_generate_llms_text_with_mock() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for the generate request
|
||||||
|
let mock = server
|
||||||
|
.mock("POST", "/v1/llmstxt")
|
||||||
|
.match_body(mockito::Matcher::PartialJson(json!({
|
||||||
|
"url": "https://example.com",
|
||||||
|
"showFullText": true
|
||||||
|
})))
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": true,
|
||||||
|
"id": "llmstxt-123"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
// Set up the mock for the status request
|
||||||
|
let status_mock = server
|
||||||
|
.mock("GET", "/v1/llmstxt/llmstxt-123")
|
||||||
|
.with_status(200)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": true,
|
||||||
|
"status": "completed",
|
||||||
|
"data": {
|
||||||
|
"llmstxt": "Allow: /about\nDisallow: /admin\n",
|
||||||
|
"llmsfulltxt": "# LLMs.txt\n\nAllow: /about\nDisallow: /admin\n"
|
||||||
|
},
|
||||||
|
"expiresAt": "2023-01-01T00:00:00Z"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
|
||||||
|
let params = GenerateLLMsTextParams {
|
||||||
|
url: "https://example.com".to_string(),
|
||||||
|
show_full_text: true,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let response = app.generate_llms_text(params).await.unwrap();
|
||||||
|
|
||||||
|
assert!(response.success);
|
||||||
|
assert_eq!(response.status, "completed");
|
||||||
|
|
||||||
|
let data = response.data;
|
||||||
|
assert_eq!(
|
||||||
|
data.compact,
|
||||||
|
Some("Allow: /about\nDisallow: /admin\n".into())
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
data.full,
|
||||||
|
Some("# LLMs.txt\n\nAllow: /about\nDisallow: /admin\n".into())
|
||||||
|
);
|
||||||
|
|
||||||
|
mock.assert();
|
||||||
|
status_mock.assert();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_generate_llms_text_validation_errors() {
|
||||||
|
let app = FirecrawlApp::new_selfhosted("https://example.com", Some("test_key")).unwrap();
|
||||||
|
|
||||||
|
// Test missing URL
|
||||||
|
let params = GenerateLLMsTextParams {
|
||||||
|
url: "".to_string(),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
let result = app.async_generate_llms_text(params).await;
|
||||||
|
assert!(result.is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_generate_llms_text_api_error() {
|
||||||
|
let mut server = mockito::Server::new_async().await;
|
||||||
|
|
||||||
|
// Set up the mock for an error response
|
||||||
|
let mock = server
|
||||||
|
.mock("POST", "/v1/llmstxt")
|
||||||
|
.with_status(400)
|
||||||
|
.with_header("content-type", "application/json")
|
||||||
|
.with_body(
|
||||||
|
json!({
|
||||||
|
"success": false,
|
||||||
|
"error": "Invalid URL format"
|
||||||
|
})
|
||||||
|
.to_string(),
|
||||||
|
)
|
||||||
|
.create();
|
||||||
|
|
||||||
|
let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
|
||||||
|
|
||||||
|
let params = GenerateLLMsTextParams {
|
||||||
|
url: "not-a-valid-url".to_string(),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
|
||||||
|
let result = app.async_generate_llms_text(params).await;
|
||||||
|
assert!(result.is_err());
|
||||||
|
mock.assert();
|
||||||
|
}
|
||||||
|
}
|
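
And a corresponding sketch for the llms.txt generator above (the URL is a placeholder; `max_urls` caps how many pages are summarized):

use firecrawl::{llmstxt::GenerateLLMsTextParams, FirecrawlApp};

async fn generate(app: &FirecrawlApp) -> Result<(), firecrawl::FirecrawlError> {
    let params = GenerateLLMsTextParams {
        url: "https://example.com".to_string(),
        max_urls: 10,
        show_full_text: true,
        ..Default::default()
    };

    // Polls until generation completes, then exposes both output variants.
    let done = app.generate_llms_text(params).await?;
    println!("llms.txt:\n{}", done.data.compact.unwrap_or_default());
    println!("llms-full.txt present: {}", done.data.full.is_some());
    Ok(())
}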
@ -16,7 +16,7 @@ pub struct MapOptions {
    pub include_subdomains: Option<bool>,

    /// Maximum number of links to return (default: `5000`)
    pub exclude_tags: Option<u32>,
    pub limit: Option<u32>,
}

#[derive(Deserialize, Serialize, Debug, Default)]
@ -59,7 +59,9 @@ impl FirecrawlApp {
            .await
            .map_err(|e| FirecrawlError::HttpError(format!("Mapping {:?}", url.as_ref()), e))?;

        let response = self.handle_response::<MapResponse>(response, "scrape URL").await?;
        let response = self
            .handle_response::<MapResponse>(response, "scrape URL")
            .await?;

        Ok(response.links)
    }
@ -43,7 +43,7 @@ pub enum ScrapeFormats
 }

 #[serde_with::skip_serializing_none]
-#[derive(Deserialize, Serialize, Debug, Default)]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct ExtractOptions {
     /// Schema the output should adhere to, provided in JSON Schema format.
@ -56,7 +56,7 @@ pub struct ExtractOptions
 }

 #[serde_with::skip_serializing_none]
-#[derive(Deserialize, Serialize, Debug, Default)]
+#[derive(Deserialize, Serialize, Debug, Default, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct ScrapeOptions {
     /// Formats to extract from the page. (default: `[ Markdown ]`)
@ -131,7 +131,9 @@ impl FirecrawlApp
         .await
         .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;

-        let response = self.handle_response::<ScrapeResponse>(response, "scrape URL").await?;
+        let response = self
+            .handle_response::<ScrapeResponse>(response, "scrape URL")
+            .await?;

         Ok(response.data)
     }
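A sketch of what the added Clone derive enables: one ScrapeOptions value reused across several requests instead of being rebuilt per call. The endpoint and key are placeholders, and scrape_url accepting an Option-wrapped params value follows the tests elsewhere in this commit.

// Sketch, not part of the diff: Clone lets the same options feed multiple scrapes.
use firecrawl::scrape::{ScrapeFormats, ScrapeOptions};
use firecrawl::FirecrawlApp;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let app = FirecrawlApp::new_selfhosted("http://localhost:3002", Some("test_key"))?;

    let opts = ScrapeOptions {
        formats: vec![ScrapeFormats::Markdown, ScrapeFormats::HTML].into(),
        ..Default::default()
    };

    for url in ["https://example.com", "https://example.org"] {
        // `opts.clone()` is what the new `Clone` derive makes possible.
        let doc = app.scrape_url(url, Some(opts.clone())).await?;
        println!("{url}: markdown present = {}", doc.markdown.is_some());
    }
    Ok(())
}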
245
apps/rust-sdk/src/search.rs
Normal file
@ -0,0 +1,245 @@
use crate::{scrape::ScrapeOptions, FirecrawlApp, FirecrawlError, API_VERSION};
use serde::{Deserialize, Serialize};

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct SearchParams {
    /// The search query string
    pub query: String,
    /// Maximum number of results to return. Default: 5, Max: 20
    pub limit: Option<u32>,
    /// Time-based search filter.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tbs: Option<String>,
    /// Query string to filter search results. Example: "site:example.com"
    #[serde(skip_serializing_if = "Option::is_none")]
    pub filter: Option<String>,
    /// Language code. Default: "en"
    pub lang: Option<String>,
    /// Country code. Default: "us"
    pub country: Option<String>,
    /// Geographic location string for local search results
    #[serde(skip_serializing_if = "Option::is_none")]
    pub location: Option<String>,
    /// Origin identifier. Default: "api"
    pub origin: Option<String>,
    /// Timeout in milliseconds. Default: 60000
    pub timeout: Option<u32>,
    /// Additional options for webpage scraping behavior
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scrape_options: Option<ScrapeOptions>,
}

impl Default for SearchParams {
    fn default() -> Self {
        Self {
            query: String::new(),
            limit: Some(5),
            tbs: None,
            filter: None,
            lang: Some("en".to_string()),
            country: Some("us".to_string()),
            location: None,
            origin: Some("api".to_string()),
            timeout: Some(60000),
            scrape_options: None,
        }
    }
}

#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct SearchResponse {
    pub success: bool,
    pub data: Vec<SearchDocument>,
    pub warning: Option<String>,
}

// TODO: Consider merging fields into document::Document (url, title, description) while preserving optionality
/// A document returned from a search or scrape request
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct SearchDocument {
    /// Document URL
    pub url: String,
    /// Document title
    pub title: String,
    /// Document description
    pub description: String,
}

impl FirecrawlApp {
    /// Search for content using the Firecrawl API.
    ///
    /// # Arguments
    ///
    /// * `query` - The search query string
    /// * `params` - Optional parameters for the search request
    ///
    /// # Returns
    ///
    /// A SearchResponse containing the search results, or a FirecrawlError if the request fails.
    pub async fn search(
        &self,
        query: impl AsRef<str>,
        params: impl Into<Option<SearchParams>>,
    ) -> Result<SearchResponse, FirecrawlError> {
        let mut search_params = params.into().unwrap_or_default();
        search_params.query = query.as_ref().to_string();

        self.search_with_params(search_params).await
    }

    /// Alternative method that takes SearchParams directly
    ///
    /// # Arguments
    ///
    /// * `params` - Search parameters including the query
    ///
    /// # Returns
    ///
    /// A SearchResponse containing the search results, or a FirecrawlError if the request fails.
    pub async fn search_with_params(
        &self,
        params: SearchParams,
    ) -> Result<SearchResponse, FirecrawlError> {
        let headers = self.prepare_headers(None);

        let response = self
            .client
            .post(format!("{}{}/search", self.api_url, API_VERSION))
            .headers(headers)
            .json(&params)
            .send()
            .await
            .map_err(|e| {
                FirecrawlError::HttpError(format!("Searching with query: {:?}", params.query), e)
            })?;

        self.handle_response::<SearchResponse>(response, "search")
            .await
    }
}

#[cfg(test)]
pub mod tests {
    use super::*;
    use serde_json::json;

    #[tokio::test]
    #[ignore = "Makes real network request"]
    async fn test_real_search() {
        let api_url = std::env::var("FIRECRAWL_API_URL")
            .expect("Please set the FIRECRAWL_API_URL environment variable");
        let app = FirecrawlApp::new_selfhosted(api_url, None::<&str>).unwrap();
        let response = app.search("test query", None).await.unwrap();
        assert!(response.success);
    }

    #[tokio::test]
    async fn test_search_with_mock() {
        let mut server = mockito::Server::new_async().await;

        let mock = server
            .mock("POST", "/v1/search")
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": true,
                    "data": [{
                        "url": "https://example.com",
                        "title": "Example Domain",
                        "description": "...."
                    }],
                })
                .to_string(),
            )
            .create();

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
        let response = app.search("test", None).await.unwrap();

        assert!(response.success);
        assert_eq!(response.data.len(), 1);
        assert_eq!(response.data[0].url, "https://example.com");
        assert_eq!(response.data[0].title, "Example Domain".to_string());
        assert_eq!(response.data[0].description, "....".to_string());
        mock.assert();
    }

    #[tokio::test]
    async fn test_search_with_params() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("POST", "/v1/search")
            .with_header("content-type", "application/json")
            .match_body(mockito::Matcher::Json(json!({
                "query": "test",
                "limit": 10,
                "lang": "fr",
                "country": "fr",
                "origin": "api",
                "timeout": 30000
            })))
            .with_status(200)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": true,
                    "data": [],
                    "warning": "No results found"
                })
                .to_string(),
            )
            .create();

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
        let params = SearchParams {
            query: "test".to_string(),
            limit: Some(10),
            lang: Some("fr".to_string()),
            country: Some("fr".to_string()),
            timeout: Some(30000),
            ..Default::default()
        };

        let response = app.search_with_params(params).await.unwrap();

        assert!(response.success);
        assert_eq!(response.data.len(), 0);
        assert_eq!(response.warning, Some("No results found".to_string()));
        mock.assert();
    }

    #[tokio::test]
    async fn test_search_error_response() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("POST", "/v1/search")
            .with_status(400)
            .with_header("content-type", "application/json")
            .with_body(
                json!({
                    "success": false,
                    "error": "Invalid query"
                })
                .to_string(),
            )
            .create();

        let app = FirecrawlApp::new_selfhosted(server.url(), Some("test_key")).unwrap();
        let result = app.search("", None).await;

        assert!(result.is_err());
        mock.assert();
    }

    #[tokio::test]
    async fn test_search_network_error() {
        let app = FirecrawlApp::new_selfhosted("http://invalid-url", Some("test_key")).unwrap();
        let result = app.search("test", None).await;
        assert!(result.is_err());
    }
}
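A usage sketch for the search API defined above. The calls mirror the signatures shown in this file; the firecrawl::search module path is assumed to be re-exported from lib.rs, and the endpoint and key are placeholders.

// Sketch, not part of the diff: basic and tuned search requests.
use firecrawl::search::SearchParams;
use firecrawl::FirecrawlApp;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let app = FirecrawlApp::new_selfhosted("http://localhost:3002", Some("test_key"))?;

    // Plain query with default parameters (limit 5, lang "en", country "us").
    let basic = app.search("firecrawl rust sdk", None).await?;
    println!("{} results", basic.data.len());

    // The query passed as the first argument overrides SearchParams::query.
    let tuned = app
        .search(
            "site:docs.rs firecrawl",
            SearchParams {
                limit: Some(10),
                country: Some("de".to_string()),
                ..Default::default()
            },
        )
        .await?;
    for doc in tuned.data {
        println!("{} - {}", doc.title, doc.url);
    }
    Ok(())
}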
@ -1,4 +1,3 @@
-use assert_matches::assert_matches;
 use dotenvy::dotenv;
 use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
 use firecrawl::{FirecrawlApp, FirecrawlError};
@ -24,11 +23,8 @@ use std::env;
 async fn test_successful_response_with_valid_preview_token() {
     dotenv().ok();
     let api_url = env::var("API_URL").unwrap();
-    let app = FirecrawlApp::new_selfhosted(
-        api_url,
-        Some(env::var("PREVIEW_TOKEN").unwrap()),
-    )
-    .unwrap();
+    let app =
+        FirecrawlApp::new_selfhosted(api_url, Some(env::var("PREVIEW_TOKEN").unwrap())).unwrap();
     let result = app
         .scrape_url("https://roastmywebsite.ai", None)
         .await
@ -58,7 +54,7 @@ async fn test_successful_response_with_valid_api_key_and_include_html()
     let api_key = env::var("TEST_API_KEY").ok();
     let app = FirecrawlApp::new_selfhosted(api_url, api_key).unwrap();
     let params = ScrapeOptions {
-        formats: vec! [ ScrapeFormats::Markdown, ScrapeFormats::HTML ].into(),
+        formats: vec![ScrapeFormats::Markdown, ScrapeFormats::HTML].into(),
         ..Default::default()
     };
     let result = app
@ -82,7 +78,8 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file()
         .await
         .unwrap();
     assert!(result.markdown.is_some());
-    assert!(result.markdown
+    assert!(result
+        .markdown
         .unwrap()
         .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
 }
@ -98,12 +95,12 @@ async fn test_successful_response_for_valid_scrape_with_pdf_file_without_explici
         .await
         .unwrap();
     assert!(result.markdown.is_some());
-    assert!(result.markdown
+    assert!(result
+        .markdown
        .unwrap()
        .contains("We present spectrophotometric observations of the Broad Line Radio Galaxy"));
 }

-
 // #[tokio::test]
 // async fn test_should_return_error_for_blocklisted_url() {
 //     dotenv().ok();
@ -166,7 +163,11 @@ fn test_api_key_requirements()
     match (api_url.contains("api.firecrawl.dev"), api_key) {
         (false, _) => {
             let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
-            assert!(result.is_ok(), "Local setup failed: {:?}", result.err().unwrap());
+            assert!(
+                result.is_ok(),
+                "Local setup failed: {:?}",
+                result.err().unwrap()
+            );
         }
         (true, None) => {
             let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);