use std::collections::HashMap;

use serde::{Deserialize, Serialize};
use serde_json::Value;

use crate::{document::Document, FirecrawlApp, FirecrawlError, API_VERSION};

#[derive(Deserialize, Serialize, Clone, Copy, Debug)]
pub enum ScrapeFormats {
    /// Will result in a copy of the Markdown content of the page.
    #[serde(rename = "markdown")]
    Markdown,

    /// Will result in a copy of the filtered, content-only HTML of the page.
    #[serde(rename = "html")]
    HTML,

    /// Will result in a copy of the raw HTML of the page.
    #[serde(rename = "rawHtml")]
    RawHTML,

    /// Will result in a Vec of URLs found on the page.
    #[serde(rename = "links")]
    Links,

    /// Will result in a URL to a screenshot of the page.
    ///
    /// Cannot be used in conjunction with `ScrapeFormats::ScreenshotFullPage`.
    #[serde(rename = "screenshot")]
    Screenshot,

    /// Will result in a URL to a full-page screenshot of the page.
    ///
    /// Cannot be used in conjunction with `ScrapeFormats::Screenshot`.
    #[serde(rename = "screenshot@fullPage")]
    ScreenshotFullPage,

    /// Will result in the results of an LLM extraction.
    ///
    /// See `ScrapeOptions.extract` for more options.
    #[serde(rename = "extract")]
    Extract,
}

#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ExtractOptions {
    /// Schema the output should adhere to, provided in JSON Schema format.
    pub schema: Option<Value>,

    /// System prompt to send to the LLM agent.
    pub system_prompt: Option<String>,

    /// Extraction prompt to send to the LLM agent along with the page content.
    pub prompt: Option<String>,
}

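/// Options for `FirecrawlApp::scrape_url`.
///
/// # Examples
///
/// A minimal construction sketch. It assumes this module is reachable as
/// `firecrawl::scrape`; the field values are illustrative only.
///
/// ```no_run
/// use serde_json::json;
/// use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
///
/// let options = ScrapeOptions {
///     formats: Some(vec![ScrapeFormats::Markdown, ScrapeFormats::Extract]),
///     only_main_content: Some(true),
///     extract: Some(ExtractOptions {
///         schema: Some(json!({
///             "type": "object",
///             "properties": { "title": { "type": "string" } }
///         })),
///         prompt: Some("Extract the page title.".to_string()),
///         ..Default::default()
///     }),
///     ..Default::default()
/// };
/// ```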
#[serde_with::skip_serializing_none]
#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
pub struct ScrapeOptions {
    /// Formats to extract from the page. (default: `[ Markdown ]`)
    pub formats: Option<Vec<ScrapeFormats>>,

    /// Only extract the main content of the page, excluding navigation and
    /// other miscellaneous content. (default: `true`)
    pub only_main_content: Option<bool>,

    /// HTML tags to exclusively include.
    ///
    /// For example, if you pass `div`, you will only get content from
    /// `<div>`s and their children.
    pub include_tags: Option<Vec<String>>,

    /// HTML tags to exclude.
    ///
    /// For example, if you pass `img`, you will never get image URLs in your
    /// results.
    pub exclude_tags: Option<Vec<String>>,

    /// Additional HTTP headers to use when loading the page.
    pub headers: Option<HashMap<String, String>>,

    /// Amount of time to wait after loading the page, and before grabbing the
    /// content, in milliseconds. (default: `0`)
    pub wait_for: Option<u32>,

    /// Timeout before returning an error, in milliseconds. (default: `60000`)
    pub timeout: Option<u32>,

    /// Extraction options, to be used in conjunction with
    /// `ScrapeFormats::Extract`.
    pub extract: Option<ExtractOptions>,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct ScrapeRequestBody {
    url: String,

    #[serde(flatten)]
    options: ScrapeOptions,
}

#[derive(Deserialize, Serialize, Debug, Default)]
#[serde(rename_all = "camelCase")]
struct ScrapeResponse {
    /// This will always be `true` due to `FirecrawlApp::handle_response`.
    /// No need to expose.
    success: bool,

    /// The resulting document.
    data: Document,
}

impl FirecrawlApp {
    /// Scrapes a URL using the Firecrawl API.
    pub async fn scrape_url(
        &self,
        url: impl AsRef<str>,
        options: impl Into<Option<ScrapeOptions>>,
    ) -> Result<Document, FirecrawlError> {
        let body = ScrapeRequestBody {
            url: url.as_ref().to_string(),
            options: options.into().unwrap_or_default(),
        };

        let headers = self.prepare_headers(None);

        let response = self
            .client
            .post(&format!("{}{}/scrape", self.api_url, API_VERSION))
            .headers(headers)
            .json(&body)
            .send()
            .await
            .map_err(|e| FirecrawlError::HttpError(format!("Scraping {:?}", url.as_ref()), e))?;

        let response = self
            .handle_response::<ScrapeResponse>(response, "scrape URL")
            .await?;

        Ok(response.data)
    }
}
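
#[cfg(test)]
mod tests {
    use super::*;

    /// A minimal usage sketch rather than a hermetic test: it assumes a
    /// `FirecrawlApp::new(api_key)` constructor on the crate root, a `tokio`
    /// dev-dependency for the async runtime, and a live key in the
    /// hypothetical `FIRECRAWL_API_KEY` environment variable, so it is
    /// `#[ignore]`d by default.
    #[tokio::test]
    #[ignore]
    async fn scrape_url_returns_document() {
        // Read the (assumed) API key from the environment.
        let api_key = std::env::var("FIRECRAWL_API_KEY")
            .expect("FIRECRAWL_API_KEY must be set to run this test");
        let app = FirecrawlApp::new(api_key).expect("failed to construct FirecrawlApp");

        // Request only the Markdown format; everything else stays at its default.
        let options = ScrapeOptions {
            formats: Some(vec![ScrapeFormats::Markdown]),
            ..Default::default()
        };

        // `impl Into<Option<ScrapeOptions>>` lets us pass the options directly;
        // a plain `None` would need a type annotation such as `None::<ScrapeOptions>`.
        let _document = app
            .scrape_url("https://example.com", options)
            .await
            .expect("scrape request failed");
    }
}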