From 6a5a4e5b6f435e01d13a42a65bb7bacdaaec9ef0 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 13 Mar 2025 11:21:35 -0300 Subject: [PATCH 01/26] improv/types-and-comments-descs --- apps/python-sdk/firecrawl/firecrawl.py | 852 +++++++++++++++++++------ 1 file changed, 674 insertions(+), 178 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d79b174c..d212dea7 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -12,8 +12,9 @@ Classes: import logging import os import time -from typing import Any, Dict, Optional, List, Union, Callable +from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic import json +from datetime import datetime import requests import pydantic @@ -21,6 +22,212 @@ import websockets logger : logging.Logger = logging.getLogger("firecrawl") +T = TypeVar('T') + +class FirecrawlDocumentMetadata(pydantic.BaseModel): + """Metadata for a Firecrawl document.""" + title: Optional[str] = None + description: Optional[str] = None + language: Optional[str] = None + keywords: Optional[str] = None + robots: Optional[str] = None + ogTitle: Optional[str] = None + ogDescription: Optional[str] = None + ogUrl: Optional[str] = None + ogImage: Optional[str] = None + ogAudio: Optional[str] = None + ogDeterminer: Optional[str] = None + ogLocale: Optional[str] = None + ogLocaleAlternate: Optional[List[str]] = None + ogSiteName: Optional[str] = None + ogVideo: Optional[str] = None + dctermsCreated: Optional[str] = None + dcDateCreated: Optional[str] = None + dcDate: Optional[str] = None + dctermsType: Optional[str] = None + dcType: Optional[str] = None + dctermsAudience: Optional[str] = None + dctermsSubject: Optional[str] = None + dcSubject: Optional[str] = None + dcDescription: Optional[str] = None + dctermsKeywords: Optional[str] = None + modifiedTime: Optional[str] = None + publishedTime: Optional[str] = None + articleTag: Optional[str] = None + articleSection: Optional[str] = None + sourceURL: Optional[str] = None + statusCode: Optional[int] = None + error: Optional[str] = None + +class ActionsResult(pydantic.BaseModel): + """Result of actions performed during scraping.""" + screenshots: List[str] + +class FirecrawlDocument(pydantic.BaseModel, Generic[T]): + """Document retrieved or processed by Firecrawl.""" + url: Optional[str] = None + markdown: Optional[str] = None + html: Optional[str] = None + rawHtml: Optional[str] = None + links: Optional[List[str]] = None + extract: Optional[T] = None + json: Optional[T] = None + screenshot: Optional[str] = None + metadata: Optional[FirecrawlDocumentMetadata] = None + actions: Optional[ActionsResult] = None + title: Optional[str] = None # v1 search only + description: Optional[str] = None # v1 search only + +class LocationConfig(pydantic.BaseModel): + """Location configuration for scraping.""" + country: Optional[str] = None + languages: Optional[List[str]] = None + +class WebhookConfig(pydantic.BaseModel): + """Configuration for webhooks.""" + url: str + headers: Optional[Dict[str, str]] = None + metadata: Optional[Dict[str, str]] = None + events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None + +class CrawlScrapeOptions(pydantic.BaseModel): + """Parameters for scraping operations.""" + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = 
None + headers: Optional[Dict[str, str]] = None + includeTags: Optional[List[str]] = None + excludeTags: Optional[List[str]] = None + onlyMainContent: Optional[bool] = None + waitFor: Optional[int] = None + timeout: Optional[int] = None + location: Optional[LocationConfig] = None + mobile: Optional[bool] = None + skipTlsVerification: Optional[bool] = None + removeBase64Images: Optional[bool] = None + blockAds: Optional[bool] = None + proxy: Optional[Literal["basic", "stealth"]] = None + +class Action(pydantic.BaseModel): + """Action to perform during scraping.""" + type: Literal["wait", "click", "screenshot", "write", "press", "scroll", "scrape", "executeJavascript"] + milliseconds: Optional[int] = None + selector: Optional[str] = None + fullPage: Optional[bool] = None + text: Optional[str] = None + key: Optional[str] = None + direction: Optional[Literal["up", "down"]] = None + script: Optional[str] = None + +class ExtractConfig(pydantic.BaseModel): + """Configuration for extraction.""" + prompt: Optional[str] = None + schema: Optional[Any] = None + systemPrompt: Optional[str] = None + +class ScrapeParams(CrawlScrapeOptions): + """Parameters for scraping operations.""" + extract: Optional[ExtractConfig] = None + jsonOptions: Optional[ExtractConfig] = None + actions: Optional[List[Action]] = None + +class ScrapeResponse(FirecrawlDocument[T], Generic[T]): + """Response from scraping operations.""" + success: bool = True + warning: Optional[str] = None + error: Optional[str] = None + +class BatchScrapeResponse(pydantic.BaseModel): + """Response from batch scrape operations.""" + id: Optional[str] = None + url: Optional[str] = None + success: bool = True + error: Optional[str] = None + invalidURLs: Optional[List[str]] = None + +class BatchScrapeStatusResponse(pydantic.BaseModel): + """Response from batch scrape status checks.""" + success: bool = True + status: Literal["scraping", "completed", "failed", "cancelled"] + completed: int + total: int + creditsUsed: int + expiresAt: datetime + next: Optional[str] = None + data: List[FirecrawlDocument] + +class CrawlParams(pydantic.BaseModel): + """Parameters for crawling operations.""" + includePaths: Optional[List[str]] = None + excludePaths: Optional[List[str]] = None + maxDepth: Optional[int] = None + maxDiscoveryDepth: Optional[int] = None + limit: Optional[int] = None + allowBackwardLinks: Optional[bool] = None + allowExternalLinks: Optional[bool] = None + ignoreSitemap: Optional[bool] = None + scrapeOptions: Optional[CrawlScrapeOptions] = None + webhook: Optional[Union[str, WebhookConfig]] = None + deduplicateSimilarURLs: Optional[bool] = None + ignoreQueryParameters: Optional[bool] = None + regexOnFullURL: Optional[bool] = None + +class CrawlResponse(pydantic.BaseModel): + """Response from crawling operations.""" + id: Optional[str] = None + url: Optional[str] = None + success: bool = True + error: Optional[str] = None + +class CrawlStatusResponse(pydantic.BaseModel): + """Response from crawl status checks.""" + success: bool = True + status: Literal["scraping", "completed", "failed", "cancelled"] + completed: int + total: int + creditsUsed: int + expiresAt: datetime + next: Optional[str] = None + data: List[FirecrawlDocument] + +class CrawlErrorsResponse(pydantic.BaseModel): + """Response from crawl/batch scrape error monitoring.""" + errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str} + robotsBlocked: List[str] + +class MapParams(pydantic.BaseModel): + """Parameters for mapping operations.""" + search: 
Optional[str] = None + ignoreSitemap: Optional[bool] = None + includeSubdomains: Optional[bool] = None + sitemapOnly: Optional[bool] = None + limit: Optional[int] = None + timeout: Optional[int] = None + +class MapResponse(pydantic.BaseModel): + """Response from mapping operations.""" + success: bool = True + links: Optional[List[str]] = None + error: Optional[str] = None + +class ExtractParams(pydantic.BaseModel): + """Parameters for extracting information from URLs.""" + prompt: Optional[str] = None + schema: Optional[Any] = None + systemPrompt: Optional[str] = None + allowExternalLinks: Optional[bool] = None + enableWebSearch: Optional[bool] = None + includeSubdomains: Optional[bool] = None + origin: Optional[str] = None + showSources: Optional[bool] = None + scrapeOptions: Optional[CrawlScrapeOptions] = None + +class ExtractResponse(pydantic.BaseModel, Generic[T]): + """Response from extract operations.""" + success: bool = True + data: Optional[T] = None + error: Optional[str] = None + warning: Optional[str] = None + sources: Optional[List[str]] = None + class SearchParams(pydantic.BaseModel): query: str limit: Optional[int] = 5 @@ -33,6 +240,13 @@ class SearchParams(pydantic.BaseModel): timeout: Optional[int] = 60000 scrapeOptions: Optional[Dict[str, Any]] = None +class SearchResponse(pydantic.BaseModel): + """Response from search operations.""" + success: bool = True + data: List[FirecrawlDocument] + warning: Optional[str] = None + error: Optional[str] = None + class GenerateLLMsTextParams(pydantic.BaseModel): """ Parameters for the LLMs.txt generation operation. @@ -73,40 +287,21 @@ class DeepResearchStatusResponse(pydantic.BaseModel): sources: List[Dict[str, Any]] summaries: List[str] +class GenerateLLMsTextResponse(pydantic.BaseModel): + """Response from LLMs.txt generation operations.""" + success: bool = True + id: str + error: Optional[str] = None + +class GenerateLLMsTextStatusResponse(pydantic.BaseModel): + """Status response from LLMs.txt generation operations.""" + success: bool = True + data: Optional[Dict[str, str]] = None # {llmstxt: str, llmsfulltxt?: str} + status: Literal["processing", "completed", "failed"] + error: Optional[str] = None + expiresAt: str + class FirecrawlApp: - class SearchResponse(pydantic.BaseModel): - """ - Response from the search operation. - """ - success: bool - data: List[Dict[str, Any]] - warning: Optional[str] = None - error: Optional[str] = None - - class ExtractParams(pydantic.BaseModel): - """ - Parameters for the extract operation. - """ - prompt: Optional[str] = None - schema_: Optional[Any] = pydantic.Field(None, alias='schema') - system_prompt: Optional[str] = None - allow_external_links: Optional[bool] = False - enable_web_search: Optional[bool] = False - # Just for backwards compatibility - enableWebSearch: Optional[bool] = False - show_sources: Optional[bool] = False - - - - - class ExtractResponse(pydantic.BaseModel): - """ - Response from the extract operation. - """ - success: bool - data: Optional[Any] = None - error: Optional[str] = None - def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: """ Initialize the FirecrawlApp instance with API key, API URL. 
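Editor's note: to illustrate how the new module-level models introduced above compose, here is a minimal, hedged sketch. The values and webhook endpoint are illustrative, and the SDK methods in this patch still accept plain dicts, so a model instance would need to be dumped to a dict with pydantic before being passed to them.

```python
from firecrawl.firecrawl import CrawlParams, CrawlScrapeOptions, WebhookConfig

# Build a typed crawl configuration instead of a hand-written dict.
# Field names mirror the CrawlParams / CrawlScrapeOptions / WebhookConfig
# models defined in this patch; all values below are placeholders.
crawl_config = CrawlParams(
    includePaths=["docs/*"],
    limit=100,
    scrapeOptions=CrawlScrapeOptions(formats=["markdown", "html"], onlyMainContent=True),
    webhook=WebhookConfig(
        url="https://example.com/hooks/firecrawl",  # placeholder endpoint
        events=["completed", "failed"],
    ),
)

print(crawl_config.limit, crawl_config.scrapeOptions.formats)
```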
@@ -125,19 +320,42 @@ class FirecrawlApp: logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}") - def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> ScrapeResponse[Any]: """ - Scrape the specified URL using the Firecrawl API. + Scrape and extract content from a URL. Args: - url (str): The URL to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scrape request. + url (str): Target URL to scrape + + params (Optional[Dict[str, Any]]): See ScrapeParams model for configuration: + + Content Options: + * formats - Content types to retrieve (markdown/html/etc) + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type (basic/stealth) + + Extraction Options: + * extract - Content extraction settings + * jsonOptions - JSON extraction settings + * actions - Actions to perform Returns: - Any: The scraped data if the request is successful. + ScrapeResponse with: + + * Requested content formats + * Page metadata + * Extraction results + * Success/error status Raises: - Exception: If the scrape request fails. + Exception: If scraping fails """ headers = self._prepare_headers() @@ -193,16 +411,35 @@ class FirecrawlApp: else: self._handle_error(response, 'scrape URL') - def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]: + def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse: """ - Search for content using the Firecrawl API. + Search for content using Firecrawl. Args: - query (str): The search query string. - params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters. + query (str): Search query string + + params (Optional[Union[Dict[str, Any], SearchParams]]): See SearchParams model: + + Search Options: + * limit - Max results (default: 5) + * tbs - Time filter (e.g. "qdr:d") + * filter - Custom result filter + + Localization: + * lang - Language code (default: "en") + * country - Country code (default: "us") + * location - Geo-targeting + + Request Options: + * timeout - Request timeout (ms) + * scrapeOptions - Result scraping config, check ScrapeParams model for more details Returns: - Dict[str, Any]: The search response containing success status and search results. + SearchResponse + + + Raises: + Exception: If search fails """ if params is None: params = {} @@ -230,28 +467,46 @@ class FirecrawlApp: def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, poll_interval: Optional[int] = 2, - idempotency_key: Optional[str] = None) -> Any: + idempotency_key: Optional[str] = None) -> CrawlStatusResponse: """ - Initiate a crawl job for the specified URL using the Firecrawl API. + Crawl a website starting from a URL. Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. 
+ url (str): Target URL to start crawling from + + params (Optional[Dict[str, Any]]): See CrawlParams model for configuration: + + URL Discovery: + * includePaths - Patterns of URLs to include + * excludePaths - Patterns of URLs to exclude + * maxDepth - Maximum crawl depth + * maxDiscoveryDepth - Maximum depth for finding new URLs + * limit - Maximum pages to crawl + + Link Following: + * allowBackwardLinks - Follow parent directory links + * allowExternalLinks - Follow external domain links + * ignoreSitemap - Skip sitemap.xml processing + + Advanced: + * scrapeOptions - Page scraping configuration + * webhook - Notification webhook settings + * deduplicateSimilarURLs - Remove similar URLs + * ignoreQueryParameters - Ignore URL parameters + * regexOnFullURL - Apply regex to full URLs + + poll_interval: Seconds between status checks (default: 2) + + idempotency_key: Request deduplication key Returns: - Dict[str, Any]: A dictionary containing the crawl results. The structure includes: - - 'success' (bool): Indicates if the crawl was successful. - - 'status' (str): The final status of the crawl job (e.g., 'completed'). - - 'completed' (int): Number of scraped pages that completed. - - 'total' (int): Total number of scraped pages. - - 'creditsUsed' (int): Estimated number of API credits used for this crawl. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires. - - 'data' (List[Dict]): List of all the scraped pages. + CrawlStatusResponse with: + * Crawling status and progress + * Crawled page contents + * Success/error information Raises: - Exception: If the crawl job initiation or monitoring fails. + Exception: If crawl fails """ endpoint = f'/v1/crawl' headers = self._prepare_headers(idempotency_key) @@ -270,20 +525,45 @@ class FirecrawlApp: self._handle_error(response, 'start crawl job') - def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> CrawlResponse: """ - Initiate a crawl job asynchronously. + Start an asynchronous crawl job. Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + url (str): Target URL to start crawling from + + params (Optional[Dict[str, Any]]): See CrawlParams model: + + URL Discovery: + * includePaths - Patterns of URLs to include + * excludePaths - Patterns of URLs to exclude + * maxDepth - Maximum crawl depth + * maxDiscoveryDepth - Maximum depth for finding new URLs + * limit - Maximum pages to crawl + + Link Following: + * allowBackwardLinks - Follow parent directory links + * allowExternalLinks - Follow external domain links + * ignoreSitemap - Skip sitemap.xml processing + + Advanced: + * scrapeOptions - Page scraping configuration + * webhook - Notification webhook settings + * deduplicateSimilarURLs - Remove similar URLs + * ignoreQueryParameters - Ignore URL parameters + * regexOnFullURL - Apply regex to full URLs + + idempotency_key: Unique key to prevent duplicate requests Returns: - Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes: - - 'success' (bool): Indicates if the crawl initiation was successful. - - 'id' (str): The unique identifier for the crawl job. 
- - 'url' (str): The URL to check the status of the crawl job. + CrawlResponse with: + * success - Whether crawl started successfully + * id - Unique identifier for the crawl job + * url - Status check URL for the crawl + * error - Error message if start failed + + Raises: + Exception: If crawl initiation fails """ endpoint = f'/v1/crawl' headers = self._prepare_headers(idempotency_key) @@ -299,18 +579,31 @@ class FirecrawlApp: else: self._handle_error(response, 'start crawl job') - def check_crawl_status(self, id: str) -> Any: + def check_crawl_status(self, id: str) -> CrawlStatusResponse: """ - Check the status of a crawl job using the Firecrawl API. + Check the status and results of a crawl job. Args: - id (str): The ID of the crawl job. + id: Unique identifier for the crawl job Returns: - Any: The status of the crawl job. + CrawlStatusResponse containing: + + Status Information: + * status - Current state (scraping/completed/failed/cancelled) + * completed - Number of pages crawled + * total - Total pages to crawl + * creditsUsed - API credits consumed + * expiresAt - Data expiration timestamp + + Results: + * data - List of crawled documents + * next - URL for next page of results (if paginated) + * success - Whether status check succeeded + * error - Error message if failed Raises: - Exception: If the status check request fails. + Exception: If status check fails """ endpoint = f'/v1/crawl/{id}' @@ -369,7 +662,7 @@ class FirecrawlApp: else: self._handle_error(response, 'check crawl status') - def check_crawl_errors(self, id: str) -> Dict[str, Any]: + def check_crawl_errors(self, id: str) -> CrawlErrorsResponse: """ Returns information about crawl errors. @@ -427,16 +720,32 @@ class FirecrawlApp: else: raise Exception("Crawl job failed to start") - def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> MapResponse: """ - Perform a map search using the Firecrawl API. + Map and discover links from a URL. Args: - url (str): The URL to perform the map search on. - params (Optional[Dict[str, Any]]): Additional parameters for the map search. + url: Target URL to map + + params: See MapParams model: + + Discovery Options: + * search - Filter pattern for URLs + * ignoreSitemap - Skip sitemap.xml + * includeSubdomains - Include subdomain links + * sitemapOnly - Only use sitemap.xml + + Limits: + * limit - Max URLs to return + * timeout - Request timeout (ms) Returns: - List[str]: A list of URLs discovered during the map search. + MapResponse with: + * Discovered URLs + * Success/error status + + Raises: + Exception: If mapping fails """ endpoint = f'/v1/map' headers = self._prepare_headers() @@ -469,28 +778,44 @@ class FirecrawlApp: def batch_scrape_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None, poll_interval: Optional[int] = 2, - idempotency_key: Optional[str] = None) -> Any: + idempotency_key: Optional[str] = None) -> BatchScrapeStatusResponse: """ - Initiate a batch scrape job for the specified URLs using the Firecrawl API. + Batch scrape multiple URLs and monitor until completion. Args: - urls (List[str]): The URLs to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scraper. - poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. 
+ urls: URLs to scrape + + params: See ScrapeParams model: + + Content Options: + * formats - Content formats to retrieve + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type + + Extraction Options: + * extract - Content extraction config + * jsonOptions - JSON extraction config + * actions - Actions to perform + + poll_interval: Seconds between status checks (default: 2) + + idempotency_key: Request deduplication key Returns: - Dict[str, Any]: A dictionary containing the scrape results. The structure includes: - - 'success' (bool): Indicates if the batch scrape was successful. - - 'status' (str): The final status of the batch scrape job (e.g., 'completed'). - - 'completed' (int): Number of scraped pages that completed. - - 'total' (int): Total number of scraped pages. - - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires. - - 'data' (List[Dict]): List of all the scraped pages. + BatchScrapeStatusResponse with: + * Scraping status and progress + * Scraped content for each URL + * Success/error information Raises: - Exception: If the batch scrape job initiation or monitoring fails. + Exception: If batch scrape fails """ endpoint = f'/v1/batch/scrape' headers = self._prepare_headers(idempotency_key) @@ -509,9 +834,13 @@ class FirecrawlApp: self._handle_error(response, 'start batch scrape job') - def async_batch_scrape_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_batch_scrape_urls( + self, + urls: List[str], + params: Optional[Dict[str, Any]] = None, + idempotency_key: Optional[str] = None) -> BatchScrapeResponse: """ - Initiate a crawl job asynchronously. + Initiate a batch scrape job asynchronously. Args: urls (List[str]): The URLs to scrape. @@ -519,7 +848,7 @@ class FirecrawlApp: idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. Returns: - Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes: + BatchScrapeResponse: A dictionary containing the batch scrape initiation response. The structure includes: - 'success' (bool): Indicates if the batch scrape initiation was successful. - 'id' (str): The unique identifier for the batch scrape job. - 'url' (str): The URL to check the status of the batch scrape job. @@ -538,13 +867,17 @@ class FirecrawlApp: else: self._handle_error(response, 'start batch scrape job') - def batch_scrape_urls_and_watch(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + def batch_scrape_urls_and_watch( + self, + urls: List[str], + params: Optional[ScrapeParams] = None, + idempotency_key: Optional[str] = None) -> 'CrawlWatcher': """ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket. Args: urls (List[str]): The URLs to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scraper. + params (Optional[ScrapeParams]): Additional parameters for the scraper. idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. 
Returns: @@ -556,7 +889,7 @@ class FirecrawlApp: else: raise Exception("Batch scrape job failed to start") - def check_batch_scrape_status(self, id: str) -> Any: + def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse: """ Check the status of a batch scrape job using the Firecrawl API. @@ -564,7 +897,7 @@ class FirecrawlApp: id (str): The ID of the batch scrape job. Returns: - Any: The status of the batch scrape job. + BatchScrapeStatusResponse: The status of the batch scrape job. Raises: Exception: If the status check request fails. @@ -626,7 +959,7 @@ class FirecrawlApp: else: self._handle_error(response, 'check batch scrape status') - def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]: + def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse: """ Returns information about batch scrape errors. @@ -634,7 +967,13 @@ class FirecrawlApp: id (str): The ID of the crawl job. Returns: - Dict[str, Any]: Information about crawl errors. + CrawlErrorsResponse: A response containing: + - errors (List[Dict[str, str]]): List of errors with fields: + - id (str): Error ID + - timestamp (str): When the error occurred + - url (str): URL that caused the error + - error (str): Error message + - robotsBlocked (List[str]): List of URLs blocked by robots.txt """ headers = self._prepare_headers() response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers) @@ -646,16 +985,40 @@ class FirecrawlApp: else: self._handle_error(response, "check batch scrape errors") - def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any: + def extract( + self, + urls: List[str], + params: Optional[ExtractParams] = None) -> ExtractResponse[Any]: """ - Extracts information from a URL using the Firecrawl API. + Extract structured information from URLs. Args: - urls (List[str]): The URLs to extract information from. - params (Optional[ExtractParams]): Additional parameters for the extract request. + urls: URLs to extract from + + params: See ExtractParams model: + + Extraction Config: + * prompt - Custom extraction prompt + * schema - JSON schema/Pydantic model + * systemPrompt - System context + + Behavior Options: + * allowExternalLinks - Follow external links + * enableWebSearch - Enable web search + * includeSubdomains - Include subdomains + * showSources - Include source URLs + + Scraping Options: + * scrapeOptions - Page scraping config Returns: - Union[ExtractResponse, ErrorResponse]: The response from the extract operation. + ExtractResponse with: + * Structured data matching schema + * Source information if requested + * Success/error status + + Raises: + ValueError: If prompt/schema missing or extraction fails """ headers = self._prepare_headers() @@ -715,10 +1078,7 @@ class FirecrawlApp: except: raise Exception(f'Failed to parse Firecrawl response as JSON.') if status_data['status'] == 'completed': - if status_data['success']: - return status_data - else: - raise Exception(f'Failed to extract. Error: {status_data["error"]}') + return status_data elif status_data['status'] in ['failed', 'cancelled']: raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}') else: @@ -734,7 +1094,7 @@ class FirecrawlApp: return {'success': False, 'error': "Internal server error."} - def get_extract_status(self, job_id: str) -> Dict[str, Any]: + def get_extract_status(self, job_id: str) -> ExtractResponse[Any]: """ Retrieve the status of an extract job. 
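Editor's note: for reference, a short sketch of driving `extract` with a structured schema, in the spirit of the example scripts added later in this patch. The `PageSummary` model, API key, and URL are placeholders, and pydantic v2's `model_json_schema()` is assumed, as in those examples.

```python
from typing import List
from pydantic import BaseModel
from firecrawl.firecrawl import FirecrawlApp

class PageSummary(BaseModel):  # hypothetical schema, for illustration only
    title: str
    description: str
    links: List[str]

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# extract() polls the job until completion and returns the structured result.
result = app.extract(
    ["https://firecrawl.dev"],
    {
        "prompt": "Extract the title, description, and links from the website",
        "schema": PageSummary.model_json_schema(),
    },
)
print(result)
```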
@@ -742,7 +1102,7 @@ class FirecrawlApp: job_id (str): The ID of the extract job. Returns: - Dict[str, Any]: The status of the extract job. + ExtractResponse[Any]: The status of the extract job. Raises: ValueError: If there is an error retrieving the status. @@ -760,20 +1120,32 @@ class FirecrawlApp: except Exception as e: raise ValueError(str(e), 500) - def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: """ Initiate an asynchronous extract job. Args: - urls (List[str]): The URLs to extract data from. - params (Optional[Dict[str, Any]]): Additional parameters for the extract request. - idempotency_key (Optional[str]): A unique key to ensure idempotency of requests. + urls (List[str]): List of URLs to extract information from. Must be valid HTTP/HTTPS URLs. + params (Optional[Dict[str, Any]]): Extraction configuration parameters: + - prompt (str, optional): Custom prompt for extraction + - schema (Any, optional): JSON schema or Pydantic model for structured extraction + - systemPrompt (str, optional): System prompt for extraction + - allowExternalLinks (bool, optional): Allow following external links + - enableWebSearch (bool, optional): Enable web search during extraction + - includeSubdomains (bool, optional): Include content from subdomains + - origin (str, optional): Source of the extraction request + - showSources (bool, optional): Include source URLs in response + - scrapeOptions (CrawlScrapeOptions, optional): Configuration for scraping pages + idempotency_key (Optional[str]): Unique identifier to prevent duplicate requests. Returns: - Dict[str, Any]: The response from the extract operation. + ExtractResponse[Any]: A response containing: + - success (bool): Whether the extraction initiation was successful + - id (str): The unique identifier for the extract job + - error (str, optional): Error message if initiation failed Raises: - ValueError: If there is an error initiating the extract job. + ValueError: If neither prompt nor schema is provided, or if there is an error during initiation. """ headers = self._prepare_headers(idempotency_key) @@ -804,24 +1176,32 @@ class FirecrawlApp: except Exception as e: raise ValueError(str(e), 500) - def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]: + def generate_llms_text( + self, + url: str, + params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse: """ Generate LLMs.txt for a given URL and poll until completion. Args: - url (str): The URL to generate LLMs.txt from. - params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation. + url: Target URL to generate LLMs.txt from + + params: See GenerateLLMsTextParams model: + + Generation Options: + * maxUrls - Maximum URLs to process (default: 10) + * showFullText - Include full text in output (default: False) + * __experimental_stream - Enable streaming of generation progress Returns: - Dict[str, Any]: A dictionary containing the generation results. The structure includes: - - 'success' (bool): Indicates if the generation was successful. - - 'status' (str): The final status of the generation job. - - 'data' (Dict): The generated LLMs.txt data. 
- - 'error' (Optional[str]): Error message if the generation failed. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires. + GenerateLLMsTextStatusResponse with: + * Generated LLMs.txt content + * Full version if requested + * Generation status + * Success/error information Raises: - Exception: If the generation job fails or an error occurs during status checks. + Exception: If generation fails """ if params is None: params = {} @@ -850,18 +1230,25 @@ class FirecrawlApp: return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'} - def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]: + def async_generate_llms_text( + self, + url: str, + params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse: """ Initiate an asynchronous LLMs.txt generation operation. Args: - url (str): The URL to generate LLMs.txt from. - params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation. + url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL. + params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Generation configuration parameters: + - maxUrls (int, optional): Maximum number of URLs to process (default: 10) + - showFullText (bool, optional): Include full text in output (default: False) + - __experimental_stream (bool, optional): Enable streaming of generation progress Returns: - Dict[str, Any]: A dictionary containing the generation initiation response. The structure includes: - - 'success' (bool): Indicates if the generation initiation was successful. - - 'id' (str): The unique identifier for the generation job. + GenerateLLMsTextResponse: A response containing: + - success (bool): Whether the generation initiation was successful + - id (str): The unique identifier for the generation job + - error (str, optional): Error message if initiation failed Raises: Exception: If the generation job initiation fails. @@ -891,15 +1278,22 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} - def check_generate_llms_text_status(self, id: str) -> Dict[str, Any]: + def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse: """ Check the status of a LLMs.txt generation operation. Args: - id (str): The ID of the LLMs.txt generation operation. + id (str): The unique identifier of the LLMs.txt generation job to check status for. Returns: - Dict[str, Any]: The current status and results of the generation operation. + GenerateLLMsTextStatusResponse: A response containing: + - success (bool): Whether the generation was successful + - status (str): Status of generation ("processing", "completed", "failed") + - data (Dict[str, str], optional): Generated text with fields: + - llmstxt (str): Generated LLMs.txt content + - llmsfulltxt (str, optional): Full version if requested + - error (str, optional): Error message if generation failed + - expiresAt (str): When the generated data expires Raises: Exception: If the status check fails. @@ -921,7 +1315,9 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} - def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: + def _prepare_headers( + self, + idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. 
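Editor's note: a hedged usage sketch for the LLMs.txt helpers documented above. The API key and URL are placeholders; the `GenerateLLMsTextParams` field names and the dict-shaped status payload follow the docstrings and the implementation shown here, not a guaranteed public contract.

```python
from firecrawl.firecrawl import FirecrawlApp, GenerateLLMsTextParams

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

params = GenerateLLMsTextParams(maxUrls=5, showFullText=True)
status = app.generate_llms_text("https://docs.firecrawl.dev", params)

# The helper polls until the job finishes; the payload mirrors
# GenerateLLMsTextStatusResponse (llmstxt, plus llmsfulltxt when showFullText is set).
if status.get("success"):
    print(status["data"]["llmstxt"])
else:
    print("LLMs.txt generation failed:", status.get("error"))
```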
@@ -943,11 +1339,13 @@ class FirecrawlApp: 'Authorization': f'Bearer {self.api_key}', } - def _post_request(self, url: str, - data: Dict[str, Any], - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> requests.Response: + def _post_request( + self, + url: str, + data: Dict[str, Any], + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> requests.Response: """ Make a POST request with retries. @@ -972,10 +1370,12 @@ class FirecrawlApp: return response return response - def _get_request(self, url: str, - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> requests.Response: + def _get_request( + self, + url: str, + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> requests.Response: """ Make a GET request with retries. @@ -999,10 +1399,12 @@ class FirecrawlApp: return response return response - def _delete_request(self, url: str, - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> requests.Response: + def _delete_request( + self, + url: str, + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> requests.Response: """ Make a DELETE request with retries. @@ -1026,16 +1428,21 @@ class FirecrawlApp: return response return response - def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any: + def _monitor_job_status( + self, + id: str, + headers: Dict[str, str], + poll_interval: int) -> CrawlStatusResponse: """ Monitor the status of a crawl job until completion. Args: id (str): The ID of the crawl job. headers (Dict[str, str]): The headers to include in the status check requests. - poll_interval (int): Secounds between status checks. + poll_interval (int): Seconds between status checks. + Returns: - Any: The crawl results if the job is completed successfully. + CrawlStatusResponse: The crawl results if the job is completed successfully. Raises: Exception: If the job fails or an error occurs during status checks. @@ -1073,7 +1480,10 @@ class FirecrawlApp: else: self._handle_error(status_response, 'check crawl status') - def _handle_error(self, response: requests.Response, action: str) -> None: + def _handle_error( + self, + response: requests.Response, + action: str) -> None: """ Handle errors from API responses. @@ -1105,22 +1515,47 @@ class FirecrawlApp: # Raise an HTTPError with the custom message and attach the response raise requests.exceptions.HTTPError(message, response=response) - def deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None, - on_activity: Optional[Callable[[Dict[str, Any]], None]] = None, - on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> Dict[str, Any]: + def deep_research( + self, + query: str, + params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None, + on_activity: Optional[Callable[[Dict[str, Any]], None]] = None, + on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse: """ Initiates a deep research operation on a given query and polls until completion. Args: - query (str): The query to research. - params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation. - on_activity (Optional[Callable[[Dict[str, Any]], None]]): Optional callback to receive activity updates in real-time. 
+ query: Research query or topic to investigate + + params: See DeepResearchParams model: + Research Settings: + * maxDepth - Maximum research depth (default: 7) + * timeLimit - Time limit in seconds (default: 270) + * maxUrls - Maximum URLs to process (default: 20) + + Callbacks: + * on_activity - Progress callback receiving: + {type, status, message, timestamp, depth} + * on_source - Source discovery callback receiving: + {url, title, description} Returns: - Dict[str, Any]: The final research results. + DeepResearchResponse containing: + + Status: + * success - Whether research completed successfully + * status - Current state (processing/completed/failed) + * error - Error message if failed + + Results: + * id - Unique identifier for the research job + * data - Research findings and analysis + * sources - List of discovered sources + * activities - Research progress log + * summaries - Generated research summaries Raises: - Exception: If the research operation fails. + Exception: If research fails """ if params is None: params = {} @@ -1164,16 +1599,26 @@ class FirecrawlApp: return {'success': False, 'error': 'Deep research job terminated unexpectedly'} - def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]: + def async_deep_research( + self, + query: str, + params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> DeepResearchResponse: """ Initiates an asynchronous deep research operation. Args: - query (str): The query to research. - params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation. + query (str): The research query to investigate. Should be a clear, specific question or topic. + params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Research configuration parameters: + - maxDepth (int, optional): Maximum depth of research exploration (default: 7) + - timeLimit (int, optional): Time limit in seconds for research (default: 270) + - maxUrls (int, optional): Maximum number of URLs to process (default: 20) + - __experimental_streamSteps (bool, optional): Enable streaming of research steps Returns: - Dict[str, Any]: The response from the deep research initiation. + DeepResearchResponse: A response containing: + - success (bool): Whether the research initiation was successful + - id (str): The unique identifier for the research job + - error (str, optional): Error message if initiation failed Raises: Exception: If the research initiation fails. @@ -1203,7 +1648,7 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} - def check_deep_research_status(self, id: str) -> Dict[str, Any]: + def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse: """ Check the status of a deep research operation. @@ -1211,7 +1656,19 @@ class FirecrawlApp: id (str): The ID of the deep research operation. Returns: - Dict[str, Any]: The current status and results of the research operation. + DeepResearchResponse containing: + + Status: + * success - Whether research completed successfully + * status - Current state (processing/completed/failed) + * error - Error message if failed + + Results: + * id - Unique identifier for the research job + * data - Research findings and analysis + * sources - List of discovered sources + * activities - Research progress log + * summaries - Generated research summaries Raises: Exception: If the status check fails. 
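Editor's note: a brief, hedged sketch of `deep_research` with the activity callback described above. The query, limits, and dict-style access to the result are illustrative assumptions based on the docstrings and the polling implementation in this patch.

```python
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

def on_activity(activity):
    # Progress events carry {type, status, message, timestamp, depth}.
    print(f"[{activity['type']}] {activity['message']}")

research = app.deep_research(
    "How do open-source scraping frameworks handle JavaScript-heavy sites?",  # illustrative query
    params={"maxDepth": 3, "timeLimit": 120, "maxUrls": 10},
    on_activity=on_activity,
)

print(research.get("status"), len(research.get("sources", [])), "sources")
```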
@@ -1232,8 +1689,18 @@ class FirecrawlApp: raise ValueError(str(e)) return {'success': False, 'error': 'Internal server error'} - class CrawlWatcher: + """ + A class to watch and handle crawl job events via WebSocket connection. + + Attributes: + id (str): The ID of the crawl job to watch + app (FirecrawlApp): The FirecrawlApp instance + data (List[Dict[str, Any]]): List of crawled documents/data + status (str): Current status of the crawl job + ws_url (str): WebSocket URL for the crawl job + event_handlers (dict): Dictionary of event type to list of handler functions + """ def __init__(self, id: str, app: FirecrawlApp): self.id = id self.app = app @@ -1246,25 +1713,54 @@ class CrawlWatcher: 'document': [] } - async def connect(self): + async def connect(self) -> None: + """ + Establishes WebSocket connection and starts listening for messages. + """ async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket: await self._listen(websocket) - async def _listen(self, websocket): + async def _listen(self, websocket) -> None: + """ + Listens for incoming WebSocket messages and handles them. + + Args: + websocket: The WebSocket connection object + """ async for message in websocket: msg = json.loads(message) await self._handle_message(msg) - def add_event_listener(self, event_type: str, handler): + def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None: + """ + Adds an event handler function for a specific event type. + + Args: + event_type (str): Type of event to listen for ('done', 'error', or 'document') + handler (Callable): Function to handle the event + """ if event_type in self.event_handlers: self.event_handlers[event_type].append(handler) - def dispatch_event(self, event_type: str, detail: Dict[str, Any]): + def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None: + """ + Dispatches an event to all registered handlers for that event type. + + Args: + event_type (str): Type of event to dispatch + detail (Dict[str, Any]): Event details/data to pass to handlers + """ if event_type in self.event_handlers: for handler in self.event_handlers[event_type]: handler(detail) - async def _handle_message(self, msg: Dict[str, Any]): + async def _handle_message(self, msg: Dict[str, Any]) -> None: + """ + Handles incoming WebSocket messages based on their type. 
+ + Args: + msg (Dict[str, Any]): The message to handle + """ if msg['type'] == 'done': self.status = 'completed' self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id}) From 3641070ece62d26a007f86385a4fe2aecfd96b16 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 13 Mar 2025 16:27:59 -0300 Subject: [PATCH 02/26] async --- apps/python-sdk/example.py | 4 +- apps/python-sdk/example_async.py | 168 +++ apps/python-sdk/firecrawl/firecrawl.py | 1770 +++++++++++++++++++++--- apps/python-sdk/requirements.txt | 3 +- 4 files changed, 1762 insertions(+), 183 deletions(-) create mode 100644 apps/python-sdk/example_async.py diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index fb960187..ae4258f7 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -47,7 +47,7 @@ while attempts > 0 and crawl_status['status'] != 'completed': attempts -= 1 time.sleep(1) -crawl_status = app.get_crawl_status(async_result['id']) +crawl_status = app.check_crawl_status(async_result['id']) print(crawl_status) # LLM Extraction: @@ -155,4 +155,4 @@ async def start_crawl_and_watch(): watcher.add_event_listener("done", on_done) # Start the watcher - await watcher.connect() + await watcher.connect() \ No newline at end of file diff --git a/apps/python-sdk/example_async.py b/apps/python-sdk/example_async.py new file mode 100644 index 00000000..7afe6a70 --- /dev/null +++ b/apps/python-sdk/example_async.py @@ -0,0 +1,168 @@ +import time +import nest_asyncio +import uuid +import asyncio +from firecrawl.firecrawl import AsyncFirecrawlApp +from pydantic import BaseModel, Field +from typing import List + +app = AsyncFirecrawlApp(api_key="fc-") + +async def example_scrape(): + # Scrape a website: + scrape_result = await app.scrape_url('firecrawl.dev') + print(scrape_result['markdown']) + +async def example_batch_scrape(): + # Batch scrape + urls = ['https://example.com', 'https://docs.firecrawl.dev'] + batch_scrape_params = { + 'formats': ['markdown', 'html'], + } + + # Synchronous batch scrape + batch_result = await app.batch_scrape_urls(urls, batch_scrape_params) + print("Synchronous Batch Scrape Result:") + print(batch_result['data'][0]['markdown']) + + # Asynchronous batch scrape + async_batch_result = await app.async_batch_scrape_urls(urls, batch_scrape_params) + print("\nAsynchronous Batch Scrape Result:") + print(async_batch_result) + +async def example_crawl(): + # Crawl a website: + idempotency_key = str(uuid.uuid4()) # optional idempotency key + crawl_result = await app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key) + print(crawl_result) + + # Asynchronous Crawl a website: + async_result = await app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "") + print(async_result) + + crawl_status = await app.check_crawl_status(async_result['id']) + print(crawl_status) + + attempts = 15 + while attempts > 0 and crawl_status['status'] != 'completed': + print(crawl_status) + crawl_status = await app.check_crawl_status(async_result['id']) + attempts -= 1 + await asyncio.sleep(1) # Use async sleep instead of time.sleep + + crawl_status = await app.check_crawl_status(async_result['id']) + print(crawl_status) + +async def example_llm_extraction(): + # Define schema to extract contents into using pydantic + class ArticleSchema(BaseModel): + title: str + points: int + by: str + commentsURL: str + + class TopArticlesSchema(BaseModel): + top: List[ArticleSchema] = 
Field(..., description="Top 5 stories") + + llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', { + 'formats': ['extract'], + 'extract': { + 'schema': TopArticlesSchema.model_json_schema() + } + }) + + print(llm_extraction_result['extract']) + + # Define schema to extract contents into using json schema + json_schema = { + "type": "object", + "properties": { + "top": { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "points": {"type": "number"}, + "by": {"type": "string"}, + "commentsURL": {"type": "string"} + }, + "required": ["title", "points", "by", "commentsURL"] + }, + "minItems": 5, + "maxItems": 5, + "description": "Top 5 stories on Hacker News" + } + }, + "required": ["top"] + } + + app2 = AsyncFirecrawlApp(api_key="fc-", version="v0") + + llm_extraction_result = await app2.scrape_url('https://news.ycombinator.com', { + 'extractorOptions': { + 'extractionSchema': json_schema, + 'mode': 'llm-extraction' + }, + 'pageOptions':{ + 'onlyMainContent': True + } + }) + +async def example_map_and_extract(): + # Map a website: + map_result = await app.map_url('https://firecrawl.dev', { 'search': 'blog' }) + print(map_result) + + # Extract URLs: + class ExtractSchema(BaseModel): + title: str + description: str + links: List[str] + + # Define the schema using Pydantic + extract_schema = ExtractSchema.schema() + + # Perform the extraction + extract_result = await app.extract(['https://firecrawl.dev'], { + 'prompt': "Extract the title, description, and links from the website", + 'schema': extract_schema + }) + print(extract_result) + +# Define event handlers for websocket +def on_document(detail): + print("DOC", detail) + +def on_error(detail): + print("ERR", detail['error']) + +def on_done(detail): + print("DONE", detail['status']) + +async def example_websocket_crawl(): + # Initiate the crawl job and get the watcher + watcher = await app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 }) + + # Add event listeners + watcher.add_event_listener("document", on_document) + watcher.add_event_listener("error", on_error) + watcher.add_event_listener("done", on_done) + + # Start the watcher + await watcher.connect() + +async def main(): + # Apply nest_asyncio to allow nested event loops + nest_asyncio.apply() + + # Run all the examples + await example_scrape() + await example_batch_scrape() + await example_crawl() + await example_llm_extraction() + await example_map_and_extract() + await example_websocket_crawl() + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d212dea7..e0f8c940 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -19,6 +19,8 @@ from datetime import datetime import requests import pydantic import websockets +import aiohttp +import asyncio logger : logging.Logger = logging.getLogger("firecrawl") @@ -326,21 +328,19 @@ class FirecrawlApp: Args: url (str): Target URL to scrape - params (Optional[Dict[str, Any]]): See ScrapeParams model for configuration: - Content Options: * formats - Content types to retrieve (markdown/html/etc) * includeTags - HTML tags to include * excludeTags - HTML tags to exclude * onlyMainContent - Extract main content only - + Request Options: * headers - Custom HTTP headers * timeout - Request timeout (ms) * mobile - Use mobile user agent * proxy - Proxy type (basic/stealth) - + Extraction Options: * extract 
- Content extraction settings * jsonOptions - JSON extraction settings @@ -348,7 +348,6 @@ class FirecrawlApp: Returns: ScrapeResponse with: - * Requested content formats * Page metadata * Extraction results @@ -465,7 +464,7 @@ class FirecrawlApp: raise Exception(f'Failed to parse Firecrawl response as JSON.') def crawl_url(self, url: str, - params: Optional[Dict[str, Any]] = None, + params: Optional[CrawlParams] = None, poll_interval: Optional[int] = 2, idempotency_key: Optional[str] = None) -> CrawlStatusResponse: """ @@ -473,9 +472,7 @@ class FirecrawlApp: Args: url (str): Target URL to start crawling from - - params (Optional[Dict[str, Any]]): See CrawlParams model for configuration: - + params (Optional[CrawlParams]): See CrawlParams model: URL Discovery: * includePaths - Patterns of URLs to include * excludePaths - Patterns of URLs to exclude @@ -494,10 +491,8 @@ class FirecrawlApp: * deduplicateSimilarURLs - Remove similar URLs * ignoreQueryParameters - Ignore URL parameters * regexOnFullURL - Apply regex to full URLs - - poll_interval: Seconds between status checks (default: 2) - - idempotency_key: Request deduplication key + poll_interval (int): Seconds between status checks (default: 2) + idempotency_key (Optional[str]): Unique key to prevent duplicate requests Returns: CrawlStatusResponse with: @@ -667,10 +662,19 @@ class FirecrawlApp: Returns information about crawl errors. Args: - id (str): The ID of the crawl job. + id (str): The ID of the crawl job Returns: - Dict[str, Any]: Information about crawl errors. + CrawlErrorsResponse containing: + * errors (List[Dict[str, str]]): List of errors with fields: + - id (str): Error ID + - timestamp (str): When the error occurred + - url (str): URL that caused the error + - error (str): Error message + * robotsBlocked (List[str]): List of URLs blocked by robots.txt + + Raises: + Exception: If error check fails """ headers = self._prepare_headers() response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers) @@ -684,13 +688,18 @@ class FirecrawlApp: def cancel_crawl(self, id: str) -> Dict[str, Any]: """ - Cancel an asynchronous crawl job using the Firecrawl API. + Cancel an asynchronous crawl job. Args: - id (str): The ID of the crawl job to cancel. + id (str): The ID of the crawl job to cancel Returns: - Dict[str, Any]: The response from the cancel crawl request. + Dict[str, Any] containing: + * success (bool): Whether cancellation was successful + * error (str, optional): Error message if cancellation failed + + Raises: + Exception: If cancellation fails """ headers = self._prepare_headers() response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers) @@ -702,17 +711,42 @@ class FirecrawlApp: else: self._handle_error(response, "cancel crawl job") - def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + def crawl_url_and_watch( + self, + url: str, + params: Optional[CrawlParams] = None, + idempotency_key: Optional[str] = None) -> 'CrawlWatcher': """ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket. Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. 
+ url (str): Target URL to start crawling from + params (Optional[CrawlParams]): See CrawlParams model for configuration: + URL Discovery: + * includePaths - Patterns of URLs to include + * excludePaths - Patterns of URLs to exclude + * maxDepth - Maximum crawl depth + * maxDiscoveryDepth - Maximum depth for finding new URLs + * limit - Maximum pages to crawl + + Link Following: + * allowBackwardLinks - Follow parent directory links + * allowExternalLinks - Follow external domain links + * ignoreSitemap - Skip sitemap.xml processing + + Advanced: + * scrapeOptions - Page scraping configuration + * webhook - Notification webhook settings + * deduplicateSimilarURLs - Remove similar URLs + * ignoreQueryParameters - Ignore URL parameters + * regexOnFullURL - Apply regex to full URLs + idempotency_key (Optional[str]): Unique key to prevent duplicate requests Returns: - CrawlWatcher: An instance of CrawlWatcher to monitor the crawl job. + AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket + + Raises: + Exception: If crawl job fails to start """ crawl_response = self.async_crawl_url(url, params, idempotency_key) if crawl_response['success'] and 'id' in crawl_response: @@ -725,27 +759,27 @@ class FirecrawlApp: Map and discover links from a URL. Args: - url: Target URL to map + url: Target URL to map - params: See MapParams model: + params: See MapParams model: - Discovery Options: - * search - Filter pattern for URLs - * ignoreSitemap - Skip sitemap.xml - * includeSubdomains - Include subdomain links - * sitemapOnly - Only use sitemap.xml - - Limits: - * limit - Max URLs to return - * timeout - Request timeout (ms) + Discovery Options: + * search - Filter pattern for URLs + * ignoreSitemap - Skip sitemap.xml + * includeSubdomains - Include subdomain links + * sitemapOnly - Only use sitemap.xml + + Limits: + * limit - Max URLs to return + * timeout - Request timeout (ms) Returns: - MapResponse with: - * Discovered URLs - * Success/error status + MapResponse with: + * Discovered URLs + * Success/error status Raises: - Exception: If mapping fails + Exception: If mapping fails """ endpoint = f'/v1/map' headers = self._prepare_headers() @@ -776,46 +810,40 @@ class FirecrawlApp: self._handle_error(response, 'map') def batch_scrape_urls(self, urls: List[str], - params: Optional[Dict[str, Any]] = None, + params: Optional[ScrapeParams] = None, poll_interval: Optional[int] = 2, idempotency_key: Optional[str] = None) -> BatchScrapeStatusResponse: """ Batch scrape multiple URLs and monitor until completion. 
Args: - urls: URLs to scrape - - params: See ScrapeParams model: - - Content Options: - * formats - Content formats to retrieve - * includeTags - HTML tags to include - * excludeTags - HTML tags to exclude - * onlyMainContent - Extract main content only + urls (List[str]): URLs to scrape + params (Optional[ScrapeParams]): See ScrapeParams model: + Content Options: + * formats - Content formats to retrieve + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only - Request Options: - * headers - Custom HTTP headers - * timeout - Request timeout (ms) - * mobile - Use mobile user agent - * proxy - Proxy type - - Extraction Options: - * extract - Content extraction config - * jsonOptions - JSON extraction config - * actions - Actions to perform - - poll_interval: Seconds between status checks (default: 2) - - idempotency_key: Request deduplication key + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type + + Extraction Options: + * extract - Content extraction config + * jsonOptions - JSON extraction config + * actions - Actions to perform Returns: - BatchScrapeStatusResponse with: - * Scraping status and progress - * Scraped content for each URL - * Success/error information + BatchScrapeStatusResponse with: + * Scraping status and progress + * Scraped content for each URL + * Success/error information Raises: - Exception: If batch scrape fails + Exception: If batch scrape fails """ endpoint = f'/v1/batch/scrape' headers = self._prepare_headers(idempotency_key) @@ -837,21 +865,41 @@ class FirecrawlApp: def async_batch_scrape_urls( self, urls: List[str], - params: Optional[Dict[str, Any]] = None, + params: Optional[ScrapeParams] = None, idempotency_key: Optional[str] = None) -> BatchScrapeResponse: """ Initiate a batch scrape job asynchronously. Args: - urls (List[str]): The URLs to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scraper. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + urls (List[str]): List of URLs to scrape + params (Optional[ScrapeParams]): See ScrapeParams model for configuration: + Content Options: + * formats - Content formats to retrieve + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type + + Extraction Options: + * extract - Content extraction config + * jsonOptions - JSON extraction config + * actions - Actions to perform + idempotency_key (Optional[str]): Unique key to prevent duplicate requests Returns: - BatchScrapeResponse: A dictionary containing the batch scrape initiation response. The structure includes: - - 'success' (bool): Indicates if the batch scrape initiation was successful. - - 'id' (str): The unique identifier for the batch scrape job. - - 'url' (str): The URL to check the status of the batch scrape job. 
+ BatchScrapeResponse with: + * success - Whether job started successfully + * id - Unique identifier for the job + * url - Status check URL + * error - Error message if start failed + + Raises: + Exception: If job initiation fails """ endpoint = f'/v1/batch/scrape' headers = self._prepare_headers(idempotency_key) @@ -876,12 +924,32 @@ class FirecrawlApp: Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket. Args: - urls (List[str]): The URLs to scrape. - params (Optional[ScrapeParams]): Additional parameters for the scraper. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + urls (List[str]): List of URLs to scrape + params (Optional[ScrapeParams]): See ScrapeParams model for configuration: + + Content Options: + * formats - Content formats to retrieve + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type + + Extraction Options: + * extract - Content extraction config + * jsonOptions - JSON extraction config + * actions - Actions to perform + idempotency_key (Optional[str]): Unique key to prevent duplicate requests Returns: - CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job. + AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket + + Raises: + Exception: If batch scrape job fails to start """ crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key) if crawl_response['success'] and 'id' in crawl_response: @@ -964,16 +1032,16 @@ class FirecrawlApp: Returns information about batch scrape errors. Args: - id (str): The ID of the crawl job. + id (str): The ID of the crawl job. 
Returns: CrawlErrorsResponse: A response containing: - - errors (List[Dict[str, str]]): List of errors with fields: - - id (str): Error ID - - timestamp (str): When the error occurred - - url (str): URL that caused the error - - error (str): Error message - - robotsBlocked (List[str]): List of URLs blocked by robots.txt + * errors (List[Dict[str, str]]): List of errors with fields: + * id (str): Error ID + * timestamp (str): When the error occurred + * url (str): URL that caused the error + * error (str): Error message + * robotsBlocked (List[str]): List of URLs blocked by robots.txt """ headers = self._prepare_headers() response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers) @@ -997,19 +1065,19 @@ class FirecrawlApp: params: See ExtractParams model: - Extraction Config: - * prompt - Custom extraction prompt - * schema - JSON schema/Pydantic model - * systemPrompt - System context - - Behavior Options: - * allowExternalLinks - Follow external links - * enableWebSearch - Enable web search - * includeSubdomains - Include subdomains - * showSources - Include source URLs - - Scraping Options: - * scrapeOptions - Page scraping config + Extraction Config: + * prompt - Custom extraction prompt + * schema - JSON schema/Pydantic model + * systemPrompt - System context + + Behavior Options: + * allowExternalLinks - Follow external links + * enableWebSearch - Enable web search + * includeSubdomains - Include subdomains + * showSources - Include source URLs + + Scraping Options: + * scrapeOptions - Page scraping config Returns: ExtractResponse with: @@ -1120,32 +1188,40 @@ class FirecrawlApp: except Exception as e: raise ValueError(str(e), 500) - def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: + def async_extract( + self, + urls: List[str], + params: Optional[ExtractParams] = None, + idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: """ Initiate an asynchronous extract job. Args: - urls (List[str]): List of URLs to extract information from. Must be valid HTTP/HTTPS URLs. - params (Optional[Dict[str, Any]]): Extraction configuration parameters: - - prompt (str, optional): Custom prompt for extraction - - schema (Any, optional): JSON schema or Pydantic model for structured extraction - - systemPrompt (str, optional): System prompt for extraction - - allowExternalLinks (bool, optional): Allow following external links - - enableWebSearch (bool, optional): Enable web search during extraction - - includeSubdomains (bool, optional): Include content from subdomains - - origin (str, optional): Source of the extraction request - - showSources (bool, optional): Include source URLs in response - - scrapeOptions (CrawlScrapeOptions, optional): Configuration for scraping pages - idempotency_key (Optional[str]): Unique identifier to prevent duplicate requests. 
+ urls (List[str]): URLs to extract information from + params (Optional[ExtractParams]): See ExtractParams model: + Extraction Config: + * prompt - Custom extraction prompt + * schema - JSON schema/Pydantic model + * systemPrompt - System context + + Behavior Options: + * allowExternalLinks - Follow external links + * enableWebSearch - Enable web search + * includeSubdomains - Include subdomains + * showSources - Include source URLs + + Scraping Options: + * scrapeOptions - Page scraping config + idempotency_key (Optional[str]): Unique key to prevent duplicate requests Returns: - ExtractResponse[Any]: A response containing: - - success (bool): Whether the extraction initiation was successful - - id (str): The unique identifier for the extract job - - error (str, optional): Error message if initiation failed + ExtractResponse containing: + * success (bool): Whether job started successfully + * id (str): Unique identifier for the job + * error (str, optional): Error message if start failed Raises: - ValueError: If neither prompt nor schema is provided, or if there is an error during initiation. + ValueError: If job initiation fails """ headers = self._prepare_headers(idempotency_key) @@ -1184,24 +1260,26 @@ class FirecrawlApp: Generate LLMs.txt for a given URL and poll until completion. Args: - url: Target URL to generate LLMs.txt from + url: Target URL to generate LLMs.txt from params: See GenerateLLMsTextParams model: + params: See GenerateLLMsTextParams model: - Generation Options: - * maxUrls - Maximum URLs to process (default: 10) - * showFullText - Include full text in output (default: False) - * __experimental_stream - Enable streaming of generation progress + params: See GenerateLLMsTextParams model: + + Generation Options: + * maxUrls - Maximum URLs to process (default: 10) + * showFullText - Include full text in output (default: False) Returns: - GenerateLLMsTextStatusResponse with: - * Generated LLMs.txt content - * Full version if requested - * Generation status - * Success/error information + GenerateLLMsTextStatusResponse with: + * Generated LLMs.txt content + * Full version if requested + * Generation status + * Success/error information Raises: - Exception: If generation fails + Exception: If generation fails """ if params is None: params = {} @@ -1238,20 +1316,19 @@ class FirecrawlApp: Initiate an asynchronous LLMs.txt generation operation. Args: - url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL. - params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Generation configuration parameters: - - maxUrls (int, optional): Maximum number of URLs to process (default: 10) - - showFullText (bool, optional): Include full text in output (default: False) - - __experimental_stream (bool, optional): Enable streaming of generation progress + url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL. 
+ params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Generation configuration parameters: + * maxUrls (int, optional): Maximum number of URLs to process (default: 10) + * showFullText (bool, optional): Include full text in output (default: False) Returns: - GenerateLLMsTextResponse: A response containing: - - success (bool): Whether the generation initiation was successful - - id (str): The unique identifier for the generation job - - error (str, optional): Error message if initiation failed + GenerateLLMsTextResponse: A response containing: + - success (bool): Whether the generation initiation was successful + - id (str): The unique identifier for the generation job + - error (str, optional): Error message if initiation failed Raises: - Exception: If the generation job initiation fails. + Exception: If the generation job initiation fails. """ if params is None: params = {} @@ -1283,20 +1360,20 @@ class FirecrawlApp: Check the status of a LLMs.txt generation operation. Args: - id (str): The unique identifier of the LLMs.txt generation job to check status for. + id (str): The unique identifier of the LLMs.txt generation job to check status for. Returns: - GenerateLLMsTextStatusResponse: A response containing: - - success (bool): Whether the generation was successful - - status (str): Status of generation ("processing", "completed", "failed") - - data (Dict[str, str], optional): Generated text with fields: - - llmstxt (str): Generated LLMs.txt content - - llmsfulltxt (str, optional): Full version if requested - - error (str, optional): Error message if generation failed - - expiresAt (str): When the generated data expires + GenerateLLMsTextStatusResponse: A response containing: + * success (bool): Whether the generation was successful + * status (str): Status of generation ("processing", "completed", "failed") + * data (Dict[str, str], optional): Generated text with fields: + * llmstxt (str): Generated LLMs.txt content + * llmsfulltxt (str, optional): Full version if requested + * error (str, optional): Error message if generation failed + * expiresAt (str): When the generated data expires Raises: - Exception: If the status check fails. + Exception: If the status check fails. """ headers = self._prepare_headers() try: @@ -1525,37 +1602,37 @@ class FirecrawlApp: Initiates a deep research operation on a given query and polls until completion. 
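A minimal sketch of the blocking LLMs.txt generation flow documented above (generate_llms_text starts the job and polls internally), assuming a placeholder API key and dict-style access to the returned status fields:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

# Blocks until the generation job completes or fails.
result = app.generate_llms_text("https://firecrawl.dev", {"maxUrls": 5, "showFullText": True})
if result.get("status") == "completed":
    print(result["data"]["llmstxt"])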
Args: - query: Research query or topic to investigate + query: Research query or topic to investigate - params: See DeepResearchParams model: - Research Settings: - * maxDepth - Maximum research depth (default: 7) - * timeLimit - Time limit in seconds (default: 270) - * maxUrls - Maximum URLs to process (default: 20) + params: See DeepResearchParams model: + Research Settings: + * maxDepth - Maximum research depth (default: 7) + * timeLimit - Time limit in seconds (default: 270) + * maxUrls - Maximum URLs to process (default: 20) - Callbacks: - * on_activity - Progress callback receiving: - {type, status, message, timestamp, depth} - * on_source - Source discovery callback receiving: - {url, title, description} + Callbacks: + * on_activity - Progress callback receiving: + {type, status, message, timestamp, depth} + * on_source - Source discovery callback receiving: + {url, title, description} Returns: - DeepResearchResponse containing: + DeepResearchResponse containing: - Status: - * success - Whether research completed successfully - * status - Current state (processing/completed/failed) - * error - Error message if failed - - Results: - * id - Unique identifier for the research job - * data - Research findings and analysis - * sources - List of discovered sources - * activities - Research progress log - * summaries - Generated research summaries + Status: + * success - Whether research completed successfully + * status - Current state (processing/completed/failed) + * error - Error message if failed + + Results: + * id - Unique identifier for the research job + * data - Research findings and analysis + * sources - List of discovered sources + * activities - Research progress log + * summaries - Generated research summaries Raises: - Exception: If research fails + Exception: If research fails """ if params is None: params = {} @@ -1609,16 +1686,15 @@ class FirecrawlApp: Args: query (str): The research query to investigate. Should be a clear, specific question or topic. params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Research configuration parameters: - - maxDepth (int, optional): Maximum depth of research exploration (default: 7) - - timeLimit (int, optional): Time limit in seconds for research (default: 270) - - maxUrls (int, optional): Maximum number of URLs to process (default: 20) - - __experimental_streamSteps (bool, optional): Enable streaming of research steps + * maxDepth (int, optional): Maximum depth of research exploration (default: 7) + * timeLimit (int, optional): Time limit in seconds for research (default: 270) + * maxUrls (int, optional): Maximum number of URLs to process (default: 20) Returns: - DeepResearchResponse: A response containing: - - success (bool): Whether the research initiation was successful - - id (str): The unique identifier for the research job - - error (str, optional): Error message if initiation failed + DeepResearchResponse: A response containing: + * success (bool): Whether the research initiation was successful + * id (str): The unique identifier for the research job + * error (str, optional): Error message if initiation failed Raises: Exception: If the research initiation fails. @@ -1689,6 +1765,7 @@ class FirecrawlApp: raise ValueError(str(e)) return {'success': False, 'error': 'Internal server error'} + class CrawlWatcher: """ A class to watch and handle crawl job events via WebSocket connection. 
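A minimal sketch of the deep_research call with the progress callbacks described in the docstring above, assuming a placeholder API key; the callback payload shapes follow the docstring ({type, status, message, timestamp, depth} and {url, title, description}):

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

def on_activity(activity):
    # Progress events emitted while the research job runs.
    print(f"[{activity['type']}] {activity['message']}")

def on_source(source):
    # Fired as new sources are discovered.
    print(f"source: {source['url']}")

results = app.deep_research(
    "What are the latest developments in open-source LLMs?",
    params={"maxDepth": 3, "timeLimit": 120, "maxUrls": 10},
    on_activity=on_activity,
    on_source=on_source,
)
if results.get("success"):
    print(results["data"])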
@@ -1775,3 +1852,1336 @@ class CrawlWatcher: elif msg['type'] == 'document': self.data.append(msg['data']) self.dispatch_event('document', {'data': msg['data'], 'id': self.id}) + +class AsyncFirecrawlApp(FirecrawlApp): + """ + Asynchronous version of FirecrawlApp that implements async methods using aiohttp. + Provides non-blocking alternatives to all FirecrawlApp operations. + """ + + async def _async_post_request( + self, + url: str, + data: Dict[str, Any], + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> Dict[str, Any]: + """ + Make an async POST request with exponential backoff retry logic. + + Args: + url (str): The URL to send the POST request to + data (Dict[str, Any]): The JSON data to include in the request body + headers (Dict[str, str]): Headers to include in the request + retries (int): Maximum number of retry attempts (default: 3) + backoff_factor (float): Factor to calculate delay between retries (default: 0.5) + Delay will be backoff_factor * (2 ** retry_count) + + Returns: + Dict[str, Any]: The parsed JSON response from the server + + Raises: + aiohttp.ClientError: If the request fails after all retries + Exception: If max retries are exceeded or other errors occur + """ + async with aiohttp.ClientSession() as session: + for attempt in range(retries): + try: + async with session.post(url, headers=headers, json=data) as response: + if response.status == 502: + await asyncio.sleep(backoff_factor * (2 ** attempt)) + continue + if response.status != 200: + await self._handle_error(response, "make POST request") + return await response.json() + except aiohttp.ClientError as e: + if attempt == retries - 1: + raise e + await asyncio.sleep(backoff_factor * (2 ** attempt)) + raise Exception("Max retries exceeded") + + async def _async_get_request( + self, + url: str, + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> Dict[str, Any]: + """ + Make an async GET request with exponential backoff retry logic. + + Args: + url (str): The URL to send the GET request to + headers (Dict[str, str]): Headers to include in the request + retries (int): Maximum number of retry attempts (default: 3) + backoff_factor (float): Factor to calculate delay between retries (default: 0.5) + Delay will be backoff_factor * (2 ** retry_count) + + Returns: + Dict[str, Any]: The parsed JSON response from the server + + Raises: + aiohttp.ClientError: If the request fails after all retries + Exception: If max retries are exceeded or other errors occur + """ + async with aiohttp.ClientSession() as session: + for attempt in range(retries): + try: + async with session.get(url, headers=headers) as response: + if response.status == 502: + await asyncio.sleep(backoff_factor * (2 ** attempt)) + continue + if response.status != 200: + await self._handle_error(response, "make GET request") + return await response.json() + except aiohttp.ClientError as e: + if attempt == retries - 1: + raise e + await asyncio.sleep(backoff_factor * (2 ** attempt)) + raise Exception("Max retries exceeded") + + async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None: + """ + Handle errors from async API responses with detailed error messages. 
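To make the retry behaviour of the async request helpers above concrete: each retry sleeps backoff_factor * (2 ** attempt) seconds, so the defaults (retries=3, backoff_factor=0.5) give the schedule below.

# Delays applied after a 502 or transient client error, per the helpers above.
for attempt in range(3):
    delay = 0.5 * (2 ** attempt)  # 0.5s, 1.0s, 2.0s
    print(f"retry {attempt + 1} after {delay}s")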
+ + Args: + response (aiohttp.ClientResponse): The response object from the failed request + action (str): Description of the action that was being attempted + + Raises: + aiohttp.ClientError: With a detailed error message based on the response status: + - 402: Payment Required + - 408: Request Timeout + - 409: Conflict + - 500: Internal Server Error + - Other: Unexpected error with status code + """ + try: + error_data = await response.json() + error_message = error_data.get('error', 'No error message provided.') + error_details = error_data.get('details', 'No additional error details provided.') + except: + raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}') + + if response.status == 402: + message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" + elif response.status == 408: + message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}" + elif response.status == 409: + message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" + elif response.status == 500: + message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" + else: + message = f"Unexpected error during {action}: Status code {response.status}. {error_message} - {error_details}" + + raise aiohttp.ClientError(message) + + async def crawl_url_and_watch( + self, + url: str, + params: Optional[CrawlParams] = None, + idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher': + """ + Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket. + + Args: + url (str): Target URL to start crawling from + params (Optional[CrawlParams]): See CrawlParams model for configuration: + URL Discovery: + * includePaths - Patterns of URLs to include + * excludePaths - Patterns of URLs to exclude + * maxDepth - Maximum crawl depth + * maxDiscoveryDepth - Maximum depth for finding new URLs + * limit - Maximum pages to crawl + + Link Following: + * allowBackwardLinks - Follow parent directory links + * allowExternalLinks - Follow external domain links + * ignoreSitemap - Skip sitemap.xml processing + + Advanced: + * scrapeOptions - Page scraping configuration + * webhook - Notification webhook settings + * deduplicateSimilarURLs - Remove similar URLs + * ignoreQueryParameters - Ignore URL parameters + * regexOnFullURL - Apply regex to full URLs + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + + Returns: + AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket + + Raises: + Exception: If crawl job fails to start + """ + crawl_response = await self.async_crawl_url(url, params, idempotency_key) + if crawl_response.get('success') and 'id' in crawl_response: + return AsyncCrawlWatcher(crawl_response['id'], self) + else: + raise Exception("Crawl job failed to start") + + async def batch_scrape_urls_and_watch( + self, + urls: List[str], + params: Optional[ScrapeParams] = None, + idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher': + """ + Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress. 
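A minimal sketch of the async WebSocket-watching flow documented above, assuming a placeholder API key; crawl_url_and_watch returns an AsyncCrawlWatcher whose connect() coroutine streams events until the job finishes:

import asyncio
from firecrawl import AsyncFirecrawlApp

app = AsyncFirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

async def watch_crawl():
    # Start the crawl, then listen for progress over WebSocket until done.
    watcher = await app.crawl_url_and_watch("https://firecrawl.dev", {"limit": 5})
    await watcher.connect()

asyncio.run(watch_crawl())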
+ + Args: + urls (List[str]): List of URLs to scrape + params (Optional[ScrapeParams]): See ScrapeParams model for configuration: + + Content Options: + * formats - Content formats to retrieve + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type + + Extraction Options: + * extract - Content extraction config + * jsonOptions - JSON extraction config + * actions - Actions to perform + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + + Returns: + AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket + + Raises: + Exception: If batch scrape job fails to start + """ + batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key) + if batch_response.get('success') and 'id' in batch_response: + return AsyncCrawlWatcher(batch_response['id'], self) + else: + raise Exception("Batch scrape job failed to start") + + async def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> ScrapeResponse[Any]: + """ + Asynchronously scrape and extract content from a URL. + + Args: + url (str): Target URL to scrape + params (Optional[Dict[str, Any]]): See ScrapeParams model for configuration: + Content Options: + * formats - Content types to retrieve (markdown/html/etc) + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type (basic/stealth) + + Extraction Options: + * extract - Content extraction settings + * jsonOptions - JSON extraction settings + * actions - Actions to perform + + Returns: + ScrapeResponse with: + * Requested content formats + * Page metadata + * Extraction results + * Success/error status + + Raises: + Exception: If scraping fails + """ + headers = self._prepare_headers() + scrape_params = {'url': url} + + if params: + extract = params.get('extract', {}) + if extract: + if 'schema' in extract and hasattr(extract['schema'], 'schema'): + extract['schema'] = extract['schema'].schema() + scrape_params['extract'] = extract + + for key, value in params.items(): + if key not in ['extract']: + scrape_params[key] = value + + endpoint = f'/v1/scrape' + response = await self._async_post_request( + f'{self.api_url}{endpoint}', + scrape_params, + headers + ) + + if response.get('success') and 'data' in response: + return response['data'] + elif "error" in response: + raise Exception(f'Failed to scrape URL. Error: {response["error"]}') + else: + raise Exception(f'Failed to scrape URL. Error: {response}') + + async def batch_scrape_urls(self, urls: List[str], params: Optional[ScrapeParams] = None) -> BatchScrapeStatusResponse: + """ + Asynchronously scrape multiple URLs and monitor until completion. 
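A minimal sketch of the async scrape_url call documented above, assuming a placeholder API key; the returned document is accessed as a dict, as in the SDK's own example script:

import asyncio
from firecrawl import AsyncFirecrawlApp

app = AsyncFirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

async def scrape_one():
    # Request markdown and links for a single page.
    data = await app.scrape_url("https://firecrawl.dev", {"formats": ["markdown", "links"]})
    print(data.get("markdown", "")[:200])

asyncio.run(scrape_one())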
+ + Args: + urls (List[str]): URLs to scrape + params (Optional[ScrapeParams]): See ScrapeParams model: + Content Options: + * formats - Content formats to retrieve + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type + + Extraction Options: + * extract - Content extraction config + * jsonOptions - JSON extraction config + * actions - Actions to perform + + Returns: + BatchScrapeStatusResponse with: + * Scraping status and progress + * Scraped content for each URL + * Success/error information + + Raises: + Exception: If batch scrape fails + """ + headers = self._prepare_headers() + json_data = {'urls': urls} + if params: + json_data.update(params) + + endpoint = f'/v1/batch/scrape' + response = await self._async_post_request( + f'{self.api_url}{endpoint}', + json_data, + headers + ) + + if response.get('success') and 'id' in response: + return await self._async_monitor_job_status(response['id'], headers) + else: + raise Exception(f'Failed to start batch scrape. Error: {response.get("error")}') + + async def async_batch_scrape_urls( + self, + urls: List[str], + params: Optional[ScrapeParams] = None, + idempotency_key: Optional[str] = None) -> BatchScrapeResponse: + """ + Initiate an asynchronous batch scrape job without waiting for completion. + + Args: + urls (List[str]): List of URLs to scrape + params (Optional[ScrapeParams]): See ScrapeParams model for configuration: + Content Options: + * formats - Content formats to retrieve + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type + + Extraction Options: + * extract - Content extraction config + * jsonOptions - JSON extraction config + * actions - Actions to perform + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + + Returns: + BatchScrapeResponse with: + * success - Whether job started successfully + * id - Unique identifier for the job + * url - Status check URL + * error - Error message if start failed + + Raises: + Exception: If job initiation fails + """ + headers = self._prepare_headers(idempotency_key) + json_data = {'urls': urls} + if params: + json_data.update(params) + + endpoint = f'/v1/batch/scrape' + return await self._async_post_request( + f'{self.api_url}{endpoint}', + json_data, + headers + ) + + async def crawl_url( + self, + url: str, + params: Optional[CrawlParams] = None, + poll_interval: int = 2, + idempotency_key: Optional[str] = None) -> CrawlStatusResponse: + """ + Asynchronously crawl a website starting from a URL and monitor until completion. 
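A minimal sketch of the monitored async batch scrape documented above, assuming a placeholder API key and example URLs; batch_scrape_urls only returns once every URL has been processed:

import asyncio
from firecrawl import AsyncFirecrawlApp

app = AsyncFirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

async def batch():
    # Waits (asynchronously) until all URLs are scraped, then inspects the results.
    status = await app.batch_scrape_urls(
        ["https://firecrawl.dev", "https://docs.firecrawl.dev"],
        {"formats": ["markdown"]},
    )
    for doc in status.get("data", []):
        print(doc.get("metadata", {}).get("sourceURL"))

asyncio.run(batch())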
+ + Args: + url (str): Target URL to start crawling from + params (Optional[CrawlParams]): See CrawlParams model: + URL Discovery: + * includePaths - Patterns of URLs to include + * excludePaths - Patterns of URLs to exclude + * maxDepth - Maximum crawl depth + * maxDiscoveryDepth - Maximum depth for finding new URLs + * limit - Maximum pages to crawl + + Link Following: + * allowBackwardLinks - Follow parent directory links + * allowExternalLinks - Follow external domain links + * ignoreSitemap - Skip sitemap.xml processing + + Advanced: + * scrapeOptions - Page scraping configuration + * webhook - Notification webhook settings + * deduplicateSimilarURLs - Remove similar URLs + * ignoreQueryParameters - Ignore URL parameters + * regexOnFullURL - Apply regex to full URLs + poll_interval (int): Seconds between status checks (default: 2) + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + + Returns: + CrawlStatusResponse with: + * Crawling status and progress + * Crawled page contents + * Success/error information + + Raises: + Exception: If crawl fails + """ + headers = self._prepare_headers(idempotency_key) + json_data = {'url': url} + if params: + json_data.update(params) + + endpoint = f'/v1/crawl' + response = await self._async_post_request( + f'{self.api_url}{endpoint}', + json_data, + headers + ) + + if response.get('success') and 'id' in response: + return await self._async_monitor_job_status(response['id'], headers, poll_interval) + else: + raise Exception(f'Failed to start crawl. Error: {response.get("error")}') + + async def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> CrawlResponse: + """ + Initiate an asynchronous crawl job without waiting for completion. + + Args: + url (str): Target URL to start crawling from + params (Optional[Dict[str, Any]]): See CrawlParams model: + URL Discovery: + * includePaths - Patterns of URLs to include + * excludePaths - Patterns of URLs to exclude + * maxDepth - Maximum crawl depth + * maxDiscoveryDepth - Maximum depth for finding new URLs + * limit - Maximum pages to crawl + + Link Following: + * allowBackwardLinks - Follow parent directory links + * allowExternalLinks - Follow external domain links + * ignoreSitemap - Skip sitemap.xml processing + + Advanced: + * scrapeOptions - Page scraping configuration + * webhook - Notification webhook settings + * deduplicateSimilarURLs - Remove similar URLs + * ignoreQueryParameters - Ignore URL parameters + * regexOnFullURL - Apply regex to full URLs + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + + Returns: + CrawlResponse with: + * success - Whether job started successfully + * id - Unique identifier for the job + * url - Status check URL + * error - Error message if start failed + + Raises: + Exception: If job initiation fails + """ + headers = self._prepare_headers(idempotency_key) + json_data = {'url': url} + if params: + json_data.update(params) + + endpoint = f'/v1/crawl' + return await self._async_post_request( + f'{self.api_url}{endpoint}', + json_data, + headers + ) + + async def check_crawl_status(self, id: str) -> CrawlStatusResponse: + """ + Check the status and results of an asynchronous crawl job. 
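A minimal sketch of the fire-and-forget crawl pattern documented above, assuming a placeholder API key: async_crawl_url starts the job, and check_crawl_status is polled on the caller's own schedule:

import asyncio
from firecrawl import AsyncFirecrawlApp

app = AsyncFirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

async def fire_and_forget_crawl():
    # Start the job without waiting for completion.
    job = await app.async_crawl_url("https://firecrawl.dev", {"limit": 10, "maxDepth": 2})
    while True:
        status = await app.check_crawl_status(job["id"])
        if status["status"] in ("completed", "failed", "cancelled"):
            break
        await asyncio.sleep(5)
    print(status["status"], status.get("completed"), "/", status.get("total"))

asyncio.run(fire_and_forget_crawl())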
+ + Args: + id (str): Unique identifier for the crawl job + + Returns: + CrawlStatusResponse containing: + Status Information: + * status - Current state (scraping/completed/failed/cancelled) + * completed - Number of pages crawled + * total - Total pages to crawl + * creditsUsed - API credits consumed + * expiresAt - Data expiration timestamp + + Results: + * data - List of crawled documents + * next - URL for next page of results (if paginated) + * success - Whether status check succeeded + * error - Error message if failed + + Raises: + Exception: If status check fails + """ + headers = self._prepare_headers() + endpoint = f'/v1/crawl/{id}' + + status_data = await self._async_get_request( + f'{self.api_url}{endpoint}', + headers + ) + + if status_data['status'] == 'completed': + if 'data' in status_data: + data = status_data['data'] + while 'next' in status_data: + if len(status_data['data']) == 0: + break + next_url = status_data.get('next') + if not next_url: + logger.warning("Expected 'next' URL is missing.") + break + next_data = await self._async_get_request(next_url, headers) + data.extend(next_data.get('data', [])) + status_data = next_data + status_data['data'] = data + + response = { + 'status': status_data.get('status'), + 'total': status_data.get('total'), + 'completed': status_data.get('completed'), + 'creditsUsed': status_data.get('creditsUsed'), + 'expiresAt': status_data.get('expiresAt'), + 'data': status_data.get('data') + } + + if 'error' in status_data: + response['error'] = status_data['error'] + + if 'next' in status_data: + response['next'] = status_data['next'] + + return { + 'success': False if 'error' in status_data else True, + **response + } + + async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse: + """ + Monitor the status of an asynchronous job until completion. + + Args: + id (str): The ID of the job to monitor + headers (Dict[str, str]): Headers to include in status check requests + poll_interval (int): Seconds between status checks (default: 2) + + Returns: + CrawlStatusResponse: The job results if completed successfully + + Raises: + Exception: If the job fails or an error occurs during status checks + """ + while True: + status_data = await self._async_get_request( + f'{self.api_url}/v1/crawl/{id}', + headers + ) + + if status_data['status'] == 'completed': + if 'data' in status_data: + data = status_data['data'] + while 'next' in status_data: + if len(status_data['data']) == 0: + break + next_url = status_data.get('next') + if not next_url: + logger.warning("Expected 'next' URL is missing.") + break + next_data = await self._async_get_request(next_url, headers) + data.extend(next_data.get('data', [])) + status_data = next_data + status_data['data'] = data + return status_data + else: + raise Exception('Job completed but no data was returned') + elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: + await asyncio.sleep(max(poll_interval, 2)) + else: + raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}') + + async def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> MapResponse: + """ + Asynchronously map and discover links from a URL. 
+ + Args: + url (str): Target URL to map + params (Optional[Dict[str, Any]]): See MapParams model: + Discovery Options: + * search - Filter pattern for URLs + * ignoreSitemap - Skip sitemap.xml + * includeSubdomains - Include subdomain links + * sitemapOnly - Only use sitemap.xml + + Limits: + * limit - Max URLs to return + * timeout - Request timeout (ms) + + Returns: + MapResponse with: + * Discovered URLs + * Success/error status + + Raises: + Exception: If mapping fails + """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + + endpoint = f'/v1/map' + response = await self._async_post_request( + f'{self.api_url}{endpoint}', + json_data, + headers + ) + + if response.get('success') and 'links' in response: + return response + elif 'error' in response: + raise Exception(f'Failed to map URL. Error: {response["error"]}') + else: + raise Exception(f'Failed to map URL. Error: {response}') + + async def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> ExtractResponse[Any]: + """ + Asynchronously extract structured information from URLs. + + Args: + urls (List[str]): URLs to extract from + params (Optional[ExtractParams]): See ExtractParams model: + Extraction Config: + * prompt - Custom extraction prompt + * schema - JSON schema/Pydantic model + * systemPrompt - System context + + Behavior Options: + * allowExternalLinks - Follow external links + * enableWebSearch - Enable web search + * includeSubdomains - Include subdomains + * showSources - Include source URLs + + Scraping Options: + * scrapeOptions - Page scraping config + + Returns: + ExtractResponse with: + * Structured data matching schema + * Source information if requested + * Success/error status + + Raises: + ValueError: If prompt/schema missing or extraction fails + """ + headers = self._prepare_headers() + + if not params or (not params.get('prompt') and not params.get('schema')): + raise ValueError("Either prompt or schema is required") + + schema = params.get('schema') + if schema: + if hasattr(schema, 'model_json_schema'): + schema = schema.model_json_schema() + + request_data = { + 'urls': urls, + 'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)), + 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)), + 'showSources': params.get('show_sources', params.get('showSources', False)), + 'schema': schema, + 'origin': 'api-sdk' + } + + if params.get('prompt'): + request_data['prompt'] = params['prompt'] + if params.get('system_prompt'): + request_data['systemPrompt'] = params['system_prompt'] + elif params.get('systemPrompt'): + request_data['systemPrompt'] = params['systemPrompt'] + + response = await self._async_post_request( + f'{self.api_url}/v1/extract', + request_data, + headers + ) + + if response.get('success'): + job_id = response.get('id') + if not job_id: + raise Exception('Job ID not returned from extract request.') + + while True: + status_data = await self._async_get_request( + f'{self.api_url}/v1/extract/{job_id}', + headers + ) + + if status_data['status'] == 'completed': + return status_data + elif status_data['status'] in ['failed', 'cancelled']: + raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}') + + await asyncio.sleep(2) + else: + raise Exception(f'Failed to extract. 
Error: {response.get("error")}') + + async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse: + """ + Check the status of an asynchronous batch scrape job. + + Args: + id (str): The ID of the batch scrape job + + Returns: + BatchScrapeStatusResponse containing: + Status Information: + * status - Current state (scraping/completed/failed/cancelled) + * completed - Number of URLs scraped + * total - Total URLs to scrape + * creditsUsed - API credits consumed + * expiresAt - Data expiration timestamp + + Results: + * data - List of scraped documents + * next - URL for next page of results (if paginated) + * success - Whether status check succeeded + * error - Error message if failed + + Raises: + Exception: If status check fails + """ + headers = self._prepare_headers() + endpoint = f'/v1/batch/scrape/{id}' + + status_data = await self._async_get_request( + f'{self.api_url}{endpoint}', + headers + ) + + if status_data['status'] == 'completed': + if 'data' in status_data: + data = status_data['data'] + while 'next' in status_data: + if len(status_data['data']) == 0: + break + next_url = status_data.get('next') + if not next_url: + logger.warning("Expected 'next' URL is missing.") + break + next_data = await self._async_get_request(next_url, headers) + data.extend(next_data.get('data', [])) + status_data = next_data + status_data['data'] = data + + response = { + 'status': status_data.get('status'), + 'total': status_data.get('total'), + 'completed': status_data.get('completed'), + 'creditsUsed': status_data.get('creditsUsed'), + 'expiresAt': status_data.get('expiresAt'), + 'data': status_data.get('data') + } + + if 'error' in status_data: + response['error'] = status_data['error'] + + if 'next' in status_data: + response['next'] = status_data['next'] + + return { + 'success': False if 'error' in status_data else True, + **response + } + + async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse: + """ + Get information about errors from an asynchronous batch scrape job. + + Args: + id (str): The ID of the batch scrape job + + Returns: + CrawlErrorsResponse containing: + errors (List[Dict[str, str]]): List of errors with fields: + * id (str): Error ID + * timestamp (str): When the error occurred + * url (str): URL that caused the error + * error (str): Error message + * robotsBlocked (List[str]): List of URLs blocked by robots.txt + + Raises: + Exception: If error check fails + """ + headers = self._prepare_headers() + return await self._async_get_request( + f'{self.api_url}/v1/batch/scrape/{id}/errors', + headers + ) + + async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse: + """ + Get information about errors from an asynchronous crawl job. + + Args: + id (str): The ID of the crawl job + + Returns: + CrawlErrorsResponse containing: + * errors (List[Dict[str, str]]): List of errors with fields: + - id (str): Error ID + - timestamp (str): When the error occurred + - url (str): URL that caused the error + - error (str): Error message + * robotsBlocked (List[str]): List of URLs blocked by robots.txt + + Raises: + Exception: If error check fails + """ + headers = self._prepare_headers() + return await self._async_get_request( + f'{self.api_url}/v1/crawl/{id}/errors', + headers + ) + + async def cancel_crawl(self, id: str) -> Dict[str, Any]: + """ + Cancel an asynchronous crawl job. 
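A minimal sketch of cancelling a crawl and inspecting its errors with the async methods documented above, assuming a placeholder API key and a crawl ID obtained earlier from async_crawl_url:

import asyncio
from firecrawl import AsyncFirecrawlApp

app = AsyncFirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

async def stop_and_inspect(job_id: str):
    cancelled = await app.cancel_crawl(job_id)
    print("cancelled:", cancelled.get("success"))

    errors = await app.check_crawl_errors(job_id)
    for err in errors.get("errors", []):
        print(err["url"], "->", err["error"])
    print("blocked by robots.txt:", errors.get("robotsBlocked", []))

asyncio.run(stop_and_inspect("YOUR-CRAWL-ID"))  # placeholder job ID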
+ + Args: + id (str): The ID of the crawl job to cancel + + Returns: + Dict[str, Any] containing: + * success (bool): Whether cancellation was successful + * error (str, optional): Error message if cancellation failed + + Raises: + Exception: If cancellation fails + """ + headers = self._prepare_headers() + async with aiohttp.ClientSession() as session: + async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response: + return await response.json() + + async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]: + """ + Check the status of an asynchronous extraction job. + + Args: + job_id (str): The ID of the extraction job + + Returns: + ExtractResponse containing: + * success (bool): Whether extraction completed successfully + * data (Any): Extracted structured data + * error (str, optional): Error message if extraction failed + * warning (str, optional): Warning message if any + * sources (List[str], optional): Source URLs if requested + + Raises: + ValueError: If status check fails + """ + headers = self._prepare_headers() + try: + return await self._async_get_request( + f'{self.api_url}/v1/extract/{job_id}', + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def async_extract( + self, + urls: List[str], + params: Optional[ExtractParams] = None, + idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: + """ + Initiate an asynchronous extraction job without waiting for completion. + + Args: + urls (List[str]): URLs to extract information from + params (Optional[ExtractParams]): See ExtractParams model: + Extraction Config: + * prompt - Custom extraction prompt + * schema - JSON schema/Pydantic model + * systemPrompt - System context + + Behavior Options: + * allowExternalLinks - Follow external links + * enableWebSearch - Enable web search + * includeSubdomains - Include subdomains + * showSources - Include source URLs + + Scraping Options: + * scrapeOptions - Page scraping config + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + + Returns: + ExtractResponse containing: + * success (bool): Whether job started successfully + * id (str): Unique identifier for the job + * error (str, optional): Error message if start failed + + Raises: + ValueError: If job initiation fails + """ + headers = self._prepare_headers(idempotency_key) + + schema = params.get('schema') if params else None + if schema: + if hasattr(schema, 'model_json_schema'): + schema = schema.model_json_schema() + + jsonData = {'urls': urls, **(params or {})} + request_data = { + **jsonData, + 'allowExternalLinks': params.get('allow_external_links', False) if params else False, + 'schema': schema, + 'origin': 'api-sdk' + } + + try: + return await self._async_post_request( + f'{self.api_url}/v1/extract', + request_data, + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse: + """ + Generate LLMs.txt for a given URL and monitor until completion. 
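A minimal sketch of the non-blocking extract flow documented above, assuming a placeholder API key; async_extract starts the job (a prompt or schema is required) and get_extract_status checks on it later:

import asyncio
from firecrawl import AsyncFirecrawlApp

app = AsyncFirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

async def extract_in_background():
    # A plain prompt is used here; a JSON schema or Pydantic model also works.
    job = await app.async_extract(
        ["https://firecrawl.dev"],
        {"prompt": "Summarize what this product does in one sentence."},
    )
    status = await app.get_extract_status(job["id"])
    print(status)

asyncio.run(extract_in_background())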
+ + Args: + url (str): Target URL to generate LLMs.txt from + params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): See GenerateLLMsTextParams model: + Generation Options: + * maxUrls - Maximum URLs to process (default: 10) + * showFullText - Include full text in output (default: False) + + Returns: + GenerateLLMsTextStatusResponse containing: + * success (bool): Whether generation completed successfully + * status (str): Status of generation (processing/completed/failed) + * data (Dict[str, str], optional): Generated text with fields: + - llmstxt (str): Generated LLMs.txt content + - llmsfulltxt (str, optional): Full version if requested + * error (str, optional): Error message if generation failed + * expiresAt (str): When the generated data expires + + Raises: + Exception: If generation fails + """ + if params is None: + params = {} + + if isinstance(params, dict): + generation_params = GenerateLLMsTextParams(**params) + else: + generation_params = params + + response = await self.async_generate_llms_text(url, generation_params) + if not response.get('success') or 'id' not in response: + return response + + job_id = response['id'] + while True: + status = await self.check_generate_llms_text_status(job_id) + + if status['status'] == 'completed': + return status + elif status['status'] == 'failed': + raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}') + elif status['status'] != 'processing': + break + + await asyncio.sleep(2) + + return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'} + + async def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse: + """ + Initiate an asynchronous LLMs.txt generation job without waiting for completion. + + Args: + url (str): Target URL to generate LLMs.txt from + params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): See GenerateLLMsTextParams model: + Generation Options: + * maxUrls - Maximum URLs to process (default: 10) + * showFullText - Include full text in output (default: False) + + Returns: + GenerateLLMsTextResponse containing: + * success (bool): Whether job started successfully + * id (str): Unique identifier for the job + * error (str, optional): Error message if start failed + + Raises: + ValueError: If job initiation fails + """ + if params is None: + params = {} + + if isinstance(params, dict): + generation_params = GenerateLLMsTextParams(**params) + else: + generation_params = params + + headers = self._prepare_headers() + json_data = {'url': url, **generation_params.dict(exclude_none=True)} + + try: + return await self._async_post_request( + f'{self.api_url}/v1/llmstxt', + json_data, + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse: + """ + Check the status of an asynchronous LLMs.txt generation job. 
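A minimal sketch of the async counterpart of LLMs.txt generation documented above, assuming a placeholder API key; generate_llms_text polls internally until the job finishes:

import asyncio
from firecrawl import AsyncFirecrawlApp

app = AsyncFirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

async def build_llms_txt():
    result = await app.generate_llms_text("https://firecrawl.dev", {"maxUrls": 5})
    if result.get("status") == "completed":
        print(result["data"]["llmstxt"])

asyncio.run(build_llms_txt())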
+ + Args: + id (str): The ID of the generation job + + Returns: + GenerateLLMsTextStatusResponse containing: + * success (bool): Whether generation completed successfully + * status (str): Status of generation (processing/completed/failed) + * data (Dict[str, str], optional): Generated text with fields: + - llmstxt (str): Generated LLMs.txt content + - llmsfulltxt (str, optional): Full version if requested + * error (str, optional): Error message if generation failed + * expiresAt (str): When the generated data expires + + Raises: + ValueError: If status check fails + """ + headers = self._prepare_headers() + try: + return await self._async_get_request( + f'{self.api_url}/v1/llmstxt/{id}', + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def deep_research( + self, + query: str, + params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None, + on_activity: Optional[Callable[[Dict[str, Any]], None]] = None, + on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse: + """ + Initiates a deep research operation on a given query and polls until completion, providing real-time updates via callbacks. + + Args: + query: Research query or topic to investigate + + params: See DeepResearchParams model: + Research Settings: + * maxDepth - Maximum research depth (default: 7) + * timeLimit - Time limit in seconds (default: 270) + * maxUrls - Maximum URLs to process (default: 20) + + Callbacks: + * on_activity - Progress callback receiving: + {type, status, message, timestamp, depth} + * on_source - Source discovery callback receiving: + {url, title, description} + + Returns: + DeepResearchResponse containing: + + Status: + * success - Whether research completed successfully + * status - Current state (processing/completed/failed) + * error - Error message if failed + + Results: + * id - Unique identifier for the research job + * data - Research findings and analysis + * sources - List of discovered sources + * activities - Research progress log + * summaries - Generated research summaries + + Raises: + Exception: If research fails + """ + if params is None: + params = {} + + if isinstance(params, dict): + research_params = DeepResearchParams(**params) + else: + research_params = params + + response = await self.async_deep_research(query, research_params) + if not response.get('success') or 'id' not in response: + return response + + job_id = response['id'] + last_activity_count = 0 + last_source_count = 0 + + while True: + status = await self.check_deep_research_status(job_id) + + if on_activity and 'activities' in status: + new_activities = status['activities'][last_activity_count:] + for activity in new_activities: + on_activity(activity) + last_activity_count = len(status['activities']) + + if on_source and 'sources' in status: + new_sources = status['sources'][last_source_count:] + for source in new_sources: + on_source(source) + last_source_count = len(status['sources']) + + if status['status'] == 'completed': + return status + elif status['status'] == 'failed': + raise Exception(f'Deep research failed. Error: {status.get("error")}') + elif status['status'] != 'processing': + break + + await asyncio.sleep(2) + + return {'success': False, 'error': 'Deep research job terminated unexpectedly'} + + async def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> DeepResearchResponse: + """ + Initiate an asynchronous deep research job without waiting for completion. 
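A minimal sketch of starting a deep research job without waiting, as documented above, assuming a placeholder API key; check_deep_research_status (documented next) reports progress fields such as status and currentDepth:

import asyncio
from firecrawl import AsyncFirecrawlApp

app = AsyncFirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

async def research_in_background():
    job = await app.async_deep_research(
        "How do vector databases handle filtering?",
        {"maxDepth": 3, "maxUrls": 10},
    )
    status = await app.check_deep_research_status(job["id"])
    print(status["status"], "at depth", status.get("currentDepth"))

asyncio.run(research_in_background())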
+ + Args: + query (str): Research query or topic to investigate + params (Optional[Union[Dict[str, Any], DeepResearchParams]]): See DeepResearchParams model: + Research Settings: + * maxDepth - Maximum research depth (default: 7) + * timeLimit - Time limit in seconds (default: 270) + * maxUrls - Maximum URLs to process (default: 20) + + Returns: + DeepResearchResponse containing: + * success (bool): Whether job started successfully + * id (str): Unique identifier for the job + * error (str, optional): Error message if start failed + + Raises: + ValueError: If job initiation fails + """ + if params is None: + params = {} + + if isinstance(params, dict): + research_params = DeepResearchParams(**params) + else: + research_params = params + + headers = self._prepare_headers() + json_data = {'query': query, **research_params.dict(exclude_none=True)} + + try: + return await self._async_post_request( + f'{self.api_url}/v1/deep-research', + json_data, + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse: + """ + Check the status of an asynchronous deep research job. + + Args: + id (str): The ID of the research job + + Returns: + DeepResearchStatusResponse containing: + * success (bool): Whether research completed successfully + * status (str): Current state (processing/completed/failed) + * data (Dict[str, Any], optional): Research findings and analysis + * error (str, optional): Error message if failed + * expiresAt (str): When the research data expires + * currentDepth (int): Current research depth + * maxDepth (int): Maximum research depth + * activities (List[Dict[str, Any]]): Research progress log + * sources (List[Dict[str, Any]]): Discovered sources + * summaries (List[str]): Generated research summaries + + Raises: + ValueError: If status check fails + """ + headers = self._prepare_headers() + try: + return await self._async_get_request( + f'{self.api_url}/v1/deep-research/{id}', + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse: + """ + Asynchronously search for content using Firecrawl. + + Args: + query (str): Search query string + params (Optional[Union[Dict[str, Any], SearchParams]]): See SearchParams model: + Search Options: + * limit - Max results (default: 5) + * tbs - Time filter (e.g. "qdr:d") + * filter - Custom result filter + + Localization: + * lang - Language code (default: "en") + * country - Country code (default: "us") + * location - Geo-targeting + + Request Options: + * timeout - Request timeout (ms) + * scrapeOptions - Result scraping config + + Returns: + SearchResponse containing: + * success (bool): Whether search completed successfully + * data (List[FirecrawlDocument]): Search results + * warning (str, optional): Warning message if any + * error (str, optional): Error message if search failed + + Raises: + Exception: If search fails + """ + if params is None: + params = {} + + if isinstance(params, dict): + search_params = SearchParams(query=query, **params) + else: + search_params = params + search_params.query = query + + return await self._async_post_request( + f"{self.api_url}/v1/search", + search_params.dict(exclude_none=True), + {"Authorization": f"Bearer {self.api_key}"} + ) + +class AsyncCrawlWatcher(CrawlWatcher): + """ + Async version of CrawlWatcher that properly handles async operations. 
+ """ + def __init__(self, id: str, app: AsyncFirecrawlApp): + super().__init__(id, app) + + async def connect(self) -> None: + """ + Establishes async WebSocket connection and starts listening for messages. + """ + async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket: + await self._listen(websocket) + + async def _listen(self, websocket) -> None: + """ + Listens for incoming WebSocket messages and handles them asynchronously. + + Args: + websocket: The WebSocket connection object + """ + async for message in websocket: + msg = json.loads(message) + await self._handle_message(msg) + + async def _handle_message(self, msg: Dict[str, Any]) -> None: + """ + Handles incoming WebSocket messages based on their type asynchronously. + + Args: + msg (Dict[str, Any]): The message to handle + """ + if msg['type'] == 'done': + self.status = 'completed' + self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id}) + elif msg['type'] == 'error': + self.status = 'failed' + self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id}) + elif msg['type'] == 'catchup': + self.status = msg['data']['status'] + self.data.extend(msg['data'].get('data', [])) + for doc in self.data: + self.dispatch_event('document', {'data': doc, 'id': self.id}) + elif msg['type'] == 'document': + self.data.append(msg['data']) + self.dispatch_event('document', {'data': msg['data'], 'id': self.id}) + + async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None: + """ + Handle errors from async API responses. + """ + try: + error_data = await response.json() + error_message = error_data.get('error', 'No error message provided.') + error_details = error_data.get('details', 'No additional error details provided.') + except: + raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}') + + if response.status == 402: + message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" + elif response.status == 408: + message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}" + elif response.status == 409: + message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" + elif response.status == 500: + message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" + else: + message = f"Unexpected error during {action}: Status code {response.status}. 
{error_message} - {error_details}" + + raise aiohttp.ClientError(message) diff --git a/apps/python-sdk/requirements.txt b/apps/python-sdk/requirements.txt index 5dcd8f6c..360d9e76 100644 --- a/apps/python-sdk/requirements.txt +++ b/apps/python-sdk/requirements.txt @@ -3,4 +3,5 @@ pytest python-dotenv websockets nest-asyncio -pydantic \ No newline at end of file +pydantic +aiohttp \ No newline at end of file From 86f41460e0bb50f009170a11757f27d037103c83 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 13 Mar 2025 17:00:46 -0300 Subject: [PATCH 03/26] removed v0 in example --- apps/python-sdk/example_async.py | 38 -------------------------------- 1 file changed, 38 deletions(-) diff --git a/apps/python-sdk/example_async.py b/apps/python-sdk/example_async.py index 7afe6a70..d5251515 100644 --- a/apps/python-sdk/example_async.py +++ b/apps/python-sdk/example_async.py @@ -73,42 +73,6 @@ async def example_llm_extraction(): print(llm_extraction_result['extract']) - # Define schema to extract contents into using json schema - json_schema = { - "type": "object", - "properties": { - "top": { - "type": "array", - "items": { - "type": "object", - "properties": { - "title": {"type": "string"}, - "points": {"type": "number"}, - "by": {"type": "string"}, - "commentsURL": {"type": "string"} - }, - "required": ["title", "points", "by", "commentsURL"] - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News" - } - }, - "required": ["top"] - } - - app2 = AsyncFirecrawlApp(api_key="fc-", version="v0") - - llm_extraction_result = await app2.scrape_url('https://news.ycombinator.com', { - 'extractorOptions': { - 'extractionSchema': json_schema, - 'mode': 'llm-extraction' - }, - 'pageOptions':{ - 'onlyMainContent': True - } - }) - async def example_map_and_extract(): # Map a website: map_result = await app.map_url('https://firecrawl.dev', { 'search': 'blog' }) @@ -153,10 +117,8 @@ async def example_websocket_crawl(): await watcher.connect() async def main(): - # Apply nest_asyncio to allow nested event loops nest_asyncio.apply() - # Run all the examples await example_scrape() await example_batch_scrape() await example_crawl() From e7db5a2d5b19188e05e840e1e2c7a88098f9bde0 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 14 Mar 2025 07:49:30 -0300 Subject: [PATCH 04/26] tomkosms review --- apps/python-sdk/firecrawl/firecrawl.py | 137 +++++++++++++++++++------ 1 file changed, 106 insertions(+), 31 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index e0f8c940..3bc1aa9b 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -108,16 +108,46 @@ class CrawlScrapeOptions(pydantic.BaseModel): blockAds: Optional[bool] = None proxy: Optional[Literal["basic", "stealth"]] = None -class Action(pydantic.BaseModel): - """Action to perform during scraping.""" - type: Literal["wait", "click", "screenshot", "write", "press", "scroll", "scrape", "executeJavascript"] - milliseconds: Optional[int] = None +class WaitAction(pydantic.BaseModel): + """Wait action to perform during scraping.""" + type: Literal["wait"] + milliseconds: int selector: Optional[str] = None + +class ScreenshotAction(pydantic.BaseModel): + """Screenshot action to perform during scraping.""" + type: Literal["screenshot"] fullPage: Optional[bool] = None - text: Optional[str] = None - key: Optional[str] = None - 
direction: Optional[Literal["up", "down"]] = None - script: Optional[str] = None + +class ClickAction(pydantic.BaseModel): + """Click action to perform during scraping.""" + type: Literal["click"] + selector: str + +class WriteAction(pydantic.BaseModel): + """Write action to perform during scraping.""" + type: Literal["write"] + text: str + +class PressAction(pydantic.BaseModel): + """Press action to perform during scraping.""" + type: Literal["press"] + key: str + +class ScrollAction(pydantic.BaseModel): + """Scroll action to perform during scraping.""" + type: Literal["scroll"] + direction: Literal["up", "down"] + selector: Optional[str] = None + +class ScrapeAction(pydantic.BaseModel): + """Scrape action to perform during scraping.""" + type: Literal["scrape"] + +class ExecuteJavascriptAction(pydantic.BaseModel): + """Execute javascript action to perform during scraping.""" + type: Literal["executeJavascript"] + script: str class ExtractConfig(pydantic.BaseModel): """Configuration for extraction.""" @@ -129,7 +159,7 @@ class ScrapeParams(CrawlScrapeOptions): """Parameters for scraping operations.""" extract: Optional[ExtractConfig] = None jsonOptions: Optional[ExtractConfig] = None - actions: Optional[List[Action]] = None + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None class ScrapeResponse(FirecrawlDocument[T], Generic[T]): """Response from scraping operations.""" @@ -240,7 +270,7 @@ class SearchParams(pydantic.BaseModel): location: Optional[str] = None origin: Optional[str] = "api" timeout: Optional[int] = 60000 - scrapeOptions: Optional[Dict[str, Any]] = None + scrapeOptions: Optional[CrawlScrapeOptions] = None class SearchResponse(pydantic.BaseModel): """Response from search operations.""" @@ -295,10 +325,14 @@ class GenerateLLMsTextResponse(pydantic.BaseModel): id: str error: Optional[str] = None +class GenerateLLMsTextStatusResponseData(pydantic.BaseModel): + llmstxt: str + llmsfulltxt: Optional[str] = None + class GenerateLLMsTextStatusResponse(pydantic.BaseModel): """Status response from LLMs.txt generation operations.""" success: bool = True - data: Optional[Dict[str, str]] = None # {llmstxt: str, llmsfulltxt?: str} + data: Optional[GenerateLLMsTextStatusResponseData] = None status: Literal["processing", "completed", "failed"] error: Optional[str] = None expiresAt: str @@ -322,13 +356,16 @@ class FirecrawlApp: logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}") - def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> ScrapeResponse[Any]: + def scrape_url( + self, + url: str, + params: Optional[ScrapeParams] = None) -> ScrapeResponse[Any]: """ Scrape and extract content from a URL. Args: url (str): Target URL to scrape - params (Optional[Dict[str, Any]]): See ScrapeParams model for configuration: + params (Optional[ScrapeParams]): See ScrapeParams model for configuration: Content Options: * formats - Content types to retrieve (markdown/html/etc) * includeTags - HTML tags to include @@ -410,7 +447,10 @@ class FirecrawlApp: else: self._handle_error(response, 'scrape URL') - def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse: + def search( + self, + query: str, + params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse: """ Search for content using Firecrawl. 
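# Illustrative sketch (not from the patch): one way the typed action models above
# could be combined with ScrapeParams and passed to scrape_url as it stands at this
# point in the series. The import path, the API key, the selectors, and the
# .dict(exclude_none=True) serialization step are assumptions for illustration only.
from firecrawl.firecrawl import (
    FirecrawlApp,      # sync client
    ScrapeParams,
    WaitAction,
    ClickAction,
    ScrollAction,
)

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
params = ScrapeParams(
    formats=["markdown", "links"],
    actions=[
        WaitAction(type="wait", milliseconds=1000),        # let the page settle
        ClickAction(type="click", selector="#load-more"),  # placeholder selector
        ScrollAction(type="scroll", direction="down"),
    ],
)
# At this stage scrape_url still consumes a plain mapping internally,
# so serialize the model before passing it.
result = app.scrape_url("https://example.com", params.dict(exclude_none=True))
print(result)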
@@ -520,14 +560,18 @@ class FirecrawlApp: self._handle_error(response, 'start crawl job') - def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> CrawlResponse: + def async_crawl_url( + self, + url: str, + params: Optional[CrawlParams] = None, + idempotency_key: Optional[str] = None) -> CrawlResponse: """ Start an asynchronous crawl job. Args: url (str): Target URL to start crawling from - params (Optional[Dict[str, Any]]): See CrawlParams model: + params (Optional[CrawlParams]): See CrawlParams model: URL Discovery: * includePaths - Patterns of URLs to include @@ -754,7 +798,10 @@ class FirecrawlApp: else: raise Exception("Crawl job failed to start") - def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> MapResponse: + def map_url( + self, + url: str, + params: Optional[MapParams] = None) -> MapResponse: """ Map and discover links from a URL. @@ -1891,7 +1938,7 @@ class AsyncFirecrawlApp(FirecrawlApp): if response.status == 502: await asyncio.sleep(backoff_factor * (2 ** attempt)) continue - if response.status != 200: + if response.status >= 300: await self._handle_error(response, "make POST request") return await response.json() except aiohttp.ClientError as e: @@ -1930,7 +1977,7 @@ class AsyncFirecrawlApp(FirecrawlApp): if response.status == 502: await asyncio.sleep(backoff_factor * (2 ** attempt)) continue - if response.status != 200: + if response.status >= 300: # Accept any 2xx status code as success await self._handle_error(response, "make GET request") return await response.json() except aiohttp.ClientError as e: @@ -2060,13 +2107,16 @@ class AsyncFirecrawlApp(FirecrawlApp): else: raise Exception("Batch scrape job failed to start") - async def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> ScrapeResponse[Any]: + async def scrape_url( + self, + url: str, + params: Optional[ScrapeParams] = None) -> ScrapeResponse[Any]: """ Asynchronously scrape and extract content from a URL. Args: url (str): Target URL to scrape - params (Optional[Dict[str, Any]]): See ScrapeParams model for configuration: + params (Optional[ScrapeParams]): See ScrapeParams model for configuration: Content Options: * formats - Content types to retrieve (markdown/html/etc) * includeTags - HTML tags to include @@ -2122,7 +2172,10 @@ class AsyncFirecrawlApp(FirecrawlApp): else: raise Exception(f'Failed to scrape URL. Error: {response}') - async def batch_scrape_urls(self, urls: List[str], params: Optional[ScrapeParams] = None) -> BatchScrapeStatusResponse: + async def batch_scrape_urls( + self, + urls: List[str], + params: Optional[ScrapeParams] = None) -> BatchScrapeStatusResponse: """ Asynchronously scrape multiple URLs and monitor until completion. @@ -2282,13 +2335,17 @@ class AsyncFirecrawlApp(FirecrawlApp): else: raise Exception(f'Failed to start crawl. Error: {response.get("error")}') - async def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> CrawlResponse: + async def async_crawl_url( + self, + url: str, + params: Optional[CrawlParams] = None, + idempotency_key: Optional[str] = None) -> CrawlResponse: """ Initiate an asynchronous crawl job without waiting for completion. 
Args: url (str): Target URL to start crawling from - params (Optional[Dict[str, Any]]): See CrawlParams model: + params (Optional[CrawlParams]): See CrawlParams model: URL Discovery: * includePaths - Patterns of URLs to include * excludePaths - Patterns of URLs to exclude @@ -2442,13 +2499,16 @@ class AsyncFirecrawlApp(FirecrawlApp): else: raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}') - async def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> MapResponse: + async def map_url( + self, + url: str, + params: Optional[MapParams] = None) -> MapResponse: """ Asynchronously map and discover links from a URL. Args: url (str): Target URL to map - params (Optional[Dict[str, Any]]): See MapParams model: + params (Optional[MapParams]): See MapParams model: Discovery Options: * search - Filter pattern for URLs * ignoreSitemap - Skip sitemap.xml @@ -2486,7 +2546,10 @@ class AsyncFirecrawlApp(FirecrawlApp): else: raise Exception(f'Failed to map URL. Error: {response}') - async def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> ExtractResponse[Any]: + async def extract( + self, + urls: List[str], + params: Optional[ExtractParams] = None) -> ExtractResponse[Any]: """ Asynchronously extract structured information from URLs. @@ -2792,7 +2855,10 @@ class AsyncFirecrawlApp(FirecrawlApp): except Exception as e: raise ValueError(str(e)) - async def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse: + async def generate_llms_text( + self, + url: str, + params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse: """ Generate LLMs.txt for a given URL and monitor until completion. @@ -2843,7 +2909,10 @@ class AsyncFirecrawlApp(FirecrawlApp): return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'} - async def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse: + async def async_generate_llms_text( + self, + url: str, + params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse: """ Initiate an asynchronous LLMs.txt generation job without waiting for completion. @@ -2996,7 +3065,10 @@ class AsyncFirecrawlApp(FirecrawlApp): return {'success': False, 'error': 'Deep research job terminated unexpectedly'} - async def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> DeepResearchResponse: + async def async_deep_research( + self, + query: str, + params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> DeepResearchResponse: """ Initiate an asynchronous deep research job without waiting for completion. @@ -3069,7 +3141,10 @@ class AsyncFirecrawlApp(FirecrawlApp): except Exception as e: raise ValueError(str(e)) - async def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse: + async def search( + self, + query: str, + params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse: """ Asynchronously search for content using Firecrawl. 
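# Illustrative async sketch (not from the patch): calling the typed async client.
# Assumptions: AsyncFirecrawlApp and SearchParams are importable from
# firecrawl.firecrawl, the key and query are placeholders, and the raw JSON
# payload is what search returns at this point in the series.
import asyncio

from firecrawl.firecrawl import AsyncFirecrawlApp, SearchParams

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
    results = await app.search(
        "firecrawl python sdk",
        params=SearchParams(query="firecrawl python sdk", limit=3),
    )
    # Each result is a document dict with url/title/description fields.
    for doc in results.get("data", []):
        print(doc.get("url"), "->", doc.get("title"))

asyncio.run(main())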
From 97695dd55b987b12641739da20872e7e92e15eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?= Date: Fri, 14 Mar 2025 19:53:57 +0000 Subject: [PATCH 05/26] refator: dry request and error handling --- apps/python-sdk/firecrawl/firecrawl.py | 209 +++++++++++++++---------- 1 file changed, 123 insertions(+), 86 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 3bc1aa9b..d62312c6 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -1624,21 +1624,35 @@ class FirecrawlApp: except: raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response) - - if response.status_code == 402: - message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" - elif response.status_code == 408: - message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}" - elif response.status_code == 409: - message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" - elif response.status_code == 500: - message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" - else: - message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}" + message = self._get_error_message(response.status_code, action, error_message, error_details) # Raise an HTTPError with the custom message and attach the response raise requests.exceptions.HTTPError(message, response=response) + def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str: + """ + Generate a standardized error message based on HTTP status code. + + Args: + status_code (int): The HTTP status code from the response + action (str): Description of the action that was being performed + error_message (str): The error message from the API response + error_details (str): Additional error details from the API response + + Returns: + str: A formatted error message + """ + if status_code == 402: + return f"Payment Required: Failed to {action}. {error_message} - {error_details}" + elif status_code == 408: + return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}" + elif status_code == 409: + return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" + elif status_code == 500: + return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" + else: + return f"Unexpected error during {action}: Status code {status_code}. {error_message} - {error_details}" + def deep_research( self, query: str, @@ -1905,86 +1919,96 @@ class AsyncFirecrawlApp(FirecrawlApp): Asynchronous version of FirecrawlApp that implements async methods using aiohttp. Provides non-blocking alternatives to all FirecrawlApp operations. """ - - async def _async_post_request( + + async def _async_request( self, + method: str, url: str, - data: Dict[str, Any], headers: Dict[str, str], + data: Optional[Dict[str, Any]] = None, retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]: """ + Generic async request method with exponential backoff retry logic. + + Args: + method (str): The HTTP method to use (e.g., "GET" or "POST"). + url (str): The URL to send the request to. + headers (Dict[str, str]): Headers to include in the request. 
+ data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests). + retries (int): Maximum number of retry attempts (default: 3). + backoff_factor (float): Factor to calculate delay between retries (default: 0.5). + Delay will be backoff_factor * (2 ** retry_count). + + Returns: + Dict[str, Any]: The parsed JSON response from the server. + + Raises: + aiohttp.ClientError: If the request fails after all retries. + Exception: If max retries are exceeded or other errors occur. + """ + async with aiohttp.ClientSession() as session: + for attempt in range(retries): + try: + async with session.request( + method=method, url=url, headers=headers, json=data + ) as response: + if response.status == 502: + await asyncio.sleep(backoff_factor * (2 ** attempt)) + continue + if response.status >= 300: + await self._handle_error(response, f"make {method} request") + return await response.json() + except aiohttp.ClientError as e: + if attempt == retries - 1: + raise e + await asyncio.sleep(backoff_factor * (2 ** attempt)) + raise Exception("Max retries exceeded") + + async def _async_post_request( + self, url: str, data: Dict[str, Any], headers: Dict[str, str], + retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]: + """ Make an async POST request with exponential backoff retry logic. Args: - url (str): The URL to send the POST request to - data (Dict[str, Any]): The JSON data to include in the request body - headers (Dict[str, str]): Headers to include in the request - retries (int): Maximum number of retry attempts (default: 3) - backoff_factor (float): Factor to calculate delay between retries (default: 0.5) - Delay will be backoff_factor * (2 ** retry_count) + url (str): The URL to send the POST request to. + data (Dict[str, Any]): The JSON data to include in the request body. + headers (Dict[str, str]): Headers to include in the request. + retries (int): Maximum number of retry attempts (default: 3). + backoff_factor (float): Factor to calculate delay between retries (default: 0.5). + Delay will be backoff_factor * (2 ** retry_count). Returns: - Dict[str, Any]: The parsed JSON response from the server + Dict[str, Any]: The parsed JSON response from the server. Raises: - aiohttp.ClientError: If the request fails after all retries - Exception: If max retries are exceeded or other errors occur + aiohttp.ClientError: If the request fails after all retries. + Exception: If max retries are exceeded or other errors occur. """ - async with aiohttp.ClientSession() as session: - for attempt in range(retries): - try: - async with session.post(url, headers=headers, json=data) as response: - if response.status == 502: - await asyncio.sleep(backoff_factor * (2 ** attempt)) - continue - if response.status >= 300: - await self._handle_error(response, "make POST request") - return await response.json() - except aiohttp.ClientError as e: - if attempt == retries - 1: - raise e - await asyncio.sleep(backoff_factor * (2 ** attempt)) - raise Exception("Max retries exceeded") + return await self._async_request("POST", url, headers, data, retries, backoff_factor) async def _async_get_request( - self, - url: str, - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> Dict[str, Any]: + self, url: str, headers: Dict[str, str], + retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]: """ Make an async GET request with exponential backoff retry logic. 
Args: - url (str): The URL to send the GET request to - headers (Dict[str, str]): Headers to include in the request - retries (int): Maximum number of retry attempts (default: 3) - backoff_factor (float): Factor to calculate delay between retries (default: 0.5) - Delay will be backoff_factor * (2 ** retry_count) + url (str): The URL to send the GET request to. + headers (Dict[str, str]): Headers to include in the request. + retries (int): Maximum number of retry attempts (default: 3). + backoff_factor (float): Factor to calculate delay between retries (default: 0.5). + Delay will be backoff_factor * (2 ** retry_count). Returns: - Dict[str, Any]: The parsed JSON response from the server + Dict[str, Any]: The parsed JSON response from the server. Raises: - aiohttp.ClientError: If the request fails after all retries - Exception: If max retries are exceeded or other errors occur + aiohttp.ClientError: If the request fails after all retries. + Exception: If max retries are exceeded or other errors occur. """ - async with aiohttp.ClientSession() as session: - for attempt in range(retries): - try: - async with session.get(url, headers=headers) as response: - if response.status == 502: - await asyncio.sleep(backoff_factor * (2 ** attempt)) - continue - if response.status >= 300: # Accept any 2xx status code as success - await self._handle_error(response, "make GET request") - return await response.json() - except aiohttp.ClientError as e: - if attempt == retries - 1: - raise e - await asyncio.sleep(backoff_factor * (2 ** attempt)) - raise Exception("Max retries exceeded") + return await self._async_request("GET", url, headers, None, retries, backoff_factor) async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None: """ @@ -2009,19 +2033,25 @@ class AsyncFirecrawlApp(FirecrawlApp): except: raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}') - if response.status == 402: - message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" - elif response.status == 408: - message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}" - elif response.status == 409: - message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" - elif response.status == 500: - message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" - else: - message = f"Unexpected error during {action}: Status code {response.status}. {error_message} - {error_details}" + message = await self._get_async_error_message(response.status, action, error_message, error_details) raise aiohttp.ClientError(message) + async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str: + """ + Generate a standardized error message based on HTTP status code for async operations. + + Args: + status_code (int): The HTTP status code from the response + action (str): Description of the action that was being performed + error_message (str): The error message from the API response + error_details (str): Additional error details from the API response + + Returns: + str: A formatted error message + """ + return self._get_error_message(status_code, action, error_message, error_details) + async def crawl_url_and_watch( self, url: str, @@ -3248,15 +3278,22 @@ class AsyncCrawlWatcher(CrawlWatcher): except: raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. 
Status code: {response.status}') - if response.status == 402: - message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" - elif response.status == 408: - message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}" - elif response.status == 409: - message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}" - elif response.status == 500: - message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}" - else: - message = f"Unexpected error during {action}: Status code {response.status}. {error_message} - {error_details}" + # Use the app's method to get the error message + message = await self.app._get_async_error_message(response.status, action, error_message, error_details) raise aiohttp.ClientError(message) + + async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str: + """ + Generate a standardized error message based on HTTP status code for async operations. + + Args: + status_code (int): The HTTP status code from the response + action (str): Description of the action that was being performed + error_message (str): The error message from the API response + error_details (str): Additional error details from the API response + + Returns: + str: A formatted error message + """ + return self._get_error_message(status_code, action, error_message, error_details) From cc255d488eefd953d14420cb7e7ef3b4d1cf4911 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 14 Mar 2025 18:27:42 -0300 Subject: [PATCH 06/26] fixed websocket params --- apps/python-sdk/firecrawl/firecrawl.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d62312c6..990599cc 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -1855,7 +1855,10 @@ class CrawlWatcher: """ Establishes WebSocket connection and starts listening for messages. """ - async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket: + async with websockets.connect( + self.ws_url, + additional_headers=[("Authorization", f"Bearer {self.app.api_key}")] + ) as websocket: await self._listen(websocket) async def _listen(self, websocket) -> None: @@ -3231,7 +3234,10 @@ class AsyncCrawlWatcher(CrawlWatcher): """ Establishes async WebSocket connection and starts listening for messages. 
""" - async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket: + async with websockets.connect( + self.ws_url, + additional_headers=[("Authorization", f"Bearer {self.app.api_key}")] + ) as websocket: await self._listen(websocket) async def _listen(self, websocket) -> None: From 4f984d3fded33517ec52694ab2b22fcb6cc42e52 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 19 Mar 2025 09:45:51 -0300 Subject: [PATCH 07/26] added origin to requests --- apps/api/src/controllers/v1/extract.ts | 4 +- apps/js-sdk/firecrawl/src/index.ts | 47 +++++++++++++++++------- apps/python-sdk/firecrawl/firecrawl.py | 51 ++++++++++++++++++++++---- 3 files changed, 79 insertions(+), 23 deletions(-) diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index b18117f5..31c848b7 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -60,7 +60,9 @@ export async function extractController( if ( (await getTeamIdSyncB(req.auth.team_id)) && req.body.origin !== "api-sdk" && - req.body.origin !== "website" + req.body.origin !== "website" && + !req.body.origin.startsWith("python-sdk@") && + !req.body.origin.startsWith("js-sdk@") ) { return await oldExtract(req, res, extractId); } diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index ab09432e..8e4eca61 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -474,11 +474,26 @@ export interface GenerateLLMsTextStatusResponse { export default class FirecrawlApp { public apiKey: string; public apiUrl: string; - + public version: string = "1.19.1"; + private isCloudService(url: string): boolean { return url.includes('api.firecrawl.dev'); } + private async getVersion(): Promise { + try { + const packageJson = await import('../package.json', { assert: { type: 'json' } }); + return packageJson.default.version; + } catch (error) { + console.error("Error getting version:", error); + return "1.19.1"; + } + } + + private async init() { + this.version = await this.getVersion(); + } + /** * Initializes a new instance of the FirecrawlApp class. * @param config - Configuration options for the FirecrawlApp instance. @@ -492,6 +507,7 @@ export default class FirecrawlApp { this.apiKey = apiKey || ''; this.apiUrl = baseUrl; + this.init(); } /** @@ -508,7 +524,7 @@ export default class FirecrawlApp { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, } as AxiosRequestHeaders; - let jsonData: any = { url, ...params }; + let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` }; if (jsonData?.extract?.schema) { let schema = jsonData.extract.schema; @@ -590,7 +606,7 @@ export default class FirecrawlApp { lang: params?.lang ?? "en", country: params?.country ?? "us", location: params?.location, - origin: params?.origin ?? "api", + origin: `js-sdk@${this.version}`, timeout: params?.timeout ?? 60000, scrapeOptions: params?.scrapeOptions ?? 
{ formats: [] }, }; @@ -662,7 +678,7 @@ export default class FirecrawlApp { idempotencyKey?: string ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { url, ...params }; + let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/crawl`, @@ -691,7 +707,7 @@ export default class FirecrawlApp { idempotencyKey?: string ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { url, ...params }; + let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/crawl`, @@ -867,7 +883,7 @@ export default class FirecrawlApp { */ async mapUrl(url: string, params?: MapParams): Promise { const headers = this.prepareHeaders(); - let jsonData: { url: string } & MapParams = { url, ...params }; + let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( @@ -904,7 +920,7 @@ export default class FirecrawlApp { ignoreInvalidURLs?: boolean, ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params }; + let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` }; if (jsonData?.extract?.schema) { let schema = jsonData.extract.schema; @@ -969,7 +985,7 @@ export default class FirecrawlApp { ignoreInvalidURLs?: boolean, ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) }; + let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/batch/scrape`, @@ -1143,7 +1159,7 @@ export default class FirecrawlApp { try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/extract`, - { ...jsonData, schema: jsonSchema, origin: params?.origin || "api-sdk" }, + { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` }, headers ); @@ -1211,7 +1227,7 @@ export default class FirecrawlApp { try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/extract`, - { ...jsonData, schema: jsonSchema }, + { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` }, headers ); @@ -1497,10 +1513,11 @@ export default class FirecrawlApp { */ async asyncDeepResearch(query: string, params: DeepResearchParams): Promise { const headers = this.prepareHeaders(); + let jsonData: any = { query, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( - `${this.apiUrl}/v1/deep-research`, - { query, ...params }, + this.apiUrl + `/v1/deep-research`, + jsonData, headers ); @@ -1632,9 +1649,10 @@ export default class FirecrawlApp { async __asyncDeepResearch(topic: string, params: DeepResearchParams): Promise { const headers = this.prepareHeaders(); try { + let jsonData: any = { topic, ...params, origin: `js-sdk@${this.version}` }; const response: AxiosResponse = await this.postRequest( `${this.apiUrl}/v1/deep-research`, - { topic, ...params }, + jsonData, headers ); @@ -1744,10 +1762,11 @@ export default class FirecrawlApp { */ async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise { const headers = this.prepareHeaders(); + let jsonData: any = { url, 
...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( `${this.apiUrl}/v1/llmstxt`, - { url, ...params }, + jsonData, headers ); diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 990599cc..adedce9c 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -15,6 +15,7 @@ import time from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic import json from datetime import datetime +import re import requests import pydantic @@ -22,6 +23,20 @@ import websockets import aiohttp import asyncio +def get_version(): + try: + from pathlib import Path + package_path = os.path.dirname(__file__) + version_file = Path(os.path.join(package_path, '__init__.py')).read_text() + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) + if version_match: + return version_match.group(1).strip() + except Exception: + print("Failed to get version from __init__.py") + return None + +version = get_version() + logger : logging.Logger = logging.getLogger("firecrawl") T = TypeVar('T') @@ -424,6 +439,7 @@ class FirecrawlApp: if key not in ['jsonOptions']: scrape_params[key] = value + scrape_params['origin'] = f"python-sdk@{version}" endpoint = f'/v1/scrape' # Make the POST request with the prepared headers and JSON data @@ -489,10 +505,13 @@ class FirecrawlApp: search_params = params search_params.query = query + params_dict = search_params.dict(exclude_none=True) + params_dict['origin'] = f"python-sdk@{version}" + response = requests.post( f"{self.api_url}/v1/search", headers={"Authorization": f"Bearer {self.api_key}"}, - json=search_params.dict(exclude_none=True) + json=params_dict ) if response.status_code != 200: @@ -548,6 +567,7 @@ class FirecrawlApp: json_data = {'url': url} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: try: @@ -609,6 +629,7 @@ class FirecrawlApp: json_data = {'url': url} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: try: @@ -835,6 +856,7 @@ class FirecrawlApp: json_data = {'url': url} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" # Make the POST request with the prepared headers and JSON data response = requests.post( @@ -897,6 +919,7 @@ class FirecrawlApp: json_data = {'urls': urls} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: try: @@ -953,6 +976,7 @@ class FirecrawlApp: json_data = {'urls': urls} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) if response.status_code == 200: try: @@ -1153,7 +1177,7 @@ class FirecrawlApp: 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)), 'showSources': params.get('show_sources', params.get('showSources', False)), 'schema': schema, - 'origin': 'api-sdk' + 'origin': f'python-sdk@{get_version()}' } # Only add prompt and systemPrompt if they exist @@ -1284,7 +1308,7 @@ class FirecrawlApp: **jsonData, 'allowExternalLinks': 
params.get('allow_external_links', False) if params else False, 'schema': schema, - 'origin': 'api-sdk' + 'origin': f'python-sdk@{version}' } try: @@ -1387,6 +1411,7 @@ class FirecrawlApp: headers = self._prepare_headers() json_data = {'url': url, **generation_params.dict(exclude_none=True)} + json_data['origin'] = f"python-sdk@{version}" try: response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers) @@ -1770,6 +1795,7 @@ class FirecrawlApp: headers = self._prepare_headers() json_data = {'query': query, **research_params.dict(exclude_none=True)} + json_data['origin'] = f"python-sdk@{version}" try: response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers) @@ -2178,7 +2204,7 @@ class AsyncFirecrawlApp(FirecrawlApp): Exception: If scraping fails """ headers = self._prepare_headers() - scrape_params = {'url': url} + scrape_params = {'url': url, 'origin': f'python-sdk@{version}'} if params: extract = params.get('extract', {}) @@ -2245,6 +2271,7 @@ class AsyncFirecrawlApp(FirecrawlApp): json_data = {'urls': urls} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" endpoint = f'/v1/batch/scrape' response = await self._async_post_request( @@ -2301,6 +2328,7 @@ class AsyncFirecrawlApp(FirecrawlApp): json_data = {'urls': urls} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" endpoint = f'/v1/batch/scrape' return await self._async_post_request( @@ -2355,6 +2383,7 @@ class AsyncFirecrawlApp(FirecrawlApp): json_data = {'url': url} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" endpoint = f'/v1/crawl' response = await self._async_post_request( @@ -2413,6 +2442,7 @@ class AsyncFirecrawlApp(FirecrawlApp): json_data = {'url': url} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" endpoint = f'/v1/crawl' return await self._async_post_request( @@ -2564,6 +2594,7 @@ class AsyncFirecrawlApp(FirecrawlApp): json_data = {'url': url} if params: json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" endpoint = f'/v1/map' response = await self._async_post_request( @@ -2628,7 +2659,7 @@ class AsyncFirecrawlApp(FirecrawlApp): 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)), 'showSources': params.get('show_sources', params.get('showSources', False)), 'schema': schema, - 'origin': 'api-sdk' + 'origin': f'python-sdk@{version}' } if params.get('prompt'): @@ -2876,7 +2907,7 @@ class AsyncFirecrawlApp(FirecrawlApp): **jsonData, 'allowExternalLinks': params.get('allow_external_links', False) if params else False, 'schema': schema, - 'origin': 'api-sdk' + 'origin': f'python-sdk@{version}' } try: @@ -2975,6 +3006,7 @@ class AsyncFirecrawlApp(FirecrawlApp): headers = self._prepare_headers() json_data = {'url': url, **generation_params.dict(exclude_none=True)} + json_data['origin'] = f"python-sdk@{version}" try: return await self._async_post_request( @@ -3132,7 +3164,7 @@ class AsyncFirecrawlApp(FirecrawlApp): headers = self._prepare_headers() json_data = {'query': query, **research_params.dict(exclude_none=True)} - + json_data['origin'] = f"python-sdk@{version}" try: return await self._async_post_request( f'{self.api_url}/v1/deep-research', @@ -3217,9 +3249,12 @@ class AsyncFirecrawlApp(FirecrawlApp): search_params = params search_params.query = query + search_params_dict = search_params.dict(exclude_none=True) + search_params_dict['origin'] = f"python-sdk@{version}" + return await 
self._async_post_request( f"{self.api_url}/v1/search", - search_params.dict(exclude_none=True), + search_params_dict, {"Authorization": f"Bearer {self.api_key}"} ) From d9780412f539579227b042b1826fca38a7efaf06 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 17 Apr 2025 23:08:33 -0700 Subject: [PATCH 08/26] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 8ca94fa6..d168a6b5 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -364,7 +364,6 @@ class ChangeTrackingData(pydantic.BaseModel): diff: Optional[Dict[str, Any]] = None json: Optional[Any] = None -class FirecrawlApp: class SearchResponse(pydantic.BaseModel): """ Response from the search operation. @@ -399,6 +398,9 @@ class FirecrawlApp: data: Optional[Any] = None error: Optional[str] = None +class FirecrawlApp: + + def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: """ Initialize the FirecrawlApp instance with API key, API URL. From f48937a55dc733d74e3326d4082c8d57e1e6a770 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 17 Apr 2025 23:17:00 -0700 Subject: [PATCH 09/26] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d168a6b5..8fead0e6 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -16,12 +16,20 @@ from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, import json from datetime import datetime import re - +import warnings import requests import pydantic import websockets import aiohttp import asyncio +from pydantic import Field + +# Suppress Pydantic warnings about attribute shadowing +warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"") +warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"") +warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractConfig\" shadows an attribute in parent \"BaseModel\"") +warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"") + def get_version(): try: From 22cfdd6ae3bb396469d48dcc93564efae0254e71 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 17 Apr 2025 23:31:28 -0700 Subject: [PATCH 10/26] added agent options types --- apps/python-sdk/firecrawl/firecrawl.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 8ca94fa6..d7abf336 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -76,6 +76,15 @@ class FirecrawlDocumentMetadata(pydantic.BaseModel): statusCode: Optional[int] = None error: Optional[str] = None +class AgentOptions(pydantic.BaseModel): + """Configuration for the agent.""" + model: Literal["FIRE-1"] = "FIRE-1" + prompt: Optional[str] = None + +class AgentOptionsExtract(pydantic.BaseModel): + """Configuration for the agent in extract operations.""" + model: Literal["FIRE-1"] = "FIRE-1" + class 
ActionsResult(pydantic.BaseModel): """Result of actions performed during scraping.""" screenshots: List[str] @@ -164,17 +173,24 @@ class ExecuteJavascriptAction(pydantic.BaseModel): type: Literal["executeJavascript"] script: str + +class ExtractAgent(pydantic.BaseModel): + """Configuration for the agent in extract operations.""" + model: Literal["FIRE-1"] = "FIRE-1" + class ExtractConfig(pydantic.BaseModel): """Configuration for extraction.""" prompt: Optional[str] = None schema: Optional[Any] = None systemPrompt: Optional[str] = None + agent: Optional[ExtractAgent] = None class ScrapeParams(CrawlScrapeOptions): """Parameters for scraping operations.""" extract: Optional[ExtractConfig] = None jsonOptions: Optional[ExtractConfig] = None actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None + agent: Optional[AgentOptions] = None class ScrapeResponse(FirecrawlDocument[T], Generic[T]): """Response from scraping operations.""" @@ -363,7 +379,7 @@ class ChangeTrackingData(pydantic.BaseModel): visibility: str # "visible" | "hidden" diff: Optional[Dict[str, Any]] = None json: Optional[Any] = None - + class FirecrawlApp: class SearchResponse(pydantic.BaseModel): """ From 8eb4e1a96a4ad213ac1d270ee776e58ef42d9ea1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 17 Apr 2025 23:50:56 -0700 Subject: [PATCH 11/26] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 131 ++++++++++++++----------- 1 file changed, 74 insertions(+), 57 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 8fead0e6..b23b60c9 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -115,7 +115,7 @@ class WebhookConfig(pydantic.BaseModel): metadata: Optional[Dict[str, str]] = None events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None -class CrawlScrapeOptions(pydantic.BaseModel): +class CommonOptions(pydantic.BaseModel): """Parameters for scraping operations.""" formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None headers: Optional[Dict[str, str]] = None @@ -178,7 +178,7 @@ class ExtractConfig(pydantic.BaseModel): schema: Optional[Any] = None systemPrompt: Optional[str] = None -class ScrapeParams(CrawlScrapeOptions): +class ScrapeParams(CommonOptions): """Parameters for scraping operations.""" extract: Optional[ExtractConfig] = None jsonOptions: Optional[ExtractConfig] = None @@ -219,7 +219,7 @@ class CrawlParams(pydantic.BaseModel): allowBackwardLinks: Optional[bool] = None allowExternalLinks: Optional[bool] = None ignoreSitemap: Optional[bool] = None - scrapeOptions: Optional[CrawlScrapeOptions] = None + scrapeOptions: Optional[CommonOptions] = None webhook: Optional[Union[str, WebhookConfig]] = None deduplicateSimilarURLs: Optional[bool] = None ignoreQueryParameters: Optional[bool] = None @@ -273,7 +273,7 @@ class ExtractParams(pydantic.BaseModel): includeSubdomains: Optional[bool] = None origin: Optional[str] = None showSources: Optional[bool] = None - scrapeOptions: Optional[CrawlScrapeOptions] = None + scrapeOptions: Optional[CommonOptions] = None class ExtractResponse(pydantic.BaseModel, Generic[T]): """Response from extract operations.""" @@ -293,7 +293,7 @@ class SearchParams(pydantic.BaseModel): location: Optional[str] = None origin: Optional[str] = "api" timeout: Optional[int] = 
60000 - scrapeOptions: Optional[CrawlScrapeOptions] = None + scrapeOptions: Optional[CommonOptions] = None class SearchResponse(pydantic.BaseModel): """Response from search operations.""" @@ -430,7 +430,21 @@ class FirecrawlApp: def scrape_url( self, url: str, - params: Optional[ScrapeParams] = None) -> ScrapeResponse[Any]: + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]: """ Scrape and extract content from a URL. @@ -464,66 +478,69 @@ class FirecrawlApp: Raises: Exception: If scraping fails """ - headers = self._prepare_headers() - # Prepare the base scrape parameters with the URL - scrape_params = {'url': url} + # Build scrape parameters + scrape_params = { + 'url': url, + 'origin': f"python-sdk@{version}" + } - # If there are additional params, process them - if params: - # Handle extract (for v1) - extract = params.get('extract', {}) - if extract: - if 'schema' in extract and hasattr(extract['schema'], 'schema'): - extract['schema'] = extract['schema'].schema() - scrape_params['extract'] = extract + # Add optional parameters if provided + if formats: + scrape_params['formats'] = formats + if include_tags: + scrape_params['includeTags'] = include_tags + if exclude_tags: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for: + scrape_params['waitFor'] = wait_for + if timeout: + scrape_params['timeout'] = timeout + if location: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy: + scrape_params['proxy'] = proxy + if extract: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if actions: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] - # Include any other params directly at the top level of scrape_params - for key, value in params.items(): - if key not in ['extract']: - scrape_params[key] = value - - json = params.get("jsonOptions", {}) - if json: - if 'schema' in json and hasattr(json['schema'], 'schema'): - json['schema'] = json['schema'].schema() - scrape_params['jsonOptions'] = json - - change_tracking = 
params.get("changeTrackingOptions", {}) - if change_tracking: - scrape_params['changeTrackingOptions'] = change_tracking - - # Include any other params directly at the top level of scrape_params - for key, value in params.items(): - if key not in ['jsonOptions', 'changeTrackingOptions', 'agent']: - scrape_params[key] = value - - agent = params.get('agent') - if agent: - scrape_params['agent'] = agent - - scrape_params['origin'] = f"python-sdk@{version}" - - endpoint = f'/v1/scrape' - # Make the POST request with the prepared headers and JSON data + # Make request response = requests.post( - f'{self.api_url}{endpoint}', + f'{self.api_url}/v1/scrape', headers=headers, json=scrape_params, - timeout=(scrape_params["timeout"] + 5000 if "timeout" in scrape_params else None), + timeout=(timeout + 5000 if timeout else None) ) + if response.status_code == 200: try: - response = response.json() - except: - raise Exception(f'Failed to parse Firecrawl response as JSON.') - if response['success'] and 'data' in response: - return response['data'] - elif "error" in response: - raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - else: - raise Exception(f'Failed to scrape URL. Error: {response}') + response_json = response.json() + if response_json.get('success') and 'data' in response_json: + return ScrapeResponse(**response_json['data']) + elif "error" in response_json: + raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}') + else: + raise Exception(f'Failed to scrape URL. Error: {response_json}') + except ValueError: + raise Exception('Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, 'scrape URL') @@ -1690,7 +1707,7 @@ class FirecrawlApp: raise Exception(f'Failed to parse Firecrawl response as JSON.') data.extend(status_data.get('data', [])) status_data['data'] = data - return status_data + return CrawlStatusResponse(**status_data) else: raise Exception('Crawl job completed but no data was returned') elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: From 85247991bcc1703ac8690951710c9a0470b90a50 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 18 Apr 2025 00:00:18 -0700 Subject: [PATCH 12/26] generic --- apps/python-sdk/firecrawl/firecrawl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index e7cd45e2..015629c6 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -411,12 +411,12 @@ class ExtractParams(pydantic.BaseModel): show_sources: Optional[bool] = False agent: Optional[Dict[str, Any]] = None -class ExtractResponse(pydantic.BaseModel): +class ExtractResponse(pydantic.BaseModel, Generic[T]): """ Response from the extract operation. 
""" success: bool - data: Optional[Any] = None + data: Optional[T] = None error: Optional[str] = None class FirecrawlApp: From 8c5509cbb4c2e2be6538868c1377451fc631da00 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 00:26:00 -0700 Subject: [PATCH 13/26] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 107 ++++++++++++++++--------- 1 file changed, 71 insertions(+), 36 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index b6f77661..d56f951c 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -558,61 +558,96 @@ class FirecrawlApp: def search( self, query: str, - params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse: + limit: Optional[int] = None, + tbs: Optional[str] = None, + filter: Optional[str] = None, + lang: Optional[str] = None, + country: Optional[str] = None, + location: Optional[str] = None, + timeout: Optional[int] = None, + scrape_options: Optional[CommonOptions] = None, + params: Optional[Union[Dict[str, Any], SearchParams]] = None, + **kwargs) -> SearchResponse: """ Search for content using Firecrawl. Args: - query (str): Search query string - - params (Optional[Union[Dict[str, Any], SearchParams]]): See SearchParams model: - - Search Options: - * limit - Max results (default: 5) - * tbs - Time filter (e.g. "qdr:d") - * filter - Custom result filter - - Localization: - * lang - Language code (default: "en") - * country - Country code (default: "us") - * location - Geo-targeting - - Request Options: - * timeout - Request timeout (ms) - * scrapeOptions - Result scraping config, check ScrapeParams model for more details + query (str): Search query string + limit (Optional[int]): Max results (default: 5) + tbs (Optional[str]): Time filter (e.g. 
"qdr:d") + filter (Optional[str]): Custom result filter + lang (Optional[str]): Language code (default: "en") + country (Optional[str]): Country code (default: "us") + location (Optional[str]): Geo-targeting + timeout (Optional[int]): Request timeout in milliseconds + scrape_options (Optional[CommonOptions]): Result scraping configuration + params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters + **kwargs: Additional keyword arguments for future compatibility Returns: - SearchResponse - + SearchResponse: Response containing: + * success (bool): Whether request succeeded + * data (List[FirecrawlDocument]): Search results + * warning (Optional[str]): Warning message if any + * error (Optional[str]): Error message if any Raises: - Exception: If search fails + Exception: If search fails or response cannot be parsed """ - if params is None: - params = {} + # Build search parameters + search_params = {} + if params: + if isinstance(params, dict): + search_params.update(params) + else: + search_params.update(params.dict(exclude_none=True)) - if isinstance(params, dict): - search_params = SearchParams(query=query, **params) - else: - search_params = params - search_params.query = query + # Add individual parameters + if limit is not None: + search_params['limit'] = limit + if tbs is not None: + search_params['tbs'] = tbs + if filter is not None: + search_params['filter'] = filter + if lang is not None: + search_params['lang'] = lang + if country is not None: + search_params['country'] = country + if location is not None: + search_params['location'] = location + if timeout is not None: + search_params['timeout'] = timeout + if scrape_options is not None: + search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True) + + # Add any additional kwargs + search_params.update(kwargs) - params_dict = search_params.dict(exclude_none=True) + # Create final params object + final_params = SearchParams(query=query, **search_params) + params_dict = final_params.dict(exclude_none=True) params_dict['origin'] = f"python-sdk@{version}" + # Make request response = requests.post( f"{self.api_url}/v1/search", headers={"Authorization": f"Bearer {self.api_key}"}, json=params_dict ) - if response.status_code != 200: - raise Exception(f"Request failed with status code {response.status_code}") - - try: - return response.json() - except: - raise Exception(f'Failed to parse Firecrawl response as JSON.') + if response.status_code == 200: + try: + response_json = response.json() + if response_json.get('success') and 'data' in response_json: + return SearchResponse(**response_json) + elif "error" in response_json: + raise Exception(f'Search failed. Error: {response_json["error"]}') + else: + raise Exception(f'Search failed. 
Error: {response_json}') + except ValueError: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'search') def crawl_url(self, url: str, params: Optional[CrawlParams] = None, From a655d24e7cdcc64e11578f3afa78c4732331b5fe Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 18 Apr 2025 00:29:20 -0700 Subject: [PATCH 14/26] scrape params commentary --- apps/python-sdk/firecrawl/firecrawl.py | 32 ++++++++++++-------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d56f951c..ea99aaa7 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -461,23 +461,21 @@ class FirecrawlApp: Args: url (str): Target URL to scrape - params (Optional[ScrapeParams]): See ScrapeParams model for configuration: - Content Options: - * formats - Content types to retrieve (markdown/html/etc) - * includeTags - HTML tags to include - * excludeTags - HTML tags to exclude - * onlyMainContent - Extract main content only - - Request Options: - * headers - Custom HTTP headers - * timeout - Request timeout (ms) - * mobile - Use mobile user agent - * proxy - Proxy type (basic/stealth) - - Extraction Options: - * extract - Content extraction settings - * jsonOptions - JSON extraction settings - * actions - Actions to perform + formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc) + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait for a specific element to appear + timeout (Optional[int]): Request timeout (ms) + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 images + block_ads (Optional[bool]): Block ads + proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth) + extract (Optional[ExtractConfig]): Content extraction settings + json_options (Optional[ExtractConfig]): JSON extraction settings + actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform Returns: ScrapeResponse with: From 5e6e41ab175207fd2fd0a24b6f8c5406b4826fb5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 00:37:34 -0700 Subject: [PATCH 15/26] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 868 +++++++++++++++++-------- 1 file changed, 611 insertions(+), 257 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d56f951c..1eb5f8e7 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -441,6 +441,7 @@ class FirecrawlApp: def scrape_url( self, url: str, + *, formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, @@ -455,7 +456,8 @@ class FirecrawlApp: proxy: Optional[Literal["basic", "stealth"]] = None, 
extract: Optional[ExtractConfig] = None, json_options: Optional[ExtractConfig] = None, - actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]: + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + **kwargs) -> ScrapeResponse[Any]: """ Scrape and extract content from a URL. @@ -479,6 +481,7 @@ class FirecrawlApp: * jsonOptions - JSON extraction settings * actions - Actions to perform + Returns: ScrapeResponse with: * Requested content formats @@ -532,6 +535,7 @@ class FirecrawlApp: scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) if actions: scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + scrape_params.update(kwargs) # Make request response = requests.post( @@ -558,6 +562,7 @@ class FirecrawlApp: def search( self, query: str, + *, limit: Optional[int] = None, tbs: Optional[str] = None, filter: Optional[str] = None, @@ -649,97 +654,150 @@ class FirecrawlApp: else: self._handle_error(response, 'search') - def crawl_url(self, url: str, - params: Optional[CrawlParams] = None, - poll_interval: Optional[int] = 2, - idempotency_key: Optional[str] = None) -> CrawlStatusResponse: + def crawl_url( + self, + url: str, + *, + include_paths: Optional[List[str]] = None, + exclude_paths: Optional[List[str]] = None, + max_depth: Optional[int] = None, + max_discovery_depth: Optional[int] = None, + limit: Optional[int] = None, + allow_backward_links: Optional[bool] = None, + allow_external_links: Optional[bool] = None, + ignore_sitemap: Optional[bool] = None, + scrape_options: Optional[CommonOptions] = None, + webhook: Optional[Union[str, WebhookConfig]] = None, + deduplicate_similar_urls: Optional[bool] = None, + ignore_query_parameters: Optional[bool] = None, + regex_on_full_url: Optional[bool] = None, + poll_interval: Optional[int] = 2, + idempotency_key: Optional[str] = None, + **kwargs + ) -> CrawlStatusResponse: """ Crawl a website starting from a URL. 
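        Illustrative usage sketch for the keyword-only crawl_url signature introduced above (assumes a configured FirecrawlApp instance `app`; the URL and path pattern are placeholders, not values from this codebase):

            status = app.crawl_url(
                "https://example.com",
                include_paths=["/blog/.*"],   # only follow blog URLs
                limit=10,                     # crawl at most 10 pages
                poll_interval=2,              # seconds between status checks
            )
            # crawl_url blocks until the job finishes and returns a CrawlStatusResponse
            print(status.status, f"{status.completed}/{status.total} pages")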
Args: - url (str): Target URL to start crawling from - params (Optional[CrawlParams]): See CrawlParams model: - URL Discovery: - * includePaths - Patterns of URLs to include - * excludePaths - Patterns of URLs to exclude - * maxDepth - Maximum crawl depth - * maxDiscoveryDepth - Maximum depth for finding new URLs - * limit - Maximum pages to crawl - - Link Following: - * allowBackwardLinks - Follow parent directory links - * allowExternalLinks - Follow external domain links - * ignoreSitemap - Skip sitemap.xml processing - - Advanced: - * scrapeOptions - Page scraping configuration - * webhook - Notification webhook settings - * deduplicateSimilarURLs - Remove similar URLs - * ignoreQueryParameters - Ignore URL parameters - * regexOnFullURL - Apply regex to full URLs - poll_interval (int): Seconds between status checks (default: 2) - idempotency_key (Optional[str]): Unique key to prevent duplicate requests + url (str): Target URL to start crawling from + include_paths (Optional[List[str]]): Patterns of URLs to include + exclude_paths (Optional[List[str]]): Patterns of URLs to exclude + max_depth (Optional[int]): Maximum crawl depth + max_discovery_depth (Optional[int]): Maximum depth for finding new URLs + limit (Optional[int]): Maximum pages to crawl + allow_backward_links (Optional[bool]): Follow parent directory links + allow_external_links (Optional[bool]): Follow external domain links + ignore_sitemap (Optional[bool]): Skip sitemap.xml processing + scrape_options (Optional[CommonOptions]): Page scraping configuration + webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings + deduplicate_similar_urls (Optional[bool]): Remove similar URLs + ignore_query_parameters (Optional[bool]): Ignore URL parameters + regex_on_full_url (Optional[bool]): Apply regex to full URLs + poll_interval (Optional[int]): Seconds between status checks (default: 2) + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - CrawlStatusResponse with: - * Crawling status and progress - * Crawled page contents - * Success/error information + CrawlStatusResponse with: + * Crawling status and progress + * Crawled page contents + * Success/error information Raises: - Exception: If crawl fails + Exception: If crawl fails """ - endpoint = f'/v1/crawl' + crawl_params = {} + + # Add individual parameters + if include_paths is not None: + crawl_params['includePaths'] = include_paths + if exclude_paths is not None: + crawl_params['excludePaths'] = exclude_paths + if max_depth is not None: + crawl_params['maxDepth'] = max_depth + if max_discovery_depth is not None: + crawl_params['maxDiscoveryDepth'] = max_discovery_depth + if limit is not None: + crawl_params['limit'] = limit + if allow_backward_links is not None: + crawl_params['allowBackwardLinks'] = allow_backward_links + if allow_external_links is not None: + crawl_params['allowExternalLinks'] = allow_external_links + if ignore_sitemap is not None: + crawl_params['ignoreSitemap'] = ignore_sitemap + if scrape_options is not None: + crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True) + if webhook is not None: + crawl_params['webhook'] = webhook + if deduplicate_similar_urls is not None: + crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls + if ignore_query_parameters is not None: + crawl_params['ignoreQueryParameters'] = ignore_query_parameters + if regex_on_full_url is not None: + crawl_params['regexOnFullURL'] = regex_on_full_url + + # 
Add any additional kwargs + crawl_params.update(kwargs) + + # Create final params object + final_params = CrawlParams(**crawl_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['url'] = url + params_dict['origin'] = f"python-sdk@{version}" + + # Make request headers = self._prepare_headers(idempotency_key) - json_data = {'url': url} - if params: - json_data.update(params) - json_data['origin'] = f"python-sdk@{version}" - response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers) + if response.status_code == 200: try: id = response.json().get('id') except: raise Exception(f'Failed to parse Firecrawl response as JSON.') return self._monitor_job_status(id, headers, poll_interval) - else: self._handle_error(response, 'start crawl job') - def async_crawl_url( - self, - url: str, - params: Optional[CrawlParams] = None, - idempotency_key: Optional[str] = None) -> CrawlResponse: + self, + url: str, + *, + include_paths: Optional[List[str]] = None, + exclude_paths: Optional[List[str]] = None, + max_depth: Optional[int] = None, + max_discovery_depth: Optional[int] = None, + limit: Optional[int] = None, + allow_backward_links: Optional[bool] = None, + allow_external_links: Optional[bool] = None, + ignore_sitemap: Optional[bool] = None, + scrape_options: Optional[CommonOptions] = None, + webhook: Optional[Union[str, WebhookConfig]] = None, + deduplicate_similar_urls: Optional[bool] = None, + ignore_query_parameters: Optional[bool] = None, + regex_on_full_url: Optional[bool] = None, + idempotency_key: Optional[str] = None, + **kwargs + ) -> CrawlResponse: """ Start an asynchronous crawl job. Args: url (str): Target URL to start crawling from - - params (Optional[CrawlParams]): See CrawlParams model: - - URL Discovery: - * includePaths - Patterns of URLs to include - * excludePaths - Patterns of URLs to exclude - * maxDepth - Maximum crawl depth - * maxDiscoveryDepth - Maximum depth for finding new URLs - * limit - Maximum pages to crawl - - Link Following: - * allowBackwardLinks - Follow parent directory links - * allowExternalLinks - Follow external domain links - * ignoreSitemap - Skip sitemap.xml processing - - Advanced: - * scrapeOptions - Page scraping configuration - * webhook - Notification webhook settings - * deduplicateSimilarURLs - Remove similar URLs - * ignoreQueryParameters - Ignore URL parameters - * regexOnFullURL - Apply regex to full URLs - - idempotency_key: Unique key to prevent duplicate requests + include_paths (Optional[List[str]]): Patterns of URLs to include + exclude_paths (Optional[List[str]]): Patterns of URLs to exclude + max_depth (Optional[int]): Maximum crawl depth + max_discovery_depth (Optional[int]): Maximum depth for finding new URLs + limit (Optional[int]): Maximum pages to crawl + allow_backward_links (Optional[bool]): Follow parent directory links + allow_external_links (Optional[bool]): Follow external domain links + ignore_sitemap (Optional[bool]): Skip sitemap.xml processing + scrape_options (Optional[CommonOptions]): Page scraping configuration + webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings + deduplicate_similar_urls (Optional[bool]): Remove similar URLs + ignore_query_parameters (Optional[bool]): Ignore URL parameters + regex_on_full_url (Optional[bool]): Apply regex to full URLs + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the 
API Returns: CrawlResponse with: @@ -751,16 +809,52 @@ class FirecrawlApp: Raises: Exception: If crawl initiation fails """ - endpoint = f'/v1/crawl' + crawl_params = {} + + # Add individual parameters + if include_paths is not None: + crawl_params['includePaths'] = include_paths + if exclude_paths is not None: + crawl_params['excludePaths'] = exclude_paths + if max_depth is not None: + crawl_params['maxDepth'] = max_depth + if max_discovery_depth is not None: + crawl_params['maxDiscoveryDepth'] = max_discovery_depth + if limit is not None: + crawl_params['limit'] = limit + if allow_backward_links is not None: + crawl_params['allowBackwardLinks'] = allow_backward_links + if allow_external_links is not None: + crawl_params['allowExternalLinks'] = allow_external_links + if ignore_sitemap is not None: + crawl_params['ignoreSitemap'] = ignore_sitemap + if scrape_options is not None: + crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True) + if webhook is not None: + crawl_params['webhook'] = webhook + if deduplicate_similar_urls is not None: + crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls + if ignore_query_parameters is not None: + crawl_params['ignoreQueryParameters'] = ignore_query_parameters + if regex_on_full_url is not None: + crawl_params['regexOnFullURL'] = regex_on_full_url + + # Add any additional kwargs + crawl_params.update(kwargs) + + # Create final params object + final_params = CrawlParams(**crawl_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['url'] = url + params_dict['origin'] = f"python-sdk@{version}" + + # Make request headers = self._prepare_headers(idempotency_key) - json_data = {'url': url} - if params: - json_data.update(params) - json_data['origin'] = f"python-sdk@{version}" - response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers) + if response.status_code == 200: try: - return response.json() + return CrawlResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: @@ -842,10 +936,10 @@ class FirecrawlApp: if 'next' in status_data: response['next'] = status_data['next'] - return { - 'success': False if 'error' in status_data else True, + return CrawlStatusResponse( + success=False if 'error' in status_data else True, **response - } + ) else: self._handle_error(response, 'check crawl status') @@ -872,7 +966,7 @@ class FirecrawlApp: response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers) if response.status_code == 200: try: - return response.json() + return CrawlErrorsResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: @@ -906,254 +1000,519 @@ class FirecrawlApp: def crawl_url_and_watch( self, url: str, - params: Optional[CrawlParams] = None, - idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + *, + include_paths: Optional[List[str]] = None, + exclude_paths: Optional[List[str]] = None, + max_depth: Optional[int] = None, + max_discovery_depth: Optional[int] = None, + limit: Optional[int] = None, + allow_backward_links: Optional[bool] = None, + allow_external_links: Optional[bool] = None, + ignore_sitemap: Optional[bool] = None, + scrape_options: Optional[CommonOptions] = None, + webhook: Optional[Union[str, WebhookConfig]] = None, + deduplicate_similar_urls: Optional[bool] = None, + ignore_query_parameters: Optional[bool] = None, + regex_on_full_url: Optional[bool] = None, + 
idempotency_key: Optional[str] = None, + **kwargs + ) -> 'CrawlWatcher': """ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket. Args: - url (str): Target URL to start crawling from - params (Optional[CrawlParams]): See CrawlParams model for configuration: - URL Discovery: - * includePaths - Patterns of URLs to include - * excludePaths - Patterns of URLs to exclude - * maxDepth - Maximum crawl depth - * maxDiscoveryDepth - Maximum depth for finding new URLs - * limit - Maximum pages to crawl - - Link Following: - * allowBackwardLinks - Follow parent directory links - * allowExternalLinks - Follow external domain links - * ignoreSitemap - Skip sitemap.xml processing - - Advanced: - * scrapeOptions - Page scraping configuration - * webhook - Notification webhook settings - * deduplicateSimilarURLs - Remove similar URLs - * ignoreQueryParameters - Ignore URL parameters - * regexOnFullURL - Apply regex to full URLs - idempotency_key (Optional[str]): Unique key to prevent duplicate requests + url (str): Target URL to start crawling from + include_paths (Optional[List[str]]): Patterns of URLs to include + exclude_paths (Optional[List[str]]): Patterns of URLs to exclude + max_depth (Optional[int]): Maximum crawl depth + max_discovery_depth (Optional[int]): Maximum depth for finding new URLs + limit (Optional[int]): Maximum pages to crawl + allow_backward_links (Optional[bool]): Follow parent directory links + allow_external_links (Optional[bool]): Follow external domain links + ignore_sitemap (Optional[bool]): Skip sitemap.xml processing + scrape_options (Optional[CommonOptions]): Page scraping configuration + webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings + deduplicate_similar_urls (Optional[bool]): Remove similar URLs + ignore_query_parameters (Optional[bool]): Ignore URL parameters + regex_on_full_url (Optional[bool]): Apply regex to full URLs + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket + CrawlWatcher: An instance to monitor the crawl job via WebSocket Raises: - Exception: If crawl job fails to start + Exception: If crawl job fails to start """ - crawl_response = self.async_crawl_url(url, params, idempotency_key) - if crawl_response['success'] and 'id' in crawl_response: - return CrawlWatcher(crawl_response['id'], self) + crawl_response = self.async_crawl_url( + url, + include_paths=include_paths, + exclude_paths=exclude_paths, + max_depth=max_depth, + max_discovery_depth=max_discovery_depth, + limit=limit, + allow_backward_links=allow_backward_links, + allow_external_links=allow_external_links, + ignore_sitemap=ignore_sitemap, + scrape_options=scrape_options, + webhook=webhook, + deduplicate_similar_urls=deduplicate_similar_urls, + ignore_query_parameters=ignore_query_parameters, + regex_on_full_url=regex_on_full_url, + idempotency_key=idempotency_key, + **kwargs + ) + if crawl_response.success and crawl_response.id: + return CrawlWatcher(crawl_response.id, self) else: raise Exception("Crawl job failed to start") def map_url( self, url: str, + *, + search: Optional[str] = None, + ignore_sitemap: Optional[bool] = None, + include_subdomains: Optional[bool] = None, + sitemap_only: Optional[bool] = None, + limit: Optional[int] = None, + timeout: Optional[int] = None, params: Optional[MapParams] = None) -> MapResponse: """ Map and discover links from a URL. 
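        Illustrative usage sketch for the keyword-only map_url signature introduced above (assumes a configured FirecrawlApp instance `app`; the URL and search term are placeholders):

            res = app.map_url(
                "https://example.com",
                search="docs",    # filter discovered URLs
                limit=100,        # cap the number of returned links
            )
            # map_url returns a MapResponse; links holds the discovered URLs
            for link in res.links or []:
                print(link)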
Args: - url: Target URL to map - - params: See MapParams model: - - Discovery Options: - * search - Filter pattern for URLs - * ignoreSitemap - Skip sitemap.xml - * includeSubdomains - Include subdomain links - * sitemapOnly - Only use sitemap.xml - - Limits: - * limit - Max URLs to return - * timeout - Request timeout (ms) + url (str): Target URL to map + search (Optional[str]): Filter pattern for URLs + ignore_sitemap (Optional[bool]): Skip sitemap.xml processing + include_subdomains (Optional[bool]): Include subdomain links + sitemap_only (Optional[bool]): Only use sitemap.xml + limit (Optional[int]): Maximum URLs to return + timeout (Optional[int]): Request timeout in milliseconds + params (Optional[MapParams]): Additional mapping parameters Returns: - MapResponse with: - * Discovered URLs - * Success/error status + MapResponse: Response containing: + * success (bool): Whether request succeeded + * links (List[str]): Discovered URLs + * error (Optional[str]): Error message if any Raises: - Exception: If mapping fails + Exception: If mapping fails or response cannot be parsed """ - endpoint = f'/v1/map' - headers = self._prepare_headers() - - # Prepare the base scrape parameters with the URL - json_data = {'url': url} + # Build map parameters + map_params = {} if params: - json_data.update(params) - json_data['origin'] = f"python-sdk@{version}" + map_params.update(params.dict(exclude_none=True)) - # Make the POST request with the prepared headers and JSON data + # Add individual parameters + if search is not None: + map_params['search'] = search + if ignore_sitemap is not None: + map_params['ignoreSitemap'] = ignore_sitemap + if include_subdomains is not None: + map_params['includeSubdomains'] = include_subdomains + if sitemap_only is not None: + map_params['sitemapOnly'] = sitemap_only + if limit is not None: + map_params['limit'] = limit + if timeout is not None: + map_params['timeout'] = timeout + + # Create final params object + final_params = MapParams(**map_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['url'] = url + params_dict['origin'] = f"python-sdk@{version}" + + # Make request response = requests.post( - f'{self.api_url}{endpoint}', - headers=headers, - json=json_data, + f"{self.api_url}/v1/map", + headers={"Authorization": f"Bearer {self.api_key}"}, + json=params_dict ) + if response.status_code == 200: try: - response = response.json() - except: - raise Exception(f'Failed to parse Firecrawl response as JSON.') - if response['success'] and 'links' in response: - return response - elif 'error' in response: - raise Exception(f'Failed to map URL. Error: {response["error"]}') - else: - raise Exception(f'Failed to map URL. Error: {response}') + response_json = response.json() + if response_json.get('success') and 'links' in response_json: + return MapResponse(**response_json) + elif "error" in response_json: + raise Exception(f'Map failed. Error: {response_json["error"]}') + else: + raise Exception(f'Map failed. 
Error: {response_json}') + except ValueError: + raise Exception('Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, 'map') - def batch_scrape_urls(self, urls: List[str], - params: Optional[ScrapeParams] = None, - poll_interval: Optional[int] = 2, - idempotency_key: Optional[str] = None) -> BatchScrapeStatusResponse: + def batch_scrape_urls( + self, + urls: List[str], + *, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + headers: Optional[Dict[str, str]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + agent: Optional[AgentOptions] = None, + poll_interval: Optional[int] = 2, + idempotency_key: Optional[str] = None, + **kwargs + ) -> BatchScrapeStatusResponse: """ Batch scrape multiple URLs and monitor until completion. Args: urls (List[str]): URLs to scrape - params (Optional[ScrapeParams]): See ScrapeParams model: - Content Options: - * formats - Content formats to retrieve - * includeTags - HTML tags to include - * excludeTags - HTML tags to exclude - * onlyMainContent - Extract main content only - - Request Options: - * headers - Custom HTTP headers - * timeout - Request timeout (ms) - * mobile - Use mobile user agent - * proxy - Proxy type - - Extraction Options: - * extract - Content extraction config - * jsonOptions - JSON extraction config - * actions - Actions to perform + formats (Optional[List[Literal]]): Content formats to retrieve + headers (Optional[Dict[str, str]]): Custom HTTP headers + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait time in milliseconds + timeout (Optional[int]): Request timeout in milliseconds + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 encoded images + block_ads (Optional[bool]): Block advertisements + proxy (Optional[Literal]): Proxy type to use + extract (Optional[ExtractConfig]): Content extraction config + json_options (Optional[ExtractConfig]): JSON extraction config + actions (Optional[List[Union]]): Actions to perform + agent (Optional[AgentOptions]): Agent configuration + poll_interval (Optional[int]): Seconds between status checks (default: 2) + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - BatchScrapeStatusResponse with: - * Scraping status and progress - * Scraped content for each URL - * Success/error information + BatchScrapeStatusResponse with: + * Scraping status and progress + * Scraped content for each URL + * 
Success/error information Raises: - Exception: If batch scrape fails + Exception: If batch scrape fails """ - endpoint = f'/v1/batch/scrape' + scrape_params = {} + + # Add individual parameters + if formats is not None: + scrape_params['formats'] = formats + if headers is not None: + scrape_params['headers'] = headers + if include_tags is not None: + scrape_params['includeTags'] = include_tags + if exclude_tags is not None: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for is not None: + scrape_params['waitFor'] = wait_for + if timeout is not None: + scrape_params['timeout'] = timeout + if location is not None: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy is not None: + scrape_params['proxy'] = proxy + if extract is not None: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options is not None: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if actions is not None: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + if agent is not None: + scrape_params['agent'] = agent.dict(exclude_none=True) + + # Add any additional kwargs + scrape_params.update(kwargs) + + # Create final params object + final_params = ScrapeParams(**scrape_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['urls'] = urls + params_dict['origin'] = f"python-sdk@{version}" + + # Make request headers = self._prepare_headers(idempotency_key) - json_data = {'urls': urls} - if params: - json_data.update(params) - json_data['origin'] = f"python-sdk@{version}" - response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers) + if response.status_code == 200: try: id = response.json().get('id') except: raise Exception(f'Failed to parse Firecrawl response as JSON.') return self._monitor_job_status(id, headers, poll_interval) - else: self._handle_error(response, 'start batch scrape job') - def async_batch_scrape_urls( - self, - urls: List[str], - params: Optional[ScrapeParams] = None, - idempotency_key: Optional[str] = None) -> BatchScrapeResponse: + self, + urls: List[str], + *, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + headers: Optional[Dict[str, str]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: 
Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + agent: Optional[AgentOptions] = None, + idempotency_key: Optional[str] = None, + **kwargs + ) -> BatchScrapeResponse: """ Initiate a batch scrape job asynchronously. Args: - urls (List[str]): List of URLs to scrape - params (Optional[ScrapeParams]): See ScrapeParams model for configuration: - Content Options: - * formats - Content formats to retrieve - * includeTags - HTML tags to include - * excludeTags - HTML tags to exclude - * onlyMainContent - Extract main content only - - Request Options: - * headers - Custom HTTP headers - * timeout - Request timeout (ms) - * mobile - Use mobile user agent - * proxy - Proxy type - - Extraction Options: - * extract - Content extraction config - * jsonOptions - JSON extraction config - * actions - Actions to perform - idempotency_key (Optional[str]): Unique key to prevent duplicate requests + urls (List[str]): URLs to scrape + formats (Optional[List[Literal]]): Content formats to retrieve + headers (Optional[Dict[str, str]]): Custom HTTP headers + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait time in milliseconds + timeout (Optional[int]): Request timeout in milliseconds + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 encoded images + block_ads (Optional[bool]): Block advertisements + proxy (Optional[Literal]): Proxy type to use + extract (Optional[ExtractConfig]): Content extraction config + json_options (Optional[ExtractConfig]): JSON extraction config + actions (Optional[List[Union]]): Actions to perform + agent (Optional[AgentOptions]): Agent configuration + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - BatchScrapeResponse with: - * success - Whether job started successfully - * id - Unique identifier for the job - * url - Status check URL - * error - Error message if start failed + BatchScrapeResponse with: + * success - Whether job started successfully + * id - Unique identifier for the job + * url - Status check URL + * error - Error message if start failed Raises: - Exception: If job initiation fails + Exception: If job initiation fails """ - endpoint = f'/v1/batch/scrape' + scrape_params = {} + + # Add individual parameters + if formats is not None: + scrape_params['formats'] = formats + if headers is not None: + scrape_params['headers'] = headers + if include_tags is not None: + scrape_params['includeTags'] = include_tags + if exclude_tags is not None: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for is not None: + scrape_params['waitFor'] = wait_for + if timeout is not None: + scrape_params['timeout'] = timeout + if location is not None: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + 
scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy is not None: + scrape_params['proxy'] = proxy + if extract is not None: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options is not None: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if actions is not None: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + if agent is not None: + scrape_params['agent'] = agent.dict(exclude_none=True) + + # Add any additional kwargs + scrape_params.update(kwargs) + + # Create final params object + final_params = ScrapeParams(**scrape_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['urls'] = urls + params_dict['origin'] = f"python-sdk@{version}" + + # Make request headers = self._prepare_headers(idempotency_key) - json_data = {'urls': urls} - if params: - json_data.update(params) - json_data['origin'] = f"python-sdk@{version}" - response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers) + if response.status_code == 200: try: - return response.json() + return BatchScrapeResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, 'start batch scrape job') def batch_scrape_urls_and_watch( - self, - urls: List[str], - params: Optional[ScrapeParams] = None, - idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + self, + urls: List[str], + *, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + headers: Optional[Dict[str, str]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + agent: Optional[AgentOptions] = None, + idempotency_key: Optional[str] = None, + **kwargs + ) -> 'CrawlWatcher': """ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket. 
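        Illustrative usage sketch for the keyword-only watcher signature introduced above (assumes a configured FirecrawlApp instance `app`; the URLs are placeholders, and consuming events depends on the CrawlWatcher interface, which is not shown in this hunk):

            watcher = app.batch_scrape_urls_and_watch(
                ["https://example.com/a", "https://example.com/b"],
                formats=["markdown"],
                only_main_content=True,
            )
            # watcher monitors the batch scrape job over WebSocket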
Args: - urls (List[str]): List of URLs to scrape - params (Optional[ScrapeParams]): See ScrapeParams model for configuration: - - Content Options: - * formats - Content formats to retrieve - * includeTags - HTML tags to include - * excludeTags - HTML tags to exclude - * onlyMainContent - Extract main content only - - Request Options: - * headers - Custom HTTP headers - * timeout - Request timeout (ms) - * mobile - Use mobile user agent - * proxy - Proxy type - - Extraction Options: - * extract - Content extraction config - * jsonOptions - JSON extraction config - * actions - Actions to perform + urls (List[str]): URLs to scrape + formats (Optional[List[Literal]]): Content formats to retrieve + headers (Optional[Dict[str, str]]): Custom HTTP headers + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait time in milliseconds + timeout (Optional[int]): Request timeout in milliseconds + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 encoded images + block_ads (Optional[bool]): Block advertisements + proxy (Optional[Literal]): Proxy type to use + extract (Optional[ExtractConfig]): Content extraction config + json_options (Optional[ExtractConfig]): JSON extraction config + actions (Optional[List[Union]]): Actions to perform + agent (Optional[AgentOptions]): Agent configuration idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket + CrawlWatcher: An instance to monitor the batch scrape job via WebSocket Raises: Exception: If batch scrape job fails to start """ - crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key) - if crawl_response['success'] and 'id' in crawl_response: - return CrawlWatcher(crawl_response['id'], self) + scrape_params = {} + + # Add individual parameters + if formats is not None: + scrape_params['formats'] = formats + if headers is not None: + scrape_params['headers'] = headers + if include_tags is not None: + scrape_params['includeTags'] = include_tags + if exclude_tags is not None: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for is not None: + scrape_params['waitFor'] = wait_for + if timeout is not None: + scrape_params['timeout'] = timeout + if location is not None: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy is not None: + scrape_params['proxy'] = proxy + if extract is not None: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options is not None: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = 
json_options.dict(exclude_none=True) + if actions is not None: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + if agent is not None: + scrape_params['agent'] = agent.dict(exclude_none=True) + + # Add any additional kwargs + scrape_params.update(kwargs) + + # Create final params object + final_params = ScrapeParams(**scrape_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['urls'] = urls + params_dict['origin'] = f"python-sdk@{version}" + + # Make request + headers = self._prepare_headers(idempotency_key) + response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers) + + if response.status_code == 200: + try: + crawl_response = BatchScrapeResponse(**response.json()) + if crawl_response.success and crawl_response.id: + return CrawlWatcher(crawl_response.id, self) + else: + raise Exception("Batch scrape job failed to start") + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') else: - raise Exception("Batch scrape job failed to start") + self._handle_error(response, 'start batch scrape job') def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse: """ @@ -1203,25 +1562,17 @@ class FirecrawlApp: break status_data['data'] = data - response = { + return BatchScrapeStatusResponse(**{ + 'success': False if 'error' in status_data else True, 'status': status_data.get('status'), 'total': status_data.get('total'), 'completed': status_data.get('completed'), 'creditsUsed': status_data.get('creditsUsed'), 'expiresAt': status_data.get('expiresAt'), - 'data': status_data.get('data') - } - - if 'error' in status_data: - response['error'] = status_data['error'] - - if 'next' in status_data: - response['next'] = status_data['next'] - - return { - 'success': False if 'error' in status_data else True, - **response - } + 'data': status_data.get('data'), + 'next': status_data.get('next'), + 'error': status_data.get('error') + }) else: self._handle_error(response, 'check batch scrape status') @@ -1230,7 +1581,7 @@ class FirecrawlApp: Returns information about batch scrape errors. Args: - id (str): The ID of the crawl job. + id (str): The ID of the crawl job. 
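        Illustrative sketch for inspecting a batch scrape job (the job id is a placeholder; the error-checking method name is assumed from the /v1/batch/scrape/{id}/errors endpoint used below):

            status = app.check_batch_scrape_status(job_id)   # BatchScrapeStatusResponse
            print(status.status, f"{status.completed}/{status.total}")
            errors = app.check_batch_scrape_errors(job_id)   # assumed method name; returns CrawlErrorsResponse
            for err in errors.errors:
                print(err["url"], err["error"])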
Returns: CrawlErrorsResponse: A response containing: @@ -1240,12 +1591,15 @@ class FirecrawlApp: * url (str): URL that caused the error * error (str): Error message * robotsBlocked (List[str]): List of URLs blocked by robots.txt + + Raises: + Exception: If the error check request fails """ headers = self._prepare_headers() response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers) if response.status_code == 200: try: - return response.json() + return CrawlErrorsResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: From d8792d2301cb8d5fff2228a01294d69b2b32035c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 00:48:07 -0700 Subject: [PATCH 16/26] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 359 +++++++++++++------------ 1 file changed, 190 insertions(+), 169 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 1eb5f8e7..82ff9606 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -1608,47 +1608,45 @@ class FirecrawlApp: def extract( self, urls: Optional[List[str]] = None, - params: Optional[ExtractParams] = None) -> ExtractResponse[Any]: + *, + prompt: Optional[str] = None, + schema_: Optional[Any] = None, + system_prompt: Optional[str] = None, + allow_external_links: Optional[bool] = False, + enable_web_search: Optional[bool] = False, + show_sources: Optional[bool] = False, + agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]: """ Extract structured information from URLs. Args: - urls: URLs to extract from - - params: See ExtractParams model: - - Extraction Config: - * prompt - Custom extraction prompt - * schema - JSON schema/Pydantic model - * systemPrompt - System context - - Behavior Options: - * allowExternalLinks - Follow external links - * enableWebSearch - Enable web search - * includeSubdomains - Include subdomains - * showSources - Include source URLs - - Scraping Options: - * scrapeOptions - Page scraping config + urls (Optional[List[str]]): URLs to extract from + prompt (Optional[str]): Custom extraction prompt + schema_ (Optional[Any]): JSON schema/Pydantic model + system_prompt (Optional[str]): System context + allow_external_links (Optional[bool]): Follow external links + enable_web_search (Optional[bool]): Enable web search + show_sources (Optional[bool]): Include source URLs + agent (Optional[Dict[str, Any]]): Agent configuration Returns: - ExtractResponse with: - * Structured data matching schema - * Source information if requested - * Success/error status + ExtractResponse[Any] with: + * success (bool): Whether request succeeded + * data (Optional[Any]): Extracted data matching schema + * error (Optional[str]): Error message if any Raises: ValueError: If prompt/schema missing or extraction fails """ headers = self._prepare_headers() - if not params or (not params.get('prompt') and not params.get('schema')): + if not prompt and not schema_: raise ValueError("Either prompt or schema is required") - if not urls and not params.get('prompt'): + if not urls and not prompt: raise ValueError("Either urls or prompt is required") - schema = params.get('schema') + schema = schema_ if schema: if hasattr(schema, 'model_json_schema'): # Convert Pydantic model to JSON schema @@ -1656,26 +1654,22 @@ class FirecrawlApp: # Otherwise assume it's already a JSON schema dict request_data = { - 'urls': urls, - 'allowExternalLinks': params.get('allow_external_links', 
params.get('allowExternalLinks', False)), - 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)), - 'showSources': params.get('show_sources', params.get('showSources', False)), + 'urls': urls or [], + 'allowExternalLinks': allow_external_links, + 'enableWebSearch': enable_web_search, + 'showSources': show_sources, 'schema': schema, 'origin': f'python-sdk@{get_version()}' } - if not request_data['urls']: - request_data['urls'] = [] # Only add prompt and systemPrompt if they exist - if params.get('prompt'): - request_data['prompt'] = params['prompt'] - if params.get('system_prompt'): - request_data['systemPrompt'] = params['system_prompt'] - elif params.get('systemPrompt'): # Check legacy field name - request_data['systemPrompt'] = params['systemPrompt'] + if prompt: + request_data['prompt'] = prompt + if system_prompt: + request_data['systemPrompt'] = system_prompt - if params.get('agent'): - request_data['agent'] = params['agent'] + if agent: + request_data['agent'] = agent try: # Send the initial extract request @@ -1706,7 +1700,7 @@ class FirecrawlApp: except: raise Exception(f'Failed to parse Firecrawl response as JSON.') if status_data['status'] == 'completed': - return status_data + return ExtractResponse(**status_data) elif status_data['status'] in ['failed', 'cancelled']: raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}') else: @@ -1720,7 +1714,7 @@ class FirecrawlApp: except Exception as e: raise ValueError(str(e), 500) - return {'success': False, 'error': "Internal server error."} + return ExtractResponse(success=False, error="Internal server error.") def get_extract_status(self, job_id: str) -> ExtractResponse[Any]: """ @@ -1740,7 +1734,7 @@ class FirecrawlApp: response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers) if response.status_code == 200: try: - return response.json() + return ExtractResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: @@ -1751,60 +1745,68 @@ class FirecrawlApp: def async_extract( self, urls: List[str], - params: Optional[ExtractParams] = None, + *, + prompt: Optional[str] = None, + schema_: Optional[Any] = None, + system_prompt: Optional[str] = None, + allow_external_links: Optional[bool] = False, + enable_web_search: Optional[bool] = False, + show_sources: Optional[bool] = False, + agent: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: """ Initiate an asynchronous extract job. 
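        Illustrative usage sketch for the keyword-only extract API refactored above (assumes a configured FirecrawlApp instance `app`; the schema model, prompt and URL are placeholders); async_extract below accepts the same keywords plus idempotency_key:

            from pydantic import BaseModel

            class Product(BaseModel):
                name: str
                price: str

            result = app.extract(
                urls=["https://example.com/product"],
                prompt="Extract the product name and price",
                schema_=Product,   # Pydantic models are converted via model_json_schema()
            )
            if result.success:
                print(result.data)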
Args: urls (List[str]): URLs to extract information from - params (Optional[ExtractParams]): See ExtractParams model: - Extraction Config: - * prompt - Custom extraction prompt - * schema - JSON schema/Pydantic model - * systemPrompt - System context - - Behavior Options: - * allowExternalLinks - Follow external links - * enableWebSearch - Enable web search - * includeSubdomains - Include subdomains - * showSources - Include source URLs - - Scraping Options: - * scrapeOptions - Page scraping config + prompt (Optional[str]): Custom extraction prompt + schema_ (Optional[Any]): JSON schema/Pydantic model + system_prompt (Optional[str]): System context + allow_external_links (Optional[bool]): Follow external links + enable_web_search (Optional[bool]): Enable web search + show_sources (Optional[bool]): Include source URLs + agent (Optional[Dict[str, Any]]): Agent configuration idempotency_key (Optional[str]): Unique key to prevent duplicate requests Returns: - ExtractResponse containing: - * success (bool): Whether job started successfully - * id (str): Unique identifier for the job - * error (str, optional): Error message if start failed + ExtractResponse[Any] with: + * success (bool): Whether request succeeded + * data (Optional[Any]): Extracted data matching schema + * error (Optional[str]): Error message if any Raises: - ValueError: If job initiation fails + ValueError: If job initiation fails """ headers = self._prepare_headers(idempotency_key) - schema = params.get('schema') if params else None + schema = schema_ if schema: if hasattr(schema, 'model_json_schema'): # Convert Pydantic model to JSON schema schema = schema.model_json_schema() # Otherwise assume it's already a JSON schema dict - jsonData = {'urls': urls, **(params or {})} request_data = { - **jsonData, - 'allowExternalLinks': params.get('allow_external_links', False) if params else False, + 'urls': urls, + 'allowExternalLinks': allow_external_links, + 'enableWebSearch': enable_web_search, + 'showSources': show_sources, 'schema': schema, 'origin': f'python-sdk@{version}' } + if prompt: + request_data['prompt'] = prompt + if system_prompt: + request_data['systemPrompt'] = system_prompt + if agent: + request_data['agent'] = agent + try: response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers) if response.status_code == 200: try: - return response.json() + return ExtractResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: @@ -1815,41 +1817,36 @@ class FirecrawlApp: def generate_llms_text( self, url: str, - params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse: + *, + max_urls: Optional[int] = None, + show_full_text: Optional[bool] = None, + experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse: """ Generate LLMs.txt for a given URL and poll until completion. 
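        Illustrative usage sketch for the keyword-only generate_llms_text signature introduced above (assumes a configured FirecrawlApp instance `app`; the URL is a placeholder):

            result = app.generate_llms_text(
                "https://example.com",
                max_urls=5,            # process at most 5 URLs
                show_full_text=True,   # also request the llmsfulltxt variant
            )
            # the returned status payload carries the generated llmstxt content
            # (and llmsfulltxt when show_full_text is set), as described below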
Args: - url: Target URL to generate LLMs.txt from - - params: See GenerateLLMsTextParams model: - params: See GenerateLLMsTextParams model: - - params: See GenerateLLMsTextParams model: - - Generation Options: - * maxUrls - Maximum URLs to process (default: 10) - * showFullText - Include full text in output (default: False) + url (str): Target URL to generate LLMs.txt from + max_urls (Optional[int]): Maximum URLs to process (default: 10) + show_full_text (Optional[bool]): Include full text in output (default: False) + experimental_stream (Optional[bool]): Enable experimental streaming Returns: - GenerateLLMsTextStatusResponse with: - * Generated LLMs.txt content - * Full version if requested - * Generation status - * Success/error information + GenerateLLMsTextStatusResponse with: + * Generated LLMs.txt content + * Full version if requested + * Generation status + * Success/error information Raises: - Exception: If generation fails + Exception: If generation fails """ - if params is None: - params = {} + params = GenerateLLMsTextParams( + maxUrls=max_urls, + showFullText=show_full_text, + __experimental_stream=experimental_stream + ) - if isinstance(params, dict): - generation_params = GenerateLLMsTextParams(**params) - else: - generation_params = params - - response = self.async_generate_llms_text(url, generation_params) + response = self.async_generate_llms_text(url, params) if not response.get('success') or 'id' not in response: return response @@ -1871,35 +1868,36 @@ class FirecrawlApp: def async_generate_llms_text( self, url: str, - params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse: + *, + max_urls: Optional[int] = None, + show_full_text: Optional[bool] = None, + experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse: """ Initiate an asynchronous LLMs.txt generation operation. Args: - url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL. - params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Generation configuration parameters: - * maxUrls (int, optional): Maximum number of URLs to process (default: 10) - * showFullText (bool, optional): Include full text in output (default: False) + url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL. + max_urls (Optional[int]): Maximum URLs to process (default: 10) + show_full_text (Optional[bool]): Include full text in output (default: False) + experimental_stream (Optional[bool]): Enable experimental streaming Returns: - GenerateLLMsTextResponse: A response containing: - - success (bool): Whether the generation initiation was successful - - id (str): The unique identifier for the generation job - - error (str, optional): Error message if initiation failed + GenerateLLMsTextResponse: A response containing: + * success (bool): Whether the generation initiation was successful + * id (str): The unique identifier for the generation job + * error (str, optional): Error message if initiation failed Raises: - Exception: If the generation job initiation fails. + Exception: If the generation job initiation fails. 
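        Illustrative sketch of starting a generation job without polling (the URL is a placeholder; dict-style access mirrors how generate_llms_text above consumes this response):

            job = app.async_generate_llms_text("https://example.com", max_urls=5)
            if job.get('success'):
                print("LLMs.txt generation job id:", job['id'])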
""" - if params is None: - params = {} - - if isinstance(params, dict): - generation_params = GenerateLLMsTextParams(**params) - else: - generation_params = params + params = GenerateLLMsTextParams( + maxUrls=max_urls, + showFullText=show_full_text, + __experimental_stream=experimental_stream + ) headers = self._prepare_headers() - json_data = {'url': url, **generation_params.dict(exclude_none=True)} + json_data = {'url': url, **params.dict(exclude_none=True)} json_data['origin'] = f"python-sdk@{version}" try: @@ -1921,20 +1919,20 @@ class FirecrawlApp: Check the status of a LLMs.txt generation operation. Args: - id (str): The unique identifier of the LLMs.txt generation job to check status for. + id (str): The unique identifier of the LLMs.txt generation job to check status for. Returns: - GenerateLLMsTextStatusResponse: A response containing: - * success (bool): Whether the generation was successful - * status (str): Status of generation ("processing", "completed", "failed") - * data (Dict[str, str], optional): Generated text with fields: - * llmstxt (str): Generated LLMs.txt content - * llmsfulltxt (str, optional): Full version if requested - * error (str, optional): Error message if generation failed - * expiresAt (str): When the generated data expires + GenerateLLMsTextStatusResponse: A response containing: + * success (bool): Whether the generation was successful + * status (str): Status of generation ("processing", "completed", "failed") + * data (Dict[str, str], optional): Generated text with fields: + * llmstxt (str): Generated LLMs.txt content + * llmsfulltxt (str, optional): Full version if requested + * error (str, optional): Error message if generation failed + * expiresAt (str): When the generated data expires Raises: - Exception: If the status check fails. + Exception: If the status check fails. """ headers = self._prepare_headers() try: @@ -2172,52 +2170,57 @@ class FirecrawlApp: def deep_research( self, query: str, - params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None, + *, + max_depth: Optional[int] = None, + time_limit: Optional[int] = None, + max_urls: Optional[int] = None, + analysis_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, + __experimental_stream_steps: Optional[bool] = None, on_activity: Optional[Callable[[Dict[str, Any]], None]] = None, on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse: """ Initiates a deep research operation on a given query and polls until completion. 
Args: - query: Research query or topic to investigate - - params: See DeepResearchParams model: - Research Settings: - * maxDepth - Maximum research depth (default: 7) - * timeLimit - Time limit in seconds (default: 270) - * maxUrls - Maximum URLs to process (default: 20) - - Callbacks: - * on_activity - Progress callback receiving: - {type, status, message, timestamp, depth} - * on_source - Source discovery callback receiving: - {url, title, description} + query (str): Research query or topic to investigate + max_depth (Optional[int]): Maximum depth of research exploration + time_limit (Optional[int]): Time limit in seconds for research + max_urls (Optional[int]): Maximum number of URLs to process + analysis_prompt (Optional[str]): Custom prompt for analysis + system_prompt (Optional[str]): Custom system prompt + __experimental_stream_steps (Optional[bool]): Enable experimental streaming + on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth} + on_source (Optional[Callable]): Source discovery callback receiving {url, title, description} Returns: - DeepResearchResponse containing: - - Status: - * success - Whether research completed successfully - * status - Current state (processing/completed/failed) - * error - Error message if failed - - Results: - * id - Unique identifier for the research job - * data - Research findings and analysis - * sources - List of discovered sources - * activities - Research progress log - * summaries - Generated research summaries + DeepResearchStatusResponse containing: + * success (bool): Whether research completed successfully + * status (str): Current state (processing/completed/failed) + * error (Optional[str]): Error message if failed + * id (str): Unique identifier for the research job + * data (Any): Research findings and analysis + * sources (List[Dict]): List of discovered sources + * activities (List[Dict]): Research progress log + * summaries (List[str]): Generated research summaries Raises: - Exception: If research fails + Exception: If research fails """ - if params is None: - params = {} - - if isinstance(params, dict): - research_params = DeepResearchParams(**params) - else: - research_params = params + research_params = {} + if max_depth is not None: + research_params['maxDepth'] = max_depth + if time_limit is not None: + research_params['timeLimit'] = time_limit + if max_urls is not None: + research_params['maxUrls'] = max_urls + if analysis_prompt is not None: + research_params['analysisPrompt'] = analysis_prompt + if system_prompt is not None: + research_params['systemPrompt'] = system_prompt + if __experimental_stream_steps is not None: + research_params['__experimental_streamSteps'] = __experimental_stream_steps + research_params = DeepResearchParams(**research_params) response = self.async_deep_research(query, research_params) if not response.get('success') or 'id' not in response: @@ -2253,19 +2256,30 @@ class FirecrawlApp: return {'success': False, 'error': 'Deep research job terminated unexpectedly'} - def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]: + def async_deep_research( + self, + query: str, + *, + max_depth: Optional[int] = None, + time_limit: Optional[int] = None, + max_urls: Optional[int] = None, + analysis_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, + __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]: """ Initiates an asynchronous deep research 
operation. Args: - query (str): The research query to investigate. Should be a clear, specific question or topic. - params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Research configuration parameters: - * maxDepth (int, optional): Maximum depth of research exploration (default: 7) - * timeLimit (int, optional): Time limit in seconds for research (default: 270) - * maxUrls (int, optional): Maximum number of URLs to process (default: 20) + query (str): Research query or topic to investigate + max_depth (Optional[int]): Maximum depth of research exploration + time_limit (Optional[int]): Time limit in seconds for research + max_urls (Optional[int]): Maximum number of URLs to process + analysis_prompt (Optional[str]): Custom prompt for analysis + system_prompt (Optional[str]): Custom system prompt + __experimental_stream_steps (Optional[bool]): Enable experimental streaming Returns: - DeepResearchResponse: A response containing: + Dict[str, Any]: A response containing: * success (bool): Whether the research initiation was successful * id (str): The unique identifier for the research job * error (str, optional): Error message if initiation failed @@ -2273,13 +2287,20 @@ class FirecrawlApp: Raises: Exception: If the research initiation fails. """ - if params is None: - params = {} - - if isinstance(params, dict): - research_params = DeepResearchParams(**params) - else: - research_params = params + research_params = {} + if max_depth is not None: + research_params['maxDepth'] = max_depth + if time_limit is not None: + research_params['timeLimit'] = time_limit + if max_urls is not None: + research_params['maxUrls'] = max_urls + if analysis_prompt is not None: + research_params['analysisPrompt'] = analysis_prompt + if system_prompt is not None: + research_params['systemPrompt'] = system_prompt + if __experimental_stream_steps is not None: + research_params['__experimental_streamSteps'] = __experimental_stream_steps + research_params = DeepResearchParams(**research_params) headers = self._prepare_headers() From 390f3d44a344b822b8fc31bc8a01f1cbead0c1dd Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 00:51:06 -0700 Subject: [PATCH 17/26] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 82ff9606..f0cc707c 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -402,7 +402,7 @@ class ExtractParams(pydantic.BaseModel): Parameters for the extract operation. 
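Before the extract-related changes below, a usage sketch for the keyword-based deep_research call refactored above (illustrative only; it assumes the callback payloads are plain dicts with the fields listed in the docstring):

import os
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

def on_activity(activity):
    # Progress events: {type, status, message, timestamp, depth}
    print(f"[{activity.get('type')}] {activity.get('message')}")

def on_source(source):
    # Discovered sources: {url, title, description}
    print(f"source: {source.get('url')}")

results = app.deep_research(
    "What are the trade-offs between SQLite and PostgreSQL?",
    max_depth=3,
    time_limit=120,
    max_urls=10,
    on_activity=on_activity,
    on_source=on_source,
)
print(results)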
""" prompt: Optional[str] = None - schema_: Optional[Any] = pydantic.Field(None, alias='schema') + schema: Optional[Any] = pydantic.Field(None, alias='schema') system_prompt: Optional[str] = None allow_external_links: Optional[bool] = False enable_web_search: Optional[bool] = False @@ -1610,7 +1610,7 @@ class FirecrawlApp: urls: Optional[List[str]] = None, *, prompt: Optional[str] = None, - schema_: Optional[Any] = None, + schema: Optional[Any] = None, system_prompt: Optional[str] = None, allow_external_links: Optional[bool] = False, enable_web_search: Optional[bool] = False, @@ -1622,7 +1622,7 @@ class FirecrawlApp: Args: urls (Optional[List[str]]): URLs to extract from prompt (Optional[str]): Custom extraction prompt - schema_ (Optional[Any]): JSON schema/Pydantic model + schema (Optional[Any]): JSON schema/Pydantic model system_prompt (Optional[str]): System context allow_external_links (Optional[bool]): Follow external links enable_web_search (Optional[bool]): Enable web search @@ -1640,13 +1640,12 @@ class FirecrawlApp: """ headers = self._prepare_headers() - if not prompt and not schema_: + if not prompt and not schema: raise ValueError("Either prompt or schema is required") if not urls and not prompt: raise ValueError("Either urls or prompt is required") - schema = schema_ if schema: if hasattr(schema, 'model_json_schema'): # Convert Pydantic model to JSON schema @@ -1747,7 +1746,7 @@ class FirecrawlApp: urls: List[str], *, prompt: Optional[str] = None, - schema_: Optional[Any] = None, + schema: Optional[Any] = None, system_prompt: Optional[str] = None, allow_external_links: Optional[bool] = False, enable_web_search: Optional[bool] = False, @@ -1760,7 +1759,7 @@ class FirecrawlApp: Args: urls (List[str]): URLs to extract information from prompt (Optional[str]): Custom extraction prompt - schema_ (Optional[Any]): JSON schema/Pydantic model + schema (Optional[Any]): JSON schema/Pydantic model system_prompt (Optional[str]): System context allow_external_links (Optional[bool]): Follow external links enable_web_search (Optional[bool]): Enable web search @@ -1779,7 +1778,7 @@ class FirecrawlApp: """ headers = self._prepare_headers(idempotency_key) - schema = schema_ + schema = schema if schema: if hasattr(schema, 'model_json_schema'): # Convert Pydantic model to JSON schema From 1aa0c092e05bde9d867a6a5fe54175797fa37c2f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 01:01:01 -0700 Subject: [PATCH 18/26] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 70 +++++++++++++------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index f0cc707c..117ca093 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -49,40 +49,40 @@ logger : logging.Logger = logging.getLogger("firecrawl") T = TypeVar('T') -class FirecrawlDocumentMetadata(pydantic.BaseModel): - """Metadata for a Firecrawl document.""" - title: Optional[str] = None - description: Optional[str] = None - language: Optional[str] = None - keywords: Optional[str] = None - robots: Optional[str] = None - ogTitle: Optional[str] = None - ogDescription: Optional[str] = None - ogUrl: Optional[str] = None - ogImage: Optional[str] = None - ogAudio: Optional[str] = None - ogDeterminer: Optional[str] = None - ogLocale: Optional[str] = None - ogLocaleAlternate: Optional[List[str]] = None - ogSiteName: Optional[str] = None - ogVideo: Optional[str] = None - dctermsCreated: 
Optional[str] = None - dcDateCreated: Optional[str] = None - dcDate: Optional[str] = None - dctermsType: Optional[str] = None - dcType: Optional[str] = None - dctermsAudience: Optional[str] = None - dctermsSubject: Optional[str] = None - dcSubject: Optional[str] = None - dcDescription: Optional[str] = None - dctermsKeywords: Optional[str] = None - modifiedTime: Optional[str] = None - publishedTime: Optional[str] = None - articleTag: Optional[str] = None - articleSection: Optional[str] = None - sourceURL: Optional[str] = None - statusCode: Optional[int] = None - error: Optional[str] = None +# class FirecrawlDocumentMetadata(pydantic.BaseModel): +# """Metadata for a Firecrawl document.""" +# title: Optional[str] = None +# description: Optional[str] = None +# language: Optional[str] = None +# keywords: Optional[str] = None +# robots: Optional[str] = None +# ogTitle: Optional[str] = None +# ogDescription: Optional[str] = None +# ogUrl: Optional[str] = None +# ogImage: Optional[str] = None +# ogAudio: Optional[str] = None +# ogDeterminer: Optional[str] = None +# ogLocale: Optional[str] = None +# ogLocaleAlternate: Optional[List[str]] = None +# ogSiteName: Optional[str] = None +# ogVideo: Optional[str] = None +# dctermsCreated: Optional[str] = None +# dcDateCreated: Optional[str] = None +# dcDate: Optional[str] = None +# dctermsType: Optional[str] = None +# dcType: Optional[str] = None +# dctermsAudience: Optional[str] = None +# dctermsSubject: Optional[str] = None +# dcSubject: Optional[str] = None +# dcDescription: Optional[str] = None +# dctermsKeywords: Optional[str] = None +# modifiedTime: Optional[str] = None +# publishedTime: Optional[str] = None +# articleTag: Optional[str] = None +# articleSection: Optional[str] = None +# sourceURL: Optional[str] = None +# statusCode: Optional[int] = None +# error: Optional[str] = None class AgentOptions(pydantic.BaseModel): """Configuration for the agent.""" @@ -107,7 +107,7 @@ class FirecrawlDocument(pydantic.BaseModel, Generic[T]): extract: Optional[T] = None json: Optional[T] = None screenshot: Optional[str] = None - metadata: Optional[FirecrawlDocumentMetadata] = None + metadata: Optional[Any] = None actions: Optional[ActionsResult] = None title: Optional[str] = None # v1 search only description: Optional[str] = None # v1 search only From 8cd82b5600620a13dba590722271c2bd1a2d1475 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 18 Apr 2025 01:06:58 -0700 Subject: [PATCH 19/26] async scrape --- apps/python-sdk/firecrawl/firecrawl.py | 115 +++++++++++++++++-------- 1 file changed, 81 insertions(+), 34 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index ea99aaa7..f66b25d5 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -2308,29 +2308,41 @@ class AsyncFirecrawlApp(FirecrawlApp): async def scrape_url( self, url: str, - params: Optional[ScrapeParams] = None) -> ScrapeResponse[Any]: + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] 
= None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]: """ - Asynchronously scrape and extract content from a URL. + Scrape and extract content from a URL asynchronously. Args: - url (str): Target URL to scrape - params (Optional[ScrapeParams]): See ScrapeParams model for configuration: - Content Options: - * formats - Content types to retrieve (markdown/html/etc) - * includeTags - HTML tags to include - * excludeTags - HTML tags to exclude - * onlyMainContent - Extract main content only - - Request Options: - * headers - Custom HTTP headers - * timeout - Request timeout (ms) - * mobile - Use mobile user agent - * proxy - Proxy type (basic/stealth) - - Extraction Options: - * extract - Content extraction settings - * jsonOptions - JSON extraction settings - * actions - Actions to perform + url (str): Target URL to scrape + formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc) + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait for a specific element to appear + timeout (Optional[int]): Request timeout (ms) + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 images + block_ads (Optional[bool]): Block ads + proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth) + extract (Optional[ExtractConfig]): Content extraction settings + json_options (Optional[ExtractConfig]): JSON extraction settings + actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform Returns: ScrapeResponse with: @@ -2340,35 +2352,70 @@ class AsyncFirecrawlApp(FirecrawlApp): * Success/error status Raises: - Exception: If scraping fails + Exception: If scraping fails """ headers = self._prepare_headers() - scrape_params = {'url': url, 'origin': f'python-sdk@{version}'} - if params: - extract = params.get('extract', {}) - if extract: - if 'schema' in extract and hasattr(extract['schema'], 'schema'): - extract['schema'] = extract['schema'].schema() - scrape_params['extract'] = extract + # Build scrape parameters + scrape_params = { + 'url': url, + 'origin': f"python-sdk@{version}" + } - for key, value in params.items(): - if key not in ['extract']: - scrape_params[key] = value + # Add optional parameters if provided and not None + if formats: + scrape_params['formats'] = formats + if include_tags: + scrape_params['includeTags'] = include_tags + if exclude_tags: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for: + scrape_params['waitFor'] = wait_for + if timeout: + scrape_params['timeout'] = timeout + if location: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + 
scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy: + scrape_params['proxy'] = proxy + if extract: + extract_dict = extract.dict(exclude_none=True) + if 'schema' in extract_dict and hasattr(extract.schema, 'schema'): + extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted + scrape_params['extract'] = extract_dict + if json_options: + json_options_dict = json_options.dict(exclude_none=True) + if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'): + json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted + scrape_params['jsonOptions'] = json_options_dict + if actions: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + # Make async request endpoint = f'/v1/scrape' response = await self._async_post_request( f'{self.api_url}{endpoint}', scrape_params, headers ) - + if response.get('success') and 'data' in response: - return response['data'] + return ScrapeResponse(**response['data']) elif "error" in response: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') else: - raise Exception(f'Failed to scrape URL. Error: {response}') + # Use the response content directly if possible, otherwise a generic message + error_content = response.get('error', str(response)) + raise Exception(f'Failed to scrape URL. Error: {error_content}') async def batch_scrape_urls( self, From 0b62be58745d8775daf2c2ffa1806f356315db54 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 01:12:24 -0700 Subject: [PATCH 20/26] Update firecrawl.py --- apps/python-sdk/firecrawl/firecrawl.py | 400 +++++++++++++++---------- 1 file changed, 234 insertions(+), 166 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 117ca093..5a3a80d0 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -3356,12 +3356,12 @@ class AsyncFirecrawlApp(FirecrawlApp): job_id (str): The ID of the extraction job Returns: - ExtractResponse containing: - * success (bool): Whether extraction completed successfully - * data (Any): Extracted structured data - * error (str, optional): Error message if extraction failed - * warning (str, optional): Warning message if any - * sources (List[str], optional): Source URLs if requested + ExtractResponse[Any] with: + * success (bool): Whether request succeeded + * data (Optional[Any]): Extracted data matching schema + * error (Optional[str]): Error message if any + * warning (Optional[str]): Warning message if any + * sources (Optional[List[str]]): Source URLs if requested Raises: ValueError: If status check fails @@ -3377,54 +3377,67 @@ class AsyncFirecrawlApp(FirecrawlApp): async def async_extract( self, - urls: List[str], - params: Optional[ExtractParams] = None, + urls: Optional[List[str]] = None, + *, + prompt: Optional[str] = None, + schema: Optional[Any] = None, + system_prompt: Optional[str] = None, + allow_external_links: Optional[bool] = False, + enable_web_search: Optional[bool] = False, + show_sources: Optional[bool] = False, + agent: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: """ Initiate an asynchronous extraction job without 
waiting for completion. Args: - urls (List[str]): URLs to extract information from - params (Optional[ExtractParams]): See ExtractParams model: - Extraction Config: - * prompt - Custom extraction prompt - * schema - JSON schema/Pydantic model - * systemPrompt - System context - - Behavior Options: - * allowExternalLinks - Follow external links - * enableWebSearch - Enable web search - * includeSubdomains - Include subdomains - * showSources - Include source URLs - - Scraping Options: - * scrapeOptions - Page scraping config + urls (Optional[List[str]]): URLs to extract from + prompt (Optional[str]): Custom extraction prompt + schema (Optional[Any]): JSON schema/Pydantic model + system_prompt (Optional[str]): System context + allow_external_links (Optional[bool]): Follow external links + enable_web_search (Optional[bool]): Enable web search + show_sources (Optional[bool]): Include source URLs + agent (Optional[Dict[str, Any]]): Agent configuration idempotency_key (Optional[str]): Unique key to prevent duplicate requests Returns: - ExtractResponse containing: - * success (bool): Whether job started successfully - * id (str): Unique identifier for the job - * error (str, optional): Error message if start failed + ExtractResponse[Any] with: + * success (bool): Whether request succeeded + * data (Optional[Any]): Extracted data matching schema + * error (Optional[str]): Error message if any Raises: - ValueError: If job initiation fails + ValueError: If job initiation fails """ headers = self._prepare_headers(idempotency_key) - - schema = params.get('schema') if params else None + + if not prompt and not schema: + raise ValueError("Either prompt or schema is required") + + if not urls and not prompt: + raise ValueError("Either urls or prompt is required") + if schema: if hasattr(schema, 'model_json_schema'): schema = schema.model_json_schema() - jsonData = {'urls': urls, **(params or {})} request_data = { - **jsonData, - 'allowExternalLinks': params.get('allow_external_links', False) if params else False, + 'urls': urls or [], + 'allowExternalLinks': allow_external_links, + 'enableWebSearch': enable_web_search, + 'showSources': show_sources, 'schema': schema, 'origin': f'python-sdk@{version}' } + if prompt: + request_data['prompt'] = prompt + if system_prompt: + request_data['systemPrompt'] = system_prompt + if agent: + request_data['agent'] = agent + try: return await self._async_post_request( f'{self.api_url}/v1/extract', @@ -3437,16 +3450,18 @@ class AsyncFirecrawlApp(FirecrawlApp): async def generate_llms_text( self, url: str, - params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse: + *, + max_urls: Optional[int] = None, + show_full_text: Optional[bool] = None, + experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse: """ Generate LLMs.txt for a given URL and monitor until completion. 
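The asynchronous client follows the same fire-and-forget pattern for extraction. A sketch of that flow (illustrative; the get_extract_status name and the dict-style response access are assumptions based on this version of the SDK):

import asyncio
import os
from pydantic import BaseModel
from firecrawl.firecrawl import AsyncFirecrawlApp

class PageSummary(BaseModel):
    title: str
    summary: str

async def main():
    app = AsyncFirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

    # Start an extraction job, then check on it separately.
    job = await app.async_extract(
        ["https://docs.firecrawl.dev"],
        prompt="Summarise what this page is about",
        schema=PageSummary.model_json_schema(),
    )
    if job.get('success') and 'id' in job:
        status = await app.get_extract_status(job['id'])
        print(status)

asyncio.run(main())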
        Args:
            url (str): Target URL to generate LLMs.txt from
-            params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): See GenerateLLMsTextParams model:
-              Generation Options:
-                * maxUrls - Maximum URLs to process (default: 10)
-                * showFullText - Include full text in output (default: False)
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming

        Returns:
            GenerateLLMsTextStatusResponse containing:
            * success (bool): Whether generation completed successfully
            * status (str): Status of generation (processing/completed/failed)
            * data (Dict[str, str], optional): Generated text with fields:
                * llmstxt (str): Generated LLMs.txt content
                * llmsfulltxt (str, optional): Full version if requested
            * error (str, optional): Error message if generation failed
            * expiresAt (str): When the generated data expires

        Raises:
            Exception: If generation fails
        """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            generation_params = GenerateLLMsTextParams(**params)
-        else:
-            generation_params = params
-
-        response = await self.async_generate_llms_text(url, generation_params)
+        response = await self.async_generate_llms_text(
+            url,
+            max_urls=max_urls,
+            show_full_text=show_full_text,
+            experimental_stream=experimental_stream
+        )

        if not response.get('success') or 'id' not in response:
            return response
@@ -3491,36 +3506,38 @@ class AsyncFirecrawlApp(FirecrawlApp):
    async def async_generate_llms_text(
        self,
        url: str,
-        params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse:
+        *,
+        max_urls: Optional[int] = None,
+        show_full_text: Optional[bool] = None,
+        experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
        """
        Initiate an asynchronous LLMs.txt generation job without waiting for completion.

        Args:
-            url (str): Target URL to generate LLMs.txt from
-            params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): See GenerateLLMsTextParams model:
-              Generation Options:
-                * maxUrls - Maximum URLs to process (default: 10)
-                * showFullText - Include full text in output (default: False)
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming

        Returns:
-            GenerateLLMsTextResponse containing:
-            * success (bool): Whether job started successfully
-            * id (str): Unique identifier for the job
-            * error (str, optional): Error message if start failed
+            GenerateLLMsTextResponse containing:
+            * success (bool): Whether job started successfully
+            * id (str): Unique identifier for the job
+            * error (str, optional): Error message if start failed

        Raises:
-            ValueError: If job initiation fails
+            ValueError: If job initiation fails
        """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            generation_params = GenerateLLMsTextParams(**params)
-        else:
-            generation_params = params
+        params = {}
+        if max_urls is not None:
+            params['maxUrls'] = max_urls
+        if show_full_text is not None:
+            params['showFullText'] = show_full_text
+        if experimental_stream is not None:
+            params['__experimental_stream'] = experimental_stream

        headers = self._prepare_headers()
-        json_data = {'url': url, **generation_params.dict(exclude_none=True)}
+        json_data = {'url': url, **params}
        json_data['origin'] = f"python-sdk@{version}"

        try:
@@ -3564,52 +3581,57 @@ class AsyncFirecrawlApp(FirecrawlApp):
    async def deep_research(
        self,
        query: str,
-        params: Optional[Union[Dict[str, Any],
DeepResearchParams]] = None, + *, + max_depth: Optional[int] = None, + time_limit: Optional[int] = None, + max_urls: Optional[int] = None, + analysis_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, + __experimental_stream_steps: Optional[bool] = None, on_activity: Optional[Callable[[Dict[str, Any]], None]] = None, on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse: """ - Initiates a deep research operation on a given query and polls until completion, providing real-time updates via callbacks. + Initiates a deep research operation on a given query and polls until completion. Args: - query: Research query or topic to investigate - - params: See DeepResearchParams model: - Research Settings: - * maxDepth - Maximum research depth (default: 7) - * timeLimit - Time limit in seconds (default: 270) - * maxUrls - Maximum URLs to process (default: 20) - - Callbacks: - * on_activity - Progress callback receiving: - {type, status, message, timestamp, depth} - * on_source - Source discovery callback receiving: - {url, title, description} + query (str): Research query or topic to investigate + max_depth (Optional[int]): Maximum depth of research exploration + time_limit (Optional[int]): Time limit in seconds for research + max_urls (Optional[int]): Maximum number of URLs to process + analysis_prompt (Optional[str]): Custom prompt for analysis + system_prompt (Optional[str]): Custom system prompt + __experimental_stream_steps (Optional[bool]): Enable experimental streaming + on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth} + on_source (Optional[Callable]): Source discovery callback receiving {url, title, description} Returns: - DeepResearchResponse containing: - - Status: - * success - Whether research completed successfully - * status - Current state (processing/completed/failed) - * error - Error message if failed - - Results: - * id - Unique identifier for the research job - * data - Research findings and analysis - * sources - List of discovered sources - * activities - Research progress log - * summaries - Generated research summaries + DeepResearchStatusResponse containing: + * success (bool): Whether research completed successfully + * status (str): Current state (processing/completed/failed) + * error (Optional[str]): Error message if failed + * id (str): Unique identifier for the research job + * data (Any): Research findings and analysis + * sources (List[Dict]): List of discovered sources + * activities (List[Dict]): Research progress log + * summaries (List[str]): Generated research summaries Raises: - Exception: If research fails + Exception: If research fails """ - if params is None: - params = {} - - if isinstance(params, dict): - research_params = DeepResearchParams(**params) - else: - research_params = params + research_params = {} + if max_depth is not None: + research_params['maxDepth'] = max_depth + if time_limit is not None: + research_params['timeLimit'] = time_limit + if max_urls is not None: + research_params['maxUrls'] = max_urls + if analysis_prompt is not None: + research_params['analysisPrompt'] = analysis_prompt + if system_prompt is not None: + research_params['systemPrompt'] = system_prompt + if __experimental_stream_steps is not None: + research_params['__experimental_streamSteps'] = __experimental_stream_steps + research_params = DeepResearchParams(**research_params) response = await self.async_deep_research(query, research_params) if not 
response.get('success') or 'id' not in response: @@ -3648,38 +3670,54 @@ class AsyncFirecrawlApp(FirecrawlApp): async def async_deep_research( self, query: str, - params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> DeepResearchResponse: + *, + max_depth: Optional[int] = None, + time_limit: Optional[int] = None, + max_urls: Optional[int] = None, + analysis_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, + __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]: """ - Initiate an asynchronous deep research job without waiting for completion. + Initiates an asynchronous deep research operation. Args: query (str): Research query or topic to investigate - params (Optional[Union[Dict[str, Any], DeepResearchParams]]): See DeepResearchParams model: - Research Settings: - * maxDepth - Maximum research depth (default: 7) - * timeLimit - Time limit in seconds (default: 270) - * maxUrls - Maximum URLs to process (default: 20) + max_depth (Optional[int]): Maximum depth of research exploration + time_limit (Optional[int]): Time limit in seconds for research + max_urls (Optional[int]): Maximum number of URLs to process + analysis_prompt (Optional[str]): Custom prompt for analysis + system_prompt (Optional[str]): Custom system prompt + __experimental_stream_steps (Optional[bool]): Enable experimental streaming Returns: - DeepResearchResponse containing: - * success (bool): Whether job started successfully - * id (str): Unique identifier for the job - * error (str, optional): Error message if start failed + Dict[str, Any]: A response containing: + * success (bool): Whether the research initiation was successful + * id (str): The unique identifier for the research job + * error (str, optional): Error message if initiation failed Raises: - ValueError: If job initiation fails + Exception: If the research initiation fails. """ - if params is None: - params = {} - - if isinstance(params, dict): - research_params = DeepResearchParams(**params) - else: - research_params = params + research_params = {} + if max_depth is not None: + research_params['maxDepth'] = max_depth + if time_limit is not None: + research_params['timeLimit'] = time_limit + if max_urls is not None: + research_params['maxUrls'] = max_urls + if analysis_prompt is not None: + research_params['analysisPrompt'] = analysis_prompt + if system_prompt is not None: + research_params['systemPrompt'] = system_prompt + if __experimental_stream_steps is not None: + research_params['__experimental_streamSteps'] = __experimental_stream_steps + research_params = DeepResearchParams(**research_params) headers = self._prepare_headers() + json_data = {'query': query, **research_params.dict(exclude_none=True)} json_data['origin'] = f"python-sdk@{version}" + try: return await self._async_post_request( f'{self.api_url}/v1/deep-research', @@ -3691,26 +3729,28 @@ class AsyncFirecrawlApp(FirecrawlApp): async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse: """ - Check the status of an asynchronous deep research job. + Check the status of a deep research operation. Args: - id (str): The ID of the research job + id (str): The ID of the deep research operation. 
Returns: - DeepResearchStatusResponse containing: - * success (bool): Whether research completed successfully - * status (str): Current state (processing/completed/failed) - * data (Dict[str, Any], optional): Research findings and analysis - * error (str, optional): Error message if failed - * expiresAt (str): When the research data expires - * currentDepth (int): Current research depth - * maxDepth (int): Maximum research depth - * activities (List[Dict[str, Any]]): Research progress log - * sources (List[Dict[str, Any]]): Discovered sources - * summaries (List[str]): Generated research summaries + DeepResearchResponse containing: + + Status: + * success - Whether research completed successfully + * status - Current state (processing/completed/failed) + * error - Error message if failed + + Results: + * id - Unique identifier for the research job + * data - Research findings and analysis + * sources - List of discovered sources + * activities - Research progress log + * summaries - Generated research summaries Raises: - ValueError: If status check fails + Exception: If the status check fails. """ headers = self._prepare_headers() try: @@ -3724,52 +3764,80 @@ class AsyncFirecrawlApp(FirecrawlApp): async def search( self, query: str, - params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse: + *, + limit: Optional[int] = None, + tbs: Optional[str] = None, + filter: Optional[str] = None, + lang: Optional[str] = None, + country: Optional[str] = None, + location: Optional[str] = None, + timeout: Optional[int] = None, + scrape_options: Optional[CommonOptions] = None, + params: Optional[Union[Dict[str, Any], SearchParams]] = None, + **kwargs) -> SearchResponse: """ Asynchronously search for content using Firecrawl. Args: - query (str): Search query string - params (Optional[Union[Dict[str, Any], SearchParams]]): See SearchParams model: - Search Options: - * limit - Max results (default: 5) - * tbs - Time filter (e.g. "qdr:d") - * filter - Custom result filter - - Localization: - * lang - Language code (default: "en") - * country - Country code (default: "us") - * location - Geo-targeting - - Request Options: - * timeout - Request timeout (ms) - * scrapeOptions - Result scraping config + query (str): Search query string + limit (Optional[int]): Max results (default: 5) + tbs (Optional[str]): Time filter (e.g. 
"qdr:d") + filter (Optional[str]): Custom result filter + lang (Optional[str]): Language code (default: "en") + country (Optional[str]): Country code (default: "us") + location (Optional[str]): Geo-targeting + timeout (Optional[int]): Request timeout in milliseconds + scrape_options (Optional[CommonOptions]): Result scraping configuration + params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters + **kwargs: Additional keyword arguments for future compatibility Returns: - SearchResponse containing: - * success (bool): Whether search completed successfully - * data (List[FirecrawlDocument]): Search results - * warning (str, optional): Warning message if any - * error (str, optional): Error message if search failed + SearchResponse: Response containing: + * success (bool): Whether request succeeded + * data (List[FirecrawlDocument]): Search results + * warning (Optional[str]): Warning message if any + * error (Optional[str]): Error message if any Raises: - Exception: If search fails + Exception: If search fails or response cannot be parsed """ - if params is None: - params = {} + # Build search parameters + search_params = {} + if params: + if isinstance(params, dict): + search_params.update(params) + else: + search_params.update(params.dict(exclude_none=True)) - if isinstance(params, dict): - search_params = SearchParams(query=query, **params) - else: - search_params = params - search_params.query = query + # Add individual parameters + if limit is not None: + search_params['limit'] = limit + if tbs is not None: + search_params['tbs'] = tbs + if filter is not None: + search_params['filter'] = filter + if lang is not None: + search_params['lang'] = lang + if country is not None: + search_params['country'] = country + if location is not None: + search_params['location'] = location + if timeout is not None: + search_params['timeout'] = timeout + if scrape_options is not None: + search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True) + + # Add any additional kwargs + search_params.update(kwargs) - search_params_dict = search_params.dict(exclude_none=True) - search_params_dict['origin'] = f"python-sdk@{version}" + # Create final params object + final_params = SearchParams(query=query, **search_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['origin'] = f"python-sdk@{version}" return await self._async_post_request( f"{self.api_url}/v1/search", - search_params_dict, + params_dict, {"Authorization": f"Bearer {self.api_key}"} ) From f3522666db5d2339b5567c7a537dd064704c76c2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 01:13:53 -0700 Subject: [PATCH 21/26] Nick: new examples --- apps/python-sdk/example.py | 79 ++++++++++++++------------------------ 1 file changed, 29 insertions(+), 50 deletions(-) diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index ae4258f7..705d2e0c 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -1,53 +1,45 @@ -import time -import nest_asyncio -import uuid -from firecrawl.firecrawl import FirecrawlApp +from firecrawl.firecrawl import ExtractConfig, FirecrawlApp from pydantic import BaseModel, Field from typing import List +import time +app = FirecrawlApp(api_url="https://api.firecrawl.dev") -app = FirecrawlApp(api_key="fc-") - -# Scrape a website: -scrape_result = app.scrape_url('firecrawl.dev') -print(scrape_result['markdown']) +# # Scrape a website: +scrape_result = app.scrape_url('example.com', formats=["markdown", "html"]) 
+print(scrape_result.markdown) -# Test batch scrape +# # Test batch scrapeq urls = ['https://example.com', 'https://docs.firecrawl.dev'] -batch_scrape_params = { - 'formats': ['markdown', 'html'], -} - # Synchronous batch scrape -batch_result = app.batch_scrape_urls(urls, batch_scrape_params) +batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"]) print("Synchronous Batch Scrape Result:") -print(batch_result['data'][0]['markdown']) +print(batch_result.data[0].markdown) -# Asynchronous batch scrape -async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params) +# # Asynchronous batch scrape +async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"]) print("\nAsynchronous Batch Scrape Result:") print(async_batch_result) # Crawl a website: -idempotency_key = str(uuid.uuid4()) # optional idempotency key -crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key) -print(crawl_result) +crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*']) +print(crawl_result.data[0].markdown) -# Asynchronous Crawl a website: -async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "") +# # Asynchronous Crawl a website: +async_result = app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*']) print(async_result) -crawl_status = app.check_crawl_status(async_result['id']) +crawl_status = app.check_crawl_status(async_result.id) print(crawl_status) attempts = 15 -while attempts > 0 and crawl_status['status'] != 'completed': +while attempts > 0 and crawl_status.status != 'completed': print(crawl_status) - crawl_status = app.check_crawl_status(async_result['id']) + crawl_status = app.check_crawl_status(async_result.id) attempts -= 1 time.sleep(1) -crawl_status = app.check_crawl_status(async_result['id']) +crawl_status = app.check_crawl_status(async_result.id) print(crawl_status) # LLM Extraction: @@ -61,14 +53,11 @@ class ArticleSchema(BaseModel): class TopArticlesSchema(BaseModel): top: List[ArticleSchema] = Field(..., description="Top 5 stories") -llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { - 'formats': ['extract'], - 'extract': { - 'schema': TopArticlesSchema.model_json_schema() - } -}) +extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema()) -print(llm_extraction_result['extract']) +llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config) + +print(llm_extraction_result.extract) # # Define schema to extract contents into using json schema json_schema = { @@ -94,24 +83,16 @@ json_schema = { "required": ["top"] } -app2 = FirecrawlApp(api_key="fc-", version="v0") +extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True}) +llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config) - -llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', { - 'extractorOptions': { - 'extractionSchema': json_schema, - 'mode': 'llm-extraction' - }, - 'pageOptions':{ - 'onlyMainContent': True - } -}) +print(llm_extraction_result.extract) # print(llm_extraction_result['llm_extraction']) # Map a website: -map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' }) +map_result = app.map_url('https://firecrawl.dev', search="blog") print(map_result) # Extract URLs: @@ -124,14 +105,12 @@ class ExtractSchema(BaseModel): extract_schema = 
ExtractSchema.schema() # Perform the extraction -extract_result = app.extract(['https://firecrawl.dev'], { - 'prompt': "Extract the title, description, and links from the website", - 'schema': extract_schema -}) +extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema) print(extract_result) # Crawl a website with WebSockets: # inside an async function... +import nest_asyncio nest_asyncio.apply() # Define event handlers From a3f31682127d89a7c6260d75b1522407065227ab Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 01:15:14 -0700 Subject: [PATCH 22/26] Nick: python sdk 2.0 --- apps/python-sdk/example_async.py | 48 +++++++++++---------------- apps/python-sdk/firecrawl/__init__.py | 2 +- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/apps/python-sdk/example_async.py b/apps/python-sdk/example_async.py index d5251515..c554d695 100644 --- a/apps/python-sdk/example_async.py +++ b/apps/python-sdk/example_async.py @@ -6,51 +6,47 @@ from firecrawl.firecrawl import AsyncFirecrawlApp from pydantic import BaseModel, Field from typing import List -app = AsyncFirecrawlApp(api_key="fc-") +app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev") async def example_scrape(): # Scrape a website: - scrape_result = await app.scrape_url('firecrawl.dev') - print(scrape_result['markdown']) + scrape_result = await app.scrape_url('example.com', formats=["markdown", "html"]) + print(scrape_result.markdown) async def example_batch_scrape(): # Batch scrape urls = ['https://example.com', 'https://docs.firecrawl.dev'] - batch_scrape_params = { - 'formats': ['markdown', 'html'], - } # Synchronous batch scrape - batch_result = await app.batch_scrape_urls(urls, batch_scrape_params) + batch_result = await app.batch_scrape_urls(urls, formats=["markdown", "html"]) print("Synchronous Batch Scrape Result:") - print(batch_result['data'][0]['markdown']) + print(batch_result.data[0].markdown) # Asynchronous batch scrape - async_batch_result = await app.async_batch_scrape_urls(urls, batch_scrape_params) + async_batch_result = await app.async_batch_scrape_urls(urls, formats=["markdown", "html"]) print("\nAsynchronous Batch Scrape Result:") print(async_batch_result) async def example_crawl(): # Crawl a website: - idempotency_key = str(uuid.uuid4()) # optional idempotency key - crawl_result = await app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key) - print(crawl_result) + crawl_result = await app.crawl_url('firecrawl.dev', exclude_paths=['blog/*']) + print(crawl_result.data[0].markdown) # Asynchronous Crawl a website: - async_result = await app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "") + async_result = await app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*']) print(async_result) - crawl_status = await app.check_crawl_status(async_result['id']) + crawl_status = await app.check_crawl_status(async_result.id) print(crawl_status) attempts = 15 - while attempts > 0 and crawl_status['status'] != 'completed': + while attempts > 0 and crawl_status.status != 'completed': print(crawl_status) - crawl_status = await app.check_crawl_status(async_result['id']) + crawl_status = await app.check_crawl_status(async_result.id) attempts -= 1 await asyncio.sleep(1) # Use async sleep instead of time.sleep - crawl_status = await app.check_crawl_status(async_result['id']) + crawl_status = await app.check_crawl_status(async_result.id) print(crawl_status) async def 
example_llm_extraction(): @@ -64,18 +60,15 @@ async def example_llm_extraction(): class TopArticlesSchema(BaseModel): top: List[ArticleSchema] = Field(..., description="Top 5 stories") - llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', { - 'formats': ['extract'], - 'extract': { - 'schema': TopArticlesSchema.model_json_schema() - } - }) + extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema()) - print(llm_extraction_result['extract']) + llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config) + + print(llm_extraction_result.extract) async def example_map_and_extract(): # Map a website: - map_result = await app.map_url('https://firecrawl.dev', { 'search': 'blog' }) + map_result = await app.map_url('https://firecrawl.dev', search="blog") print(map_result) # Extract URLs: @@ -88,10 +81,7 @@ async def example_map_and_extract(): extract_schema = ExtractSchema.schema() # Perform the extraction - extract_result = await app.extract(['https://firecrawl.dev'], { - 'prompt': "Extract the title, description, and links from the website", - 'schema': extract_schema - }) + extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema) print(extract_result) # Define event handlers for websocket diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index c30ba0fb..10431768 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.17.0" +__version__ = "2.0.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 0915db515c70535d8956ddef421fd7433e85223e Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 18 Apr 2025 01:20:16 -0700 Subject: [PATCH 23/26] async functions --- apps/python-sdk/firecrawl/firecrawl.py | 597 +++++++++++++++++-------- 1 file changed, 422 insertions(+), 175 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 0f7964e1..d622a7ce 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -2792,225 +2792,472 @@ class AsyncFirecrawlApp(FirecrawlApp): raise Exception(f'Failed to scrape URL. 
Error: {error_content}') async def batch_scrape_urls( - self, - urls: List[str], - params: Optional[ScrapeParams] = None) -> BatchScrapeStatusResponse: + self, + urls: List[str], + *, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + headers: Optional[Dict[str, str]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + agent: Optional[AgentOptions] = None, + poll_interval: Optional[int] = 2, + idempotency_key: Optional[str] = None, + **kwargs + ) -> BatchScrapeStatusResponse: """ Asynchronously scrape multiple URLs and monitor until completion. Args: urls (List[str]): URLs to scrape - params (Optional[ScrapeParams]): See ScrapeParams model: - Content Options: - * formats - Content formats to retrieve - * includeTags - HTML tags to include - * excludeTags - HTML tags to exclude - * onlyMainContent - Extract main content only - - Request Options: - * headers - Custom HTTP headers - * timeout - Request timeout (ms) - * mobile - Use mobile user agent - * proxy - Proxy type - - Extraction Options: - * extract - Content extraction config - * jsonOptions - JSON extraction config - * actions - Actions to perform + formats (Optional[List[Literal]]): Content formats to retrieve + headers (Optional[Dict[str, str]]): Custom HTTP headers + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait time in milliseconds + timeout (Optional[int]): Request timeout in milliseconds + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 encoded images + block_ads (Optional[bool]): Block advertisements + proxy (Optional[Literal]): Proxy type to use + extract (Optional[ExtractConfig]): Content extraction config + json_options (Optional[ExtractConfig]): JSON extraction config + actions (Optional[List[Union]]): Actions to perform + agent (Optional[AgentOptions]): Agent configuration + poll_interval (Optional[int]): Seconds between status checks (default: 2) + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - BatchScrapeStatusResponse with: - * Scraping status and progress - * Scraped content for each URL - * Success/error information + BatchScrapeStatusResponse with: + * Scraping status and progress + * Scraped content for each URL + * Success/error information Raises: - Exception: If batch scrape fails + Exception: If batch scrape fails """ - headers = self._prepare_headers() - json_data = {'urls': urls} - if params: - 
json_data.update(params) - json_data['origin'] = f"python-sdk@{version}" + scrape_params = {} - endpoint = f'/v1/batch/scrape' + # Add individual parameters + if formats is not None: + scrape_params['formats'] = formats + if headers is not None: + scrape_params['headers'] = headers + if include_tags is not None: + scrape_params['includeTags'] = include_tags + if exclude_tags is not None: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for is not None: + scrape_params['waitFor'] = wait_for + if timeout is not None: + scrape_params['timeout'] = timeout + if location is not None: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy is not None: + scrape_params['proxy'] = proxy + if extract is not None: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options is not None: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if actions is not None: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + if agent is not None: + scrape_params['agent'] = agent.dict(exclude_none=True) + + # Add any additional kwargs + scrape_params.update(kwargs) + + # Create final params object + final_params = ScrapeParams(**scrape_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['urls'] = urls + params_dict['origin'] = f"python-sdk@{version}" + + # Make request + headers = self._prepare_headers(idempotency_key) response = await self._async_post_request( - f'{self.api_url}{endpoint}', - json_data, + f'{self.api_url}/v1/batch/scrape', + params_dict, headers ) - if response.get('success') and 'id' in response: - return await self._async_monitor_job_status(response['id'], headers) + if response.status_code == 200: + try: + id = response.json().get('id') + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') + return self._monitor_job_status(id, headers, poll_interval) else: - raise Exception(f'Failed to start batch scrape. 
Error: {response.get("error")}')
+            self._handle_error(response, 'start batch scrape job')
+
     async def async_batch_scrape_urls(
-            self,
-            urls: List[str],
-            params: Optional[ScrapeParams] = None,
-            idempotency_key: Optional[str] = None) -> BatchScrapeResponse:
+        self,
+        urls: List[str],
+        *,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        headers: Optional[Dict[str, str]] = None,
+        include_tags: Optional[List[str]] = None,
+        exclude_tags: Optional[List[str]] = None,
+        only_main_content: Optional[bool] = None,
+        wait_for: Optional[int] = None,
+        timeout: Optional[int] = None,
+        location: Optional[LocationConfig] = None,
+        mobile: Optional[bool] = None,
+        skip_tls_verification: Optional[bool] = None,
+        remove_base64_images: Optional[bool] = None,
+        block_ads: Optional[bool] = None,
+        proxy: Optional[Literal["basic", "stealth"]] = None,
+        extract: Optional[ExtractConfig] = None,
+        json_options: Optional[ExtractConfig] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        agent: Optional[AgentOptions] = None,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> BatchScrapeResponse:
         """
-        Initiate an asynchronous batch scrape job without waiting for completion.
+        Initiate a batch scrape job asynchronously.
 
         Args:
-            urls (List[str]): List of URLs to scrape
-            params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
-                Content Options:
-                  * formats - Content formats to retrieve
-                  * includeTags - HTML tags to include
-                  * excludeTags - HTML tags to exclude
-                  * onlyMainContent - Extract main content only
-
-                Request Options:
-                  * headers - Custom HTTP headers
-                  * timeout - Request timeout (ms)
-                  * mobile - Use mobile user agent
-                  * proxy - Proxy type
-
-                Extraction Options:
-                  * extract - Content extraction config
-                  * jsonOptions - JSON extraction config
-                  * actions - Actions to perform
-            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            urls (List[str]): URLs to scrape
+            formats (Optional[List[Literal]]): Content formats to retrieve
+            headers (Optional[Dict[str, str]]): Custom HTTP headers
+            include_tags (Optional[List[str]]): HTML tags to include
+            exclude_tags (Optional[List[str]]): HTML tags to exclude
+            only_main_content (Optional[bool]): Extract main content only
+            wait_for (Optional[int]): Wait time in milliseconds
+            timeout (Optional[int]): Request timeout in milliseconds
+            location (Optional[LocationConfig]): Location configuration
+            mobile (Optional[bool]): Use mobile user agent
+            skip_tls_verification (Optional[bool]): Skip TLS verification
+            remove_base64_images (Optional[bool]): Remove base64 encoded images
+            block_ads (Optional[bool]): Block advertisements
+            proxy (Optional[Literal]): Proxy type to use
+            extract (Optional[ExtractConfig]): Content extraction config
+            json_options (Optional[ExtractConfig]): JSON extraction config
+            actions (Optional[List[Union]]): Actions to perform
+            agent (Optional[AgentOptions]): Agent configuration
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
 
         Returns:
-            BatchScrapeResponse with:
-            * success - Whether job started successfully
-            * id - Unique identifier for the job
-            * url - Status check URL
-            * error - Error message if start failed
+            BatchScrapeResponse with:
+            * success - Whether job started successfully
+            * id - Unique identifier for the job
+            * url - Status check URL
+            * error - Error message if start failed
 
         Raises:
-            Exception: If job initiation fails
+            Exception: If job initiation fails
         """
-        headers = self._prepare_headers(idempotency_key)
-        json_data = {'urls': urls}
-        if params:
-            json_data.update(params)
-        json_data['origin'] = f"python-sdk@{version}"
+        scrape_params = {}
 
-        endpoint = f'/v1/batch/scrape'
-        return await self._async_post_request(
-            f'{self.api_url}{endpoint}',
-            json_data,
+        # Add individual parameters
+        if formats is not None:
+            scrape_params['formats'] = formats
+        if headers is not None:
+            scrape_params['headers'] = headers
+        if include_tags is not None:
+            scrape_params['includeTags'] = include_tags
+        if exclude_tags is not None:
+            scrape_params['excludeTags'] = exclude_tags
+        if only_main_content is not None:
+            scrape_params['onlyMainContent'] = only_main_content
+        if wait_for is not None:
+            scrape_params['waitFor'] = wait_for
+        if timeout is not None:
+            scrape_params['timeout'] = timeout
+        if location is not None:
+            scrape_params['location'] = location.dict(exclude_none=True)
+        if mobile is not None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy is not None:
+            scrape_params['proxy'] = proxy
+        if extract is not None:
+            if hasattr(extract.schema, 'schema'):
+                extract.schema = extract.schema.schema()
+            scrape_params['extract'] = extract.dict(exclude_none=True)
+        if json_options is not None:
+            if hasattr(json_options.schema, 'schema'):
+                json_options.schema = json_options.schema.schema()
+            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+        if actions is not None:
+            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+        if agent is not None:
+            scrape_params['agent'] = agent.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        scrape_params.update(kwargs)
+
+        # Create final params object
+        final_params = ScrapeParams(**scrape_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['urls'] = urls
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
+        headers = self._prepare_headers(idempotency_key)
+        response = await self._async_post_request(
+            f'{self.api_url}/v1/batch/scrape',
+            params_dict,
             headers
         )
+        if response.status_code == 200:
+            try:
+                return BatchScrapeResponse(**response.json())
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, 'start batch scrape job')
+
     async def crawl_url(
-            self,
-            url: str,
-            params: Optional[CrawlParams] = None,
-            poll_interval: int = 2,
-            idempotency_key: Optional[str] = None) -> CrawlStatusResponse:
+        self,
+        url: str,
+        *,
+        include_paths: Optional[List[str]] = None,
+        exclude_paths: Optional[List[str]] = None,
+        max_depth: Optional[int] = None,
+        max_discovery_depth: Optional[int] = None,
+        limit: Optional[int] = None,
+        allow_backward_links: Optional[bool] = None,
+        allow_external_links: Optional[bool] = None,
+        ignore_sitemap: Optional[bool] = None,
+        scrape_options: Optional[CommonOptions] = None,
+        webhook: Optional[Union[str, WebhookConfig]] = None,
+        deduplicate_similar_urls: Optional[bool] = None,
+        ignore_query_parameters: Optional[bool] = None,
+        regex_on_full_url: Optional[bool] = None,
+        poll_interval: Optional[int] = 2,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> CrawlStatusResponse:
         """
-        Asynchronously crawl a website starting from a URL and monitor until completion.
-
-        Args:
-            url (str): Target URL to start crawling from
-            params (Optional[CrawlParams]): See CrawlParams model:
-                URL Discovery:
-                  * includePaths - Patterns of URLs to include
-                  * excludePaths - Patterns of URLs to exclude
-                  * maxDepth - Maximum crawl depth
-                  * maxDiscoveryDepth - Maximum depth for finding new URLs
-                  * limit - Maximum pages to crawl
-
-                Link Following:
-                  * allowBackwardLinks - Follow parent directory links
-                  * allowExternalLinks - Follow external domain links
-                  * ignoreSitemap - Skip sitemap.xml processing
-
-                Advanced:
-                  * scrapeOptions - Page scraping configuration
-                  * webhook - Notification webhook settings
-                  * deduplicateSimilarURLs - Remove similar URLs
-                  * ignoreQueryParameters - Ignore URL parameters
-                  * regexOnFullURL - Apply regex to full URLs
-            poll_interval (int): Seconds between status checks (default: 2)
-            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
-
-        Returns:
-            CrawlStatusResponse with:
-            * Crawling status and progress
-            * Crawled page contents
-            * Success/error information
-
-        Raises:
-            Exception: If crawl fails
-        """
-        headers = self._prepare_headers(idempotency_key)
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        json_data['origin'] = f"python-sdk@{version}"
-
-        endpoint = f'/v1/crawl'
-        response = await self._async_post_request(
-            f'{self.api_url}{endpoint}',
-            json_data,
-            headers
-        )
-
-        if response.get('success') and 'id' in response:
-            return await self._async_monitor_job_status(response['id'], headers, poll_interval)
-        else:
-            raise Exception(f'Failed to start crawl. Error: {response.get("error")}')
-
-    async def async_crawl_url(
-            self,
-            url: str,
-            params: Optional[CrawlParams] = None,
-            idempotency_key: Optional[str] = None) -> CrawlResponse:
-        """
-        Initiate an asynchronous crawl job without waiting for completion.
+        Crawl a website starting from a URL.
 
         Args:
             url (str): Target URL to start crawling from
-            params (Optional[CrawlParams]): See CrawlParams model:
-                URL Discovery:
-                  * includePaths - Patterns of URLs to include
-                  * excludePaths - Patterns of URLs to exclude
-                  * maxDepth - Maximum crawl depth
-                  * maxDiscoveryDepth - Maximum depth for finding new URLs
-                  * limit - Maximum pages to crawl
-
-                Link Following:
-                  * allowBackwardLinks - Follow parent directory links
-                  * allowExternalLinks - Follow external domain links
-                  * ignoreSitemap - Skip sitemap.xml processing
-
-                Advanced:
-                  * scrapeOptions - Page scraping configuration
-                  * webhook - Notification webhook settings
-                  * deduplicateSimilarURLs - Remove similar URLs
-                  * ignoreQueryParameters - Ignore URL parameters
-                  * regexOnFullURL - Apply regex to full URLs
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[CommonOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
 
         Returns:
-            CrawlResponse with:
-            * success - Whether job started successfully
-            * id - Unique identifier for the job
-            * url - Status check URL
-            * error - Error message if start failed
+            CrawlStatusResponse with:
+            * Crawling status and progress
+            * Crawled page contents
+            * Success/error information
 
         Raises:
-            Exception: If job initiation fails
+            Exception: If crawl fails
         """
-        headers = self._prepare_headers(idempotency_key)
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        json_data['origin'] = f"python-sdk@{version}"
+        crawl_params = {}
 
-        endpoint = f'/v1/crawl'
-        return await self._async_post_request(
-            f'{self.api_url}{endpoint}',
-            json_data,
-            headers
+        # Add individual parameters
+        if include_paths is not None:
+            crawl_params['includePaths'] = include_paths
+        if exclude_paths is not None:
+            crawl_params['excludePaths'] = exclude_paths
+        if max_depth is not None:
+            crawl_params['maxDepth'] = max_depth
+        if max_discovery_depth is not None:
+            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
+        if limit is not None:
+            crawl_params['limit'] = limit
+        if allow_backward_links is not None:
+            crawl_params['allowBackwardLinks'] = allow_backward_links
+        if allow_external_links is not None:
+            crawl_params['allowExternalLinks'] = allow_external_links
+        if ignore_sitemap is not None:
+            crawl_params['ignoreSitemap'] = ignore_sitemap
+        if scrape_options is not None:
+            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+        if webhook is not None:
+            crawl_params['webhook'] = webhook
+        if deduplicate_similar_urls is not None:
+            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
+        if ignore_query_parameters is not None:
+            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
+        if regex_on_full_url is not None:
+            crawl_params['regexOnFullURL'] = regex_on_full_url
+
+        # Add any additional kwargs
+        crawl_params.update(kwargs)
+
+        # Create final params object
+        final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
+        headers = self._prepare_headers(idempotency_key)
+        response = await self._async_post_request(
+            f'{self.api_url}/v1/crawl', params_dict, headers)
+
+        if response.status_code == 200:
+            try:
+                id = response.json().get('id')
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+            return await self._async_monitor_job_status(id, headers, poll_interval)
+        else:
+            self._handle_error(response, 'start crawl job')
+
+
+    async def async_crawl_url(
+        self,
+        url: str,
+        *,
+        include_paths: Optional[List[str]] = None,
+        exclude_paths: Optional[List[str]] = None,
+        max_depth: Optional[int] = None,
+        max_discovery_depth: Optional[int] = None,
+        limit: Optional[int] = None,
+        allow_backward_links: Optional[bool] = None,
+        allow_external_links: Optional[bool] = None,
+        ignore_sitemap: Optional[bool] = None,
+        scrape_options: Optional[CommonOptions] = None,
+        webhook: Optional[Union[str, WebhookConfig]] = None,
+        deduplicate_similar_urls: Optional[bool] = None,
+        ignore_query_parameters: Optional[bool] = None,
+        regex_on_full_url: Optional[bool] = None,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> CrawlResponse:
+        """
+        Start an asynchronous crawl job.
+
+        Args:
+            url (str): Target URL to start crawling from
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[CommonOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
+
+        Returns:
+            CrawlResponse with:
+            * success - Whether crawl started successfully
+            * id - Unique identifier for the crawl job
+            * url - Status check URL for the crawl
+            * error - Error message if start failed
+
+        Raises:
+            Exception: If crawl initiation fails
+        """
+        crawl_params = {}
+
+        # Add individual parameters
+        if include_paths is not None:
+            crawl_params['includePaths'] = include_paths
+        if exclude_paths is not None:
+            crawl_params['excludePaths'] = exclude_paths
+        if max_depth is not None:
+            crawl_params['maxDepth'] = max_depth
+        if max_discovery_depth is not None:
+            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
+        if limit is not None:
+            crawl_params['limit'] = limit
+        if allow_backward_links is not None:
+            crawl_params['allowBackwardLinks'] = allow_backward_links
+        if allow_external_links is not None:
+            crawl_params['allowExternalLinks'] = allow_external_links
+        if ignore_sitemap is not None:
+            crawl_params['ignoreSitemap'] = ignore_sitemap
+        if scrape_options is not None:
+            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+        if webhook is not None:
+            crawl_params['webhook'] = webhook
+        if deduplicate_similar_urls is not None:
+            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
+        if ignore_query_parameters is not None:
+            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
+        if regex_on_full_url is not None:
+            crawl_params['regexOnFullURL'] = regex_on_full_url
+
+        # Add any additional kwargs
+        crawl_params.update(kwargs)
+
+        # Create final params object
+        final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
+        headers = self._prepare_headers(idempotency_key)
+        response = await self._async_post_request(
+            f'{self.api_url}/v1/crawl',
+            params_dict,
+            headers
         )
+        if response.status_code == 200:
+            try:
+                return CrawlResponse(**response.json())
+            except:
+                raise Exception(f'Failed to parse Firecrawl response as JSON.')
+        else:
+            self._handle_error(response, 'start crawl job')
+
     async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
         """
         Check the status and results of an asynchronous crawl job.

From 9ba1ae9ae13c7b4892712504f9f3bb7b274fa173 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 18 Apr 2025 01:28:31 -0700
Subject: [PATCH 24/26] Nick:

---
 apps/api/src/routes/v1.ts              |  4 ++--
 apps/python-sdk/firecrawl/firecrawl.py | 32 ++++++++++++++++++++++----
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index 185a70de..271f2b17 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -278,14 +278,14 @@ v1Router.get(
 
 v1Router.post(
   "/deep-research",
-  authMiddleware(RateLimiterMode.Extract),
+  authMiddleware(RateLimiterMode.Crawl),
   checkCreditsMiddleware(1),
   wrap(deepResearchController),
 );
 
 v1Router.get(
   "/deep-research/:jobId",
-  authMiddleware(RateLimiterMode.ExtractStatus),
+  authMiddleware(RateLimiterMode.CrawlStatus),
   wrap(deepResearchStatusController),
 );
 
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 1442894e..76816a1d 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -1843,7 +1843,12 @@ class FirecrawlApp:
             __experimental_stream=experimental_stream
         )
 
-        response = self.async_generate_llms_text(url, params)
+        response = self.async_generate_llms_text(
+            url,
+            max_urls=max_urls,
+            show_full_text=show_full_text,
+            experimental_stream=experimental_stream
+        )
 
         if not response.get('success') or 'id' not in response:
             return response
@@ -2219,7 +2224,14 @@ class FirecrawlApp:
         research_params['__experimental_streamSteps'] = __experimental_stream_steps
         research_params = DeepResearchParams(**research_params)
 
-        response = self.async_deep_research(query, research_params)
+        response = self.async_deep_research(
+            query,
+            max_depth=max_depth,
+            time_limit=time_limit,
+            max_urls=max_urls,
+            analysis_prompt=analysis_prompt,
+            system_prompt=system_prompt
+        )
 
         if not response.get('success') or 'id' not in response:
             return response
@@ -3529,7 +3541,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
         if experimental_stream is not None:
             params['__experimental_stream'] = experimental_stream
 
-        response = await self.async_generate_llms_text(url, params)
+        response = await self.async_generate_llms_text(
+            url,
+            max_urls=max_urls,
+            show_full_text=show_full_text,
+            experimental_stream=experimental_stream
+        )
 
         if not response.get('success') or 'id' not in response:
             return response
@@ -3678,7 +3695,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
         research_params['__experimental_streamSteps'] = __experimental_stream_steps
         research_params = DeepResearchParams(**research_params)
 
-        response = await self.async_deep_research(query, research_params)
+        response = await self.async_deep_research(
+            query,
+            max_depth=max_depth,
+            time_limit=time_limit,
+            max_urls=max_urls,
+            analysis_prompt=analysis_prompt,
+            system_prompt=system_prompt
+        )
 
         if not response.get('success') or 'id' not in response:
             return response

From 9e67d7ba22ba9956d3e02d629d203d58296abbe4 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 18 Apr 2025 01:30:40 -0700
Subject: [PATCH 25/26] Nick:

---
 apps/python-sdk/pyproject.toml | 3 ++-
 apps/python-sdk/setup.py       | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/apps/python-sdk/pyproject.toml b/apps/python-sdk/pyproject.toml
index 5a87d8c5..0483c31c 100644
--- a/apps/python-sdk/pyproject.toml
+++ b/apps/python-sdk/pyproject.toml
@@ -13,7 +13,8 @@ dependencies = [
     "python-dotenv",
     "websockets",
     "nest-asyncio",
-    "pydantic>=2.10.3",
+    "pydantic",
+    "aiohttp"
 ]
 authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
 maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py
index 8a67d1fd..1fb31664 100644
--- a/apps/python-sdk/setup.py
+++ b/apps/python-sdk/setup.py
@@ -32,7 +32,9 @@ setup(
         'python-dotenv',
         'websockets',
         'asyncio',
-        'nest-asyncio'
+        'nest-asyncio',
+        'pydantic',
+        'aiohttp'
     ],
     python_requires=">=3.8",
     classifiers=[

From 06c54bc41cb97590be44221ec82c74fe8d3c2bda Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Fri, 18 Apr 2025 01:43:18 -0700
Subject: [PATCH 26/26] Update __init__.py

---
 apps/python-sdk/firecrawl/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index 10431768..eea9ba54 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -11,9 +11,9 @@ For more information visit https://github.com/firecrawl/
 import logging
 import os
 
-from .firecrawl import FirecrawlApp # noqa
+from .firecrawl import FirecrawlApp, ExtractConfig # noqa
 
-__version__ = "2.0.0"
+__version__ = "2.0.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
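
For reference, a minimal usage sketch of the keyword-argument API introduced by this series. It is illustrative only and not part of any patch above: it assumes AsyncFirecrawlApp can be imported from firecrawl.firecrawl and is constructed with api_key, and the API key and URLs are placeholders; the parameter and field names follow the signatures and response models shown in the diffs.

# Illustrative sketch only -- not part of the patches above.
# Assumptions: AsyncFirecrawlApp is importable from firecrawl.firecrawl and its
# constructor accepts api_key; the key and URLs below are placeholders.
import asyncio

from firecrawl.firecrawl import AsyncFirecrawlApp


async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")

    # crawl_url waits for the crawl to finish and returns a CrawlStatusResponse.
    status = await app.crawl_url(
        "https://example.com",
        limit=5,
        max_depth=2,
        poll_interval=2,
    )
    print(status.status, f"{status.completed}/{status.total}")

    # async_crawl_url only starts the job and returns a CrawlResponse with an id.
    job = await app.async_crawl_url("https://example.com", limit=5)
    if job.success and job.id:
        print("crawl started:", job.id)

    # async_batch_scrape_urls starts a batch scrape job for several URLs.
    batch = await app.async_batch_scrape_urls(
        ["https://example.com/a", "https://example.com/b"],
        formats=["markdown"],
        only_main_content=True,
    )
    print("batch scrape started:", batch.id)


asyncio.run(main())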