From 6a5a4e5b6f435e01d13a42a65bb7bacdaaec9ef0 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Thu, 13 Mar 2025 11:21:35 -0300 Subject: [PATCH] improv/types-and-comments-descs --- apps/python-sdk/firecrawl/firecrawl.py | 852 +++++++++++++++++++------ 1 file changed, 674 insertions(+), 178 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d79b174c..d212dea7 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -12,8 +12,9 @@ Classes: import logging import os import time -from typing import Any, Dict, Optional, List, Union, Callable +from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic import json +from datetime import datetime import requests import pydantic @@ -21,6 +22,212 @@ import websockets logger : logging.Logger = logging.getLogger("firecrawl") +T = TypeVar('T') + +class FirecrawlDocumentMetadata(pydantic.BaseModel): + """Metadata for a Firecrawl document.""" + title: Optional[str] = None + description: Optional[str] = None + language: Optional[str] = None + keywords: Optional[str] = None + robots: Optional[str] = None + ogTitle: Optional[str] = None + ogDescription: Optional[str] = None + ogUrl: Optional[str] = None + ogImage: Optional[str] = None + ogAudio: Optional[str] = None + ogDeterminer: Optional[str] = None + ogLocale: Optional[str] = None + ogLocaleAlternate: Optional[List[str]] = None + ogSiteName: Optional[str] = None + ogVideo: Optional[str] = None + dctermsCreated: Optional[str] = None + dcDateCreated: Optional[str] = None + dcDate: Optional[str] = None + dctermsType: Optional[str] = None + dcType: Optional[str] = None + dctermsAudience: Optional[str] = None + dctermsSubject: Optional[str] = None + dcSubject: Optional[str] = None + dcDescription: Optional[str] = None + dctermsKeywords: Optional[str] = None + modifiedTime: Optional[str] = None + publishedTime: Optional[str] = None + articleTag: Optional[str] = None + articleSection: Optional[str] = None + sourceURL: Optional[str] = None + statusCode: Optional[int] = None + error: Optional[str] = None + +class ActionsResult(pydantic.BaseModel): + """Result of actions performed during scraping.""" + screenshots: List[str] + +class FirecrawlDocument(pydantic.BaseModel, Generic[T]): + """Document retrieved or processed by Firecrawl.""" + url: Optional[str] = None + markdown: Optional[str] = None + html: Optional[str] = None + rawHtml: Optional[str] = None + links: Optional[List[str]] = None + extract: Optional[T] = None + json: Optional[T] = None + screenshot: Optional[str] = None + metadata: Optional[FirecrawlDocumentMetadata] = None + actions: Optional[ActionsResult] = None + title: Optional[str] = None # v1 search only + description: Optional[str] = None # v1 search only + +class LocationConfig(pydantic.BaseModel): + """Location configuration for scraping.""" + country: Optional[str] = None + languages: Optional[List[str]] = None + +class WebhookConfig(pydantic.BaseModel): + """Configuration for webhooks.""" + url: str + headers: Optional[Dict[str, str]] = None + metadata: Optional[Dict[str, str]] = None + events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None + +class CrawlScrapeOptions(pydantic.BaseModel): + """Parameters for scraping operations.""" + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None + 
headers: Optional[Dict[str, str]] = None + includeTags: Optional[List[str]] = None + excludeTags: Optional[List[str]] = None + onlyMainContent: Optional[bool] = None + waitFor: Optional[int] = None + timeout: Optional[int] = None + location: Optional[LocationConfig] = None + mobile: Optional[bool] = None + skipTlsVerification: Optional[bool] = None + removeBase64Images: Optional[bool] = None + blockAds: Optional[bool] = None + proxy: Optional[Literal["basic", "stealth"]] = None + +class Action(pydantic.BaseModel): + """Action to perform during scraping.""" + type: Literal["wait", "click", "screenshot", "write", "press", "scroll", "scrape", "executeJavascript"] + milliseconds: Optional[int] = None + selector: Optional[str] = None + fullPage: Optional[bool] = None + text: Optional[str] = None + key: Optional[str] = None + direction: Optional[Literal["up", "down"]] = None + script: Optional[str] = None + +class ExtractConfig(pydantic.BaseModel): + """Configuration for extraction.""" + prompt: Optional[str] = None + schema: Optional[Any] = None + systemPrompt: Optional[str] = None + +class ScrapeParams(CrawlScrapeOptions): + """Parameters for scraping operations.""" + extract: Optional[ExtractConfig] = None + jsonOptions: Optional[ExtractConfig] = None + actions: Optional[List[Action]] = None + +class ScrapeResponse(FirecrawlDocument[T], Generic[T]): + """Response from scraping operations.""" + success: bool = True + warning: Optional[str] = None + error: Optional[str] = None + +class BatchScrapeResponse(pydantic.BaseModel): + """Response from batch scrape operations.""" + id: Optional[str] = None + url: Optional[str] = None + success: bool = True + error: Optional[str] = None + invalidURLs: Optional[List[str]] = None + +class BatchScrapeStatusResponse(pydantic.BaseModel): + """Response from batch scrape status checks.""" + success: bool = True + status: Literal["scraping", "completed", "failed", "cancelled"] + completed: int + total: int + creditsUsed: int + expiresAt: datetime + next: Optional[str] = None + data: List[FirecrawlDocument] + +class CrawlParams(pydantic.BaseModel): + """Parameters for crawling operations.""" + includePaths: Optional[List[str]] = None + excludePaths: Optional[List[str]] = None + maxDepth: Optional[int] = None + maxDiscoveryDepth: Optional[int] = None + limit: Optional[int] = None + allowBackwardLinks: Optional[bool] = None + allowExternalLinks: Optional[bool] = None + ignoreSitemap: Optional[bool] = None + scrapeOptions: Optional[CrawlScrapeOptions] = None + webhook: Optional[Union[str, WebhookConfig]] = None + deduplicateSimilarURLs: Optional[bool] = None + ignoreQueryParameters: Optional[bool] = None + regexOnFullURL: Optional[bool] = None + +class CrawlResponse(pydantic.BaseModel): + """Response from crawling operations.""" + id: Optional[str] = None + url: Optional[str] = None + success: bool = True + error: Optional[str] = None + +class CrawlStatusResponse(pydantic.BaseModel): + """Response from crawl status checks.""" + success: bool = True + status: Literal["scraping", "completed", "failed", "cancelled"] + completed: int + total: int + creditsUsed: int + expiresAt: datetime + next: Optional[str] = None + data: List[FirecrawlDocument] + +class CrawlErrorsResponse(pydantic.BaseModel): + """Response from crawl/batch scrape error monitoring.""" + errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str} + robotsBlocked: List[str] + +class MapParams(pydantic.BaseModel): + """Parameters for mapping operations.""" + search: Optional[str] = 
None + ignoreSitemap: Optional[bool] = None + includeSubdomains: Optional[bool] = None + sitemapOnly: Optional[bool] = None + limit: Optional[int] = None + timeout: Optional[int] = None + +class MapResponse(pydantic.BaseModel): + """Response from mapping operations.""" + success: bool = True + links: Optional[List[str]] = None + error: Optional[str] = None + +class ExtractParams(pydantic.BaseModel): + """Parameters for extracting information from URLs.""" + prompt: Optional[str] = None + schema: Optional[Any] = None + systemPrompt: Optional[str] = None + allowExternalLinks: Optional[bool] = None + enableWebSearch: Optional[bool] = None + includeSubdomains: Optional[bool] = None + origin: Optional[str] = None + showSources: Optional[bool] = None + scrapeOptions: Optional[CrawlScrapeOptions] = None + +class ExtractResponse(pydantic.BaseModel, Generic[T]): + """Response from extract operations.""" + success: bool = True + data: Optional[T] = None + error: Optional[str] = None + warning: Optional[str] = None + sources: Optional[List[str]] = None + class SearchParams(pydantic.BaseModel): query: str limit: Optional[int] = 5 @@ -33,6 +240,13 @@ class SearchParams(pydantic.BaseModel): timeout: Optional[int] = 60000 scrapeOptions: Optional[Dict[str, Any]] = None +class SearchResponse(pydantic.BaseModel): + """Response from search operations.""" + success: bool = True + data: List[FirecrawlDocument] + warning: Optional[str] = None + error: Optional[str] = None + class GenerateLLMsTextParams(pydantic.BaseModel): """ Parameters for the LLMs.txt generation operation. @@ -73,40 +287,21 @@ class DeepResearchStatusResponse(pydantic.BaseModel): sources: List[Dict[str, Any]] summaries: List[str] +class GenerateLLMsTextResponse(pydantic.BaseModel): + """Response from LLMs.txt generation operations.""" + success: bool = True + id: str + error: Optional[str] = None + +class GenerateLLMsTextStatusResponse(pydantic.BaseModel): + """Status response from LLMs.txt generation operations.""" + success: bool = True + data: Optional[Dict[str, str]] = None # {llmstxt: str, llmsfulltxt?: str} + status: Literal["processing", "completed", "failed"] + error: Optional[str] = None + expiresAt: str + class FirecrawlApp: - class SearchResponse(pydantic.BaseModel): - """ - Response from the search operation. - """ - success: bool - data: List[Dict[str, Any]] - warning: Optional[str] = None - error: Optional[str] = None - - class ExtractParams(pydantic.BaseModel): - """ - Parameters for the extract operation. - """ - prompt: Optional[str] = None - schema_: Optional[Any] = pydantic.Field(None, alias='schema') - system_prompt: Optional[str] = None - allow_external_links: Optional[bool] = False - enable_web_search: Optional[bool] = False - # Just for backwards compatibility - enableWebSearch: Optional[bool] = False - show_sources: Optional[bool] = False - - - - - class ExtractResponse(pydantic.BaseModel): - """ - Response from the extract operation. - """ - success: bool - data: Optional[Any] = None - error: Optional[str] = None - def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: """ Initialize the FirecrawlApp instance with API key, API URL. @@ -125,19 +320,42 @@ class FirecrawlApp: logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}") - def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> ScrapeResponse[Any]: """ - Scrape the specified URL using the Firecrawl API. 
+ Scrape and extract content from a URL. Args: - url (str): The URL to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scrape request. + url (str): Target URL to scrape + + params (Optional[Dict[str, Any]]): See ScrapeParams model for configuration: + + Content Options: + * formats - Content types to retrieve (markdown/html/etc) + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type (basic/stealth) + + Extraction Options: + * extract - Content extraction settings + * jsonOptions - JSON extraction settings + * actions - Actions to perform Returns: - Any: The scraped data if the request is successful. + ScrapeResponse with: + + * Requested content formats + * Page metadata + * Extraction results + * Success/error status Raises: - Exception: If the scrape request fails. + Exception: If scraping fails """ headers = self._prepare_headers() @@ -193,16 +411,35 @@ class FirecrawlApp: else: self._handle_error(response, 'scrape URL') - def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]: + def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse: """ - Search for content using the Firecrawl API. + Search for content using Firecrawl. Args: - query (str): The search query string. - params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters. + query (str): Search query string + + params (Optional[Union[Dict[str, Any], SearchParams]]): See SearchParams model: + + Search Options: + * limit - Max results (default: 5) + * tbs - Time filter (e.g. "qdr:d") + * filter - Custom result filter + + Localization: + * lang - Language code (default: "en") + * country - Country code (default: "us") + * location - Geo-targeting + + Request Options: + * timeout - Request timeout (ms) + * scrapeOptions - Result scraping config, check ScrapeParams model for more details Returns: - Dict[str, Any]: The search response containing success status and search results. + SearchResponse + + + Raises: + Exception: If search fails """ if params is None: params = {} @@ -230,28 +467,46 @@ class FirecrawlApp: def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, poll_interval: Optional[int] = 2, - idempotency_key: Optional[str] = None) -> Any: + idempotency_key: Optional[str] = None) -> CrawlStatusResponse: """ - Initiate a crawl job for the specified URL using the Firecrawl API. + Crawl a website starting from a URL. Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. 
+ url (str): Target URL to start crawling from + + params (Optional[Dict[str, Any]]): See CrawlParams model for configuration: + + URL Discovery: + * includePaths - Patterns of URLs to include + * excludePaths - Patterns of URLs to exclude + * maxDepth - Maximum crawl depth + * maxDiscoveryDepth - Maximum depth for finding new URLs + * limit - Maximum pages to crawl + + Link Following: + * allowBackwardLinks - Follow parent directory links + * allowExternalLinks - Follow external domain links + * ignoreSitemap - Skip sitemap.xml processing + + Advanced: + * scrapeOptions - Page scraping configuration + * webhook - Notification webhook settings + * deduplicateSimilarURLs - Remove similar URLs + * ignoreQueryParameters - Ignore URL parameters + * regexOnFullURL - Apply regex to full URLs + + poll_interval: Seconds between status checks (default: 2) + + idempotency_key: Request deduplication key Returns: - Dict[str, Any]: A dictionary containing the crawl results. The structure includes: - - 'success' (bool): Indicates if the crawl was successful. - - 'status' (str): The final status of the crawl job (e.g., 'completed'). - - 'completed' (int): Number of scraped pages that completed. - - 'total' (int): Total number of scraped pages. - - 'creditsUsed' (int): Estimated number of API credits used for this crawl. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires. - - 'data' (List[Dict]): List of all the scraped pages. + CrawlStatusResponse with: + * Crawling status and progress + * Crawled page contents + * Success/error information Raises: - Exception: If the crawl job initiation or monitoring fails. + Exception: If crawl fails """ endpoint = f'/v1/crawl' headers = self._prepare_headers(idempotency_key) @@ -270,20 +525,45 @@ class FirecrawlApp: self._handle_error(response, 'start crawl job') - def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> CrawlResponse: """ - Initiate a crawl job asynchronously. + Start an asynchronous crawl job. Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + url (str): Target URL to start crawling from + + params (Optional[Dict[str, Any]]): See CrawlParams model: + + URL Discovery: + * includePaths - Patterns of URLs to include + * excludePaths - Patterns of URLs to exclude + * maxDepth - Maximum crawl depth + * maxDiscoveryDepth - Maximum depth for finding new URLs + * limit - Maximum pages to crawl + + Link Following: + * allowBackwardLinks - Follow parent directory links + * allowExternalLinks - Follow external domain links + * ignoreSitemap - Skip sitemap.xml processing + + Advanced: + * scrapeOptions - Page scraping configuration + * webhook - Notification webhook settings + * deduplicateSimilarURLs - Remove similar URLs + * ignoreQueryParameters - Ignore URL parameters + * regexOnFullURL - Apply regex to full URLs + + idempotency_key: Unique key to prevent duplicate requests Returns: - Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes: - - 'success' (bool): Indicates if the crawl initiation was successful. - - 'id' (str): The unique identifier for the crawl job. 
- - 'url' (str): The URL to check the status of the crawl job. + CrawlResponse with: + * success - Whether crawl started successfully + * id - Unique identifier for the crawl job + * url - Status check URL for the crawl + * error - Error message if start failed + + Raises: + Exception: If crawl initiation fails """ endpoint = f'/v1/crawl' headers = self._prepare_headers(idempotency_key) @@ -299,18 +579,31 @@ class FirecrawlApp: else: self._handle_error(response, 'start crawl job') - def check_crawl_status(self, id: str) -> Any: + def check_crawl_status(self, id: str) -> CrawlStatusResponse: """ - Check the status of a crawl job using the Firecrawl API. + Check the status and results of a crawl job. Args: - id (str): The ID of the crawl job. + id: Unique identifier for the crawl job Returns: - Any: The status of the crawl job. + CrawlStatusResponse containing: + + Status Information: + * status - Current state (scraping/completed/failed/cancelled) + * completed - Number of pages crawled + * total - Total pages to crawl + * creditsUsed - API credits consumed + * expiresAt - Data expiration timestamp + + Results: + * data - List of crawled documents + * next - URL for next page of results (if paginated) + * success - Whether status check succeeded + * error - Error message if failed Raises: - Exception: If the status check request fails. + Exception: If status check fails """ endpoint = f'/v1/crawl/{id}' @@ -369,7 +662,7 @@ class FirecrawlApp: else: self._handle_error(response, 'check crawl status') - def check_crawl_errors(self, id: str) -> Dict[str, Any]: + def check_crawl_errors(self, id: str) -> CrawlErrorsResponse: """ Returns information about crawl errors. @@ -427,16 +720,32 @@ class FirecrawlApp: else: raise Exception("Crawl job failed to start") - def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> MapResponse: """ - Perform a map search using the Firecrawl API. + Map and discover links from a URL. Args: - url (str): The URL to perform the map search on. - params (Optional[Dict[str, Any]]): Additional parameters for the map search. + url: Target URL to map + + params: See MapParams model: + + Discovery Options: + * search - Filter pattern for URLs + * ignoreSitemap - Skip sitemap.xml + * includeSubdomains - Include subdomain links + * sitemapOnly - Only use sitemap.xml + + Limits: + * limit - Max URLs to return + * timeout - Request timeout (ms) Returns: - List[str]: A list of URLs discovered during the map search. + MapResponse with: + * Discovered URLs + * Success/error status + + Raises: + Exception: If mapping fails """ endpoint = f'/v1/map' headers = self._prepare_headers() @@ -469,28 +778,44 @@ class FirecrawlApp: def batch_scrape_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None, poll_interval: Optional[int] = 2, - idempotency_key: Optional[str] = None) -> Any: + idempotency_key: Optional[str] = None) -> BatchScrapeStatusResponse: """ - Initiate a batch scrape job for the specified URLs using the Firecrawl API. + Batch scrape multiple URLs and monitor until completion. Args: - urls (List[str]): The URLs to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scraper. - poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. 
+ urls: URLs to scrape + + params: See ScrapeParams model: + + Content Options: + * formats - Content formats to retrieve + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type + + Extraction Options: + * extract - Content extraction config + * jsonOptions - JSON extraction config + * actions - Actions to perform + + poll_interval: Seconds between status checks (default: 2) + + idempotency_key: Request deduplication key Returns: - Dict[str, Any]: A dictionary containing the scrape results. The structure includes: - - 'success' (bool): Indicates if the batch scrape was successful. - - 'status' (str): The final status of the batch scrape job (e.g., 'completed'). - - 'completed' (int): Number of scraped pages that completed. - - 'total' (int): Total number of scraped pages. - - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires. - - 'data' (List[Dict]): List of all the scraped pages. + BatchScrapeStatusResponse with: + * Scraping status and progress + * Scraped content for each URL + * Success/error information Raises: - Exception: If the batch scrape job initiation or monitoring fails. + Exception: If batch scrape fails """ endpoint = f'/v1/batch/scrape' headers = self._prepare_headers(idempotency_key) @@ -509,9 +834,13 @@ class FirecrawlApp: self._handle_error(response, 'start batch scrape job') - def async_batch_scrape_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_batch_scrape_urls( + self, + urls: List[str], + params: Optional[Dict[str, Any]] = None, + idempotency_key: Optional[str] = None) -> BatchScrapeResponse: """ - Initiate a crawl job asynchronously. + Initiate a batch scrape job asynchronously. Args: urls (List[str]): The URLs to scrape. @@ -519,7 +848,7 @@ class FirecrawlApp: idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. Returns: - Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes: + BatchScrapeResponse: A dictionary containing the batch scrape initiation response. The structure includes: - 'success' (bool): Indicates if the batch scrape initiation was successful. - 'id' (str): The unique identifier for the batch scrape job. - 'url' (str): The URL to check the status of the batch scrape job. @@ -538,13 +867,17 @@ class FirecrawlApp: else: self._handle_error(response, 'start batch scrape job') - def batch_scrape_urls_and_watch(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + def batch_scrape_urls_and_watch( + self, + urls: List[str], + params: Optional[ScrapeParams] = None, + idempotency_key: Optional[str] = None) -> 'CrawlWatcher': """ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket. Args: urls (List[str]): The URLs to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scraper. + params (Optional[ScrapeParams]): Additional parameters for the scraper. idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. 
Returns: @@ -556,7 +889,7 @@ class FirecrawlApp: else: raise Exception("Batch scrape job failed to start") - def check_batch_scrape_status(self, id: str) -> Any: + def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse: """ Check the status of a batch scrape job using the Firecrawl API. @@ -564,7 +897,7 @@ class FirecrawlApp: id (str): The ID of the batch scrape job. Returns: - Any: The status of the batch scrape job. + BatchScrapeStatusResponse: The status of the batch scrape job. Raises: Exception: If the status check request fails. @@ -626,7 +959,7 @@ class FirecrawlApp: else: self._handle_error(response, 'check batch scrape status') - def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]: + def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse: """ Returns information about batch scrape errors. @@ -634,7 +967,13 @@ class FirecrawlApp: id (str): The ID of the crawl job. Returns: - Dict[str, Any]: Information about crawl errors. + CrawlErrorsResponse: A response containing: + - errors (List[Dict[str, str]]): List of errors with fields: + - id (str): Error ID + - timestamp (str): When the error occurred + - url (str): URL that caused the error + - error (str): Error message + - robotsBlocked (List[str]): List of URLs blocked by robots.txt """ headers = self._prepare_headers() response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers) @@ -646,16 +985,40 @@ class FirecrawlApp: else: self._handle_error(response, "check batch scrape errors") - def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any: + def extract( + self, + urls: List[str], + params: Optional[ExtractParams] = None) -> ExtractResponse[Any]: """ - Extracts information from a URL using the Firecrawl API. + Extract structured information from URLs. Args: - urls (List[str]): The URLs to extract information from. - params (Optional[ExtractParams]): Additional parameters for the extract request. + urls: URLs to extract from + + params: See ExtractParams model: + + Extraction Config: + * prompt - Custom extraction prompt + * schema - JSON schema/Pydantic model + * systemPrompt - System context + + Behavior Options: + * allowExternalLinks - Follow external links + * enableWebSearch - Enable web search + * includeSubdomains - Include subdomains + * showSources - Include source URLs + + Scraping Options: + * scrapeOptions - Page scraping config Returns: - Union[ExtractResponse, ErrorResponse]: The response from the extract operation. + ExtractResponse with: + * Structured data matching schema + * Source information if requested + * Success/error status + + Raises: + ValueError: If prompt/schema missing or extraction fails """ headers = self._prepare_headers() @@ -715,10 +1078,7 @@ class FirecrawlApp: except: raise Exception(f'Failed to parse Firecrawl response as JSON.') if status_data['status'] == 'completed': - if status_data['success']: - return status_data - else: - raise Exception(f'Failed to extract. Error: {status_data["error"]}') + return status_data elif status_data['status'] in ['failed', 'cancelled']: raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}') else: @@ -734,7 +1094,7 @@ class FirecrawlApp: return {'success': False, 'error': "Internal server error."} - def get_extract_status(self, job_id: str) -> Dict[str, Any]: + def get_extract_status(self, job_id: str) -> ExtractResponse[Any]: """ Retrieve the status of an extract job. 
@@ -742,7 +1102,7 @@ class FirecrawlApp: job_id (str): The ID of the extract job. Returns: - Dict[str, Any]: The status of the extract job. + ExtractResponse[Any]: The status of the extract job. Raises: ValueError: If there is an error retrieving the status. @@ -760,20 +1120,32 @@ class FirecrawlApp: except Exception as e: raise ValueError(str(e), 500) - def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: """ Initiate an asynchronous extract job. Args: - urls (List[str]): The URLs to extract data from. - params (Optional[Dict[str, Any]]): Additional parameters for the extract request. - idempotency_key (Optional[str]): A unique key to ensure idempotency of requests. + urls (List[str]): List of URLs to extract information from. Must be valid HTTP/HTTPS URLs. + params (Optional[Dict[str, Any]]): Extraction configuration parameters: + - prompt (str, optional): Custom prompt for extraction + - schema (Any, optional): JSON schema or Pydantic model for structured extraction + - systemPrompt (str, optional): System prompt for extraction + - allowExternalLinks (bool, optional): Allow following external links + - enableWebSearch (bool, optional): Enable web search during extraction + - includeSubdomains (bool, optional): Include content from subdomains + - origin (str, optional): Source of the extraction request + - showSources (bool, optional): Include source URLs in response + - scrapeOptions (CrawlScrapeOptions, optional): Configuration for scraping pages + idempotency_key (Optional[str]): Unique identifier to prevent duplicate requests. Returns: - Dict[str, Any]: The response from the extract operation. + ExtractResponse[Any]: A response containing: + - success (bool): Whether the extraction initiation was successful + - id (str): The unique identifier for the extract job + - error (str, optional): Error message if initiation failed Raises: - ValueError: If there is an error initiating the extract job. + ValueError: If neither prompt nor schema is provided, or if there is an error during initiation. """ headers = self._prepare_headers(idempotency_key) @@ -804,24 +1176,32 @@ class FirecrawlApp: except Exception as e: raise ValueError(str(e), 500) - def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]: + def generate_llms_text( + self, + url: str, + params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse: """ Generate LLMs.txt for a given URL and poll until completion. Args: - url (str): The URL to generate LLMs.txt from. - params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation. + url: Target URL to generate LLMs.txt from + + params: See GenerateLLMsTextParams model: + + Generation Options: + * maxUrls - Maximum URLs to process (default: 10) + * showFullText - Include full text in output (default: False) + * __experimental_stream - Enable streaming of generation progress Returns: - Dict[str, Any]: A dictionary containing the generation results. The structure includes: - - 'success' (bool): Indicates if the generation was successful. - - 'status' (str): The final status of the generation job. - - 'data' (Dict): The generated LLMs.txt data. 
- - 'error' (Optional[str]): Error message if the generation failed. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires. + GenerateLLMsTextStatusResponse with: + * Generated LLMs.txt content + * Full version if requested + * Generation status + * Success/error information Raises: - Exception: If the generation job fails or an error occurs during status checks. + Exception: If generation fails """ if params is None: params = {} @@ -850,18 +1230,25 @@ class FirecrawlApp: return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'} - def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]: + def async_generate_llms_text( + self, + url: str, + params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse: """ Initiate an asynchronous LLMs.txt generation operation. Args: - url (str): The URL to generate LLMs.txt from. - params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation. + url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL. + params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Generation configuration parameters: + - maxUrls (int, optional): Maximum number of URLs to process (default: 10) + - showFullText (bool, optional): Include full text in output (default: False) + - __experimental_stream (bool, optional): Enable streaming of generation progress Returns: - Dict[str, Any]: A dictionary containing the generation initiation response. The structure includes: - - 'success' (bool): Indicates if the generation initiation was successful. - - 'id' (str): The unique identifier for the generation job. + GenerateLLMsTextResponse: A response containing: + - success (bool): Whether the generation initiation was successful + - id (str): The unique identifier for the generation job + - error (str, optional): Error message if initiation failed Raises: Exception: If the generation job initiation fails. @@ -891,15 +1278,22 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} - def check_generate_llms_text_status(self, id: str) -> Dict[str, Any]: + def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse: """ Check the status of a LLMs.txt generation operation. Args: - id (str): The ID of the LLMs.txt generation operation. + id (str): The unique identifier of the LLMs.txt generation job to check status for. Returns: - Dict[str, Any]: The current status and results of the generation operation. + GenerateLLMsTextStatusResponse: A response containing: + - success (bool): Whether the generation was successful + - status (str): Status of generation ("processing", "completed", "failed") + - data (Dict[str, str], optional): Generated text with fields: + - llmstxt (str): Generated LLMs.txt content + - llmsfulltxt (str, optional): Full version if requested + - error (str, optional): Error message if generation failed + - expiresAt (str): When the generated data expires Raises: Exception: If the status check fails. @@ -921,7 +1315,9 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} - def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: + def _prepare_headers( + self, + idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. 
@@ -943,11 +1339,13 @@ class FirecrawlApp: 'Authorization': f'Bearer {self.api_key}', } - def _post_request(self, url: str, - data: Dict[str, Any], - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> requests.Response: + def _post_request( + self, + url: str, + data: Dict[str, Any], + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> requests.Response: """ Make a POST request with retries. @@ -972,10 +1370,12 @@ class FirecrawlApp: return response return response - def _get_request(self, url: str, - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> requests.Response: + def _get_request( + self, + url: str, + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> requests.Response: """ Make a GET request with retries. @@ -999,10 +1399,12 @@ class FirecrawlApp: return response return response - def _delete_request(self, url: str, - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> requests.Response: + def _delete_request( + self, + url: str, + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> requests.Response: """ Make a DELETE request with retries. @@ -1026,16 +1428,21 @@ class FirecrawlApp: return response return response - def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any: + def _monitor_job_status( + self, + id: str, + headers: Dict[str, str], + poll_interval: int) -> CrawlStatusResponse: """ Monitor the status of a crawl job until completion. Args: id (str): The ID of the crawl job. headers (Dict[str, str]): The headers to include in the status check requests. - poll_interval (int): Secounds between status checks. + poll_interval (int): Seconds between status checks. + Returns: - Any: The crawl results if the job is completed successfully. + CrawlStatusResponse: The crawl results if the job is completed successfully. Raises: Exception: If the job fails or an error occurs during status checks. @@ -1073,7 +1480,10 @@ class FirecrawlApp: else: self._handle_error(status_response, 'check crawl status') - def _handle_error(self, response: requests.Response, action: str) -> None: + def _handle_error( + self, + response: requests.Response, + action: str) -> None: """ Handle errors from API responses. @@ -1105,22 +1515,47 @@ class FirecrawlApp: # Raise an HTTPError with the custom message and attach the response raise requests.exceptions.HTTPError(message, response=response) - def deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None, - on_activity: Optional[Callable[[Dict[str, Any]], None]] = None, - on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> Dict[str, Any]: + def deep_research( + self, + query: str, + params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None, + on_activity: Optional[Callable[[Dict[str, Any]], None]] = None, + on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse: """ Initiates a deep research operation on a given query and polls until completion. Args: - query (str): The query to research. - params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation. - on_activity (Optional[Callable[[Dict[str, Any]], None]]): Optional callback to receive activity updates in real-time. 
+ query: Research query or topic to investigate + + params: See DeepResearchParams model: + Research Settings: + * maxDepth - Maximum research depth (default: 7) + * timeLimit - Time limit in seconds (default: 270) + * maxUrls - Maximum URLs to process (default: 20) + + Callbacks: + * on_activity - Progress callback receiving: + {type, status, message, timestamp, depth} + * on_source - Source discovery callback receiving: + {url, title, description} Returns: - Dict[str, Any]: The final research results. + DeepResearchResponse containing: + + Status: + * success - Whether research completed successfully + * status - Current state (processing/completed/failed) + * error - Error message if failed + + Results: + * id - Unique identifier for the research job + * data - Research findings and analysis + * sources - List of discovered sources + * activities - Research progress log + * summaries - Generated research summaries Raises: - Exception: If the research operation fails. + Exception: If research fails """ if params is None: params = {} @@ -1164,16 +1599,26 @@ class FirecrawlApp: return {'success': False, 'error': 'Deep research job terminated unexpectedly'} - def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]: + def async_deep_research( + self, + query: str, + params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> DeepResearchResponse: """ Initiates an asynchronous deep research operation. Args: - query (str): The query to research. - params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation. + query (str): The research query to investigate. Should be a clear, specific question or topic. + params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Research configuration parameters: + - maxDepth (int, optional): Maximum depth of research exploration (default: 7) + - timeLimit (int, optional): Time limit in seconds for research (default: 270) + - maxUrls (int, optional): Maximum number of URLs to process (default: 20) + - __experimental_streamSteps (bool, optional): Enable streaming of research steps Returns: - Dict[str, Any]: The response from the deep research initiation. + DeepResearchResponse: A response containing: + - success (bool): Whether the research initiation was successful + - id (str): The unique identifier for the research job + - error (str, optional): Error message if initiation failed Raises: Exception: If the research initiation fails. @@ -1203,7 +1648,7 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} - def check_deep_research_status(self, id: str) -> Dict[str, Any]: + def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse: """ Check the status of a deep research operation. @@ -1211,7 +1656,19 @@ class FirecrawlApp: id (str): The ID of the deep research operation. Returns: - Dict[str, Any]: The current status and results of the research operation. + DeepResearchResponse containing: + + Status: + * success - Whether research completed successfully + * status - Current state (processing/completed/failed) + * error - Error message if failed + + Results: + * id - Unique identifier for the research job + * data - Research findings and analysis + * sources - List of discovered sources + * activities - Research progress log + * summaries - Generated research summaries Raises: Exception: If the status check fails. 
@@ -1232,8 +1689,18 @@ class FirecrawlApp: raise ValueError(str(e)) return {'success': False, 'error': 'Internal server error'} - class CrawlWatcher: + """ + A class to watch and handle crawl job events via WebSocket connection. + + Attributes: + id (str): The ID of the crawl job to watch + app (FirecrawlApp): The FirecrawlApp instance + data (List[Dict[str, Any]]): List of crawled documents/data + status (str): Current status of the crawl job + ws_url (str): WebSocket URL for the crawl job + event_handlers (dict): Dictionary of event type to list of handler functions + """ def __init__(self, id: str, app: FirecrawlApp): self.id = id self.app = app @@ -1246,25 +1713,54 @@ class CrawlWatcher: 'document': [] } - async def connect(self): + async def connect(self) -> None: + """ + Establishes WebSocket connection and starts listening for messages. + """ async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket: await self._listen(websocket) - async def _listen(self, websocket): + async def _listen(self, websocket) -> None: + """ + Listens for incoming WebSocket messages and handles them. + + Args: + websocket: The WebSocket connection object + """ async for message in websocket: msg = json.loads(message) await self._handle_message(msg) - def add_event_listener(self, event_type: str, handler): + def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None: + """ + Adds an event handler function for a specific event type. + + Args: + event_type (str): Type of event to listen for ('done', 'error', or 'document') + handler (Callable): Function to handle the event + """ if event_type in self.event_handlers: self.event_handlers[event_type].append(handler) - def dispatch_event(self, event_type: str, detail: Dict[str, Any]): + def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None: + """ + Dispatches an event to all registered handlers for that event type. + + Args: + event_type (str): Type of event to dispatch + detail (Dict[str, Any]): Event details/data to pass to handlers + """ if event_type in self.event_handlers: for handler in self.event_handlers[event_type]: handler(detail) - async def _handle_message(self, msg: Dict[str, Any]): + async def _handle_message(self, msg: Dict[str, Any]) -> None: + """ + Handles incoming WebSocket messages based on their type. + + Args: + msg (Dict[str, Any]): The message to handle + """ if msg['type'] == 'done': self.status = 'completed' self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id})
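
Usage sketches to accompany the retyped SDK surface (not part of the patch). API keys and URLs are placeholders; parameter names follow the new ScrapeParams/SearchParams models, and responses are handled as the dict-shaped payloads the current implementation still returns, with the new annotations describing their shape.

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# scrape_url is now annotated as ScrapeResponse[Any]; the params mirror the
# CrawlScrapeOptions/ScrapeParams fields defined in this patch.
page = app.scrape_url(
    "https://example.com",
    params={"formats": ["markdown", "links"], "onlyMainContent": True},
)
print(page.get("markdown", "")[:200])
print(page.get("metadata", {}).get("statusCode"))

# search is annotated as SearchResponse; limit/lang/country come from SearchParams.
results = app.search("firecrawl python sdk", params={"limit": 3, "lang": "en", "country": "us"})
for doc in results.get("data", []):
    print(doc.get("url"), "-", doc.get("title"))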
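
A sketch of the synchronous and asynchronous crawl paths. Field access assumes the CrawlResponse/CrawlStatusResponse/CrawlErrorsResponse shapes documented above; the limit, depth, and poll interval values are illustrative only.

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# Blocking call: starts the crawl and polls until it finishes, returning the
# final CrawlStatusResponse-shaped payload (status, completed, total, data, ...).
crawl = app.crawl_url(
    "https://example.com",
    params={"limit": 25, "maxDepth": 2, "scrapeOptions": {"formats": ["markdown"]}},
    poll_interval=5,
)
print(crawl["status"], len(crawl.get("data", [])), "pages")

# Non-blocking call: returns the job id immediately; poll it yourself.
job = app.async_crawl_url("https://example.com", params={"limit": 25})
status = app.check_crawl_status(job["id"])
print(f"{status['completed']}/{status['total']} pages, credits used: {status.get('creditsUsed')}")

# Error reporting for the same job (CrawlErrorsResponse: errors + robotsBlocked).
errors = app.check_crawl_errors(job["id"])
print(errors.get("robotsBlocked", []))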
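
A sketch of URL discovery via map_url; the search filter and limit are illustrative, and the result is treated as the MapResponse-shaped payload (success plus a links list).

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

site_map = app.map_url(
    "https://example.com",
    params={"search": "docs", "includeSubdomains": True, "limit": 100},
)
for link in site_map.get("links", [])[:10]:
    print(link)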
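
A structured-extraction sketch. Passing a plain dict for params (as the other endpoints accept) is an assumption here, since the parameter is annotated as ExtractParams; the schema is a hand-written JSON schema, and a Pydantic model schema should also be usable per the ExtractParams description.

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# JSON schema describing the structure we want back (illustrative).
product_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "price": {"type": "string"},
        "in_stock": {"type": "boolean"},
    },
    "required": ["name"],
}

extracted = app.extract(
    ["https://example.com/products/widget"],  # placeholder URL
    params={
        "prompt": "Extract the product name, price, and availability.",
        "schema": product_schema,
        "showSources": True,
    },
)
# ExtractResponse-shaped payload: success, data matching the schema, sources.
print(extracted.get("data"))
print(extracted.get("sources"))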
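
A sketch of the WebSocket watcher returned by batch_scrape_urls_and_watch. The event names ('document', 'done', 'error') and the watcher API come from the CrawlWatcher class in this patch; passing a plain dict for params is assumed to keep working even though the parameter is now annotated as ScrapeParams.

import asyncio
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

def on_document(detail):
    # Fired once per scraped page streamed over the WebSocket.
    print("scraped:", detail)

def on_done(detail):
    print("batch finished:", detail["status"], "-", len(detail.get("data", [])), "documents")

async def main():
    watcher = app.batch_scrape_urls_and_watch(
        ["https://example.com", "https://example.org"],
        params={"formats": ["markdown"]},
    )
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("done", on_done)
    watcher.add_event_listener("error", lambda detail: print("error:", detail))
    await watcher.connect()  # blocks until the job completes or errors

asyncio.run(main())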
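
Sketches for the LLMs.txt generation and deep research helpers. The callback payload fields and the maxDepth/timeLimit/maxUrls values follow the docstrings above; returned payloads are handled defensively since they remain dict-shaped in the current implementation.

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# LLMs.txt generation: blocks until the job completes, then returns a
# GenerateLLMsTextStatusResponse-shaped payload.
llms = app.generate_llms_text("https://example.com", params={"maxUrls": 5, "showFullText": False})
if llms.get("success"):
    print(llms["data"]["llmstxt"][:200])

# Deep research with a progress callback; on_activity receives
# {type, status, message, timestamp, depth} per the docstring.
def on_activity(activity):
    print(f"[{activity['type']}] {activity['message']}")

research = app.deep_research(
    "What are the main features of Firecrawl?",
    params={"maxDepth": 3, "timeLimit": 120, "maxUrls": 10},
    on_activity=on_activity,
)
print(research.get("status"), "-", len(research.get("sources", [])), "sources")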