From d8792d2301cb8d5fff2228a01294d69b2b32035c Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 18 Apr 2025 00:48:07 -0700
Subject: [PATCH] python-sdk: replace params dicts with keyword-only arguments in firecrawl.py

---
 apps/python-sdk/firecrawl/firecrawl.py | 359 +++++++++++++------------
 1 file changed, 190 insertions(+), 169 deletions(-)

diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 1eb5f8e7..82ff9606 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -1608,47 +1608,45 @@ class FirecrawlApp:
     def extract(
             self,
             urls: Optional[List[str]] = None,
-            params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
+            *,
+            prompt: Optional[str] = None,
+            schema_: Optional[Any] = None,
+            system_prompt: Optional[str] = None,
+            allow_external_links: Optional[bool] = False,
+            enable_web_search: Optional[bool] = False,
+            show_sources: Optional[bool] = False,
+            agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Extract structured information from URLs.
 
         Args:
-            urls: URLs to extract from
-
-            params: See ExtractParams model:
-
-              Extraction Config:
-              * prompt - Custom extraction prompt
-              * schema - JSON schema/Pydantic model
-              * systemPrompt - System context
-              
-              Behavior Options:
-              * allowExternalLinks - Follow external links
-              * enableWebSearch - Enable web search
-              * includeSubdomains - Include subdomains
-              * showSources - Include source URLs
-              
-              Scraping Options:
-              * scrapeOptions - Page scraping config
+            urls (Optional[List[str]]): URLs to extract from
+            prompt (Optional[str]): Custom extraction prompt
+            schema_ (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
 
         Returns:
-            ExtractResponse with:
-            * Structured data matching schema
-            * Source information if requested
-            * Success/error status
+            ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any
 
         Raises:
             ValueError: If prompt/schema missing or extraction fails
         """
         headers = self._prepare_headers()
 
-        if not params or (not params.get('prompt') and not params.get('schema')):
+        if not prompt and not schema_:
             raise ValueError("Either prompt or schema is required")
 
-        if not urls and not params.get('prompt'):
+        if not urls and not prompt:
             raise ValueError("Either urls or prompt is required")
 
-        schema = params.get('schema')
+        schema = schema_
         if schema:
             if hasattr(schema, 'model_json_schema'):
                 # Convert Pydantic model to JSON schema
@@ -1656,26 +1654,22 @@ class FirecrawlApp:
             # Otherwise assume it's already a JSON schema dict
 
         request_data = {
-            'urls': urls,
-            'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
-            'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)), 
-            'showSources': params.get('show_sources', params.get('showSources', False)),
+            'urls': urls or [],
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
             'origin': f'python-sdk@{get_version()}'
         }
 
-        if not request_data['urls']:
-            request_data['urls'] = []
         # Only add prompt and systemPrompt if they exist
-        if params.get('prompt'):
-            request_data['prompt'] = params['prompt']
-        if params.get('system_prompt'):
-            request_data['systemPrompt'] = params['system_prompt']
-        elif params.get('systemPrompt'):  # Check legacy field name
-            request_data['systemPrompt'] = params['systemPrompt']
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
             
-        if params.get('agent'):
-            request_data['agent'] = params['agent']
+        if agent:
+            request_data['agent'] = agent
 
         try:
             # Send the initial extract request
@@ -1706,7 +1700,7 @@ class FirecrawlApp:
                             except:
                                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
                             if status_data['status'] == 'completed':
-                                return status_data
+                                return ExtractResponse(**status_data)
                             elif status_data['status'] in ['failed', 'cancelled']:
                                 raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
                         else:
@@ -1720,7 +1714,7 @@ class FirecrawlApp:
         except Exception as e:
             raise ValueError(str(e), 500)
 
-        return {'success': False, 'error': "Internal server error."}
+        return ExtractResponse(success=False, error="Internal server error.")
     
     def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
         """
@@ -1740,7 +1734,7 @@ class FirecrawlApp:
             response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
             if response.status_code == 200:
                 try:
-                    return response.json()
+                    return ExtractResponse(**response.json())
                 except:
                     raise Exception(f'Failed to parse Firecrawl response as JSON.')
             else:
@@ -1751,60 +1745,68 @@ class FirecrawlApp:
     def async_extract(
             self,
             urls: List[str],
-            params: Optional[ExtractParams] = None,
+            *,
+            prompt: Optional[str] = None,
+            schema_: Optional[Any] = None,
+            system_prompt: Optional[str] = None,
+            allow_external_links: Optional[bool] = False,
+            enable_web_search: Optional[bool] = False,
+            show_sources: Optional[bool] = False,
+            agent: Optional[Dict[str, Any]] = None,
             idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extract job.
 
         Args:
             urls (List[str]): URLs to extract information from
-            params (Optional[ExtractParams]): See ExtractParams model:
-              Extraction Config:
-              * prompt - Custom extraction prompt
-              * schema - JSON schema/Pydantic model
-              * systemPrompt - System context
-              
-              Behavior Options:
-              * allowExternalLinks - Follow external links
-              * enableWebSearch - Enable web search
-              * includeSubdomains - Include subdomains
-              * showSources - Include source URLs
-              
-              Scraping Options:
-              * scrapeOptions - Page scraping config
+            prompt (Optional[str]): Custom extraction prompt
+            schema_ (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
 
         Returns:
-          ExtractResponse containing:
-          * success (bool): Whether job started successfully
-          * id (str): Unique identifier for the job
-          * error (str, optional): Error message if start failed
+            ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any
 
         Raises:
-          ValueError: If job initiation fails
+            ValueError: If job initiation fails
         """
         headers = self._prepare_headers(idempotency_key)
         
-        schema = params.get('schema') if params else None
+        schema = schema_
         if schema:
             if hasattr(schema, 'model_json_schema'):
                 # Convert Pydantic model to JSON schema
                 schema = schema.model_json_schema()
             # Otherwise assume it's already a JSON schema dict
 
-        jsonData = {'urls': urls, **(params or {})}
         request_data = {
-            **jsonData,
-            'allowExternalLinks': params.get('allow_external_links', False) if params else False,
+            'urls': urls,
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
             'origin': f'python-sdk@{version}'
         }
 
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+        if agent:
+            request_data['agent'] = agent
+
         try:
             response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
             if response.status_code == 200:
                 try:
-                    return response.json()
+                    return ExtractResponse(**response.json())
                 except:
                     raise Exception(f'Failed to parse Firecrawl response as JSON.')
             else:
@@ -1815,41 +1817,36 @@ class FirecrawlApp:
     def generate_llms_text(
             self,
             url: str,
-            params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse:
+            *,
+            max_urls: Optional[int] = None,
+            show_full_text: Optional[bool] = None,
+            experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.
 
         Args:
-          url: Target URL to generate LLMs.txt from
-
-            params: See GenerateLLMsTextParams model:
-            params: See GenerateLLMsTextParams model:
-
-          params: See GenerateLLMsTextParams model:
-
-            Generation Options:
-            * maxUrls - Maximum URLs to process (default: 10)
-            * showFullText - Include full text in output (default: False)
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
-          GenerateLLMsTextStatusResponse with:
-          * Generated LLMs.txt content
-          * Full version if requested
-          * Generation status
-          * Success/error information
+            GenerateLLMsTextStatusResponse with:
+            * Generated LLMs.txt content
+            * Full version if requested
+            * Generation status
+            * Success/error information
 
         Raises:
-          Exception: If generation fails
+            Exception: If generation fails
         """
-        if params is None:
-            params = {}
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            **{'__experimental_stream': experimental_stream}  # dict-unpack: a literal __kwarg is name-mangled inside the class
+        )
 
-        if isinstance(params, dict):
-            generation_params = GenerateLLMsTextParams(**params)
-        else:
-            generation_params = params
-
-        response = self.async_generate_llms_text(url, generation_params)
+        response = self.async_generate_llms_text(url, max_urls=max_urls, show_full_text=show_full_text, experimental_stream=experimental_stream)
         if not response.get('success') or 'id' not in response:
             return response
 
@@ -1871,35 +1868,36 @@ class FirecrawlApp:
     def async_generate_llms_text(
             self,
             url: str,
-            params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse:
+            *,
+            max_urls: Optional[int] = None,
+            show_full_text: Optional[bool] = None,
+            experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.
 
         Args:
-          url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
-          params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Generation configuration parameters:
-            * maxUrls (int, optional): Maximum number of URLs to process (default: 10)
-            * showFullText (bool, optional): Include full text in output (default: False)
+            url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming
 
         Returns:
-          GenerateLLMsTextResponse: A response containing:
-            - success (bool): Whether the generation initiation was successful
-            - id (str): The unique identifier for the generation job
-            - error (str, optional): Error message if initiation failed
+            GenerateLLMsTextResponse: A response containing:
+            * success (bool): Whether the generation initiation was successful
+            * id (str): The unique identifier for the generation job
+            * error (str, optional): Error message if initiation failed
 
         Raises:
-          Exception: If the generation job initiation fails.
+            Exception: If the generation job initiation fails.
         """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            generation_params = GenerateLLMsTextParams(**params)
-        else:
-            generation_params = params
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            **{'__experimental_stream': experimental_stream}  # dict-unpack: a literal __kwarg is name-mangled inside the class
+        )
 
         headers = self._prepare_headers()
-        json_data = {'url': url, **generation_params.dict(exclude_none=True)}
+        json_data = {'url': url, **params.dict(exclude_none=True)}
         json_data['origin'] = f"python-sdk@{version}"
 
         try:
@@ -1921,20 +1919,20 @@ class FirecrawlApp:
         Check the status of a LLMs.txt generation operation.
 
         Args:
-          id (str): The unique identifier of the LLMs.txt generation job to check status for.
+            id (str): The unique identifier of the LLMs.txt generation job to check status for.
 
         Returns:
-          GenerateLLMsTextStatusResponse: A response containing:
-          * success (bool): Whether the generation was successful
-          * status (str): Status of generation ("processing", "completed", "failed")
-          * data (Dict[str, str], optional): Generated text with fields:
-            * llmstxt (str): Generated LLMs.txt content
-            * llmsfulltxt (str, optional): Full version if requested
-          * error (str, optional): Error message if generation failed
-          * expiresAt (str): When the generated data expires
+            GenerateLLMsTextStatusResponse: A response containing:
+            * success (bool): Whether the generation was successful
+            * status (str): Status of generation ("processing", "completed", "failed")
+            * data (Dict[str, str], optional): Generated text with fields:
+              * llmstxt (str): Generated LLMs.txt content
+              * llmsfulltxt (str, optional): Full version if requested
+            * error (str, optional): Error message if generation failed
+            * expiresAt (str): When the generated data expires
 
         Raises:
-          Exception: If the status check fails.
+            Exception: If the status check fails.
         """
         headers = self._prepare_headers()
         try:
@@ -2172,52 +2170,66 @@ class FirecrawlApp:
     def deep_research(
             self,
             query: str,
-            params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None, 
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None,
             on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
             on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
         """
         Initiates a deep research operation on a given query and polls until completion.
 
         Args:
-          query: Research query or topic to investigate
-
-          params: See DeepResearchParams model:
-            Research Settings:
-              * maxDepth - Maximum research depth (default: 7)
-              * timeLimit - Time limit in seconds (default: 270)
-              * maxUrls - Maximum URLs to process (default: 20)
-
-          Callbacks:
-          * on_activity - Progress callback receiving:
-              {type, status, message, timestamp, depth}
-          * on_source - Source discovery callback receiving:
-              {url, title, description}
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+            on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+            on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}
 
         Returns:
-          DeepResearchResponse containing:
-
-          Status:
-          * success - Whether research completed successfully
-          * status - Current state (processing/completed/failed)
-          * error - Error message if failed
-          
-          Results:
-          * id - Unique identifier for the research job
-          * data - Research findings and analysis
-          * sources - List of discovered sources
-          * activities - Research progress log
-          * summaries - Generated research summaries
+            DeepResearchStatusResponse containing:
+            * success (bool): Whether research completed successfully
+            * status (str): Current state (processing/completed/failed)
+            * error (Optional[str]): Error message if failed
+            * id (str): Unique identifier for the research job
+            * data (Any): Research findings and analysis
+            * sources (List[Dict]): List of discovered sources
+            * activities (List[Dict]): Research progress log
+            * summaries (List[str]): Generated research summaries
 
         Raises:
-          Exception: If research fails
+            Exception: If research fails
         """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            research_params = DeepResearchParams(**params)
-        else:
-            research_params = params
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)
 
-        response = self.async_deep_research(query, research_params)
+        # async_deep_research takes its options as keyword-only arguments, not a params object
+        response = self.async_deep_research(
+            query,
+            max_depth=max_depth,
+            time_limit=time_limit,
+            max_urls=max_urls,
+            analysis_prompt=analysis_prompt,
+            system_prompt=system_prompt,
+            __experimental_stream_steps=__experimental_stream_steps
+        )
         if not response.get('success') or 'id' not in response:
@@ -2253,19 +2256,30 @@ class FirecrawlApp:
 
         return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
 
-    def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
+    def async_deep_research(
+            self,
+            query: str,
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
         """
         Initiates an asynchronous deep research operation.
 
         Args:
-            query (str): The research query to investigate. Should be a clear, specific question or topic.
-            params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Research configuration parameters:
-              * maxDepth (int, optional): Maximum depth of research exploration (default: 7)
-              * timeLimit (int, optional): Time limit in seconds for research (default: 270)
-              * maxUrls (int, optional): Maximum number of URLs to process (default: 20)
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
 
         Returns:
-          DeepResearchResponse: A response containing:
+            Dict[str, Any]: A response containing:
             * success (bool): Whether the research initiation was successful
             * id (str): The unique identifier for the research job
             * error (str, optional): Error message if initiation failed
@@ -2273,13 +2287,20 @@ class FirecrawlApp:
         Raises:
             Exception: If the research initiation fails.
         """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            research_params = DeepResearchParams(**params)
-        else:
-            research_params = params
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)
 
         headers = self._prepare_headers()