Update firecrawl.py

Nicolas 2025-04-18 00:48:07 -07:00
parent 5e6e41ab17
commit d8792d2301


@@ -1608,47 +1608,45 @@ class FirecrawlApp:
     def extract(
             self,
             urls: Optional[List[str]] = None,
-            params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
+            *,
+            prompt: Optional[str] = None,
+            schema_: Optional[Any] = None,
+            system_prompt: Optional[str] = None,
+            allow_external_links: Optional[bool] = False,
+            enable_web_search: Optional[bool] = False,
+            show_sources: Optional[bool] = False,
+            agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Extract structured information from URLs.

         Args:
-            urls: URLs to extract from
-
-            params: See ExtractParams model:
-
-            Extraction Config:
-            * prompt - Custom extraction prompt
-            * schema - JSON schema/Pydantic model
-            * systemPrompt - System context
-
-            Behavior Options:
-            * allowExternalLinks - Follow external links
-            * enableWebSearch - Enable web search
-            * includeSubdomains - Include subdomains
-            * showSources - Include source URLs
-
-            Scraping Options:
-            * scrapeOptions - Page scraping config
+            urls (Optional[List[str]]): URLs to extract from
+            prompt (Optional[str]): Custom extraction prompt
+            schema_ (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration

         Returns:
-            ExtractResponse with:
-            * Structured data matching schema
-            * Source information if requested
-            * Success/error status
+            ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any

         Raises:
             ValueError: If prompt/schema missing or extraction fails
         """
         headers = self._prepare_headers()

-        if not params or (not params.get('prompt') and not params.get('schema')):
+        if not prompt and not schema_:
             raise ValueError("Either prompt or schema is required")

-        if not urls and not params.get('prompt'):
+        if not urls and not prompt:
             raise ValueError("Either urls or prompt is required")

-        schema = params.get('schema')
+        schema = schema_
         if schema:
             if hasattr(schema, 'model_json_schema'):
                 # Convert Pydantic model to JSON schema
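
For context, a minimal sketch of calling the new keyword-only signature (the FirecrawlApp instance, API key, URL, and Product model below are illustrative assumptions, not part of this commit):

    from pydantic import BaseModel
    from firecrawl import FirecrawlApp

    class Product(BaseModel):
        name: str
        price: float

    app = FirecrawlApp(api_key="fc-YOUR-KEY")  # hypothetical key
    # schema_ accepts a Pydantic model; extract() converts it via model_json_schema()
    result = app.extract(
        urls=["https://example.com/product"],
        prompt="Extract the product name and price",
        schema_=Product,
    )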
@@ -1656,26 +1654,22 @@ class FirecrawlApp:
             # Otherwise assume it's already a JSON schema dict

-        jsonData = {'urls': urls, **(params or {})}
         request_data = {
-            'urls': urls,
-            'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
-            'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)),
-            'showSources': params.get('show_sources', params.get('showSources', False)),
+            'urls': urls or [],
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
             'origin': f'python-sdk@{get_version()}'
         }
-        if not request_data['urls']:
-            request_data['urls'] = []

         # Only add prompt and systemPrompt if they exist
-        if params.get('prompt'):
-            request_data['prompt'] = params['prompt']
-        if params.get('system_prompt'):
-            request_data['systemPrompt'] = params['system_prompt']
-        elif params.get('systemPrompt'):  # Check legacy field name
-            request_data['systemPrompt'] = params['systemPrompt']
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt

-        if params.get('agent'):
-            request_data['agent'] = params['agent']
+        if agent:
+            request_data['agent'] = agent

         try:
             # Send the initial extract request
@@ -1706,7 +1700,7 @@ class FirecrawlApp:
                 except:
                     raise Exception(f'Failed to parse Firecrawl response as JSON.')
                 if status_data['status'] == 'completed':
-                    return status_data
+                    return ExtractResponse(**status_data)
                 elif status_data['status'] in ['failed', 'cancelled']:
                     raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
             else:
@@ -1720,7 +1714,7 @@ class FirecrawlApp:
         except Exception as e:
             raise ValueError(str(e), 500)

-        return {'success': False, 'error': "Internal server error."}
+        return ExtractResponse(success=False, error="Internal server error.")

     def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
         """
@@ -1740,7 +1734,7 @@ class FirecrawlApp:
         response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
         if response.status_code == 200:
             try:
-                return response.json()
+                return ExtractResponse(**response.json())
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
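
Because these paths now return an ExtractResponse model rather than a raw dict, callers read fields as attributes. A sketch under that assumption (the job id is illustrative):

    status = app.get_extract_status("job-123")  # hypothetical job id
    if status.success:
        print(status.data)   # structured data matching the schema
    else:
        print(status.error)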
@@ -1751,60 +1745,68 @@ class FirecrawlApp:
     def async_extract(
             self,
             urls: List[str],
-            params: Optional[ExtractParams] = None,
+            *,
+            prompt: Optional[str] = None,
+            schema_: Optional[Any] = None,
+            system_prompt: Optional[str] = None,
+            allow_external_links: Optional[bool] = False,
+            enable_web_search: Optional[bool] = False,
+            show_sources: Optional[bool] = False,
+            agent: Optional[Dict[str, Any]] = None,
             idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extract job.

         Args:
             urls (List[str]): URLs to extract information from
-            params (Optional[ExtractParams]): See ExtractParams model:
-
-            Extraction Config:
-            * prompt - Custom extraction prompt
-            * schema - JSON schema/Pydantic model
-            * systemPrompt - System context
-
-            Behavior Options:
-            * allowExternalLinks - Follow external links
-            * enableWebSearch - Enable web search
-            * includeSubdomains - Include subdomains
-            * showSources - Include source URLs
-
-            Scraping Options:
-            * scrapeOptions - Page scraping config
+            prompt (Optional[str]): Custom extraction prompt
+            schema_ (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests

         Returns:
-            ExtractResponse containing:
-            * success (bool): Whether job started successfully
-            * id (str): Unique identifier for the job
-            * error (str, optional): Error message if start failed
+            ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any

         Raises:
             ValueError: If job initiation fails
         """
         headers = self._prepare_headers(idempotency_key)

-        schema = params.get('schema') if params else None
+        schema = schema_
         if schema:
             if hasattr(schema, 'model_json_schema'):
                 # Convert Pydantic model to JSON schema
                 schema = schema.model_json_schema()
             # Otherwise assume it's already a JSON schema dict

-        jsonData = {'urls': urls, **(params or {})}
         request_data = {
-            **jsonData,
-            'allowExternalLinks': params.get('allow_external_links', False) if params else False,
+            'urls': urls,
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
             'origin': f'python-sdk@{version}'
         }
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+        if agent:
+            request_data['agent'] = agent

         try:
             response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
             if response.status_code == 200:
                 try:
-                    return response.json()
+                    return ExtractResponse(**response.json())
                 except:
                     raise Exception(f'Failed to parse Firecrawl response as JSON.')
             else:
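
A sketch of starting a job with the new signature and checking it later; this assumes the returned ExtractResponse still carries the job id documented in the old docstring, which this commit no longer states explicitly:

    job = app.async_extract(
        ["https://example.com"],
        prompt="Summarize the page",
        enable_web_search=True,
        idempotency_key="extract-2025-04-18",  # hypothetical key
    )
    if job.success:
        status = app.get_extract_status(job.id)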
@@ -1815,21 +1817,18 @@ class FirecrawlApp:
     def generate_llms_text(
             self,
             url: str,
-            params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse:
+            *,
+            max_urls: Optional[int] = None,
+            show_full_text: Optional[bool] = None,
+            experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and poll until completion.

         Args:
-            url: Target URL to generate LLMs.txt from
-
-            params: See GenerateLLMsTextParams model:
-
-            Generation Options:
-            * maxUrls - Maximum URLs to process (default: 10)
-            * showFullText - Include full text in output (default: False)
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming

         Returns:
             GenerateLLMsTextStatusResponse with:
@@ -1841,15 +1840,13 @@ class FirecrawlApp:
         Raises:
             Exception: If generation fails
         """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            generation_params = GenerateLLMsTextParams(**params)
-        else:
-            generation_params = params
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            __experimental_stream=experimental_stream
+        )

-        response = self.async_generate_llms_text(url, generation_params)
+        response = self.async_generate_llms_text(url, params)
         if not response.get('success') or 'id' not in response:
             return response
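
A sketch of the new call style, replacing the earlier dict/GenerateLLMsTextParams argument (the URL and limits are illustrative):

    res = app.generate_llms_text(
        "https://example.com",
        max_urls=5,
        show_full_text=True,
    )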
@@ -1871,35 +1868,36 @@ class FirecrawlApp:
     def async_generate_llms_text(
             self,
             url: str,
-            params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse:
+            *,
+            max_urls: Optional[int] = None,
+            show_full_text: Optional[bool] = None,
+            experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation operation.

         Args:
             url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL.
-            params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Generation configuration parameters:
-            * maxUrls (int, optional): Maximum number of URLs to process (default: 10)
-            * showFullText (bool, optional): Include full text in output (default: False)
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming

         Returns:
             GenerateLLMsTextResponse: A response containing:
-            - success (bool): Whether the generation initiation was successful
-            - id (str): The unique identifier for the generation job
-            - error (str, optional): Error message if initiation failed
+            * success (bool): Whether the generation initiation was successful
+            * id (str): The unique identifier for the generation job
+            * error (str, optional): Error message if initiation failed

         Raises:
             Exception: If the generation job initiation fails.
         """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            generation_params = GenerateLLMsTextParams(**params)
-        else:
-            generation_params = params
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            __experimental_stream=experimental_stream
+        )

         headers = self._prepare_headers()
-        json_data = {'url': url, **generation_params.dict(exclude_none=True)}
+        json_data = {'url': url, **params.dict(exclude_none=True)}
         json_data['origin'] = f"python-sdk@{version}"

         try:
@@ -2172,52 +2170,57 @@ class FirecrawlApp:
     def deep_research(
             self,
             query: str,
-            params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None,
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None,
             on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
             on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
         """
         Initiates a deep research operation on a given query and polls until completion.

         Args:
-            query: Research query or topic to investigate
-
-            params: See DeepResearchParams model:
-
-            Research Settings:
-            * maxDepth - Maximum research depth (default: 7)
-            * timeLimit - Time limit in seconds (default: 270)
-            * maxUrls - Maximum URLs to process (default: 20)
-
-            Callbacks:
-            * on_activity - Progress callback receiving:
-              {type, status, message, timestamp, depth}
-            * on_source - Source discovery callback receiving:
-              {url, title, description}
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+            on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+            on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}

         Returns:
-            DeepResearchResponse containing:
-
-            Status:
-            * success - Whether research completed successfully
-            * status - Current state (processing/completed/failed)
-            * error - Error message if failed
-
-            Results:
-            * id - Unique identifier for the research job
-            * data - Research findings and analysis
-            * sources - List of discovered sources
-            * activities - Research progress log
-            * summaries - Generated research summaries
+            DeepResearchStatusResponse containing:
+            * success (bool): Whether research completed successfully
+            * status (str): Current state (processing/completed/failed)
+            * error (Optional[str]): Error message if failed
+            * id (str): Unique identifier for the research job
+            * data (Any): Research findings and analysis
+            * sources (List[Dict]): List of discovered sources
+            * activities (List[Dict]): Research progress log
+            * summaries (List[str]): Generated research summaries

         Raises:
             Exception: If research fails
         """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            research_params = DeepResearchParams(**params)
-        else:
-            research_params = params
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)

         response = self.async_deep_research(query, research_params)
         if not response.get('success') or 'id' not in response:
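
A sketch of the new keyword call with a progress callback (the query and limits are illustrative; the callback payload shape comes from the docstring above):

    def log_activity(activity):
        # receives {type, status, message, timestamp, depth}
        print(activity["message"])

    research = app.deep_research(
        "What are the latest advances in battery recycling?",
        max_depth=3,
        time_limit=120,
        on_activity=log_activity,
    )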
@@ -2253,19 +2256,30 @@ class FirecrawlApp:
         return {'success': False, 'error': 'Deep research job terminated unexpectedly'}

-    def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
+    def async_deep_research(
+            self,
+            query: str,
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
         """
         Initiates an asynchronous deep research operation.

         Args:
-            query (str): The research query to investigate. Should be a clear, specific question or topic.
-            params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Research configuration parameters:
-            * maxDepth (int, optional): Maximum depth of research exploration (default: 7)
-            * timeLimit (int, optional): Time limit in seconds for research (default: 270)
-            * maxUrls (int, optional): Maximum number of URLs to process (default: 20)
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming

         Returns:
-            DeepResearchResponse: A response containing:
+            Dict[str, Any]: A response containing:
             * success (bool): Whether the research initiation was successful
             * id (str): The unique identifier for the research job
             * error (str, optional): Error message if initiation failed
@@ -2273,13 +2287,20 @@ class FirecrawlApp:
         Raises:
             Exception: If the research initiation fails.
         """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            research_params = DeepResearchParams(**params)
-        else:
-            research_params = params
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)

         headers = self._prepare_headers()