From 79bc54c11e9f384aa15495c1cf4ad973e5af5c0c Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 18 Apr 2025 13:00:05 -0700 Subject: [PATCH 1/5] scrape options fixing types --- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 214 ++++++++++++++----------- 2 files changed, 121 insertions(+), 95 deletions(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index b641b7cd..ac4a6eca 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/ import logging import os -from .firecrawl import FirecrawlApp, JsonConfig # noqa +from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa __version__ = "2.0.2" diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 39323044..ef67ece8 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -97,6 +97,16 @@ class ActionsResult(pydantic.BaseModel): """Result of actions performed during scraping.""" screenshots: List[str] +class ChangeTrackingData(pydantic.BaseModel): + """ + Data for the change tracking format. + """ + previousScrapeAt: Optional[str] = None + changeStatus: str # "new" | "same" | "changed" | "removed" + visibility: str # "visible" | "hidden" + diff: Optional[Dict[str, Any]] = None + json: Optional[Any] = None + class FirecrawlDocument(pydantic.BaseModel, Generic[T]): """Document retrieved or processed by Firecrawl.""" url: Optional[str] = None @@ -111,6 +121,7 @@ class FirecrawlDocument(pydantic.BaseModel, Generic[T]): actions: Optional[ActionsResult] = None title: Optional[str] = None # v1 search only description: Optional[str] = None # v1 search only + changeTracking: Optional[ChangeTrackingData] = None class LocationConfig(pydantic.BaseModel): """Location configuration for scraping.""" @@ -124,9 +135,9 @@ class WebhookConfig(pydantic.BaseModel): metadata: Optional[Dict[str, str]] = None events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None -class CommonOptions(pydantic.BaseModel): +class ScrapeOptions(pydantic.BaseModel): """Parameters for scraping operations.""" - formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None headers: Optional[Dict[str, str]] = None includeTags: Optional[List[str]] = None excludeTags: Optional[List[str]] = None @@ -193,7 +204,7 @@ class JsonConfig(pydantic.BaseModel): systemPrompt: Optional[str] = None agent: Optional[ExtractAgent] = None -class ScrapeParams(CommonOptions): +class ScrapeParams(ScrapeOptions): """Parameters for scraping operations.""" extract: Optional[JsonConfig] = None jsonOptions: Optional[JsonConfig] = None @@ -235,7 +246,7 @@ class CrawlParams(pydantic.BaseModel): allowBackwardLinks: Optional[bool] = None allowExternalLinks: Optional[bool] = None ignoreSitemap: Optional[bool] = None - scrapeOptions: Optional[CommonOptions] = None + scrapeOptions: Optional[ScrapeOptions] = None webhook: Optional[Union[str, WebhookConfig]] = None deduplicateSimilarURLs: Optional[bool] = None ignoreQueryParameters: Optional[bool] = None @@ -289,7 +300,7 @@ class 
ExtractParams(pydantic.BaseModel): includeSubdomains: Optional[bool] = None origin: Optional[str] = None showSources: Optional[bool] = None - scrapeOptions: Optional[CommonOptions] = None + scrapeOptions: Optional[ScrapeOptions] = None class ExtractResponse(pydantic.BaseModel, Generic[T]): """Response from extract operations.""" @@ -309,7 +320,7 @@ class SearchParams(pydantic.BaseModel): location: Optional[str] = None origin: Optional[str] = "api" timeout: Optional[int] = 60000 - scrapeOptions: Optional[CommonOptions] = None + scrapeOptions: Optional[ScrapeOptions] = None class SearchResponse(pydantic.BaseModel): """Response from search operations.""" @@ -377,16 +388,6 @@ class GenerateLLMsTextStatusResponse(pydantic.BaseModel): status: Literal["processing", "completed", "failed"] error: Optional[str] = None expiresAt: str - -class ChangeTrackingData(pydantic.BaseModel): - """ - Data for the change tracking format. - """ - previousScrapeAt: Optional[str] = None - changeStatus: str # "new" | "same" | "changed" | "removed" - visibility: str # "visible" | "hidden" - diff: Optional[Dict[str, Any]] = None - json: Optional[Any] = None class SearchResponse(pydantic.BaseModel): """ @@ -442,7 +443,7 @@ class FirecrawlApp: self, url: str, *, - formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, only_main_content: Optional[bool] = None, @@ -568,7 +569,7 @@ class FirecrawlApp: country: Optional[str] = None, location: Optional[str] = None, timeout: Optional[int] = None, - scrape_options: Optional[CommonOptions] = None, + scrape_options: Optional[ScrapeOptions] = None, params: Optional[Union[Dict[str, Any], SearchParams]] = None, **kwargs) -> SearchResponse: """ @@ -583,7 +584,7 @@ class FirecrawlApp: country (Optional[str]): Country code (default: "us") location (Optional[str]): Geo-targeting timeout (Optional[int]): Request timeout in milliseconds - scrape_options (Optional[CommonOptions]): Result scraping configuration + scrape_options (Optional[ScrapeOptions]): Result scraping configuration params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters **kwargs: Additional keyword arguments for future compatibility @@ -664,7 +665,7 @@ class FirecrawlApp: allow_backward_links: Optional[bool] = None, allow_external_links: Optional[bool] = None, ignore_sitemap: Optional[bool] = None, - scrape_options: Optional[CommonOptions] = None, + scrape_options: Optional[ScrapeOptions] = None, webhook: Optional[Union[str, WebhookConfig]] = None, deduplicate_similar_urls: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None, @@ -686,7 +687,7 @@ class FirecrawlApp: allow_backward_links (Optional[bool]): Follow parent directory links allow_external_links (Optional[bool]): Follow external domain links ignore_sitemap (Optional[bool]): Skip sitemap.xml processing - scrape_options (Optional[CommonOptions]): Page scraping configuration + scrape_options (Optional[ScrapeOptions]): Page scraping configuration webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings deduplicate_similar_urls (Optional[bool]): Remove similar URLs ignore_query_parameters (Optional[bool]): Ignore URL parameters @@ -768,7 +769,7 @@ class 
FirecrawlApp: allow_backward_links: Optional[bool] = None, allow_external_links: Optional[bool] = None, ignore_sitemap: Optional[bool] = None, - scrape_options: Optional[CommonOptions] = None, + scrape_options: Optional[ScrapeOptions] = None, webhook: Optional[Union[str, WebhookConfig]] = None, deduplicate_similar_urls: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None, @@ -789,7 +790,7 @@ class FirecrawlApp: allow_backward_links (Optional[bool]): Follow parent directory links allow_external_links (Optional[bool]): Follow external domain links ignore_sitemap (Optional[bool]): Skip sitemap.xml processing - scrape_options (Optional[CommonOptions]): Page scraping configuration + scrape_options (Optional[ScrapeOptions]): Page scraping configuration webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings deduplicate_similar_urls (Optional[bool]): Remove similar URLs ignore_query_parameters (Optional[bool]): Ignore URL parameters @@ -1007,7 +1008,7 @@ class FirecrawlApp: allow_backward_links: Optional[bool] = None, allow_external_links: Optional[bool] = None, ignore_sitemap: Optional[bool] = None, - scrape_options: Optional[CommonOptions] = None, + scrape_options: Optional[ScrapeOptions] = None, webhook: Optional[Union[str, WebhookConfig]] = None, deduplicate_similar_urls: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None, @@ -1028,7 +1029,7 @@ class FirecrawlApp: allow_backward_links (Optional[bool]): Follow parent directory links allow_external_links (Optional[bool]): Follow external domain links ignore_sitemap (Optional[bool]): Skip sitemap.xml processing - scrape_options (Optional[CommonOptions]): Page scraping configuration + scrape_options (Optional[ScrapeOptions]): Page scraping configuration webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings deduplicate_similar_urls (Optional[bool]): Remove similar URLs ignore_query_parameters (Optional[bool]): Ignore URL parameters @@ -2922,9 +2923,9 @@ class AsyncFirecrawlApp(FirecrawlApp): headers ) - if response.status_code == 200: + if response.get('success'): try: - id = response.json().get('id') + id = response.get('id') except: raise Exception(f'Failed to parse Firecrawl response as JSON.') return self._monitor_job_status(id, headers, poll_interval) @@ -3050,7 +3051,7 @@ class AsyncFirecrawlApp(FirecrawlApp): headers ) - if response.status_code == 200: + if response.get('status_code') == 200: try: return BatchScrapeResponse(**response.json()) except: @@ -3059,7 +3060,7 @@ class AsyncFirecrawlApp(FirecrawlApp): self._handle_error(response, 'start batch scrape job') async def crawl_url( - self, + self, url: str, *, include_paths: Optional[List[str]] = None, @@ -3070,7 +3071,7 @@ class AsyncFirecrawlApp(FirecrawlApp): allow_backward_links: Optional[bool] = None, allow_external_links: Optional[bool] = None, ignore_sitemap: Optional[bool] = None, - scrape_options: Optional[CommonOptions] = None, + scrape_options: Optional[ScrapeOptions] = None, webhook: Optional[Union[str, WebhookConfig]] = None, deduplicate_similar_urls: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None, @@ -3092,7 +3093,7 @@ class AsyncFirecrawlApp(FirecrawlApp): allow_backward_links (Optional[bool]): Follow parent directory links allow_external_links (Optional[bool]): Follow external domain links ignore_sitemap (Optional[bool]): Skip sitemap.xml processing - scrape_options (Optional[CommonOptions]): Page scraping configuration + scrape_options (Optional[ScrapeOptions]): Page 
scraping configuration webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings deduplicate_similar_urls (Optional[bool]): Remove similar URLs ignore_query_parameters (Optional[bool]): Ignore URL parameters @@ -3148,15 +3149,15 @@ class AsyncFirecrawlApp(FirecrawlApp): params_dict = final_params.dict(exclude_none=True) params_dict['url'] = url params_dict['origin'] = f"python-sdk@{version}" - # Make request headers = self._prepare_headers(idempotency_key) response = await self._async_post_request( f'{self.api_url}/v1/crawl', params_dict, headers) - if response.status_code == 200: + print(response) + if response.get('success'): try: - id = response.json().get('id') + id = response.get('id') except: raise Exception(f'Failed to parse Firecrawl response as JSON.') return self._monitor_job_status(id, headers, poll_interval) @@ -3176,11 +3177,12 @@ class AsyncFirecrawlApp(FirecrawlApp): allow_backward_links: Optional[bool] = None, allow_external_links: Optional[bool] = None, ignore_sitemap: Optional[bool] = None, - scrape_options: Optional[CommonOptions] = None, + scrape_options: Optional[ScrapeOptions] = None, webhook: Optional[Union[str, WebhookConfig]] = None, deduplicate_similar_urls: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None, regex_on_full_url: Optional[bool] = None, + poll_interval: Optional[int] = 2, idempotency_key: Optional[str] = None, **kwargs ) -> CrawlResponse: @@ -3197,7 +3199,7 @@ class AsyncFirecrawlApp(FirecrawlApp): allow_backward_links (Optional[bool]): Follow parent directory links allow_external_links (Optional[bool]): Follow external domain links ignore_sitemap (Optional[bool]): Skip sitemap.xml processing - scrape_options (Optional[CommonOptions]): Page scraping configuration + scrape_options (Optional[ScrapeOptions]): Page scraping configuration webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings deduplicate_similar_urls (Optional[bool]): Remove similar URLs ignore_query_parameters (Optional[bool]): Ignore URL parameters @@ -3262,9 +3264,9 @@ class AsyncFirecrawlApp(FirecrawlApp): headers ) - if response.status_code == 200: + if response.get('success'): try: - return CrawlResponse(**response.json()) + return CrawlResponse(**response) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: @@ -3303,7 +3305,7 @@ class AsyncFirecrawlApp(FirecrawlApp): headers ) - if status_data['status'] == 'completed': + if status_data.get('status') == 'completed': if 'data' in status_data: data = status_data['data'] while 'next' in status_data: @@ -3317,26 +3319,24 @@ class AsyncFirecrawlApp(FirecrawlApp): data.extend(next_data.get('data', [])) status_data = next_data status_data['data'] = data - - response = { - 'status': status_data.get('status'), - 'total': status_data.get('total'), - 'completed': status_data.get('completed'), - 'creditsUsed': status_data.get('creditsUsed'), - 'expiresAt': status_data.get('expiresAt'), - 'data': status_data.get('data') - } + # Create CrawlStatusResponse object from status data + response = CrawlStatusResponse( + status=status_data.get('status'), + total=status_data.get('total'), + completed=status_data.get('completed'), + creditsUsed=status_data.get('creditsUsed'), + expiresAt=status_data.get('expiresAt'), + data=status_data.get('data'), + success=False if 'error' in status_data else True + ) if 'error' in status_data: - response['error'] = status_data['error'] + response.error = status_data.get('error') if 'next' in status_data: - response['next'] = 
status_data['next'] + response.next = status_data.get('next') - return { - 'success': False if 'error' in status_data else True, - **response - } + return response async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse: """ @@ -3359,7 +3359,7 @@ class AsyncFirecrawlApp(FirecrawlApp): headers ) - if status_data['status'] == 'completed': + if status_data.get('status') == 'completed': if 'data' in status_data: data = status_data['data'] while 'next' in status_data: @@ -3376,15 +3376,22 @@ class AsyncFirecrawlApp(FirecrawlApp): return status_data else: raise Exception('Job completed but no data was returned') - elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: + elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: await asyncio.sleep(max(poll_interval, 2)) else: raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}') async def map_url( - self, - url: str, - params: Optional[MapParams] = None) -> MapResponse: + self, + url: str, + *, + search: Optional[str] = None, + ignore_sitemap: Optional[bool] = None, + include_subdomains: Optional[bool] = None, + sitemap_only: Optional[bool] = None, + limit: Optional[int] = None, + timeout: Optional[int] = None, + params: Optional[MapParams] = None) -> MapResponse: """ Asynchronously map and discover links from a URL. @@ -3409,21 +3416,40 @@ class AsyncFirecrawlApp(FirecrawlApp): Raises: Exception: If mapping fails """ - headers = self._prepare_headers() - json_data = {'url': url} + map_params = {} if params: - json_data.update(params) - json_data['origin'] = f"python-sdk@{version}" + map_params.update(params.dict(exclude_none=True)) + # Add individual parameters + if search is not None: + map_params['search'] = search + if ignore_sitemap is not None: + map_params['ignoreSitemap'] = ignore_sitemap + if include_subdomains is not None: + map_params['includeSubdomains'] = include_subdomains + if sitemap_only is not None: + map_params['sitemapOnly'] = sitemap_only + if limit is not None: + map_params['limit'] = limit + if timeout is not None: + map_params['timeout'] = timeout + + # Create final params object + final_params = MapParams(**map_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['url'] = url + params_dict['origin'] = f"python-sdk@{version}" + + # Make request endpoint = f'/v1/map' response = await self._async_post_request( f'{self.api_url}{endpoint}', - json_data, - headers + params_dict, + headers={"Authorization": f"Bearer {self.api_key}"} ) if response.get('success') and 'links' in response: - return response + return MapResponse(**response) elif 'error' in response: raise Exception(f'Failed to map URL. 
Error: {response["error"]}') else: @@ -3472,14 +3498,14 @@ class AsyncFirecrawlApp(FirecrawlApp): if hasattr(schema, 'model_json_schema'): schema = schema.model_json_schema() - request_data = { - 'urls': urls, - 'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)), - 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)), - 'showSources': params.get('show_sources', params.get('showSources', False)), - 'schema': schema, - 'origin': f'python-sdk@{version}' - } + request_data = ExtractResponse( + urls=urls, + allowExternalLinks=params.get('allow_external_links', params.get('allowExternalLinks', False)), + enableWebSearch=params.get('enable_web_search', params.get('enableWebSearch', False)), + showSources=params.get('show_sources', params.get('showSources', False)), + schema=schema, + origin=f'python-sdk@{version}' + ) if params.get('prompt'): request_data['prompt'] = params['prompt'] @@ -3562,14 +3588,14 @@ class AsyncFirecrawlApp(FirecrawlApp): status_data = next_data status_data['data'] = data - response = { - 'status': status_data.get('status'), - 'total': status_data.get('total'), - 'completed': status_data.get('completed'), - 'creditsUsed': status_data.get('creditsUsed'), - 'expiresAt': status_data.get('expiresAt'), - 'data': status_data.get('data') - } + response = BatchScrapeStatusResponse( + status=status_data.get('status'), + total=status_data.get('total'), + completed=status_data.get('completed'), + creditsUsed=status_data.get('creditsUsed'), + expiresAt=status_data.get('expiresAt'), + data=status_data.get('data') + ) if 'error' in status_data: response['error'] = status_data['error'] @@ -3726,14 +3752,14 @@ class AsyncFirecrawlApp(FirecrawlApp): if hasattr(schema, 'model_json_schema'): schema = schema.model_json_schema() - request_data = { - 'urls': urls or [], - 'allowExternalLinks': allow_external_links, - 'enableWebSearch': enable_web_search, - 'showSources': show_sources, - 'schema': schema, - 'origin': f'python-sdk@{version}' - } + request_data = ExtractResponse( + urls=urls or [], + allowExternalLinks=allow_external_links, + enableWebSearch=enable_web_search, + showSources=show_sources, + schema=schema, + origin=f'python-sdk@{version}' + ) if prompt: request_data['prompt'] = prompt @@ -3810,7 +3836,7 @@ class AsyncFirecrawlApp(FirecrawlApp): await asyncio.sleep(2) - return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'} + return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly') async def async_generate_llms_text( self, @@ -3981,7 +4007,7 @@ class AsyncFirecrawlApp(FirecrawlApp): await asyncio.sleep(2) - return {'success': False, 'error': 'Deep research job terminated unexpectedly'} + return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly') async def async_deep_research( self, @@ -4088,7 +4114,7 @@ class AsyncFirecrawlApp(FirecrawlApp): country: Optional[str] = None, location: Optional[str] = None, timeout: Optional[int] = None, - scrape_options: Optional[CommonOptions] = None, + scrape_options: Optional[ScrapeOptions] = None, params: Optional[Union[Dict[str, Any], SearchParams]] = None, **kwargs) -> SearchResponse: """ @@ -4103,7 +4129,7 @@ class AsyncFirecrawlApp(FirecrawlApp): country (Optional[str]): Country code (default: "us") location (Optional[str]): Geo-targeting timeout (Optional[int]): Request timeout in milliseconds - scrape_options (Optional[CommonOptions]): 
Result scraping configuration + scrape_options (Optional[ScrapeOptions]): Result scraping configuration params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters **kwargs: Additional keyword arguments for future compatibility From 0aedef7210ac482ad5664b780daa3561363b773f Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 18 Apr 2025 13:37:09 -0700 Subject: [PATCH 2/5] fix --- apps/python-sdk/example.py | 21 +----- apps/python-sdk/example_async.py | 18 ++++- apps/python-sdk/firecrawl/firecrawl.py | 93 ++++++++++++++------------ 3 files changed, 70 insertions(+), 62 deletions(-) diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index ded241cd..9b28e17e 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -42,23 +42,7 @@ while attempts > 0 and crawl_status.status != 'completed': crawl_status = app.check_crawl_status(async_result.id) print(crawl_status) -# LLM Extraction: -# Define schema to extract contents into using pydantic -class ArticleSchema(BaseModel): - title: str - points: int - by: str - commentsURL: str - -class TopArticlesSchema(BaseModel): - top: List[ArticleSchema] = Field(..., description="Top 5 stories") - -extract_config = JsonConfig(schema=TopArticlesSchema.model_json_schema()) - -llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config) - -print(llm_extraction_result.extract) - +# JSON format: # Define schema to extract contents into using json schema json_schema = { "type": "object", @@ -86,9 +70,6 @@ llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=[ print(llm_extraction_result.json) -print(llm_extraction_result['llm_extraction']) - - # Map a website: map_result = app.map_url('https://firecrawl.dev', search="blog") print(map_result) diff --git a/apps/python-sdk/example_async.py b/apps/python-sdk/example_async.py index b4b76abf..1ad02244 100644 --- a/apps/python-sdk/example_async.py +++ b/apps/python-sdk/example_async.py @@ -2,7 +2,7 @@ import time import nest_asyncio import uuid import asyncio -from firecrawl.firecrawl import AsyncFirecrawlApp +from firecrawl.firecrawl import AsyncFirecrawlApp, ScrapeOptions, JsonConfig from pydantic import BaseModel, Field from typing import List @@ -84,6 +84,20 @@ async def example_map_and_extract(): extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema) print(extract_result) +async def example_deep_research(): + # Deep research example + research_result = await app.deep_research( + "What are the latest developments in large language models?", + max_urls=4 + ) + print("Research Results:", research_result) + +async def example_generate_llms_text(): + # Generate LLMs.txt example + llms_result = await app.generate_llms_text( + "https://firecrawl.dev") + print("LLMs.txt Results:", llms_result) + # Define event handlers for websocket def on_document(detail): print("DOC", detail) @@ -115,6 +129,8 @@ async def main(): await example_llm_extraction() await example_map_and_extract() await example_websocket_crawl() + await example_deep_research() + await example_generate_llms_text() if __name__ == "__main__": asyncio.run(main()) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index ef67ece8..8f12e781 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py 
@@ -1742,7 +1742,7 @@ class FirecrawlApp: def async_extract( self, - urls: List[str], + urls: Optional[List[str]] = None, *, prompt: Optional[str] = None, schema: Optional[Any] = None, @@ -1750,8 +1750,7 @@ class FirecrawlApp: allow_external_links: Optional[bool] = False, enable_web_search: Optional[bool] = False, show_sources: Optional[bool] = False, - agent: Optional[Dict[str, Any]] = None, - idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: + agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]: """ Initiate an asynchronous extract job. @@ -1775,7 +1774,7 @@ class FirecrawlApp: Raises: ValueError: If job initiation fails """ - headers = self._prepare_headers(idempotency_key) + headers = self._prepare_headers() schema = schema if schema: @@ -3457,27 +3456,28 @@ class AsyncFirecrawlApp(FirecrawlApp): async def extract( self, - urls: List[str], - params: Optional[ExtractParams] = None) -> ExtractResponse[Any]: + urls: Optional[List[str]] = None, + *, + prompt: Optional[str] = None, + schema: Optional[Any] = None, + system_prompt: Optional[str] = None, + allow_external_links: Optional[bool] = False, + enable_web_search: Optional[bool] = False, + show_sources: Optional[bool] = False, + agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]: + """ Asynchronously extract structured information from URLs. Args: - urls (List[str]): URLs to extract from - params (Optional[ExtractParams]): See ExtractParams model: - Extraction Config: - * prompt - Custom extraction prompt - * schema - JSON schema/Pydantic model - * systemPrompt - System context - - Behavior Options: - * allowExternalLinks - Follow external links - * enableWebSearch - Enable web search - * includeSubdomains - Include subdomains - * showSources - Include source URLs - - Scraping Options: - * scrapeOptions - Page scraping config + urls (Optional[List[str]]): URLs to extract from + prompt (Optional[str]): Custom extraction prompt + schema (Optional[Any]): JSON schema/Pydantic model + system_prompt (Optional[str]): System context + allow_external_links (Optional[bool]): Follow external links + enable_web_search (Optional[bool]): Enable web search + show_sources (Optional[bool]): Include source URLs + agent (Optional[Dict[str, Any]]): Agent configuration Returns: ExtractResponse with: @@ -3490,29 +3490,35 @@ class AsyncFirecrawlApp(FirecrawlApp): """ headers = self._prepare_headers() - if not params or (not params.get('prompt') and not params.get('schema')): + if not prompt and not schema: raise ValueError("Either prompt or schema is required") - schema = params.get('schema') + if not urls and not prompt: + raise ValueError("Either urls or prompt is required") + if schema: if hasattr(schema, 'model_json_schema'): + # Convert Pydantic model to JSON schema schema = schema.model_json_schema() + # Otherwise assume it's already a JSON schema dict - request_data = ExtractResponse( - urls=urls, - allowExternalLinks=params.get('allow_external_links', params.get('allowExternalLinks', False)), - enableWebSearch=params.get('enable_web_search', params.get('enableWebSearch', False)), - showSources=params.get('show_sources', params.get('showSources', False)), - schema=schema, - origin=f'python-sdk@{version}' - ) + request_data = { + 'urls': urls or [], + 'allowExternalLinks': allow_external_links, + 'enableWebSearch': enable_web_search, + 'showSources': show_sources, + 'schema': schema, + 'origin': f'python-sdk@{get_version()}' + } - if params.get('prompt'): - request_data['prompt'] = params['prompt'] - if 
params.get('system_prompt'): - request_data['systemPrompt'] = params['system_prompt'] - elif params.get('systemPrompt'): - request_data['systemPrompt'] = params['systemPrompt'] + # Only add prompt and systemPrompt if they exist + if prompt: + request_data['prompt'] = prompt + if system_prompt: + request_data['systemPrompt'] = system_prompt + + if agent: + request_data['agent'] = agent response = await self._async_post_request( f'{self.api_url}/v1/extract', @@ -3532,7 +3538,7 @@ class AsyncFirecrawlApp(FirecrawlApp): ) if status_data['status'] == 'completed': - return status_data + return ExtractResponse(**status_data) elif status_data['status'] in ['failed', 'cancelled']: raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}') @@ -3715,8 +3721,7 @@ class AsyncFirecrawlApp(FirecrawlApp): allow_external_links: Optional[bool] = False, enable_web_search: Optional[bool] = False, show_sources: Optional[bool] = False, - agent: Optional[Dict[str, Any]] = None, - idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: + agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]: """ Initiate an asynchronous extraction job without waiting for completion. @@ -3740,7 +3745,7 @@ class AsyncFirecrawlApp(FirecrawlApp): Raises: ValueError: If job initiation fails """ - headers = self._prepare_headers(idempotency_key) + headers = self._prepare_headers() if not prompt and not schema: raise ValueError("Either prompt or schema is required") @@ -3871,6 +3876,12 @@ class AsyncFirecrawlApp(FirecrawlApp): if experimental_stream is not None: params['__experimental_stream'] = experimental_stream + params = GenerateLLMsTextParams( + maxUrls=max_urls, + showFullText=show_full_text, + __experimental_stream=experimental_stream + ) + headers = self._prepare_headers() json_data = {'url': url, **params.dict(exclude_none=True)} json_data['origin'] = f"python-sdk@{version}" From 91ebd140e5a6481b88fe03065cb7580e27165a39 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 18 Apr 2025 13:37:34 -0700 Subject: [PATCH 3/5] version bump --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index ac4a6eca..437f7fc5 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa -__version__ = "2.0.2" +__version__ = "2.0.3" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From c7df80e2a86afc5337da735c668b357d65b641b9 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 13:42:21 -0700 Subject: [PATCH 4/5] Update __init__.py --- apps/python-sdk/firecrawl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 437f7fc5..214163e5 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa -__version__ = "2.0.3" +__version__ = "2.1.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 2c72097c3f373d35cf655d2b77ce4ad03aedb806 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 18 Apr 2025 13:44:16 -0700 Subject: [PATCH 5/5] Nick: --- 
apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 214163e5..0c9bffcf 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa -__version__ = "2.1.0" +__version__ = "2.1.1" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 8f12e781..b97945dc 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -3153,7 +3153,6 @@ class AsyncFirecrawlApp(FirecrawlApp): response = await self._async_post_request( f'{self.api_url}/v1/crawl', params_dict, headers) - print(response) if response.get('success'): try: id = response.get('id')
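
A minimal usage sketch of the renamed ScrapeOptions model and the new "changeTracking"
format introduced by this series (illustrative only, not part of the patches: the URL is a
placeholder and the api_key constructor argument is assumed from the SDK's existing examples):

    from firecrawl import FirecrawlApp, ScrapeOptions

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

    # Single-page scrape requesting markdown plus change-tracking data
    doc = app.scrape_url("https://firecrawl.dev", formats=["markdown", "changeTracking"])
    if doc.changeTracking is not None:
        # changeStatus is one of "new" | "same" | "changed" | "removed"
        print(doc.changeTracking.changeStatus)

    # Crawl, passing per-page settings via scrape_options (formerly CommonOptions)
    crawl_status = app.crawl_url(
        "https://firecrawl.dev",
        scrape_options=ScrapeOptions(formats=["markdown", "links"]),
    )
    print(crawl_status)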