From f25edc4474b6ed1c1ff1580f00c8c99a9248b755 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 9 May 2025 15:45:32 +0000
Subject: [PATCH 1/5] Fix deep_research method to convert camelCase to
 snake_case and add dot notation access

Co-Authored-By: Nicolas Camara
---
 apps/python-sdk/firecrawl/__init__.py  |  2 +-
 apps/python-sdk/firecrawl/firecrawl.py | 48 ++++++++------
 apps/python-sdk/firecrawl/utils.py     | 87 ++++++++++++++++++++++++++
 apps/python-sdk/test_deep_research.py  | 29 +++++++++
 4 files changed, 147 insertions(+), 19 deletions(-)
 create mode 100644 apps/python-sdk/firecrawl/utils.py
 create mode 100644 apps/python-sdk/test_deep_research.py

diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index 4031da9f..3c7c8b3b 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions  # noqa
 
-__version__ = "2.5.4"
+__version__ = "2.6.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 1cf62cf7..006c017d 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -23,6 +23,7 @@ import websockets
 import aiohttp
 import asyncio
 from pydantic import Field
+from .utils import convert_dict_keys_to_snake_case, convert_to_dot_dict
 
 # Suppress Pydantic warnings about attribute shadowing
 warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
@@ -2270,8 +2271,9 @@ class FirecrawlApp:
             * status (str): Current state (processing/completed/failed)
             * error (Optional[str]): Error message if failed
             * id (str): Unique identifier for the research job
-            * data (Any): Research findings and analysis
-            * sources (List[Dict]): List of discovered sources
+            * data (Any): Research findings and analysis with dot notation access
+                * final_analysis (str): Final analysis of the research (converted from camelCase)
+            * sources (List[Dict]): List of discovered sources
             * activities (List[Dict]): Research progress log
             * summaries (List[str]): Generated research summaries
 
@@ -2301,38 +2303,41 @@
             analysis_prompt=analysis_prompt,
             system_prompt=system_prompt
         )
-        if not response.get('success') or 'id' not in response:
-            return response
+
+        dot_dict_response = convert_to_dot_dict(response)
+
+        if not dot_dict_response.get('success') or 'id' not in dot_dict_response:
+            return dot_dict_response
 
-        job_id = response['id']
+        job_id = dot_dict_response.id
         last_activity_count = 0
         last_source_count = 0
 
         while True:
             status = self.check_deep_research_status(job_id)
 
-            if on_activity and 'activities' in status:
-                new_activities = status['activities'][last_activity_count:]
+            if on_activity and hasattr(status, 'activities'):
+                new_activities = status.activities[last_activity_count:]
                 for activity in new_activities:
                     on_activity(activity)
-                last_activity_count = len(status['activities'])
+                last_activity_count = len(status.activities)
 
-            if on_source and 'sources' in status:
-                new_sources = status['sources'][last_source_count:]
+            if on_source and hasattr(status, 'sources'):
+                new_sources = status.sources[last_source_count:]
                 for source in new_sources:
                     on_source(source)
-                last_source_count = len(status['sources'])
+                last_source_count = len(status.sources)
 
-            if status['status'] == 'completed':
+            if status.status == 'completed':
                 return status
-            elif status['status'] == 'failed':
+            elif status.status == 'failed':
                 raise Exception(f'Deep research failed. Error: {status.get("error")}')
-            elif status['status'] != 'processing':
+            elif status.status != 'processing':
                 break
 
             time.sleep(2)  # Polling interval
 
-        return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
+        return convert_to_dot_dict({'success': False, 'error': 'Deep research job terminated unexpectedly'})
 
     def async_deep_research(
         self,
@@ -2422,8 +2427,9 @@ class FirecrawlApp:
         Results:
             * id - Unique identifier for the research job
-            * data - Research findings and analysis
-            * sources - List of discovered sources
+            * data - Research findings and analysis with dot notation access
+                * final_analysis - Final analysis of the research (converted from camelCase)
+            * sources - List of discovered sources
             * activities - Research progress log
             * summaries - Generated research summaries
 
@@ -2435,7 +2441,13 @@
         response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers)
         if response.status_code == 200:
             try:
-                return response.json()
+                json_response = response.json()
+
+                snake_case_response = convert_dict_keys_to_snake_case(json_response)
+
+                dot_dict_response = convert_to_dot_dict(snake_case_response)
+
+                return dot_dict_response
             except:
                 raise Exception('Failed to parse Firecrawl response as JSON.')
         elif response.status_code == 404:
diff --git a/apps/python-sdk/firecrawl/utils.py b/apps/python-sdk/firecrawl/utils.py
new file mode 100644
index 00000000..2a40cfc7
--- /dev/null
+++ b/apps/python-sdk/firecrawl/utils.py
@@ -0,0 +1,87 @@
+"""
+Utility functions for the Firecrawl SDK.
+"""
+import re
+from typing import Any, Dict, List, Union
+
+
+def camel_to_snake(name: str) -> str:
+    """
+    Convert a camelCase string to snake_case.
+
+    Args:
+        name (str): The camelCase string to convert.
+
+    Returns:
+        str: The snake_case string.
+    """
+    if not name:
+        return name
+
+    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
+    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
+
+
+def convert_dict_keys_to_snake_case(data: Any) -> Any:
+    """
+    Recursively convert all dictionary keys from camelCase to snake_case.
+
+    Args:
+        data (Any): The data to convert. Can be a dictionary, list, or primitive type.
+
+    Returns:
+        Any: The converted data with snake_case keys.
+    """
+    if isinstance(data, dict):
+        return {camel_to_snake(k): convert_dict_keys_to_snake_case(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [convert_dict_keys_to_snake_case(item) for item in data]
+    else:
+        return data
+
+
+class DotDict(dict):
+    """
+    A dictionary that supports dot notation access to its items.
+
+    Example:
+        >>> d = DotDict({'foo': 'bar'})
+        >>> d.foo
+        'bar'
+        >>> d['foo']
+        'bar'
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        for key, value in self.items():
+            if isinstance(value, dict):
+                self[key] = DotDict(value)
+            elif isinstance(value, list):
+                self[key] = [DotDict(item) if isinstance(item, dict) else item for item in value]
+
+    def __getattr__(self, key):
+        try:
+            return self[key]
+        except KeyError:
+            raise AttributeError(f"'DotDict' object has no attribute '{key}'")
+
+    def __setattr__(self, key, value):
+        self[key] = value
+
+
+def convert_to_dot_dict(data: Union[Dict, List, Any]) -> Union[DotDict, List, Any]:
+    """
+    Convert a dictionary or list of dictionaries to DotDict objects.
+
+    Args:
+        data (Union[Dict, List, Any]): The data to convert.
+
+    Returns:
+        Union[DotDict, List, Any]: The converted data with DotDict objects.
+    """
+    if isinstance(data, dict):
+        return DotDict(data)
+    elif isinstance(data, list):
+        return [convert_to_dot_dict(item) for item in data]
+    else:
+        return data
diff --git a/apps/python-sdk/test_deep_research.py b/apps/python-sdk/test_deep_research.py
new file mode 100644
index 00000000..5f094a3f
--- /dev/null
+++ b/apps/python-sdk/test_deep_research.py
@@ -0,0 +1,29 @@
+"""
+Test script for the deep_research method with the new camelCase to snake_case conversion.
+"""
+from firecrawl import FirecrawlApp
+
+firecrawl = FirecrawlApp(api_key="your_api_key")
+
+
+def on_activity(activity):
+    print(f"[{activity.type}] {activity.message}")
+
+results = firecrawl.deep_research(
+    query="What are the latest developments in quantum computing?",
+    max_depth=5,
+    time_limit=180,
+    max_urls=15,
+    on_activity=on_activity
+)
+
+print(f"Final Analysis: {results.data.final_analysis}")
+print(f"Sources: {len(results.data.sources)} references")
+
+print("\nAll available fields in the response:")
+print(f"Success: {results.success}")
+print(f"Status: {results.status}")
+print(f"Current Depth: {results.current_depth}")
+print(f"Max Depth: {results.max_depth}")
+print(f"Activities: {len(results.activities)} activities")
+print(f"Summaries: {len(results.summaries)} summaries")
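
Taken together, the two helpers this patch introduces behave like the following sketch (the camelCase payload is illustrative, not a captured API response):

    from firecrawl.utils import convert_dict_keys_to_snake_case, convert_to_dot_dict

    # Hypothetical payload of the shape the deep-research endpoint returns
    raw = {"finalAnalysis": "...", "currentDepth": 2, "sources": [{"url": "https://example.com"}]}

    data = convert_to_dot_dict(convert_dict_keys_to_snake_case(raw))
    assert data.final_analysis == "..."                   # keys are snake_case now
    assert data.sources[0].url == "https://example.com"   # nested dicts become DotDicts too

Because DotDict.__init__ recursively wraps nested dicts and lists of dicts, dot access works at every level of the response, while plain ['key'] indexing keeps working unchanged.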
From 99ec770f4dd59dd5c0a28ba0d3ebab4ea9a8f0dd Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 9 May 2025 15:50:10 +0000
Subject: [PATCH 2/5] Add type hints for deep_research response structure

Co-Authored-By: Nicolas Camara
---
 apps/python-sdk/firecrawl/firecrawl.py |  4 +--
 apps/python-sdk/firecrawl/utils.py     | 42 +++++++++++++++++++++-----
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 006c017d..89f3524a 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -23,7 +23,7 @@ import websockets
 import aiohttp
 import asyncio
 from pydantic import Field
-from .utils import convert_dict_keys_to_snake_case, convert_to_dot_dict
+from .utils import convert_dict_keys_to_snake_case, convert_to_dot_dict, DeepResearchResponse, DeepResearchData, DeepResearchDataSource
 
 # Suppress Pydantic warnings about attribute shadowing
 warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
@@ -2250,7 +2250,7 @@ class FirecrawlApp:
             system_prompt: Optional[str] = None,
             __experimental_stream_steps: Optional[bool] = None,
             on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
-            on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
+            on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> Union[DotDict[DeepResearchResponse], Dict[str, Any]]:
         """
         Initiates a deep research operation on a given query and polls until completion.
 
diff --git a/apps/python-sdk/firecrawl/utils.py b/apps/python-sdk/firecrawl/utils.py
index 2a40cfc7..a6d46363 100644
--- a/apps/python-sdk/firecrawl/utils.py
+++ b/apps/python-sdk/firecrawl/utils.py
@@ -2,7 +2,35 @@
 Utility functions for the Firecrawl SDK.
 """
 import re
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Union, TypeVar, Generic, Optional, TypedDict
+
+
+T = TypeVar('T')
+
+
+class DeepResearchDataSource(TypedDict, total=False):
+    """Type definition for a source in deep research data."""
+    url: str
+    title: str
+    content: str
+    summary: str
+
+
+class DeepResearchData(TypedDict, total=False):
+    """Type definition for deep research data."""
+    final_analysis: str
+    sources: List[DeepResearchDataSource]
+
+
+class DeepResearchResponse(TypedDict, total=False):
+    """Type definition for deep research response."""
+    success: bool
+    status: str
+    current_depth: int
+    max_depth: int
+    activities: List[Dict[str, Any]]
+    summaries: List[str]
+    data: DeepResearchData
 
 
 def camel_to_snake(name: str) -> str:
@@ -40,7 +68,7 @@ def convert_dict_keys_to_snake_case(data: Any) -> Any:
         return data
 
 
-class DotDict(dict):
+class DotDict(dict, Generic[T]):
     """
     A dictionary that supports dot notation access to its items.
 
@@ -59,25 +87,25 @@ class DotDict(dict):
         elif isinstance(value, list):
             self[key] = [DotDict(item) if isinstance(item, dict) else item for item in value]
 
-    def __getattr__(self, key):
+    def __getattr__(self, key: str) -> Any:
         try:
             return self[key]
         except KeyError:
             raise AttributeError(f"'DotDict' object has no attribute '{key}'")
 
-    def __setattr__(self, key, value):
+    def __setattr__(self, key: str, value: Any) -> None:
         self[key] = value
 
 
-def convert_to_dot_dict(data: Union[Dict, List, Any]) -> Union[DotDict, List, Any]:
+def convert_to_dot_dict(data: Union[Dict[str, Any], List[Any], Any]) -> Union[DotDict[Any], List[Any], Any]:
     """
     Convert a dictionary or list of dictionaries to DotDict objects.
 
     Args:
-        data (Union[Dict, List, Any]): The data to convert.
+        data (Union[Dict[str, Any], List[Any], Any]): The data to convert.
 
     Returns:
-        Union[DotDict, List, Any]: The converted data with DotDict objects.
+        Union[DotDict[Any], List[Any], Any]: The converted data with DotDict objects.
     """
     if isinstance(data, dict):
         return DotDict(data)

From 054d99c1af25e13428ac41c5bd724c327c9f5338 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Fri, 9 May 2025 15:51:12 +0000
Subject: [PATCH 3/5] Update check_deep_research_status return type annotation

Co-Authored-By: Nicolas Camara
---
 apps/python-sdk/firecrawl/firecrawl.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 89f3524a..6961c44a 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -23,7 +23,7 @@ import websockets
 import aiohttp
 import asyncio
 from pydantic import Field
-from .utils import convert_dict_keys_to_snake_case, convert_to_dot_dict, DeepResearchResponse, DeepResearchData, DeepResearchDataSource
+from .utils import convert_dict_keys_to_snake_case, convert_to_dot_dict, DotDict, DeepResearchResponse, DeepResearchData, DeepResearchDataSource
 
 # Suppress Pydantic warnings about attribute shadowing
 warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
@@ -2410,7 +2410,7 @@ class FirecrawlApp:
 
         return {'success': False, 'error': 'Internal server error'}
 
-    def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
+    def check_deep_research_status(self, id: str) -> Union[DotDict[DeepResearchResponse], Dict[str, Any]]:
         """
         Check the status of a deep research operation.
From 476e042496a52cdf7c019c43727b70a0900c0d84 Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Fri, 9 May 2025 18:32:05 -0300
Subject: [PATCH 4/5] Enhance Firecrawl SDK: Add LocationConfig to __init__.py
 and update ScrapeOptions formats to remove 'content' type. Introduce new E2E
 tests for async and standard scraping functionalities, including batch
 scraping and URL mapping.

---
 apps/python-sdk/firecrawl/__init__.py          |   2 +-
 apps/python-sdk/firecrawl/firecrawl.py         |  20 +--
 .../e2e/async/test_async_batch_scrape.py       |  18 ++
 .../tests/e2e/async/test_async_crawl.py        |  16 ++
 .../tests/e2e/async/test_batch_scrape.py       | 117 +++++++++++++
 .../e2e/async/test_check_crawl_status.py       |  16 ++
 apps/python-sdk/tests/e2e/async/test_crawl.py  |  16 ++
 .../tests/e2e/async/test_deep_research.py      |  16 ++
 .../tests/e2e/async/test_extract.py            |  16 ++
 .../e2e/async/test_generate_llms_text.py       |  16 ++
 apps/python-sdk/tests/e2e/async/test_map.py    |  40 +++++
 .../python-sdk/tests/e2e/async/test_scrape.py  | 158 ++++++++++++++++++
 .../tests/e2e/test_async_batch_scrape.py       |  15 ++
 apps/python-sdk/tests/e2e/test_async_crawl.py  |  15 ++
 .../python-sdk/tests/e2e/test_batch_scrape.py  |  15 ++
 .../tests/e2e/test_check_crawl_status.py       |  15 ++
 apps/python-sdk/tests/e2e/test_crawl.py        |  15 ++
 .../tests/e2e/test_deep_research.py            |  15 ++
 apps/python-sdk/tests/e2e/test_extract.py      |  15 ++
 .../tests/e2e/test_generate_llms_text.py       |  15 ++
 apps/python-sdk/tests/e2e/test_map.py          |  36 ++++
 apps/python-sdk/tests/e2e/test_scrape.py       | 154 +++++++++++++++++
 22 files changed, 750 insertions(+), 11 deletions(-)
 create mode 100644 apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py
 create mode 100644 apps/python-sdk/tests/e2e/async/test_async_crawl.py
 create mode 100644 apps/python-sdk/tests/e2e/async/test_batch_scrape.py
 create mode 100644 apps/python-sdk/tests/e2e/async/test_check_crawl_status.py
 create mode 100644 apps/python-sdk/tests/e2e/async/test_crawl.py
 create mode 100644 apps/python-sdk/tests/e2e/async/test_deep_research.py
 create mode 100644 apps/python-sdk/tests/e2e/async/test_extract.py
 create mode 100644 apps/python-sdk/tests/e2e/async/test_generate_llms_text.py
 create mode 100644 apps/python-sdk/tests/e2e/async/test_map.py
 create mode 100644 apps/python-sdk/tests/e2e/async/test_scrape.py
 create mode 100644 apps/python-sdk/tests/e2e/test_async_batch_scrape.py
 create mode 100644 apps/python-sdk/tests/e2e/test_async_crawl.py
 create mode 100644 apps/python-sdk/tests/e2e/test_batch_scrape.py
 create mode 100644 apps/python-sdk/tests/e2e/test_check_crawl_status.py
 create mode 100644 apps/python-sdk/tests/e2e/test_crawl.py
 create mode 100644 apps/python-sdk/tests/e2e/test_deep_research.py
 create mode 100644 apps/python-sdk/tests/e2e/test_extract.py
 create mode 100644 apps/python-sdk/tests/e2e/test_generate_llms_text.py
 create mode 100644 apps/python-sdk/tests/e2e/test_map.py
 create mode 100644 apps/python-sdk/tests/e2e/test_scrape.py

diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index 3c7c8b3b..a4548185 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/
 import logging
 import os
 
-from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions  # noqa
+from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions, LocationConfig  # noqa
 
 __version__ = "2.6.0"
 
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 6961c44a..1df2fcc9 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -144,7 +144,7 @@ class ChangeTrackingOptions(pydantic.BaseModel):
 
 class ScrapeOptions(pydantic.BaseModel):
     """Parameters for scraping operations."""
-    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
+    formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
     headers: Optional[Dict[str, str]] = None
     includeTags: Optional[List[str]] = None
     excludeTags: Optional[List[str]] = None
@@ -448,7 +448,7 @@
         self,
         url: str,
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
@@ -470,7 +470,7 @@
 
         Args:
             url (str): Target URL to scrape
-            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
             include_tags (Optional[List[str]]): HTML tags to include
             exclude_tags (Optional[List[str]]): HTML tags to exclude
             only_main_content (Optional[bool]): Extract main content only
@@ -1179,7 +1179,7 @@
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
         headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
@@ -1313,7 +1313,7 @@
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
         headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
@@ -1445,7 +1445,7 @@
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
        headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
@@ -2844,7 +2844,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         self,
         url: str,
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
         only_main_content: Optional[bool] = None,
@@ -2865,7 +2865,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
         Args:
             url (str): Target URL to scrape
-            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
             include_tags (Optional[List[str]]): HTML tags to include
             exclude_tags (Optional[List[str]]): HTML tags to exclude
             only_main_content (Optional[bool]): Extract main content only
@@ -2972,7 +2972,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
         headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
@@ -3111,7 +3111,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
         headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
diff --git a/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py b/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py
new file mode 100644
index 00000000..ce366a40
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py
@@ -0,0 +1,18 @@
+import os
+import pytest
+import sys
+
+from dotenv import load_dotenv
+from firecrawl.firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_async_batch_scrape_urls_async_placeholder():
+    # TODO: Implement async E2E tests for async_batch_scrape_urls
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_async_crawl.py b/apps/python-sdk/tests/e2e/async/test_async_crawl.py
new file mode 100644
index 00000000..02d05f01
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_async_crawl.py
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_async_crawl_url_async_placeholder():
+    # TODO: Implement async E2E tests for async_crawl_url
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_batch_scrape.py b/apps/python-sdk/tests/e2e/async/test_batch_scrape.py
new file mode 100644
index 00000000..a5dde5d5
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_batch_scrape.py
@@ -0,0 +1,117 @@
+import sys
+import os
+import subprocess
+import time
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
+from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URLS = [
+    "https://example.com",
+    "https://iana.org"
+]
+
+@pytest.mark.asyncio
+async def test_batch_scrape_urls_async_simple():
+    result = await app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown"]
+    )
+    assert result.success
+    assert hasattr(result, "data")
+    assert len(result.data) == len(TEST_URLS)
+    assert any("Example Domain" in doc.markdown for doc in result.data)
+
+@pytest.mark.asyncio
+async def test_batch_scrape_urls_async_all_params_with_extract():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    result = await app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions
+    )
+    assert result.success
+    assert hasattr(result, "data")
+
+    assert len(result.data) == len(TEST_URLS)
+    for doc in result.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "rawHtml")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "extract")
+        assert hasattr(doc, "metadata")
+
+@pytest.mark.asyncio
+async def test_batch_scrape_urls_async_all_params_with_json_options():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    result = await app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    assert result.success
+    assert hasattr(result, "data")
+    assert len(result.data) == len(TEST_URLS)
+    for doc in result.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "rawHtml")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "json")
+        assert hasattr(doc, "metadata")
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_check_crawl_status.py b/apps/python-sdk/tests/e2e/async/test_check_crawl_status.py
new file mode 100644
index 00000000..ac929fde
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_check_crawl_status.py
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_check_crawl_status_async_placeholder():
+    # TODO: Implement async E2E tests for check_crawl_status
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_crawl.py b/apps/python-sdk/tests/e2e/async/test_crawl.py
new file mode 100644
index 00000000..fec46b1a
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_crawl.py
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_crawl_url_async_placeholder():
+    # TODO: Implement async E2E tests for crawl_url
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_deep_research.py b/apps/python-sdk/tests/e2e/async/test_deep_research.py
new file mode 100644
index 00000000..66cb51d5
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_deep_research.py
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_deep_research_async_placeholder():
+    # TODO: Implement async E2E tests for deep_research
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_extract.py b/apps/python-sdk/tests/e2e/async/test_extract.py
new file mode 100644
index 00000000..4a059f13
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_extract.py
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_extract_async_placeholder():
+    # TODO: Implement async E2E tests for extract
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py b/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py
new file mode 100644
index 00000000..00d6b987
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_generate_llms_text_async_placeholder():
+    # TODO: Implement async E2E tests for generate_llms_text
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_map.py b/apps/python-sdk/tests/e2e/async/test_map.py
new file mode 100644
index 00000000..b034e9f0
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_map.py
@@ -0,0 +1,40 @@
+import sys
+import os
+import subprocess
+import time
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
+from firecrawl.firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URL = "example.com"
+
+@pytest.mark.asyncio
+async def test_map_url_async_simple():
+    result = await app.map_url(TEST_URL)
+    assert result is not None
+    assert result.success
+    assert hasattr(result, "links")
+    assert any("example.com" in url for url in result.links)
+
+@pytest.mark.asyncio
+async def test_map_url_async_all_params():
+    result = await app.map_url(
+        TEST_URL,
+        search="test",
+        sitemap_only=False,
+        include_subdomains=False,
+        limit=10
+    )
+    assert result is not None
+    assert result.success
+    assert hasattr(result, "links")
+    assert any("example.com" in url for url in result.links)
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_scrape.py b/apps/python-sdk/tests/e2e/async/test_scrape.py
new file mode 100644
index 00000000..ae0e0a46
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/async/test_scrape.py
@@ -0,0 +1,158 @@
+import sys
+import os
+import subprocess
+import time
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
+from firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions
+from firecrawl.firecrawl import WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URL = 'example.com'
+
+@pytest.mark.asyncio
+async def test_scrape_url_async_simple():
+    response = await app.scrape_url(
+        TEST_URL,
+        formats=["markdown"]
+    )
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert "# Example Domain" in response.markdown
+
+@pytest.mark.asyncio
+async def test_scrape_url_async_all_params_with_extract_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = await app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "extract")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
+
+@pytest.mark.asyncio
+async def test_scrape_url_async_all_params_with_extract_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = await app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "extract")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
+
+@pytest.mark.asyncio
+async def test_scrape_url_async_all_params_with_json_options():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = await app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "json")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_async_batch_scrape.py b/apps/python-sdk/tests/e2e/test_async_batch_scrape.py
new file mode 100644
index 00000000..d3fc8c6c
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_async_batch_scrape.py
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_async_batch_scrape_urls_placeholder():
+    # TODO: Implement E2E tests for async_batch_scrape_urls
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_async_crawl.py b/apps/python-sdk/tests/e2e/test_async_crawl.py
new file mode 100644
index 00000000..a7b3dc06
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_async_crawl.py
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_async_crawl_url_placeholder():
+    # TODO: Implement E2E tests for async_crawl_url
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_batch_scrape.py b/apps/python-sdk/tests/e2e/test_batch_scrape.py
new file mode 100644
index 00000000..ff8aa0af
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_batch_scrape.py
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_batch_scrape_urls_placeholder():
+    # TODO: Implement E2E tests for batch_scrape_urls
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_check_crawl_status.py b/apps/python-sdk/tests/e2e/test_check_crawl_status.py
new file mode 100644
index 00000000..22b1a4d4
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_check_crawl_status.py
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_check_crawl_status_placeholder():
+    # TODO: Implement E2E tests for check_crawl_status
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_crawl.py b/apps/python-sdk/tests/e2e/test_crawl.py
new file mode 100644
index 00000000..bf758919
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_crawl.py
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_crawl_url_placeholder():
+    # TODO: Implement E2E tests for crawl_url
+    assert True
\ No newline at end of file
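
The scrape and batch tests in this patch all drive the page through the SDK's typed action models. A condensed sketch of that pattern (the key and URL are placeholders, and the script is simplified from the wrapper function the tests inject):

    from firecrawl import FirecrawlApp
    from firecrawl.firecrawl import WaitAction, ScreenshotAction, ExecuteJavascriptAction

    app = FirecrawlApp(api_url="http://localhost:3002", api_key="fc-YOUR-KEY")
    actions = [
        WaitAction(type="wait", milliseconds=500),                          # settle the page
        ScreenshotAction(type="screenshot", fullPage=True),                 # capture full page
        ExecuteJavascriptAction(type="executeJavascript", script="document.title"),
    ]
    response = app.scrape_url("https://example.com", formats=["markdown", "screenshot"], actions=actions)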
diff --git a/apps/python-sdk/tests/e2e/test_deep_research.py b/apps/python-sdk/tests/e2e/test_deep_research.py
new file mode 100644
index 00000000..463aea64
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_deep_research.py
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_deep_research_placeholder():
+    # TODO: Implement E2E tests for deep_research
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_extract.py b/apps/python-sdk/tests/e2e/test_extract.py
new file mode 100644
index 00000000..4e925041
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_extract.py
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_extract_placeholder():
+    # TODO: Implement E2E tests for extract
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_generate_llms_text.py b/apps/python-sdk/tests/e2e/test_generate_llms_text.py
new file mode 100644
index 00000000..fe9928e1
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_generate_llms_text.py
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_generate_llms_text_placeholder():
+    # TODO: Implement E2E tests for generate_llms_text
+    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_map.py b/apps/python-sdk/tests/e2e/test_map.py
new file mode 100644
index 00000000..da9bb4d5
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_map.py
@@ -0,0 +1,36 @@
+import sys
+import os
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl.firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URL = "example.com"
+
+def test_map_url_simple():
+    result = app.map_url(TEST_URL)
+    assert result is not None
+    assert result.success
+    assert hasattr(result, "links")
+    assert any("example.com" in url for url in result.links)
+
+def test_map_url_all_params():
+    result = app.map_url(
+        TEST_URL,
+        search="test",
+        sitemap_only=False,
+        include_subdomains=False,
+        limit=10
+    )
+    assert result is not None
+    assert result.success
+    assert hasattr(result, "links")
+    assert any("example.com" in url for url in result.links)
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_scrape.py b/apps/python-sdk/tests/e2e/test_scrape.py
new file mode 100644
index 00000000..f20a80f7
--- /dev/null
+++ b/apps/python-sdk/tests/e2e/test_scrape.py
@@ -0,0 +1,154 @@
+import sys
+import os
+import subprocess
+import time
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl import FirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions
+from firecrawl.firecrawl import WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URL = 'example.com'
+
+def test_scrape_url_simple():
+    response = app.scrape_url(
+        TEST_URL,
+        formats=["markdown"]
+    )
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert "# Example Domain" in response.markdown
+
+def test_scrape_url_all_params_with_extract_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "extract")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
+
+def test_scrape_url_all_params_with_extract_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "extract")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
+
+def test_scrape_url_all_params_with_json_options():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "json")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
\ No newline at end of file
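
The final patch below replaces the placeholder tests with real end-to-end flows built around a small completion-polling helper. Its core pattern, reproduced here for orientation, is simply:

    import asyncio

    async def wait_for_batch_completion(app, job_id, timeout=60):
        # Poll once per second until the batch job reports completion.
        for _ in range(timeout):
            status = await app.check_batch_scrape_status(job_id)
            if status.status == "completed":
                return status
            await asyncio.sleep(1)
        raise TimeoutError("Batch scrape did not complete in time")

An analogous wait_for_crawl_completion helper drives the crawl tests via check_crawl_status.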
From 6d8033402c7a7dbb26da6589964a30cd3aec0e43 Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 14 May 2025 15:41:39 -0300
Subject: [PATCH 5/5] e2e tests for python methods

---
 .../e2e/async/test_async_batch_scrape.py       | 122 +++++++++++++++++-
 .../tests/e2e/async/test_async_crawl.py        |  86 +++++++++++-
 .../tests/e2e/async/test_batch_scrape.py       |  12 +-
 .../e2e/async/test_check_crawl_status.py       |  16 ---
 apps/python-sdk/tests/e2e/async/test_crawl.py  |  86 +++++++++++-
 .../tests/e2e/async/test_deep_research.py      |  40 +++++-
 .../tests/e2e/async/test_extract.py            |  91 ++++++++++++-
 .../e2e/async/test_generate_llms_text.py       |  35 ++++-
 .../python-sdk/tests/e2e/async/test_scrape.py  |  24 ++--
 .../tests/e2e/test_async_batch_scrape.py       | 119 ++++++++++++++++-
 apps/python-sdk/tests/e2e/test_async_crawl.py  |  84 +++++++++++-
 .../python-sdk/tests/e2e/test_batch_scrape.py  | 119 ++++++++++++++++-
 .../tests/e2e/test_check_crawl_status.py       |  15 ---
 apps/python-sdk/tests/e2e/test_crawl.py        |  73 ++++++++++-
 .../tests/e2e/test_deep_research.py            |  39 +++++-
 apps/python-sdk/tests/e2e/test_extract.py      |  87 ++++++++++++-
 .../tests/e2e/test_generate_llms_text.py       |  34 ++++-
 apps/python-sdk/tests/e2e/test_scrape.py       |  16 +--
 18 files changed, 987 insertions(+), 111 deletions(-)
 delete mode 100644 apps/python-sdk/tests/e2e/async/test_check_crawl_status.py
 delete mode 100644 apps/python-sdk/tests/e2e/test_check_crawl_status.py

diff --git a/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py b/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py
index ce366a40..f28a525c 100644
--- a/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py
+++ b/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py
@@ -1,9 +1,11 @@
 import os
 import pytest
 import sys
-
 from dotenv import load_dotenv
-from firecrawl.firecrawl import AsyncFirecrawlApp
+import asyncio
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
+from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
 
 load_dotenv()
 
@@ -12,7 +14,117 @@ API_KEY = os.getenv("TEST_API_KEY")
 
 app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
+TEST_URLS = [
+    "https://example.com",
+    "https://www.iana.org"
+]
+
+async def wait_for_batch_completion(app, job_id, timeout=60):
+    for _ in range(timeout):
+        status = await app.check_batch_scrape_status(job_id)
+        if status.status == "completed":
+            return status
+        await asyncio.sleep(1)
+    raise TimeoutError("Batch scrape did not complete in time")
+
 @pytest.mark.asyncio
-async def test_async_batch_scrape_urls_async_placeholder():
-    # TODO: Implement async E2E tests for async_batch_scrape_urls
-    assert True
\ No newline at end of file
+async def test_async_batch_scrape_urls_simple():
+    job = await app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown"]
+    )
+    assert job.id is not None
+    status = await wait_for_batch_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) == len(TEST_URLS)
+    assert any("Example Domain" in doc.markdown for doc in status.data)
+    assert any("Internet Assigned Numbers" in doc.markdown for doc in status.data)
+
+@pytest.mark.asyncio
+async def test_async_batch_scrape_urls_all_params_with_extract():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+    ]
+    job = await app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions
+    )
+    assert job.id is not None
+    status = await wait_for_batch_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) == len(TEST_URLS)
+    for doc in status.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "raw_html")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "extract")
+        assert hasattr(doc, "metadata")
+
+@pytest.mark.asyncio
+async def test_async_batch_scrape_urls_all_params_with_json_options():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+    ]
+    job = await app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    assert job.id is not None
+    status = await wait_for_batch_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) == len(TEST_URLS)
+    for doc in status.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "raw_html")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "json")
+        assert hasattr(doc, "metadata")
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/async/test_async_crawl.py b/apps/python-sdk/tests/e2e/async/test_async_crawl.py
index 02d05f01..634a06e7 100644
--- a/apps/python-sdk/tests/e2e/async/test_async_crawl.py
+++ b/apps/python-sdk/tests/e2e/async/test_async_crawl.py
@@ -1,7 +1,11 @@
 import os
+import sys
 import pytest
+import asyncio
 from dotenv import load_dotenv
-from firecrawl import AsyncFirecrawlApp
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
+from firecrawl.firecrawl import AsyncFirecrawlApp, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, ScrapeOptions
 
 load_dotenv()
 
@@ -10,7 +14,81 @@ API_KEY = os.getenv("TEST_API_KEY")
 
 app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
+TEST_URL = "https://example.com"
+
+async def wait_for_crawl_completion(app, job_id, timeout=60):
+    for _ in range(timeout):
+        status = await app.check_crawl_status(job_id)
+        if status.status == "completed":
+            return status
+        await asyncio.sleep(1)
+    raise TimeoutError("Crawl did not complete in time")
+
 @pytest.mark.asyncio
-async def test_async_crawl_url_async_placeholder():
-    # TODO: Implement async E2E tests for async_crawl_url
-    assert True
\ No newline at end of file
+async def test_async_crawl_url_simple():
+    job = await app.async_crawl_url(
+        TEST_URL,
+        limit=1,
+        scrape_options=ScrapeOptions(formats=["markdown"])
+    )
+    assert job.id is not None
+    status = await wait_for_crawl_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) >= 1
+    assert any("Example Domain" in doc.markdown for doc in status.data)
+
+@pytest.mark.asyncio
+async def test_async_crawl_url_all_params():
+    location = LocationConfig(country="us", languages=["en"])
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+    ]
+    scrape_options = ScrapeOptions(
+        formats=["markdown", "html", "raw_html", "links", "screenshot"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        actions=actions
+    )
+    job = await app.async_crawl_url(
+        TEST_URL,
+        include_paths=["/"],
+        exclude_paths=["/notarealpage"],
+        max_depth=2,
+        max_discovery_depth=2,
+        limit=2,
+        allow_backward_links=True,
+        allow_external_links=True,
+        ignore_sitemap=True,
+        scrape_options=scrape_options,
+        deduplicate_similar_urls=True,
+        ignore_query_parameters=True,
+        regex_on_full_url=True,
+        delay=1
+    )
+    assert job.id is not None
+    status = await wait_for_crawl_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) >= 1
+    for doc in status.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "raw_html")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "metadata")
diff --git a/apps/python-sdk/tests/e2e/async/test_batch_scrape.py b/apps/python-sdk/tests/e2e/async/test_batch_scrape.py
index a5dde5d5..49bddf95 100644
--- a/apps/python-sdk/tests/e2e/async/test_batch_scrape.py
+++ b/apps/python-sdk/tests/e2e/async/test_batch_scrape.py
@@ -42,11 +42,11 @@ async def test_batch_scrape_urls_async_all_params_with_extract():
         WriteAction(type="write", text="test input"),
         PressAction(type="press", key="Enter"),
         ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
     ]
     result = await app.batch_scrape_urls(
         TEST_URLS,
-        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract"],
+        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
         include_tags=["h1", "p"],
         exclude_tags=["footer"],
         only_main_content=True,
@@ -68,7 +68,7 @@
     for doc in result.data:
         assert hasattr(doc, "markdown")
         assert hasattr(doc, "html")
-        assert hasattr(doc, "rawHtml")
+        assert hasattr(doc, "raw_html")
         assert hasattr(doc, "links")
         assert hasattr(doc, "screenshot")
         assert hasattr(doc, "extract")
@@ -85,11 +85,11 @@ async def test_batch_scrape_urls_async_all_params_with_json_options():
         WriteAction(type="write", text="test input"),
         PressAction(type="press", key="Enter"),
         ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
     ]
     result = await app.batch_scrape_urls(
         TEST_URLS,
-        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json"],
+        formats=["markdown", "html", "raw_html", "links", "screenshot", "json"],
         include_tags=["h1", "p"],
         exclude_tags=["footer"],
         only_main_content=True,
@@ -110,7 +110,7 @@
     for doc in result.data:
         assert hasattr(doc, "markdown")
         assert hasattr(doc, "html")
-        assert hasattr(doc, "rawHtml")
+        assert hasattr(doc, "raw_html")
         assert hasattr(doc, "links")
         assert hasattr(doc, "screenshot")
         assert hasattr(doc, "json")
diff --git a/apps/python-sdk/tests/e2e/async/test_check_crawl_status.py b/apps/python-sdk/tests/e2e/async/test_check_crawl_status.py
deleted file mode 100644
index ac929fde..00000000
--- a/apps/python-sdk/tests/e2e/async/test_check_crawl_status.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import os
-import pytest
-from dotenv
import load_dotenv -from firecrawl import AsyncFirecrawlApp - -load_dotenv() - -API_URL = "https://api.firecrawl.dev" -API_KEY = os.getenv("TEST_API_KEY") - -app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) - -@pytest.mark.asyncio -async def test_check_crawl_status_async_placeholder(): - # TODO: Implement async E2E tests for check_crawl_status - assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_crawl.py b/apps/python-sdk/tests/e2e/async/test_crawl.py index fec46b1a..4cf2aa8a 100644 --- a/apps/python-sdk/tests/e2e/async/test_crawl.py +++ b/apps/python-sdk/tests/e2e/async/test_crawl.py @@ -1,7 +1,11 @@ import os +import sys import pytest +import asyncio from dotenv import load_dotenv -from firecrawl import AsyncFirecrawlApp + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) +from firecrawl.firecrawl import AsyncFirecrawlApp, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, ScrapeOptions load_dotenv() @@ -10,7 +14,81 @@ API_KEY = os.getenv("TEST_API_KEY") app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) +TEST_URL = "https://example.com" + +async def wait_for_crawl_completion(app, job_id, timeout=60): + for _ in range(timeout): + status = await app.check_crawl_status(job_id) + if status.status == "completed": + return status + await asyncio.sleep(1) + raise TimeoutError("Crawl did not complete in time") + @pytest.mark.asyncio -async def test_crawl_url_async_placeholder(): - # TODO: Implement async E2E tests for crawl_url - assert True \ No newline at end of file +async def test_crawl_url_async_simple(): + job = await app.async_crawl_url( + TEST_URL, + limit=1, + scrape_options=ScrapeOptions(formats=["markdown"]) + ) + assert job.id is not None + status = await wait_for_crawl_completion(app, job.id) + assert status.success + assert hasattr(status, "data") + assert len(status.data) >= 1 + assert any("Example Domain" in doc.markdown for doc in status.data) + +@pytest.mark.asyncio +async def test_crawl_url_async_all_params(): + location = LocationConfig(country="us", languages=["en"]) + actions = [ + WaitAction(type="wait", milliseconds=500), + ScreenshotAction(type="screenshot", fullPage=True), + WriteAction(type="write", text="test input"), + PressAction(type="press", key="Enter"), + ScrollAction(type="scroll", direction="down"), + ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();") + ] + scrape_options = ScrapeOptions( + formats=["markdown", "html", "raw_html", "links", "screenshot"], + include_tags=["h1", "p"], + exclude_tags=["footer"], + only_main_content=True, + wait_for=1000, + timeout=15000, + location=location, + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy="basic", + actions=actions + ) + job = await app.async_crawl_url( + TEST_URL, + include_paths=["/"], + exclude_paths=["/notarealpage"], + max_depth=2, + max_discovery_depth=2, + limit=2, + allow_backward_links=True, + allow_external_links=True, + ignore_sitemap=True, + scrape_options=scrape_options, + deduplicate_similar_urls=True, + ignore_query_parameters=True, + regex_on_full_url=True, + delay=1 + ) + assert job.id is not None + status = await wait_for_crawl_completion(app, job.id) + assert status.success + assert hasattr(status, "data") + assert len(status.data) >= 1 + for doc in status.data: + assert hasattr(doc, "markdown") + assert hasattr(doc, 
"html") + assert hasattr(doc, "raw_html") + assert hasattr(doc, "links") + assert hasattr(doc, "screenshot") + assert hasattr(doc, "metadata") \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_deep_research.py b/apps/python-sdk/tests/e2e/async/test_deep_research.py index 66cb51d5..663f0702 100644 --- a/apps/python-sdk/tests/e2e/async/test_deep_research.py +++ b/apps/python-sdk/tests/e2e/async/test_deep_research.py @@ -1,7 +1,10 @@ import os +import sys import pytest from dotenv import load_dotenv -from firecrawl import AsyncFirecrawlApp + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) +from firecrawl.firecrawl import AsyncFirecrawlApp load_dotenv() @@ -11,6 +14,35 @@ API_KEY = os.getenv("TEST_API_KEY") app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) @pytest.mark.asyncio -async def test_deep_research_async_placeholder(): - # TODO: Implement async E2E tests for deep_research - assert True \ No newline at end of file +async def test_deep_research_async_simple(): + result = await app.deep_research("What is the capital of France?", max_urls=2) + assert hasattr(result, "status") + assert result.status == "completed" + assert hasattr(result, "success") + assert result.success + assert hasattr(result, "data") + assert result.data is not None + assert any("Paris" in str(result.data) or "France" in str(result.data) for _ in [0]) + +@pytest.mark.asyncio +async def test_deep_research_async_all_params(): + result = await app.deep_research( + "What are the latest advancements in AI?", + max_depth=2, + time_limit=60, + max_urls=3, + analysis_prompt="Summarize the most important recent AI advancements.", + system_prompt="You are an expert AI researcher." + ) + assert hasattr(result, "status") + assert result.status == "completed" + assert hasattr(result, "success") + assert result.success + assert hasattr(result, "data") + assert hasattr(result, "activities") + assert isinstance(result.activities, list) + assert result.data is not None + assert hasattr(result.data, "sources") + assert isinstance(result.data.sources, list) + assert hasattr(result.data, "final_analysis") + assert isinstance(result.data.final_analysis, str) \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_extract.py b/apps/python-sdk/tests/e2e/async/test_extract.py index 4a059f13..22597c2d 100644 --- a/apps/python-sdk/tests/e2e/async/test_extract.py +++ b/apps/python-sdk/tests/e2e/async/test_extract.py @@ -1,7 +1,12 @@ import os +import sys import pytest from dotenv import load_dotenv -from firecrawl import AsyncFirecrawlApp +from pydantic import BaseModel +from typing import List + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) +from firecrawl.firecrawl import AsyncFirecrawlApp load_dotenv() @@ -10,7 +15,85 @@ API_KEY = os.getenv("TEST_API_KEY") app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) +class ExtractSchema(BaseModel): + title: str + description: str + links: List[str] + @pytest.mark.asyncio -async def test_extract_async_placeholder(): - # TODO: Implement async E2E tests for extract - assert True \ No newline at end of file +async def test_extract_async_simple_schema_class(): + result = await app.extract( + ["https://example.com"], + prompt="Extract the title, description, and links from the website", + schema=ExtractSchema + ) + assert result.success + assert result.data is not None + assert hasattr(result.data, "title") + assert hasattr(result.data, "description") + 
assert hasattr(result.data, "links") + assert isinstance(result.data.links, list) + +@pytest.mark.asyncio +async def test_extract_async_simple_schema_dict(): + result = await app.extract( + ["https://example.com"], + prompt="Extract the title, description, and links from the website", + schema=ExtractSchema.schema() + ) + assert result.success + assert result.data is not None + assert hasattr(result.data, "title") + assert hasattr(result.data, "description") + assert hasattr(result.data, "links") + assert isinstance(result.data.links, list) + +@pytest.mark.asyncio +async def test_extract_async_simple_schema_json(): + # Try both model_json_schema (Pydantic v2) and schema_json (Pydantic v1) + schema = None + if hasattr(ExtractSchema, "model_json_schema"): + schema = ExtractSchema.model_json_schema() + elif hasattr(ExtractSchema, "schema_json"): + schema = ExtractSchema.schema_json() + else: + pytest.skip("No JSON schema export method available on ExtractSchema") + result = await app.extract( + ["https://example.com"], + prompt="Extract the title, description, and links from the website", + schema=schema + ) + assert result.success + assert result.data is not None + assert hasattr(result.data, "title") + assert hasattr(result.data, "description") + assert hasattr(result.data, "links") + assert isinstance(result.data.links, list) + +@pytest.mark.asyncio +async def test_extract_async_all_params(): + class AllParamsSchema(BaseModel): + title: str + description: str + links: List[str] + author: str + schema = AllParamsSchema.schema() + result = await app.extract( + ["https://www.iana.org"], + prompt="Extract the title, description, links, and author from the website", + schema=schema, + system_prompt="You are a helpful extraction agent.", + allow_external_links=True, + enable_web_search=True, + show_sources=True, + agent={"model": "FIRE-1"} + ) + assert result.success + assert result.data is not None + assert hasattr(result.data, "title") + assert hasattr(result.data, "description") + assert hasattr(result.data, "links") + assert hasattr(result.data, "author") + assert isinstance(result.data.links, list) + assert hasattr(result, "sources") + assert isinstance(result.sources, list) \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py b/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py index 00d6b987..b05dd31c 100644 --- a/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py +++ b/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py @@ -1,7 +1,10 @@ import os +import sys import pytest from dotenv import load_dotenv -from firecrawl import AsyncFirecrawlApp + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) +from firecrawl.firecrawl import AsyncFirecrawlApp load_dotenv() @@ -11,6 +14,30 @@ API_KEY = os.getenv("TEST_API_KEY") app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) @pytest.mark.asyncio -async def test_generate_llms_text_async_placeholder(): - # TODO: Implement async E2E tests for generate_llms_text - assert True \ No newline at end of file +async def test_generate_llms_text_async_simple(): + result = await app.generate_llms_text("https://example.com") + assert hasattr(result, "status") + assert result.status == "completed" + assert hasattr(result, "data") + assert result.data is not None + assert hasattr(result.data, "llmstxt") + assert isinstance(result.data.llmstxt, str) + assert len(result.data.llmstxt) > 0 + +@pytest.mark.asyncio +async def 
test_generate_llms_text_async_all_params(): + result = await app.generate_llms_text( + "https://www.iana.org", + max_urls=5, + show_full_text=True, + experimental_stream=True + ) + assert hasattr(result, "status") + assert result.status == "completed" + assert hasattr(result, "data") + assert result.data is not None + assert hasattr(result.data, "llmstxt") + assert isinstance(result.data.llmstxt, str) + assert len(result.data.llmstxt) > 0 + assert hasattr(result.data, "llmsfulltxt") + assert result.data.llmsfulltxt is None or isinstance(result.data.llmsfulltxt, str) \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_scrape.py b/apps/python-sdk/tests/e2e/async/test_scrape.py index ae0e0a46..0969937b 100644 --- a/apps/python-sdk/tests/e2e/async/test_scrape.py +++ b/apps/python-sdk/tests/e2e/async/test_scrape.py @@ -40,11 +40,11 @@ async def test_scrape_url_async_all_params_with_extract_screenshot(): WriteAction(type="write", text="test input"), PressAction(type="press", key="Enter"), ScrollAction(type="scroll", direction="down"), - ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();") + ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();") ] response = await app.scrape_url( url=TEST_URL, - formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"], + formats=["markdown", "html", "raw_html", "links", "screenshot", "extract", "change_tracking"], include_tags=["h1", "p"], exclude_tags=["footer"], only_main_content=True, @@ -64,12 +64,12 @@ async def test_scrape_url_async_all_params_with_extract_screenshot(): assert response.success assert hasattr(response, "markdown") assert hasattr(response, "html") - assert hasattr(response, "rawHtml") + assert hasattr(response, "raw_html") assert hasattr(response, "links") assert hasattr(response, "screenshot") assert hasattr(response, "extract") assert hasattr(response, "metadata") - assert hasattr(response, "changeTracking") + assert hasattr(response, "change_tracking") @pytest.mark.asyncio async def test_scrape_url_async_all_params_with_extract_full_page_screenshot(): @@ -83,11 +83,11 @@ async def test_scrape_url_async_all_params_with_extract_full_page_screenshot(): WriteAction(type="write", text="test input"), PressAction(type="press", key="Enter"), ScrollAction(type="scroll", direction="down"), - ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();") + ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();") ] response = await app.scrape_url( url=TEST_URL, - formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"], + formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"], include_tags=["h1", "p"], exclude_tags=["footer"], only_main_content=True, @@ -107,12 +107,12 @@ async def test_scrape_url_async_all_params_with_extract_full_page_screenshot(): assert response.success assert hasattr(response, "markdown") assert hasattr(response, "html") - assert hasattr(response, "rawHtml") + assert hasattr(response, "raw_html") assert hasattr(response, "links") assert hasattr(response, "screenshot") assert hasattr(response, "extract") assert hasattr(response, "metadata") - assert hasattr(response, "changeTracking") + assert hasattr(response, 
"change_tracking") @pytest.mark.asyncio async def test_scrape_url_async_all_params_with_json_options(): @@ -126,11 +126,11 @@ async def test_scrape_url_async_all_params_with_json_options(): WriteAction(type="write", text="test input"), PressAction(type="press", key="Enter"), ScrollAction(type="scroll", direction="down"), - ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();") + ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();") ] response = await app.scrape_url( url=TEST_URL, - formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"], + formats=["markdown", "html", "raw_html", "links", "screenshot", "json", "change_tracking"], include_tags=["h1", "p"], exclude_tags=["footer"], only_main_content=True, @@ -150,9 +150,9 @@ async def test_scrape_url_async_all_params_with_json_options(): assert response.success assert hasattr(response, "markdown") assert hasattr(response, "html") - assert hasattr(response, "rawHtml") + assert hasattr(response, "raw_html") assert hasattr(response, "links") assert hasattr(response, "screenshot") assert hasattr(response, "json") assert hasattr(response, "metadata") - assert hasattr(response, "changeTracking") \ No newline at end of file + assert hasattr(response, "change_tracking") \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_async_batch_scrape.py b/apps/python-sdk/tests/e2e/test_async_batch_scrape.py index d3fc8c6c..b05617fe 100644 --- a/apps/python-sdk/tests/e2e/test_async_batch_scrape.py +++ b/apps/python-sdk/tests/e2e/test_async_batch_scrape.py @@ -1,7 +1,10 @@ import os import pytest from dotenv import load_dotenv -from firecrawl import FirecrawlApp +import sys + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) +from firecrawl.firecrawl import FirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction load_dotenv() @@ -10,6 +13,114 @@ API_KEY = os.getenv("TEST_API_KEY") app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) -def test_async_batch_scrape_urls_placeholder(): - # TODO: Implement E2E tests for async_batch_scrape_urls - assert True \ No newline at end of file +TEST_URLS = [ + "https://example.com", + "https://www.iana.org" +] + +def wait_for_batch_completion(app, job_id, timeout=60): + for _ in range(timeout): + status = app.check_batch_scrape_status(job_id) + if status.status == "completed": + return status + import time; time.sleep(1) + raise TimeoutError("Batch scrape did not complete in time") + +def test_async_batch_scrape_urls_simple(): + job = app.async_batch_scrape_urls( + TEST_URLS, + formats=["markdown"] + ) + assert job.id is not None + status = wait_for_batch_completion(app, job.id) + assert status.success + assert hasattr(status, "data") + assert len(status.data) == len(TEST_URLS) + assert any("Example Domain" in doc.markdown for doc in status.data) + assert any("Internet Assigned Numbers Authority" in doc.markdown for doc in status.data) + +def test_async_batch_scrape_urls_all_params_with_extract(): + location = LocationConfig(country="us", languages=["en"]) + extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}} + extract = JsonConfig(prompt="Extract the title", schema=extract_schema) + actions = [ + WaitAction(type="wait", milliseconds=500), + ScreenshotAction(type="screenshot", fullPage=True), + 
WriteAction(type="write", text="test input"), + PressAction(type="press", key="Enter"), + ScrollAction(type="scroll", direction="down"), + ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();") + ] + job = app.async_batch_scrape_urls( + TEST_URLS, + formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"], + include_tags=["h1", "p"], + exclude_tags=["footer"], + only_main_content=True, + wait_for=1000, + timeout=15000, + location=location, + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy="basic", + extract=extract, + actions=actions + ) + assert job.id is not None + status = wait_for_batch_completion(app, job.id) + assert status.success + assert hasattr(status, "data") + assert len(status.data) == len(TEST_URLS) + for doc in status.data: + assert hasattr(doc, "markdown") + assert hasattr(doc, "html") + assert hasattr(doc, "raw_html") + assert hasattr(doc, "links") + assert hasattr(doc, "screenshot") + assert hasattr(doc, "extract") + assert hasattr(doc, "metadata") + +def test_async_batch_scrape_urls_all_params_with_json_options(): + location = LocationConfig(country="us", languages=["en"]) + json_schema = {"type": "object", "properties": {"title": {"type": "string"}}} + json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema) + actions = [ + WaitAction(type="wait", milliseconds=500), + ScreenshotAction(type="screenshot", fullPage=True), + WriteAction(type="write", text="test input"), + PressAction(type="press", key="Enter"), + ScrollAction(type="scroll", direction="down"), + ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();") + ] + job = app.async_batch_scrape_urls( + TEST_URLS, + formats=["markdown", "html", "raw_html", "links", "screenshot", "json"], + include_tags=["h1", "p"], + exclude_tags=["footer"], + only_main_content=True, + wait_for=1000, + timeout=15000, + location=location, + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy="basic", + json_options=json_options, + actions=actions + ) + assert job.id is not None + status = wait_for_batch_completion(app, job.id) + assert status.success + assert hasattr(status, "data") + assert len(status.data) == len(TEST_URLS) + for doc in status.data: + assert hasattr(doc, "markdown") + assert hasattr(doc, "html") + assert hasattr(doc, "raw_html") + assert hasattr(doc, "links") + assert hasattr(doc, "screenshot") + assert hasattr(doc, "json") + assert hasattr(doc, "metadata") \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_async_crawl.py b/apps/python-sdk/tests/e2e/test_async_crawl.py index a7b3dc06..e63cff54 100644 --- a/apps/python-sdk/tests/e2e/test_async_crawl.py +++ b/apps/python-sdk/tests/e2e/test_async_crawl.py @@ -1,7 +1,10 @@ import os import pytest +import sys from dotenv import load_dotenv -from firecrawl import FirecrawlApp + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) +from firecrawl.firecrawl import FirecrawlApp, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, ScrapeOptions load_dotenv() @@ -10,6 +13,79 @@ API_KEY = os.getenv("TEST_API_KEY") app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) -def test_async_crawl_url_placeholder(): - # TODO: Implement E2E tests for async_crawl_url - assert True \ No newline at end of file 
+TEST_URL = "https://example.com"
+
+def wait_for_crawl_completion(app, job_id, timeout=60):
+    for _ in range(timeout):
+        status = app.check_crawl_status(job_id)
+        if status.status == "completed":
+            return status
+        import time; time.sleep(1)
+    raise TimeoutError("Crawl did not complete in time")
+
+def test_crawl_url_simple():
+    job = app.async_crawl_url(
+        TEST_URL,
+        limit=1,
+        scrape_options=ScrapeOptions(formats=["markdown"])
+    )
+    assert job.id is not None
+    status = wait_for_crawl_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) >= 1
+    assert any("Example Domain" in doc.markdown for doc in status.data)
+
+def test_crawl_url_all_params():
+    location = LocationConfig(country="us", languages=["en"])
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+    ]
+    scrape_options = ScrapeOptions(
+        formats=["markdown", "html", "raw_html", "links", "screenshot"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        actions=actions
+    )
+    job = app.async_crawl_url(
+        TEST_URL,
+        include_paths=["/"],
+        exclude_paths=["/notarealpage"],
+        max_depth=2,
+        max_discovery_depth=2,
+        limit=2,
+        allow_backward_links=True,
+        allow_external_links=True,
+        ignore_sitemap=True,
+        scrape_options=scrape_options,
+        deduplicate_similar_urls=True,
+        ignore_query_parameters=True,
+        regex_on_full_url=True,
+        delay=1
+    )
+    assert job.id is not None
+    status = wait_for_crawl_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) >= 1
+    for doc in status.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "raw_html")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "metadata")
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_batch_scrape.py b/apps/python-sdk/tests/e2e/test_batch_scrape.py
index ff8aa0af..57a19639 100644
--- a/apps/python-sdk/tests/e2e/test_batch_scrape.py
+++ b/apps/python-sdk/tests/e2e/test_batch_scrape.py
@@ -1,7 +1,10 @@
 import os
 import pytest
 from dotenv import load_dotenv
-from firecrawl import FirecrawlApp
+import sys
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl.firecrawl import FirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
 
 load_dotenv()
 
@@ -10,6 +13,114 @@ API_KEY = os.getenv("TEST_API_KEY")
 
 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
-def test_batch_scrape_urls_placeholder():
-    # TODO: Implement E2E tests for batch_scrape_urls
-    assert True
\ No newline at end of file
+TEST_URLS = [
+    "https://example.com",
+    "https://www.iana.org"
+]
+
+def wait_for_batch_completion(app, job_id, timeout=60):
+    for _ in range(timeout):
+        status = app.check_batch_scrape_status(job_id)
+        if status.status == "completed":
+            return status
+        import time; time.sleep(1)
+    raise TimeoutError("Batch scrape did not complete in time")
+
+def test_batch_scrape_urls_simple():
+    job = app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown"]
+    )
+    assert job.id is not None
+    status = wait_for_batch_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) == len(TEST_URLS)
+    assert any("Example Domain" in doc.markdown for doc in status.data)
+    assert any("Internet Assigned Numbers Authority" in doc.markdown for doc in status.data)
+
+def test_batch_scrape_urls_all_params_with_extract():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+    ]
+    job = app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions
+    )
+    assert job.id is not None
+    status = wait_for_batch_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) == len(TEST_URLS)
+    for doc in status.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "raw_html")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "extract")
+        assert hasattr(doc, "metadata")
+
+def test_batch_scrape_urls_all_params_with_json_options():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+    ]
+    job = app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    assert job.id is not None
+    status = wait_for_batch_completion(app, job.id)
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) == len(TEST_URLS)
+    for doc in status.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "raw_html")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "json")
+        assert hasattr(doc, "metadata")
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_check_crawl_status.py b/apps/python-sdk/tests/e2e/test_check_crawl_status.py
deleted file mode 100644
index 22b1a4d4..00000000
--- a/apps/python-sdk/tests/e2e/test_check_crawl_status.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import os
-import pytest
-from dotenv import load_dotenv
-from firecrawl import FirecrawlApp
-
-load_dotenv()
-
-API_URL = "https://api.firecrawl.dev"
-API_KEY = os.getenv("TEST_API_KEY")
-
-app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
-
-def test_check_crawl_status_placeholder():
-    # TODO: Implement E2E tests for check_crawl_status
-    assert True
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_crawl.py b/apps/python-sdk/tests/e2e/test_crawl.py
index bf758919..ac00f7be 100644
--- a/apps/python-sdk/tests/e2e/test_crawl.py
+++ b/apps/python-sdk/tests/e2e/test_crawl.py
@@ -1,7 +1,9 @@
 import os
-import pytest
+import sys
 from dotenv import load_dotenv
-from firecrawl import FirecrawlApp
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl.firecrawl import FirecrawlApp, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, ScrapeOptions
 
 load_dotenv()
 
@@ -10,6 +12,67 @@ API_KEY = os.getenv("TEST_API_KEY")
 
 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
-def test_crawl_url_placeholder():
-    # TODO: Implement E2E tests for crawl_url
-    assert True
\ No newline at end of file
+TEST_URL = "https://example.com"
+
+def test_crawl_url_simple():
+    status = app.crawl_url(
+        TEST_URL,
+        limit=1,
+        scrape_options=ScrapeOptions(formats=["markdown"])
+    )
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) >= 1
+    assert any("Example Domain" in doc.markdown for doc in status.data)
+
+def test_crawl_url_all_params():
+    location = LocationConfig(country="us", languages=["en"])
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+    ]
+    scrape_options = ScrapeOptions(
+        formats=["markdown", "html", "raw_html", "links", "screenshot"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        actions=actions
+    )
+    status = app.crawl_url(
+        TEST_URL,
+        include_paths=["/"],
+        exclude_paths=["/notarealpage"],
+        max_depth=2,
+        max_discovery_depth=2,
+        limit=2,
+        allow_backward_links=True,
+        allow_external_links=True,
+        ignore_sitemap=True,
+        scrape_options=scrape_options,
+        deduplicate_similar_urls=True,
+        ignore_query_parameters=True,
+        regex_on_full_url=True,
+        delay=1
+    )
+    assert status.success
+    assert hasattr(status, "data")
+    assert len(status.data) >= 1
+    for doc in status.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "raw_html")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "metadata")
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_deep_research.py b/apps/python-sdk/tests/e2e/test_deep_research.py
index 463aea64..d32445e7 100644
--- a/apps/python-sdk/tests/e2e/test_deep_research.py
+++ b/apps/python-sdk/tests/e2e/test_deep_research.py
@@ -1,7 +1,10 @@
 import os
+import sys
 import pytest
 from dotenv import load_dotenv
-from firecrawl import FirecrawlApp
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl.firecrawl import FirecrawlApp
 
 load_dotenv()
 
@@ -10,6 +13,34 @@ API_KEY = os.getenv("TEST_API_KEY")
 
 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
-def test_deep_research_placeholder():
-    # TODO: Implement E2E tests for deep_research
-    assert True
\ No newline at end of file
+def test_deep_research_simple():
+    result = app.deep_research("What is the capital of France?", max_urls=2)
+    assert hasattr(result, "status")
+    assert result.status == "completed"
+    assert hasattr(result, "success")
+    assert result.success
+    assert hasattr(result, "data")
+    assert result.data is not None
+    assert "Paris" in str(result.data) or "France" in str(result.data)
+
+def test_deep_research_all_params():
+    result = app.deep_research(
+        "What are the latest advancements in AI?",
+        max_depth=2,
+        time_limit=60,
+        max_urls=3,
+        analysis_prompt="Summarize the most important recent AI advancements.",
+        system_prompt="You are an expert AI researcher."
+    )
+    assert hasattr(result, "status")
+    assert result.status == "completed"
+    assert hasattr(result, "success")
+    assert result.success
+    assert hasattr(result, "data")
+    assert hasattr(result, "activities")
+    assert isinstance(result.activities, list)
+    assert result.data is not None
+    assert hasattr(result.data, "sources")
+    assert isinstance(result.data.sources, list)
+    assert hasattr(result.data, "final_analysis")
+    assert isinstance(result.data.final_analysis, str)
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_extract.py b/apps/python-sdk/tests/e2e/test_extract.py
index 4e925041..93f58d24 100644
--- a/apps/python-sdk/tests/e2e/test_extract.py
+++ b/apps/python-sdk/tests/e2e/test_extract.py
@@ -1,7 +1,12 @@
 import os
+import sys
 import pytest
 from dotenv import load_dotenv
-from firecrawl import FirecrawlApp
+from pydantic import BaseModel
+from typing import List
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl.firecrawl import FirecrawlApp
 
 load_dotenv()
 
@@ -10,6 +15,80 @@ API_KEY = os.getenv("TEST_API_KEY")
 
 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
-def test_extract_placeholder():
-    # TODO: Implement E2E tests for extract
-    assert True
\ No newline at end of file
+class ExtractSchema(BaseModel):
+    title: str
+    description: str
+    links: List[str]
+
+def test_extract_simple_schema_class():
+    result = app.extract(
+        ["https://example.com"],
+        prompt="Extract the title, description, and links from the website",
+        schema=ExtractSchema
+    )
+    assert result.success
+    assert result.data is not None
+    assert hasattr(result.data, "title")
+    assert hasattr(result.data, "description")
+    assert hasattr(result.data, "links")
+    assert isinstance(result.data.links, list)
+
+def test_extract_simple_schema_dict():
+    result = app.extract(
+        ["https://example.com"],
+        prompt="Extract the title, description, and links from the website",
+        schema=ExtractSchema.schema()
+    )
+    assert result.success
+    assert result.data is not None
+    assert hasattr(result.data, "title")
+    assert hasattr(result.data, "description")
+    assert hasattr(result.data, "links")
+    assert isinstance(result.data.links, list)
+
+def test_extract_simple_schema_json():
+    schema = None
+    if hasattr(ExtractSchema, "model_json_schema"):
+        schema = ExtractSchema.model_json_schema()
+    elif hasattr(ExtractSchema, "schema_json"):
+        schema = ExtractSchema.schema_json()
+    else:
+        pytest.skip("No JSON schema export method available on ExtractSchema")
+    result = app.extract(
+        ["https://example.com"],
+        prompt="Extract the title, description, and links from the website",
+        schema=schema
+    )
+    assert result.success
+    assert result.data is not None
+    assert hasattr(result.data, "title")
+    assert hasattr(result.data, "description")
+    assert hasattr(result.data, "links")
+    assert isinstance(result.data.links, list)
+
+def test_extract_all_params():
+    class AllParamsSchema(BaseModel):
+        title: str
+        description: str
+        links: List[str]
+        author: str
+    schema = AllParamsSchema.schema()
+    result = app.extract(
+        ["https://www.iana.org"],
+        prompt="Extract the title, description, links, and author from the website",
+        schema=schema,
+        system_prompt="You are a helpful extraction agent.",
+        allow_external_links=True,
+        enable_web_search=True,
+        show_sources=True,
+        agent={"model": "FIRE-1"}
+    )
+    assert result.success
+    assert result.data is not None
+    assert hasattr(result.data, "title")
+    assert hasattr(result.data, "description")
+    assert hasattr(result.data, "links")
+    assert hasattr(result.data, "author")
+    assert isinstance(result.data.links, list)
+    assert hasattr(result, "sources")
+    assert isinstance(result.sources, list)
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_generate_llms_text.py b/apps/python-sdk/tests/e2e/test_generate_llms_text.py
index fe9928e1..ff245026 100644
--- a/apps/python-sdk/tests/e2e/test_generate_llms_text.py
+++ b/apps/python-sdk/tests/e2e/test_generate_llms_text.py
@@ -1,7 +1,10 @@
 import os
+import sys
 import pytest
 from dotenv import load_dotenv
-from firecrawl import FirecrawlApp
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl.firecrawl import FirecrawlApp
 
 load_dotenv()
 
@@ -10,6 +13,29 @@ API_KEY = os.getenv("TEST_API_KEY")
 
 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
-def test_generate_llms_text_placeholder():
-    # TODO: Implement E2E tests for generate_llms_text
-    assert True
\ No newline at end of file
+def test_generate_llms_text_simple():
+    result = app.generate_llms_text("https://example.com")
+    assert hasattr(result, "status")
+    assert result.status == "completed"
+    assert hasattr(result, "data")
+    assert result.data is not None
+    assert hasattr(result.data, "llmstxt")
+    assert isinstance(result.data.llmstxt, str)
+    assert len(result.data.llmstxt) > 0
+
+def test_generate_llms_text_all_params():
+    result = app.generate_llms_text(
+        "https://www.iana.org",
+        max_urls=5,
+        show_full_text=True,
+        experimental_stream=True
+    )
+    assert hasattr(result, "status")
+    assert result.status == "completed"
+    assert hasattr(result, "data")
+    assert result.data is not None
+    assert hasattr(result.data, "llmstxt")
+    assert isinstance(result.data.llmstxt, str)
+    assert len(result.data.llmstxt) > 0
+    assert hasattr(result.data, "llmsfulltxt")
+    assert result.data.llmsfulltxt is None or isinstance(result.data.llmsfulltxt, str)
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_scrape.py b/apps/python-sdk/tests/e2e/test_scrape.py
index f20a80f7..8ab64f5d 100644
--- a/apps/python-sdk/tests/e2e/test_scrape.py
+++ b/apps/python-sdk/tests/e2e/test_scrape.py
@@ -80,11 +80,11 @@ def test_scrape_url_all_params_with_extract_full_page_screenshot():
         WriteAction(type="write", text="test input"),
         PressAction(type="press", key="Enter"),
         ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
     ]
     response = app.scrape_url(
         url=TEST_URL,
-        formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"],
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"],
         include_tags=["h1", "p"],
         exclude_tags=["footer"],
         only_main_content=True,
@@ -104,12 +104,12 @@ def test_scrape_url_all_params_with_extract_full_page_screenshot():
     assert response.success
     assert hasattr(response, "markdown")
     assert hasattr(response, "html")
-    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "raw_html")
     assert hasattr(response, "links")
     assert hasattr(response, "screenshot")
    assert hasattr(response, "extract")
     assert hasattr(response, "metadata")
-    assert hasattr(response, "changeTracking")
+    assert hasattr(response, "change_tracking")
 
 def test_scrape_url_all_params_with_json_options():
     location = LocationConfig(country="us", languages=["en"])
@@ -122,11 +122,11 @@ def test_scrape_url_all_params_with_json_options():
         WriteAction(type="write", text="test input"),
         PressAction(type="press", key="Enter"),
         ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
     ]
     response = app.scrape_url(
         url=TEST_URL,
-        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"],
+        formats=["markdown", "html", "raw_html", "links", "screenshot", "json", "change_tracking"],
         include_tags=["h1", "p"],
         exclude_tags=["footer"],
         only_main_content=True,
@@ -146,9 +146,9 @@ def test_scrape_url_all_params_with_json_options():
     assert response.success
     assert hasattr(response, "markdown")
     assert hasattr(response, "html")
-    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "raw_html")
     assert hasattr(response, "links")
     assert hasattr(response, "screenshot")
     assert hasattr(response, "json")
     assert hasattr(response, "metadata")
-    assert hasattr(response, "changeTracking")
\ No newline at end of file
+    assert hasattr(response, "change_tracking")
\ No newline at end of file
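The e2e suites in this patch all share one polling pattern: kick off a job with an async_* method, then poll the matching check_*_status call until status.status == "completed". A minimal standalone sketch of that pattern, assuming a valid TEST_API_KEY in the environment, network access to api.firecrawl.dev, and that the installed firecrawl package matches this SDK version:

import os
import time

from dotenv import load_dotenv
from firecrawl import FirecrawlApp, ScrapeOptions

load_dotenv()

app = FirecrawlApp(api_url="https://api.firecrawl.dev", api_key=os.getenv("TEST_API_KEY"))

# Kick off a crawl without blocking; the returned job handle carries an id.
job = app.async_crawl_url("https://example.com", limit=1,
                          scrape_options=ScrapeOptions(formats=["markdown"]))

# Poll once per second, giving up after ~60 seconds, mirroring the
# wait_for_crawl_completion helpers in the tests above.
for _ in range(60):
    status = app.check_crawl_status(job.id)
    if status.status == "completed":
        print(status.data[0].markdown)  # snake_case fields with dot access
        break
    time.sleep(1)
else:
    raise TimeoutError("Crawl did not complete in time")

Bounding the loop is why the test helpers raise TimeoutError instead of polling forever: a stuck job fails the suite instead of hanging it.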