mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 03:29:01 +08:00
python sdk and tests
This commit is contained in:
parent
6cdf4c68ec
commit
0b8df5e264
75
apps/python-sdk/examplev0.py
Normal file
75
apps/python-sdk/examplev0.py
Normal file
@ -0,0 +1,75 @@
|
||||
"""Example usage of the Firecrawl Python SDK against the v0 API.

Demonstrates scraping, crawling (with an idempotency key), and LLM
extraction driven by either a pydantic model or a raw JSON schema.
"""

# Idiom fix: all imports belong at the top of the module; the pydantic and
# typing imports previously appeared mid-script.
import uuid
from typing import List

from pydantic import BaseModel, Field

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

# Scrape a website:
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])

# Crawl a website:
idempotency_key = str(uuid.uuid4())  # optional idempotency key
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
print(crawl_result)

# LLM Extraction:
# Define schema to extract contents into using pydantic
class ArticleSchema(BaseModel):
    # One Hacker News story: headline, score, submitter, and comments link.
    title: str
    points: int
    by: str
    commentsURL: str


class TopArticlesSchema(BaseModel):
    # The extraction target: at most five stories.
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")


llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})

print(llm_extraction_result['llm_extraction'])

# Define schema to extract contents into using json schema
json_schema = {
    "type": "object",
    "properties": {
        "top": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "points": {"type": "number"},
                    "by": {"type": "string"},
                    "commentsURL": {"type": "string"}
                },
                "required": ["title", "points", "by", "commentsURL"]
            },
            "minItems": 5,
            "maxItems": 5,
            "description": "Top 5 stories on Hacker News"
        }
    },
    "required": ["top"]
}

llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': json_schema,
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})

print(llm_extraction_result['llm_extraction'])
|
@ -20,31 +20,31 @@ FirecrawlApp = firecrawl.FirecrawlApp
|
||||
|
||||
def test_no_api_key():
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app = FirecrawlApp(api_url=API_URL)
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
|
||||
assert "No API key provided" in str(excinfo.value)
|
||||
|
||||
def test_scrape_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.scrape_url('https://firecrawl.dev')
|
||||
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_blocklisted_url():
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.scrape_url(blocklisted_url)
|
||||
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert "_Roast_" in response['content']
|
||||
|
||||
def test_scrape_url_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
@ -54,7 +54,7 @@ def test_scrape_url_e2e():
|
||||
assert "_Roast_" in response['content']
|
||||
|
||||
def test_successful_response_with_valid_api_key_and_include_html():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
@ -66,7 +66,7 @@ def test_successful_response_with_valid_api_key_and_include_html():
|
||||
assert "<h1" in response['html']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
@ -74,7 +74,7 @@ def test_successful_response_for_valid_scrape_with_pdf_file():
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
|
||||
time.sleep(6) # wait for 6 seconds
|
||||
assert response is not None
|
||||
@ -83,20 +83,20 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_crawl_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.crawl_url('https://firecrawl.dev')
|
||||
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_should_return_error_for_blocklisted_url():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
blocklisted_url = "https://twitter.com/fake-test"
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url(blocklisted_url)
|
||||
assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_crawl_url_wait_for_completion_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
|
||||
assert response is not None
|
||||
assert len(response) > 0
|
||||
@ -104,7 +104,7 @@ def test_crawl_url_wait_for_completion_e2e():
|
||||
assert "_Roast_" in response[0]['content']
|
||||
|
||||
def test_crawl_url_with_idempotency_key_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
uniqueIdempotencyKey = str(uuid4())
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
||||
assert response is not None
|
||||
@ -117,7 +117,7 @@ def test_crawl_url_with_idempotency_key_e2e():
|
||||
assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
|
||||
|
||||
def test_check_crawl_status_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
|
||||
assert response is not None
|
||||
assert 'jobId' in response
|
||||
@ -131,20 +131,20 @@ def test_check_crawl_status_e2e():
|
||||
assert len(status_response['data']) > 0
|
||||
|
||||
def test_search_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.search("test query")
|
||||
assert response is not None
|
||||
assert 'content' in response[0]
|
||||
assert len(response) > 2
|
||||
|
||||
def test_search_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.search("test query")
|
||||
assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_llm_extraction():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url("https://mendable.ai", {
|
||||
'extractorOptions': {
|
||||
'mode': 'llm-extraction',
|
||||
|
@ -0,0 +1,3 @@
|
||||
API_URL=http://localhost:3002
|
||||
ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
|
||||
TEST_API_KEY=fc-YOUR_API_KEY
|
Binary file not shown.
168
apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
Normal file
168
apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
Normal file
@ -0,0 +1,168 @@
|
||||
import importlib.util
import os
import time

import pytest
from uuid import uuid4
from dotenv import load_dotenv

# Pull TEST_API_KEY (and any other settings) from a local .env file.
load_dotenv()

API_URL = "http://127.0.0.1:3002"  # fix: removed stray trailing semicolon
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
TEST_API_KEY = os.getenv('TEST_API_KEY')

print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")

# Load the SDK module directly from its source file so these e2e tests run
# against the working tree rather than an installed package.
spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
firecrawl = importlib.util.module_from_spec(spec)
spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp
|
||||
|
||||
def test_no_api_key():
    """Constructing a FirecrawlApp without any API key must raise."""
    with pytest.raises(Exception) as err:
        FirecrawlApp(api_url=API_URL)
    assert "No API key provided" in str(err.value)
|
||||
|
||||
def test_scrape_url_invalid_api_key():
    """A bogus API key should surface a 401 from the scrape endpoint."""
    bad_client = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as err:
        bad_client.scrape_url('https://firecrawl.dev')
    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(err.value)
|
||||
|
||||
def test_blocklisted_url():
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.scrape_url(blocklisted_url)
|
||||
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token():
    """A preview token authorizes a scrape and returns page markdown.

    Consistency fix: the sibling v1 tests in this file assert that the v1
    scrape response carries 'markdown' and omits the v0-era 'content' key,
    but this test still asserted on 'content'; aligned with the v1 shape.
    """
    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
    response = app.scrape_url('https://roastmywebsite.ai')
    assert response is not None
    assert 'markdown' in response
    assert "_Roast_" in response['markdown']
|
||||
|
||||
def test_scrape_url_e2e():
    """A v1 scrape returns markdown + metadata and omits v0-era fields."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    result = client.scrape_url('https://roastmywebsite.ai')
    assert result is not None
    assert 'markdown' in result
    assert 'metadata' in result
    # v0-style keys must not leak into the v1 response shape.
    assert 'content' not in result
    assert 'html' not in result
    assert "_Roast_" in result['markdown']
|
||||
|
||||
def test_successful_response_with_valid_api_key_and_include_html():
    """Requesting the 'html' format adds raw HTML alongside markdown."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    result = client.scrape_url('https://roastmywebsite.ai', {'formats': ['markdown', 'html']})
    assert result is not None
    assert 'content' not in result
    for key in ('markdown', 'html', 'metadata'):
        assert key in result
    assert "_Roast_" in result['markdown']
    assert "<h1" in result['html']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file():
    """Scraping a PDF URL returns its extracted text under 'markdown'.

    Bug fix: the final assertion read response['content'] immediately after
    asserting 'content' not in response — a guaranteed KeyError. In the v1
    response shape the extracted text lives under 'markdown'.
    """
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
    assert response is not None
    assert 'content' not in response
    assert 'markdown' in response
    assert 'metadata' in response
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
    """A PDF URL lacking the .pdf extension is still detected and extracted.

    Bug fix: the final assertion read response['content'] immediately after
    asserting 'content' not in response — a guaranteed KeyError. In the v1
    response shape the extracted text lives under 'markdown'.
    """
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
    time.sleep(6)  # wait for 6 seconds
    assert response is not None
    assert 'content' not in response
    assert 'markdown' in response
    assert 'metadata' in response
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
|
||||
|
||||
def test_crawl_url_invalid_api_key():
    """Starting a crawl with a bad key surfaces a 401."""
    bad_client = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as err:
        bad_client.crawl_url('https://firecrawl.dev')
    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(err.value)
|
||||
|
||||
def test_should_return_error_for_blocklisted_url():
    """Crawling a blocklisted social-media URL must be rejected with a 403."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    with pytest.raises(Exception) as err:
        client.crawl_url("https://twitter.com/fake-test")
    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(err.value)
|
||||
|
||||
def test_crawl_url_wait_for_completion_e2e():
    """A synchronous crawl returns a non-empty list of v1 page documents."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    pages = client.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
    assert pages is not None
    assert len(pages) > 0
    first = pages[0]
    assert 'content' not in first
    assert 'markdown' in first
    assert "_Roast_" in first['markdown']
|
||||
|
||||
def test_crawl_url_with_idempotency_key_e2e():
    """Reusing an idempotency key on a second crawl must raise a conflict.

    Consistency fix: the sibling v1 test (test_crawl_url_wait_for_completion_e2e)
    asserts crawl results expose 'markdown' and not the v0-era 'content' key,
    but this test still asserted on 'content'; aligned with the v1 shape.
    """
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    uniqueIdempotencyKey = str(uuid4())
    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
    assert response is not None
    assert len(response) > 0
    assert 'markdown' in response[0]
    assert "_Roast_" in response[0]['markdown']

    # Second submission with the same key must be rejected by the server.
    with pytest.raises(Exception) as excinfo:
        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
|
||||
|
||||
def test_check_crawl_status_e2e():
    """An async crawl returns a jobId whose status eventually completes."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    job = client.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
    assert job is not None
    assert 'jobId' in job

    time.sleep(30)  # wait for 30 seconds
    status = client.check_crawl_status(job['jobId'])
    assert status is not None
    assert 'status' in status
    assert status['status'] == 'completed'
    assert 'data' in status
    assert len(status['data']) > 0
|
||||
|
||||
def test_search_e2e():
    """Search is a v0-only feature; a v1 client must refuse it."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    with pytest.raises(NotImplementedError) as err:
        client.search("test query")
    assert "Search is not supported in v1" in str(err.value)
|
||||
|
||||
def test_llm_extraction():
    """llm-extraction mode returns fields matching the supplied JSON schema."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    extractor_options = {
        'mode': 'llm-extraction',
        'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
        'extractionSchema': {
            'type': 'object',
            'properties': {
                'company_mission': {'type': 'string'},
                'supports_sso': {'type': 'boolean'},
                'is_open_source': {'type': 'boolean'}
            },
            'required': ['company_mission', 'supports_sso', 'is_open_source']
        }
    }
    result = client.scrape_url("https://mendable.ai", {'extractorOptions': extractor_options})
    assert result is not None
    assert 'llm_extraction' in result
    extracted = result['llm_extraction']
    assert 'company_mission' in extracted
    assert isinstance(extracted['supports_sso'], bool)
    assert isinstance(extracted['is_open_source'], bool)
|
||||
|
||||
def test_map_e2e():
    """map_url returns a list of discovered links for the site."""
    client = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
    links = client.map_url('https://roastmywebsite.ai')
    assert links is not None
    assert isinstance(links, list)
|
||||
|
@ -19,24 +19,22 @@ import requests
|
||||
logger : logging.Logger = logging.getLogger("firecrawl")
|
||||
|
||||
class FirecrawlApp:
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None:
|
||||
"""
|
||||
Initialize the FirecrawlApp instance.
|
||||
Initialize the FirecrawlApp instance with API key, API URL, and version.
|
||||
|
||||
Args:
|
||||
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||
api_url (Optional[str]): Base URL for the Firecrawl API.
|
||||
version (str): API version, either 'v0' or 'v1'.
|
||||
"""
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
self.version = version
|
||||
if self.api_key is None:
|
||||
logger.warning("No API key provided")
|
||||
raise ValueError('No API key provided')
|
||||
else:
|
||||
logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
|
||||
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
if self.api_url != 'https://api.firecrawl.dev':
|
||||
logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
|
||||
logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key} and version: {self.version}")
|
||||
|
||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
@ -75,9 +73,11 @@ class FirecrawlApp:
|
||||
for key, value in params.items():
|
||||
if key != 'extractorOptions':
|
||||
scrape_params[key] = value
|
||||
|
||||
endpoint = f'/{self.version}/scrape'
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
f'{self.api_url}/v0/scrape',
|
||||
f'{self.api_url}{endpoint}',
|
||||
headers=headers,
|
||||
json=scrape_params,
|
||||
)
|
||||
@ -104,6 +104,9 @@ class FirecrawlApp:
|
||||
Raises:
|
||||
Exception: If the search request fails.
|
||||
"""
|
||||
if self.version == 'v1':
|
||||
raise NotImplementedError("Search is not supported in v1")
|
||||
|
||||
headers = self._prepare_headers()
|
||||
json_data = {'query': query}
|
||||
if params:
|
||||
@ -145,11 +148,12 @@ class FirecrawlApp:
|
||||
Raises:
|
||||
Exception: If the crawl job initiation or monitoring fails.
|
||||
"""
|
||||
endpoint = f'/{self.version}/crawl'
|
||||
headers = self._prepare_headers(idempotency_key)
|
||||
json_data = {'url': url}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
|
||||
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
|
||||
if response.status_code == 200:
|
||||
job_id = response.json().get('jobId')
|
||||
if wait_until_done:
|
||||
@ -172,13 +176,44 @@ class FirecrawlApp:
|
||||
Raises:
|
||||
Exception: If the status check request fails.
|
||||
"""
|
||||
endpoint = f'/{self.version}/crawl/status/{job_id}'
|
||||
headers = self._prepare_headers()
|
||||
response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||
response = self._get_request(f'{self.api_url}{endpoint}', headers)
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
self._handle_error(response, 'check crawl status')
|
||||
|
||||
def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
Perform a map search using the Firecrawl API.
|
||||
"""
|
||||
if self.version == 'v0':
|
||||
raise NotImplementedError("Map is not supported in v0")
|
||||
|
||||
endpoint = f'/{self.version}/map'
|
||||
headers = self._prepare_headers()
|
||||
|
||||
# Prepare the base scrape parameters with the URL
|
||||
json_data = {'url': url}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
f'{self.api_url}{endpoint}',
|
||||
headers=headers,
|
||||
json=json_data,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to map URL. Error: {response["error"]}')
|
||||
else:
|
||||
self._handle_error(response, 'map')
|
||||
|
||||
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
||||
"""
|
||||
Prepare the headers for API requests.
|
||||
|
Loading…
x
Reference in New Issue
Block a user