diff --git a/apps/python-sdk/examplev0.py b/apps/python-sdk/examplev0.py
new file mode 100644
index 00000000..d80fa795
--- /dev/null
+++ b/apps/python-sdk/examplev0.py
@@ -0,0 +1,75 @@
+import uuid
+from firecrawl.firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+
+# Scrape a website:
+scrape_result = app.scrape_url('firecrawl.dev')
+print(scrape_result['markdown'])
+
+# Crawl a website:
+idempotency_key = str(uuid.uuid4()) # optional idempotency key
+crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
+print(crawl_result)
+
+# LLM Extraction:
+# Define schema to extract contents into using pydantic
+from pydantic import BaseModel, Field
+from typing import List
+
+class ArticleSchema(BaseModel):
+ title: str
+ points: int
+ by: str
+ commentsURL: str
+
+class TopArticlesSchema(BaseModel):
+ top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+ 'extractorOptions': {
+ 'extractionSchema': TopArticlesSchema.model_json_schema(),
+ 'mode': 'llm-extraction'
+ },
+ 'pageOptions':{
+ 'onlyMainContent': True
+ }
+})
+
+print(llm_extraction_result['llm_extraction'])
+
+# Define schema to extract contents into using json schema
+json_schema = {
+ "type": "object",
+ "properties": {
+ "top": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "title": {"type": "string"},
+ "points": {"type": "number"},
+ "by": {"type": "string"},
+ "commentsURL": {"type": "string"}
+ },
+ "required": ["title", "points", "by", "commentsURL"]
+ },
+ "minItems": 5,
+ "maxItems": 5,
+ "description": "Top 5 stories on Hacker News"
+ }
+ },
+ "required": ["top"]
+}
+
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+ 'extractorOptions': {
+ 'extractionSchema': json_schema,
+ 'mode': 'llm-extraction'
+ },
+ 'pageOptions':{
+ 'onlyMainContent': True
+ }
+})
+
+print(llm_extraction_result['llm_extraction'])
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
index 452d4982..457c206a 100644
--- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
@@ -20,31 +20,31 @@ FirecrawlApp = firecrawl.FirecrawlApp
def test_no_api_key():
with pytest.raises(Exception) as excinfo:
- invalid_app = FirecrawlApp(api_url=API_URL)
+ invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
assert "No API key provided" in str(excinfo.value)
def test_scrape_url_invalid_api_key():
- invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+ invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
with pytest.raises(Exception) as excinfo:
invalid_app.scrape_url('https://firecrawl.dev')
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_blocklisted_url():
blocklisted_url = "https://facebook.com/fake-test"
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
with pytest.raises(Exception) as excinfo:
app.scrape_url(blocklisted_url)
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
def test_successful_response_with_valid_preview_token():
- app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
+ app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
response = app.scrape_url('https://roastmywebsite.ai')
assert response is not None
assert 'content' in response
assert "_Roast_" in response['content']
def test_scrape_url_e2e():
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.scrape_url('https://roastmywebsite.ai')
assert response is not None
assert 'content' in response
@@ -54,7 +54,7 @@ def test_scrape_url_e2e():
assert "_Roast_" in response['content']
def test_successful_response_with_valid_api_key_and_include_html():
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
assert response is not None
assert 'content' in response
@@ -66,7 +66,7 @@ def test_successful_response_with_valid_api_key_and_include_html():
assert "
0
@@ -104,7 +104,7 @@ def test_crawl_url_wait_for_completion_e2e():
assert "_Roast_" in response[0]['content']
def test_crawl_url_with_idempotency_key_e2e():
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
uniqueIdempotencyKey = str(uuid4())
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
assert response is not None
@@ -117,7 +117,7 @@ def test_crawl_url_with_idempotency_key_e2e():
assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
def test_check_crawl_status_e2e():
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
assert response is not None
assert 'jobId' in response
@@ -131,20 +131,20 @@ def test_check_crawl_status_e2e():
assert len(status_response['data']) > 0
def test_search_e2e():
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.search("test query")
assert response is not None
assert 'content' in response[0]
assert len(response) > 2
def test_search_invalid_api_key():
- invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+ invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
with pytest.raises(Exception) as excinfo:
invalid_app.search("test query")
assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
def test_llm_extraction():
- app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
response = app.scrape_url("https://mendable.ai", {
'extractorOptions': {
'mode': 'llm-extraction',
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example
new file mode 100644
index 00000000..904887bf
--- /dev/null
+++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example
@@ -0,0 +1,3 @@
+API_URL=http://localhost:3002
+ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
+TEST_API_KEY=fc-YOUR_API_KEY
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc
new file mode 100644
index 00000000..5ba1f132
Binary files /dev/null and b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc differ
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
new file mode 100644
index 00000000..517d8cf9
--- /dev/null
+++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
@@ -0,0 +1,168 @@
+import importlib.util
+import pytest
+import time
+import os
+from uuid import uuid4
+from dotenv import load_dotenv
+
+load_dotenv()
+
+API_URL = "http://127.0.0.1:3002"
+ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
+TEST_API_KEY = os.getenv('TEST_API_KEY')
+
+print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
+
+spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
+firecrawl = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(firecrawl)
+FirecrawlApp = firecrawl.FirecrawlApp
+
+def test_no_api_key():
+ with pytest.raises(Exception) as excinfo:
+ invalid_app = FirecrawlApp(api_url=API_URL)
+ assert "No API key provided" in str(excinfo.value)
+
+def test_scrape_url_invalid_api_key():
+ invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+ with pytest.raises(Exception) as excinfo:
+ invalid_app.scrape_url('https://firecrawl.dev')
+ assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
+
+def test_blocklisted_url():
+ blocklisted_url = "https://facebook.com/fake-test"
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ with pytest.raises(Exception) as excinfo:
+ app.scrape_url(blocklisted_url)
+ assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
+
+def test_successful_response_with_valid_preview_token():
+ app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
+ response = app.scrape_url('https://roastmywebsite.ai')
+ assert response is not None
+ assert 'content' in response
+ assert "_Roast_" in response['content']
+
+def test_scrape_url_e2e():
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ response = app.scrape_url('https://roastmywebsite.ai')
+ assert response is not None
+ assert 'content' not in response
+ assert 'markdown' in response
+ assert 'metadata' in response
+ assert 'html' not in response
+ assert "_Roast_" in response['markdown']
+
+def test_successful_response_with_valid_api_key_and_include_html():
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ response = app.scrape_url('https://roastmywebsite.ai', { 'formats': [ 'markdown', 'html' ]})
+ assert response is not None
+ assert 'content' not in response
+ assert 'markdown' in response
+ assert 'html' in response
+ assert 'metadata' in response
+ assert "_Roast_" in response['markdown']
+ assert " 0
+ assert 'content' not in response[0]
+ assert 'markdown' in response[0]
+ assert "_Roast_" in response[0]['markdown']
+
+def test_crawl_url_with_idempotency_key_e2e():
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ uniqueIdempotencyKey = str(uuid4())
+ response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+ assert response is not None
+ assert len(response) > 0
+ assert 'content' in response[0]
+ assert "_Roast_" in response[0]['content']
+
+ with pytest.raises(Exception) as excinfo:
+ app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
+ assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
+
+def test_check_crawl_status_e2e():
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
+ assert response is not None
+ assert 'jobId' in response
+
+ time.sleep(30) # wait for 30 seconds
+ status_response = app.check_crawl_status(response['jobId'])
+ assert status_response is not None
+ assert 'status' in status_response
+ assert status_response['status'] == 'completed'
+ assert 'data' in status_response
+ assert len(status_response['data']) > 0
+
+def test_search_e2e():
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ with pytest.raises(NotImplementedError) as excinfo:
+ app.search("test query")
+ assert "Search is not supported in v1" in str(excinfo.value)
+
+def test_llm_extraction():
+ app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+ response = app.scrape_url("https://mendable.ai", {
+ 'extractorOptions': {
+ 'mode': 'llm-extraction',
+ 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
+ 'extractionSchema': {
+ 'type': 'object',
+ 'properties': {
+ 'company_mission': {'type': 'string'},
+ 'supports_sso': {'type': 'boolean'},
+ 'is_open_source': {'type': 'boolean'}
+ },
+ 'required': ['company_mission', 'supports_sso', 'is_open_source']
+ }
+ }
+ })
+ assert response is not None
+ assert 'llm_extraction' in response
+ llm_extraction = response['llm_extraction']
+ assert 'company_mission' in llm_extraction
+ assert isinstance(llm_extraction['supports_sso'], bool)
+ assert isinstance(llm_extraction['is_open_source'], bool)
+
+def test_map_e2e():
+ app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
+ response = app.map_url('https://roastmywebsite.ai')
+ assert response is not None
+ assert isinstance(response, list)
+
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 7ec0d33f..25c9663e 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -19,24 +19,22 @@ import requests
logger : logging.Logger = logging.getLogger("firecrawl")
class FirecrawlApp:
- """
- Initialize the FirecrawlApp instance.
+ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None:
+ """
+ Initialize the FirecrawlApp instance with API key, API URL, and version.
- Args:
- api_key (Optional[str]): API key for authenticating with the Firecrawl API.
- api_url (Optional[str]): Base URL for the Firecrawl API.
- """
- def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
- self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
- if self.api_key is None:
- logger.warning("No API key provided")
- raise ValueError('No API key provided')
- else:
- logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
-
- self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
- if self.api_url != 'https://api.firecrawl.dev':
- logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
+ Args:
+ api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+ api_url (Optional[str]): Base URL for the Firecrawl API.
+ version (str): API version, either 'v0' or 'v1'.
+ """
+ self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
+ self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+ self.version = version
+ if self.api_key is None:
+ logger.warning("No API key provided")
+ raise ValueError('No API key provided')
+ logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key} and version: {self.version}")
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
"""
@@ -75,9 +73,11 @@ class FirecrawlApp:
for key, value in params.items():
if key != 'extractorOptions':
scrape_params[key] = value
+
+ endpoint = f'/{self.version}/scrape'
# Make the POST request with the prepared headers and JSON data
response = requests.post(
- f'{self.api_url}/v0/scrape',
+ f'{self.api_url}{endpoint}',
headers=headers,
json=scrape_params,
)
@@ -104,6 +104,9 @@ class FirecrawlApp:
Raises:
Exception: If the search request fails.
"""
+ if self.version == 'v1':
+ raise NotImplementedError("Search is not supported in v1")
+
headers = self._prepare_headers()
json_data = {'query': query}
if params:
@@ -145,11 +148,12 @@ class FirecrawlApp:
Raises:
Exception: If the crawl job initiation or monitoring fails.
"""
+ endpoint = f'/{self.version}/crawl'
headers = self._prepare_headers(idempotency_key)
json_data = {'url': url}
if params:
json_data.update(params)
- response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
+ response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
job_id = response.json().get('jobId')
if wait_until_done:
@@ -172,13 +176,44 @@ class FirecrawlApp:
Raises:
Exception: If the status check request fails.
"""
+ endpoint = f'/{self.version}/crawl/status/{job_id}'
headers = self._prepare_headers()
- response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
+ response = self._get_request(f'{self.api_url}{endpoint}', headers)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, 'check crawl status')
+ def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+ """
+ Perform a map search using the Firecrawl API.
+ """
+ if self.version == 'v0':
+ raise NotImplementedError("Map is not supported in v0")
+
+ endpoint = f'/{self.version}/map'
+ headers = self._prepare_headers()
+
+ # Prepare the base scrape parameters with the URL
+ json_data = {'url': url}
+ if params:
+ json_data.update(params)
+
+ # Make the POST request with the prepared headers and JSON data
+ response = requests.post(
+ f'{self.api_url}{endpoint}',
+ headers=headers,
+ json=json_data,
+ )
+ if response.status_code == 200:
+ response = response.json()
+ if response['success'] and 'data' in response:
+ return response['data']
+ else:
+ raise Exception(f'Failed to map URL. Error: {response["error"]}')
+ else:
+ self._handle_error(response, 'map')
+
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
"""
Prepare the headers for API requests.