diff --git a/apps/python-sdk/examplev0.py b/apps/python-sdk/examplev0.py
new file mode 100644
index 00000000..d80fa795
--- /dev/null
+++ b/apps/python-sdk/examplev0.py
@@ -0,0 +1,75 @@
+import uuid
+from firecrawl.firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version='v0')  # pin v0: the client defaults to v1
+
+# Scrape a website:
+scrape_result = app.scrape_url('firecrawl.dev')
+print(scrape_result['markdown'])
+
+# Crawl a website:
+idempotency_key = str(uuid.uuid4()) # optional idempotency key
+crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
+print(crawl_result)
+
+# LLM Extraction:
+# Define schema to extract contents into using pydantic
+from pydantic import BaseModel, Field
+from typing import List
+
+class ArticleSchema(BaseModel):
+    title: str
+    points: int
+    by: str
+    commentsURL: str
+
+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': TopArticlesSchema.model_json_schema(),
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})
+
+print(llm_extraction_result['llm_extraction'])
+
+# Define schema to extract contents into using json schema
+json_schema = {
+    "type": "object",
+    "properties": {
+        "top": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "title": {"type": "string"},
+                    "points": {"type": "number"},
+                    "by": {"type": "string"},
+                    "commentsURL": {"type": "string"}
+                },
+                "required": ["title", "points", "by", "commentsURL"]
+            },
+            "minItems": 5,
+            "maxItems": 5,
+            "description": "Top 5 stories on Hacker News"
+        }
+    },
+    "required": ["top"]
+}
+
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': json_schema,
+        'mode': 'llm-extraction'
+    },
+    'pageOptions':{
+        'onlyMainContent': True
+    }
+})
+
+print(llm_extraction_result['llm_extraction']) \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py index 452d4982..457c206a 100644 --- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py @@ -20,31 +20,31 @@ FirecrawlApp = firecrawl.FirecrawlApp def test_no_api_key(): with pytest.raises(Exception) as excinfo: - invalid_app = FirecrawlApp(api_url=API_URL) + invalid_app = FirecrawlApp(api_url=API_URL, version='v0') assert "No API key provided" in str(excinfo.value) def test_scrape_url_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0') with pytest.raises(Exception) as excinfo: invalid_app.scrape_url('https://firecrawl.dev') assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) def test_blocklisted_url(): blocklisted_url = "https://facebook.com/fake-test" - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') with pytest.raises(Exception) as excinfo: app.scrape_url(blocklisted_url) assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." 
in str(excinfo.value) def test_successful_response_with_valid_preview_token(): - app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0') response = app.scrape_url('https://roastmywebsite.ai') assert response is not None assert 'content' in response assert "_Roast_" in response['content'] def test_scrape_url_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai') assert response is not None assert 'content' in response @@ -54,7 +54,7 @@ def test_scrape_url_e2e(): assert "_Roast_" in response['content'] def test_successful_response_with_valid_api_key_and_include_html(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}}) assert response is not None assert 'content' in response @@ -66,7 +66,7 @@ def test_successful_response_with_valid_api_key_and_include_html(): assert " 0 @@ -104,7 +104,7 @@ def test_crawl_url_wait_for_completion_e2e(): assert "_Roast_" in response[0]['content'] def test_crawl_url_with_idempotency_key_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') uniqueIdempotencyKey = str(uuid4()) response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) assert response is not None @@ -117,7 +117,7 @@ def test_crawl_url_with_idempotency_key_e2e(): assert "Conflict: Failed to start crawl job due to a conflict. 
Idempotency key already used" in str(excinfo.value) def test_check_crawl_status_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False) assert response is not None assert 'jobId' in response @@ -131,20 +131,20 @@ def test_check_crawl_status_e2e(): assert len(status_response['data']) > 0 def test_search_e2e(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.search("test query") assert response is not None assert 'content' in response[0] assert len(response) > 2 def test_search_invalid_api_key(): - invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0') with pytest.raises(Exception) as excinfo: invalid_app.search("test query") assert "Unexpected error during search: Status code 401. 
Unauthorized: Invalid token" in str(excinfo.value) def test_llm_extraction(): - app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0') response = app.scrape_url("https://mendable.ai", { 'extractorOptions': { 'mode': 'llm-extraction', diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example new file mode 100644 index 00000000..904887bf --- /dev/null +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/.env.example @@ -0,0 +1,3 @@ +API_URL=http://localhost:3002 +ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py +TEST_API_KEY=fc-YOUR_API_KEY \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc new file mode 100644 index 00000000..5ba1f132 Binary files /dev/null and b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/__pycache__/test.cpython-311-pytest-8.2.1.pyc differ diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py new file mode 100644 index 00000000..517d8cf9 --- /dev/null +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py @@ -0,0 +1,168 @@ +import importlib.util +import pytest +import time +import os +from uuid import uuid4 +from dotenv import load_dotenv + +load_dotenv() + +API_URL = "http://127.0.0.1:3002"; +ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py" +TEST_API_KEY = os.getenv('TEST_API_KEY') + +print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}") + +spec = 
importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH) +firecrawl = importlib.util.module_from_spec(spec) +spec.loader.exec_module(firecrawl) +FirecrawlApp = firecrawl.FirecrawlApp + +def test_no_api_key(): + with pytest.raises(Exception) as excinfo: + invalid_app = FirecrawlApp(api_url=API_URL) + assert "No API key provided" in str(excinfo.value) + +def test_scrape_url_invalid_api_key(): + invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as excinfo: + invalid_app.scrape_url('https://firecrawl.dev') + assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value) + +def test_blocklisted_url(): + blocklisted_url = "https://facebook.com/fake-test" + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + with pytest.raises(Exception) as excinfo: + app.scrape_url(blocklisted_url) + assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." 
in str(excinfo.value) + +def test_successful_response_with_valid_preview_token(): + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + response = app.scrape_url('https://roastmywebsite.ai') + assert response is not None + assert 'content' in response + assert "_Roast_" in response['content'] + +def test_scrape_url_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url('https://roastmywebsite.ai') + assert response is not None + assert 'content' not in response + assert 'markdown' in response + assert 'metadata' in response + assert 'html' not in response + assert "_Roast_" in response['markdown'] + +def test_successful_response_with_valid_api_key_and_include_html(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url('https://roastmywebsite.ai', { 'formats': [ 'markdown', 'html' ]}) + assert response is not None + assert 'content' not in response + assert 'markdown' in response + assert 'html' in response + assert 'metadata' in response + assert "_Roast_" in response['markdown'] + assert " 0 + assert 'content' not in response[0] + assert 'markdown' in response[0] + assert "_Roast_" in response[0]['markdown'] + +def test_crawl_url_with_idempotency_key_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + uniqueIdempotencyKey = str(uuid4()) + response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) + assert response is not None + assert len(response) > 0 + assert 'content' in response[0] + assert "_Roast_" in response[0]['content'] + + with pytest.raises(Exception) as excinfo: + app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey) + assert "Conflict: Failed to start crawl job due to a conflict. 
Idempotency key already used" in str(excinfo.value) + +def test_check_crawl_status_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False) + assert response is not None + assert 'jobId' in response + + time.sleep(30) # wait for 30 seconds + status_response = app.check_crawl_status(response['jobId']) + assert status_response is not None + assert 'status' in status_response + assert status_response['status'] == 'completed' + assert 'data' in status_response + assert len(status_response['data']) > 0 + +def test_search_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + with pytest.raises(NotImplementedError) as excinfo: + app.search("test query") + assert "Search is not supported in v1" in str(excinfo.value) + +def test_llm_extraction(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.scrape_url("https://mendable.ai", { + 'extractorOptions': { + 'mode': 'llm-extraction', + 'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source", + 'extractionSchema': { + 'type': 'object', + 'properties': { + 'company_mission': {'type': 'string'}, + 'supports_sso': {'type': 'boolean'}, + 'is_open_source': {'type': 'boolean'} + }, + 'required': ['company_mission', 'supports_sso', 'is_open_source'] + } + } + }) + assert response is not None + assert 'llm_extraction' in response + llm_extraction = response['llm_extraction'] + assert 'company_mission' in llm_extraction + assert isinstance(llm_extraction['supports_sso'], bool) + assert isinstance(llm_extraction['is_open_source'], bool) + +def test_map_e2e(): + app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token") + response = app.map_url('https://roastmywebsite.ai') + assert response is not None + assert isinstance(response, list) + \ No newline at end of file 
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 7ec0d33f..25c9663e 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -19,24 +19,26 @@ import requests
 logger : logging.Logger = logging.getLogger("firecrawl")
 
 class FirecrawlApp:
-    """
-    Initialize the FirecrawlApp instance.
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None:
+        """
+        Initialize the FirecrawlApp instance with API key, API URL, and version.
 
-    Args:
-        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
-        api_url (Optional[str]): Base URL for the Firecrawl API.
-    """
-    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
-        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
-        if self.api_key is None:
-            logger.warning("No API key provided")
-            raise ValueError('No API key provided')
-        else:
-            logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
-
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
-        if self.api_url != 'https://api.firecrawl.dev':
-            logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
+        Args:
+            api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+            api_url (Optional[str]): Base URL for the Firecrawl API.
+            version (str): API version, either 'v0' or 'v1'.
+
+        Raises:
+            ValueError: If no API key is passed and FIRECRAWL_API_KEY is not set.
+        """
+        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+        self.version = version
+        if self.api_key is None:
+            logger.warning("No API key provided")
+            raise ValueError('No API key provided')
+        # Never write the API key itself to the logs; the version is enough for debugging.
+        logger.debug("Initialized FirecrawlApp with API version: %s", self.version)
 
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         """
@@ -75,9 +77,11 @@ class FirecrawlApp:
             for key, value in params.items():
                 if key != 'extractorOptions':
                     scrape_params[key] = value
+
+        endpoint = f'/{self.version}/scrape'
         # Make the POST request with the prepared headers and JSON data
         response = requests.post(
-            f'{self.api_url}/v0/scrape',
+            f'{self.api_url}{endpoint}',
             headers=headers,
             json=scrape_params,
         )
@@ -104,6 +108,9 @@ class FirecrawlApp:
         Raises:
             Exception: If the search request fails.
         """
+        if self.version == 'v1':
+            raise NotImplementedError("Search is not supported in v1")
+
         headers = self._prepare_headers()
         json_data = {'query': query}
         if params:
@@ -145,11 +152,12 @@ class FirecrawlApp:
         Raises:
             Exception: If the crawl job initiation or monitoring fails.
         """
+        endpoint = f'/{self.version}/crawl'
         headers = self._prepare_headers(idempotency_key)
         json_data = {'url': url}
         if params:
             json_data.update(params)
-        response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
+        response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
         if response.status_code == 200:
             job_id = response.json().get('jobId')
             if wait_until_done:
@@ -172,13 +180,55 @@ class FirecrawlApp:
         Raises:
             Exception: If the status check request fails.
         """
+        endpoint = f'/{self.version}/crawl/status/{job_id}'
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
+        response = self._get_request(f'{self.api_url}{endpoint}', headers)
         if response.status_code == 200:
             return response.json()
         else:
             self._handle_error(response, 'check crawl status')
 
+    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Send a map request for a URL to the Firecrawl API.
+
+        Args:
+            url (str): The URL to map.
+            params (Optional[Dict[str, Any]]): Additional parameters merged into the request body.
+
+        Returns:
+            Any: The 'data' field of a successful map response.
+
+        Raises:
+            NotImplementedError: If the client was created with version 'v0'.
+            Exception: If the request fails or the response is not a successful map result.
+        """
+        if self.version == 'v0':
+            raise NotImplementedError("Map is not supported in v0")
+
+        endpoint = f'/{self.version}/map'
+        headers = self._prepare_headers()
+
+        # Prepare the base map parameters with the URL
+        json_data = {'url': url}
+        if params:
+            json_data.update(params)
+
+        # Make the POST request with the prepared headers and JSON data
+        response = requests.post(
+            f'{self.api_url}{endpoint}',
+            headers=headers,
+            json=json_data,
+        )
+        if response.status_code == 200:
+            payload = response.json()
+            if payload.get('success') and 'data' in payload:
+                return payload['data']
+            # .get() keeps a body without 'success'/'error' from surfacing as a bare KeyError
+            raise Exception(f'Failed to map URL. Error: {payload.get("error", "Unknown error")}')
+        else:
+            self._handle_error(response, 'map')
+
     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.