mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 03:29:01 +08:00
python sdk and tests
This commit is contained in:
parent
6cdf4c68ec
commit
0b8df5e264
75
apps/python-sdk/examplev0.py
Normal file
75
apps/python-sdk/examplev0.py
Normal file
@ -0,0 +1,75 @@
|
||||
"""Example usage of the Firecrawl Python SDK against the v0 API.

Demonstrates scraping, crawling (with an idempotency key), and LLM
extraction driven by either a pydantic model or a raw JSON schema.
"""

# Idiom fix: all imports belong at the top of the module; the pydantic and
# typing imports previously appeared mid-script.
import uuid
from typing import List

from pydantic import BaseModel, Field

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

# Scrape a website:
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])

# Crawl a website:
idempotency_key = str(uuid.uuid4())  # optional idempotency key
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
print(crawl_result)

# LLM Extraction:
# Define schema to extract contents into using pydantic
class ArticleSchema(BaseModel):
    # One Hacker News story: headline, score, submitter, and comments link.
    title: str
    points: int
    by: str
    commentsURL: str


class TopArticlesSchema(BaseModel):
    # The extraction target: at most five stories.
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")


llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})

print(llm_extraction_result['llm_extraction'])

# Define schema to extract contents into using json schema
json_schema = {
    "type": "object",
    "properties": {
        "top": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "points": {"type": "number"},
                    "by": {"type": "string"},
                    "commentsURL": {"type": "string"}
                },
                "required": ["title", "points", "by", "commentsURL"]
            },
            "minItems": 5,
            "maxItems": 5,
            "description": "Top 5 stories on Hacker News"
        }
    },
    "required": ["top"]
}

llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': json_schema,
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})

print(llm_extraction_result['llm_extraction'])
|
@ -20,31 +20,31 @@ FirecrawlApp = firecrawl.FirecrawlApp
|
||||
|
||||
def test_no_api_key():
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app = FirecrawlApp(api_url=API_URL)
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
|
||||
assert "No API key provided" in str(excinfo.value)
|
||||
|
||||
def test_scrape_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.scrape_url('https://firecrawl.dev')
|
||||
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_blocklisted_url():
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.scrape_url(blocklisted_url)
|
||||
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert "_Roast_" in response['content']
|
||||
|
||||
def test_scrape_url_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
@ -54,7 +54,7 @@ def test_scrape_url_e2e():
|
||||
assert "_Roast_" in response['content']
|
||||
|
||||
def test_successful_response_with_valid_api_key_and_include_html():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
@ -66,7 +66,7 @@ def test_successful_response_with_valid_api_key_and_include_html():
|
||||
assert "<h1" in response['html']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
@ -74,7 +74,7 @@ def test_successful_response_for_valid_scrape_with_pdf_file():
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
|
||||
time.sleep(6) # wait for 6 seconds
|
||||
assert response is not None
|
||||
@ -83,20 +83,20 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_crawl_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.crawl_url('https://firecrawl.dev')
|
||||
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_should_return_error_for_blocklisted_url():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
blocklisted_url = "https://twitter.com/fake-test"
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url(blocklisted_url)
|
||||
assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_crawl_url_wait_for_completion_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
|
||||
assert response is not None
|
||||
assert len(response) > 0
|
||||
@ -104,7 +104,7 @@ def test_crawl_url_wait_for_completion_e2e():
|
||||
assert "_Roast_" in response[0]['content']
|
||||
|
||||
def test_crawl_url_with_idempotency_key_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
uniqueIdempotencyKey = str(uuid4())
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
||||
assert response is not None
|
||||
@ -117,7 +117,7 @@ def test_crawl_url_with_idempotency_key_e2e():
|
||||
assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
|
||||
|
||||
def test_check_crawl_status_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
|
||||
assert response is not None
|
||||
assert 'jobId' in response
|
||||
@ -131,20 +131,20 @@ def test_check_crawl_status_e2e():
|
||||
assert len(status_response['data']) > 0
|
||||
|
||||
def test_search_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.search("test query")
|
||||
assert response is not None
|
||||
assert 'content' in response[0]
|
||||
assert len(response) > 2
|
||||
|
||||
def test_search_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.search("test query")
|
||||
assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_llm_extraction():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url("https://mendable.ai", {
|
||||
'extractorOptions': {
|
||||
'mode': 'llm-extraction',
|
||||
|
@ -0,0 +1,3 @@
|
||||
API_URL=http://localhost:3002
|
||||
ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
|
||||
TEST_API_KEY=fc-YOUR_API_KEY
|
Binary file not shown.
168
apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
Normal file
168
apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
Normal file
@ -0,0 +1,168 @@
|
||||
import importlib.util
import os
import time

import pytest
from uuid import uuid4
from dotenv import load_dotenv

# Pull TEST_API_KEY (and any other settings) from a local .env file.
load_dotenv()

API_URL = "http://127.0.0.1:3002"  # fix: removed stray trailing semicolon
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
TEST_API_KEY = os.getenv('TEST_API_KEY')

print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")

# Load the SDK module directly from its source file so these e2e tests run
# against the working tree rather than an installed package.
spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
firecrawl = importlib.util.module_from_spec(spec)
spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp
|
||||
|
||||
def test_no_api_key():
    """Constructing a FirecrawlApp without any API key must raise."""
    with pytest.raises(Exception) as err:
        FirecrawlApp(api_url=API_URL)
    assert "No API key provided" in str(err.value)
|
||||
|
||||
def test_scrape_url_invalid_api_key():
    """A bogus API key should surface a 401 from the scrape endpoint."""
    bad_client = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as err:
        bad_client.scrape_url('https://firecrawl.dev')
    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(err.value)
|
||||
|
||||
def test_blocklisted_url():
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.scrape_url(blocklisted_url)
|
||||
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token():
    """A preview token authorizes a scrape and returns page markdown.

    Consistency fix: the sibling v1 tests in this file assert that the v1
    scrape response carries 'markdown' and omits the v0-era 'content' key,
    but this test still asserted on 'content'; aligned with the v1 shape.
    """
    app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
    response = app.scrape_url('https://roastmywebsite.ai')
    assert response is not None
    assert 'markdown' in response
    assert "_Roast_" in response['markdown']
|
||||
|
||||
def test_scrape_url_e2e():
    """A v1 scrape returns markdown + metadata and omits v0-era fields."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    result = client.scrape_url('https://roastmywebsite.ai')
    assert result is not None
    assert 'markdown' in result
    assert 'metadata' in result
    # v0-style keys must not leak into the v1 response shape.
    assert 'content' not in result
    assert 'html' not in result
    assert "_Roast_" in result['markdown']
|
||||
|
||||
def test_successful_response_with_valid_api_key_and_include_html():
    """Requesting the 'html' format adds raw HTML alongside markdown."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    result = client.scrape_url('https://roastmywebsite.ai', {'formats': ['markdown', 'html']})
    assert result is not None
    assert 'content' not in result
    for key in ('markdown', 'html', 'metadata'):
        assert key in result
    assert "_Roast_" in result['markdown']
    assert "<h1" in result['html']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file():
    """Scraping a PDF URL returns its extracted text under 'markdown'.

    Bug fix: the final assertion read response['content'] immediately after
    asserting 'content' not in response — a guaranteed KeyError. In the v1
    response shape the extracted text lives under 'markdown'.
    """
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
    assert response is not None
    assert 'content' not in response
    assert 'markdown' in response
    assert 'metadata' in response
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
    """A PDF URL lacking the .pdf extension is still detected and extracted.

    Bug fix: the final assertion read response['content'] immediately after
    asserting 'content' not in response — a guaranteed KeyError. In the v1
    response shape the extracted text lives under 'markdown'.
    """
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
    time.sleep(6)  # wait for 6 seconds
    assert response is not None
    assert 'content' not in response
    assert 'markdown' in response
    assert 'metadata' in response
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']
|
||||
|
||||
def test_crawl_url_invalid_api_key():
    """Starting a crawl with a bad key surfaces a 401."""
    bad_client = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
    with pytest.raises(Exception) as err:
        bad_client.crawl_url('https://firecrawl.dev')
    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(err.value)
|
||||
|
||||
def test_should_return_error_for_blocklisted_url():
    """Crawling a blocklisted social-media URL must be rejected with a 403."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    with pytest.raises(Exception) as err:
        client.crawl_url("https://twitter.com/fake-test")
    assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(err.value)
|
||||
|
||||
def test_crawl_url_wait_for_completion_e2e():
    """A synchronous crawl returns a non-empty list of v1 page documents."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    pages = client.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
    assert pages is not None
    assert len(pages) > 0
    first = pages[0]
    assert 'content' not in first
    assert 'markdown' in first
    assert "_Roast_" in first['markdown']
|
||||
|
||||
def test_crawl_url_with_idempotency_key_e2e():
    """Reusing an idempotency key on a second crawl must raise a conflict.

    Consistency fix: the sibling v1 test (test_crawl_url_wait_for_completion_e2e)
    asserts crawl results expose 'markdown' and not the v0-era 'content' key,
    but this test still asserted on 'content'; aligned with the v1 shape.
    """
    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    uniqueIdempotencyKey = str(uuid4())
    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
    assert response is not None
    assert len(response) > 0
    assert 'markdown' in response[0]
    assert "_Roast_" in response[0]['markdown']

    # Second submission with the same key must be rejected by the server.
    with pytest.raises(Exception) as excinfo:
        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
|
||||
|
||||
def test_check_crawl_status_e2e():
    """An async crawl returns a jobId whose status eventually completes."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    job = client.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
    assert job is not None
    assert 'jobId' in job

    time.sleep(30)  # wait for 30 seconds
    status = client.check_crawl_status(job['jobId'])
    assert status is not None
    assert 'status' in status
    assert status['status'] == 'completed'
    assert 'data' in status
    assert len(status['data']) > 0
|
||||
|
||||
def test_search_e2e():
    """Search is a v0-only feature; a v1 client must refuse it."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    with pytest.raises(NotImplementedError) as err:
        client.search("test query")
    assert "Search is not supported in v1" in str(err.value)
|
||||
|
||||
def test_llm_extraction():
    """llm-extraction mode returns fields matching the supplied JSON schema."""
    client = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
    extractor_options = {
        'mode': 'llm-extraction',
        'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
        'extractionSchema': {
            'type': 'object',
            'properties': {
                'company_mission': {'type': 'string'},
                'supports_sso': {'type': 'boolean'},
                'is_open_source': {'type': 'boolean'}
            },
            'required': ['company_mission', 'supports_sso', 'is_open_source']
        }
    }
    result = client.scrape_url("https://mendable.ai", {'extractorOptions': extractor_options})
    assert result is not None
    assert 'llm_extraction' in result
    extracted = result['llm_extraction']
    assert 'company_mission' in extracted
    assert isinstance(extracted['supports_sso'], bool)
    assert isinstance(extracted['is_open_source'], bool)
|
||||
|
||||
def test_map_e2e():
    """map_url returns a list of discovered links for the site."""
    client = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
    links = client.map_url('https://roastmywebsite.ai')
    assert links is not None
    assert isinstance(links, list)
|
||||
|
@ -19,24 +19,22 @@ import requests
|
||||
logger : logging.Logger = logging.getLogger("firecrawl")
|
||||
|
||||
class FirecrawlApp:
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None, version: str = 'v1') -> None:
|
||||
"""
|
||||
Initialize the FirecrawlApp instance.
|
||||
Initialize the FirecrawlApp instance with API key, API URL, and version.
|
||||
|
||||
Args:
|
||||
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||
api_url (Optional[str]): Base URL for the Firecrawl API.
|
||||
version (str): API version, either 'v0' or 'v1'.
|
||||
"""
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
self.version = version
|
||||
if self.api_key is None:
|
||||
logger.warning("No API key provided")
|
||||
raise ValueError('No API key provided')
|
||||
else:
|
||||
logger.debug("Initialized FirecrawlApp with API key: %s", self.api_key)
|
||||
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
if self.api_url != 'https://api.firecrawl.dev':
|
||||
logger.debug("Initialized FirecrawlApp with API URL: %s", self.api_url)
|
||||
logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key} and version: {self.version}")
|
||||
|
||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
@ -75,9 +73,11 @@ class FirecrawlApp:
|
||||
for key, value in params.items():
|
||||
if key != 'extractorOptions':
|
||||
scrape_params[key] = value
|
||||
|
||||
endpoint = f'/{self.version}/scrape'
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
f'{self.api_url}/v0/scrape',
|
||||
f'{self.api_url}{endpoint}',
|
||||
headers=headers,
|
||||
json=scrape_params,
|
||||
)
|
||||
@ -104,6 +104,9 @@ class FirecrawlApp:
|
||||
Raises:
|
||||
Exception: If the search request fails.
|
||||
"""
|
||||
if self.version == 'v1':
|
||||
raise NotImplementedError("Search is not supported in v1")
|
||||
|
||||
headers = self._prepare_headers()
|
||||
json_data = {'query': query}
|
||||
if params:
|
||||
@ -145,11 +148,12 @@ class FirecrawlApp:
|
||||
Raises:
|
||||
Exception: If the crawl job initiation or monitoring fails.
|
||||
"""
|
||||
endpoint = f'/{self.version}/crawl'
|
||||
headers = self._prepare_headers(idempotency_key)
|
||||
json_data = {'url': url}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
|
||||
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
|
||||
if response.status_code == 200:
|
||||
job_id = response.json().get('jobId')
|
||||
if wait_until_done:
|
||||
@ -172,13 +176,44 @@ class FirecrawlApp:
|
||||
Raises:
|
||||
Exception: If the status check request fails.
|
||||
"""
|
||||
endpoint = f'/{self.version}/crawl/status/{job_id}'
|
||||
headers = self._prepare_headers()
|
||||
response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||
response = self._get_request(f'{self.api_url}{endpoint}', headers)
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
self._handle_error(response, 'check crawl status')
|
||||
|
||||
def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
Perform a map search using the Firecrawl API.
|
||||
"""
|
||||
if self.version == 'v0':
|
||||
raise NotImplementedError("Map is not supported in v0")
|
||||
|
||||
endpoint = f'/{self.version}/map'
|
||||
headers = self._prepare_headers()
|
||||
|
||||
# Prepare the base scrape parameters with the URL
|
||||
json_data = {'url': url}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
f'{self.api_url}{endpoint}',
|
||||
headers=headers,
|
||||
json=json_data,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to map URL. Error: {response["error"]}')
|
||||
else:
|
||||
self._handle_error(response, 'map')
|
||||
|
||||
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
||||
"""
|
||||
Prepare the headers for API requests.
|
||||
|
Loading…
x
Reference in New Issue
Block a user