Fix sdk/schemas (#1507)

* sdk-fix/schema-check

* version bump

* schema validation for extract and jsonOptions parameters

* Update firecrawl.py

---------

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
Rafael Miller authored 2025-04-29 12:19:08 -03:00; committed by GitHub
parent a0a1675829
commit 317fa43f9e
2 changed files with 120 additions and 65 deletions
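In practical terms, the fix means a JSON schema may be supplied either as a plain dict or as a Pydantic model class; the SDK now normalizes both forms before building the request. A minimal sketch of the intended usage, assuming the JsonConfig import shown in the first hunk below (the Product model and API key are illustrative):

from pydantic import BaseModel
from firecrawl import FirecrawlApp, JsonConfig

class Product(BaseModel):
    name: str
    price: float

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # illustrative key

# Both forms should now yield the same request payload:
config_from_model = JsonConfig(schema=Product)                      # model class
config_from_dict = JsonConfig(schema=Product.model_json_schema())   # plain dict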

firecrawl/__init__.py

@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
__version__ = "2.5.0"
__version__ = "2.5.1"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

firecrawl/firecrawl.py

@@ -29,7 +29,7 @@ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDoc
warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
def get_version():
try:
@@ -529,14 +529,16 @@ class FirecrawlApp:
scrape_params['blockAds'] = block_ads
if proxy:
scrape_params['proxy'] = proxy
if extract:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
if json_options:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
if extract is not None:
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if change_tracking_options:
@@ -544,6 +546,11 @@ class FirecrawlApp:
scrape_params.update(kwargs)
if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
# Make request
response = requests.post(
f'{self.api_url}/v1/scrape',
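Note the trailing normalization pass over scrape_params: even when json_options arrives as a raw dict whose "schema" value is still a Pydantic model class, the schema is converted before the POST. A hedged usage sketch (the scrape_url parameters follow this hunk; the URL and model are illustrative):

from pydantic import BaseModel
from firecrawl import FirecrawlApp

class Article(BaseModel):
    title: str
    author: str

app = FirecrawlApp(api_key="fc-YOUR-KEY")
result = app.scrape_url(
    "https://example.com",
    formats=["json"],
    json_options={"schema": Article},  # model class nested in a plain dict
)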
@@ -1252,13 +1259,15 @@ class FirecrawlApp:
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -1273,6 +1282,11 @@ class FirecrawlApp:
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1378,13 +1392,15 @@ class FirecrawlApp:
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -1399,6 +1415,11 @@ class FirecrawlApp:
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1499,13 +1520,15 @@ class FirecrawlApp:
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -1520,6 +1543,11 @@ class FirecrawlApp:
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1606,7 +1634,7 @@ class FirecrawlApp:
id (str): The ID of the crawl job.
Returns:
CrawlErrorsResponse: A response containing:
CrawlErrorsResponse containing:
* errors (List[Dict[str, str]]): List of errors with fields:
* id (str): Error ID
* timestamp (str): When the error occurred
@@ -1669,10 +1697,7 @@ class FirecrawlApp:
raise ValueError("Either urls or prompt is required")
if schema:
if hasattr(schema, 'model_json_schema'):
# Convert Pydantic model to JSON schema
schema = schema.model_json_schema()
# Otherwise assume it's already a JSON schema dict
schema = self._ensure_schema_dict(schema)
request_data = {
'urls': urls or [],
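The extract() path gets the same treatment, so a model class can be handed directly to the schema argument. A small sketch, assuming the urls/prompt/schema parameters visible in this hunk (model, prompt, and URL are illustrative):

from pydantic import BaseModel
from firecrawl import FirecrawlApp

class CompanyInfo(BaseModel):
    mission: str

app = FirecrawlApp(api_key="fc-YOUR-KEY")
res = app.extract(
    urls=["https://example.com"],
    prompt="Extract the company mission",
    schema=CompanyInfo,  # normalized via _ensure_schema_dict before the request
)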
@@ -1801,10 +1826,7 @@ class FirecrawlApp:
schema = schema
if schema:
if hasattr(schema, 'model_json_schema'):
# Convert Pydantic model to JSON schema
schema = schema.model_json_schema()
# Otherwise assume it's already a JSON schema dict
schema = self._ensure_schema_dict(schema)
request_data = {
'urls': urls,
@@ -2467,6 +2489,24 @@ class FirecrawlApp:
# Additional type validation can be added here if needed
# For now, we rely on Pydantic models for detailed type validation
def _ensure_schema_dict(self, schema):
"""
Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
"""
if schema is None:
return schema
if isinstance(schema, type):
# Pydantic v1/v2 model class
if hasattr(schema, 'model_json_schema'):
return schema.model_json_schema()
elif hasattr(schema, 'schema'):
return schema.schema()
if isinstance(schema, dict):
return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
if isinstance(schema, (list, tuple)):
return [self._ensure_schema_dict(v) for v in schema]
return schema
class CrawlWatcher:
"""
A class to watch and handle crawl job events via WebSocket connection.
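For reference, the new helper can be exercised on its own; the sketch below mirrors the method body above as a free function, with an illustrative Product model:

from pydantic import BaseModel

class Product(BaseModel):
    name: str

def ensure_schema_dict(schema):
    # Mirrors FirecrawlApp._ensure_schema_dict: convert Pydantic model
    # classes (v1 or v2) to JSON schema dicts, recursing into dicts and lists.
    if schema is None:
        return schema
    if isinstance(schema, type):
        if hasattr(schema, 'model_json_schema'):  # Pydantic v2
            return schema.model_json_schema()
        elif hasattr(schema, 'schema'):           # Pydantic v1
            return schema.schema()
    if isinstance(schema, dict):
        return {k: ensure_schema_dict(v) for k, v in schema.items()}
    if isinstance(schema, (list, tuple)):
        return [ensure_schema_dict(v) for v in schema]
    return schema

assert ensure_schema_dict(Product) == Product.model_json_schema()
assert ensure_schema_dict({"schema": Product})["schema"]["title"] == "Product"
assert ensure_schema_dict({"plain": "dict"}) == {"plain": "dict"}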
@@ -2873,19 +2913,24 @@ class AsyncFirecrawlApp(FirecrawlApp):
scrape_params['blockAds'] = block_ads
if proxy:
scrape_params['proxy'] = proxy
if extract:
extract_dict = extract.dict(exclude_none=True)
if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
scrape_params['extract'] = extract_dict
if json_options:
json_options_dict = json_options.dict(exclude_none=True)
if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
scrape_params['jsonOptions'] = json_options_dict
if extract is not None:
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
# Make async request
endpoint = f'/v1/scrape'
response = await self._async_post_request(
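The async client applies the identical normalization, so usage mirrors the sync path. A hedged sketch (AsyncFirecrawlApp is imported from its defining module here, since a top-level re-export is not shown in this diff; model and URL are illustrative):

import asyncio
from pydantic import BaseModel
from firecrawl.firecrawl import AsyncFirecrawlApp

class Page(BaseModel):
    headline: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
    result = await app.scrape_url(
        "https://example.com",
        formats=["json"],
        json_options={"schema": Page},  # model class, normalized before POST
    )

asyncio.run(main())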
@@ -2996,13 +3041,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -3017,6 +3064,11 @@ class AsyncFirecrawlApp(FirecrawlApp):
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = await self._async_post_request(
@@ -3127,13 +3179,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -3148,6 +3202,11 @@ class AsyncFirecrawlApp(FirecrawlApp):
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = await self._async_post_request(
@@ -3605,10 +3664,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
raise ValueError("Either urls or prompt is required")
if schema:
if hasattr(schema, 'model_json_schema'):
# Convert Pydantic model to JSON schema
schema = schema.model_json_schema()
# Otherwise assume it's already a JSON schema dict
schema = self._ensure_schema_dict(schema)
request_data = {
'urls': urls or [],
@@ -3862,8 +3918,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
raise ValueError("Either urls or prompt is required")
if schema:
if hasattr(schema, 'model_json_schema'):
schema = schema.model_json_schema()
schema = self._ensure_schema_dict(schema)
request_data = ExtractResponse(
urls=urls or [],