Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-14 01:45:59 +08:00
Fix sdk/schemas (#1507)
* sdk-fix/schema-check
* version bump
* schema validation for extract and jsonOptions parameters
* Update firecrawl.py

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
This commit is contained in:
parent a0a1675829
commit 317fa43f9e
@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa

-__version__ = "2.5.0"
+__version__ = "2.5.1"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -29,7 +29,7 @@ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDoc
 warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
 warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
-warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")

 def get_version():
     try:
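For context, these filters suppress Pydantic's shadow warnings, which fire whenever a model declares a field literally named `json` or `schema` (both are attributes on `BaseModel`). A minimal sketch of the warning being silenced, assuming Pydantic v2 is installed; `JsonConfigLike` is a hypothetical stand-in for the SDK's `JsonConfig`:

```python
# Reproduces the warning the filters above suppress (assumes pydantic v2).
import warnings
from pydantic import BaseModel

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")

    class JsonConfigLike(BaseModel):  # hypothetical stand-in for JsonConfig
        schema: dict  # shadows BaseModel.schema -> UserWarning at class creation

print([str(w.message) for w in caught])
# ['Field name "schema" in "JsonConfigLike" shadows an attribute in parent "BaseModel"']
```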
@@ -529,14 +529,16 @@ class FirecrawlApp:
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
-        if extract:
-            if hasattr(extract.schema, 'schema'):
-                extract.schema = extract.schema.schema()
-            scrape_params['extract'] = extract.dict(exclude_none=True)
-        if json_options:
-            if hasattr(json_options.schema, 'schema'):
-                json_options.schema = json_options.schema.schema()
-            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+        if extract is not None:
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
+        if json_options is not None:
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if change_tracking_options:
@@ -544,6 +546,11 @@ class FirecrawlApp:

         scrape_params.update(kwargs)

+        if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
+            scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
+        if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
+            scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
+
         # Make request
         response = requests.post(
             f'{self.api_url}/v1/scrape',
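A hedged usage sketch of what the reworked branch enables: `json_options` (and likewise `extract`) can now carry a Pydantic model class, a plain JSON schema dict, or a `JsonConfig` object, and the schema always reaches the API as a dict. The URL and API key are placeholders, and the keyword arguments assume the v2.5.x `scrape_url` signature:

```python
from pydantic import BaseModel
from firecrawl import FirecrawlApp, JsonConfig

class Product(BaseModel):
    name: str
    price: float

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# A model class is normalized to a JSON schema dict before the request:
result = app.scrape_url(
    "https://example.com",
    formats=["json"],
    json_options=JsonConfig(schema=Product),
)

# A plain dict with a nested model class is normalized the same way:
result = app.scrape_url(
    "https://example.com",
    formats=["json"],
    json_options={"schema": Product},
)
```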
@@ -1252,13 +1259,15 @@ class FirecrawlApp:
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-            if hasattr(extract.schema, 'schema'):
-                extract.schema = extract.schema.schema()
-            scrape_params['extract'] = extract.dict(exclude_none=True)
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-            if hasattr(json_options.schema, 'schema'):
-                json_options.schema = json_options.schema.schema()
-            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -1273,6 +1282,11 @@ class FirecrawlApp:
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"

+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
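The same normalization runs on the batch path, so every URL in a batch shares one schema payload that is guaranteed to be a dict by the time the request is built. A hedged sketch, again with placeholder values and assuming the v2.5.x `batch_scrape_urls` signature:

```python
from pydantic import BaseModel
from firecrawl import FirecrawlApp

class Product(BaseModel):
    name: str
    price: float

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# The nested model class is converted to a dict by the catch-all pass above:
job = app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],
    formats=["json"],
    json_options={"schema": Product},
)
```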
@@ -1378,13 +1392,15 @@ class FirecrawlApp:
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-            if hasattr(extract.schema, 'schema'):
-                extract.schema = extract.schema.schema()
-            scrape_params['extract'] = extract.dict(exclude_none=True)
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-            if hasattr(json_options.schema, 'schema'):
-                json_options.schema = json_options.schema.schema()
-            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -1399,6 +1415,11 @@ class FirecrawlApp:
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"

+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1499,13 +1520,15 @@ class FirecrawlApp:
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-            if hasattr(extract.schema, 'schema'):
-                extract.schema = extract.schema.schema()
-            scrape_params['extract'] = extract.dict(exclude_none=True)
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-            if hasattr(json_options.schema, 'schema'):
-                json_options.schema = json_options.schema.schema()
-            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -1520,6 +1543,11 @@ class FirecrawlApp:
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"

+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1606,7 +1634,7 @@ class FirecrawlApp:
             id (str): The ID of the crawl job.

         Returns:
-            CrawlErrorsResponse: A response containing:
+            CrawlErrorsResponse containing:
             * errors (List[Dict[str, str]]): List of errors with fields:
                 * id (str): Error ID
                 * timestamp (str): When the error occurred
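A hedged sketch of reading the documented fields; the enclosing method name is not visible in this hunk, so `check_crawl_errors` is an assumption based on the `CrawlErrorsResponse` return type:

```python
# Assumes `app` is a configured FirecrawlApp and that this docstring belongs
# to check_crawl_errors() (the method name is not shown in the hunk).
errors_response = app.check_crawl_errors("crawl-job-id")  # placeholder job id
for error in errors_response.errors:
    print(error["id"], error["timestamp"])
```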
@@ -1669,10 +1697,7 @@ class FirecrawlApp:
             raise ValueError("Either urls or prompt is required")

         if schema:
-            if hasattr(schema, 'model_json_schema'):
-                # Convert Pydantic model to JSON schema
-                schema = schema.model_json_schema()
-            # Otherwise assume it's already a JSON schema dict
+            schema = self._ensure_schema_dict(schema)

         request_data = {
             'urls': urls or [],
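With the hand-rolled branch collapsed onto `_ensure_schema_dict`, `extract()` accepts either a Pydantic model class or a ready-made JSON schema dict. A hedged sketch with placeholder values:

```python
from pydantic import BaseModel
from firecrawl import FirecrawlApp

class Product(BaseModel):
    name: str
    price: float

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# Both calls send the same JSON schema dict to the API:
data = app.extract(
    urls=["https://example.com"],
    prompt="Extract the product name and price",
    schema=Product,
)
data = app.extract(
    urls=["https://example.com"],
    prompt="Extract the product name and price",
    schema=Product.model_json_schema(),
)
```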
@@ -1801,10 +1826,7 @@ class FirecrawlApp:

         schema = schema
         if schema:
-            if hasattr(schema, 'model_json_schema'):
-                # Convert Pydantic model to JSON schema
-                schema = schema.model_json_schema()
-            # Otherwise assume it's already a JSON schema dict
+            schema = self._ensure_schema_dict(schema)

         request_data = {
             'urls': urls,
@@ -2467,6 +2489,24 @@ class FirecrawlApp:
         # Additional type validation can be added here if needed
         # For now, we rely on Pydantic models for detailed type validation

+    def _ensure_schema_dict(self, schema):
+        """
+        Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
+        """
+        if schema is None:
+            return schema
+        if isinstance(schema, type):
+            # Pydantic v1/v2 model class
+            if hasattr(schema, 'model_json_schema'):
+                return schema.model_json_schema()
+            elif hasattr(schema, 'schema'):
+                return schema.schema()
+        if isinstance(schema, dict):
+            return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
+        if isinstance(schema, (list, tuple)):
+            return [self._ensure_schema_dict(v) for v in schema]
+        return schema
+
 class CrawlWatcher:
     """
     A class to watch and handle crawl job events via WebSocket connection.
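To see what the new helper added above does, here is a standalone copy of its logic as a free function (the real one is a `FirecrawlApp` method), runnable without an API key:

```python
from pydantic import BaseModel

def ensure_schema_dict(schema):
    """Free-function copy of FirecrawlApp._ensure_schema_dict for illustration."""
    if schema is None:
        return schema
    if isinstance(schema, type):
        if hasattr(schema, 'model_json_schema'):  # Pydantic v2 model class
            return schema.model_json_schema()
        elif hasattr(schema, 'schema'):           # Pydantic v1 model class
            return schema.schema()
    if isinstance(schema, dict):                  # recurse into dict values
        return {k: ensure_schema_dict(v) for k, v in schema.items()}
    if isinstance(schema, (list, tuple)):         # recurse into sequences
        return [ensure_schema_dict(v) for v in schema]
    return schema

class Product(BaseModel):
    name: str
    price: float

# A model class nested inside a composite schema is converted in place:
print(ensure_schema_dict({"anyOf": [Product, {"type": "null"}]}))
```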
@@ -2873,19 +2913,24 @@ class AsyncFirecrawlApp(FirecrawlApp):
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
-        if extract:
-            extract_dict = extract.dict(exclude_none=True)
-            if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
-                extract_dict['schema'] = extract.schema.schema()  # Ensure pydantic model schema is converted
-            scrape_params['extract'] = extract_dict
-        if json_options:
-            json_options_dict = json_options.dict(exclude_none=True)
-            if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
-                json_options_dict['schema'] = json_options.schema.schema()  # Ensure pydantic model schema is converted
-            scrape_params['jsonOptions'] = json_options_dict
+        if extract is not None:
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
+        if json_options is not None:
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]

+        if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
+            scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
+        if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
+            scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
+
         # Make async request
         endpoint = f'/v1/scrape'
         response = await self._async_post_request(
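The async client now mirrors the sync path instead of pre-serializing with `.dict()` and patching the schema afterwards. A hedged sketch, with placeholder key and URL:

```python
import asyncio
from pydantic import BaseModel
from firecrawl import AsyncFirecrawlApp

class Product(BaseModel):
    name: str
    price: float

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    result = await app.scrape_url(
        "https://example.com",
        formats=["json"],
        json_options={"schema": Product},  # normalized exactly like the sync client
    )

asyncio.run(main())
```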
@@ -2996,13 +3041,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-            if hasattr(extract.schema, 'schema'):
-                extract.schema = extract.schema.schema()
-            scrape_params['extract'] = extract.dict(exclude_none=True)
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-            if hasattr(json_options.schema, 'schema'):
-                json_options.schema = json_options.schema.schema()
-            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -3017,6 +3064,11 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"

+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = await self._async_post_request(
@@ -3127,13 +3179,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
         if proxy is not None:
             scrape_params['proxy'] = proxy
         if extract is not None:
-            if hasattr(extract.schema, 'schema'):
-                extract.schema = extract.schema.schema()
-            scrape_params['extract'] = extract.dict(exclude_none=True)
+            extract = self._ensure_schema_dict(extract)
+            if isinstance(extract, dict) and "schema" in extract:
+                extract["schema"] = self._ensure_schema_dict(extract["schema"])
+            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
         if json_options is not None:
-            if hasattr(json_options.schema, 'schema'):
-                json_options.schema = json_options.schema.schema()
-            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+            json_options = self._ensure_schema_dict(json_options)
+            if isinstance(json_options, dict) and "schema" in json_options:
+                json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
+            scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
         if actions is not None:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
@@ -3148,6 +3202,11 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params_dict['urls'] = urls
         params_dict['origin'] = f"python-sdk@{version}"

+        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
+            params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
+        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
+            params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = await self._async_post_request(
@@ -3605,10 +3664,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             raise ValueError("Either urls or prompt is required")

         if schema:
-            if hasattr(schema, 'model_json_schema'):
-                # Convert Pydantic model to JSON schema
-                schema = schema.model_json_schema()
-            # Otherwise assume it's already a JSON schema dict
+            schema = self._ensure_schema_dict(schema)

         request_data = {
             'urls': urls or [],
@@ -3862,8 +3918,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             raise ValueError("Either urls or prompt is required")

         if schema:
-            if hasattr(schema, 'model_json_schema'):
-                schema = schema.model_json_schema()
+            schema = self._ensure_schema_dict(schema)

         request_data = ExtractResponse(
             urls=urls or [],