diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 94cc269b..6a4c5d61 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa -__version__ = "2.5.0" +__version__ = "2.5.1" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 8e002194..fbafe10a 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -29,7 +29,7 @@ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDoc warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"") warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"") warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"") - +warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"") def get_version(): try: @@ -529,14 +529,16 @@ class FirecrawlApp: scrape_params['blockAds'] = block_ads if proxy: scrape_params['proxy'] = proxy - if extract: - if hasattr(extract.schema, 'schema'): - extract.schema = extract.schema.schema() - scrape_params['extract'] = extract.dict(exclude_none=True) - if json_options: - if hasattr(json_options.schema, 'schema'): - json_options.schema = json_options.schema.schema() - scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if extract is not None: + extract = self._ensure_schema_dict(extract) + if isinstance(extract, dict) and "schema" in extract: + extract["schema"] = self._ensure_schema_dict(extract["schema"]) + scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True) + if json_options is not None: + json_options = self._ensure_schema_dict(json_options) + if isinstance(json_options, dict) and "schema" in json_options: + json_options["schema"] = self._ensure_schema_dict(json_options["schema"]) + scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True) if actions: scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] if change_tracking_options: @@ -544,6 +546,11 @@ class FirecrawlApp: scrape_params.update(kwargs) + if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']: + scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema']) + if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']: + scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema']) + # Make request response = requests.post( f'{self.api_url}/v1/scrape', @@ -1252,13 +1259,15 @@ class FirecrawlApp: if proxy is not None: scrape_params['proxy'] = proxy if extract is not None: - if hasattr(extract.schema, 'schema'): - extract.schema = extract.schema.schema() - scrape_params['extract'] = extract.dict(exclude_none=True) + extract = self._ensure_schema_dict(extract) + if isinstance(extract, dict) and "schema" in extract: + extract["schema"] = self._ensure_schema_dict(extract["schema"]) + scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True) if json_options is not None: - if hasattr(json_options.schema, 'schema'): - json_options.schema = json_options.schema.schema() - scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + json_options = self._ensure_schema_dict(json_options) + if isinstance(json_options, dict) and "schema" in json_options: + json_options["schema"] = self._ensure_schema_dict(json_options["schema"]) + scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True) if actions is not None: scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] if agent is not None: @@ -1273,6 +1282,11 @@ class FirecrawlApp: params_dict['urls'] = urls params_dict['origin'] = f"python-sdk@{version}" + if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']: + params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema']) + if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']: + params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema']) + # Make request headers = self._prepare_headers(idempotency_key) response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers) @@ -1378,13 +1392,15 @@ class FirecrawlApp: if proxy is not None: scrape_params['proxy'] = proxy if extract is not None: - if hasattr(extract.schema, 'schema'): - extract.schema = extract.schema.schema() - scrape_params['extract'] = extract.dict(exclude_none=True) + extract = self._ensure_schema_dict(extract) + if isinstance(extract, dict) and "schema" in extract: + extract["schema"] = self._ensure_schema_dict(extract["schema"]) + scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True) if json_options is not None: - if hasattr(json_options.schema, 'schema'): - json_options.schema = json_options.schema.schema() - scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + json_options = self._ensure_schema_dict(json_options) + if isinstance(json_options, dict) and "schema" in json_options: + json_options["schema"] = self._ensure_schema_dict(json_options["schema"]) + scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True) if actions is not None: scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] if agent is not None: @@ -1399,6 +1415,11 @@ class FirecrawlApp: params_dict['urls'] = urls params_dict['origin'] = f"python-sdk@{version}" + if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']: + params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema']) + if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']: + params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema']) + # Make request headers = self._prepare_headers(idempotency_key) response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers) @@ -1499,13 +1520,15 @@ class FirecrawlApp: if proxy is not None: scrape_params['proxy'] = proxy if extract is not None: - if hasattr(extract.schema, 'schema'): - extract.schema = extract.schema.schema() - scrape_params['extract'] = extract.dict(exclude_none=True) + extract = self._ensure_schema_dict(extract) + if isinstance(extract, dict) and "schema" in extract: + extract["schema"] = self._ensure_schema_dict(extract["schema"]) + scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True) if json_options is not None: - if hasattr(json_options.schema, 'schema'): - json_options.schema = json_options.schema.schema() - scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + json_options = self._ensure_schema_dict(json_options) + if isinstance(json_options, dict) and "schema" in json_options: + json_options["schema"] = self._ensure_schema_dict(json_options["schema"]) + scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True) if actions is not None: scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] if agent is not None: @@ -1520,6 +1543,11 @@ class FirecrawlApp: params_dict['urls'] = urls params_dict['origin'] = f"python-sdk@{version}" + if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']: + params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema']) + if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']: + params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema']) + # Make request headers = self._prepare_headers(idempotency_key) response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers) @@ -1606,7 +1634,7 @@ class FirecrawlApp: id (str): The ID of the crawl job. Returns: - CrawlErrorsResponse: A response containing: + CrawlErrorsResponse containing: * errors (List[Dict[str, str]]): List of errors with fields: * id (str): Error ID * timestamp (str): When the error occurred @@ -1669,10 +1697,7 @@ class FirecrawlApp: raise ValueError("Either urls or prompt is required") if schema: - if hasattr(schema, 'model_json_schema'): - # Convert Pydantic model to JSON schema - schema = schema.model_json_schema() - # Otherwise assume it's already a JSON schema dict + schema = self._ensure_schema_dict(schema) request_data = { 'urls': urls or [], @@ -1801,10 +1826,7 @@ class FirecrawlApp: schema = schema if schema: - if hasattr(schema, 'model_json_schema'): - # Convert Pydantic model to JSON schema - schema = schema.model_json_schema() - # Otherwise assume it's already a JSON schema dict + schema = self._ensure_schema_dict(schema) request_data = { 'urls': urls, @@ -2467,6 +2489,24 @@ class FirecrawlApp: # Additional type validation can be added here if needed # For now, we rely on Pydantic models for detailed type validation + def _ensure_schema_dict(self, schema): + """ + Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists. + """ + if schema is None: + return schema + if isinstance(schema, type): + # Pydantic v1/v2 model class + if hasattr(schema, 'model_json_schema'): + return schema.model_json_schema() + elif hasattr(schema, 'schema'): + return schema.schema() + if isinstance(schema, dict): + return {k: self._ensure_schema_dict(v) for k, v in schema.items()} + if isinstance(schema, (list, tuple)): + return [self._ensure_schema_dict(v) for v in schema] + return schema + class CrawlWatcher: """ A class to watch and handle crawl job events via WebSocket connection. @@ -2873,19 +2913,24 @@ class AsyncFirecrawlApp(FirecrawlApp): scrape_params['blockAds'] = block_ads if proxy: scrape_params['proxy'] = proxy - if extract: - extract_dict = extract.dict(exclude_none=True) - if 'schema' in extract_dict and hasattr(extract.schema, 'schema'): - extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted - scrape_params['extract'] = extract_dict - if json_options: - json_options_dict = json_options.dict(exclude_none=True) - if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'): - json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted - scrape_params['jsonOptions'] = json_options_dict + if extract is not None: + extract = self._ensure_schema_dict(extract) + if isinstance(extract, dict) and "schema" in extract: + extract["schema"] = self._ensure_schema_dict(extract["schema"]) + scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True) + if json_options is not None: + json_options = self._ensure_schema_dict(json_options) + if isinstance(json_options, dict) and "schema" in json_options: + json_options["schema"] = self._ensure_schema_dict(json_options["schema"]) + scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True) if actions: scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']: + scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema']) + if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']: + scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema']) + # Make async request endpoint = f'/v1/scrape' response = await self._async_post_request( @@ -2996,13 +3041,15 @@ class AsyncFirecrawlApp(FirecrawlApp): if proxy is not None: scrape_params['proxy'] = proxy if extract is not None: - if hasattr(extract.schema, 'schema'): - extract.schema = extract.schema.schema() - scrape_params['extract'] = extract.dict(exclude_none=True) + extract = self._ensure_schema_dict(extract) + if isinstance(extract, dict) and "schema" in extract: + extract["schema"] = self._ensure_schema_dict(extract["schema"]) + scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True) if json_options is not None: - if hasattr(json_options.schema, 'schema'): - json_options.schema = json_options.schema.schema() - scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + json_options = self._ensure_schema_dict(json_options) + if isinstance(json_options, dict) and "schema" in json_options: + json_options["schema"] = self._ensure_schema_dict(json_options["schema"]) + scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True) if actions is not None: scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] if agent is not None: @@ -3017,6 +3064,11 @@ class AsyncFirecrawlApp(FirecrawlApp): params_dict['urls'] = urls params_dict['origin'] = f"python-sdk@{version}" + if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']: + params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema']) + if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']: + params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema']) + # Make request headers = self._prepare_headers(idempotency_key) response = await self._async_post_request( @@ -3127,13 +3179,15 @@ class AsyncFirecrawlApp(FirecrawlApp): if proxy is not None: scrape_params['proxy'] = proxy if extract is not None: - if hasattr(extract.schema, 'schema'): - extract.schema = extract.schema.schema() - scrape_params['extract'] = extract.dict(exclude_none=True) + extract = self._ensure_schema_dict(extract) + if isinstance(extract, dict) and "schema" in extract: + extract["schema"] = self._ensure_schema_dict(extract["schema"]) + scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True) if json_options is not None: - if hasattr(json_options.schema, 'schema'): - json_options.schema = json_options.schema.schema() - scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + json_options = self._ensure_schema_dict(json_options) + if isinstance(json_options, dict) and "schema" in json_options: + json_options["schema"] = self._ensure_schema_dict(json_options["schema"]) + scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True) if actions is not None: scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] if agent is not None: @@ -3148,6 +3202,11 @@ class AsyncFirecrawlApp(FirecrawlApp): params_dict['urls'] = urls params_dict['origin'] = f"python-sdk@{version}" + if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']: + params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema']) + if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']: + params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema']) + # Make request headers = self._prepare_headers(idempotency_key) response = await self._async_post_request( @@ -3605,10 +3664,7 @@ class AsyncFirecrawlApp(FirecrawlApp): raise ValueError("Either urls or prompt is required") if schema: - if hasattr(schema, 'model_json_schema'): - # Convert Pydantic model to JSON schema - schema = schema.model_json_schema() - # Otherwise assume it's already a JSON schema dict + schema = self._ensure_schema_dict(schema) request_data = { 'urls': urls or [], @@ -3862,8 +3918,7 @@ class AsyncFirecrawlApp(FirecrawlApp): raise ValueError("Either urls or prompt is required") if schema: - if hasattr(schema, 'model_json_schema'): - schema = schema.model_json_schema() + schema = self._ensure_schema_dict(schema) request_data = ExtractResponse( urls=urls or [],