Fix sdk/schemas (#1507)

* sdk-fix/schema-check

* version bump

* schema validation for extract and jsonOptions parameters

* Update firecrawl.py

---------

Co-authored-by: Nicolas <nicolascamara29@gmail.com>
Rafael Miller authored 2025-04-29 12:19:08 -03:00; committed by GitHub
parent a0a1675829
commit 317fa43f9e
2 changed files with 120 additions and 65 deletions
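In practical terms, the fix means a JSON schema may be supplied either as a plain dict or as a Pydantic model class; the SDK now normalizes both forms before building the request. A minimal sketch of the intended usage, assuming the JsonConfig import shown in the first hunk below (the Product model and API key are illustrative):

from pydantic import BaseModel
from firecrawl import FirecrawlApp, JsonConfig

class Product(BaseModel):
    name: str
    price: float

app = FirecrawlApp(api_key="fc-YOUR-KEY")  # illustrative key

# Both forms should now yield the same request payload:
config_from_model = JsonConfig(schema=Product)                      # model class
config_from_dict = JsonConfig(schema=Product.model_json_schema())   # plain dict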

firecrawl/__init__.py

@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
__version__ = "2.5.0"
__version__ = "2.5.1"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

firecrawl/firecrawl.py

@@ -29,7 +29,7 @@ warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDoc
warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
def get_version():
try:
@@ -529,14 +529,16 @@ class FirecrawlApp:
scrape_params['blockAds'] = block_ads
if proxy:
scrape_params['proxy'] = proxy
if extract:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
if json_options:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
if extract is not None:
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if change_tracking_options:
@@ -544,6 +546,11 @@ class FirecrawlApp:
scrape_params.update(kwargs)
if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
# Make request
response = requests.post(
f'{self.api_url}/v1/scrape',
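Note the trailing normalization pass over scrape_params: even when json_options arrives as a raw dict whose "schema" value is still a Pydantic model class, the schema is converted before the POST. A hedged usage sketch (the scrape_url parameters follow this hunk; the URL and model are illustrative):

from pydantic import BaseModel
from firecrawl import FirecrawlApp

class Article(BaseModel):
    title: str
    author: str

app = FirecrawlApp(api_key="fc-YOUR-KEY")
result = app.scrape_url(
    "https://example.com",
    formats=["json"],
    json_options={"schema": Article},  # model class nested in a plain dict
)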
@@ -1252,13 +1259,15 @@ class FirecrawlApp:
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -1273,6 +1282,11 @@ class FirecrawlApp:
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1378,13 +1392,15 @@ class FirecrawlApp:
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -1399,6 +1415,11 @@ class FirecrawlApp:
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1499,13 +1520,15 @@ class FirecrawlApp:
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -1520,6 +1543,11 @@ class FirecrawlApp:
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
@@ -1606,7 +1634,7 @@ class FirecrawlApp:
id (str): The ID of the crawl job.
Returns:
CrawlErrorsResponse: A response containing:
CrawlErrorsResponse containing:
* errors (List[Dict[str, str]]): List of errors with fields:
* id (str): Error ID
* timestamp (str): When the error occurred
@@ -1669,10 +1697,7 @@ class FirecrawlApp:
raise ValueError("Either urls or prompt is required")
if schema:
if hasattr(schema, 'model_json_schema'):
# Convert Pydantic model to JSON schema
schema = schema.model_json_schema()
# Otherwise assume it's already a JSON schema dict
schema = self._ensure_schema_dict(schema)
request_data = {
'urls': urls or [],
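The extract() path gets the same treatment, so a model class can be handed directly to the schema argument. A small sketch, assuming the urls/prompt/schema parameters visible in this hunk (model, prompt, and URL are illustrative):

from pydantic import BaseModel
from firecrawl import FirecrawlApp

class CompanyInfo(BaseModel):
    mission: str

app = FirecrawlApp(api_key="fc-YOUR-KEY")
res = app.extract(
    urls=["https://example.com"],
    prompt="Extract the company mission",
    schema=CompanyInfo,  # normalized via _ensure_schema_dict before the request
)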
@@ -1801,10 +1826,7 @@ class FirecrawlApp:
schema = schema
if schema:
if hasattr(schema, 'model_json_schema'):
# Convert Pydantic model to JSON schema
schema = schema.model_json_schema()
# Otherwise assume it's already a JSON schema dict
schema = self._ensure_schema_dict(schema)
request_data = {
'urls': urls,
@@ -2467,6 +2489,24 @@ class FirecrawlApp:
# Additional type validation can be added here if needed
# For now, we rely on Pydantic models for detailed type validation
def _ensure_schema_dict(self, schema):
"""
Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
"""
if schema is None:
return schema
if isinstance(schema, type):
# Pydantic v1/v2 model class
if hasattr(schema, 'model_json_schema'):
return schema.model_json_schema()
elif hasattr(schema, 'schema'):
return schema.schema()
if isinstance(schema, dict):
return {k: self._ensure_schema_dict(v) for k, v in schema.items()}
if isinstance(schema, (list, tuple)):
return [self._ensure_schema_dict(v) for v in schema]
return schema
class CrawlWatcher:
"""
A class to watch and handle crawl job events via WebSocket connection.
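For reference, the new helper can be exercised on its own; the sketch below mirrors the method body above as a free function, with an illustrative Product model:

from pydantic import BaseModel

class Product(BaseModel):
    name: str

def ensure_schema_dict(schema):
    # Mirrors FirecrawlApp._ensure_schema_dict: convert Pydantic model
    # classes (v1 or v2) to JSON schema dicts, recursing into dicts and lists.
    if schema is None:
        return schema
    if isinstance(schema, type):
        if hasattr(schema, 'model_json_schema'):  # Pydantic v2
            return schema.model_json_schema()
        elif hasattr(schema, 'schema'):           # Pydantic v1
            return schema.schema()
    if isinstance(schema, dict):
        return {k: ensure_schema_dict(v) for k, v in schema.items()}
    if isinstance(schema, (list, tuple)):
        return [ensure_schema_dict(v) for v in schema]
    return schema

assert ensure_schema_dict(Product) == Product.model_json_schema()
assert ensure_schema_dict({"schema": Product})["schema"]["title"] == "Product"
assert ensure_schema_dict({"plain": "dict"}) == {"plain": "dict"}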
@@ -2873,19 +2913,24 @@ class AsyncFirecrawlApp(FirecrawlApp):
scrape_params['blockAds'] = block_ads
if proxy:
scrape_params['proxy'] = proxy
if extract:
extract_dict = extract.dict(exclude_none=True)
if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted
scrape_params['extract'] = extract_dict
if json_options:
json_options_dict = json_options.dict(exclude_none=True)
if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted
scrape_params['jsonOptions'] = json_options_dict
if extract is not None:
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
scrape_params['extract']['schema'] = self._ensure_schema_dict(scrape_params['extract']['schema'])
if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
scrape_params['jsonOptions']['schema'] = self._ensure_schema_dict(scrape_params['jsonOptions']['schema'])
# Make async request
endpoint = f'/v1/scrape'
response = await self._async_post_request(
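The async client applies the identical normalization, so usage mirrors the sync path. A hedged sketch (AsyncFirecrawlApp is imported from its defining module here, since a top-level re-export is not shown in this diff; model and URL are illustrative):

import asyncio
from pydantic import BaseModel
from firecrawl.firecrawl import AsyncFirecrawlApp

class Page(BaseModel):
    headline: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
    result = await app.scrape_url(
        "https://example.com",
        formats=["json"],
        json_options={"schema": Page},  # model class, normalized before POST
    )

asyncio.run(main())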
@@ -2996,13 +3041,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -3017,6 +3064,11 @@ class AsyncFirecrawlApp(FirecrawlApp):
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = await self._async_post_request(
@@ -3127,13 +3179,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
if proxy is not None:
scrape_params['proxy'] = proxy
if extract is not None:
if hasattr(extract.schema, 'schema'):
extract.schema = extract.schema.schema()
scrape_params['extract'] = extract.dict(exclude_none=True)
extract = self._ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = self._ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.dict(exclude_none=True)
if json_options is not None:
if hasattr(json_options.schema, 'schema'):
json_options.schema = json_options.schema.schema()
scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
json_options = self._ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = self._ensure_schema_dict(json_options["schema"])
scrape_params['jsonOptions'] = json_options if isinstance(json_options, dict) else json_options.dict(exclude_none=True)
if actions is not None:
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
if agent is not None:
@@ -3148,6 +3202,11 @@ class AsyncFirecrawlApp(FirecrawlApp):
params_dict['urls'] = urls
params_dict['origin'] = f"python-sdk@{version}"
if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
params_dict['extract']['schema'] = self._ensure_schema_dict(params_dict['extract']['schema'])
if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
params_dict['jsonOptions']['schema'] = self._ensure_schema_dict(params_dict['jsonOptions']['schema'])
# Make request
headers = self._prepare_headers(idempotency_key)
response = await self._async_post_request(
@@ -3605,10 +3664,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
raise ValueError("Either urls or prompt is required")
if schema:
if hasattr(schema, 'model_json_schema'):
# Convert Pydantic model to JSON schema
schema = schema.model_json_schema()
# Otherwise assume it's already a JSON schema dict
schema = self._ensure_schema_dict(schema)
request_data = {
'urls': urls or [],
@@ -3862,8 +3918,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
raise ValueError("Either urls or prompt is required")
if schema:
if hasattr(schema, 'model_json_schema'):
schema = schema.model_json_schema()
schema = self._ensure_schema_dict(schema)
request_data = ExtractResponse(
urls=urls or [],