From 45e33563ebe87df0065ed2ce500009c0d4a247f0 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Fri, 30 Aug 2024 12:58:38 -0300
Subject: [PATCH] Nick: python SDK working with v1 extract (v0 extractorOptions kept)

---
 apps/js-sdk/firecrawl/src/index.ts     | 31 ++++++++++++++-------
 apps/python-sdk/example.py             | 25 +++++++++--------
 apps/python-sdk/firecrawl/firecrawl.py | 38 ++++++++++++++++----------
 3 files changed, 58 insertions(+), 36 deletions(-)
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 2527be96..a5b3af2f 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -75,6 +75,7 @@ export interface FirecrawlDocument {
   html?: string;
   rawHtml?: string;
   links?: string[];
+  extract?: Record<any, any>;
   screenshot?: string;
   metadata: FirecrawlDocumentMetadata;
 }
@@ -344,20 +345,30 @@ export default class FirecrawlApp<T extends "v0" | "v1"> {
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
     let jsonData: any = { url, ...params };
-    if (jsonData?.extractorOptions?.extractionSchema) {
-      let schema = jsonData.extractorOptions.extractionSchema;
+    if (jsonData?.extractorOptions?.extractionSchema || jsonData?.extract?.schema) {
+      let schema = jsonData.extractorOptions?.extractionSchema || jsonData.extract?.schema;
       // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
       if (schema instanceof z.ZodSchema) {
         schema = zodToJsonSchema(schema);
       }
-      jsonData = {
-        ...jsonData,
-        extractorOptions: {
-          ...jsonData.extractorOptions,
-          extractionSchema: schema,
-          mode: jsonData.extractorOptions.mode || "llm-extraction",
-        },
-      };
+      if(this.version === 'v0') {
+        jsonData = {
+          ...jsonData,
+          extractorOptions: {
+            ...jsonData.extractorOptions,
+            extractionSchema: schema,
+            mode: jsonData.extractorOptions.mode || "llm-extraction",
+          },
+        };
+      } else {
+        jsonData = {
+          ...jsonData,
+          extract: {
+            ...jsonData.extract,
+            schema: schema,
+          },
+        };
+      }
     }
     try {
       const response: AxiosResponse = await axios.post(
diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py
index d80fa795..9d439a3b 100644
--- a/apps/python-sdk/example.py
+++ b/apps/python-sdk/example.py
@@ -1,17 +1,18 @@
 import uuid
 from firecrawl.firecrawl import FirecrawlApp
 
-app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+app = FirecrawlApp(api_key="fc-")
 
 # Scrape a website:
 scrape_result = app.scrape_url('firecrawl.dev')
 print(scrape_result['markdown'])
 
 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
+crawl_result = app.crawl_url('docs.firecrawl.dev', {}, True, 2)
 print(crawl_result)
 
+
+
 # LLM Extraction:
 # Define schema to extract contents into using pydantic
 from pydantic import BaseModel, Field
@@ -27,18 +28,15 @@ class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
 
 llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': TopArticlesSchema.model_json_schema(),
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
+    'formats': ['extract'],
+    'extract': {
+        'schema': TopArticlesSchema.model_json_schema()
     }
 })
 
-print(llm_extraction_result['llm_extraction'])
+print(llm_extraction_result['extract'])
 
-# Define schema to extract contents into using json schema
+# Define schema to extract contents into using JSON Schema
 json_schema = {
   "type": "object",
   "properties": {
@@ -62,7 +60,10 @@ json_schema = {
   "required": ["top"]
 }
 
-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+app2 = FirecrawlApp(api_key="fc-", version="v0")
+
+
+llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
     'extractorOptions': {
         'extractionSchema': json_schema,
         'mode': 'llm-extraction'
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 89c51803..4e9a7dab 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -58,21 +58,31 @@ class FirecrawlApp:
 
         # If there are additional params, process them
         if params:
-            # Initialize extractorOptions if present
-            extractor_options = params.get('extractorOptions', {})
-            # Check and convert the extractionSchema if it's a Pydantic model
-            if 'extractionSchema' in extractor_options:
-                if hasattr(extractor_options['extractionSchema'], 'schema'):
-                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
-                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
-                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
-                # Update the scrape_params with the processed extractorOptions
-                scrape_params['extractorOptions'] = extractor_options
+            if self.version == 'v0':
+                # Handle extractorOptions (v0): convert a Pydantic extractionSchema to its JSON schema and default 'mode' to 'llm-extraction'
+                extractor_options = params.get('extractorOptions', {})
+                if extractor_options:
+                    if 'extractionSchema' in extractor_options and hasattr(extractor_options['extractionSchema'], 'schema'):
+                        extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
+                    extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
+                    scrape_params['extractorOptions'] = extractor_options
 
-            # Include any other params directly at the top level of scrape_params
-            for key, value in params.items():
-                if key != 'extractorOptions':
-                    scrape_params[key] = value
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key not in ['extractorOptions']:
+                        scrape_params[key] = value
+            elif self.version == 'v1':
+                # Handle extract (v1): convert a Pydantic model passed as 'schema' to its JSON schema
+                extract = params.get('extract', {})
+                if extract:
+                    if 'schema' in extract and hasattr(extract['schema'], 'schema'):
+                        extract['schema'] = extract['schema'].schema()
+                    scrape_params['extract'] = extract
+
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key not in ['extract']:
+                        scrape_params[key] = value
 
         endpoint = f'/{self.version}/scrape'
         # Make the POST request with the prepared headers and JSON data