commit 45e33563eb
parent bb4808443c

    Nick: python working
@@ -75,6 +75,7 @@ export interface FirecrawlDocument {
   html?: string;
   rawHtml?: string;
   links?: string[];
+  extract?: Record<any, any>;
   screenshot?: string;
   metadata: FirecrawlDocumentMetadata;
 }
@@ -344,20 +345,30 @@ export default class FirecrawlApp<T extends "v0" | "v1"> {
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
     let jsonData: any = { url, ...params };
-    if (jsonData?.extractorOptions?.extractionSchema) {
-      let schema = jsonData.extractorOptions.extractionSchema;
+    if (jsonData?.extractorOptions?.extractionSchema || jsonData?.extract?.schema) {
+      let schema = jsonData.extractorOptions?.extractionSchema || jsonData.extract?.schema;
       // Check if schema is an instance of ZodSchema to correctly identify Zod schemas
       if (schema instanceof z.ZodSchema) {
         schema = zodToJsonSchema(schema);
       }
-      jsonData = {
-        ...jsonData,
-        extractorOptions: {
-          ...jsonData.extractorOptions,
-          extractionSchema: schema,
-          mode: jsonData.extractorOptions.mode || "llm-extraction",
-        },
-      };
+      if (this.version === 'v0') {
+        jsonData = {
+          ...jsonData,
+          extractorOptions: {
+            ...jsonData.extractorOptions,
+            extractionSchema: schema,
+            mode: jsonData.extractorOptions.mode || "llm-extraction",
+          },
+        };
+      } else {
+        jsonData = {
+          ...jsonData,
+          extract: {
+            ...jsonData.extract,
+            schema: schema,
+          },
+        };
+      }
     }
     try {
       const response: AxiosResponse = await axios.post(
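Note on the hunk above: scrapeUrl now accepts the schema from either the v0 field (extractorOptions.extractionSchema) or the v1 field (extract.schema), normalizes Zod schemas to plain JSON Schema via zodToJsonSchema, and rebuilds the request body for whichever API version the client targets. A minimal sketch of the two payload shapes this produces (Python, illustrative only; the URL and schema are placeholders, not values from this commit):

# A plain JSON Schema dict; in the JS SDK a Zod schema would first be
# converted to this form with zodToJsonSchema().
schema = {
    "type": "object",
    "properties": {"title": {"type": "string"}},
    "required": ["title"],
}

# v0: the schema travels under extractorOptions, alongside an extraction mode.
v0_payload = {
    "url": "https://example.com",  # placeholder URL
    "extractorOptions": {"extractionSchema": schema, "mode": "llm-extraction"},
}

# v1: the schema travels under extract.schema instead.
v1_payload = {
    "url": "https://example.com",  # placeholder URL
    "extract": {"schema": schema},
}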
@@ -1,17 +1,18 @@
 import uuid
 from firecrawl.firecrawl import FirecrawlApp

-app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
+app = FirecrawlApp(api_key="fc-")

 # Scrape a website:
 scrape_result = app.scrape_url('firecrawl.dev')
 print(scrape_result['markdown'])

 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
+crawl_result = app.crawl_url('docs.firecrawl.dev', {}, True, 2)
 print(crawl_result)

+
+
 # LLM Extraction:
 # Define schema to extract contents into using pydantic
 from pydantic import BaseModel, Field
@@ -27,18 +28,15 @@ class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

 llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': TopArticlesSchema.model_json_schema(),
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
+    'formats': ['extract'],
+    'extract': {
+        'schema': TopArticlesSchema.model_json_schema()
     }
 })

-print(llm_extraction_result['llm_extraction'])
+print(llm_extraction_result['extract'])

-# Define schema to extract contents into using json schema
+# # Define schema to extract contents into using json schema
 json_schema = {
     "type": "object",
     "properties": {
@@ -62,7 +60,10 @@ json_schema = {
     "required": ["top"]
 }

-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
+app2 = FirecrawlApp(api_key="fc-", version="v0")
+
+
+llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
     'extractorOptions': {
         'extractionSchema': json_schema,
         'mode': 'llm-extraction'
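As the example now shows, the same extraction runs against both API versions: the default client requests 'formats': ['extract'] and reads the result from the 'extract' key, while the v0 client (app2, constructed with version="v0") still sends extractorOptions and reads 'llm_extraction'. A condensed side-by-side sketch, assuming the json_schema and truncated "fc-" placeholder keys defined above:

# v1 (default client): extraction is requested as a format.
app = FirecrawlApp(api_key="fc-")
v1_result = app.scrape_url('https://news.ycombinator.com', {
    'formats': ['extract'],
    'extract': {'schema': json_schema},
})
print(v1_result['extract'])

# v0 client: extraction still goes through extractorOptions.
app2 = FirecrawlApp(api_key="fc-", version="v0")
v0_result = app2.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {'extractionSchema': json_schema, 'mode': 'llm-extraction'},
})
print(v0_result['llm_extraction'])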
@@ -58,21 +58,31 @@ class FirecrawlApp:

         # If there are additional params, process them
         if params:
-            # Initialize extractorOptions if present
-            extractor_options = params.get('extractorOptions', {})
-            # Check and convert the extractionSchema if it's a Pydantic model
-            if 'extractionSchema' in extractor_options:
-                if hasattr(extractor_options['extractionSchema'], 'schema'):
-                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
-                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
-                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
-                # Update the scrape_params with the processed extractorOptions
-                scrape_params['extractorOptions'] = extractor_options
+            if self.version == 'v0':
+                # Handle extractorOptions (for v0 compatibility)
+                extractor_options = params.get('extractorOptions', {})
+                if extractor_options:
+                    if 'extractionSchema' in extractor_options and hasattr(extractor_options['extractionSchema'], 'schema'):
+                        extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
+                    extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
+                    scrape_params['extractorOptions'] = extractor_options

-            # Include any other params directly at the top level of scrape_params
-            for key, value in params.items():
-                if key != 'extractorOptions':
-                    scrape_params[key] = value
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key not in ['extractorOptions']:
+                        scrape_params[key] = value
+            elif self.version == 'v1':
+                # Handle extract (for v1)
+                extract = params.get('extract', {})
+                if extract:
+                    if 'schema' in extract and hasattr(extract['schema'], 'schema'):
+                        extract['schema'] = extract['schema'].schema()
+                    scrape_params['extract'] = extract
+
+                # Include any other params directly at the top level of scrape_params
+                for key, value in params.items():
+                    if key not in ['extract']:
+                        scrape_params[key] = value

         endpoint = f'/{self.version}/scrape'
         # Make the POST request with the prepared headers and JSON data
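Since scrape_params is now assembled per version, callers can hand scrape_url a Pydantic model directly: the hasattr(..., 'schema') checks above detect the model's .schema() classmethod and serialize it to a JSON Schema dict before the request is sent. A minimal usage sketch under that assumption (the API key is a placeholder, and the v1 default is inferred from the example above):

from pydantic import BaseModel, Field
from firecrawl.firecrawl import FirecrawlApp

class ArticleSchema(BaseModel):
    title: str = Field(..., description="Article title")

app = FirecrawlApp(api_key="fc-")  # defaults to the v1 API

# The v1 branch sees hasattr(ArticleSchema, 'schema') and calls
# ArticleSchema.schema() to produce a plain JSON Schema dict.
result = app.scrape_url('https://news.ycombinator.com', {
    'formats': ['extract'],
    'extract': {'schema': ArticleSchema},
})
print(result['extract'])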