From 80d6cb16fb096d242a4a6c8221d19d0501e65631 Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 14 Nov 2024 15:51:27 -0300
Subject: [PATCH] sdks wip

---
 apps/js-sdk/example.js                 | 13 ++++++
 apps/js-sdk/example.ts                 | 13 ++++++
 apps/js-sdk/firecrawl/package.json     |  2 +-
 apps/js-sdk/firecrawl/src/index.ts     | 59 +++++++++++++++++++++++-
 apps/python-sdk/example.py             | 21 +++++++--
 apps/python-sdk/firecrawl/__init__.py  |  2 +-
 apps/python-sdk/firecrawl/firecrawl.py | 66 ++++++++++++++++++++++++++-
 7 files changed, 169 insertions(+), 7 deletions(-)

diff --git a/apps/js-sdk/example.js b/apps/js-sdk/example.js
index c4b21d5f..166cc18d 100644
--- a/apps/js-sdk/example.js
+++ b/apps/js-sdk/example.js
@@ -1,4 +1,5 @@
 import FirecrawlApp from 'firecrawl';
+import { z } from 'zod';
 
 const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 
@@ -42,6 +43,18 @@ const main = async () => {
   const mapResult = await app.mapUrl('https://firecrawl.dev');
   console.log(mapResult)
 
+  // Extract information from a website using an LLM:
+  const extractSchema = z.object({
+    title: z.string(),
+    description: z.string(),
+    links: z.array(z.string())
+  });
+
+  const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
+    prompt: "Extract the title, description, and links from the website",
+    schema: extractSchema
+  });
+  console.log(extractResult);
 
   // Crawl a website with WebSockets:
   const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
diff --git a/apps/js-sdk/example.ts b/apps/js-sdk/example.ts
index 7412e479..a8fff30a 100644
--- a/apps/js-sdk/example.ts
+++ b/apps/js-sdk/example.ts
@@ -42,6 +42,19 @@ const main = async () => {
   const mapResult = await app.mapUrl('https://firecrawl.dev');
   console.log(mapResult)
 
+  // // Extract information from a website using an LLM:
+  // const extractSchema = z.object({
+  //   title: z.string(),
+  //   description: z.string(),
+  //   links: z.array(z.string())
+  // });
+
+  // const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
+  //   prompt: "Extract the title, description, and links from the website",
+  //   schema: extractSchema
+  // });
+  // console.log(extractResult);
+
   // Crawl a website with WebSockets:
   const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
 
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index 5d0a7fc9..8f3682c2 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.8.2",
+  "version": "1.9.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 45e19197..2b3ca2b3 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -234,6 +234,26 @@ export interface MapResponse {
   error?: string;
 }
 
+/**
+ * Parameters for extracting information from URLs.
+ * Defines the prompt, optional extraction schema, and optional system prompt.
+ */
+export interface ExtractParams {
+  prompt: string;
+  schema?: zt.ZodSchema;
+  systemPrompt?: string;
+}
+
+/**
+ * Response interface for extracting information from URLs.
+ * Defines the structure of the response received after extracting information from URLs.
+ */
+export interface ExtractResponse<T extends zt.ZodSchema = any> {
+  success: true;
+  data: zt.infer<T>;
+  error?: string;
+}
+
 /**
  * Error response interface.
  * Defines the structure of the response received when an error occurs.
@@ -243,7 +263,6 @@ export interface ErrorResponse {
   error: string;
 }
 
-
 /**
  * Custom error class for Firecrawl.
  * Extends the built-in Error class to include a status code.
@@ -675,6 +694,44 @@ export default class FirecrawlApp {
     return { success: false, error: "Internal server error." };
   }
 
+  /**
+   * Extracts information from URLs using the Firecrawl API.
+   * @param urls - The URLs to extract information from.
+   * @param params - Additional parameters for the extract request.
+   * @returns The response from the extract operation.
+   */
+  async extractUrls(urls: string[], params?: ExtractParams): Promise<ExtractResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+
+    if (!params?.prompt) {
+      throw new FirecrawlError("Prompt is required", 400);
+    }
+
+    let jsonData: { urls: string[] } & ExtractParams = { urls, ...params };
+    let jsonSchema: any;
+    try {
+      jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
+    } catch (error: any) {
+      throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/extract`,
+        { ...jsonData, schema: jsonSchema },
+        headers
+      );
+      if (response.status === 200) {
+        return response.data as ExtractResponse;
+      } else {
+        this.handleError(response, "extract");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
   /**
    * Prepares the headers for an API request.
    * @param idempotencyKey - Optional key to ensure idempotency.
diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py
index e7c80b30..eba7cfd2 100644
--- a/apps/python-sdk/example.py
+++ b/apps/python-sdk/example.py
@@ -2,6 +2,8 @@ import time
 import nest_asyncio
 import uuid
 from firecrawl.firecrawl import FirecrawlApp
+from pydantic import BaseModel, Field
+from typing import List
 
 app = FirecrawlApp(api_key="fc-")
 
@@ -50,9 +52,6 @@ print(crawl_status)
 
 # LLM Extraction:
 # Define schema to extract contents into using pydantic
-from pydantic import BaseModel, Field
-from typing import List
-
 class ArticleSchema(BaseModel):
     title: str
     points: int
@@ -115,6 +114,22 @@ llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
 map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
 print(map_result)
 
+# Extract structured information from URLs:
+class ExtractSchema(BaseModel):
+    title: str
+    description: str
+    links: List[str]
+
+# Convert the Pydantic model to a JSON schema
+extract_schema = ExtractSchema.schema()
+
+# Perform the extraction
+extract_result = app.extract_urls(['https://firecrawl.dev'], {
+    'prompt': "Extract the title, description, and links from the website",
+    'schema': extract_schema
+})
+print(extract_result)
+
 # Crawl a website with WebSockets:
 # inside an async function...
 nest_asyncio.apply()
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index cb897b7e..d39b77a8 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.5.0"
+__version__ = "1.6.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index c2693c3d..bae2797d 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -12,15 +12,40 @@ Classes:
 import logging
 import os
 import time
-from typing import Any, Dict, Optional, List
+from typing import Any, Dict, Optional, List, Union
 import json
 
 import requests
+import pydantic
 import websockets
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
 class FirecrawlApp:
+    class ExtractParams(pydantic.BaseModel):
+        """
+        Parameters for the extract operation.
+        """
+        prompt: str
+        # 'schema' would shadow pydantic.BaseModel.schema(), so expose it via an alias
+        schema_: Optional[Any] = pydantic.Field(None, alias='schema')
+        system_prompt: Optional[str] = None
+
+    class ExtractResponse(pydantic.BaseModel):
+        """
+        Response from the extract operation.
+        """
+        success: bool
+        data: Optional[Any] = None
+        error: Optional[str] = None
+
+    class ErrorResponse(pydantic.BaseModel):
+        """
+        Error response.
+        """
+        success: bool
+        error: str
+
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
         Initialize the FirecrawlApp instance with API key, API URL.
@@ -434,6 +459,45 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'check batch scrape status')
 
+
+    def extract_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> Union[ExtractResponse, ErrorResponse]:
+        """
+        Extracts information from one or more URLs using the Firecrawl API.
+
+        Args:
+            urls (List[str]): The URLs to extract information from.
+            params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
+
+        Returns:
+            Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
+        """
+        headers = self._prepare_headers()
+
+        if not params or not params.get('prompt'):
+            raise ValueError("Prompt is required")
+
+        if not params.get('schema'):
+            raise ValueError("Schema is required for extraction")
+
+        json_data = {'urls': urls, **params}
+        # Accept either a Pydantic model (converted here) or a ready-made JSON schema dict
+        json_schema = params['schema'].schema() if hasattr(params['schema'], 'schema') else params['schema']
+
+        try:
+            response = self._post_request(
+                f'{self.api_url}/v1/extract',
+                {**json_data, 'schema': json_schema},
+                headers
+            )
+            if response.status_code == 200:
+                return response.json()
+            else:
+                self._handle_error(response, "extract")
+        except Exception as e:
+            raise ValueError(str(e), 500)
+
+        return {'success': False, 'error': "Internal server error."}
+
     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.
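
For reviewers: both SDKs assemble the same POST body for /v1/extract. Below is a
minimal sketch of that payload, assuming the ExtractSchema model from example.py
above; server-side handling of these fields is outside this diff.

    # Sketch: build the /v1/extract payload the way extract_urls does.
    from typing import List
    from pydantic import BaseModel

    class ExtractSchema(BaseModel):
        title: str
        description: str
        links: List[str]

    payload = {
        'urls': ['https://firecrawl.dev'],
        'prompt': "Extract the title, description, and links from the website",
        # A Pydantic model is converted to a plain JSON schema dict before sending
        'schema': ExtractSchema.schema(),
    }
    print(sorted(payload['schema']['properties']))  # ['description', 'links', 'title']

Passing a ready-made JSON schema dict works as well, since extract_urls only
calls .schema() on values that have that attribute.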