From 80d6cb16fb096d242a4a6c8221d19d0501e65631 Mon Sep 17 00:00:00 2001
From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com>
Date: Thu, 14 Nov 2024 15:51:27 -0300
Subject: [PATCH] sdks wip

---
 apps/js-sdk/example.js                 | 13 ++++++
 apps/js-sdk/example.ts                 | 13 ++++++
 apps/js-sdk/firecrawl/package.json     |  2 +-
 apps/js-sdk/firecrawl/src/index.ts     | 59 +++++++++++++++++++++++-
 apps/python-sdk/example.py             | 21 +++++++--
 apps/python-sdk/firecrawl/__init__.py  |  2 +-
 apps/python-sdk/firecrawl/firecrawl.py | 66 ++++++++++++++++++++++++++-
 7 files changed, 169 insertions(+), 7 deletions(-)

diff --git a/apps/js-sdk/example.js b/apps/js-sdk/example.js
index c4b21d5f..166cc18d 100644
--- a/apps/js-sdk/example.js
+++ b/apps/js-sdk/example.js
@@ -1,4 +1,5 @@
 import FirecrawlApp from 'firecrawl';
+import { z } from 'zod';
 
 const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
 
@@ -42,6 +43,18 @@ const main = async () => {
   const mapResult = await app.mapUrl('https://firecrawl.dev');
   console.log(mapResult)
 
+  // Extract information from a website using an LLM:
+  const extractSchema = z.object({
+    title: z.string(),
+    description: z.string(),
+    links: z.array(z.string())
+  });
+
+  const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
+    prompt: "Extract the title, description, and links from the website",
+    schema: extractSchema
+  });
+  console.log(extractResult);
 
   // Crawl a website with WebSockets:
   const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
diff --git a/apps/js-sdk/example.ts b/apps/js-sdk/example.ts
index 7412e479..a8fff30a 100644
--- a/apps/js-sdk/example.ts
+++ b/apps/js-sdk/example.ts
@@ -42,6 +42,19 @@ const main = async () => {
   const mapResult = await app.mapUrl('https://firecrawl.dev');
   console.log(mapResult)
 
+  // // Extract information from a website using an LLM:
+  // const extractSchema = z.object({
+  //   title: z.string(),
+  //   description: z.string(),
+  //   links: z.array(z.string())
+  // });
+
+  // const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
+  //   prompt: "Extract the title, description, and links from the website",
+  //   schema: extractSchema
+  // });
+  // console.log(extractResult);
+
   // Crawl a website with WebSockets:
   const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
 
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index 5d0a7fc9..8f3682c2 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.8.2",
+  "version": "1.9.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 45e19197..2b3ca2b3 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -234,6 +234,26 @@ export interface MapResponse {
   error?: string;
 }
 
+/**
+ * Parameters for extracting information from URLs.
+ * Defines the prompt, optional extraction schema, and optional system prompt.
+ */
+export interface ExtractParams {
+  prompt: string;
+  schema?: zt.ZodSchema;
+  systemPrompt?: string;
+}
+
+/**
+ * Response interface for extracting information from URLs.
+ * Defines the structure of the response received after extracting information from URLs.
+ */
+export interface ExtractResponse<T extends zt.ZodSchema = any> {
+  success: true;
+  data: zt.infer<T>;
+  error?: string;
+}
+
 /**
  * Error response interface.
  * Defines the structure of the response received when an error occurs.
@@ -243,7 +263,6 @@ export interface ErrorResponse {
   error: string;
 }
 
-
 /**
  * Custom error class for Firecrawl.
  * Extends the built-in Error class to include a status code.
@@ -675,6 +694,44 @@ export default class FirecrawlApp {
     return { success: false, error: "Internal server error." };
   }
 
+  /**
+   * Extracts information from URLs using the Firecrawl API.
+   * @param urls - The URLs to extract information from.
+   * @param params - Additional parameters for the extract request.
+   * @returns The response from the extract operation.
+   */
+  async extractUrls(urls: string[], params?: ExtractParams): Promise<ExtractResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+
+    if (!params?.prompt) {
+      throw new FirecrawlError("Prompt is required", 400);
+    }
+
+    let jsonData: { urls: string[] } & ExtractParams = { urls, ...params };
+    let jsonSchema: any;
+    try {
+      jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
+    } catch (error: any) {
+      throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/extract`,
+        { ...jsonData, schema: jsonSchema },
+        headers
+      );
+      if (response.status === 200) {
+        return response.data as ExtractResponse;
+      } else {
+        this.handleError(response, "extract");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
   /**
    * Prepares the headers for an API request.
    * @param idempotencyKey - Optional key to ensure idempotency.
diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py
index e7c80b30..eba7cfd2 100644
--- a/apps/python-sdk/example.py
+++ b/apps/python-sdk/example.py
@@ -2,6 +2,8 @@ import time
 import nest_asyncio
 import uuid
 from firecrawl.firecrawl import FirecrawlApp
+from pydantic import BaseModel, Field
+from typing import List
 
 app = FirecrawlApp(api_key="fc-")
 
@@ -50,9 +52,6 @@ print(crawl_status)
 
 # LLM Extraction:
 # Define schema to extract contents into using pydantic
-from pydantic import BaseModel, Field
-from typing import List
-
 class ArticleSchema(BaseModel):
     title: str
     points: int
@@ -115,6 +114,22 @@ llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
 map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
 print(map_result)
 
+# Extract structured information from URLs:
+class ExtractSchema(BaseModel):
+    title: str
+    description: str
+    links: List[str]
+
+# Convert the Pydantic model to a JSON schema
+extract_schema = ExtractSchema.schema()
+
+# Perform the extraction
+extract_result = app.extract_urls(['https://firecrawl.dev'], {
+    'prompt': "Extract the title, description, and links from the website",
+    'schema': extract_schema
+})
+print(extract_result)
+
 # Crawl a website with WebSockets:
 # inside an async function...
 nest_asyncio.apply()
diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py
index cb897b7e..d39b77a8 100644
--- a/apps/python-sdk/firecrawl/__init__.py
+++ b/apps/python-sdk/firecrawl/__init__.py
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp # noqa
 
-__version__ = "1.5.0"
+__version__ = "1.6.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index c2693c3d..bae2797d 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -12,15 +12,40 @@ Classes:
 import logging
 import os
 import time
-from typing import Any, Dict, Optional, List
+from typing import Any, Dict, Optional, List, Union
 import json
 
 import requests
+import pydantic
 import websockets
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
 class FirecrawlApp:
+    class ExtractParams(pydantic.BaseModel):
+        """
+        Parameters for the extract operation.
+        """
+        prompt: str
+        # 'schema' would shadow pydantic.BaseModel.schema(), so expose it via an alias
+        schema_: Optional[Any] = pydantic.Field(None, alias='schema')
+        system_prompt: Optional[str] = None
+
+    class ExtractResponse(pydantic.BaseModel):
+        """
+        Response from the extract operation.
+        """
+        success: bool
+        data: Optional[Any] = None
+        error: Optional[str] = None
+
+    class ErrorResponse(pydantic.BaseModel):
+        """
+        Error response.
+        """
+        success: bool
+        error: str
+
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
         Initialize the FirecrawlApp instance with API key, API URL.
@@ -434,6 +459,45 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'check batch scrape status')
 
+
+    def extract_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> Union[ExtractResponse, ErrorResponse]:
+        """
+        Extracts information from one or more URLs using the Firecrawl API.
+
+        Args:
+            urls (List[str]): The URLs to extract information from.
+            params (Optional[Dict[str, Any]]): Additional parameters for the extract request.
+
+        Returns:
+            Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
+        """
+        headers = self._prepare_headers()
+
+        if not params or not params.get('prompt'):
+            raise ValueError("Prompt is required")
+
+        if not params.get('schema'):
+            raise ValueError("Schema is required for extraction")
+
+        json_data = {'urls': urls, **params}
+        # Accept either a Pydantic model (converted here) or a ready-made JSON schema dict
+        json_schema = params['schema'].schema() if hasattr(params['schema'], 'schema') else params['schema']
+
+        try:
+            response = self._post_request(
+                f'{self.api_url}/v1/extract',
+                {**json_data, 'schema': json_schema},
+                headers
+            )
+            if response.status_code == 200:
+                return response.json()
+            else:
+                self._handle_error(response, "extract")
+        except Exception as e:
+            raise ValueError(str(e), 500)
+
+        return {'success': False, 'error': "Internal server error."}
+
     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.
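
For reviewers: both SDKs assemble the same POST body for /v1/extract. Below is a
minimal sketch of that payload, assuming the ExtractSchema model from example.py
above; server-side handling of these fields is outside this diff.

    # Sketch: build the /v1/extract payload the way extract_urls does.
    from typing import List
    from pydantic import BaseModel

    class ExtractSchema(BaseModel):
        title: str
        description: str
        links: List[str]

    payload = {
        'urls': ['https://firecrawl.dev'],
        'prompt': "Extract the title, description, and links from the website",
        # A Pydantic model is converted to a plain JSON schema dict before sending
        'schema': ExtractSchema.schema(),
    }
    print(sorted(payload['schema']['properties']))  # ['description', 'links', 'title']

Passing a ready-made JSON schema dict works as well, since extract_urls only
calls .schema() on values that have that attribute.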