commit 80d6cb16fb
parent a1c018fdb0
Author: rafaelmmiller
Date: 2024-11-14 15:51:27 -03:00

7 changed files with 167 additions and 7 deletions

View File

@@ -1,4 +1,5 @@
 import FirecrawlApp from 'firecrawl';
+import { z } from 'zod';

 const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
@@ -42,6 +43,18 @@ const main = async () => {
   const mapResult = await app.mapUrl('https://firecrawl.dev');
   console.log(mapResult)

+  // Extract information from a website using LLM:
+  const extractSchema = z.object({
+    title: z.string(),
+    description: z.string(),
+    links: z.array(z.string())
+  });
+  const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
+    prompt: "Extract the title, description, and links from the website",
+    schema: extractSchema
+  });
+  console.log(extractResult);

   // Crawl a website with WebSockets:
   const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
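The new `extractUrls` call above returns `data` without a useful static type (the `ExtractResponse` interface added later in this commit infers from the generic `zt.ZodSchema`, which collapses to `any`), so the same zod schema can double as a runtime validator and a source of static types on the caller side. A minimal sketch of that pattern, using only plain zod; `parseExtracted` is an illustrative helper, not part of the SDK:

import { z } from 'zod';

const extractSchema = z.object({
  title: z.string(),
  description: z.string(),
  links: z.array(z.string())
});

// Static type derived from the schema:
// { title: string; description: string; links: string[] }
type ExtractedData = z.infer<typeof extractSchema>;

// Validate a loosely typed payload (e.g. extractResult.data from the
// example above) at runtime instead of trusting a cast.
function parseExtracted(payload: unknown): ExtractedData | null {
  const parsed = extractSchema.safeParse(payload);
  return parsed.success ? parsed.data : null;
}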

View File

@@ -42,6 +42,19 @@ const main = async () => {
   const mapResult = await app.mapUrl('https://firecrawl.dev');
   console.log(mapResult)

+  // // Extract information from a website using LLM:
+  // const extractSchema = z.object({
+  //   title: z.string(),
+  //   description: z.string(),
+  //   links: z.array(z.string())
+  // });
+  // const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
+  //   prompt: "Extract the title, description, and links from the website",
+  //   schema: extractSchema
+  // });
+  // console.log(extractResult);

   // Crawl a website with WebSockets:
   const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});

View File

@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.8.2",
+  "version": "1.9.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",

View File

@@ -234,6 +234,26 @@ export interface MapResponse {
   error?: string;
 }

+/**
+ * Parameters for extracting information from URLs.
+ * Defines options for extracting information from URLs.
+ */
+export interface ExtractParams {
+  prompt: string;
+  schema?: zt.ZodSchema;
+  systemPrompt?: string;
+}
+
+/**
+ * Response interface for extracting information from URLs.
+ * Defines the structure of the response received after extracting information from URLs.
+ */
+export interface ExtractResponse {
+  success: true;
+  data: zt.infer<zt.ZodSchema>;
+  error?: string;
+}
+
 /**
  * Error response interface.
  * Defines the structure of the response received when an error occurs.
@@ -243,7 +263,6 @@ export interface ErrorResponse {
   error: string;
 }
-
 /**
  * Custom error class for Firecrawl.
  * Extends the built-in Error class to include a status code.
@@ -675,6 +694,44 @@ export default class FirecrawlApp {
     return { success: false, error: "Internal server error." };
   }

+  /**
+   * Extracts information from URLs using the Firecrawl API.
+   * @param urls - The URLs to extract information from.
+   * @param params - Additional parameters for the extract request.
+   * @returns The response from the extract operation.
+   */
+  async extractUrls(urls: string[], params?: ExtractParams): Promise<ExtractResponse | ErrorResponse> {
+    const headers = this.prepareHeaders();
+    if (!params?.prompt) {
+      throw new FirecrawlError("Prompt is required", 400);
+    }
+    let jsonData: { urls: string[] } & ExtractParams = { urls, ...params };
+    let jsonSchema: any;
+    try {
+      jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
+    } catch (error: any) {
+      throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
+    }
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/extract`,
+        { ...jsonData, schema: jsonSchema },
+        headers
+      );
+      if (response.status === 200) {
+        return response.data as ExtractResponse;
+      } else {
+        this.handleError(response, "extract");
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error.message, 500);
+    }
+    return { success: false, error: "Internal server error." };
+  }
+
   /**
    * Prepares the headers for an API request.
    * @param idempotencyKey - Optional key to ensure idempotency.

View File

@@ -2,6 +2,8 @@ import time
 import nest_asyncio
 import uuid
 from firecrawl.firecrawl import FirecrawlApp
+from pydantic import BaseModel, Field
+from typing import List

 app = FirecrawlApp(api_key="fc-")
@@ -50,9 +52,6 @@ print(crawl_status)
 # LLM Extraction:
 # Define schema to extract contents into using pydantic
-from pydantic import BaseModel, Field
-from typing import List
-
 class ArticleSchema(BaseModel):
     title: str
     points: int
@@ -115,6 +114,22 @@ llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
 map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
 print(map_result)

+# Extract URLs:
+class ExtractSchema(BaseModel):
+    title: str
+    description: str
+    links: List[str]
+
+# Define the schema using Pydantic
+extract_schema = ExtractSchema.schema()
+
+# Perform the extraction
+extract_result = app.extract_urls(['https://firecrawl.dev'], {
+    'prompt': "Extract the title, description, and links from the website",
+    'schema': extract_schema
+})
+print(extract_result)

 # Crawl a website with WebSockets:
 # inside an async function...
 nest_asyncio.apply()

View File

@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp  # noqa

-__version__ = "1.5.0"
+__version__ = "1.6.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -12,15 +12,39 @@ Classes:
 import logging
 import os
 import time
-from typing import Any, Dict, Optional, List
+from typing import Any, Dict, Optional, List, Union
 import json
 import requests
+import pydantic
 import websockets

 logger : logging.Logger = logging.getLogger("firecrawl")

 class FirecrawlApp:
+    class ExtractParams(pydantic.BaseModel):
+        """
+        Parameters for the extract operation.
+        """
+        prompt: str
+        schema: Optional[Any] = None
+        system_prompt: Optional[str] = None
+
+    class ExtractResponse(pydantic.BaseModel):
+        """
+        Response from the extract operation.
+        """
+        success: bool
+        data: Optional[Any] = None
+        error: Optional[str] = None
+
+    class ErrorResponse(pydantic.BaseModel):
+        """
+        Error response.
+        """
+        success: bool
+        error: str
+
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
         Initialize the FirecrawlApp instance with API key, API URL.
@@ -434,6 +458,44 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'check batch scrape status')

+    def extract_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> Union[ExtractResponse, ErrorResponse]:
+        """
+        Extracts information from URLs using the Firecrawl API.
+
+        Args:
+            urls (List[str]): The URLs to extract information from.
+            params (Optional[Dict[str, Any]]): Additional parameters for the extract request, mirroring ExtractParams.
+
+        Returns:
+            Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
+        """
+        headers = self._prepare_headers()
+        if not params or not params.get('prompt'):
+            raise ValueError("Prompt is required")
+        if not params.get('schema'):
+            raise ValueError("Schema is required for extraction")
+        json_data = {'urls': urls, **params}
+        # Accept either a pydantic model (call .schema()) or an already-serialized dict,
+        # as passed in the example script.
+        schema = params['schema']
+        json_schema = schema.schema() if hasattr(schema, 'schema') else schema
+        try:
+            response = self._post_request(
+                f'{self.api_url}/v1/extract',
+                {**json_data, 'schema': json_schema},
+                headers
+            )
+            if response.status_code == 200:
+                return response.json()
+            else:
+                self._handle_error(response, "extract")
+        except Exception as e:
+            raise ValueError(str(e)) from e
+        return {'success': False, 'error': "Internal server error."}

     def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
         """
         Prepare the headers for API requests.