sdks wip

commit 80d6cb16fb (parent a1c018fdb0)
JS SDK example:

@@ -1,4 +1,5 @@
 import FirecrawlApp from 'firecrawl';
+import { z } from 'zod';

 const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
@@ -42,6 +43,18 @@ const main = async () => {
   const mapResult = await app.mapUrl('https://firecrawl.dev');
   console.log(mapResult)

+  // Extract information from a website using LLM:
+  const extractSchema = z.object({
+    title: z.string(),
+    description: z.string(),
+    links: z.array(z.string())
+  });
+
+  const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
+    prompt: "Extract the title, description, and links from the website",
+    schema: extractSchema
+  });
+  console.log(extractResult);

   // Crawl a website with WebSockets:
   const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
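A note on the schema in this example: because `extractUrls` takes a Zod schema, the same object can also drive static typing on the caller's side. A minimal sketch (the payload values are made up for illustration, not part of the commit):

```ts
import { z } from 'zod';

// Same schema as in the example above.
const extractSchema = z.object({
  title: z.string(),
  description: z.string(),
  links: z.array(z.string()),
});

// z.infer turns the runtime schema into a static type, so callers can
// type the extract result without writing a separate interface.
type ExtractData = z.infer<typeof extractSchema>;

// Hypothetical response payload, for illustration only.
const data: ExtractData = {
  title: 'Firecrawl',
  description: 'Web scraping API',
  links: ['https://firecrawl.dev'],
};
console.log(data.links.length);
```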
Second JS SDK example (same addition, but left commented out):

@@ -42,6 +42,19 @@ const main = async () => {
   const mapResult = await app.mapUrl('https://firecrawl.dev');
   console.log(mapResult)

+  // // Extract information from a website using LLM:
+  // const extractSchema = z.object({
+  //   title: z.string(),
+  //   description: z.string(),
+  //   links: z.array(z.string())
+  // });
+
+  // const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
+  //   prompt: "Extract the title, description, and links from the website",
+  //   schema: extractSchema
+  // });
+  // console.log(extractResult);
+
   // Crawl a website with WebSockets:
   const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
JS SDK package.json (version bump):

@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.8.2",
+  "version": "1.9.0",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
JS SDK client source (types and FirecrawlApp class):

@@ -234,6 +234,26 @@ export interface MapResponse {
   error?: string;
 }

+/**
+ * Parameters for extracting information from URLs.
+ * Defines options for extracting information from URLs.
+ */
+export interface ExtractParams {
+  prompt: string;
+  schema?: zt.ZodSchema;
+  systemPrompt?: string;
+}
+
+/**
+ * Response interface for extracting information from URLs.
+ * Defines the structure of the response received after extracting information from URLs.
+ */
+export interface ExtractResponse {
+  success: true;
+  data: zt.infer<zt.ZodSchema>;
+  error?: string;
+}
+
 /**
  * Error response interface.
  * Defines the structure of the response received when an error occurs.
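One observation on these types: `data: zt.infer<zt.ZodSchema>` infers against the base schema type rather than a concrete schema, so it collapses to a very wide type. A hedged sketch of a generic variant, which is not in this commit, just an illustration of the trade-off:

```ts
import { z } from 'zod';

// Hypothetical generic variant (not part of the commit): tying the
// response to a concrete schema type lets z.infer produce a precise
// shape instead of the wide type that zt.infer<zt.ZodSchema> yields.
interface TypedExtractResponse<S extends z.ZodTypeAny> {
  success: true;
  data: z.infer<S>;
  error?: string;
}

const schema = z.object({ title: z.string() });
const resp: TypedExtractResponse<typeof schema> = {
  success: true,
  data: { title: 'Firecrawl' },
};
console.log(resp.data.title); // typed as string
```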
@@ -243,7 +263,6 @@ export interface ErrorResponse {
   error: string;
 }

-
 /**
  * Custom error class for Firecrawl.
  * Extends the built-in Error class to include a status code.
@ -675,6 +694,44 @@ export default class FirecrawlApp {
|
|||||||
return { success: false, error: "Internal server error." };
|
return { success: false, error: "Internal server error." };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts information from a URL using the Firecrawl API.
|
||||||
|
* @param url - The URL to extract information from.
|
||||||
|
* @param params - Additional parameters for the extract request.
|
||||||
|
* @returns The response from the extract operation.
|
||||||
|
*/
|
||||||
|
async extractUrls(urls: string[], params?: ExtractParams): Promise<ExtractResponse | ErrorResponse> {
|
||||||
|
const headers = this.prepareHeaders();
|
||||||
|
|
||||||
|
if (!params?.prompt) {
|
||||||
|
throw new FirecrawlError("Prompt is required", 400);
|
||||||
|
}
|
||||||
|
|
||||||
|
let jsonData: { urls: string[] } & ExtractParams= { urls, ...params };
|
||||||
|
let jsonSchema: any;
|
||||||
|
try {
|
||||||
|
jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
|
||||||
|
} catch (error: any) {
|
||||||
|
throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response: AxiosResponse = await this.postRequest(
|
||||||
|
this.apiUrl + `/v1/extract`,
|
||||||
|
{ ...jsonData, schema: jsonSchema },
|
||||||
|
headers
|
||||||
|
);
|
||||||
|
if (response.status === 200) {
|
||||||
|
return response.data as ExtractResponse;
|
||||||
|
} else {
|
||||||
|
this.handleError(response, "extract");
|
||||||
|
}
|
||||||
|
} catch (error: any) {
|
||||||
|
throw new FirecrawlError(error.message, 500);
|
||||||
|
}
|
||||||
|
return { success: false, error: "Internal server error." };
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prepares the headers for an API request.
|
* Prepares the headers for an API request.
|
||||||
* @param idempotencyKey - Optional key to ensure idempotency.
|
* @param idempotencyKey - Optional key to ensure idempotency.
|
||||||
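For reference, a hedged usage sketch of the new method, assuming a build of the package that includes this commit and a real API key; the success-flag check assumes `ErrorResponse` carries `success: false`, as the surrounding file suggests:

```ts
import FirecrawlApp from '@mendable/firecrawl-js';
import { z } from 'zod';

const main = async () => {
  const app = new FirecrawlApp({ apiKey: 'fc-YOUR_API_KEY' });

  const result = await app.extractUrls(['https://firecrawl.dev'], {
    prompt: 'Extract the title, description, and links from the website',
    schema: z.object({
      title: z.string(),
      description: z.string(),
      links: z.array(z.string()),
    }),
  });

  // The method can resolve to either ExtractResponse or ErrorResponse,
  // so check the success flag before touching data.
  if (result.success) {
    console.log(result.data);
  } else {
    console.error(result.error);
  }
};

main();
```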
Python SDK example:

@@ -2,6 +2,8 @@ import time
 import nest_asyncio
 import uuid
 from firecrawl.firecrawl import FirecrawlApp
+from pydantic import BaseModel, Field
+from typing import List

 app = FirecrawlApp(api_key="fc-")
@@ -50,9 +52,6 @@ print(crawl_status)

 # LLM Extraction:
 # Define schema to extract contents into using pydantic
-from pydantic import BaseModel, Field
-from typing import List
-
 class ArticleSchema(BaseModel):
     title: str
     points: int
@@ -115,6 +114,22 @@ llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
 map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
 print(map_result)

+# Extract URLs:
+class ExtractSchema(BaseModel):
+    title: str
+    description: str
+    links: List[str]
+
+# Define the schema using Pydantic
+extract_schema = ExtractSchema.schema()
+
+# Perform the extraction
+extract_result = app.extract_urls(['https://firecrawl.dev'], {
+    'prompt': "Extract the title, description, and links from the website",
+    'schema': extract_schema
+})
+print(extract_result)
+
 # Crawl a website with WebSockets:
 # inside an async function...
 nest_asyncio.apply()
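What goes over the wire here: pydantic's `.schema()` renders the model as JSON Schema, and that serialized form is what the `'schema'` key carries. A rough sketch of the output for `ExtractSchema` (pydantic v1-style; field-level `title` annotations omitted for brevity, so treat the exact keys as approximate):

```ts
// Approximate JSON Schema that ExtractSchema.schema() serializes to.
// This is the 'schema' value the SDK sends to /v1/extract.
const extractSchema = {
  title: 'ExtractSchema',
  type: 'object',
  properties: {
    title: { type: 'string' },
    description: { type: 'string' },
    links: { type: 'array', items: { type: 'string' } },
  },
  required: ['title', 'description', 'links'],
} as const;

console.log(JSON.stringify(extractSchema, null, 2));
```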
Python SDK package __init__ (version bump):

@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp  # noqa

-__version__ = "1.5.0"
+__version__ = "1.6.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
Python SDK client (firecrawl/firecrawl.py):

@@ -12,15 +12,39 @@ Classes:
 import logging
 import os
 import time
-from typing import Any, Dict, Optional, List
+from typing import Any, Dict, Optional, List, Union
 import json

 import requests
+import pydantic
 import websockets

 logger : logging.Logger = logging.getLogger("firecrawl")

 class FirecrawlApp:
+    class ExtractParams(pydantic.BaseModel):
+        """
+        Parameters for the extract operation.
+        """
+        prompt: str
+        schema: Optional[Any] = None
+        system_prompt: Optional[str] = None
+
+    class ExtractResponse(pydantic.BaseModel):
+        """
+        Response from the extract operation.
+        """
+        success: bool
+        data: Optional[Any] = None
+        error: Optional[str] = None
+
+    class ErrorResponse(pydantic.BaseModel):
+        """
+        Error response.
+        """
+        success: bool
+        error: str
+
     def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         """
         Initialize the FirecrawlApp instance with API key, API URL.
@ -434,6 +458,44 @@ class FirecrawlApp:
|
|||||||
else:
|
else:
|
||||||
self._handle_error(response, 'check batch scrape status')
|
self._handle_error(response, 'check batch scrape status')
|
||||||
|
|
||||||
|
|
||||||
|
def extract_urls(self, urls: List[str], params: Optional[ExtractParams] = None) -> Union[ExtractResponse, ErrorResponse]:
|
||||||
|
"""
|
||||||
|
Extracts information from a URL using the Firecrawl API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
urls (List[str]): The URLs to extract information from.
|
||||||
|
params (Optional[ExtractParams]): Additional parameters for the extract request.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
|
||||||
|
"""
|
||||||
|
headers = self._prepare_headers()
|
||||||
|
|
||||||
|
if not params or not params.get('prompt'):
|
||||||
|
raise ValueError("Prompt is required")
|
||||||
|
|
||||||
|
if not params.get('schema'):
|
||||||
|
raise ValueError("Schema is required for extraction")
|
||||||
|
|
||||||
|
jsonData = {'urls': urls, **params}
|
||||||
|
jsonSchema = params['schema'].schema() if hasattr(params['schema'], 'schema') else None
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = self._post_request(
|
||||||
|
f'{self.api_url}/v1/extract',
|
||||||
|
{**jsonData, 'schema': jsonSchema},
|
||||||
|
headers
|
||||||
|
)
|
||||||
|
if response.status_code == 200:
|
||||||
|
return response.json()
|
||||||
|
else:
|
||||||
|
self._handle_error(response, "extract")
|
||||||
|
except Exception as e:
|
||||||
|
raise ValueError(str(e), 500)
|
||||||
|
|
||||||
|
return {'success': False, 'error': "Internal server error."}
|
||||||
|
|
||||||
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
|
||||||
"""
|
"""
|
||||||
Prepare the headers for API requests.
|
Prepare the headers for API requests.
|
||||||
|
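One WIP rough edge worth noting: `extract_urls` annotates `params` as `Optional[ExtractParams]` (a pydantic model) but reads it with dict access (`params.get('prompt')`, `params['schema']`), so in practice the example above passes a plain dict. Setting that aside, the request both SDKs assemble from these hunks looks roughly like the sketch below; the base URL and the Authorization header format are assumptions, since the SDK's default `api_url` and `_prepare_headers` body are not shown in this diff:

```ts
// Rough sketch of the HTTP call behind extract_urls/extractUrls.
const run = async () => {
  const res = await fetch('https://api.firecrawl.dev/v1/extract', { // assumed base URL
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: 'Bearer fc-YOUR_API_KEY', // assumed header format
    },
    body: JSON.stringify({
      urls: ['https://firecrawl.dev'],
      prompt: 'Extract the title, description, and links from the website',
      schema: {
        type: 'object',
        properties: {
          title: { type: 'string' },
          description: { type: 'string' },
          links: { type: 'array', items: { type: 'string' } },
        },
        required: ['title', 'description', 'links'],
      },
    }),
  });
  console.log(await res.json());
};

run();
```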