[python-SDK] improvements/async (#1337)

* improv/types-and-comments-descs

* async

* removed v0 in example

* tomkosm's review

* refactor: DRY request and error handling

* fixed websocket params

* added origin to requests

* Update firecrawl.py

* Update firecrawl.py

* added agent options types

* Update firecrawl.py

* generic

* Update firecrawl.py

* scrape params commentary

* Update firecrawl.py

* Update firecrawl.py

* Update firecrawl.py

* Update firecrawl.py

* async scrape

* Update firecrawl.py

* Nick: new examples

* Nick: python sdk 2.0

* async functions

* Nick:

* Nick:

---------

Co-authored-by: Ademílson F. Tonato <ademilsonft@outlook.com>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
Authored by Rafael Miller on 2025-04-18 01:32:55 -07:00, committed by GitHub
parent ec3d679c5b
commit 29b36c5f9a
10 changed files with 3528 additions and 484 deletions

View File

@@ -59,7 +59,9 @@ export async function extractController(
   if (
     (await getTeamIdSyncB(req.auth.team_id)) &&
     req.body.origin !== "api-sdk" &&
-    req.body.origin !== "website"
+    req.body.origin !== "website" &&
+    !req.body.origin.startsWith("python-sdk@") &&
+    !req.body.origin.startsWith("js-sdk@")
   ) {
     return await oldExtract(req, res, extractId);
   }

View File

@@ -278,14 +278,14 @@ v1Router.get(
 v1Router.post(
   "/deep-research",
-  authMiddleware(RateLimiterMode.Extract),
+  authMiddleware(RateLimiterMode.Crawl),
   checkCreditsMiddleware(1),
   wrap(deepResearchController),
 );
 v1Router.get(
   "/deep-research/:jobId",
-  authMiddleware(RateLimiterMode.ExtractStatus),
+  authMiddleware(RateLimiterMode.CrawlStatus),
   wrap(deepResearchStatusController),
 );

View File

@@ -550,11 +550,26 @@ export interface GenerateLLMsTextStatusResponse {
 export default class FirecrawlApp {
   public apiKey: string;
   public apiUrl: string;
+  public version: string = "1.19.1";
   private isCloudService(url: string): boolean {
     return url.includes('api.firecrawl.dev');
   }
+  private async getVersion(): Promise<string> {
+    try {
+      const packageJson = await import('../package.json', { assert: { type: 'json' } });
+      return packageJson.default.version;
+    } catch (error) {
+      console.error("Error getting version:", error);
+      return "1.19.1";
+    }
+  }
+  private async init() {
+    this.version = await this.getVersion();
+  }
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
@@ -568,6 +583,7 @@ export default class FirecrawlApp {
     this.apiKey = apiKey || '';
     this.apiUrl = baseUrl;
+    this.init();
   }
/**
@@ -584,7 +600,7 @@
       "Content-Type": "application/json",
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
@@ -666,7 +682,7 @@ export default class FirecrawlApp {
       lang: params?.lang ?? "en",
       country: params?.country ?? "us",
       location: params?.location,
-      origin: params?.origin ?? "api",
+      origin: `js-sdk@${this.version}`,
       timeout: params?.timeout ?? 60000,
       scrapeOptions: params?.scrapeOptions ?? { formats: [] },
     };
@@ -738,7 +754,7 @@
     idempotencyKey?: string
   ): Promise<CrawlStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/crawl`,
@@ -767,7 +783,7 @@
     idempotencyKey?: string
   ): Promise<CrawlResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/crawl`,
@@ -943,7 +959,7 @@ export default class FirecrawlApp {
    */
   async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
-    let jsonData: { url: string } & MapParams = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
@@ -981,7 +997,7 @@
     ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
@@ -1046,7 +1062,7 @@
     ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -1220,7 +1236,7 @@
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/extract`,
-        { ...jsonData, schema: jsonSchema, origin: params?.origin || "api-sdk" },
+        { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
         headers
       );
@@ -1288,7 +1304,7 @@
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/extract`,
-        { ...jsonData, schema: jsonSchema },
+        { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
         headers
       );
@@ -1579,7 +1595,7 @@
    */
   async asyncDeepResearch(query: string, params: DeepResearchParams<zt.ZodSchema>): Promise<DeepResearchResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
-    let jsonData: any = { query, ...params };
+    let jsonData: any = { query, ...params, origin: `js-sdk@${this.version}` };
     if (jsonData?.jsonOptions?.schema) {
       let schema = jsonData.jsonOptions.schema;
@@ -1587,7 +1603,7 @@
       try {
         schema = zodToJsonSchema(schema);
       } catch (error) {
+        // Ignore error if schema can't be parsed as Zod
       }
       jsonData = {
         ...jsonData,
@@ -1733,9 +1749,10 @@
   async __asyncDeepResearch(topic: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
     try {
+      let jsonData: any = { topic, ...params, origin: `js-sdk@${this.version}` };
       const response: AxiosResponse = await this.postRequest(
         `${this.apiUrl}/v1/deep-research`,
-        { topic, ...params },
+        jsonData,
         headers
       );
@@ -1845,10 +1862,11 @@
    */
   async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         `${this.apiUrl}/v1/llmstxt`,
-        { url, ...params },
+        jsonData,
         headers
       );
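The python-sdk half of this origin tagging lives in the firecrawl.py diff, which is suppressed further down, so here is a minimal sketch of the pattern it plausibly follows, mirroring the js-sdk lines above; the helper name tag_payload is illustrative, not part of the SDK.

    # Sketch only: mirrors the js-sdk change (origin: `js-sdk@${this.version}`) on the Python side.
    from firecrawl import __version__  # "2.0.0" after this commit

    def tag_payload(payload: dict) -> dict:
        # Attach the SDK origin so the API can tell SDK traffic apart
        # (see the extractController check near the top of this commit).
        return {**payload, "origin": f"python-sdk@{__version__}"}

    print(tag_payload({"url": "https://example.com"}))
    # {'url': 'https://example.com', 'origin': 'python-sdk@2.0.0'}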

View File

@@ -1,53 +1,45 @@
 import time
 import nest_asyncio
 import uuid
-from firecrawl.firecrawl import FirecrawlApp
+from firecrawl.firecrawl import ExtractConfig, FirecrawlApp
 from pydantic import BaseModel, Field
 from typing import List
+import time
-app = FirecrawlApp(api_url="https://api.firecrawl.dev")
+app = FirecrawlApp(api_key="fc-")
-# Scrape a website:
-scrape_result = app.scrape_url('firecrawl.dev')
-print(scrape_result['markdown'])
+# # Scrape a website:
+scrape_result = app.scrape_url('example.com', formats=["markdown", "html"])
+print(scrape_result.markdown)
-# Test batch scrape
+# # Test batch scrape
 urls = ['https://example.com', 'https://docs.firecrawl.dev']
-batch_scrape_params = {
-    'formats': ['markdown', 'html'],
-}
 # Synchronous batch scrape
-batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
+batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"])
 print("Synchronous Batch Scrape Result:")
-print(batch_result['data'][0]['markdown'])
+print(batch_result.data[0].markdown)
-# Asynchronous batch scrape
-async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
+# # Asynchronous batch scrape
+async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
 print("\nAsynchronous Batch Scrape Result:")
 print(async_batch_result)
 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
-print(crawl_result)
+crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
+print(crawl_result.data[0].markdown)
-# Asynchronous Crawl a website:
-async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+# # Asynchronous Crawl a website:
+async_result = app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
 print(async_result)
-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)
 attempts = 15
-while attempts > 0 and crawl_status['status'] != 'completed':
+while attempts > 0 and crawl_status.status != 'completed':
     print(crawl_status)
-    crawl_status = app.check_crawl_status(async_result['id'])
+    crawl_status = app.check_crawl_status(async_result.id)
     attempts -= 1
     time.sleep(1)
-crawl_status = app.get_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)
 # LLM Extraction:
@@ -61,14 +53,11 @@ class ArticleSchema(BaseModel):
 class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., description="Top 5 stories")
-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
-        'schema': TopArticlesSchema.model_json_schema()
-    }
-})
+extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
-print(llm_extraction_result['extract'])
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+print(llm_extraction_result.extract)
 # # Define schema to extract contents into using json schema
 json_schema = {
@@ -94,24 +83,16 @@ json_schema = {
     "required": ["top"]
 }
-app2 = FirecrawlApp(api_key="fc-", version="v0")
+extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True})
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
-llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': json_schema,
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
-    }
-})
+print(llm_extraction_result.extract)
 # print(llm_extraction_result['llm_extraction'])
 # Map a website:
-map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
+map_result = app.map_url('https://firecrawl.dev', search="blog")
 print(map_result)
# Extract URLs:
@@ -124,14 +105,12 @@ class ExtractSchema(BaseModel):
 extract_schema = ExtractSchema.schema()
 # Perform the extraction
-extract_result = app.extract(['https://firecrawl.dev'], {
-    'prompt': "Extract the title, description, and links from the website",
-    'schema': extract_schema
-})
+extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
 print(extract_result)
 # Crawl a website with WebSockets:
 # inside an async function...
 import nest_asyncio
 nest_asyncio.apply()
 # Define event handlers

View File

@@ -0,0 +1,120 @@
import time
import nest_asyncio
import uuid
import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp, ExtractConfig
from pydantic import BaseModel, Field
from typing import List

app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev")

async def example_scrape():
    # Scrape a website:
    scrape_result = await app.scrape_url('example.com', formats=["markdown", "html"])
    print(scrape_result.markdown)

async def example_batch_scrape():
    # Batch scrape
    urls = ['https://example.com', 'https://docs.firecrawl.dev']
    # Synchronous batch scrape
    batch_result = await app.batch_scrape_urls(urls, formats=["markdown", "html"])
    print("Synchronous Batch Scrape Result:")
    print(batch_result.data[0].markdown)
    # Asynchronous batch scrape
    async_batch_result = await app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
    print("\nAsynchronous Batch Scrape Result:")
    print(async_batch_result)

async def example_crawl():
    # Crawl a website:
    crawl_result = await app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
    print(crawl_result.data[0].markdown)
    # Asynchronous Crawl a website:
    async_result = await app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
    print(async_result)
    crawl_status = await app.check_crawl_status(async_result.id)
    print(crawl_status)
    attempts = 15
    while attempts > 0 and crawl_status.status != 'completed':
        print(crawl_status)
        crawl_status = await app.check_crawl_status(async_result.id)
        attempts -= 1
        await asyncio.sleep(1)  # Use async sleep instead of time.sleep
    crawl_status = await app.check_crawl_status(async_result.id)
    print(crawl_status)

async def example_llm_extraction():
    # Define schema to extract contents into using pydantic
    class ArticleSchema(BaseModel):
        title: str
        points: int
        by: str
        commentsURL: str

    class TopArticlesSchema(BaseModel):
        top: List[ArticleSchema] = Field(..., description="Top 5 stories")

    extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
    llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
    print(llm_extraction_result.extract)

async def example_map_and_extract():
    # Map a website:
    map_result = await app.map_url('https://firecrawl.dev', search="blog")
    print(map_result)
    # Extract URLs:
    class ExtractSchema(BaseModel):
        title: str
        description: str
        links: List[str]

    # Define the schema using Pydantic
    extract_schema = ExtractSchema.schema()
    # Perform the extraction
    extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
    print(extract_result)

# Define event handlers for websocket
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

async def example_websocket_crawl():
    # Initiate the crawl job and get the watcher
    watcher = await app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 })
    # Add event listeners
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)
    # Start the watcher
    await watcher.connect()

async def main():
    nest_asyncio.apply()
    await example_scrape()
    await example_batch_scrape()
    await example_crawl()
    await example_llm_extraction()
    await example_map_and_extract()
    await example_websocket_crawl()

if __name__ == "__main__":
    asyncio.run(main())
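Since the new AsyncFirecrawlApp is awaitable end to end, requests can also be issued concurrently rather than one after another. A minimal sketch, assuming the async client accepts api_key like the sync client and that scrape_url returns an object with a .markdown attribute as in the example above:

    import asyncio
    from firecrawl.firecrawl import AsyncFirecrawlApp

    async def scrape_many(urls):
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        # Fire all scrapes at once instead of awaiting them one by one.
        results = await asyncio.gather(*(app.scrape_url(u, formats=["markdown"]) for u in urls))
        for u, r in zip(urls, results):
            print(u, len(r.markdown or ""))

    asyncio.run(scrape_many(["https://example.com", "https://docs.firecrawl.dev"]))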

View File

@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp # noqa
-__version__ = "1.17.0"
+__version__ = "2.0.0"
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

File diff suppressed because it is too large

View File

@@ -13,7 +13,8 @@ dependencies = [
     "python-dotenv",
     "websockets",
     "nest-asyncio",
-    "pydantic>=2.10.3",
+    "pydantic",
+    "aiohttp"
 ]
 authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
 maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
View File

@@ -4,3 +4,4 @@ python-dotenv
 websockets
 nest-asyncio
 pydantic
+aiohttp

View File

@@ -32,7 +32,9 @@ setup(
         'python-dotenv',
         'websockets',
         'asyncio',
-        'nest-asyncio'
+        'nest-asyncio',
+        'pydantic',
+        'aiohttp'
     ],
     python_requires=">=3.8",
     classifiers=[
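With pydantic and aiohttp now declared in pyproject.toml, requirements.txt, and setup.py, a quick post-upgrade check might look like the sketch below (assuming the package is installed from PyPI as firecrawl-py):

    # pip install --upgrade firecrawl-py
    import firecrawl
    from firecrawl.firecrawl import FirecrawlApp, AsyncFirecrawlApp

    print(firecrawl.__version__)  # expected to print "2.0.0" after this commit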