Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 23:45:51 +08:00)
[python-SDK] improvs/async (#1337)
* improv/types-and-comments-descs
* async
* removed v0 in example
* tomkosms review
* refactor: dry request and error handling
* fixed websocket params
* added origin to requests
* Update firecrawl.py
* Update firecrawl.py
* added agent options types
* Update firecrawl.py
* generic
* Update firecrawl.py
* scrape params commentary
* Update firecrawl.py
* Update firecrawl.py
* Update firecrawl.py
* Update firecrawl.py
* async scrape
* Update firecrawl.py
* Nick: new examples
* Nick: python sdk 2.0
* async functions
* Nick:
* Nick:

Co-authored-by: Ademílson F. Tonato <ademilsonft@outlook.com>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
parent ec3d679c5b
commit 29b36c5f9a
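The diffs below touch both SDKs, but the common thread is that every request body is now tagged with an origin of the form js-sdk@<version> or python-sdk@<version>, and the Python SDK moves to plain keyword arguments. A minimal sketch of that tagging pattern on the Python side (illustrative only; the helper name, the requests usage, the /v1/scrape endpoint, and the version constant are assumptions, not the SDK source, which is in the suppressed firecrawl.py diff further down):

import requests  # assumption: a plain HTTP client stands in for the SDK's internals

SDK_VERSION = "2.0.0"  # matches the __version__ bump in __init__.py below

def scrape_url(api_url, api_key, url, **params):
    # Every outgoing body carries an origin tag so the API can tell SDK traffic apart,
    # mirroring the `js-sdk@${this.version}` lines added throughout the JS SDK diff below.
    payload = {"url": url, **params, "origin": f"python-sdk@{SDK_VERSION}"}
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    # /v1/scrape is assumed here; the diff shows the same pattern for /v1/crawl, /v1/extract, etc.
    return requests.post(f"{api_url}/v1/scrape", json=payload, headers=headers).json()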
@@ -59,7 +59,9 @@ export async function extractController(
  if (
    (await getTeamIdSyncB(req.auth.team_id)) &&
    req.body.origin !== "api-sdk" &&
-   req.body.origin !== "website"
+   req.body.origin !== "website" &&
+   !req.body.origin.startsWith("python-sdk@") &&
+   !req.body.origin.startsWith("js-sdk@")
  ) {
    return await oldExtract(req, res, extractId);
  }

@@ -278,14 +278,14 @@ v1Router.get(

 v1Router.post(
   "/deep-research",
-  authMiddleware(RateLimiterMode.Extract),
+  authMiddleware(RateLimiterMode.Crawl),
   checkCreditsMiddleware(1),
   wrap(deepResearchController),
 );

 v1Router.get(
   "/deep-research/:jobId",
-  authMiddleware(RateLimiterMode.ExtractStatus),
+  authMiddleware(RateLimiterMode.CrawlStatus),
   wrap(deepResearchStatusController),
 );

@@ -550,11 +550,26 @@ export interface GenerateLLMsTextStatusResponse {
 export default class FirecrawlApp {
   public apiKey: string;
   public apiUrl: string;
+  public version: string = "1.19.1";

   private isCloudService(url: string): boolean {
     return url.includes('api.firecrawl.dev');
   }
+
+  private async getVersion(): Promise<string> {
+    try {
+      const packageJson = await import('../package.json', { assert: { type: 'json' } });
+      return packageJson.default.version;
+    } catch (error) {
+      console.error("Error getting version:", error);
+      return "1.19.1";
+    }
+  }
+
+  private async init() {
+    this.version = await this.getVersion();
+  }

   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
@@ -568,6 +583,7 @@ export default class FirecrawlApp {

     this.apiKey = apiKey || '';
     this.apiUrl = baseUrl;
+    this.init();
   }

   /**
@@ -584,7 +600,7 @@ export default class FirecrawlApp {
       "Content-Type": "application/json",
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;

@@ -666,7 +682,7 @@ export default class FirecrawlApp {
       lang: params?.lang ?? "en",
       country: params?.country ?? "us",
       location: params?.location,
-      origin: params?.origin ?? "api",
+      origin: `js-sdk@${this.version}`,
       timeout: params?.timeout ?? 60000,
       scrapeOptions: params?.scrapeOptions ?? { formats: [] },
     };
@@ -738,7 +754,7 @@ export default class FirecrawlApp {
     idempotencyKey?: string
   ): Promise<CrawlStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/crawl`,
@@ -767,7 +783,7 @@ export default class FirecrawlApp {
     idempotencyKey?: string
   ): Promise<CrawlResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/crawl`,
@@ -943,7 +959,7 @@ export default class FirecrawlApp {
    */
   async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
-    let jsonData: { url: string } & MapParams = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };

     try {
       const response: AxiosResponse = await this.postRequest(
@@ -981,7 +997,7 @@ export default class FirecrawlApp {
     ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;

@@ -1046,7 +1062,7 @@ export default class FirecrawlApp {
     ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -1220,7 +1236,7 @@ export default class FirecrawlApp {
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/extract`,
-        { ...jsonData, schema: jsonSchema, origin: params?.origin || "api-sdk" },
+        { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
         headers
       );

@@ -1288,7 +1304,7 @@ export default class FirecrawlApp {
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/extract`,
-        { ...jsonData, schema: jsonSchema },
+        { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
         headers
       );

@@ -1579,7 +1595,7 @@ export default class FirecrawlApp {
    */
   async asyncDeepResearch(query: string, params: DeepResearchParams<zt.ZodSchema>): Promise<DeepResearchResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
-    let jsonData: any = { query, ...params };
+    let jsonData: any = { query, ...params, origin: `js-sdk@${this.version}` };

     if (jsonData?.jsonOptions?.schema) {
       let schema = jsonData.jsonOptions.schema;
@@ -1587,7 +1603,7 @@ export default class FirecrawlApp {
       try {
         schema = zodToJsonSchema(schema);
       } catch (error) {
-
+        // Ignore error if schema can't be parsed as Zod
       }
       jsonData = {
         ...jsonData,
@@ -1733,9 +1749,10 @@ export default class FirecrawlApp {
   async __asyncDeepResearch(topic: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
     try {
+      let jsonData: any = { topic, ...params, origin: `js-sdk@${this.version}` };
       const response: AxiosResponse = await this.postRequest(
         `${this.apiUrl}/v1/deep-research`,
-        { topic, ...params },
+        jsonData,
         headers
       );

@@ -1845,10 +1862,11 @@ export default class FirecrawlApp {
    */
   async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         `${this.apiUrl}/v1/llmstxt`,
-        { url, ...params },
+        jsonData,
         headers
       );

@@ -1,53 +1,45 @@
 import time
 import nest_asyncio
 import uuid
-from firecrawl.firecrawl import FirecrawlApp
+from firecrawl.firecrawl import ExtractConfig, FirecrawlApp
 from pydantic import BaseModel, Field
 from typing import List
 import time

-app = FirecrawlApp(api_url="https://api.firecrawl.dev")
+app = FirecrawlApp(api_key="fc-")

-# Scrape a website:
-scrape_result = app.scrape_url('firecrawl.dev')
-print(scrape_result['markdown'])
+# # Scrape a website:
+scrape_result = app.scrape_url('example.com', formats=["markdown", "html"])
+print(scrape_result.markdown)


-# Test batch scrape
+# # Test batch scrapeq
 urls = ['https://example.com', 'https://docs.firecrawl.dev']
-batch_scrape_params = {
-    'formats': ['markdown', 'html'],
-}

 # Synchronous batch scrape
-batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
+batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"])
 print("Synchronous Batch Scrape Result:")
-print(batch_result['data'][0]['markdown'])
+print(batch_result.data[0].markdown)

-# Asynchronous batch scrape
-async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
+# # Asynchronous batch scrape
+async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
 print("\nAsynchronous Batch Scrape Result:")
 print(async_batch_result)

 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
-print(crawl_result)
+crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
+print(crawl_result.data[0].markdown)

-# Asynchronous Crawl a website:
-async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+# # Asynchronous Crawl a website:
+async_result = app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
 print(async_result)

-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)

 attempts = 15
-while attempts > 0 and crawl_status['status'] != 'completed':
+while attempts > 0 and crawl_status.status != 'completed':
     print(crawl_status)
-    crawl_status = app.check_crawl_status(async_result['id'])
+    crawl_status = app.check_crawl_status(async_result.id)
     attempts -= 1
     time.sleep(1)

-crawl_status = app.get_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)

 # LLM Extraction:
@@ -61,14 +53,11 @@ class ArticleSchema(BaseModel):
 class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., description="Top 5 stories")

-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
-        'schema': TopArticlesSchema.model_json_schema()
-    }
-})
+extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())

-print(llm_extraction_result['extract'])
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+
+print(llm_extraction_result.extract)

 # # Define schema to extract contents into using json schema
 json_schema = {
@@ -94,24 +83,16 @@ json_schema = {
     "required": ["top"]
 }

-app2 = FirecrawlApp(api_key="fc-", version="v0")
-
-
-llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': json_schema,
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
-    }
-})
+extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True})
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+
+print(llm_extraction_result.extract)

 # print(llm_extraction_result['llm_extraction'])


 # Map a website:
-map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
+map_result = app.map_url('https://firecrawl.dev', search="blog")
 print(map_result)

 # Extract URLs:
@@ -124,14 +105,12 @@ class ExtractSchema(BaseModel):
 extract_schema = ExtractSchema.schema()

 # Perform the extraction
-extract_result = app.extract(['https://firecrawl.dev'], {
-    'prompt': "Extract the title, description, and links from the website",
-    'schema': extract_schema
-})
+extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
 print(extract_result)

 # Crawl a website with WebSockets:
 # inside an async function...
 import nest_asyncio
 nest_asyncio.apply()

 # Define event handlers
apps/python-sdk/example_async.py (new file, 120 lines)
@@ -0,0 +1,120 @@
import time
import nest_asyncio
import uuid
import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp
from pydantic import BaseModel, Field
from typing import List

app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev")

async def example_scrape():
    # Scrape a website:
    scrape_result = await app.scrape_url('example.com', formats=["markdown", "html"])
    print(scrape_result.markdown)

async def example_batch_scrape():
    # Batch scrape
    urls = ['https://example.com', 'https://docs.firecrawl.dev']

    # Synchronous batch scrape
    batch_result = await app.batch_scrape_urls(urls, formats=["markdown", "html"])
    print("Synchronous Batch Scrape Result:")
    print(batch_result.data[0].markdown)

    # Asynchronous batch scrape
    async_batch_result = await app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
    print("\nAsynchronous Batch Scrape Result:")
    print(async_batch_result)

async def example_crawl():
    # Crawl a website:
    crawl_result = await app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
    print(crawl_result.data[0].markdown)

    # Asynchronous Crawl a website:
    async_result = await app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
    print(async_result)

    crawl_status = await app.check_crawl_status(async_result.id)
    print(crawl_status)

    attempts = 15
    while attempts > 0 and crawl_status.status != 'completed':
        print(crawl_status)
        crawl_status = await app.check_crawl_status(async_result.id)
        attempts -= 1
        await asyncio.sleep(1)  # Use async sleep instead of time.sleep

    crawl_status = await app.check_crawl_status(async_result.id)
    print(crawl_status)

async def example_llm_extraction():
    # Define schema to extract contents into using pydantic
    class ArticleSchema(BaseModel):
        title: str
        points: int
        by: str
        commentsURL: str

    class TopArticlesSchema(BaseModel):
        top: List[ArticleSchema] = Field(..., description="Top 5 stories")

    extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())

    llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)

    print(llm_extraction_result.extract)

async def example_map_and_extract():
    # Map a website:
    map_result = await app.map_url('https://firecrawl.dev', search="blog")
    print(map_result)

    # Extract URLs:
    class ExtractSchema(BaseModel):
        title: str
        description: str
        links: List[str]

    # Define the schema using Pydantic
    extract_schema = ExtractSchema.schema()

    # Perform the extraction
    extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
    print(extract_result)

# Define event handlers for websocket
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

async def example_websocket_crawl():
    # Initiate the crawl job and get the watcher
    watcher = await app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 })

    # Add event listeners
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)

    # Start the watcher
    await watcher.connect()

async def main():
    nest_asyncio.apply()

    await example_scrape()
    await example_batch_scrape()
    await example_crawl()
    await example_llm_extraction()
    await example_map_and_extract()
    await example_websocket_crawl()

if __name__ == "__main__":
    asyncio.run(main())
@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp # noqa

-__version__ = "1.17.0"
+__version__ = "2.0.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
File diff suppressed because it is too large.
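The suppressed file is where the new AsyncFirecrawlApp used in example_async.py above lives. As a rough, hedged sketch of the request pattern it presumably follows (aiohttp is added to the dependencies below; the class body, method signature, endpoint, and origin string here are assumptions, not the suppressed source):

import aiohttp

class AsyncFirecrawlApp:  # sketch only; the real class is in the suppressed firecrawl.py diff
    def __init__(self, api_key: str = "", api_url: str = "https://api.firecrawl.dev"):
        self.api_key = api_key
        self.api_url = api_url

    async def scrape_url(self, url: str, **params):
        # Same origin tagging as the sync client, but issued through aiohttp
        payload = {"url": url, **params, "origin": "python-sdk@2.0.0"}
        headers = {"Authorization": f"Bearer {self.api_key}"}
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.post(f"{self.api_url}/v1/scrape", json=payload) as resp:
                return await resp.json()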
@@ -13,7 +13,8 @@ dependencies = [
     "python-dotenv",
     "websockets",
     "nest-asyncio",
-    "pydantic>=2.10.3",
+    "pydantic",
+    "aiohttp"
 ]
 authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
 maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
@@ -4,3 +4,4 @@ python-dotenv
 websockets
 nest-asyncio
 pydantic
+aiohttp
@@ -32,7 +32,9 @@ setup(
         'python-dotenv',
         'websockets',
         'asyncio',
-        'nest-asyncio'
+        'nest-asyncio',
+        'pydantic',
+        'aiohttp'
     ],
     python_requires=">=3.8",
     classifiers=[