[python-SDK] improvements/async (#1337)

* improv/types-and-comments-descs

* async

* removed v0 in example

* tomkosm's review

* refactor: DRY request and error handling

* fixed websocket params

* added origin to requests

* Update firecrawl.py

* Update firecrawl.py

* added agent options types

* Update firecrawl.py

* generic

* Update firecrawl.py

* scrape params commentary

* Update firecrawl.py

* Update firecrawl.py

* Update firecrawl.py

* Update firecrawl.py

* async scrape

* Update firecrawl.py

* Nick: new examples

* Nick: python sdk 2.0

* async functions

* Nick:

* Nick:

---------

Co-authored-by: Ademílson F. Tonato <ademilsonft@outlook.com>
Co-authored-by: Nicolas <nicolascamara29@gmail.com>
Authored by Rafael Miller on 2025-04-18 01:32:55 -07:00, committed by GitHub
parent ec3d679c5b
commit 29b36c5f9a
10 changed files with 3528 additions and 484 deletions

View File

@@ -59,7 +59,9 @@ export async function extractController(
   if (
     (await getTeamIdSyncB(req.auth.team_id)) &&
     req.body.origin !== "api-sdk" &&
-    req.body.origin !== "website"
+    req.body.origin !== "website" &&
+    !req.body.origin.startsWith("python-sdk@") &&
+    !req.body.origin.startsWith("js-sdk@")
   ) {
     return await oldExtract(req, res, extractId);
   }

View File

@@ -278,14 +278,14 @@ v1Router.get(
 v1Router.post(
   "/deep-research",
-  authMiddleware(RateLimiterMode.Extract),
+  authMiddleware(RateLimiterMode.Crawl),
   checkCreditsMiddleware(1),
   wrap(deepResearchController),
 );
 v1Router.get(
   "/deep-research/:jobId",
-  authMiddleware(RateLimiterMode.ExtractStatus),
+  authMiddleware(RateLimiterMode.CrawlStatus),
   wrap(deepResearchStatusController),
 );

View File

@@ -550,11 +550,26 @@ export interface GenerateLLMsTextStatusResponse {
 export default class FirecrawlApp {
   public apiKey: string;
   public apiUrl: string;
+  public version: string = "1.19.1";
   private isCloudService(url: string): boolean {
     return url.includes('api.firecrawl.dev');
   }
+  private async getVersion(): Promise<string> {
+    try {
+      const packageJson = await import('../package.json', { assert: { type: 'json' } });
+      return packageJson.default.version;
+    } catch (error) {
+      console.error("Error getting version:", error);
+      return "1.19.1";
+    }
+  }
+  private async init() {
+    this.version = await this.getVersion();
+  }
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
@@ -568,6 +583,7 @@ export default class FirecrawlApp {
     this.apiKey = apiKey || '';
     this.apiUrl = baseUrl;
+    this.init();
   }
/**
@@ -584,7 +600,7 @@
       "Content-Type": "application/json",
       Authorization: `Bearer ${this.apiKey}`,
     } as AxiosRequestHeaders;
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
@@ -666,7 +682,7 @@ export default class FirecrawlApp {
       lang: params?.lang ?? "en",
       country: params?.country ?? "us",
       location: params?.location,
-      origin: params?.origin ?? "api",
+      origin: `js-sdk@${this.version}`,
       timeout: params?.timeout ?? 60000,
       scrapeOptions: params?.scrapeOptions ?? { formats: [] },
     };
@@ -738,7 +754,7 @@
     idempotencyKey?: string
   ): Promise<CrawlStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/crawl`,
@@ -767,7 +783,7 @@
     idempotencyKey?: string
   ): Promise<CrawlResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/crawl`,
@@ -943,7 +959,7 @@ export default class FirecrawlApp {
    */
   async mapUrl(url: string, params?: MapParams): Promise<MapResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
-    let jsonData: { url: string } & MapParams = { url, ...params };
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
@@ -981,7 +997,7 @@
     ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
@@ -1046,7 +1062,7 @@
     ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) };
+    let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -1220,7 +1236,7 @@
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/extract`,
-        { ...jsonData, schema: jsonSchema, origin: params?.origin || "api-sdk" },
+        { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
         headers
       );
@@ -1288,7 +1304,7 @@
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/extract`,
-        { ...jsonData, schema: jsonSchema },
+        { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` },
         headers
       );
@@ -1579,7 +1595,7 @@
    */
   async asyncDeepResearch(query: string, params: DeepResearchParams<zt.ZodSchema>): Promise<DeepResearchResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
-    let jsonData: any = { query, ...params };
+    let jsonData: any = { query, ...params, origin: `js-sdk@${this.version}` };
     if (jsonData?.jsonOptions?.schema) {
       let schema = jsonData.jsonOptions.schema;
@@ -1587,7 +1603,7 @@
       try {
         schema = zodToJsonSchema(schema);
       } catch (error) {
+        // Ignore error if schema can't be parsed as Zod
       }
       jsonData = {
         ...jsonData,
@@ -1733,9 +1749,10 @@
   async __asyncDeepResearch(topic: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
     try {
+      let jsonData: any = { topic, ...params, origin: `js-sdk@${this.version}` };
       const response: AxiosResponse = await this.postRequest(
         `${this.apiUrl}/v1/deep-research`,
-        { topic, ...params },
+        jsonData,
         headers
       );
@@ -1845,10 +1862,11 @@
    */
   async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise<GenerateLLMsTextResponse | ErrorResponse> {
     const headers = this.prepareHeaders();
+    let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` };
     try {
       const response: AxiosResponse = await this.postRequest(
         `${this.apiUrl}/v1/llmstxt`,
-        { url, ...params },
+        jsonData,
         headers
       );
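The python-sdk half of this origin tagging lives in the firecrawl.py diff, which is suppressed further down, so here is a minimal sketch of the pattern it plausibly follows, mirroring the js-sdk lines above; the helper name tag_payload is illustrative, not part of the SDK.

    # Sketch only: mirrors the js-sdk change (origin: `js-sdk@${this.version}`) on the Python side.
    from firecrawl import __version__  # "2.0.0" after this commit

    def tag_payload(payload: dict) -> dict:
        # Attach the SDK origin so the API can tell SDK traffic apart
        # (see the extractController check near the top of this commit).
        return {**payload, "origin": f"python-sdk@{__version__}"}

    print(tag_payload({"url": "https://example.com"}))
    # {'url': 'https://example.com', 'origin': 'python-sdk@2.0.0'}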

View File

@@ -1,53 +1,45 @@
 import time
 import nest_asyncio
 import uuid
-from firecrawl.firecrawl import FirecrawlApp
+from firecrawl.firecrawl import ExtractConfig, FirecrawlApp
 from pydantic import BaseModel, Field
 from typing import List
+import time
-app = FirecrawlApp(api_url="https://api.firecrawl.dev")
+app = FirecrawlApp(api_key="fc-")
-# Scrape a website:
-scrape_result = app.scrape_url('firecrawl.dev')
-print(scrape_result['markdown'])
+# # Scrape a website:
+scrape_result = app.scrape_url('example.com', formats=["markdown", "html"])
+print(scrape_result.markdown)
-# Test batch scrape
+# # Test batch scrape
 urls = ['https://example.com', 'https://docs.firecrawl.dev']
-batch_scrape_params = {
-    'formats': ['markdown', 'html'],
-}
 # Synchronous batch scrape
-batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
+batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"])
 print("Synchronous Batch Scrape Result:")
-print(batch_result['data'][0]['markdown'])
+print(batch_result.data[0].markdown)
-# Asynchronous batch scrape
-async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
+# # Asynchronous batch scrape
+async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
 print("\nAsynchronous Batch Scrape Result:")
 print(async_batch_result)
 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
-print(crawl_result)
+crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
+print(crawl_result.data[0].markdown)
-# Asynchronous Crawl a website:
-async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+# # Asynchronous Crawl a website:
+async_result = app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
 print(async_result)
-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)
 attempts = 15
-while attempts > 0 and crawl_status['status'] != 'completed':
+while attempts > 0 and crawl_status.status != 'completed':
     print(crawl_status)
-    crawl_status = app.check_crawl_status(async_result['id'])
+    crawl_status = app.check_crawl_status(async_result.id)
     attempts -= 1
     time.sleep(1)
-crawl_status = app.get_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)
 # LLM Extraction:
@@ -61,14 +53,11 @@ class ArticleSchema(BaseModel):
 class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., description="Top 5 stories")
-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
-        'schema': TopArticlesSchema.model_json_schema()
-    }
-})
+extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
-print(llm_extraction_result['extract'])
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+print(llm_extraction_result.extract)
 # # Define schema to extract contents into using json schema
 json_schema = {
@@ -94,24 +83,16 @@ json_schema = {
     "required": ["top"]
 }
-app2 = FirecrawlApp(api_key="fc-", version="v0")
+extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True})
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
-llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': json_schema,
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
-    }
-})
+print(llm_extraction_result.extract)
 # print(llm_extraction_result['llm_extraction'])
 # Map a website:
-map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
+map_result = app.map_url('https://firecrawl.dev', search="blog")
 print(map_result)
# Extract URLs:
@@ -124,14 +105,12 @@ class ExtractSchema(BaseModel):
 extract_schema = ExtractSchema.schema()
 # Perform the extraction
-extract_result = app.extract(['https://firecrawl.dev'], {
-    'prompt': "Extract the title, description, and links from the website",
-    'schema': extract_schema
-})
+extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
 print(extract_result)
 # Crawl a website with WebSockets:
 # inside an async function...
 import nest_asyncio
 nest_asyncio.apply()
 # Define event handlers

View File

@@ -0,0 +1,120 @@
import time
import nest_asyncio
import uuid
import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp, ExtractConfig
from pydantic import BaseModel, Field
from typing import List

app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev")

async def example_scrape():
    # Scrape a website:
    scrape_result = await app.scrape_url('example.com', formats=["markdown", "html"])
    print(scrape_result.markdown)

async def example_batch_scrape():
    # Batch scrape
    urls = ['https://example.com', 'https://docs.firecrawl.dev']
    # Synchronous batch scrape
    batch_result = await app.batch_scrape_urls(urls, formats=["markdown", "html"])
    print("Synchronous Batch Scrape Result:")
    print(batch_result.data[0].markdown)
    # Asynchronous batch scrape
    async_batch_result = await app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
    print("\nAsynchronous Batch Scrape Result:")
    print(async_batch_result)

async def example_crawl():
    # Crawl a website:
    crawl_result = await app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
    print(crawl_result.data[0].markdown)
    # Asynchronous Crawl a website:
    async_result = await app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
    print(async_result)
    crawl_status = await app.check_crawl_status(async_result.id)
    print(crawl_status)
    attempts = 15
    while attempts > 0 and crawl_status.status != 'completed':
        print(crawl_status)
        crawl_status = await app.check_crawl_status(async_result.id)
        attempts -= 1
        await asyncio.sleep(1)  # Use async sleep instead of time.sleep
    crawl_status = await app.check_crawl_status(async_result.id)
    print(crawl_status)

async def example_llm_extraction():
    # Define schema to extract contents into using pydantic
    class ArticleSchema(BaseModel):
        title: str
        points: int
        by: str
        commentsURL: str

    class TopArticlesSchema(BaseModel):
        top: List[ArticleSchema] = Field(..., description="Top 5 stories")

    extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
    llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
    print(llm_extraction_result.extract)

async def example_map_and_extract():
    # Map a website:
    map_result = await app.map_url('https://firecrawl.dev', search="blog")
    print(map_result)
    # Extract URLs:
    class ExtractSchema(BaseModel):
        title: str
        description: str
        links: List[str]

    # Define the schema using Pydantic
    extract_schema = ExtractSchema.schema()
    # Perform the extraction
    extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
    print(extract_result)

# Define event handlers for websocket
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

async def example_websocket_crawl():
    # Initiate the crawl job and get the watcher
    watcher = await app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 })
    # Add event listeners
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)
    # Start the watcher
    await watcher.connect()

async def main():
    nest_asyncio.apply()
    await example_scrape()
    await example_batch_scrape()
    await example_crawl()
    await example_llm_extraction()
    await example_map_and_extract()
    await example_websocket_crawl()

if __name__ == "__main__":
    asyncio.run(main())
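Since the new AsyncFirecrawlApp is awaitable end to end, requests can also be issued concurrently rather than one after another. A minimal sketch, assuming the async client accepts api_key like the sync client and that scrape_url returns an object with a .markdown attribute as in the example above:

    import asyncio
    from firecrawl.firecrawl import AsyncFirecrawlApp

    async def scrape_many(urls):
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        # Fire all scrapes at once instead of awaiting them one by one.
        results = await asyncio.gather(*(app.scrape_url(u, formats=["markdown"]) for u in urls))
        for u, r in zip(urls, results):
            print(u, len(r.markdown or ""))

    asyncio.run(scrape_many(["https://example.com", "https://docs.firecrawl.dev"]))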

View File

@@ -13,7 +13,7 @@ import os
 from .firecrawl import FirecrawlApp # noqa
-__version__ = "1.17.0"
+__version__ = "2.0.0"
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

File diff suppressed because it is too large

View File

@@ -13,7 +13,8 @@ dependencies = [
     "python-dotenv",
     "websockets",
     "nest-asyncio",
-    "pydantic>=2.10.3",
+    "pydantic",
+    "aiohttp"
 ]
 authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
 maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
View File

@@ -4,3 +4,4 @@ python-dotenv
 websockets
 nest-asyncio
 pydantic
+aiohttp

View File

@@ -32,7 +32,9 @@ setup(
         'python-dotenv',
         'websockets',
         'asyncio',
-        'nest-asyncio'
+        'nest-asyncio',
+        'pydantic',
+        'aiohttp'
     ],
     python_requires=">=3.8",
     classifiers=[
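With pydantic and aiohttp now declared in pyproject.toml, requirements.txt, and setup.py, a quick post-upgrade check might look like the sketch below (assuming the package is installed from PyPI as firecrawl-py):

    # pip install --upgrade firecrawl-py
    import firecrawl
    from firecrawl.firecrawl import FirecrawlApp, AsyncFirecrawlApp

    print(firecrawl.__version__)  # expected to print "2.0.0" after this commit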