diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 35061ac3..6807b7b5 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -59,7 +59,9 @@ export async function extractController( if ( (await getTeamIdSyncB(req.auth.team_id)) && req.body.origin !== "api-sdk" && - req.body.origin !== "website" + req.body.origin !== "website" && + !req.body.origin.startsWith("python-sdk@") && + !req.body.origin.startsWith("js-sdk@") ) { return await oldExtract(req, res, extractId); } diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 185a70de..271f2b17 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -278,14 +278,14 @@ v1Router.get( v1Router.post( "/deep-research", - authMiddleware(RateLimiterMode.Extract), + authMiddleware(RateLimiterMode.Crawl), checkCreditsMiddleware(1), wrap(deepResearchController), ); v1Router.get( "/deep-research/:jobId", - authMiddleware(RateLimiterMode.ExtractStatus), + authMiddleware(RateLimiterMode.CrawlStatus), wrap(deepResearchStatusController), ); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 779c5e71..ed090e10 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -550,11 +550,26 @@ export interface GenerateLLMsTextStatusResponse { export default class FirecrawlApp { public apiKey: string; public apiUrl: string; - + public version: string = "1.19.1"; + private isCloudService(url: string): boolean { return url.includes('api.firecrawl.dev'); } + private async getVersion(): Promise { + try { + const packageJson = await import('../package.json', { assert: { type: 'json' } }); + return packageJson.default.version; + } catch (error) { + console.error("Error getting version:", error); + return "1.19.1"; + } + } + + private async init() { + this.version = await this.getVersion(); + } + /** * Initializes a new instance of the FirecrawlApp class. * @param config - Configuration options for the FirecrawlApp instance. @@ -568,6 +583,7 @@ export default class FirecrawlApp { this.apiKey = apiKey || ''; this.apiUrl = baseUrl; + this.init(); } /** @@ -584,7 +600,7 @@ export default class FirecrawlApp { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, } as AxiosRequestHeaders; - let jsonData: any = { url, ...params }; + let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` }; if (jsonData?.extract?.schema) { let schema = jsonData.extract.schema; @@ -666,7 +682,7 @@ export default class FirecrawlApp { lang: params?.lang ?? "en", country: params?.country ?? "us", location: params?.location, - origin: params?.origin ?? "api", + origin: `js-sdk@${this.version}`, timeout: params?.timeout ?? 60000, scrapeOptions: params?.scrapeOptions ?? 
{ formats: [] }, }; @@ -738,7 +754,7 @@ export default class FirecrawlApp { idempotencyKey?: string ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { url, ...params }; + let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/crawl`, @@ -767,7 +783,7 @@ export default class FirecrawlApp { idempotencyKey?: string ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { url, ...params }; + let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/crawl`, @@ -943,7 +959,7 @@ export default class FirecrawlApp { */ async mapUrl(url: string, params?: MapParams): Promise { const headers = this.prepareHeaders(); - let jsonData: { url: string } & MapParams = { url, ...params }; + let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( @@ -981,7 +997,7 @@ export default class FirecrawlApp { ignoreInvalidURLs?: boolean, ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params }; + let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` }; if (jsonData?.extract?.schema) { let schema = jsonData.extract.schema; @@ -1046,7 +1062,7 @@ export default class FirecrawlApp { ignoreInvalidURLs?: boolean, ): Promise { const headers = this.prepareHeaders(idempotencyKey); - let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...(params ?? {}) }; + let jsonData: any = { urls, webhook, ignoreInvalidURLs, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/batch/scrape`, @@ -1220,7 +1236,7 @@ export default class FirecrawlApp { try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/extract`, - { ...jsonData, schema: jsonSchema, origin: params?.origin || "api-sdk" }, + { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` }, headers ); @@ -1288,7 +1304,7 @@ export default class FirecrawlApp { try { const response: AxiosResponse = await this.postRequest( this.apiUrl + `/v1/extract`, - { ...jsonData, schema: jsonSchema }, + { ...jsonData, schema: jsonSchema, origin: `js-sdk@${this.version}` }, headers ); @@ -1579,7 +1595,7 @@ export default class FirecrawlApp { */ async asyncDeepResearch(query: string, params: DeepResearchParams): Promise { const headers = this.prepareHeaders(); - let jsonData: any = { query, ...params }; + let jsonData: any = { query, ...params, origin: `js-sdk@${this.version}` }; if (jsonData?.jsonOptions?.schema) { let schema = jsonData.jsonOptions.schema; @@ -1587,7 +1603,7 @@ export default class FirecrawlApp { try { schema = zodToJsonSchema(schema); } catch (error) { - + // Ignore error if schema can't be parsed as Zod } jsonData = { ...jsonData, @@ -1733,9 +1749,10 @@ export default class FirecrawlApp { async __asyncDeepResearch(topic: string, params: DeepResearchParams): Promise { const headers = this.prepareHeaders(); try { + let jsonData: any = { topic, ...params, origin: `js-sdk@${this.version}` }; const response: AxiosResponse = await this.postRequest( `${this.apiUrl}/v1/deep-research`, - { topic, ...params }, + jsonData, headers ); @@ -1845,10 +1862,11 @@ export default class FirecrawlApp { */ async 
asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise { const headers = this.prepareHeaders(); + let jsonData: any = { url, ...params, origin: `js-sdk@${this.version}` }; try { const response: AxiosResponse = await this.postRequest( `${this.apiUrl}/v1/llmstxt`, - { url, ...params }, + jsonData, headers ); diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py index fb960187..705d2e0c 100644 --- a/apps/python-sdk/example.py +++ b/apps/python-sdk/example.py @@ -1,53 +1,45 @@ -import time -import nest_asyncio -import uuid -from firecrawl.firecrawl import FirecrawlApp +from firecrawl.firecrawl import ExtractConfig, FirecrawlApp from pydantic import BaseModel, Field from typing import List +import time app = FirecrawlApp(api_url="https://api.firecrawl.dev") -app = FirecrawlApp(api_key="fc-") - -# Scrape a website: -scrape_result = app.scrape_url('firecrawl.dev') -print(scrape_result['markdown']) +# Scrape a website: +scrape_result = app.scrape_url('example.com', formats=["markdown", "html"]) +print(scrape_result.markdown) -# Test batch scrape +# Test batch scrape urls = ['https://example.com', 'https://docs.firecrawl.dev'] -batch_scrape_params = { - 'formats': ['markdown', 'html'], -} - # Synchronous batch scrape -batch_result = app.batch_scrape_urls(urls, batch_scrape_params) +batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"]) print("Synchronous Batch Scrape Result:") -print(batch_result['data'][0]['markdown']) +print(batch_result.data[0].markdown) -# Asynchronous batch scrape -async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params) +# Asynchronous batch scrape +async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"]) print("\nAsynchronous Batch Scrape Result:") print(async_batch_result) # Crawl a website: -idempotency_key = str(uuid.uuid4()) # optional idempotency key -crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key) -print(crawl_result) +crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*']) +print(crawl_result.data[0].markdown) -# Asynchronous Crawl a website: -async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "") +# Asynchronous Crawl a website: +async_result = app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*']) print(async_result) -crawl_status = app.check_crawl_status(async_result['id']) +crawl_status = app.check_crawl_status(async_result.id) print(crawl_status) attempts = 15 -while attempts > 0 and crawl_status['status'] != 'completed': +while attempts > 0 and crawl_status.status != 'completed': print(crawl_status) - crawl_status = app.check_crawl_status(async_result['id']) + crawl_status = app.check_crawl_status(async_result.id) attempts -= 1 time.sleep(1) -crawl_status = app.get_crawl_status(async_result['id']) +crawl_status = app.check_crawl_status(async_result.id) print(crawl_status) # LLM Extraction: @@ -61,14 +53,11 @@ class ArticleSchema(BaseModel): class TopArticlesSchema(BaseModel): top: List[ArticleSchema] = Field(..., description="Top 5 stories") -llm_extraction_result = app.scrape_url('https://news.ycombinator.com', { - 'formats': ['extract'], - 'extract': { - 'schema': TopArticlesSchema.model_json_schema() - } -}) +extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema()) -print(llm_extraction_result['extract']) +llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config) +
+print(llm_extraction_result.extract) # # Define schema to extract contents into using json schema json_schema = { @@ -94,24 +83,16 @@ json_schema = { "required": ["top"] } -app2 = FirecrawlApp(api_key="fc-", version="v0") +extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True}) +llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config) - -llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', { - 'extractorOptions': { - 'extractionSchema': json_schema, - 'mode': 'llm-extraction' - }, - 'pageOptions':{ - 'onlyMainContent': True - } -}) +print(llm_extraction_result.extract) # print(llm_extraction_result['llm_extraction']) # Map a website: -map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' }) +map_result = app.map_url('https://firecrawl.dev', search="blog") print(map_result) # Extract URLs: @@ -124,14 +105,12 @@ class ExtractSchema(BaseModel): extract_schema = ExtractSchema.schema() # Perform the extraction -extract_result = app.extract(['https://firecrawl.dev'], { - 'prompt': "Extract the title, description, and links from the website", - 'schema': extract_schema -}) +extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema) print(extract_result) # Crawl a website with WebSockets: # inside an async function... +import nest_asyncio nest_asyncio.apply() # Define event handlers @@ -155,4 +134,4 @@ async def start_crawl_and_watch(): watcher.add_event_listener("done", on_done) # Start the watcher - await watcher.connect() + await watcher.connect() \ No newline at end of file diff --git a/apps/python-sdk/example_async.py b/apps/python-sdk/example_async.py new file mode 100644 index 00000000..c554d695 --- /dev/null +++ b/apps/python-sdk/example_async.py @@ -0,0 +1,120 @@ +import time +import nest_asyncio +import uuid +import asyncio +from firecrawl.firecrawl import AsyncFirecrawlApp, ExtractConfig +from pydantic import BaseModel, Field +from typing import List + +app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev") + +async def example_scrape(): + # Scrape a website: + scrape_result = await app.scrape_url('example.com', formats=["markdown", "html"]) + print(scrape_result.markdown) + +async def example_batch_scrape(): + # Batch scrape + urls = ['https://example.com', 'https://docs.firecrawl.dev'] + + # Synchronous batch scrape + batch_result = await app.batch_scrape_urls(urls, formats=["markdown", "html"]) + print("Synchronous Batch Scrape Result:") + print(batch_result.data[0].markdown) + + # Asynchronous batch scrape + async_batch_result = await app.async_batch_scrape_urls(urls, formats=["markdown", "html"]) + print("\nAsynchronous Batch Scrape Result:") + print(async_batch_result) + +async def example_crawl(): + # Crawl a website: + crawl_result = await app.crawl_url('firecrawl.dev', exclude_paths=['blog/*']) + print(crawl_result.data[0].markdown) + + # Asynchronous Crawl a website: + async_result = await app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*']) + print(async_result) + + crawl_status = await app.check_crawl_status(async_result.id) + print(crawl_status) + + attempts = 15 + while attempts > 0 and crawl_status.status != 'completed': + print(crawl_status) + crawl_status = await app.check_crawl_status(async_result.id) + attempts -= 1 + await asyncio.sleep(1) # Use async sleep instead of time.sleep + + crawl_status = await
app.check_crawl_status(async_result.id) + print(crawl_status) + +async def example_llm_extraction(): + # Define schema to extract contents into using pydantic + class ArticleSchema(BaseModel): + title: str + points: int + by: str + commentsURL: str + + class TopArticlesSchema(BaseModel): + top: List[ArticleSchema] = Field(..., description="Top 5 stories") + + extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema()) + + llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config) + + print(llm_extraction_result.extract) + +async def example_map_and_extract(): + # Map a website: + map_result = await app.map_url('https://firecrawl.dev', search="blog") + print(map_result) + + # Extract URLs: + class ExtractSchema(BaseModel): + title: str + description: str + links: List[str] + + # Define the schema using Pydantic + extract_schema = ExtractSchema.schema() + + # Perform the extraction + extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema) + print(extract_result) + +# Define event handlers for websocket +def on_document(detail): + print("DOC", detail) + +def on_error(detail): + print("ERR", detail['error']) + +def on_done(detail): + print("DONE", detail['status']) + +async def example_websocket_crawl(): + # Initiate the crawl job and get the watcher + watcher = await app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 }) + + # Add event listeners + watcher.add_event_listener("document", on_document) + watcher.add_event_listener("error", on_error) + watcher.add_event_listener("done", on_done) + + # Start the watcher + await watcher.connect() + +async def main(): + nest_asyncio.apply() + + await example_scrape() + await example_batch_scrape() + await example_crawl() + await example_llm_extraction() + await example_map_and_extract() + await example_websocket_crawl() + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index c30ba0fb..10431768 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.17.0" +__version__ = "2.0.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index b033a9d0..fb35bb78 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -12,15 +12,293 @@ Classes: import logging import os import time -from typing import Any, Dict, Optional, List, Union, Callable +from typing import Any, Dict, Optional, List, Union, Callable, Literal, TypeVar, Generic import json - +from datetime import datetime +import re +import warnings import requests import pydantic import websockets +import aiohttp +import asyncio +from pydantic import Field + +# Suppress Pydantic warnings about attribute shadowing +warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"") +warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"") +warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractConfig\" shadows an attribute in parent 
\"BaseModel\"") +warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"") + + +def get_version(): + try: + from pathlib import Path + package_path = os.path.dirname(__file__) + version_file = Path(os.path.join(package_path, '__init__.py')).read_text() + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) + if version_match: + return version_match.group(1).strip() + except Exception: + print("Failed to get version from __init__.py") + return None + +version = get_version() logger : logging.Logger = logging.getLogger("firecrawl") +T = TypeVar('T') + +# class FirecrawlDocumentMetadata(pydantic.BaseModel): +# """Metadata for a Firecrawl document.""" +# title: Optional[str] = None +# description: Optional[str] = None +# language: Optional[str] = None +# keywords: Optional[str] = None +# robots: Optional[str] = None +# ogTitle: Optional[str] = None +# ogDescription: Optional[str] = None +# ogUrl: Optional[str] = None +# ogImage: Optional[str] = None +# ogAudio: Optional[str] = None +# ogDeterminer: Optional[str] = None +# ogLocale: Optional[str] = None +# ogLocaleAlternate: Optional[List[str]] = None +# ogSiteName: Optional[str] = None +# ogVideo: Optional[str] = None +# dctermsCreated: Optional[str] = None +# dcDateCreated: Optional[str] = None +# dcDate: Optional[str] = None +# dctermsType: Optional[str] = None +# dcType: Optional[str] = None +# dctermsAudience: Optional[str] = None +# dctermsSubject: Optional[str] = None +# dcSubject: Optional[str] = None +# dcDescription: Optional[str] = None +# dctermsKeywords: Optional[str] = None +# modifiedTime: Optional[str] = None +# publishedTime: Optional[str] = None +# articleTag: Optional[str] = None +# articleSection: Optional[str] = None +# sourceURL: Optional[str] = None +# statusCode: Optional[int] = None +# error: Optional[str] = None + +class AgentOptions(pydantic.BaseModel): + """Configuration for the agent.""" + model: Literal["FIRE-1"] = "FIRE-1" + prompt: Optional[str] = None + +class AgentOptionsExtract(pydantic.BaseModel): + """Configuration for the agent in extract operations.""" + model: Literal["FIRE-1"] = "FIRE-1" + +class ActionsResult(pydantic.BaseModel): + """Result of actions performed during scraping.""" + screenshots: List[str] + +class FirecrawlDocument(pydantic.BaseModel, Generic[T]): + """Document retrieved or processed by Firecrawl.""" + url: Optional[str] = None + markdown: Optional[str] = None + html: Optional[str] = None + rawHtml: Optional[str] = None + links: Optional[List[str]] = None + extract: Optional[T] = None + json: Optional[T] = None + screenshot: Optional[str] = None + metadata: Optional[Any] = None + actions: Optional[ActionsResult] = None + title: Optional[str] = None # v1 search only + description: Optional[str] = None # v1 search only + +class LocationConfig(pydantic.BaseModel): + """Location configuration for scraping.""" + country: Optional[str] = None + languages: Optional[List[str]] = None + +class WebhookConfig(pydantic.BaseModel): + """Configuration for webhooks.""" + url: str + headers: Optional[Dict[str, str]] = None + metadata: Optional[Dict[str, str]] = None + events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None + +class CommonOptions(pydantic.BaseModel): + """Parameters for scraping operations.""" + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None + headers: 
Optional[Dict[str, str]] = None + includeTags: Optional[List[str]] = None + excludeTags: Optional[List[str]] = None + onlyMainContent: Optional[bool] = None + waitFor: Optional[int] = None + timeout: Optional[int] = None + location: Optional[LocationConfig] = None + mobile: Optional[bool] = None + skipTlsVerification: Optional[bool] = None + removeBase64Images: Optional[bool] = None + blockAds: Optional[bool] = None + proxy: Optional[Literal["basic", "stealth"]] = None + +class WaitAction(pydantic.BaseModel): + """Wait action to perform during scraping.""" + type: Literal["wait"] + milliseconds: int + selector: Optional[str] = None + +class ScreenshotAction(pydantic.BaseModel): + """Screenshot action to perform during scraping.""" + type: Literal["screenshot"] + fullPage: Optional[bool] = None + +class ClickAction(pydantic.BaseModel): + """Click action to perform during scraping.""" + type: Literal["click"] + selector: str + +class WriteAction(pydantic.BaseModel): + """Write action to perform during scraping.""" + type: Literal["write"] + text: str + +class PressAction(pydantic.BaseModel): + """Press action to perform during scraping.""" + type: Literal["press"] + key: str + +class ScrollAction(pydantic.BaseModel): + """Scroll action to perform during scraping.""" + type: Literal["scroll"] + direction: Literal["up", "down"] + selector: Optional[str] = None + +class ScrapeAction(pydantic.BaseModel): + """Scrape action to perform during scraping.""" + type: Literal["scrape"] + +class ExecuteJavascriptAction(pydantic.BaseModel): + """Execute javascript action to perform during scraping.""" + type: Literal["executeJavascript"] + script: str + + +class ExtractAgent(pydantic.BaseModel): + """Configuration for the agent in extract operations.""" + model: Literal["FIRE-1"] = "FIRE-1" + +class ExtractConfig(pydantic.BaseModel): + """Configuration for extraction.""" + prompt: Optional[str] = None + schema: Optional[Any] = None + systemPrompt: Optional[str] = None + agent: Optional[ExtractAgent] = None + +class ScrapeParams(CommonOptions): + """Parameters for scraping operations.""" + extract: Optional[ExtractConfig] = None + jsonOptions: Optional[ExtractConfig] = None + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None + agent: Optional[AgentOptions] = None + +class ScrapeResponse(FirecrawlDocument[T], Generic[T]): + """Response from scraping operations.""" + success: bool = True + warning: Optional[str] = None + error: Optional[str] = None + +class BatchScrapeResponse(pydantic.BaseModel): + """Response from batch scrape operations.""" + id: Optional[str] = None + url: Optional[str] = None + success: bool = True + error: Optional[str] = None + invalidURLs: Optional[List[str]] = None + +class BatchScrapeStatusResponse(pydantic.BaseModel): + """Response from batch scrape status checks.""" + success: bool = True + status: Literal["scraping", "completed", "failed", "cancelled"] + completed: int + total: int + creditsUsed: int + expiresAt: datetime + next: Optional[str] = None + data: List[FirecrawlDocument] + +class CrawlParams(pydantic.BaseModel): + """Parameters for crawling operations.""" + includePaths: Optional[List[str]] = None + excludePaths: Optional[List[str]] = None + maxDepth: Optional[int] = None + maxDiscoveryDepth: Optional[int] = None + limit: Optional[int] = None + allowBackwardLinks: Optional[bool] = None + allowExternalLinks: Optional[bool] = None + ignoreSitemap: 
Optional[bool] = None + scrapeOptions: Optional[CommonOptions] = None + webhook: Optional[Union[str, WebhookConfig]] = None + deduplicateSimilarURLs: Optional[bool] = None + ignoreQueryParameters: Optional[bool] = None + regexOnFullURL: Optional[bool] = None + +class CrawlResponse(pydantic.BaseModel): + """Response from crawling operations.""" + id: Optional[str] = None + url: Optional[str] = None + success: bool = True + error: Optional[str] = None + +class CrawlStatusResponse(pydantic.BaseModel): + """Response from crawl status checks.""" + success: bool = True + status: Literal["scraping", "completed", "failed", "cancelled"] + completed: int + total: int + creditsUsed: int + expiresAt: datetime + next: Optional[str] = None + data: List[FirecrawlDocument] + +class CrawlErrorsResponse(pydantic.BaseModel): + """Response from crawl/batch scrape error monitoring.""" + errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str} + robotsBlocked: List[str] + +class MapParams(pydantic.BaseModel): + """Parameters for mapping operations.""" + search: Optional[str] = None + ignoreSitemap: Optional[bool] = None + includeSubdomains: Optional[bool] = None + sitemapOnly: Optional[bool] = None + limit: Optional[int] = None + timeout: Optional[int] = None + +class MapResponse(pydantic.BaseModel): + """Response from mapping operations.""" + success: bool = True + links: Optional[List[str]] = None + error: Optional[str] = None + +class ExtractParams(pydantic.BaseModel): + """Parameters for extracting information from URLs.""" + prompt: Optional[str] = None + schema: Optional[Any] = None + systemPrompt: Optional[str] = None + allowExternalLinks: Optional[bool] = None + enableWebSearch: Optional[bool] = None + includeSubdomains: Optional[bool] = None + origin: Optional[str] = None + showSources: Optional[bool] = None + scrapeOptions: Optional[CommonOptions] = None + +class ExtractResponse(pydantic.BaseModel, Generic[T]): + """Response from extract operations.""" + success: bool = True + data: Optional[T] = None + error: Optional[str] = None + warning: Optional[str] = None + sources: Optional[List[str]] = None + class SearchParams(pydantic.BaseModel): query: str limit: Optional[int] = 5 @@ -31,7 +309,14 @@ class SearchParams(pydantic.BaseModel): location: Optional[str] = None origin: Optional[str] = "api" timeout: Optional[int] = 60000 - scrapeOptions: Optional[Dict[str, Any]] = None + scrapeOptions: Optional[CommonOptions] = None + +class SearchResponse(pydantic.BaseModel): + """Response from search operations.""" + success: bool = True + data: List[FirecrawlDocument] + warning: Optional[str] = None + error: Optional[str] = None class GenerateLLMsTextParams(pydantic.BaseModel): """ @@ -75,6 +360,24 @@ class DeepResearchStatusResponse(pydantic.BaseModel): sources: List[Dict[str, Any]] summaries: List[str] +class GenerateLLMsTextResponse(pydantic.BaseModel): + """Response from LLMs.txt generation operations.""" + success: bool = True + id: str + error: Optional[str] = None + +class GenerateLLMsTextStatusResponseData(pydantic.BaseModel): + llmstxt: str + llmsfulltxt: Optional[str] = None + +class GenerateLLMsTextStatusResponse(pydantic.BaseModel): + """Status response from LLMs.txt generation operations.""" + success: bool = True + data: Optional[GenerateLLMsTextStatusResponseData] = None + status: Literal["processing", "completed", "failed"] + error: Optional[str] = None + expiresAt: str + class ChangeTrackingData(pydantic.BaseModel): """ Data for the change tracking format. 
@@ -84,42 +387,39 @@ class ChangeTrackingData(pydantic.BaseModel): visibility: str # "visible" | "hidden" diff: Optional[Dict[str, Any]] = None json: Optional[Any] = None + +class SearchResponse(pydantic.BaseModel): + """ + Response from the search operation. + """ + success: bool + data: List[Dict[str, Any]] + warning: Optional[str] = None + error: Optional[str] = None + +class ExtractParams(pydantic.BaseModel): + """ + Parameters for the extract operation. + """ + prompt: Optional[str] = None + schema: Optional[Any] = pydantic.Field(None, alias='schema') + system_prompt: Optional[str] = None + allow_external_links: Optional[bool] = False + enable_web_search: Optional[bool] = False + # Just for backwards compatibility + enableWebSearch: Optional[bool] = False + show_sources: Optional[bool] = False + agent: Optional[Dict[str, Any]] = None + +class ExtractResponse(pydantic.BaseModel, Generic[T]): + """ + Response from the extract operation. + """ + success: bool + data: Optional[T] = None + error: Optional[str] = None class FirecrawlApp: - class SearchResponse(pydantic.BaseModel): - """ - Response from the search operation. - """ - success: bool - data: List[Dict[str, Any]] - warning: Optional[str] = None - error: Optional[str] = None - - class ExtractParams(pydantic.BaseModel): - """ - Parameters for the extract operation. - """ - prompt: Optional[str] = None - schema_: Optional[Any] = pydantic.Field(None, alias='schema') - system_prompt: Optional[str] = None - allow_external_links: Optional[bool] = False - enable_web_search: Optional[bool] = False - # Just for backwards compatibility - enableWebSearch: Optional[bool] = False - show_sources: Optional[bool] = False - agent: Optional[Dict[str, Any]] = None - - - - - class ExtractResponse(pydantic.BaseModel): - """ - Response from the extract operation. - """ - success: bool - data: Optional[Any] = None - error: Optional[str] = None - def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None: """ Initialize the FirecrawlApp instance with API key, API URL. @@ -138,200 +438,451 @@ class FirecrawlApp: logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}") - def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + def scrape_url( + self, + url: str, + *, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + **kwargs) -> ScrapeResponse[Any]: """ - Scrape the specified URL using the Firecrawl API. + Scrape and extract content from a URL. Args: - url (str): The URL to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scrape request. 
+ url (str): Target URL to scrape + formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc) + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait for a specific element to appear + timeout (Optional[int]): Request timeout (ms) + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 images + block_ads (Optional[bool]): Block ads + proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth) + extract (Optional[ExtractConfig]): Content extraction settings + json_options (Optional[ExtractConfig]): JSON extraction settings + actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform + Returns: - Any: The scraped data if the request is successful. + ScrapeResponse with: + * Requested content formats + * Page metadata + * Extraction results + * Success/error status Raises: - Exception: If the scrape request fails. + Exception: If scraping fails """ - headers = self._prepare_headers() - # Prepare the base scrape parameters with the URL - scrape_params = {'url': url} + # Build scrape parameters + scrape_params = { + 'url': url, + 'origin': f"python-sdk@{version}" + } - # If there are additional params, process them - if params: - # Handle extract (for v1) - extract = params.get('extract', {}) - if extract: - if 'schema' in extract and hasattr(extract['schema'], 'schema'): - extract['schema'] = extract['schema'].schema() - scrape_params['extract'] = extract + # Add optional parameters if provided + if formats: + scrape_params['formats'] = formats + if include_tags: + scrape_params['includeTags'] = include_tags + if exclude_tags: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for: + scrape_params['waitFor'] = wait_for + if timeout: + scrape_params['timeout'] = timeout + if location: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy: + scrape_params['proxy'] = proxy + if extract: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if actions: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + scrape_params.update(kwargs) - # Include any other params directly at the top level of scrape_params - for key, value in params.items(): - if key not in ['extract']: - scrape_params[key] = value - - json = params.get("jsonOptions", {}) - if json: - if 'schema' in json and 
hasattr(json['schema'], 'schema'): - json['schema'] = json['schema'].schema() - scrape_params['jsonOptions'] = json - - change_tracking = params.get("changeTrackingOptions", {}) - if change_tracking: - scrape_params['changeTrackingOptions'] = change_tracking - - # Include any other params directly at the top level of scrape_params - for key, value in params.items(): - if key not in ['jsonOptions', 'changeTrackingOptions', 'agent']: - scrape_params[key] = value - - agent = params.get('agent') - if agent: - scrape_params['agent'] = agent - - - endpoint = f'/v1/scrape' - # Make the POST request with the prepared headers and JSON data + # Make request response = requests.post( - f'{self.api_url}{endpoint}', + f'{self.api_url}/v1/scrape', headers=headers, json=scrape_params, - timeout=(scrape_params["timeout"] + 5000 if "timeout" in scrape_params else None), + timeout=(timeout + 5000 if timeout else None) ) + if response.status_code == 200: try: - response = response.json() - except: - raise Exception(f'Failed to parse Firecrawl response as JSON.') - if response['success'] and 'data' in response: - return response['data'] - elif "error" in response: - raise Exception(f'Failed to scrape URL. Error: {response["error"]}') - else: - raise Exception(f'Failed to scrape URL. Error: {response}') + response_json = response.json() + if response_json.get('success') and 'data' in response_json: + return ScrapeResponse(**response_json['data']) + elif "error" in response_json: + raise Exception(f'Failed to scrape URL. Error: {response_json["error"]}') + else: + raise Exception(f'Failed to scrape URL. Error: {response_json}') + except ValueError: + raise Exception('Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, 'scrape URL') - def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]: + def search( + self, + query: str, + *, + limit: Optional[int] = None, + tbs: Optional[str] = None, + filter: Optional[str] = None, + lang: Optional[str] = None, + country: Optional[str] = None, + location: Optional[str] = None, + timeout: Optional[int] = None, + scrape_options: Optional[CommonOptions] = None, + params: Optional[Union[Dict[str, Any], SearchParams]] = None, + **kwargs) -> SearchResponse: """ - Search for content using the Firecrawl API. + Search for content using Firecrawl. Args: - query (str): The search query string. - params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters. + query (str): Search query string + limit (Optional[int]): Max results (default: 5) + tbs (Optional[str]): Time filter (e.g. "qdr:d") + filter (Optional[str]): Custom result filter + lang (Optional[str]): Language code (default: "en") + country (Optional[str]): Country code (default: "us") + location (Optional[str]): Geo-targeting + timeout (Optional[int]): Request timeout in milliseconds + scrape_options (Optional[CommonOptions]): Result scraping configuration + params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters + **kwargs: Additional keyword arguments for future compatibility Returns: - Dict[str, Any]: The search response containing success status and search results. 
+ SearchResponse: Response containing: + * success (bool): Whether request succeeded + * data (List[FirecrawlDocument]): Search results + * warning (Optional[str]): Warning message if any + * error (Optional[str]): Error message if any + + Raises: + Exception: If search fails or response cannot be parsed """ - if params is None: - params = {} + # Build search parameters + search_params = {} + if params: + if isinstance(params, dict): + search_params.update(params) + else: + search_params.update(params.dict(exclude_none=True)) - if isinstance(params, dict): - search_params = SearchParams(query=query, **params) - else: - search_params = params - search_params.query = query + # Add individual parameters + if limit is not None: + search_params['limit'] = limit + if tbs is not None: + search_params['tbs'] = tbs + if filter is not None: + search_params['filter'] = filter + if lang is not None: + search_params['lang'] = lang + if country is not None: + search_params['country'] = country + if location is not None: + search_params['location'] = location + if timeout is not None: + search_params['timeout'] = timeout + if scrape_options is not None: + search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True) + + # Add any additional kwargs + search_params.update(kwargs) + # Create final params object + final_params = SearchParams(query=query, **search_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['origin'] = f"python-sdk@{version}" + + # Make request response = requests.post( f"{self.api_url}/v1/search", headers={"Authorization": f"Bearer {self.api_key}"}, - json=search_params.dict(exclude_none=True) + json=params_dict ) - if response.status_code != 200: - raise Exception(f"Request failed with status code {response.status_code}") + if response.status_code == 200: + try: + response_json = response.json() + if response_json.get('success') and 'data' in response_json: + return SearchResponse(**response_json) + elif "error" in response_json: + raise Exception(f'Search failed. Error: {response_json["error"]}') + else: + raise Exception(f'Search failed. Error: {response_json}') + except ValueError: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'search') - try: - return response.json() - except: - raise Exception(f'Failed to parse Firecrawl response as JSON.') - - def crawl_url(self, url: str, - params: Optional[Dict[str, Any]] = None, - poll_interval: Optional[int] = 2, - idempotency_key: Optional[str] = None) -> Any: + def crawl_url( + self, + url: str, + *, + include_paths: Optional[List[str]] = None, + exclude_paths: Optional[List[str]] = None, + max_depth: Optional[int] = None, + max_discovery_depth: Optional[int] = None, + limit: Optional[int] = None, + allow_backward_links: Optional[bool] = None, + allow_external_links: Optional[bool] = None, + ignore_sitemap: Optional[bool] = None, + scrape_options: Optional[CommonOptions] = None, + webhook: Optional[Union[str, WebhookConfig]] = None, + deduplicate_similar_urls: Optional[bool] = None, + ignore_query_parameters: Optional[bool] = None, + regex_on_full_url: Optional[bool] = None, + poll_interval: Optional[int] = 2, + idempotency_key: Optional[str] = None, + **kwargs + ) -> CrawlStatusResponse: """ - Initiate a crawl job for the specified URL using the Firecrawl API. + Crawl a website starting from a URL. Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. 
- poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + url (str): Target URL to start crawling from + include_paths (Optional[List[str]]): Patterns of URLs to include + exclude_paths (Optional[List[str]]): Patterns of URLs to exclude + max_depth (Optional[int]): Maximum crawl depth + max_discovery_depth (Optional[int]): Maximum depth for finding new URLs + limit (Optional[int]): Maximum pages to crawl + allow_backward_links (Optional[bool]): Follow parent directory links + allow_external_links (Optional[bool]): Follow external domain links + ignore_sitemap (Optional[bool]): Skip sitemap.xml processing + scrape_options (Optional[CommonOptions]): Page scraping configuration + webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings + deduplicate_similar_urls (Optional[bool]): Remove similar URLs + ignore_query_parameters (Optional[bool]): Ignore URL parameters + regex_on_full_url (Optional[bool]): Apply regex to full URLs + poll_interval (Optional[int]): Seconds between status checks (default: 2) + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - Dict[str, Any]: A dictionary containing the crawl results. The structure includes: - - 'success' (bool): Indicates if the crawl was successful. - - 'status' (str): The final status of the crawl job (e.g., 'completed'). - - 'completed' (int): Number of scraped pages that completed. - - 'total' (int): Total number of scraped pages. - - 'creditsUsed' (int): Estimated number of API credits used for this crawl. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the crawl data expires. - - 'data' (List[Dict]): List of all the scraped pages. + CrawlStatusResponse with: + * Crawling status and progress + * Crawled page contents + * Success/error information Raises: - Exception: If the crawl job initiation or monitoring fails. 
+ Exception: If crawl fails """ - endpoint = f'/v1/crawl' + crawl_params = {} + + # Add individual parameters + if include_paths is not None: + crawl_params['includePaths'] = include_paths + if exclude_paths is not None: + crawl_params['excludePaths'] = exclude_paths + if max_depth is not None: + crawl_params['maxDepth'] = max_depth + if max_discovery_depth is not None: + crawl_params['maxDiscoveryDepth'] = max_discovery_depth + if limit is not None: + crawl_params['limit'] = limit + if allow_backward_links is not None: + crawl_params['allowBackwardLinks'] = allow_backward_links + if allow_external_links is not None: + crawl_params['allowExternalLinks'] = allow_external_links + if ignore_sitemap is not None: + crawl_params['ignoreSitemap'] = ignore_sitemap + if scrape_options is not None: + crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True) + if webhook is not None: + crawl_params['webhook'] = webhook + if deduplicate_similar_urls is not None: + crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls + if ignore_query_parameters is not None: + crawl_params['ignoreQueryParameters'] = ignore_query_parameters + if regex_on_full_url is not None: + crawl_params['regexOnFullURL'] = regex_on_full_url + + # Add any additional kwargs + crawl_params.update(kwargs) + + # Create final params object + final_params = CrawlParams(**crawl_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['url'] = url + params_dict['origin'] = f"python-sdk@{version}" + + # Make request headers = self._prepare_headers(idempotency_key) - json_data = {'url': url} - if params: - json_data.update(params) - response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers) + if response.status_code == 200: try: id = response.json().get('id') except: raise Exception(f'Failed to parse Firecrawl response as JSON.') return self._monitor_job_status(id, headers, poll_interval) - else: self._handle_error(response, 'start crawl job') - - def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_crawl_url( + self, + url: str, + *, + include_paths: Optional[List[str]] = None, + exclude_paths: Optional[List[str]] = None, + max_depth: Optional[int] = None, + max_discovery_depth: Optional[int] = None, + limit: Optional[int] = None, + allow_backward_links: Optional[bool] = None, + allow_external_links: Optional[bool] = None, + ignore_sitemap: Optional[bool] = None, + scrape_options: Optional[CommonOptions] = None, + webhook: Optional[Union[str, WebhookConfig]] = None, + deduplicate_similar_urls: Optional[bool] = None, + ignore_query_parameters: Optional[bool] = None, + regex_on_full_url: Optional[bool] = None, + idempotency_key: Optional[str] = None, + **kwargs + ) -> CrawlResponse: """ - Initiate a crawl job asynchronously. + Start an asynchronous crawl job. Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. 
+ url (str): Target URL to start crawling from + include_paths (Optional[List[str]]): Patterns of URLs to include + exclude_paths (Optional[List[str]]): Patterns of URLs to exclude + max_depth (Optional[int]): Maximum crawl depth + max_discovery_depth (Optional[int]): Maximum depth for finding new URLs + limit (Optional[int]): Maximum pages to crawl + allow_backward_links (Optional[bool]): Follow parent directory links + allow_external_links (Optional[bool]): Follow external domain links + ignore_sitemap (Optional[bool]): Skip sitemap.xml processing + scrape_options (Optional[CommonOptions]): Page scraping configuration + webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings + deduplicate_similar_urls (Optional[bool]): Remove similar URLs + ignore_query_parameters (Optional[bool]): Ignore URL parameters + regex_on_full_url (Optional[bool]): Apply regex to full URLs + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes: - - 'success' (bool): Indicates if the crawl initiation was successful. - - 'id' (str): The unique identifier for the crawl job. - - 'url' (str): The URL to check the status of the crawl job. + CrawlResponse with: + * success - Whether crawl started successfully + * id - Unique identifier for the crawl job + * url - Status check URL for the crawl + * error - Error message if start failed + + Raises: + Exception: If crawl initiation fails """ - endpoint = f'/v1/crawl' + crawl_params = {} + + # Add individual parameters + if include_paths is not None: + crawl_params['includePaths'] = include_paths + if exclude_paths is not None: + crawl_params['excludePaths'] = exclude_paths + if max_depth is not None: + crawl_params['maxDepth'] = max_depth + if max_discovery_depth is not None: + crawl_params['maxDiscoveryDepth'] = max_discovery_depth + if limit is not None: + crawl_params['limit'] = limit + if allow_backward_links is not None: + crawl_params['allowBackwardLinks'] = allow_backward_links + if allow_external_links is not None: + crawl_params['allowExternalLinks'] = allow_external_links + if ignore_sitemap is not None: + crawl_params['ignoreSitemap'] = ignore_sitemap + if scrape_options is not None: + crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True) + if webhook is not None: + crawl_params['webhook'] = webhook + if deduplicate_similar_urls is not None: + crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls + if ignore_query_parameters is not None: + crawl_params['ignoreQueryParameters'] = ignore_query_parameters + if regex_on_full_url is not None: + crawl_params['regexOnFullURL'] = regex_on_full_url + + # Add any additional kwargs + crawl_params.update(kwargs) + + # Create final params object + final_params = CrawlParams(**crawl_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['url'] = url + params_dict['origin'] = f"python-sdk@{version}" + + # Make request headers = self._prepare_headers(idempotency_key) - json_data = {'url': url} - if params: - json_data.update(params) - response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers) + if response.status_code == 200: try: - return response.json() + return CrawlResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: 
self._handle_error(response, 'start crawl job') - def check_crawl_status(self, id: str) -> Any: + def check_crawl_status(self, id: str) -> CrawlStatusResponse: """ - Check the status of a crawl job using the Firecrawl API. + Check the status and results of a crawl job. Args: - id (str): The ID of the crawl job. + id: Unique identifier for the crawl job Returns: - Any: The status of the crawl job. + CrawlStatusResponse containing: + + Status Information: + * status - Current state (scraping/completed/failed/cancelled) + * completed - Number of pages crawled + * total - Total pages to crawl + * creditsUsed - API credits consumed + * expiresAt - Data expiration timestamp + + Results: + * data - List of crawled documents + * next - URL for next page of results (if paginated) + * success - Whether status check succeeded + * error - Error message if failed Raises: - Exception: If the status check request fails. + Exception: If status check fails """ endpoint = f'/v1/crawl/{id}' @@ -383,28 +934,37 @@ class FirecrawlApp: if 'next' in status_data: response['next'] = status_data['next'] - return { - 'success': False if 'error' in status_data else True, + return CrawlStatusResponse( + success=False if 'error' in status_data else True, **response - } + ) else: self._handle_error(response, 'check crawl status') - def check_crawl_errors(self, id: str) -> Dict[str, Any]: + def check_crawl_errors(self, id: str) -> CrawlErrorsResponse: """ Returns information about crawl errors. Args: - id (str): The ID of the crawl job. + id (str): The ID of the crawl job Returns: - Dict[str, Any]: Information about crawl errors. + CrawlErrorsResponse containing: + * errors (List[Dict[str, str]]): List of errors with fields: + - id (str): Error ID + - timestamp (str): When the error occurred + - url (str): URL that caused the error + - error (str): Error message + * robotsBlocked (List[str]): List of URLs blocked by robots.txt + + Raises: + Exception: If error check fails """ headers = self._prepare_headers() response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers) if response.status_code == 200: try: - return response.json() + return CrawlErrorsResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: @@ -412,13 +972,18 @@ class FirecrawlApp: def cancel_crawl(self, id: str) -> Dict[str, Any]: """ - Cancel an asynchronous crawl job using the Firecrawl API. + Cancel an asynchronous crawl job. Args: - id (str): The ID of the crawl job to cancel. + id (str): The ID of the crawl job to cancel Returns: - Dict[str, Any]: The response from the cancel crawl request. 
+ Dict[str, Any] containing: + * success (bool): Whether cancellation was successful + * error (str, optional): Error message if cancellation failed + + Raises: + Exception: If cancellation fails """ headers = self._prepare_headers() response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers) @@ -430,154 +995,524 @@ class FirecrawlApp: else: self._handle_error(response, "cancel crawl job") - def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + def crawl_url_and_watch( + self, + url: str, + *, + include_paths: Optional[List[str]] = None, + exclude_paths: Optional[List[str]] = None, + max_depth: Optional[int] = None, + max_discovery_depth: Optional[int] = None, + limit: Optional[int] = None, + allow_backward_links: Optional[bool] = None, + allow_external_links: Optional[bool] = None, + ignore_sitemap: Optional[bool] = None, + scrape_options: Optional[CommonOptions] = None, + webhook: Optional[Union[str, WebhookConfig]] = None, + deduplicate_similar_urls: Optional[bool] = None, + ignore_query_parameters: Optional[bool] = None, + regex_on_full_url: Optional[bool] = None, + idempotency_key: Optional[str] = None, + **kwargs + ) -> 'CrawlWatcher': """ Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket. Args: - url (str): The URL to crawl. - params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + url (str): Target URL to start crawling from + include_paths (Optional[List[str]]): Patterns of URLs to include + exclude_paths (Optional[List[str]]): Patterns of URLs to exclude + max_depth (Optional[int]): Maximum crawl depth + max_discovery_depth (Optional[int]): Maximum depth for finding new URLs + limit (Optional[int]): Maximum pages to crawl + allow_backward_links (Optional[bool]): Follow parent directory links + allow_external_links (Optional[bool]): Follow external domain links + ignore_sitemap (Optional[bool]): Skip sitemap.xml processing + scrape_options (Optional[CommonOptions]): Page scraping configuration + webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings + deduplicate_similar_urls (Optional[bool]): Remove similar URLs + ignore_query_parameters (Optional[bool]): Ignore URL parameters + regex_on_full_url (Optional[bool]): Apply regex to full URLs + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - CrawlWatcher: An instance of CrawlWatcher to monitor the crawl job. 
+ CrawlWatcher: An instance to monitor the crawl job via WebSocket + + Raises: + Exception: If crawl job fails to start """ - crawl_response = self.async_crawl_url(url, params, idempotency_key) - if crawl_response['success'] and 'id' in crawl_response: - return CrawlWatcher(crawl_response['id'], self) + crawl_response = self.async_crawl_url( + url, + include_paths=include_paths, + exclude_paths=exclude_paths, + max_depth=max_depth, + max_discovery_depth=max_discovery_depth, + limit=limit, + allow_backward_links=allow_backward_links, + allow_external_links=allow_external_links, + ignore_sitemap=ignore_sitemap, + scrape_options=scrape_options, + webhook=webhook, + deduplicate_similar_urls=deduplicate_similar_urls, + ignore_query_parameters=ignore_query_parameters, + regex_on_full_url=regex_on_full_url, + idempotency_key=idempotency_key, + **kwargs + ) + if crawl_response.success and crawl_response.id: + return CrawlWatcher(crawl_response.id, self) else: raise Exception("Crawl job failed to start") - def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: + def map_url( + self, + url: str, + *, + search: Optional[str] = None, + ignore_sitemap: Optional[bool] = None, + include_subdomains: Optional[bool] = None, + sitemap_only: Optional[bool] = None, + limit: Optional[int] = None, + timeout: Optional[int] = None, + params: Optional[MapParams] = None) -> MapResponse: """ - Perform a map search using the Firecrawl API. + Map and discover links from a URL. Args: - url (str): The URL to perform the map search on. - params (Optional[Dict[str, Any]]): Additional parameters for the map search. + url (str): Target URL to map + search (Optional[str]): Filter pattern for URLs + ignore_sitemap (Optional[bool]): Skip sitemap.xml processing + include_subdomains (Optional[bool]): Include subdomain links + sitemap_only (Optional[bool]): Only use sitemap.xml + limit (Optional[int]): Maximum URLs to return + timeout (Optional[int]): Request timeout in milliseconds + params (Optional[MapParams]): Additional mapping parameters Returns: - List[str]: A list of URLs discovered during the map search. 
+ MapResponse: Response containing: + * success (bool): Whether request succeeded + * links (List[str]): Discovered URLs + * error (Optional[str]): Error message if any + + Raises: + Exception: If mapping fails or response cannot be parsed """ - endpoint = f'/v1/map' - headers = self._prepare_headers() - - # Prepare the base scrape parameters with the URL - json_data = {'url': url} + # Build map parameters + map_params = {} if params: - json_data.update(params) + map_params.update(params.dict(exclude_none=True)) - # Make the POST request with the prepared headers and JSON data + # Add individual parameters + if search is not None: + map_params['search'] = search + if ignore_sitemap is not None: + map_params['ignoreSitemap'] = ignore_sitemap + if include_subdomains is not None: + map_params['includeSubdomains'] = include_subdomains + if sitemap_only is not None: + map_params['sitemapOnly'] = sitemap_only + if limit is not None: + map_params['limit'] = limit + if timeout is not None: + map_params['timeout'] = timeout + + # Create final params object + final_params = MapParams(**map_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['url'] = url + params_dict['origin'] = f"python-sdk@{version}" + + # Make request response = requests.post( - f'{self.api_url}{endpoint}', - headers=headers, - json=json_data, + f"{self.api_url}/v1/map", + headers={"Authorization": f"Bearer {self.api_key}"}, + json=params_dict ) + if response.status_code == 200: try: - response = response.json() - except: - raise Exception(f'Failed to parse Firecrawl response as JSON.') - if response['success'] and 'links' in response: - return response - elif 'error' in response: - raise Exception(f'Failed to map URL. Error: {response["error"]}') - else: - raise Exception(f'Failed to map URL. Error: {response}') + response_json = response.json() + if response_json.get('success') and 'links' in response_json: + return MapResponse(**response_json) + elif "error" in response_json: + raise Exception(f'Map failed. Error: {response_json["error"]}') + else: + raise Exception(f'Map failed. 
Error: {response_json}') + except ValueError: + raise Exception('Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, 'map') - def batch_scrape_urls(self, urls: List[str], - params: Optional[Dict[str, Any]] = None, - poll_interval: Optional[int] = 2, - idempotency_key: Optional[str] = None) -> Any: + def batch_scrape_urls( + self, + urls: List[str], + *, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + headers: Optional[Dict[str, str]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + agent: Optional[AgentOptions] = None, + poll_interval: Optional[int] = 2, + idempotency_key: Optional[str] = None, + **kwargs + ) -> BatchScrapeStatusResponse: """ - Initiate a batch scrape job for the specified URLs using the Firecrawl API. + Batch scrape multiple URLs and monitor until completion. Args: - urls (List[str]): The URLs to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scraper. - poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + urls (List[str]): URLs to scrape + formats (Optional[List[Literal]]): Content formats to retrieve + headers (Optional[Dict[str, str]]): Custom HTTP headers + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait time in milliseconds + timeout (Optional[int]): Request timeout in milliseconds + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 encoded images + block_ads (Optional[bool]): Block advertisements + proxy (Optional[Literal]): Proxy type to use + extract (Optional[ExtractConfig]): Content extraction config + json_options (Optional[ExtractConfig]): JSON extraction config + actions (Optional[List[Union]]): Actions to perform + agent (Optional[AgentOptions]): Agent configuration + poll_interval (Optional[int]): Seconds between status checks (default: 2) + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - Dict[str, Any]: A dictionary containing the scrape results. The structure includes: - - 'success' (bool): Indicates if the batch scrape was successful. - - 'status' (str): The final status of the batch scrape job (e.g., 'completed'). - - 'completed' (int): Number of scraped pages that completed. - - 'total' (int): Total number of scraped pages. 
- - 'creditsUsed' (int): Estimated number of API credits used for this batch scrape. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the batch scrape data expires. - - 'data' (List[Dict]): List of all the scraped pages. + BatchScrapeStatusResponse with: + * Scraping status and progress + * Scraped content for each URL + * Success/error information Raises: - Exception: If the batch scrape job initiation or monitoring fails. + Exception: If batch scrape fails """ - endpoint = f'/v1/batch/scrape' + scrape_params = {} + + # Add individual parameters + if formats is not None: + scrape_params['formats'] = formats + if headers is not None: + scrape_params['headers'] = headers + if include_tags is not None: + scrape_params['includeTags'] = include_tags + if exclude_tags is not None: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for is not None: + scrape_params['waitFor'] = wait_for + if timeout is not None: + scrape_params['timeout'] = timeout + if location is not None: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy is not None: + scrape_params['proxy'] = proxy + if extract is not None: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options is not None: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if actions is not None: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + if agent is not None: + scrape_params['agent'] = agent.dict(exclude_none=True) + + # Add any additional kwargs + scrape_params.update(kwargs) + + # Create final params object + final_params = ScrapeParams(**scrape_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['urls'] = urls + params_dict['origin'] = f"python-sdk@{version}" + + # Make request headers = self._prepare_headers(idempotency_key) - json_data = {'urls': urls} - if params: - json_data.update(params) - response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers) + if response.status_code == 200: try: id = response.json().get('id') except: raise Exception(f'Failed to parse Firecrawl response as JSON.') return self._monitor_job_status(id, headers, poll_interval) - else: self._handle_error(response, 'start batch scrape job') - - def async_batch_scrape_urls(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_batch_scrape_urls( + self, + urls: List[str], + *, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + headers: Optional[Dict[str, str]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: 
Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + agent: Optional[AgentOptions] = None, + idempotency_key: Optional[str] = None, + **kwargs + ) -> BatchScrapeResponse: """ - Initiate a crawl job asynchronously. + Initiate a batch scrape job asynchronously. Args: - urls (List[str]): The URLs to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scraper. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + urls (List[str]): URLs to scrape + formats (Optional[List[Literal]]): Content formats to retrieve + headers (Optional[Dict[str, str]]): Custom HTTP headers + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait time in milliseconds + timeout (Optional[int]): Request timeout in milliseconds + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 encoded images + block_ads (Optional[bool]): Block advertisements + proxy (Optional[Literal]): Proxy type to use + extract (Optional[ExtractConfig]): Content extraction config + json_options (Optional[ExtractConfig]): JSON extraction config + actions (Optional[List[Union]]): Actions to perform + agent (Optional[AgentOptions]): Agent configuration + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes: - - 'success' (bool): Indicates if the batch scrape initiation was successful. - - 'id' (str): The unique identifier for the batch scrape job. - - 'url' (str): The URL to check the status of the batch scrape job. 
+ BatchScrapeResponse with: + * success - Whether job started successfully + * id - Unique identifier for the job + * url - Status check URL + * error - Error message if start failed + + Raises: + Exception: If job initiation fails """ - endpoint = f'/v1/batch/scrape' + scrape_params = {} + + # Add individual parameters + if formats is not None: + scrape_params['formats'] = formats + if headers is not None: + scrape_params['headers'] = headers + if include_tags is not None: + scrape_params['includeTags'] = include_tags + if exclude_tags is not None: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for is not None: + scrape_params['waitFor'] = wait_for + if timeout is not None: + scrape_params['timeout'] = timeout + if location is not None: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy is not None: + scrape_params['proxy'] = proxy + if extract is not None: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options is not None: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if actions is not None: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + if agent is not None: + scrape_params['agent'] = agent.dict(exclude_none=True) + + # Add any additional kwargs + scrape_params.update(kwargs) + + # Create final params object + final_params = ScrapeParams(**scrape_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['urls'] = urls + params_dict['origin'] = f"python-sdk@{version}" + + # Make request headers = self._prepare_headers(idempotency_key) - json_data = {'urls': urls} - if params: - json_data.update(params) - response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers) + response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers) + if response.status_code == 200: try: - return response.json() + return BatchScrapeResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, 'start batch scrape job') - def batch_scrape_urls_and_watch(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher': + def batch_scrape_urls_and_watch( + self, + urls: List[str], + *, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + headers: Optional[Dict[str, str]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", 
"stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + agent: Optional[AgentOptions] = None, + idempotency_key: Optional[str] = None, + **kwargs + ) -> 'CrawlWatcher': """ Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket. Args: - urls (List[str]): The URLs to scrape. - params (Optional[Dict[str, Any]]): Additional parameters for the scraper. - idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + urls (List[str]): URLs to scrape + formats (Optional[List[Literal]]): Content formats to retrieve + headers (Optional[Dict[str, str]]): Custom HTTP headers + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait time in milliseconds + timeout (Optional[int]): Request timeout in milliseconds + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 encoded images + block_ads (Optional[bool]): Block advertisements + proxy (Optional[Literal]): Proxy type to use + extract (Optional[ExtractConfig]): Content extraction config + json_options (Optional[ExtractConfig]): JSON extraction config + actions (Optional[List[Union]]): Actions to perform + agent (Optional[AgentOptions]): Agent configuration + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API Returns: - CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job. 
+ CrawlWatcher: An instance to monitor the batch scrape job via WebSocket + + Raises: + Exception: If batch scrape job fails to start """ - crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key) - if crawl_response['success'] and 'id' in crawl_response: - return CrawlWatcher(crawl_response['id'], self) + scrape_params = {} + + # Add individual parameters + if formats is not None: + scrape_params['formats'] = formats + if headers is not None: + scrape_params['headers'] = headers + if include_tags is not None: + scrape_params['includeTags'] = include_tags + if exclude_tags is not None: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for is not None: + scrape_params['waitFor'] = wait_for + if timeout is not None: + scrape_params['timeout'] = timeout + if location is not None: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy is not None: + scrape_params['proxy'] = proxy + if extract is not None: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options is not None: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if actions is not None: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + if agent is not None: + scrape_params['agent'] = agent.dict(exclude_none=True) + + # Add any additional kwargs + scrape_params.update(kwargs) + + # Create final params object + final_params = ScrapeParams(**scrape_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['urls'] = urls + params_dict['origin'] = f"python-sdk@{version}" + + # Make request + headers = self._prepare_headers(idempotency_key) + response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers) + + if response.status_code == 200: + try: + crawl_response = BatchScrapeResponse(**response.json()) + if crawl_response.success and crawl_response.id: + return CrawlWatcher(crawl_response.id, self) + else: + raise Exception("Batch scrape job failed to start") + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') else: - raise Exception("Batch scrape job failed to start") + self._handle_error(response, 'start batch scrape job') - def check_batch_scrape_status(self, id: str) -> Any: + def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse: """ Check the status of a batch scrape job using the Firecrawl API. @@ -585,7 +1520,7 @@ class FirecrawlApp: id (str): The ID of the batch scrape job. Returns: - Any: The status of the batch scrape job. + BatchScrapeStatusResponse: The status of the batch scrape job. Raises: Exception: If the status check request fails. 
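For reference, a minimal usage sketch of the keyword-argument batch scrape API introduced above, assuming the published package still exposes FirecrawlApp; the API key, URLs, and option values below are placeholders, not part of this change:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# Blocking call: starts the job and polls every `poll_interval` seconds,
# returning a typed BatchScrapeStatusResponse when the job finishes.
status = app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],  # placeholder URLs
    formats=["markdown", "links"],
    only_main_content=True,
    poll_interval=2,
)
print(status.status, f"{status.completed}/{status.total} pages scraped")

# Non-blocking call: returns immediately with a job id that can be
# checked later via check_batch_scrape_status.
job = app.async_batch_scrape_urls(["https://example.com"], formats=["markdown"])
print(app.check_batch_scrape_status(job.id).status)
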
@@ -625,29 +1560,21 @@ class FirecrawlApp: break status_data['data'] = data - response = { + return BatchScrapeStatusResponse(**{ + 'success': False if 'error' in status_data else True, 'status': status_data.get('status'), 'total': status_data.get('total'), 'completed': status_data.get('completed'), 'creditsUsed': status_data.get('creditsUsed'), 'expiresAt': status_data.get('expiresAt'), - 'data': status_data.get('data') - } - - if 'error' in status_data: - response['error'] = status_data['error'] - - if 'next' in status_data: - response['next'] = status_data['next'] - - return { - 'success': False if 'error' in status_data else True, - **response - } + 'data': status_data.get('data'), + 'next': status_data.get('next'), + 'error': status_data.get('error') + }) else: self._handle_error(response, 'check batch scrape status') - def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]: + def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse: """ Returns information about batch scrape errors. @@ -655,38 +1582,68 @@ class FirecrawlApp: id (str): The ID of the crawl job. Returns: - Dict[str, Any]: Information about crawl errors. + CrawlErrorsResponse: A response containing: + * errors (List[Dict[str, str]]): List of errors with fields: + * id (str): Error ID + * timestamp (str): When the error occurred + * url (str): URL that caused the error + * error (str): Error message + * robotsBlocked (List[str]): List of URLs blocked by robots.txt + + Raises: + Exception: If the error check request fails """ headers = self._prepare_headers() response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers) if response.status_code == 200: try: - return response.json() + return CrawlErrorsResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: self._handle_error(response, "check batch scrape errors") - def extract(self, urls: Optional[List[str]] = None, params: Optional[ExtractParams] = None) -> Any: + def extract( + self, + urls: Optional[List[str]] = None, + *, + prompt: Optional[str] = None, + schema: Optional[Any] = None, + system_prompt: Optional[str] = None, + allow_external_links: Optional[bool] = False, + enable_web_search: Optional[bool] = False, + show_sources: Optional[bool] = False, + agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]: """ - Extracts information from a URL using the Firecrawl API. + Extract structured information from URLs. Args: - urls (Optional[List[str]]): The URLs to extract information from. - params (Optional[ExtractParams]): Additional parameters for the extract request. + urls (Optional[List[str]]): URLs to extract from + prompt (Optional[str]): Custom extraction prompt + schema (Optional[Any]): JSON schema/Pydantic model + system_prompt (Optional[str]): System context + allow_external_links (Optional[bool]): Follow external links + enable_web_search (Optional[bool]): Enable web search + show_sources (Optional[bool]): Include source URLs + agent (Optional[Dict[str, Any]]): Agent configuration Returns: - Union[ExtractResponse, ErrorResponse]: The response from the extract operation. 
+            ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any
+
+        Raises:
+            ValueError: If prompt/schema missing or extraction fails
         """
         headers = self._prepare_headers()

-        if not params or (not params.get('prompt') and not params.get('schema')):
+        if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")

-        if not urls and not params.get('prompt'):
+        if not urls and not prompt:
             raise ValueError("Either urls or prompt is required")

-        schema = params.get('schema')
         if schema:
             if hasattr(schema, 'model_json_schema'):
                 # Convert Pydantic model to JSON schema
@@ -694,26 +1651,22 @@
             # Otherwise assume it's already a JSON schema dict

         request_data = {
-            'urls': urls,
-            'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)),
-            'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)),
-            'showSources': params.get('show_sources', params.get('showSources', False)),
+            'urls': urls or [],
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
-            'origin': 'api-sdk'
+            'origin': f'python-sdk@{version}'
         }

-        if not request_data['urls']:
-            request_data['urls'] = []
         # Only add prompt and systemPrompt if they exist
-        if params.get('prompt'):
-            request_data['prompt'] = params['prompt']
-        if params.get('system_prompt'):
-            request_data['systemPrompt'] = params['system_prompt']
-        elif params.get('systemPrompt'):  # Check legacy field name
-            request_data['systemPrompt'] = params['systemPrompt']
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt

-        if params.get('agent'):
-            request_data['agent'] = params['agent']
+        if agent:
+            request_data['agent'] = agent

         try:
             # Send the initial extract request
@@ -744,10 +1697,7 @@
                     except:
                         raise Exception(f'Failed to parse Firecrawl response as JSON.')
                     if status_data['status'] == 'completed':
-                        if status_data['success']:
-                            return status_data
-                        else:
-                            raise Exception(f'Failed to extract. Error: {status_data["error"]}')
+                        return ExtractResponse(**status_data)
                     elif status_data['status'] in ['failed', 'cancelled']:
                         raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
                     else:
@@ -761,9 +1711,9 @@
         except Exception as e:
             raise ValueError(str(e), 500)

-        return {'success': False, 'error': "Internal server error."}
+        return ExtractResponse(success=False, error="Internal server error.")

-    def get_extract_status(self, job_id: str) -> Dict[str, Any]:
+    def get_extract_status(self, job_id: str) -> ExtractResponse[Any]:
         """
         Retrieve the status of an extract job.

@@ -771,7 +1721,7 @@
             job_id (str): The ID of the extract job.

         Returns:
-            Dict[str, Any]: The status of the extract job.
+            ExtractResponse[Any]: The status of the extract job.

         Raises:
             ValueError: If there is an error retrieving the status.
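As a quick illustration of the keyword-only extract API above: a hedged sketch in which the Pydantic model, URL, and API key are invented placeholders; per the code above, a Pydantic schema is converted with model_json_schema() before the request is sent:

from typing import List
from pydantic import BaseModel
from firecrawl import FirecrawlApp

class ArticleInfo(BaseModel):  # placeholder schema for illustration
    title: str
    authors: List[str]

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
result = app.extract(
    ["https://example.com/blog/post"],  # placeholder URL
    prompt="Extract the article title and authors",
    schema=ArticleInfo,
)
if result.success:
    print(result.data)
else:
    print(result.error)
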
@@ -781,7 +1731,7 @@ class FirecrawlApp: response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers) if response.status_code == 200: try: - return response.json() + return ExtractResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: @@ -789,43 +1739,71 @@ class FirecrawlApp: except Exception as e: raise ValueError(str(e), 500) - def async_extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]: + def async_extract( + self, + urls: List[str], + *, + prompt: Optional[str] = None, + schema: Optional[Any] = None, + system_prompt: Optional[str] = None, + allow_external_links: Optional[bool] = False, + enable_web_search: Optional[bool] = False, + show_sources: Optional[bool] = False, + agent: Optional[Dict[str, Any]] = None, + idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: """ Initiate an asynchronous extract job. Args: - urls (List[str]): The URLs to extract data from. - params (Optional[Dict[str, Any]]): Additional parameters for the extract request. - idempotency_key (Optional[str]): A unique key to ensure idempotency of requests. + urls (List[str]): URLs to extract information from + prompt (Optional[str]): Custom extraction prompt + schema (Optional[Any]): JSON schema/Pydantic model + system_prompt (Optional[str]): System context + allow_external_links (Optional[bool]): Follow external links + enable_web_search (Optional[bool]): Enable web search + show_sources (Optional[bool]): Include source URLs + agent (Optional[Dict[str, Any]]): Agent configuration + idempotency_key (Optional[str]): Unique key to prevent duplicate requests Returns: - Dict[str, Any]: The response from the extract operation. + ExtractResponse[Any] with: + * success (bool): Whether request succeeded + * data (Optional[Any]): Extracted data matching schema + * error (Optional[str]): Error message if any Raises: - ValueError: If there is an error initiating the extract job. 
+ ValueError: If job initiation fails """ headers = self._prepare_headers(idempotency_key) - schema = params.get('schema') if params else None + schema = schema if schema: if hasattr(schema, 'model_json_schema'): # Convert Pydantic model to JSON schema schema = schema.model_json_schema() # Otherwise assume it's already a JSON schema dict - jsonData = {'urls': urls, **(params or {})} request_data = { - **jsonData, - 'allowExternalLinks': params.get('allow_external_links', False) if params else False, + 'urls': urls, + 'allowExternalLinks': allow_external_links, + 'enableWebSearch': enable_web_search, + 'showSources': show_sources, 'schema': schema, - 'origin': 'api-sdk' + 'origin': f'python-sdk@{version}' } + if prompt: + request_data['prompt'] = prompt + if system_prompt: + request_data['systemPrompt'] = system_prompt + if agent: + request_data['agent'] = agent + try: response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers) if response.status_code == 200: try: - return response.json() + return ExtractResponse(**response.json()) except: raise Exception(f'Failed to parse Firecrawl response as JSON.') else: @@ -833,34 +1811,44 @@ class FirecrawlApp: except Exception as e: raise ValueError(str(e), 500) - def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]: + def generate_llms_text( + self, + url: str, + *, + max_urls: Optional[int] = None, + show_full_text: Optional[bool] = None, + experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse: """ Generate LLMs.txt for a given URL and poll until completion. Args: - url (str): The URL to generate LLMs.txt from. - params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation. + url (str): Target URL to generate LLMs.txt from + max_urls (Optional[int]): Maximum URLs to process (default: 10) + show_full_text (Optional[bool]): Include full text in output (default: False) + experimental_stream (Optional[bool]): Enable experimental streaming Returns: - Dict[str, Any]: A dictionary containing the generation results. The structure includes: - - 'success' (bool): Indicates if the generation was successful. - - 'status' (str): The final status of the generation job. - - 'data' (Dict): The generated LLMs.txt data. - - 'error' (Optional[str]): Error message if the generation failed. - - 'expiresAt' (str): ISO 8601 formatted date-time string indicating when the data expires. + GenerateLLMsTextStatusResponse with: + * Generated LLMs.txt content + * Full version if requested + * Generation status + * Success/error information Raises: - Exception: If the generation job fails or an error occurs during status checks. 
+ Exception: If generation fails """ - if params is None: - params = {} + params = GenerateLLMsTextParams( + maxUrls=max_urls, + showFullText=show_full_text, + __experimental_stream=experimental_stream + ) - if isinstance(params, dict): - generation_params = GenerateLLMsTextParams(**params) - else: - generation_params = params - - response = self.async_generate_llms_text(url, generation_params) + response = self.async_generate_llms_text( + url, + max_urls=max_urls, + show_full_text=show_full_text, + experimental_stream=experimental_stream + ) if not response.get('success') or 'id' not in response: return response @@ -879,32 +1867,40 @@ class FirecrawlApp: return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'} - def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> Dict[str, Any]: + def async_generate_llms_text( + self, + url: str, + *, + max_urls: Optional[int] = None, + show_full_text: Optional[bool] = None, + experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse: """ Initiate an asynchronous LLMs.txt generation operation. Args: - url (str): The URL to generate LLMs.txt from. - params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): Parameters for the LLMs.txt generation. + url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL. + max_urls (Optional[int]): Maximum URLs to process (default: 10) + show_full_text (Optional[bool]): Include full text in output (default: False) + experimental_stream (Optional[bool]): Enable experimental streaming Returns: - Dict[str, Any]: A dictionary containing the generation initiation response. The structure includes: - - 'success' (bool): Indicates if the generation initiation was successful. - - 'id' (str): The unique identifier for the generation job. + GenerateLLMsTextResponse: A response containing: + * success (bool): Whether the generation initiation was successful + * id (str): The unique identifier for the generation job + * error (str, optional): Error message if initiation failed Raises: Exception: If the generation job initiation fails. """ - if params is None: - params = {} - - if isinstance(params, dict): - generation_params = GenerateLLMsTextParams(**params) - else: - generation_params = params + params = GenerateLLMsTextParams( + maxUrls=max_urls, + showFullText=show_full_text, + __experimental_stream=experimental_stream + ) headers = self._prepare_headers() - json_data = {'url': url, **generation_params.dict(exclude_none=True)} + json_data = {'url': url, **params.dict(exclude_none=True)} + json_data['origin'] = f"python-sdk@{version}" try: response = self._post_request(f'{self.api_url}/v1/llmstxt', json_data, headers) @@ -920,15 +1916,22 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} - def check_generate_llms_text_status(self, id: str) -> Dict[str, Any]: + def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse: """ Check the status of a LLMs.txt generation operation. Args: - id (str): The ID of the LLMs.txt generation operation. + id (str): The unique identifier of the LLMs.txt generation job to check status for. Returns: - Dict[str, Any]: The current status and results of the generation operation. 
+ GenerateLLMsTextStatusResponse: A response containing: + * success (bool): Whether the generation was successful + * status (str): Status of generation ("processing", "completed", "failed") + * data (Dict[str, str], optional): Generated text with fields: + * llmstxt (str): Generated LLMs.txt content + * llmsfulltxt (str, optional): Full version if requested + * error (str, optional): Error message if generation failed + * expiresAt (str): When the generated data expires Raises: Exception: If the status check fails. @@ -950,7 +1953,9 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} - def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]: + def _prepare_headers( + self, + idempotency_key: Optional[str] = None) -> Dict[str, str]: """ Prepare the headers for API requests. @@ -972,11 +1977,13 @@ class FirecrawlApp: 'Authorization': f'Bearer {self.api_key}', } - def _post_request(self, url: str, - data: Dict[str, Any], - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> requests.Response: + def _post_request( + self, + url: str, + data: Dict[str, Any], + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> requests.Response: """ Make a POST request with retries. @@ -1001,10 +2008,12 @@ class FirecrawlApp: return response return response - def _get_request(self, url: str, - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> requests.Response: + def _get_request( + self, + url: str, + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> requests.Response: """ Make a GET request with retries. @@ -1028,10 +2037,12 @@ class FirecrawlApp: return response return response - def _delete_request(self, url: str, - headers: Dict[str, str], - retries: int = 3, - backoff_factor: float = 0.5) -> requests.Response: + def _delete_request( + self, + url: str, + headers: Dict[str, str], + retries: int = 3, + backoff_factor: float = 0.5) -> requests.Response: """ Make a DELETE request with retries. @@ -1055,16 +2066,21 @@ class FirecrawlApp: return response return response - def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any: + def _monitor_job_status( + self, + id: str, + headers: Dict[str, str], + poll_interval: int) -> CrawlStatusResponse: """ Monitor the status of a crawl job until completion. Args: id (str): The ID of the crawl job. headers (Dict[str, str]): The headers to include in the status check requests. - poll_interval (int): Secounds between status checks. + poll_interval (int): Seconds between status checks. + Returns: - Any: The crawl results if the job is completed successfully. + CrawlStatusResponse: The crawl results if the job is completed successfully. Raises: Exception: If the job fails or an error occurs during status checks. 
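The private _post_request, _get_request, and _delete_request helpers above share one retry contract: retry on HTTP 502, sleeping backoff_factor * (2 ** attempt) seconds between attempts, and otherwise return the response. A standalone sketch of that pattern for reference; the function name post_with_retries is illustrative only and not part of the SDK:

import time
import requests

def post_with_retries(url, data, headers, retries=3, backoff_factor=0.5):
    """Retry POSTs on HTTP 502 with exponential backoff: 0.5s, 1s, 2s, ..."""
    response = None
    for attempt in range(retries):
        response = requests.post(url, headers=headers, json=data)
        if response.status_code == 502:
            time.sleep(backoff_factor * (2 ** attempt))  # back off, then retry
            continue
        return response
    return response  # last 502 response if all retries were exhausted
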
@@ -1091,7 +2107,7 @@ class FirecrawlApp:
                         raise Exception(f'Failed to parse Firecrawl response as JSON.')
                     data.extend(status_data.get('data', []))
                 status_data['data'] = data
-                return status_data
+                return CrawlStatusResponse(**status_data)
             else:
                 raise Exception('Crawl job completed but no data was returned')
         elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
@@ -1102,7 +2118,10 @@
         else:
             self._handle_error(status_response, 'check crawl status')

-    def _handle_error(self, response: requests.Response, action: str) -> None:
+    def _handle_error(
+            self,
+            response: requests.Response,
+            action: str) -> None:
         """
         Handle errors from API responses.

@@ -1119,49 +2138,100 @@
         except:
             raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
-
-        if response.status_code == 402:
-            message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
-        elif response.status_code == 403:
-            message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
-        elif response.status_code == 408:
-            message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
-        elif response.status_code == 409:
-            message = f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
-        elif response.status_code == 500:
-            message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
-        else:
-            message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
+        message = self._get_error_message(response.status_code, action, error_message, error_details)

         # Raise an HTTPError with the custom message and attach the response
         raise requests.exceptions.HTTPError(message, response=response)

-    def deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None,
-                      on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
-                      on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> Dict[str, Any]:
+    def _get_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str:
+        """
+        Generate a standardized error message based on HTTP status code.
+
+        Args:
+            status_code (int): The HTTP status code from the response
+            action (str): Description of the action that was being performed
+            error_message (str): The error message from the API response
+            error_details (str): Additional error details from the API response
+
+        Returns:
+            str: A formatted error message
+        """
+        if status_code == 402:
+            return f"Payment Required: Failed to {action}. {error_message} - {error_details}"
+        elif status_code == 403:
+            return f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
+        elif status_code == 408:
+            return f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
+        elif status_code == 409:
+            return f"Conflict: Failed to {action} due to a conflict. {error_message} - {error_details}"
+        elif status_code == 500:
+            return f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
+        else:
+            return f"Unexpected error during {action}: Status code {status_code}. 
{error_message} - {error_details}" + + def deep_research( + self, + query: str, + *, + max_depth: Optional[int] = None, + time_limit: Optional[int] = None, + max_urls: Optional[int] = None, + analysis_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, + __experimental_stream_steps: Optional[bool] = None, + on_activity: Optional[Callable[[Dict[str, Any]], None]] = None, + on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse: """ Initiates a deep research operation on a given query and polls until completion. Args: - query (str): The query to research. - params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation. - on_activity (Optional[Callable[[Dict[str, Any]], None]]): Optional callback to receive activity updates in real-time. + query (str): Research query or topic to investigate + max_depth (Optional[int]): Maximum depth of research exploration + time_limit (Optional[int]): Time limit in seconds for research + max_urls (Optional[int]): Maximum number of URLs to process + analysis_prompt (Optional[str]): Custom prompt for analysis + system_prompt (Optional[str]): Custom system prompt + __experimental_stream_steps (Optional[bool]): Enable experimental streaming + on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth} + on_source (Optional[Callable]): Source discovery callback receiving {url, title, description} Returns: - Dict[str, Any]: The final research results. + DeepResearchStatusResponse containing: + * success (bool): Whether research completed successfully + * status (str): Current state (processing/completed/failed) + * error (Optional[str]): Error message if failed + * id (str): Unique identifier for the research job + * data (Any): Research findings and analysis + * sources (List[Dict]): List of discovered sources + * activities (List[Dict]): Research progress log + * summaries (List[str]): Generated research summaries Raises: - Exception: If the research operation fails. 
+ Exception: If research fails """ - if params is None: - params = {} + research_params = {} + if max_depth is not None: + research_params['maxDepth'] = max_depth + if time_limit is not None: + research_params['timeLimit'] = time_limit + if max_urls is not None: + research_params['maxUrls'] = max_urls + if analysis_prompt is not None: + research_params['analysisPrompt'] = analysis_prompt + if system_prompt is not None: + research_params['systemPrompt'] = system_prompt + if __experimental_stream_steps is not None: + research_params['__experimental_streamSteps'] = __experimental_stream_steps + research_params = DeepResearchParams(**research_params) - if isinstance(params, dict): - research_params = DeepResearchParams(**params) - else: - research_params = params - - response = self.async_deep_research(query, research_params) + response = self.async_deep_research( + query, + max_depth=max_depth, + time_limit=time_limit, + max_urls=max_urls, + analysis_prompt=analysis_prompt, + system_prompt=system_prompt + ) if not response.get('success') or 'id' not in response: return response @@ -1194,31 +2264,57 @@ class FirecrawlApp: time.sleep(2) # Polling interval return {'success': False, 'error': 'Deep research job terminated unexpectedly'} - def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]: + + def async_deep_research( + self, + query: str, + *, + max_depth: Optional[int] = None, + time_limit: Optional[int] = None, + max_urls: Optional[int] = None, + analysis_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, + __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]: """ Initiates an asynchronous deep research operation. Args: - query (str): The query to research. - params (Optional[Union[Dict[str, Any], DeepResearchParams]]): Parameters for the deep research operation. + query (str): Research query or topic to investigate + max_depth (Optional[int]): Maximum depth of research exploration + time_limit (Optional[int]): Time limit in seconds for research + max_urls (Optional[int]): Maximum number of URLs to process + analysis_prompt (Optional[str]): Custom prompt for analysis + system_prompt (Optional[str]): Custom system prompt + __experimental_stream_steps (Optional[bool]): Enable experimental streaming Returns: - Dict[str, Any]: The response from the deep research initiation. + Dict[str, Any]: A response containing: + * success (bool): Whether the research initiation was successful + * id (str): The unique identifier for the research job + * error (str, optional): Error message if initiation failed Raises: Exception: If the research initiation fails. 
""" - if params is None: - params = {} - - if isinstance(params, dict): - research_params = DeepResearchParams(**params) - else: - research_params = params + research_params = {} + if max_depth is not None: + research_params['maxDepth'] = max_depth + if time_limit is not None: + research_params['timeLimit'] = time_limit + if max_urls is not None: + research_params['maxUrls'] = max_urls + if analysis_prompt is not None: + research_params['analysisPrompt'] = analysis_prompt + if system_prompt is not None: + research_params['systemPrompt'] = system_prompt + if __experimental_stream_steps is not None: + research_params['__experimental_streamSteps'] = __experimental_stream_steps + research_params = DeepResearchParams(**research_params) headers = self._prepare_headers() json_data = {'query': query, **research_params.dict(exclude_none=True)} + json_data['origin'] = f"python-sdk@{version}" # Handle json options schema if present if 'jsonOptions' in json_data: @@ -1240,7 +2336,7 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} - def check_deep_research_status(self, id: str) -> Dict[str, Any]: + def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse: """ Check the status of a deep research operation. @@ -1248,7 +2344,19 @@ class FirecrawlApp: id (str): The ID of the deep research operation. Returns: - Dict[str, Any]: The current status and results of the research operation. + DeepResearchResponse containing: + + Status: + * success - Whether research completed successfully + * status - Current state (processing/completed/failed) + * error - Error message if failed + + Results: + * id - Unique identifier for the research job + * data - Research findings and analysis + * sources - List of discovered sources + * activities - Research progress log + * summaries - Generated research summaries Raises: Exception: If the status check fails. @@ -1271,6 +2379,17 @@ class FirecrawlApp: return {'success': False, 'error': 'Internal server error'} class CrawlWatcher: + """ + A class to watch and handle crawl job events via WebSocket connection. + + Attributes: + id (str): The ID of the crawl job to watch + app (FirecrawlApp): The FirecrawlApp instance + data (List[Dict[str, Any]]): List of crawled documents/data + status (str): Current status of the crawl job + ws_url (str): WebSocket URL for the crawl job + event_handlers (dict): Dictionary of event type to list of handler functions + """ def __init__(self, id: str, app: FirecrawlApp): self.id = id self.app = app @@ -1283,25 +2402,57 @@ class CrawlWatcher: 'document': [] } - async def connect(self): - async with websockets.connect(self.ws_url, extra_headers={"Authorization": f"Bearer {self.app.api_key}"}) as websocket: + async def connect(self) -> None: + """ + Establishes WebSocket connection and starts listening for messages. + """ + async with websockets.connect( + self.ws_url, + additional_headers=[("Authorization", f"Bearer {self.app.api_key}")] + ) as websocket: await self._listen(websocket) - async def _listen(self, websocket): + async def _listen(self, websocket) -> None: + """ + Listens for incoming WebSocket messages and handles them. 
+ + Args: + websocket: The WebSocket connection object + """ async for message in websocket: msg = json.loads(message) await self._handle_message(msg) - def add_event_listener(self, event_type: str, handler): + def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None: + """ + Adds an event handler function for a specific event type. + + Args: + event_type (str): Type of event to listen for ('done', 'error', or 'document') + handler (Callable): Function to handle the event + """ if event_type in self.event_handlers: self.event_handlers[event_type].append(handler) - def dispatch_event(self, event_type: str, detail: Dict[str, Any]): + def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None: + """ + Dispatches an event to all registered handlers for that event type. + + Args: + event_type (str): Type of event to dispatch + detail (Dict[str, Any]): Event details/data to pass to handlers + """ if event_type in self.event_handlers: for handler in self.event_handlers[event_type]: handler(detail) - async def _handle_message(self, msg: Dict[str, Any]): + async def _handle_message(self, msg: Dict[str, Any]) -> None: + """ + Handles incoming WebSocket messages based on their type. + + Args: + msg (Dict[str, Any]): The message to handle + """ if msg['type'] == 'done': self.status = 'completed' self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id}) @@ -1316,3 +2467,1773 @@ class CrawlWatcher: elif msg['type'] == 'document': self.data.append(msg['data']) self.dispatch_event('document', {'data': msg['data'], 'id': self.id}) + +class AsyncFirecrawlApp(FirecrawlApp): + """ + Asynchronous version of FirecrawlApp that implements async methods using aiohttp. + Provides non-blocking alternatives to all FirecrawlApp operations. + """ + + async def _async_request( + self, + method: str, + url: str, + headers: Dict[str, str], + data: Optional[Dict[str, Any]] = None, + retries: int = 3, + backoff_factor: float = 0.5) -> Dict[str, Any]: + """ + Generic async request method with exponential backoff retry logic. + + Args: + method (str): The HTTP method to use (e.g., "GET" or "POST"). + url (str): The URL to send the request to. + headers (Dict[str, str]): Headers to include in the request. + data (Optional[Dict[str, Any]]): The JSON data to include in the request body (only for POST requests). + retries (int): Maximum number of retry attempts (default: 3). + backoff_factor (float): Factor to calculate delay between retries (default: 0.5). + Delay will be backoff_factor * (2 ** retry_count). + + Returns: + Dict[str, Any]: The parsed JSON response from the server. + + Raises: + aiohttp.ClientError: If the request fails after all retries. + Exception: If max retries are exceeded or other errors occur. 
+ """ + async with aiohttp.ClientSession() as session: + for attempt in range(retries): + try: + async with session.request( + method=method, url=url, headers=headers, json=data + ) as response: + if response.status == 502: + await asyncio.sleep(backoff_factor * (2 ** attempt)) + continue + if response.status >= 300: + await self._handle_error(response, f"make {method} request") + return await response.json() + except aiohttp.ClientError as e: + if attempt == retries - 1: + raise e + await asyncio.sleep(backoff_factor * (2 ** attempt)) + raise Exception("Max retries exceeded") + + async def _async_post_request( + self, url: str, data: Dict[str, Any], headers: Dict[str, str], + retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]: + """ + Make an async POST request with exponential backoff retry logic. + + Args: + url (str): The URL to send the POST request to. + data (Dict[str, Any]): The JSON data to include in the request body. + headers (Dict[str, str]): Headers to include in the request. + retries (int): Maximum number of retry attempts (default: 3). + backoff_factor (float): Factor to calculate delay between retries (default: 0.5). + Delay will be backoff_factor * (2 ** retry_count). + + Returns: + Dict[str, Any]: The parsed JSON response from the server. + + Raises: + aiohttp.ClientError: If the request fails after all retries. + Exception: If max retries are exceeded or other errors occur. + """ + return await self._async_request("POST", url, headers, data, retries, backoff_factor) + + async def _async_get_request( + self, url: str, headers: Dict[str, str], + retries: int = 3, backoff_factor: float = 0.5) -> Dict[str, Any]: + """ + Make an async GET request with exponential backoff retry logic. + + Args: + url (str): The URL to send the GET request to. + headers (Dict[str, str]): Headers to include in the request. + retries (int): Maximum number of retry attempts (default: 3). + backoff_factor (float): Factor to calculate delay between retries (default: 0.5). + Delay will be backoff_factor * (2 ** retry_count). + + Returns: + Dict[str, Any]: The parsed JSON response from the server. + + Raises: + aiohttp.ClientError: If the request fails after all retries. + Exception: If max retries are exceeded or other errors occur. + """ + return await self._async_request("GET", url, headers, None, retries, backoff_factor) + + async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None: + """ + Handle errors from async API responses with detailed error messages. + + Args: + response (aiohttp.ClientResponse): The response object from the failed request + action (str): Description of the action that was being attempted + + Raises: + aiohttp.ClientError: With a detailed error message based on the response status: + - 402: Payment Required + - 408: Request Timeout + - 409: Conflict + - 500: Internal Server Error + - Other: Unexpected error with status code + """ + try: + error_data = await response.json() + error_message = error_data.get('error', 'No error message provided.') + error_details = error_data.get('details', 'No additional error details provided.') + except: + raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. 
Status code: {response.status}') + + message = await self._get_async_error_message(response.status, action, error_message, error_details) + + raise aiohttp.ClientError(message) + + async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str: + """ + Generate a standardized error message based on HTTP status code for async operations. + + Args: + status_code (int): The HTTP status code from the response + action (str): Description of the action that was being performed + error_message (str): The error message from the API response + error_details (str): Additional error details from the API response + + Returns: + str: A formatted error message + """ + return self._get_error_message(status_code, action, error_message, error_details) + + async def crawl_url_and_watch( + self, + url: str, + params: Optional[CrawlParams] = None, + idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher': + """ + Initiate an async crawl job and return an AsyncCrawlWatcher to monitor progress via WebSocket. + + Args: + url (str): Target URL to start crawling from + params (Optional[CrawlParams]): See CrawlParams model for configuration: + URL Discovery: + * includePaths - Patterns of URLs to include + * excludePaths - Patterns of URLs to exclude + * maxDepth - Maximum crawl depth + * maxDiscoveryDepth - Maximum depth for finding new URLs + * limit - Maximum pages to crawl + + Link Following: + * allowBackwardLinks - Follow parent directory links + * allowExternalLinks - Follow external domain links + * ignoreSitemap - Skip sitemap.xml processing + + Advanced: + * scrapeOptions - Page scraping configuration + * webhook - Notification webhook settings + * deduplicateSimilarURLs - Remove similar URLs + * ignoreQueryParameters - Ignore URL parameters + * regexOnFullURL - Apply regex to full URLs + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + + Returns: + AsyncCrawlWatcher: An instance to monitor the crawl job via WebSocket + + Raises: + Exception: If crawl job fails to start + """ + crawl_response = await self.async_crawl_url(url, params, idempotency_key) + if crawl_response.get('success') and 'id' in crawl_response: + return AsyncCrawlWatcher(crawl_response['id'], self) + else: + raise Exception("Crawl job failed to start") + + async def batch_scrape_urls_and_watch( + self, + urls: List[str], + params: Optional[ScrapeParams] = None, + idempotency_key: Optional[str] = None) -> 'AsyncCrawlWatcher': + """ + Initiate an async batch scrape job and return an AsyncCrawlWatcher to monitor progress. 
+ + Args: + urls (List[str]): List of URLs to scrape + params (Optional[ScrapeParams]): See ScrapeParams model for configuration: + + Content Options: + * formats - Content formats to retrieve + * includeTags - HTML tags to include + * excludeTags - HTML tags to exclude + * onlyMainContent - Extract main content only + + Request Options: + * headers - Custom HTTP headers + * timeout - Request timeout (ms) + * mobile - Use mobile user agent + * proxy - Proxy type + + Extraction Options: + * extract - Content extraction config + * jsonOptions - JSON extraction config + * actions - Actions to perform + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + + Returns: + AsyncCrawlWatcher: An instance to monitor the batch scrape job via WebSocket + + Raises: + Exception: If batch scrape job fails to start + """ + batch_response = await self.async_batch_scrape_urls(urls, params, idempotency_key) + if batch_response.get('success') and 'id' in batch_response: + return AsyncCrawlWatcher(batch_response['id'], self) + else: + raise Exception("Batch scrape job failed to start") + + async def scrape_url( + self, + url: str, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]: + """ + Scrape and extract content from a URL asynchronously. 
+ + Args: + url (str): Target URL to scrape + formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc) + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait for a specific element to appear + timeout (Optional[int]): Request timeout (ms) + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 images + block_ads (Optional[bool]): Block ads + proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth) + extract (Optional[ExtractConfig]): Content extraction settings + json_options (Optional[ExtractConfig]): JSON extraction settings + actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform + + Returns: + ScrapeResponse with: + * Requested content formats + * Page metadata + * Extraction results + * Success/error status + + Raises: + Exception: If scraping fails + """ + headers = self._prepare_headers() + + # Build scrape parameters + scrape_params = { + 'url': url, + 'origin': f"python-sdk@{version}" + } + + # Add optional parameters if provided and not None + if formats: + scrape_params['formats'] = formats + if include_tags: + scrape_params['includeTags'] = include_tags + if exclude_tags: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for: + scrape_params['waitFor'] = wait_for + if timeout: + scrape_params['timeout'] = timeout + if location: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy: + scrape_params['proxy'] = proxy + if extract: + extract_dict = extract.dict(exclude_none=True) + if 'schema' in extract_dict and hasattr(extract.schema, 'schema'): + extract_dict['schema'] = extract.schema.schema() # Ensure pydantic model schema is converted + scrape_params['extract'] = extract_dict + if json_options: + json_options_dict = json_options.dict(exclude_none=True) + if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'): + json_options_dict['schema'] = json_options.schema.schema() # Ensure pydantic model schema is converted + scrape_params['jsonOptions'] = json_options_dict + if actions: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + + # Make async request + endpoint = f'/v1/scrape' + response = await self._async_post_request( + f'{self.api_url}{endpoint}', + scrape_params, + headers + ) + + if response.get('success') and 'data' in response: + return ScrapeResponse(**response['data']) + elif "error" in response: + raise Exception(f'Failed to scrape URL. 
Error: {response["error"]}') + else: + # Use the response content directly if possible, otherwise a generic message + error_content = response.get('error', str(response)) + raise Exception(f'Failed to scrape URL. Error: {error_content}') + + async def batch_scrape_urls( + self, + urls: List[str], + *, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + headers: Optional[Dict[str, str]] = None, + include_tags: Optional[List[str]] = None, + exclude_tags: Optional[List[str]] = None, + only_main_content: Optional[bool] = None, + wait_for: Optional[int] = None, + timeout: Optional[int] = None, + location: Optional[LocationConfig] = None, + mobile: Optional[bool] = None, + skip_tls_verification: Optional[bool] = None, + remove_base64_images: Optional[bool] = None, + block_ads: Optional[bool] = None, + proxy: Optional[Literal["basic", "stealth"]] = None, + extract: Optional[ExtractConfig] = None, + json_options: Optional[ExtractConfig] = None, + actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None, + agent: Optional[AgentOptions] = None, + poll_interval: Optional[int] = 2, + idempotency_key: Optional[str] = None, + **kwargs + ) -> BatchScrapeStatusResponse: + """ + Asynchronously scrape multiple URLs and monitor until completion. + + Args: + urls (List[str]): URLs to scrape + formats (Optional[List[Literal]]): Content formats to retrieve + headers (Optional[Dict[str, str]]): Custom HTTP headers + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait time in milliseconds + timeout (Optional[int]): Request timeout in milliseconds + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 encoded images + block_ads (Optional[bool]): Block advertisements + proxy (Optional[Literal]): Proxy type to use + extract (Optional[ExtractConfig]): Content extraction config + json_options (Optional[ExtractConfig]): JSON extraction config + actions (Optional[List[Union]]): Actions to perform + agent (Optional[AgentOptions]): Agent configuration + poll_interval (Optional[int]): Seconds between status checks (default: 2) + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API + + Returns: + BatchScrapeStatusResponse with: + * Scraping status and progress + * Scraped content for each URL + * Success/error information + + Raises: + Exception: If batch scrape fails + """ + scrape_params = {} + + # Add individual parameters + if formats is not None: + scrape_params['formats'] = formats + if headers is not None: + scrape_params['headers'] = headers + if include_tags is not None: + scrape_params['includeTags'] = include_tags + if exclude_tags is not None: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for is not None: + scrape_params['waitFor'] = wait_for + if timeout is not None: + scrape_params['timeout'] = timeout + if location is not None: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not 
None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy is not None:
+            scrape_params['proxy'] = proxy
+        if extract is not None:
+            if hasattr(extract.schema, 'schema'):
+                extract.schema = extract.schema.schema()
+            scrape_params['extract'] = extract.dict(exclude_none=True)
+        if json_options is not None:
+            if hasattr(json_options.schema, 'schema'):
+                json_options.schema = json_options.schema.schema()
+            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
+        if actions is not None:
+            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+        if agent is not None:
+            scrape_params['agent'] = agent.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        scrape_params.update(kwargs)
+
+        # Create final params object
+        final_params = ScrapeParams(**scrape_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['urls'] = urls
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
+        headers = self._prepare_headers(idempotency_key)
+        response = await self._async_post_request(
+            f'{self.api_url}/v1/batch/scrape',
+            params_dict,
+            headers
+        )
+
+        # _async_post_request returns the parsed JSON body and raises on HTTP errors,
+        # so inspect the payload directly instead of treating it as a response object.
+        if response.get('success') and 'id' in response:
+            return await self._async_monitor_job_status(response['id'], headers, poll_interval)
+        else:
+            raise Exception(f'Failed to start batch scrape job. Error: {response.get("error", response)}')
+
+
+    async def async_batch_scrape_urls(
+        self,
+        urls: List[str],
+        *,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        headers: Optional[Dict[str, str]] = None,
+        include_tags: Optional[List[str]] = None,
+        exclude_tags: Optional[List[str]] = None,
+        only_main_content: Optional[bool] = None,
+        wait_for: Optional[int] = None,
+        timeout: Optional[int] = None,
+        location: Optional[LocationConfig] = None,
+        mobile: Optional[bool] = None,
+        skip_tls_verification: Optional[bool] = None,
+        remove_base64_images: Optional[bool] = None,
+        block_ads: Optional[bool] = None,
+        proxy: Optional[Literal["basic", "stealth"]] = None,
+        extract: Optional[ExtractConfig] = None,
+        json_options: Optional[ExtractConfig] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        agent: Optional[AgentOptions] = None,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> BatchScrapeResponse:
+        """
+        Initiate a batch scrape job asynchronously.
+ + Args: + urls (List[str]): URLs to scrape + formats (Optional[List[Literal]]): Content formats to retrieve + headers (Optional[Dict[str, str]]): Custom HTTP headers + include_tags (Optional[List[str]]): HTML tags to include + exclude_tags (Optional[List[str]]): HTML tags to exclude + only_main_content (Optional[bool]): Extract main content only + wait_for (Optional[int]): Wait time in milliseconds + timeout (Optional[int]): Request timeout in milliseconds + location (Optional[LocationConfig]): Location configuration + mobile (Optional[bool]): Use mobile user agent + skip_tls_verification (Optional[bool]): Skip TLS verification + remove_base64_images (Optional[bool]): Remove base64 encoded images + block_ads (Optional[bool]): Block advertisements + proxy (Optional[Literal]): Proxy type to use + extract (Optional[ExtractConfig]): Content extraction config + json_options (Optional[ExtractConfig]): JSON extraction config + actions (Optional[List[Union]]): Actions to perform + agent (Optional[AgentOptions]): Agent configuration + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API + + Returns: + BatchScrapeResponse with: + * success - Whether job started successfully + * id - Unique identifier for the job + * url - Status check URL + * error - Error message if start failed + + Raises: + Exception: If job initiation fails + """ + scrape_params = {} + + # Add individual parameters + if formats is not None: + scrape_params['formats'] = formats + if headers is not None: + scrape_params['headers'] = headers + if include_tags is not None: + scrape_params['includeTags'] = include_tags + if exclude_tags is not None: + scrape_params['excludeTags'] = exclude_tags + if only_main_content is not None: + scrape_params['onlyMainContent'] = only_main_content + if wait_for is not None: + scrape_params['waitFor'] = wait_for + if timeout is not None: + scrape_params['timeout'] = timeout + if location is not None: + scrape_params['location'] = location.dict(exclude_none=True) + if mobile is not None: + scrape_params['mobile'] = mobile + if skip_tls_verification is not None: + scrape_params['skipTlsVerification'] = skip_tls_verification + if remove_base64_images is not None: + scrape_params['removeBase64Images'] = remove_base64_images + if block_ads is not None: + scrape_params['blockAds'] = block_ads + if proxy is not None: + scrape_params['proxy'] = proxy + if extract is not None: + if hasattr(extract.schema, 'schema'): + extract.schema = extract.schema.schema() + scrape_params['extract'] = extract.dict(exclude_none=True) + if json_options is not None: + if hasattr(json_options.schema, 'schema'): + json_options.schema = json_options.schema.schema() + scrape_params['jsonOptions'] = json_options.dict(exclude_none=True) + if actions is not None: + scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions] + if agent is not None: + scrape_params['agent'] = agent.dict(exclude_none=True) + + # Add any additional kwargs + scrape_params.update(kwargs) + + # Create final params object + final_params = ScrapeParams(**scrape_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['urls'] = urls + params_dict['origin'] = f"python-sdk@{version}" + + # Make request + headers = self._prepare_headers(idempotency_key) + response = await self._async_post_request( + f'{self.api_url}/v1/batch/scrape', + params_dict, + headers + ) + + if response.status_code == 200: + try: + return 
BatchScrapeResponse(**response.json()) + except: + raise Exception(f'Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'start batch scrape job') + + async def crawl_url( + self, + url: str, + *, + include_paths: Optional[List[str]] = None, + exclude_paths: Optional[List[str]] = None, + max_depth: Optional[int] = None, + max_discovery_depth: Optional[int] = None, + limit: Optional[int] = None, + allow_backward_links: Optional[bool] = None, + allow_external_links: Optional[bool] = None, + ignore_sitemap: Optional[bool] = None, + scrape_options: Optional[CommonOptions] = None, + webhook: Optional[Union[str, WebhookConfig]] = None, + deduplicate_similar_urls: Optional[bool] = None, + ignore_query_parameters: Optional[bool] = None, + regex_on_full_url: Optional[bool] = None, + poll_interval: Optional[int] = 2, + idempotency_key: Optional[str] = None, + **kwargs + ) -> CrawlStatusResponse: + """ + Crawl a website starting from a URL. + + Args: + url (str): Target URL to start crawling from + include_paths (Optional[List[str]]): Patterns of URLs to include + exclude_paths (Optional[List[str]]): Patterns of URLs to exclude + max_depth (Optional[int]): Maximum crawl depth + max_discovery_depth (Optional[int]): Maximum depth for finding new URLs + limit (Optional[int]): Maximum pages to crawl + allow_backward_links (Optional[bool]): Follow parent directory links + allow_external_links (Optional[bool]): Follow external domain links + ignore_sitemap (Optional[bool]): Skip sitemap.xml processing + scrape_options (Optional[CommonOptions]): Page scraping configuration + webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings + deduplicate_similar_urls (Optional[bool]): Remove similar URLs + ignore_query_parameters (Optional[bool]): Ignore URL parameters + regex_on_full_url (Optional[bool]): Apply regex to full URLs + poll_interval (Optional[int]): Seconds between status checks (default: 2) + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + **kwargs: Additional parameters to pass to the API + + Returns: + CrawlStatusResponse with: + * Crawling status and progress + * Crawled page contents + * Success/error information + + Raises: + Exception: If crawl fails + """ + crawl_params = {} + + # Add individual parameters + if include_paths is not None: + crawl_params['includePaths'] = include_paths + if exclude_paths is not None: + crawl_params['excludePaths'] = exclude_paths + if max_depth is not None: + crawl_params['maxDepth'] = max_depth + if max_discovery_depth is not None: + crawl_params['maxDiscoveryDepth'] = max_discovery_depth + if limit is not None: + crawl_params['limit'] = limit + if allow_backward_links is not None: + crawl_params['allowBackwardLinks'] = allow_backward_links + if allow_external_links is not None: + crawl_params['allowExternalLinks'] = allow_external_links + if ignore_sitemap is not None: + crawl_params['ignoreSitemap'] = ignore_sitemap + if scrape_options is not None: + crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True) + if webhook is not None: + crawl_params['webhook'] = webhook + if deduplicate_similar_urls is not None: + crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls + if ignore_query_parameters is not None: + crawl_params['ignoreQueryParameters'] = ignore_query_parameters + if regex_on_full_url is not None: + crawl_params['regexOnFullURL'] = regex_on_full_url + + # Add any additional kwargs + crawl_params.update(kwargs) + + # Create final params object + 
final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
+        headers = self._prepare_headers(idempotency_key)
+        response = await self._async_post_request(
+            f'{self.api_url}/v1/crawl', params_dict, headers)
+
+        # _async_post_request returns the parsed JSON body and raises on HTTP errors,
+        # so inspect the payload directly instead of treating it as a response object.
+        if response.get('success') and 'id' in response:
+            return await self._async_monitor_job_status(response['id'], headers, poll_interval)
+        else:
+            raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
+
+
+    async def async_crawl_url(
+        self,
+        url: str,
+        *,
+        include_paths: Optional[List[str]] = None,
+        exclude_paths: Optional[List[str]] = None,
+        max_depth: Optional[int] = None,
+        max_discovery_depth: Optional[int] = None,
+        limit: Optional[int] = None,
+        allow_backward_links: Optional[bool] = None,
+        allow_external_links: Optional[bool] = None,
+        ignore_sitemap: Optional[bool] = None,
+        scrape_options: Optional[CommonOptions] = None,
+        webhook: Optional[Union[str, WebhookConfig]] = None,
+        deduplicate_similar_urls: Optional[bool] = None,
+        ignore_query_parameters: Optional[bool] = None,
+        regex_on_full_url: Optional[bool] = None,
+        idempotency_key: Optional[str] = None,
+        **kwargs
+    ) -> CrawlResponse:
+        """
+        Start an asynchronous crawl job.
+
+        Args:
+            url (str): Target URL to start crawling from
+            include_paths (Optional[List[str]]): Patterns of URLs to include
+            exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
+            max_depth (Optional[int]): Maximum crawl depth
+            max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
+            limit (Optional[int]): Maximum pages to crawl
+            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_external_links (Optional[bool]): Follow external domain links
+            ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
+            scrape_options (Optional[CommonOptions]): Page scraping configuration
+            webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
+            deduplicate_similar_urls (Optional[bool]): Remove similar URLs
+            ignore_query_parameters (Optional[bool]): Ignore URL parameters
+            regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            idempotency_key (Optional[str]): Unique key to prevent duplicate requests
+            **kwargs: Additional parameters to pass to the API
+
+        Returns:
+            CrawlResponse with:
+            * success - Whether crawl started successfully
+            * id - Unique identifier for the crawl job
+            * url - Status check URL for the crawl
+            * error - Error message if start failed
+
+        Raises:
+            Exception: If crawl initiation fails
+        """
+        crawl_params = {}
+
+        # Add individual parameters
+        if include_paths is not None:
+            crawl_params['includePaths'] = include_paths
+        if exclude_paths is not None:
+            crawl_params['excludePaths'] = exclude_paths
+        if max_depth is not None:
+            crawl_params['maxDepth'] = max_depth
+        if max_discovery_depth is not None:
+            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
+        if limit is not None:
+            crawl_params['limit'] = limit
+        if allow_backward_links is not None:
+            crawl_params['allowBackwardLinks'] = allow_backward_links
+        if allow_external_links is not None:
+            crawl_params['allowExternalLinks'] = allow_external_links
+        if ignore_sitemap is not None:
+            crawl_params['ignoreSitemap'] = ignore_sitemap
+        if scrape_options is not None:
+            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+        if webhook is not None:
+            crawl_params['webhook'] = webhook
+        if deduplicate_similar_urls is not None:
+            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
+        if ignore_query_parameters is not None:
+            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
+        if regex_on_full_url is not None:
+            crawl_params['regexOnFullURL'] = regex_on_full_url
+
+        # Add any additional kwargs
+        crawl_params.update(kwargs)
+
+        # Create final params object
+        final_params = CrawlParams(**crawl_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
+        headers = self._prepare_headers(idempotency_key)
+        response = await self._async_post_request(
+            f'{self.api_url}/v1/crawl',
+            params_dict,
+            headers
+        )
+
+        # The helper returns the parsed JSON body, so build the response model from it directly.
+        if response.get('success'):
+            return CrawlResponse(**response)
+        else:
+            raise Exception(f'Failed to start crawl job. Error: {response.get("error", response)}')
+
+    async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
+        """
+        Check the status and results of an asynchronous crawl job.
+
+        Args:
+            id (str): Unique identifier for the crawl job
+
+        Returns:
+            CrawlStatusResponse containing:
+            Status Information:
+            * status - Current state (scraping/completed/failed/cancelled)
+            * completed - Number of pages crawled
+            * total - Total pages to crawl
+            * creditsUsed - API credits consumed
+            * expiresAt - Data expiration timestamp
+
+            Results:
+            * data - List of crawled documents
+            * next - URL for next page of results (if paginated)
+            * success - Whether status check succeeded
+            * error - Error message if failed
+
+        Raises:
+            Exception: If status check fails
+        """
+        headers = self._prepare_headers()
+        endpoint = f'/v1/crawl/{id}'
+
+        status_data = await self._async_get_request(
+            f'{self.api_url}{endpoint}',
+            headers
+        )
+
+        if status_data['status'] == 'completed':
+            if 'data' in status_data:
+                data = status_data['data']
+                while 'next' in status_data:
+                    if len(status_data['data']) == 0:
+                        break
+                    next_url = status_data.get('next')
+                    if not next_url:
+                        logger.warning("Expected 'next' URL is missing.")
+                        break
+                    next_data = await self._async_get_request(next_url, headers)
+                    data.extend(next_data.get('data', []))
+                    status_data = next_data
+                status_data['data'] = data
+
+        response = {
+            'status': status_data.get('status'),
+            'total': status_data.get('total'),
+            'completed': status_data.get('completed'),
+            'creditsUsed': status_data.get('creditsUsed'),
+            'expiresAt': status_data.get('expiresAt'),
+            'data': status_data.get('data')
+        }
+
+        if 'error' in status_data:
+            response['error'] = status_data['error']
+
+        if 'next' in status_data:
+            response['next'] = status_data['next']
+
+        return {
+            'success': False if 'error' in status_data else True,
+            **response
+        }
+
+    async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
+        """
+        Monitor the status of an asynchronous job until completion.
+ + Args: + id (str): The ID of the job to monitor + headers (Dict[str, str]): Headers to include in status check requests + poll_interval (int): Seconds between status checks (default: 2) + + Returns: + CrawlStatusResponse: The job results if completed successfully + + Raises: + Exception: If the job fails or an error occurs during status checks + """ + while True: + status_data = await self._async_get_request( + f'{self.api_url}/v1/crawl/{id}', + headers + ) + + if status_data['status'] == 'completed': + if 'data' in status_data: + data = status_data['data'] + while 'next' in status_data: + if len(status_data['data']) == 0: + break + next_url = status_data.get('next') + if not next_url: + logger.warning("Expected 'next' URL is missing.") + break + next_data = await self._async_get_request(next_url, headers) + data.extend(next_data.get('data', [])) + status_data = next_data + status_data['data'] = data + return status_data + else: + raise Exception('Job completed but no data was returned') + elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']: + await asyncio.sleep(max(poll_interval, 2)) + else: + raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}') + + async def map_url( + self, + url: str, + params: Optional[MapParams] = None) -> MapResponse: + """ + Asynchronously map and discover links from a URL. + + Args: + url (str): Target URL to map + params (Optional[MapParams]): See MapParams model: + Discovery Options: + * search - Filter pattern for URLs + * ignoreSitemap - Skip sitemap.xml + * includeSubdomains - Include subdomain links + * sitemapOnly - Only use sitemap.xml + + Limits: + * limit - Max URLs to return + * timeout - Request timeout (ms) + + Returns: + MapResponse with: + * Discovered URLs + * Success/error status + + Raises: + Exception: If mapping fails + """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + json_data['origin'] = f"python-sdk@{version}" + + endpoint = f'/v1/map' + response = await self._async_post_request( + f'{self.api_url}{endpoint}', + json_data, + headers + ) + + if response.get('success') and 'links' in response: + return response + elif 'error' in response: + raise Exception(f'Failed to map URL. Error: {response["error"]}') + else: + raise Exception(f'Failed to map URL. Error: {response}') + + async def extract( + self, + urls: List[str], + params: Optional[ExtractParams] = None) -> ExtractResponse[Any]: + """ + Asynchronously extract structured information from URLs. 
+ + Args: + urls (List[str]): URLs to extract from + params (Optional[ExtractParams]): See ExtractParams model: + Extraction Config: + * prompt - Custom extraction prompt + * schema - JSON schema/Pydantic model + * systemPrompt - System context + + Behavior Options: + * allowExternalLinks - Follow external links + * enableWebSearch - Enable web search + * includeSubdomains - Include subdomains + * showSources - Include source URLs + + Scraping Options: + * scrapeOptions - Page scraping config + + Returns: + ExtractResponse with: + * Structured data matching schema + * Source information if requested + * Success/error status + + Raises: + ValueError: If prompt/schema missing or extraction fails + """ + headers = self._prepare_headers() + + if not params or (not params.get('prompt') and not params.get('schema')): + raise ValueError("Either prompt or schema is required") + + schema = params.get('schema') + if schema: + if hasattr(schema, 'model_json_schema'): + schema = schema.model_json_schema() + + request_data = { + 'urls': urls, + 'allowExternalLinks': params.get('allow_external_links', params.get('allowExternalLinks', False)), + 'enableWebSearch': params.get('enable_web_search', params.get('enableWebSearch', False)), + 'showSources': params.get('show_sources', params.get('showSources', False)), + 'schema': schema, + 'origin': f'python-sdk@{version}' + } + + if params.get('prompt'): + request_data['prompt'] = params['prompt'] + if params.get('system_prompt'): + request_data['systemPrompt'] = params['system_prompt'] + elif params.get('systemPrompt'): + request_data['systemPrompt'] = params['systemPrompt'] + + response = await self._async_post_request( + f'{self.api_url}/v1/extract', + request_data, + headers + ) + + if response.get('success'): + job_id = response.get('id') + if not job_id: + raise Exception('Job ID not returned from extract request.') + + while True: + status_data = await self._async_get_request( + f'{self.api_url}/v1/extract/{job_id}', + headers + ) + + if status_data['status'] == 'completed': + return status_data + elif status_data['status'] in ['failed', 'cancelled']: + raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}') + + await asyncio.sleep(2) + else: + raise Exception(f'Failed to extract. Error: {response.get("error")}') + + async def check_batch_scrape_status(self, id: str) -> BatchScrapeStatusResponse: + """ + Check the status of an asynchronous batch scrape job. 
+ + Args: + id (str): The ID of the batch scrape job + + Returns: + BatchScrapeStatusResponse containing: + Status Information: + * status - Current state (scraping/completed/failed/cancelled) + * completed - Number of URLs scraped + * total - Total URLs to scrape + * creditsUsed - API credits consumed + * expiresAt - Data expiration timestamp + + Results: + * data - List of scraped documents + * next - URL for next page of results (if paginated) + * success - Whether status check succeeded + * error - Error message if failed + + Raises: + Exception: If status check fails + """ + headers = self._prepare_headers() + endpoint = f'/v1/batch/scrape/{id}' + + status_data = await self._async_get_request( + f'{self.api_url}{endpoint}', + headers + ) + + if status_data['status'] == 'completed': + if 'data' in status_data: + data = status_data['data'] + while 'next' in status_data: + if len(status_data['data']) == 0: + break + next_url = status_data.get('next') + if not next_url: + logger.warning("Expected 'next' URL is missing.") + break + next_data = await self._async_get_request(next_url, headers) + data.extend(next_data.get('data', [])) + status_data = next_data + status_data['data'] = data + + response = { + 'status': status_data.get('status'), + 'total': status_data.get('total'), + 'completed': status_data.get('completed'), + 'creditsUsed': status_data.get('creditsUsed'), + 'expiresAt': status_data.get('expiresAt'), + 'data': status_data.get('data') + } + + if 'error' in status_data: + response['error'] = status_data['error'] + + if 'next' in status_data: + response['next'] = status_data['next'] + + return { + 'success': False if 'error' in status_data else True, + **response + } + + async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse: + """ + Get information about errors from an asynchronous batch scrape job. + + Args: + id (str): The ID of the batch scrape job + + Returns: + CrawlErrorsResponse containing: + errors (List[Dict[str, str]]): List of errors with fields: + * id (str): Error ID + * timestamp (str): When the error occurred + * url (str): URL that caused the error + * error (str): Error message + * robotsBlocked (List[str]): List of URLs blocked by robots.txt + + Raises: + Exception: If error check fails + """ + headers = self._prepare_headers() + return await self._async_get_request( + f'{self.api_url}/v1/batch/scrape/{id}/errors', + headers + ) + + async def check_crawl_errors(self, id: str) -> CrawlErrorsResponse: + """ + Get information about errors from an asynchronous crawl job. + + Args: + id (str): The ID of the crawl job + + Returns: + CrawlErrorsResponse containing: + * errors (List[Dict[str, str]]): List of errors with fields: + - id (str): Error ID + - timestamp (str): When the error occurred + - url (str): URL that caused the error + - error (str): Error message + * robotsBlocked (List[str]): List of URLs blocked by robots.txt + + Raises: + Exception: If error check fails + """ + headers = self._prepare_headers() + return await self._async_get_request( + f'{self.api_url}/v1/crawl/{id}/errors', + headers + ) + + async def cancel_crawl(self, id: str) -> Dict[str, Any]: + """ + Cancel an asynchronous crawl job. 
+ + Args: + id (str): The ID of the crawl job to cancel + + Returns: + Dict[str, Any] containing: + * success (bool): Whether cancellation was successful + * error (str, optional): Error message if cancellation failed + + Raises: + Exception: If cancellation fails + """ + headers = self._prepare_headers() + async with aiohttp.ClientSession() as session: + async with session.delete(f'{self.api_url}/v1/crawl/{id}', headers=headers) as response: + return await response.json() + + async def get_extract_status(self, job_id: str) -> ExtractResponse[Any]: + """ + Check the status of an asynchronous extraction job. + + Args: + job_id (str): The ID of the extraction job + + Returns: + ExtractResponse[Any] with: + * success (bool): Whether request succeeded + * data (Optional[Any]): Extracted data matching schema + * error (Optional[str]): Error message if any + * warning (Optional[str]): Warning message if any + * sources (Optional[List[str]]): Source URLs if requested + + Raises: + ValueError: If status check fails + """ + headers = self._prepare_headers() + try: + return await self._async_get_request( + f'{self.api_url}/v1/extract/{job_id}', + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def async_extract( + self, + urls: Optional[List[str]] = None, + *, + prompt: Optional[str] = None, + schema: Optional[Any] = None, + system_prompt: Optional[str] = None, + allow_external_links: Optional[bool] = False, + enable_web_search: Optional[bool] = False, + show_sources: Optional[bool] = False, + agent: Optional[Dict[str, Any]] = None, + idempotency_key: Optional[str] = None) -> ExtractResponse[Any]: + """ + Initiate an asynchronous extraction job without waiting for completion. + + Args: + urls (Optional[List[str]]): URLs to extract from + prompt (Optional[str]): Custom extraction prompt + schema (Optional[Any]): JSON schema/Pydantic model + system_prompt (Optional[str]): System context + allow_external_links (Optional[bool]): Follow external links + enable_web_search (Optional[bool]): Enable web search + show_sources (Optional[bool]): Include source URLs + agent (Optional[Dict[str, Any]]): Agent configuration + idempotency_key (Optional[str]): Unique key to prevent duplicate requests + + Returns: + ExtractResponse[Any] with: + * success (bool): Whether request succeeded + * data (Optional[Any]): Extracted data matching schema + * error (Optional[str]): Error message if any + + Raises: + ValueError: If job initiation fails + """ + headers = self._prepare_headers(idempotency_key) + + if not prompt and not schema: + raise ValueError("Either prompt or schema is required") + + if not urls and not prompt: + raise ValueError("Either urls or prompt is required") + + if schema: + if hasattr(schema, 'model_json_schema'): + schema = schema.model_json_schema() + + request_data = { + 'urls': urls or [], + 'allowExternalLinks': allow_external_links, + 'enableWebSearch': enable_web_search, + 'showSources': show_sources, + 'schema': schema, + 'origin': f'python-sdk@{version}' + } + + if prompt: + request_data['prompt'] = prompt + if system_prompt: + request_data['systemPrompt'] = system_prompt + if agent: + request_data['agent'] = agent + + try: + return await self._async_post_request( + f'{self.api_url}/v1/extract', + request_data, + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def generate_llms_text( + self, + url: str, + *, + max_urls: Optional[int] = None, + show_full_text: Optional[bool] = None, + experimental_stream: Optional[bool] = None) -> 
GenerateLLMsTextStatusResponse:
+        """
+        Generate LLMs.txt for a given URL and monitor until completion.
+
+        Args:
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            GenerateLLMsTextStatusResponse containing:
+            * success (bool): Whether generation completed successfully
+            * status (str): Status of generation (processing/completed/failed)
+            * data (Dict[str, str], optional): Generated text with fields:
+                - llmstxt (str): Generated LLMs.txt content
+                - llmsfulltxt (str, optional): Full version if requested
+            * error (str, optional): Error message if generation failed
+            * expiresAt (str): When the generated data expires
+
+        Raises:
+            Exception: If generation fails
+        """
+        response = await self.async_generate_llms_text(
+            url,
+            max_urls=max_urls,
+            show_full_text=show_full_text,
+            experimental_stream=experimental_stream
+        )
+        if not response.get('success') or 'id' not in response:
+            return response
+
+        job_id = response['id']
+        while True:
+            status = await self.check_generate_llms_text_status(job_id)
+
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise Exception(f'LLMs.txt generation failed. Error: {status.get("error")}')
+            elif status['status'] != 'processing':
+                break
+
+            await asyncio.sleep(2)
+
+        return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
+
+    async def async_generate_llms_text(
+        self,
+        url: str,
+        *,
+        max_urls: Optional[int] = None,
+        show_full_text: Optional[bool] = None,
+        experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
+        """
+        Initiate an asynchronous LLMs.txt generation job without waiting for completion.
+
+        Args:
+            url (str): Target URL to generate LLMs.txt from
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming
+
+        Returns:
+            GenerateLLMsTextResponse containing:
+            * success (bool): Whether job started successfully
+            * id (str): Unique identifier for the job
+            * error (str, optional): Error message if start failed
+
+        Raises:
+            ValueError: If job initiation fails
+        """
+        params = {}
+        if max_urls is not None:
+            params['maxUrls'] = max_urls
+        if show_full_text is not None:
+            params['showFullText'] = show_full_text
+        if experimental_stream is not None:
+            params['__experimental_stream'] = experimental_stream
+
+        headers = self._prepare_headers()
+        # params is a plain dict here, so spread it directly rather than calling .dict() on it.
+        json_data = {'url': url, **params}
+        json_data['origin'] = f"python-sdk@{version}"
+
+        try:
+            return await self._async_post_request(
+                f'{self.api_url}/v1/llmstxt',
+                json_data,
+                headers
+            )
+        except Exception as e:
+            raise ValueError(str(e))
+
+    async def check_generate_llms_text_status(self, id: str) -> GenerateLLMsTextStatusResponse:
+        """
+        Check the status of an asynchronous LLMs.txt generation job.
+ + Args: + id (str): The ID of the generation job + + Returns: + GenerateLLMsTextStatusResponse containing: + * success (bool): Whether generation completed successfully + * status (str): Status of generation (processing/completed/failed) + * data (Dict[str, str], optional): Generated text with fields: + - llmstxt (str): Generated LLMs.txt content + - llmsfulltxt (str, optional): Full version if requested + * error (str, optional): Error message if generation failed + * expiresAt (str): When the generated data expires + + Raises: + ValueError: If status check fails + """ + headers = self._prepare_headers() + try: + return await self._async_get_request( + f'{self.api_url}/v1/llmstxt/{id}', + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def deep_research( + self, + query: str, + *, + max_depth: Optional[int] = None, + time_limit: Optional[int] = None, + max_urls: Optional[int] = None, + analysis_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, + __experimental_stream_steps: Optional[bool] = None, + on_activity: Optional[Callable[[Dict[str, Any]], None]] = None, + on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse: + """ + Initiates a deep research operation on a given query and polls until completion. + + Args: + query (str): Research query or topic to investigate + max_depth (Optional[int]): Maximum depth of research exploration + time_limit (Optional[int]): Time limit in seconds for research + max_urls (Optional[int]): Maximum number of URLs to process + analysis_prompt (Optional[str]): Custom prompt for analysis + system_prompt (Optional[str]): Custom system prompt + __experimental_stream_steps (Optional[bool]): Enable experimental streaming + on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth} + on_source (Optional[Callable]): Source discovery callback receiving {url, title, description} + + Returns: + DeepResearchStatusResponse containing: + * success (bool): Whether research completed successfully + * status (str): Current state (processing/completed/failed) + * error (Optional[str]): Error message if failed + * id (str): Unique identifier for the research job + * data (Any): Research findings and analysis + * sources (List[Dict]): List of discovered sources + * activities (List[Dict]): Research progress log + * summaries (List[str]): Generated research summaries + + Raises: + Exception: If research fails + """ + research_params = {} + if max_depth is not None: + research_params['maxDepth'] = max_depth + if time_limit is not None: + research_params['timeLimit'] = time_limit + if max_urls is not None: + research_params['maxUrls'] = max_urls + if analysis_prompt is not None: + research_params['analysisPrompt'] = analysis_prompt + if system_prompt is not None: + research_params['systemPrompt'] = system_prompt + if __experimental_stream_steps is not None: + research_params['__experimental_streamSteps'] = __experimental_stream_steps + research_params = DeepResearchParams(**research_params) + + response = await self.async_deep_research( + query, + max_depth=max_depth, + time_limit=time_limit, + max_urls=max_urls, + analysis_prompt=analysis_prompt, + system_prompt=system_prompt + ) + if not response.get('success') or 'id' not in response: + return response + + job_id = response['id'] + last_activity_count = 0 + last_source_count = 0 + + while True: + status = await self.check_deep_research_status(job_id) + + if on_activity and 'activities' in status: + 
new_activities = status['activities'][last_activity_count:] + for activity in new_activities: + on_activity(activity) + last_activity_count = len(status['activities']) + + if on_source and 'sources' in status: + new_sources = status['sources'][last_source_count:] + for source in new_sources: + on_source(source) + last_source_count = len(status['sources']) + + if status['status'] == 'completed': + return status + elif status['status'] == 'failed': + raise Exception(f'Deep research failed. Error: {status.get("error")}') + elif status['status'] != 'processing': + break + + await asyncio.sleep(2) + + return {'success': False, 'error': 'Deep research job terminated unexpectedly'} + + async def async_deep_research( + self, + query: str, + *, + max_depth: Optional[int] = None, + time_limit: Optional[int] = None, + max_urls: Optional[int] = None, + analysis_prompt: Optional[str] = None, + system_prompt: Optional[str] = None, + __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]: + """ + Initiates an asynchronous deep research operation. + + Args: + query (str): Research query or topic to investigate + max_depth (Optional[int]): Maximum depth of research exploration + time_limit (Optional[int]): Time limit in seconds for research + max_urls (Optional[int]): Maximum number of URLs to process + analysis_prompt (Optional[str]): Custom prompt for analysis + system_prompt (Optional[str]): Custom system prompt + __experimental_stream_steps (Optional[bool]): Enable experimental streaming + + Returns: + Dict[str, Any]: A response containing: + * success (bool): Whether the research initiation was successful + * id (str): The unique identifier for the research job + * error (str, optional): Error message if initiation failed + + Raises: + Exception: If the research initiation fails. + """ + research_params = {} + if max_depth is not None: + research_params['maxDepth'] = max_depth + if time_limit is not None: + research_params['timeLimit'] = time_limit + if max_urls is not None: + research_params['maxUrls'] = max_urls + if analysis_prompt is not None: + research_params['analysisPrompt'] = analysis_prompt + if system_prompt is not None: + research_params['systemPrompt'] = system_prompt + if __experimental_stream_steps is not None: + research_params['__experimental_streamSteps'] = __experimental_stream_steps + research_params = DeepResearchParams(**research_params) + + headers = self._prepare_headers() + + json_data = {'query': query, **research_params.dict(exclude_none=True)} + json_data['origin'] = f"python-sdk@{version}" + + try: + return await self._async_post_request( + f'{self.api_url}/v1/deep-research', + json_data, + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse: + """ + Check the status of a deep research operation. + + Args: + id (str): The ID of the deep research operation. + + Returns: + DeepResearchResponse containing: + + Status: + * success - Whether research completed successfully + * status - Current state (processing/completed/failed) + * error - Error message if failed + + Results: + * id - Unique identifier for the research job + * data - Research findings and analysis + * sources - List of discovered sources + * activities - Research progress log + * summaries - Generated research summaries + + Raises: + Exception: If the status check fails. 
+ """ + headers = self._prepare_headers() + try: + return await self._async_get_request( + f'{self.api_url}/v1/deep-research/{id}', + headers + ) + except Exception as e: + raise ValueError(str(e)) + + async def search( + self, + query: str, + *, + limit: Optional[int] = None, + tbs: Optional[str] = None, + filter: Optional[str] = None, + lang: Optional[str] = None, + country: Optional[str] = None, + location: Optional[str] = None, + timeout: Optional[int] = None, + scrape_options: Optional[CommonOptions] = None, + params: Optional[Union[Dict[str, Any], SearchParams]] = None, + **kwargs) -> SearchResponse: + """ + Asynchronously search for content using Firecrawl. + + Args: + query (str): Search query string + limit (Optional[int]): Max results (default: 5) + tbs (Optional[str]): Time filter (e.g. "qdr:d") + filter (Optional[str]): Custom result filter + lang (Optional[str]): Language code (default: "en") + country (Optional[str]): Country code (default: "us") + location (Optional[str]): Geo-targeting + timeout (Optional[int]): Request timeout in milliseconds + scrape_options (Optional[CommonOptions]): Result scraping configuration + params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters + **kwargs: Additional keyword arguments for future compatibility + + Returns: + SearchResponse: Response containing: + * success (bool): Whether request succeeded + * data (List[FirecrawlDocument]): Search results + * warning (Optional[str]): Warning message if any + * error (Optional[str]): Error message if any + + Raises: + Exception: If search fails or response cannot be parsed + """ + # Build search parameters + search_params = {} + if params: + if isinstance(params, dict): + search_params.update(params) + else: + search_params.update(params.dict(exclude_none=True)) + + # Add individual parameters + if limit is not None: + search_params['limit'] = limit + if tbs is not None: + search_params['tbs'] = tbs + if filter is not None: + search_params['filter'] = filter + if lang is not None: + search_params['lang'] = lang + if country is not None: + search_params['country'] = country + if location is not None: + search_params['location'] = location + if timeout is not None: + search_params['timeout'] = timeout + if scrape_options is not None: + search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True) + + # Add any additional kwargs + search_params.update(kwargs) + + # Create final params object + final_params = SearchParams(query=query, **search_params) + params_dict = final_params.dict(exclude_none=True) + params_dict['origin'] = f"python-sdk@{version}" + + return await self._async_post_request( + f"{self.api_url}/v1/search", + params_dict, + {"Authorization": f"Bearer {self.api_key}"} + ) + +class AsyncCrawlWatcher(CrawlWatcher): + """ + Async version of CrawlWatcher that properly handles async operations. + """ + def __init__(self, id: str, app: AsyncFirecrawlApp): + super().__init__(id, app) + + async def connect(self) -> None: + """ + Establishes async WebSocket connection and starts listening for messages. + """ + async with websockets.connect( + self.ws_url, + additional_headers=[("Authorization", f"Bearer {self.app.api_key}")] + ) as websocket: + await self._listen(websocket) + + async def _listen(self, websocket) -> None: + """ + Listens for incoming WebSocket messages and handles them asynchronously. 
+ + Args: + websocket: The WebSocket connection object + """ + async for message in websocket: + msg = json.loads(message) + await self._handle_message(msg) + + async def _handle_message(self, msg: Dict[str, Any]) -> None: + """ + Handles incoming WebSocket messages based on their type asynchronously. + + Args: + msg (Dict[str, Any]): The message to handle + """ + if msg['type'] == 'done': + self.status = 'completed' + self.dispatch_event('done', {'status': self.status, 'data': self.data, 'id': self.id}) + elif msg['type'] == 'error': + self.status = 'failed' + self.dispatch_event('error', {'status': self.status, 'data': self.data, 'error': msg['error'], 'id': self.id}) + elif msg['type'] == 'catchup': + self.status = msg['data']['status'] + self.data.extend(msg['data'].get('data', [])) + for doc in self.data: + self.dispatch_event('document', {'data': doc, 'id': self.id}) + elif msg['type'] == 'document': + self.data.append(msg['data']) + self.dispatch_event('document', {'data': msg['data'], 'id': self.id}) + + async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None: + """ + Handle errors from async API responses. + """ + try: + error_data = await response.json() + error_message = error_data.get('error', 'No error message provided.') + error_details = error_data.get('details', 'No additional error details provided.') + except: + raise aiohttp.ClientError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}') + + # Use the app's method to get the error message + message = await self.app._get_async_error_message(response.status, action, error_message, error_details) + + raise aiohttp.ClientError(message) + + async def _get_async_error_message(self, status_code: int, action: str, error_message: str, error_details: str) -> str: + """ + Generate a standardized error message based on HTTP status code for async operations. + + Args: + status_code (int): The HTTP status code from the response + action (str): Description of the action that was being performed + error_message (str): The error message from the API response + error_details (str): Additional error details from the API response + + Returns: + str: A formatted error message + """ + return self._get_error_message(status_code, action, error_message, error_details) diff --git a/apps/python-sdk/pyproject.toml b/apps/python-sdk/pyproject.toml index 5a87d8c5..0483c31c 100644 --- a/apps/python-sdk/pyproject.toml +++ b/apps/python-sdk/pyproject.toml @@ -13,7 +13,8 @@ dependencies = [ "python-dotenv", "websockets", "nest-asyncio", - "pydantic>=2.10.3", + "pydantic", + "aiohttp" ] authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}] maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}] diff --git a/apps/python-sdk/requirements.txt b/apps/python-sdk/requirements.txt index 5dcd8f6c..360d9e76 100644 --- a/apps/python-sdk/requirements.txt +++ b/apps/python-sdk/requirements.txt @@ -3,4 +3,5 @@ pytest python-dotenv websockets nest-asyncio -pydantic \ No newline at end of file +pydantic +aiohttp \ No newline at end of file diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py index 8a67d1fd..1fb31664 100644 --- a/apps/python-sdk/setup.py +++ b/apps/python-sdk/setup.py @@ -32,7 +32,9 @@ setup( 'python-dotenv', 'websockets', 'asyncio', - 'nest-asyncio' + 'nest-asyncio', + 'pydantic', + 'aiohttp' ], python_requires=">=3.8", classifiers=[