Merge branch 'sdk-improv/async' of https://github.com/mendableai/firecrawl into sdk-improv/async

Commit 55c04d615e
@@ -1,53 +1,45 @@
-import time
-import nest_asyncio
-import uuid
-from firecrawl.firecrawl import FirecrawlApp
+from firecrawl.firecrawl import ExtractConfig, FirecrawlApp
 from pydantic import BaseModel, Field
 from typing import List
+import time
+
+app = FirecrawlApp(api_url="https://api.firecrawl.dev")

-app = FirecrawlApp(api_key="fc-")
-# Scrape a website:
-scrape_result = app.scrape_url('firecrawl.dev')
-print(scrape_result['markdown'])
+# # Scrape a website:
+scrape_result = app.scrape_url('example.com', formats=["markdown", "html"])
+print(scrape_result.markdown)


-# Test batch scrape
+# # Test batch scrapeq
 urls = ['https://example.com', 'https://docs.firecrawl.dev']
-batch_scrape_params = {
-    'formats': ['markdown', 'html'],
-}

 # Synchronous batch scrape
-batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
+batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"])
 print("Synchronous Batch Scrape Result:")
-print(batch_result['data'][0]['markdown'])
+print(batch_result.data[0].markdown)

-# Asynchronous batch scrape
-async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
+# # Asynchronous batch scrape
+async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
 print("\nAsynchronous Batch Scrape Result:")
 print(async_batch_result)

 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
-print(crawl_result)
+crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
+print(crawl_result.data[0].markdown)

-# Asynchronous Crawl a website:
-async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+# # Asynchronous Crawl a website:
+async_result = app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
 print(async_result)

-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)

 attempts = 15
-while attempts > 0 and crawl_status['status'] != 'completed':
+while attempts > 0 and crawl_status.status != 'completed':
     print(crawl_status)
-    crawl_status = app.check_crawl_status(async_result['id'])
+    crawl_status = app.check_crawl_status(async_result.id)
     attempts -= 1
     time.sleep(1)

-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)

 # LLM Extraction:
@@ -61,14 +53,11 @@ class ArticleSchema(BaseModel):
 class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., description="Top 5 stories")

-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
-        'schema': TopArticlesSchema.model_json_schema()
-    }
-})
-
-print(llm_extraction_result['extract'])
+extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
+
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+
+print(llm_extraction_result.extract)

 # # Define schema to extract contents into using json schema
 json_schema = {
@@ -94,24 +83,16 @@ json_schema = {
     "required": ["top"]
 }

-app2 = FirecrawlApp(api_key="fc-", version="v0")
-
-llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': json_schema,
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
-    }
-})
+extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True})
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+
+print(llm_extraction_result.extract)

 # print(llm_extraction_result['llm_extraction'])


 # Map a website:
-map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
+map_result = app.map_url('https://firecrawl.dev', search="blog")
 print(map_result)

 # Extract URLs:
@@ -124,14 +105,12 @@ class ExtractSchema(BaseModel):
 extract_schema = ExtractSchema.schema()

 # Perform the extraction
-extract_result = app.extract(['https://firecrawl.dev'], {
-    'prompt': "Extract the title, description, and links from the website",
-    'schema': extract_schema
-})
+extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
 print(extract_result)

 # Crawl a website with WebSockets:
 # inside an async function...
+import nest_asyncio
 nest_asyncio.apply()

 # Define event handlers
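
For reference, a minimal sketch of the keyword-argument call style the examples are moved to above. It assumes a reachable Firecrawl endpoint and reuses only calls shown in this diff; the URLs and the schema fields are illustrative.

from firecrawl.firecrawl import ExtractConfig, FirecrawlApp
from pydantic import BaseModel, Field
from typing import List

# Point the client at an API endpoint (the old style passed api_key="fc-" instead).
app = FirecrawlApp(api_url="https://api.firecrawl.dev")

# Options are plain keyword arguments now, not a params dict.
scrape_result = app.scrape_url('example.com', formats=["markdown", "html"])
print(scrape_result.markdown)  # responses expose attributes instead of dict keys

class ArticleSchema(BaseModel):
    title: str
    points: int

class TopArticlesSchema(BaseModel):
    top: List[ArticleSchema] = Field(..., description="Top 5 stories")

# Structured extraction is configured through an ExtractConfig object.
extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
print(llm_extraction_result.extract)
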
@@ -6,51 +6,47 @@ from firecrawl.firecrawl import AsyncFirecrawlApp
 from pydantic import BaseModel, Field
 from typing import List

-app = AsyncFirecrawlApp(api_key="fc-")
+app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev")

 async def example_scrape():
     # Scrape a website:
-    scrape_result = await app.scrape_url('firecrawl.dev')
-    print(scrape_result['markdown'])
+    scrape_result = await app.scrape_url('example.com', formats=["markdown", "html"])
+    print(scrape_result.markdown)

 async def example_batch_scrape():
     # Batch scrape
     urls = ['https://example.com', 'https://docs.firecrawl.dev']
-    batch_scrape_params = {
-        'formats': ['markdown', 'html'],
-    }

     # Synchronous batch scrape
-    batch_result = await app.batch_scrape_urls(urls, batch_scrape_params)
+    batch_result = await app.batch_scrape_urls(urls, formats=["markdown", "html"])
     print("Synchronous Batch Scrape Result:")
-    print(batch_result['data'][0]['markdown'])
+    print(batch_result.data[0].markdown)

     # Asynchronous batch scrape
-    async_batch_result = await app.async_batch_scrape_urls(urls, batch_scrape_params)
+    async_batch_result = await app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
     print("\nAsynchronous Batch Scrape Result:")
     print(async_batch_result)

 async def example_crawl():
     # Crawl a website:
-    idempotency_key = str(uuid.uuid4()) # optional idempotency key
-    crawl_result = await app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
-    print(crawl_result)
+    crawl_result = await app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
+    print(crawl_result.data[0].markdown)

     # Asynchronous Crawl a website:
-    async_result = await app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+    async_result = await app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
     print(async_result)

-    crawl_status = await app.check_crawl_status(async_result['id'])
+    crawl_status = await app.check_crawl_status(async_result.id)
     print(crawl_status)

     attempts = 15
-    while attempts > 0 and crawl_status['status'] != 'completed':
+    while attempts > 0 and crawl_status.status != 'completed':
         print(crawl_status)
-        crawl_status = await app.check_crawl_status(async_result['id'])
+        crawl_status = await app.check_crawl_status(async_result.id)
         attempts -= 1
         await asyncio.sleep(1) # Use async sleep instead of time.sleep

-    crawl_status = await app.check_crawl_status(async_result['id'])
+    crawl_status = await app.check_crawl_status(async_result.id)
     print(crawl_status)

 async def example_llm_extraction():
@@ -64,18 +60,15 @@ async def example_llm_extraction():
     class TopArticlesSchema(BaseModel):
         top: List[ArticleSchema] = Field(..., description="Top 5 stories")

-    llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', {
-        'formats': ['extract'],
-        'extract': {
-            'schema': TopArticlesSchema.model_json_schema()
-        }
-    })
-
-    print(llm_extraction_result['extract'])
+    extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
+
+    llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+
+    print(llm_extraction_result.extract)

 async def example_map_and_extract():
     # Map a website:
-    map_result = await app.map_url('https://firecrawl.dev', { 'search': 'blog' })
+    map_result = await app.map_url('https://firecrawl.dev', search="blog")
     print(map_result)

     # Extract URLs:
@@ -88,10 +81,7 @@ async def example_map_and_extract():
     extract_schema = ExtractSchema.schema()

     # Perform the extraction
-    extract_result = await app.extract(['https://firecrawl.dev'], {
-        'prompt': "Extract the title, description, and links from the website",
-        'schema': extract_schema
-    })
+    extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
     print(extract_result)

 # Define event handlers for websocket
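
A short sketch of how the updated async examples can be driven; the example_* coroutine names come from the diff above, and the asyncio.run entry point is illustrative.

import asyncio

async def main():
    # Same keyword-argument style as the synchronous client, awaited on AsyncFirecrawlApp.
    await example_scrape()
    await example_batch_scrape()
    await example_crawl()

if __name__ == "__main__":
    asyncio.run(main())
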
@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp # noqa

-__version__ = "1.17.0"
+__version__ = "2.0.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
@@ -3648,12 +3648,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
            job_id (str): The ID of the extraction job

        Returns:
-            ExtractResponse containing:
-            * success (bool): Whether extraction completed successfully
-            * data (Any): Extracted structured data
-            * error (str, optional): Error message if extraction failed
-            * warning (str, optional): Warning message if any
-            * sources (List[str], optional): Source URLs if requested
+            ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any
+            * warning (Optional[str]): Warning message if any
+            * sources (Optional[List[str]]): Source URLs if requested

        Raises:
            ValueError: If status check fails
@@ -3669,54 +3669,67 @@ class AsyncFirecrawlApp(FirecrawlApp):

    async def async_extract(
            self,
-            urls: List[str],
-            params: Optional[ExtractParams] = None,
+            urls: Optional[List[str]] = None,
+            *,
+            prompt: Optional[str] = None,
+            schema: Optional[Any] = None,
+            system_prompt: Optional[str] = None,
+            allow_external_links: Optional[bool] = False,
+            enable_web_search: Optional[bool] = False,
+            show_sources: Optional[bool] = False,
+            agent: Optional[Dict[str, Any]] = None,
            idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
        """
        Initiate an asynchronous extraction job without waiting for completion.

        Args:
-            urls (List[str]): URLs to extract information from
-            params (Optional[ExtractParams]): See ExtractParams model:
-                Extraction Config:
-                * prompt - Custom extraction prompt
-                * schema - JSON schema/Pydantic model
-                * systemPrompt - System context
-
-                Behavior Options:
-                * allowExternalLinks - Follow external links
-                * enableWebSearch - Enable web search
-                * includeSubdomains - Include subdomains
-                * showSources - Include source URLs
-
-                Scraping Options:
-                * scrapeOptions - Page scraping config
+            urls (Optional[List[str]]): URLs to extract from
+            prompt (Optional[str]): Custom extraction prompt
+            schema (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
            idempotency_key (Optional[str]): Unique key to prevent duplicate requests

        Returns:
-            ExtractResponse containing:
-            * success (bool): Whether job started successfully
-            * id (str): Unique identifier for the job
-            * error (str, optional): Error message if start failed
+            ExtractResponse[Any] with:
+            * success (bool): Whether request succeeded
+            * data (Optional[Any]): Extracted data matching schema
+            * error (Optional[str]): Error message if any

        Raises:
            ValueError: If job initiation fails
        """
        headers = self._prepare_headers(idempotency_key)

-        schema = params.get('schema') if params else None
+        if not prompt and not schema:
+            raise ValueError("Either prompt or schema is required")
+
+        if not urls and not prompt:
+            raise ValueError("Either urls or prompt is required")
+
        if schema:
            if hasattr(schema, 'model_json_schema'):
                schema = schema.model_json_schema()

-        jsonData = {'urls': urls, **(params or {})}
        request_data = {
-            **jsonData,
-            'allowExternalLinks': params.get('allow_external_links', False) if params else False,
+            'urls': urls or [],
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
            'schema': schema,
            'origin': f'python-sdk@{version}'
        }

+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+        if agent:
+            request_data['agent'] = agent

        try:
            return await self._async_post_request(
                f'{self.api_url}/v1/extract',
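
A hedged usage sketch for the reworked async_extract signature above: urls stays optional, everything else is keyword-only, and either prompt or schema must be supplied. The URL, prompt, and option values are illustrative.

from firecrawl.firecrawl import AsyncFirecrawlApp

app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev")  # or api_key="fc-" for the hosted service

async def start_extract_job():
    job = await app.async_extract(
        ['https://firecrawl.dev'],
        prompt="Extract the page title and a one-line description",
        allow_external_links=False,
        show_sources=True,
    )
    return job  # ExtractResponse, see the Returns section above
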
@@ -3729,16 +3742,18 @@ class AsyncFirecrawlApp(FirecrawlApp):
    async def generate_llms_text(
            self,
            url: str,
-            params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse:
+            *,
+            max_urls: Optional[int] = None,
+            show_full_text: Optional[bool] = None,
+            experimental_stream: Optional[bool] = None) -> GenerateLLMsTextStatusResponse:
        """
        Generate LLMs.txt for a given URL and monitor until completion.

        Args:
            url (str): Target URL to generate LLMs.txt from
-            params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): See GenerateLLMsTextParams model:
-                Generation Options:
-                * maxUrls - Maximum URLs to process (default: 10)
-                * showFullText - Include full text in output (default: False)
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming

        Returns:
            GenerateLLMsTextStatusResponse containing:
@@ -3753,15 +3768,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
        Raises:
            Exception: If generation fails
        """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            generation_params = GenerateLLMsTextParams(**params)
-        else:
-            generation_params = params
-
-        response = await self.async_generate_llms_text(url, generation_params)
+        params = {}
+        if max_urls is not None:
+            params['maxUrls'] = max_urls
+        if show_full_text is not None:
+            params['showFullText'] = show_full_text
+        if experimental_stream is not None:
+            params['__experimental_stream'] = experimental_stream
+
+        response = await self.async_generate_llms_text(url, params)
        if not response.get('success') or 'id' not in response:
            return response
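
A usage sketch for the new keyword-only generate_llms_text parameters; the URL and limits are illustrative.

async def llms_txt_example(app):
    # Polls until the generation job completes, per the docstring above.
    result = await app.generate_llms_text(
        'https://firecrawl.dev',
        max_urls=5,
        show_full_text=True,
    )
    print(result)
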
@@ -3783,36 +3798,38 @@ class AsyncFirecrawlApp(FirecrawlApp):
    async def async_generate_llms_text(
            self,
            url: str,
-            params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse:
+            *,
+            max_urls: Optional[int] = None,
+            show_full_text: Optional[bool] = None,
+            experimental_stream: Optional[bool] = None) -> GenerateLLMsTextResponse:
        """
        Initiate an asynchronous LLMs.txt generation job without waiting for completion.

        Args:
            url (str): Target URL to generate LLMs.txt from
-            params (Optional[Union[Dict[str, Any], GenerateLLMsTextParams]]): See GenerateLLMsTextParams model:
-                Generation Options:
-                * maxUrls - Maximum URLs to process (default: 10)
-                * showFullText - Include full text in output (default: False)
+            max_urls (Optional[int]): Maximum URLs to process (default: 10)
+            show_full_text (Optional[bool]): Include full text in output (default: False)
+            experimental_stream (Optional[bool]): Enable experimental streaming

        Returns:
            GenerateLLMsTextResponse containing:
            * success (bool): Whether job started successfully
            * id (str): Unique identifier for the job
            * error (str, optional): Error message if start failed

        Raises:
            ValueError: If job initiation fails
        """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            generation_params = GenerateLLMsTextParams(**params)
-        else:
-            generation_params = params
+        params = {}
+        if max_urls is not None:
+            params['maxUrls'] = max_urls
+        if show_full_text is not None:
+            params['showFullText'] = show_full_text
+        if experimental_stream is not None:
+            params['__experimental_stream'] = experimental_stream

        headers = self._prepare_headers()
-        json_data = {'url': url, **generation_params.dict(exclude_none=True)}
+        json_data = {'url': url, **params.dict(exclude_none=True)}
        json_data['origin'] = f"python-sdk@{version}"

        try:
@@ -3856,52 +3873,57 @@ class AsyncFirecrawlApp(FirecrawlApp):
    async def deep_research(
            self,
            query: str,
-            params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None,
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None,
            on_activity: Optional[Callable[[Dict[str, Any]], None]] = None,
            on_source: Optional[Callable[[Dict[str, Any]], None]] = None) -> DeepResearchStatusResponse:
        """
-        Initiates a deep research operation on a given query and polls until completion, providing real-time updates via callbacks.
+        Initiates a deep research operation on a given query and polls until completion.

        Args:
-            query: Research query or topic to investigate
-            params: See DeepResearchParams model:
-                Research Settings:
-                * maxDepth - Maximum research depth (default: 7)
-                * timeLimit - Time limit in seconds (default: 270)
-                * maxUrls - Maximum URLs to process (default: 20)
-
-                Callbacks:
-                * on_activity - Progress callback receiving:
-                    {type, status, message, timestamp, depth}
-                * on_source - Source discovery callback receiving:
-                    {url, title, description}
+            query (str): Research query or topic to investigate
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming
+            on_activity (Optional[Callable]): Progress callback receiving {type, status, message, timestamp, depth}
+            on_source (Optional[Callable]): Source discovery callback receiving {url, title, description}

        Returns:
-            DeepResearchResponse containing:
-
-                Status:
-                * success - Whether research completed successfully
-                * status - Current state (processing/completed/failed)
-                * error - Error message if failed
-
-                Results:
-                * id - Unique identifier for the research job
-                * data - Research findings and analysis
-                * sources - List of discovered sources
-                * activities - Research progress log
-                * summaries - Generated research summaries
+            DeepResearchStatusResponse containing:
+            * success (bool): Whether research completed successfully
+            * status (str): Current state (processing/completed/failed)
+            * error (Optional[str]): Error message if failed
+            * id (str): Unique identifier for the research job
+            * data (Any): Research findings and analysis
+            * sources (List[Dict]): List of discovered sources
+            * activities (List[Dict]): Research progress log
+            * summaries (List[str]): Generated research summaries

        Raises:
            Exception: If research fails
        """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            research_params = DeepResearchParams(**params)
-        else:
-            research_params = params
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)

        response = await self.async_deep_research(query, research_params)
        if not response.get('success') or 'id' not in response:
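
A sketch of the new deep_research call with keyword options and an activity callback; the query, limits, and callback body are illustrative.

async def research_example(app):
    def on_activity(activity):
        # Receives {type, status, message, timestamp, depth} per the docstring above.
        print(activity.get('message'))

    result = await app.deep_research(
        "What are the latest developments in web scraping APIs?",
        max_depth=3,
        time_limit=120,
        max_urls=10,
        on_activity=on_activity,
    )
    print(result)
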
@@ -3940,38 +3962,54 @@ class AsyncFirecrawlApp(FirecrawlApp):
    async def async_deep_research(
            self,
            query: str,
-            params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> DeepResearchResponse:
+            *,
+            max_depth: Optional[int] = None,
+            time_limit: Optional[int] = None,
+            max_urls: Optional[int] = None,
+            analysis_prompt: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            __experimental_stream_steps: Optional[bool] = None) -> Dict[str, Any]:
        """
-        Initiate an asynchronous deep research job without waiting for completion.
+        Initiates an asynchronous deep research operation.

        Args:
            query (str): Research query or topic to investigate
-            params (Optional[Union[Dict[str, Any], DeepResearchParams]]): See DeepResearchParams model:
-                Research Settings:
-                * maxDepth - Maximum research depth (default: 7)
-                * timeLimit - Time limit in seconds (default: 270)
-                * maxUrls - Maximum URLs to process (default: 20)
+            max_depth (Optional[int]): Maximum depth of research exploration
+            time_limit (Optional[int]): Time limit in seconds for research
+            max_urls (Optional[int]): Maximum number of URLs to process
+            analysis_prompt (Optional[str]): Custom prompt for analysis
+            system_prompt (Optional[str]): Custom system prompt
+            __experimental_stream_steps (Optional[bool]): Enable experimental streaming

        Returns:
-            DeepResearchResponse containing:
-            * success (bool): Whether job started successfully
-            * id (str): Unique identifier for the job
-            * error (str, optional): Error message if start failed
+            Dict[str, Any]: A response containing:
+            * success (bool): Whether the research initiation was successful
+            * id (str): The unique identifier for the research job
+            * error (str, optional): Error message if initiation failed

        Raises:
-            ValueError: If job initiation fails
+            Exception: If the research initiation fails.
        """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            research_params = DeepResearchParams(**params)
-        else:
-            research_params = params
+        research_params = {}
+        if max_depth is not None:
+            research_params['maxDepth'] = max_depth
+        if time_limit is not None:
+            research_params['timeLimit'] = time_limit
+        if max_urls is not None:
+            research_params['maxUrls'] = max_urls
+        if analysis_prompt is not None:
+            research_params['analysisPrompt'] = analysis_prompt
+        if system_prompt is not None:
+            research_params['systemPrompt'] = system_prompt
+        if __experimental_stream_steps is not None:
+            research_params['__experimental_streamSteps'] = __experimental_stream_steps
+        research_params = DeepResearchParams(**research_params)

        headers = self._prepare_headers()

        json_data = {'query': query, **research_params.dict(exclude_none=True)}
        json_data['origin'] = f"python-sdk@{version}"

        try:
            return await self._async_post_request(
                f'{self.api_url}/v1/deep-research',
@@ -3983,26 +4021,28 @@ class AsyncFirecrawlApp(FirecrawlApp):

    async def check_deep_research_status(self, id: str) -> DeepResearchStatusResponse:
        """
-        Check the status of an asynchronous deep research job.
+        Check the status of a deep research operation.

        Args:
-            id (str): The ID of the research job
+            id (str): The ID of the deep research operation.

        Returns:
-            DeepResearchStatusResponse containing:
-            * success (bool): Whether research completed successfully
-            * status (str): Current state (processing/completed/failed)
-            * data (Dict[str, Any], optional): Research findings and analysis
-            * error (str, optional): Error message if failed
-            * expiresAt (str): When the research data expires
-            * currentDepth (int): Current research depth
-            * maxDepth (int): Maximum research depth
-            * activities (List[Dict[str, Any]]): Research progress log
-            * sources (List[Dict[str, Any]]): Discovered sources
-            * summaries (List[str]): Generated research summaries
+            DeepResearchResponse containing:
+
+                Status:
+                * success - Whether research completed successfully
+                * status - Current state (processing/completed/failed)
+                * error - Error message if failed
+
+                Results:
+                * id - Unique identifier for the research job
+                * data - Research findings and analysis
+                * sources - List of discovered sources
+                * activities - Research progress log
+                * summaries - Generated research summaries

        Raises:
-            ValueError: If status check fails
+            Exception: If the status check fails.
        """
        headers = self._prepare_headers()
        try:
@@ -4016,52 +4056,80 @@ class AsyncFirecrawlApp(FirecrawlApp):
    async def search(
            self,
            query: str,
-            params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse:
+            *,
+            limit: Optional[int] = None,
+            tbs: Optional[str] = None,
+            filter: Optional[str] = None,
+            lang: Optional[str] = None,
+            country: Optional[str] = None,
+            location: Optional[str] = None,
+            timeout: Optional[int] = None,
+            scrape_options: Optional[CommonOptions] = None,
+            params: Optional[Union[Dict[str, Any], SearchParams]] = None,
+            **kwargs) -> SearchResponse:
        """
        Asynchronously search for content using Firecrawl.

        Args:
            query (str): Search query string
-            params (Optional[Union[Dict[str, Any], SearchParams]]): See SearchParams model:
-                Search Options:
-                * limit - Max results (default: 5)
-                * tbs - Time filter (e.g. "qdr:d")
-                * filter - Custom result filter
-
-                Localization:
-                * lang - Language code (default: "en")
-                * country - Country code (default: "us")
-                * location - Geo-targeting
-
-                Request Options:
-                * timeout - Request timeout (ms)
-                * scrapeOptions - Result scraping config
+            limit (Optional[int]): Max results (default: 5)
+            tbs (Optional[str]): Time filter (e.g. "qdr:d")
+            filter (Optional[str]): Custom result filter
+            lang (Optional[str]): Language code (default: "en")
+            country (Optional[str]): Country code (default: "us")
+            location (Optional[str]): Geo-targeting
+            timeout (Optional[int]): Request timeout in milliseconds
+            scrape_options (Optional[CommonOptions]): Result scraping configuration
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
+            **kwargs: Additional keyword arguments for future compatibility

        Returns:
-            SearchResponse containing:
-            * success (bool): Whether search completed successfully
+            SearchResponse: Response containing:
+            * success (bool): Whether request succeeded
            * data (List[FirecrawlDocument]): Search results
-            * warning (str, optional): Warning message if any
-            * error (str, optional): Error message if search failed
+            * warning (Optional[str]): Warning message if any
+            * error (Optional[str]): Error message if any

        Raises:
-            Exception: If search fails
+            Exception: If search fails or response cannot be parsed
        """
-        if params is None:
-            params = {}
-
-        if isinstance(params, dict):
-            search_params = SearchParams(query=query, **params)
-        else:
-            search_params = params
-            search_params.query = query
-
-        search_params_dict = search_params.dict(exclude_none=True)
-        search_params_dict['origin'] = f"python-sdk@{version}"
+        # Build search parameters
+        search_params = {}
+        if params:
+            if isinstance(params, dict):
+                search_params.update(params)
+            else:
+                search_params.update(params.dict(exclude_none=True))
+
+        # Add individual parameters
+        if limit is not None:
+            search_params['limit'] = limit
+        if tbs is not None:
+            search_params['tbs'] = tbs
+        if filter is not None:
+            search_params['filter'] = filter
+        if lang is not None:
+            search_params['lang'] = lang
+        if country is not None:
+            search_params['country'] = country
+        if location is not None:
+            search_params['location'] = location
+        if timeout is not None:
+            search_params['timeout'] = timeout
+        if scrape_options is not None:
+            search_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
+
+        # Add any additional kwargs
+        search_params.update(kwargs)
+
+        # Create final params object
+        final_params = SearchParams(query=query, **search_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['origin'] = f"python-sdk@{version}"

        return await self._async_post_request(
            f"{self.api_url}/v1/search",
-            search_params_dict,
+            params_dict,
            {"Authorization": f"Bearer {self.api_key}"}
        )
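
Finally, a sketch of the reworked search call, which now takes individual keyword arguments while still accepting a params dict for backward compatibility; the query and filter values are illustrative.

async def search_example(app):
    results = await app.search(
        "firecrawl web scraping",
        limit=3,
        lang="en",
        country="us",
        tbs="qdr:w",  # restrict results to the past week
    )
    print(results)  # SearchResponse with success/data/warning/error per the docstring above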