rafaelmmiller 2025-04-18 13:37:09 -07:00
parent 79bc54c11e
commit 0aedef7210
3 changed files with 70 additions and 62 deletions

View File

@@ -42,23 +42,7 @@ while attempts > 0 and crawl_status.status != 'completed':
 crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)
 
-# LLM Extraction:
-# Define schema to extract contents into using pydantic
-class ArticleSchema(BaseModel):
-    title: str
-    points: int
-    by: str
-    commentsURL: str
-
-class TopArticlesSchema(BaseModel):
-    top: List[ArticleSchema] = Field(..., description="Top 5 stories")
-
-extract_config = JsonConfig(schema=TopArticlesSchema.model_json_schema())
-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
-print(llm_extraction_result.extract)
-
+# JSON format:
 # Define schema to extract contents into using json schema
 json_schema = {
     "type": "object",
@@ -86,9 +70,6 @@ llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=[
 print(llm_extraction_result.json)
-print(llm_extraction_result['llm_extraction'])
 
 # Map a website:
 map_result = app.map_url('https://firecrawl.dev', search="blog")
 print(map_result)
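The same file also retires dict-style result access; only the attribute form survives:

print(llm_extraction_result.json)               # kept: attribute access
# llm_extraction_result['llm_extraction']       # removed: legacy dict-style lookup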

View File

@@ -2,7 +2,7 @@ import time
 import nest_asyncio
 import uuid
 import asyncio
-from firecrawl.firecrawl import AsyncFirecrawlApp
+from firecrawl.firecrawl import AsyncFirecrawlApp, ScrapeOptions, JsonConfig
 from pydantic import BaseModel, Field
 from typing import List
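`ScrapeOptions` and `JsonConfig` join the import so the async examples can configure page scraping and JSON extraction. A sketch of typical `ScrapeOptions` usage, continuing the example script's module-level `app` and assuming the v1-style `crawl_url(url, limit=..., scrape_options=...)` call shape, which this diff does not show:

async def example_crawl_with_options():
    # Assumed call shape; only the ScrapeOptions import is visible in this commit
    crawl_result = await app.crawl_url(
        'https://firecrawl.dev',
        limit=5,
        scrape_options=ScrapeOptions(formats=['markdown', 'html'])
    )
    print(crawl_result)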
@@ -84,6 +84,20 @@ async def example_map_and_extract():
     extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
     print(extract_result)
 
+async def example_deep_research():
+    # Deep research example
+    research_result = await app.deep_research(
+        "What are the latest developments in large language models?",
+        max_urls=4
+    )
+    print("Research Results:", research_result)
+
+async def example_generate_llms_text():
+    # Generate LLMs.txt example
+    llms_result = await app.generate_llms_text(
+        "https://firecrawl.dev")
+    print("LLMs.txt Results:", llms_result)
+
 # Define event handlers for websocket
 def on_document(detail):
     print("DOC", detail)
@@ -115,6 +129,8 @@ async def main():
     await example_llm_extraction()
     await example_map_and_extract()
     await example_websocket_crawl()
+    await example_deep_research()
+    await example_generate_llms_text()
 
 if __name__ == "__main__":
     asyncio.run(main())
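Both new coroutines rely on the example script's module-level `app`. A self-contained sketch that exercises just the additions; the constructor's `api_key` argument is assumed from the SDK's usual usage rather than shown in this diff:

import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp

async def run_new_examples():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # api_key assumed, not shown in this diff
    # Deep research, capped at 4 URLs, as in the added example
    research_result = await app.deep_research(
        "What are the latest developments in large language models?",
        max_urls=4
    )
    print("Research Results:", research_result)
    # LLMs.txt generation for firecrawl.dev, as in the added example
    llms_result = await app.generate_llms_text("https://firecrawl.dev")
    print("LLMs.txt Results:", llms_result)

asyncio.run(run_new_examples())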

View File

@@ -1742,7 +1742,7 @@ class FirecrawlApp:
     def async_extract(
             self,
-            urls: List[str],
+            urls: Optional[List[str]] = None,
             *,
             prompt: Optional[str] = None,
             schema: Optional[Any] = None,
@@ -1750,8 +1750,7 @@ class FirecrawlApp:
             allow_external_links: Optional[bool] = False,
             enable_web_search: Optional[bool] = False,
             show_sources: Optional[bool] = False,
-            agent: Optional[Dict[str, Any]] = None,
-            idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
+            agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extract job.
@@ -1775,7 +1774,7 @@ class FirecrawlApp:
         Raises:
             ValueError: If job initiation fails
         """
-        headers = self._prepare_headers(idempotency_key)
+        headers = self._prepare_headers()
 
         schema = schema
         if schema:
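With `idempotency_key` removed (here and, further down, from the `AsyncFirecrawlApp` variants), `_prepare_headers()` is called bare and callers pass plain keyword arguments. A sketch of the resulting call style; the constructor's `api_key` argument is assumed:

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")   # api_key assumed
job = app.async_extract(
    urls=['https://firecrawl.dev'],
    prompt="Extract the page title",
    show_sources=True
)
print(job)
# Passing idempotency_key=... now raises TypeError: unexpected keyword argument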
@@ -3457,27 +3456,28 @@ class AsyncFirecrawlApp(FirecrawlApp):
     async def extract(
             self,
-            urls: List[str],
-            params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
+            urls: Optional[List[str]] = None,
+            *,
+            prompt: Optional[str] = None,
+            schema: Optional[Any] = None,
+            system_prompt: Optional[str] = None,
+            allow_external_links: Optional[bool] = False,
+            enable_web_search: Optional[bool] = False,
+            show_sources: Optional[bool] = False,
+            agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Asynchronously extract structured information from URLs.
 
         Args:
-            urls (List[str]): URLs to extract from
-            params (Optional[ExtractParams]): See ExtractParams model:
-                Extraction Config:
-                * prompt - Custom extraction prompt
-                * schema - JSON schema/Pydantic model
-                * systemPrompt - System context
-                Behavior Options:
-                * allowExternalLinks - Follow external links
-                * enableWebSearch - Enable web search
-                * includeSubdomains - Include subdomains
-                * showSources - Include source URLs
-                Scraping Options:
-                * scrapeOptions - Page scraping config
+            urls (Optional[List[str]]): URLs to extract from
+            prompt (Optional[str]): Custom extraction prompt
+            schema (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
 
         Returns:
             ExtractResponse with:
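Together with the now-optional `urls`, the validation added in the next hunk ("Either urls or prompt is required") makes both URL-driven and prompt-only calls expressible. A sketch of the reworked keyword style, using a Pydantic model that the method converts via `model_json_schema()`; the `api_key` argument is assumed:

import asyncio
from pydantic import BaseModel
from firecrawl.firecrawl import AsyncFirecrawlApp

class PageInfo(BaseModel):
    title: str
    description: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # api_key assumed
    result = await app.extract(
        ['https://firecrawl.dev'],
        prompt="Extract the title and description",
        schema=PageInfo,    # converted to a JSON schema internally
        show_sources=True
    )
    print(result)

asyncio.run(main())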
@@ -3490,29 +3490,35 @@ class AsyncFirecrawlApp(FirecrawlApp):
         """
         headers = self._prepare_headers()
-        if not params or (not params.get('prompt') and not params.get('schema')):
+        if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")
-        schema = params.get('schema')
+        if not urls and not prompt:
+            raise ValueError("Either urls or prompt is required")
 
         if schema:
             if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
                 schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
 
-        request_data = ExtractResponse(
-            urls=urls,
-            allowExternalLinks=params.get('allow_external_links', params.get('allowExternalLinks', False)),
-            enableWebSearch=params.get('enable_web_search', params.get('enableWebSearch', False)),
-            showSources=params.get('show_sources', params.get('showSources', False)),
-            schema=schema,
-            origin=f'python-sdk@{version}'
-        )
-        if params.get('prompt'):
-            request_data['prompt'] = params['prompt']
-        if params.get('system_prompt'):
-            request_data['systemPrompt'] = params['system_prompt']
-        elif params.get('systemPrompt'):
-            request_data['systemPrompt'] = params['systemPrompt']
+        request_data = {
+            'urls': urls or [],
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
+            'schema': schema,
+            'origin': f'python-sdk@{get_version()}'
+        }
+        # Only add prompt and systemPrompt if they exist
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+        if agent:
+            request_data['agent'] = agent
 
         response = await self._async_post_request(
             f'{self.api_url}/v1/extract',
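The request body now mirrors the keyword arguments one-to-one rather than being round-tripped through an `ExtractResponse` (which the old code misused as a request container). For a prompt-only call, the dict built above comes out roughly as follows; the version string is illustrative:

# extract(urls=None, prompt="Extract pricing info", enable_web_search=True) builds:
{
    "urls": [],                     # falls back to [] when no URLs are given
    "allowExternalLinks": False,
    "enableWebSearch": True,
    "showSources": False,
    "schema": None,
    "origin": "python-sdk@x.y.z",   # illustrative version string
    "prompt": "Extract pricing info"
}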
@@ -3532,7 +3538,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             )
             if status_data['status'] == 'completed':
-                return status_data
+                return ExtractResponse(**status_data)
             elif status_data['status'] in ['failed', 'cancelled']:
                 raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
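Wrapping the completed payload in `ExtractResponse(**status_data)` hands callers a typed object instead of a raw dict. A sketch, with the field name assumed from the payload keys the code reads:

# Inside an async context, after this change:
result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title")
print(result.status)    # typed attribute access ('status' assumed to be a model field)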
@@ -3715,8 +3721,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             allow_external_links: Optional[bool] = False,
             enable_web_search: Optional[bool] = False,
             show_sources: Optional[bool] = False,
-            agent: Optional[Dict[str, Any]] = None,
-            idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
+            agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extraction job without waiting for completion.
@@ -3740,7 +3745,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             ValueError: If job initiation fails
         """
-        headers = self._prepare_headers(idempotency_key)
+        headers = self._prepare_headers()
 
         if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")
@@ -3871,6 +3876,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
         if experimental_stream is not None:
             params['__experimental_stream'] = experimental_stream
 
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            __experimental_stream=experimental_stream
+        )
+
         headers = self._prepare_headers()
         json_data = {'url': url, **params.dict(exclude_none=True)}
         json_data['origin'] = f"python-sdk@{version}"
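The appended block re-normalizes the inputs through `GenerateLLMsTextParams` just before the request is built (overwriting the `params` dict mutated a few lines up). For a representative call, the resulting `json_data` looks roughly like this; the version string is illustrative:

params = GenerateLLMsTextParams(
    maxUrls=4,
    showFullText=False,
    __experimental_stream=None      # mirrors the diff; dropped by exclude_none below
)
json_data = {'url': 'https://firecrawl.dev', **params.dict(exclude_none=True)}
json_data['origin'] = "python-sdk@x.y.z"    # illustrative version
# json_data == {'url': 'https://firecrawl.dev', 'maxUrls': 4,
#               'showFullText': False, 'origin': 'python-sdk@x.y.z'}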