mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 19:46:05 +08:00
fix
This commit is contained in:
parent
79bc54c11e
commit
0aedef7210
@ -42,23 +42,7 @@ while attempts > 0 and crawl_status.status != 'completed':
|
|||||||
crawl_status = app.check_crawl_status(async_result.id)
|
crawl_status = app.check_crawl_status(async_result.id)
|
||||||
print(crawl_status)
|
print(crawl_status)
|
||||||
|
|
||||||
# LLM Extraction:
|
# JSON format:
|
||||||
# Define schema to extract contents into using pydantic
|
|
||||||
class ArticleSchema(BaseModel):
|
|
||||||
title: str
|
|
||||||
points: int
|
|
||||||
by: str
|
|
||||||
commentsURL: str
|
|
||||||
|
|
||||||
class TopArticlesSchema(BaseModel):
|
|
||||||
top: List[ArticleSchema] = Field(..., description="Top 5 stories")
|
|
||||||
|
|
||||||
extract_config = JsonConfig(schema=TopArticlesSchema.model_json_schema())
|
|
||||||
|
|
||||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
|
|
||||||
|
|
||||||
print(llm_extraction_result.extract)
|
|
||||||
|
|
||||||
# Define schema to extract contents into using json schema
|
# Define schema to extract contents into using json schema
|
||||||
json_schema = {
|
json_schema = {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
@ -86,9 +70,6 @@ llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=[
|
|||||||
|
|
||||||
print(llm_extraction_result.json)
|
print(llm_extraction_result.json)
|
||||||
|
|
||||||
print(llm_extraction_result['llm_extraction'])
|
|
||||||
|
|
||||||
|
|
||||||
# Map a website:
|
# Map a website:
|
||||||
map_result = app.map_url('https://firecrawl.dev', search="blog")
|
map_result = app.map_url('https://firecrawl.dev', search="blog")
|
||||||
print(map_result)
|
print(map_result)
|
||||||
|
@ -2,7 +2,7 @@ import time
|
|||||||
import nest_asyncio
|
import nest_asyncio
|
||||||
import uuid
|
import uuid
|
||||||
import asyncio
|
import asyncio
|
||||||
from firecrawl.firecrawl import AsyncFirecrawlApp
|
from firecrawl.firecrawl import AsyncFirecrawlApp, ScrapeOptions, JsonConfig
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -84,6 +84,20 @@ async def example_map_and_extract():
|
|||||||
extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
|
extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
|
||||||
print(extract_result)
|
print(extract_result)
|
||||||
|
|
||||||
|
async def example_deep_research():
|
||||||
|
# Deep research example
|
||||||
|
research_result = await app.deep_research(
|
||||||
|
"What are the latest developments in large language models?",
|
||||||
|
max_urls=4
|
||||||
|
)
|
||||||
|
print("Research Results:", research_result)
|
||||||
|
|
||||||
|
async def example_generate_llms_text():
|
||||||
|
# Generate LLMs.txt example
|
||||||
|
llms_result = await app.generate_llms_text(
|
||||||
|
"https://firecrawl.dev")
|
||||||
|
print("LLMs.txt Results:", llms_result)
|
||||||
|
|
||||||
# Define event handlers for websocket
|
# Define event handlers for websocket
|
||||||
def on_document(detail):
|
def on_document(detail):
|
||||||
print("DOC", detail)
|
print("DOC", detail)
|
||||||
@ -115,6 +129,8 @@ async def main():
|
|||||||
await example_llm_extraction()
|
await example_llm_extraction()
|
||||||
await example_map_and_extract()
|
await example_map_and_extract()
|
||||||
await example_websocket_crawl()
|
await example_websocket_crawl()
|
||||||
|
await example_deep_research()
|
||||||
|
await example_generate_llms_text()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
@ -1742,7 +1742,7 @@ class FirecrawlApp:
|
|||||||
|
|
||||||
def async_extract(
|
def async_extract(
|
||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: Optional[List[str]] = None,
|
||||||
*,
|
*,
|
||||||
prompt: Optional[str] = None,
|
prompt: Optional[str] = None,
|
||||||
schema: Optional[Any] = None,
|
schema: Optional[Any] = None,
|
||||||
@ -1750,8 +1750,7 @@ class FirecrawlApp:
|
|||||||
allow_external_links: Optional[bool] = False,
|
allow_external_links: Optional[bool] = False,
|
||||||
enable_web_search: Optional[bool] = False,
|
enable_web_search: Optional[bool] = False,
|
||||||
show_sources: Optional[bool] = False,
|
show_sources: Optional[bool] = False,
|
||||||
agent: Optional[Dict[str, Any]] = None,
|
agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
|
||||||
idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
|
|
||||||
"""
|
"""
|
||||||
Initiate an asynchronous extract job.
|
Initiate an asynchronous extract job.
|
||||||
|
|
||||||
@ -1775,7 +1774,7 @@ class FirecrawlApp:
|
|||||||
Raises:
|
Raises:
|
||||||
ValueError: If job initiation fails
|
ValueError: If job initiation fails
|
||||||
"""
|
"""
|
||||||
headers = self._prepare_headers(idempotency_key)
|
headers = self._prepare_headers()
|
||||||
|
|
||||||
schema = schema
|
schema = schema
|
||||||
if schema:
|
if schema:
|
||||||
@ -3457,27 +3456,28 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|||||||
|
|
||||||
async def extract(
|
async def extract(
|
||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: Optional[List[str]] = None,
|
||||||
params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
|
*,
|
||||||
|
prompt: Optional[str] = None,
|
||||||
|
schema: Optional[Any] = None,
|
||||||
|
system_prompt: Optional[str] = None,
|
||||||
|
allow_external_links: Optional[bool] = False,
|
||||||
|
enable_web_search: Optional[bool] = False,
|
||||||
|
show_sources: Optional[bool] = False,
|
||||||
|
agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Asynchronously extract structured information from URLs.
|
Asynchronously extract structured information from URLs.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls (List[str]): URLs to extract from
|
urls (Optional[List[str]]): URLs to extract from
|
||||||
params (Optional[ExtractParams]): See ExtractParams model:
|
prompt (Optional[str]): Custom extraction prompt
|
||||||
Extraction Config:
|
schema (Optional[Any]): JSON schema/Pydantic model
|
||||||
* prompt - Custom extraction prompt
|
system_prompt (Optional[str]): System context
|
||||||
* schema - JSON schema/Pydantic model
|
allow_external_links (Optional[bool]): Follow external links
|
||||||
* systemPrompt - System context
|
enable_web_search (Optional[bool]): Enable web search
|
||||||
|
show_sources (Optional[bool]): Include source URLs
|
||||||
Behavior Options:
|
agent (Optional[Dict[str, Any]]): Agent configuration
|
||||||
* allowExternalLinks - Follow external links
|
|
||||||
* enableWebSearch - Enable web search
|
|
||||||
* includeSubdomains - Include subdomains
|
|
||||||
* showSources - Include source URLs
|
|
||||||
|
|
||||||
Scraping Options:
|
|
||||||
* scrapeOptions - Page scraping config
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
ExtractResponse with:
|
ExtractResponse with:
|
||||||
@ -3490,29 +3490,35 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|||||||
"""
|
"""
|
||||||
headers = self._prepare_headers()
|
headers = self._prepare_headers()
|
||||||
|
|
||||||
if not params or (not params.get('prompt') and not params.get('schema')):
|
if not prompt and not schema:
|
||||||
raise ValueError("Either prompt or schema is required")
|
raise ValueError("Either prompt or schema is required")
|
||||||
|
|
||||||
schema = params.get('schema')
|
if not urls and not prompt:
|
||||||
|
raise ValueError("Either urls or prompt is required")
|
||||||
|
|
||||||
if schema:
|
if schema:
|
||||||
if hasattr(schema, 'model_json_schema'):
|
if hasattr(schema, 'model_json_schema'):
|
||||||
|
# Convert Pydantic model to JSON schema
|
||||||
schema = schema.model_json_schema()
|
schema = schema.model_json_schema()
|
||||||
|
# Otherwise assume it's already a JSON schema dict
|
||||||
|
|
||||||
request_data = ExtractResponse(
|
request_data = {
|
||||||
urls=urls,
|
'urls': urls or [],
|
||||||
allowExternalLinks=params.get('allow_external_links', params.get('allowExternalLinks', False)),
|
'allowExternalLinks': allow_external_links,
|
||||||
enableWebSearch=params.get('enable_web_search', params.get('enableWebSearch', False)),
|
'enableWebSearch': enable_web_search,
|
||||||
showSources=params.get('show_sources', params.get('showSources', False)),
|
'showSources': show_sources,
|
||||||
schema=schema,
|
'schema': schema,
|
||||||
origin=f'python-sdk@{version}'
|
'origin': f'python-sdk@{get_version()}'
|
||||||
)
|
}
|
||||||
|
|
||||||
if params.get('prompt'):
|
# Only add prompt and systemPrompt if they exist
|
||||||
request_data['prompt'] = params['prompt']
|
if prompt:
|
||||||
if params.get('system_prompt'):
|
request_data['prompt'] = prompt
|
||||||
request_data['systemPrompt'] = params['system_prompt']
|
if system_prompt:
|
||||||
elif params.get('systemPrompt'):
|
request_data['systemPrompt'] = system_prompt
|
||||||
request_data['systemPrompt'] = params['systemPrompt']
|
|
||||||
|
if agent:
|
||||||
|
request_data['agent'] = agent
|
||||||
|
|
||||||
response = await self._async_post_request(
|
response = await self._async_post_request(
|
||||||
f'{self.api_url}/v1/extract',
|
f'{self.api_url}/v1/extract',
|
||||||
@ -3532,7 +3538,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if status_data['status'] == 'completed':
|
if status_data['status'] == 'completed':
|
||||||
return status_data
|
return ExtractResponse(**status_data)
|
||||||
elif status_data['status'] in ['failed', 'cancelled']:
|
elif status_data['status'] in ['failed', 'cancelled']:
|
||||||
raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
|
raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
|
||||||
|
|
||||||
@ -3715,8 +3721,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|||||||
allow_external_links: Optional[bool] = False,
|
allow_external_links: Optional[bool] = False,
|
||||||
enable_web_search: Optional[bool] = False,
|
enable_web_search: Optional[bool] = False,
|
||||||
show_sources: Optional[bool] = False,
|
show_sources: Optional[bool] = False,
|
||||||
agent: Optional[Dict[str, Any]] = None,
|
agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
|
||||||
idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
|
|
||||||
"""
|
"""
|
||||||
Initiate an asynchronous extraction job without waiting for completion.
|
Initiate an asynchronous extraction job without waiting for completion.
|
||||||
|
|
||||||
@ -3740,7 +3745,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|||||||
Raises:
|
Raises:
|
||||||
ValueError: If job initiation fails
|
ValueError: If job initiation fails
|
||||||
"""
|
"""
|
||||||
headers = self._prepare_headers(idempotency_key)
|
headers = self._prepare_headers()
|
||||||
|
|
||||||
if not prompt and not schema:
|
if not prompt and not schema:
|
||||||
raise ValueError("Either prompt or schema is required")
|
raise ValueError("Either prompt or schema is required")
|
||||||
@ -3871,6 +3876,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|||||||
if experimental_stream is not None:
|
if experimental_stream is not None:
|
||||||
params['__experimental_stream'] = experimental_stream
|
params['__experimental_stream'] = experimental_stream
|
||||||
|
|
||||||
|
params = GenerateLLMsTextParams(
|
||||||
|
maxUrls=max_urls,
|
||||||
|
showFullText=show_full_text,
|
||||||
|
__experimental_stream=experimental_stream
|
||||||
|
)
|
||||||
|
|
||||||
headers = self._prepare_headers()
|
headers = self._prepare_headers()
|
||||||
json_data = {'url': url, **params.dict(exclude_none=True)}
|
json_data = {'url': url, **params.dict(exclude_none=True)}
|
||||||
json_data['origin'] = f"python-sdk@{version}"
|
json_data['origin'] = f"python-sdk@{version}"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user