Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-15 01:45:59 +08:00)
async

parent 6a5a4e5b6f
commit 3641070ece
@@ -47,7 +47,7 @@ while attempts > 0 and crawl_status['status'] != 'completed':
     attempts -= 1
     time.sleep(1)
 
-crawl_status = app.get_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result['id'])
 print(crawl_status)
 
 # LLM Extraction:
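The only change in this hunk is the status-polling call: the example now uses check_crawl_status instead of get_crawl_status. A minimal sketch of the updated synchronous polling loop, assuming the surrounding example's FirecrawlApp client and a placeholder API key (not taken from the diff):

import time

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key, assumption

# Start a crawl without blocking, then poll its status by id.
async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']})
crawl_status = app.check_crawl_status(async_result['id'])  # renamed from get_crawl_status

attempts = 15
while attempts > 0 and crawl_status['status'] != 'completed':
    attempts -= 1
    time.sleep(1)
    crawl_status = app.check_crawl_status(async_result['id'])

print(crawl_status)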
apps/python-sdk/example_async.py (new file, 168 lines)
@@ -0,0 +1,168 @@
import time
import nest_asyncio
import uuid
import asyncio
from firecrawl.firecrawl import AsyncFirecrawlApp
from pydantic import BaseModel, Field
from typing import List

app = AsyncFirecrawlApp(api_key="fc-")

async def example_scrape():
    # Scrape a website:
    scrape_result = await app.scrape_url('firecrawl.dev')
    print(scrape_result['markdown'])

async def example_batch_scrape():
    # Batch scrape
    urls = ['https://example.com', 'https://docs.firecrawl.dev']
    batch_scrape_params = {
        'formats': ['markdown', 'html'],
    }

    # Synchronous batch scrape
    batch_result = await app.batch_scrape_urls(urls, batch_scrape_params)
    print("Synchronous Batch Scrape Result:")
    print(batch_result['data'][0]['markdown'])

    # Asynchronous batch scrape
    async_batch_result = await app.async_batch_scrape_urls(urls, batch_scrape_params)
    print("\nAsynchronous Batch Scrape Result:")
    print(async_batch_result)

async def example_crawl():
    # Crawl a website:
    idempotency_key = str(uuid.uuid4())  # optional idempotency key
    crawl_result = await app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
    print(crawl_result)

    # Asynchronous Crawl a website:
    async_result = await app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
    print(async_result)

    crawl_status = await app.check_crawl_status(async_result['id'])
    print(crawl_status)

    attempts = 15
    while attempts > 0 and crawl_status['status'] != 'completed':
        print(crawl_status)
        crawl_status = await app.check_crawl_status(async_result['id'])
        attempts -= 1
        await asyncio.sleep(1)  # Use async sleep instead of time.sleep

    crawl_status = await app.check_crawl_status(async_result['id'])
    print(crawl_status)

async def example_llm_extraction():
    # Define schema to extract contents into using pydantic
    class ArticleSchema(BaseModel):
        title: str
        points: int
        by: str
        commentsURL: str

    class TopArticlesSchema(BaseModel):
        top: List[ArticleSchema] = Field(..., description="Top 5 stories")

    llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', {
        'formats': ['extract'],
        'extract': {
            'schema': TopArticlesSchema.model_json_schema()
        }
    })

    print(llm_extraction_result['extract'])

    # Define schema to extract contents into using json schema
    json_schema = {
        "type": "object",
        "properties": {
            "top": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "points": {"type": "number"},
                        "by": {"type": "string"},
                        "commentsURL": {"type": "string"}
                    },
                    "required": ["title", "points", "by", "commentsURL"]
                },
                "minItems": 5,
                "maxItems": 5,
                "description": "Top 5 stories on Hacker News"
            }
        },
        "required": ["top"]
    }

    app2 = AsyncFirecrawlApp(api_key="fc-", version="v0")

    llm_extraction_result = await app2.scrape_url('https://news.ycombinator.com', {
        'extractorOptions': {
            'extractionSchema': json_schema,
            'mode': 'llm-extraction'
        },
        'pageOptions': {
            'onlyMainContent': True
        }
    })

async def example_map_and_extract():
    # Map a website:
    map_result = await app.map_url('https://firecrawl.dev', { 'search': 'blog' })
    print(map_result)

    # Extract URLs:
    class ExtractSchema(BaseModel):
        title: str
        description: str
        links: List[str]

    # Define the schema using Pydantic
    extract_schema = ExtractSchema.schema()

    # Perform the extraction
    extract_result = await app.extract(['https://firecrawl.dev'], {
        'prompt': "Extract the title, description, and links from the website",
        'schema': extract_schema
    })
    print(extract_result)

# Define event handlers for websocket
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

async def example_websocket_crawl():
    # Initiate the crawl job and get the watcher
    watcher = await app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 })

    # Add event listeners
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)

    # Start the watcher
    await watcher.connect()

async def main():
    # Apply nest_asyncio to allow nested event loops
    nest_asyncio.apply()

    # Run all the examples
    await example_scrape()
    await example_batch_scrape()
    await example_crawl()
    await example_llm_extraction()
    await example_map_and_extract()
    await example_websocket_crawl()

if __name__ == "__main__":
    asyncio.run(main())
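To run just one of the examples rather than the full main() sequence, each coroutine can be awaited on its own. A minimal sketch, assuming the file above is saved as example_async.py next to the caller and a real API key replaces the "fc-" placeholder:

import asyncio

import example_async  # the file added by this commit, assumed importable

# Run only the basic scrape example instead of every example in main().
asyncio.run(example_async.example_scrape())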
File diff suppressed because it is too large
@@ -4,3 +4,4 @@ python-dotenv
 websockets
 nest-asyncio
 pydantic
+aiohttp
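The new aiohttp entry presumably backs the async HTTP calls made by AsyncFirecrawlApp. A quick sanity check that the updated requirements resolve, assuming they were installed from the SDK's requirements file (presumably apps/python-sdk/requirements.txt) with pip:

# Assumes the requirements shown in the hunk above have been installed.
import aiohttp
import nest_asyncio
import pydantic
import websockets

print("aiohttp", aiohttp.__version__)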