# firecrawl/apps/python-sdk/example_async.py

import asyncio
from typing import List

import nest_asyncio
from pydantic import BaseModel, Field

from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig

app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev")
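# No API key is passed above: the SDK is expected to read FIRECRAWL_API_KEY
# from the environment. An explicit key can also be supplied, e.g.
# AsyncFirecrawlApp(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev").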

async def example_scrape():
    # Scrape a website:
    scrape_result = await app.scrape_url('example.com', formats=["markdown", "html"])
    print(scrape_result.markdown)
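
    # Because "html" is also requested above, the raw HTML should be available
    # as scrape_result.html alongside scrape_result.markdown.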

async def example_batch_scrape():
    # Batch scrape
    urls = ['https://example.com', 'https://docs.firecrawl.dev']

    # Synchronous batch scrape
    batch_result = await app.batch_scrape_urls(urls, formats=["markdown", "html"])
    print("Synchronous Batch Scrape Result:")
    print(batch_result.data[0].markdown)

    # Asynchronous batch scrape
    async_batch_result = await app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
    print("\nAsynchronous Batch Scrape Result:")
    print(async_batch_result)
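
    # Check on the async batch job. A minimal sketch, assuming AsyncFirecrawlApp
    # exposes check_batch_scrape_status() mirroring the check_crawl_status()
    # call used in example_crawl() below.
    batch_status = await app.check_batch_scrape_status(async_batch_result.id)
    print(batch_status.status)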

async def example_crawl():
    # Crawl a website:
    crawl_result = await app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
    print(crawl_result.data[0].markdown)

    # Asynchronously crawl a website:
    async_result = await app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
    print(async_result)

    crawl_status = await app.check_crawl_status(async_result.id)
    print(crawl_status)

    # Poll up to 15 times (roughly 15 seconds) until the crawl completes
    attempts = 15
    while attempts > 0 and crawl_status.status != 'completed':
        print(crawl_status)
        crawl_status = await app.check_crawl_status(async_result.id)
        attempts -= 1
        await asyncio.sleep(1)  # Use async sleep instead of time.sleep

    # Fetch the final status (and data, if the crawl finished)
    crawl_status = await app.check_crawl_status(async_result.id)
    print(crawl_status)

async def example_llm_extraction():
    # Define schema to extract contents into using pydantic
    class ArticleSchema(BaseModel):
        title: str
        points: int
        by: str
        commentsURL: str

    class TopArticlesSchema(BaseModel):
        top: List[ArticleSchema] = Field(..., description="Top 5 stories")

    extract_config = JsonConfig(schema=TopArticlesSchema.model_json_schema())
    llm_extraction_result = await app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
    print(llm_extraction_result.extract)
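
    # The extracted payload should be a dict shaped like TopArticlesSchema,
    # e.g. {"top": [{"title": ..., "points": ..., "by": ..., "commentsURL": ...}]}.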

async def example_map_and_extract():
    # Map a website:
    map_result = await app.map_url('https://firecrawl.dev', search="blog")
    print(map_result)

    # Extract structured data from URLs:
    # Define the schema using Pydantic
    class ExtractSchema(BaseModel):
        title: str
        description: str
        links: List[str]

    extract_schema = ExtractSchema.model_json_schema()

    # Perform the extraction
    extract_result = await app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
    print(extract_result)

# Define event handlers for websocket
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

async def example_websocket_crawl():
    # Initiate the crawl job and get the watcher (crawl_url_and_watch is
    # expected to accept the same keyword arguments as async_crawl_url)
    watcher = await app.crawl_url_and_watch('firecrawl.dev', exclude_paths=['blog/*'], limit=5)

    # Add event listeners
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)

    # Start the watcher
    await watcher.connect()
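
    # connect() is expected to stream events until the crawl finishes:
    # a "document" event per crawled page, then a final "done" (or "error").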

async def main():
    nest_asyncio.apply()  # Allow re-entrant event loops (e.g. inside notebooks)
    await example_scrape()
    await example_batch_scrape()
    await example_crawl()
    await example_llm_extraction()
    await example_map_and_extract()
    await example_websocket_crawl()

if __name__ == "__main__":
    asyncio.run(main())