import time
import uuid

import nest_asyncio

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
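# If no api_key is passed, the SDK reads the FIRECRAWL_API_KEY environment
# variable instead; hard-coding the key here is for demo purposes only.
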
# Scrape a website:
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])

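# A minimal sketch of passing scrape options; the 'formats' parameter below is
# an assumption based on v1-style SDKs, so check your version's docs:
# scrape_with_options = app.scrape_url('firecrawl.dev', {'formats': ['markdown', 'html']})
# print(scrape_with_options['html'])
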
# Crawl a website:
idempotency_key = str(uuid.uuid4()) # optional idempotency key
crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)  # 2 = poll interval in seconds
print(crawl_result)

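# The completed crawl response usually carries the scraped pages under 'data'
# (an assumption about the response shape; print crawl_result to confirm):
# for page in crawl_result.get('data', []):
#     print(page.get('metadata', {}).get('sourceURL'))
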
# Asynchronously crawl a website:
async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")  # "" = no idempotency key
print(async_result)

crawl_status = app.check_crawl_status(async_result['id'])
print(crawl_status)

# Poll the job until it completes, giving up after 15 attempts
attempts = 15
while attempts > 0 and crawl_status['status'] != 'completed':
    print(crawl_status)
    crawl_status = app.check_crawl_status(async_result['id'])
    attempts -= 1
    time.sleep(1)

crawl_status = app.check_crawl_status(async_result['id'])
print(crawl_status)

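# The polling loop above, folded into a reusable helper (a sketch built only
# on check_crawl_status; the 'completed' status value is taken from the loop):
# def wait_for_crawl(app, crawl_id, timeout_s=60):
#     deadline = time.time() + timeout_s
#     status = app.check_crawl_status(crawl_id)
#     while status['status'] != 'completed' and time.time() < deadline:
#         time.sleep(1)
#         status = app.check_crawl_status(crawl_id)
#     return status
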
# LLM Extraction:
# Define a schema to extract contents into using pydantic
# from pydantic import BaseModel, Field
# from typing import List

# class ArticleSchema(BaseModel):
#     title: str
#     points: int
#     by: str
#     commentsURL: str

# class TopArticlesSchema(BaseModel):
#     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
#     'extractorOptions': {
#         'extractionSchema': TopArticlesSchema.model_json_schema(),
#         'mode': 'llm-extraction'
#     },
#     'pageOptions': {
#         'onlyMainContent': True
#     }
# })

# print(llm_extraction_result['llm_extraction'])

# # Define a schema to extract contents into using JSON schema
# json_schema = {
#     "type": "object",
#     "properties": {
#         "top": {
#             "type": "array",
#             "items": {
#                 "type": "object",
#                 "properties": {
#                     "title": {"type": "string"},
#                     "points": {"type": "number"},
#                     "by": {"type": "string"},
#                     "commentsURL": {"type": "string"}
#                 },
#                 "required": ["title", "points", "by", "commentsURL"]
#             },
#             "minItems": 5,
#             "maxItems": 5,
#             "description": "Top 5 stories on Hacker News"
#         }
#     },
#     "required": ["top"]
# }

# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
#     'extractorOptions': {
#         'extractionSchema': json_schema,
#         'mode': 'llm-extraction'
#     },
#     'pageOptions': {
#         'onlyMainContent': True
#     }
# })

# print(llm_extraction_result['llm_extraction'])

# Map a website:
map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
print(map_result)

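# The map response typically lists the discovered URLs under 'links'
# (an assumption about the response shape; print map_result to confirm):
# for link in map_result.get('links', []):
#     print(link)
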
# Crawl a website with WebSockets:
# inside an async function...
nest_asyncio.apply()

# Define event handlers
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

# Function to start the crawl and watch process
async def start_crawl_and_watch():
    # Initiate the crawl job and get the watcher
    watcher = app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 })

    # Add event listeners
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)

    # Start the watcher
    await watcher.connect()

# Run the event loop
await start_crawl_and_watch()
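
# In a plain script (outside a notebook), top-level 'await' is a syntax error;
# run the coroutine through asyncio instead:
# import asyncio
# asyncio.run(start_crawl_and_watch())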