import time
import uuid

import nest_asyncio

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
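# If no api_key is passed, the SDK reads the FIRECRAWL_API_KEY environment
# variable instead; hard-coding the key here is for demo purposes only.
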
# Scrape a website:
scrape_result = app.scrape_url('firecrawl.dev')
print(scrape_result['markdown'])

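# A minimal sketch of passing scrape options; the 'formats' parameter below is
# an assumption based on v1-style SDKs, so check your version's docs:
# scrape_with_options = app.scrape_url('firecrawl.dev', {'formats': ['markdown', 'html']})
# print(scrape_with_options['html'])
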
# Crawl a website:
idempotency_key = str(uuid.uuid4()) # optional idempotency key
crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)  # 2 = poll interval in seconds
print(crawl_result)

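# The completed crawl response usually carries the scraped pages under 'data'
# (an assumption about the response shape; print crawl_result to confirm):
# for page in crawl_result.get('data', []):
#     print(page.get('metadata', {}).get('sourceURL'))
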
# Asynchronously crawl a website:
async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")  # "" = no idempotency key
print(async_result)

crawl_status = app.check_crawl_status(async_result['id'])
print(crawl_status)

# Poll the job until it completes, giving up after 15 attempts
attempts = 15
while attempts > 0 and crawl_status['status'] != 'completed':
    print(crawl_status)
    crawl_status = app.check_crawl_status(async_result['id'])
    attempts -= 1
    time.sleep(1)

crawl_status = app.check_crawl_status(async_result['id'])
print(crawl_status)

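# The polling loop above, folded into a reusable helper (a sketch built only
# on check_crawl_status; the 'completed' status value is taken from the loop):
# def wait_for_crawl(app, crawl_id, timeout_s=60):
#     deadline = time.time() + timeout_s
#     status = app.check_crawl_status(crawl_id)
#     while status['status'] != 'completed' and time.time() < deadline:
#         time.sleep(1)
#         status = app.check_crawl_status(crawl_id)
#     return status
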
# LLM Extraction:
# Define a schema to extract contents into using pydantic
# from pydantic import BaseModel, Field
# from typing import List

# class ArticleSchema(BaseModel):
#     title: str
#     points: int
#     by: str
#     commentsURL: str

# class TopArticlesSchema(BaseModel):
#     top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
#     'extractorOptions': {
#         'extractionSchema': TopArticlesSchema.model_json_schema(),
#         'mode': 'llm-extraction'
#     },
#     'pageOptions': {
#         'onlyMainContent': True
#     }
# })

# print(llm_extraction_result['llm_extraction'])

# # Define a schema to extract contents into using JSON schema
# json_schema = {
#     "type": "object",
#     "properties": {
#         "top": {
#             "type": "array",
#             "items": {
#                 "type": "object",
#                 "properties": {
#                     "title": {"type": "string"},
#                     "points": {"type": "number"},
#                     "by": {"type": "string"},
#                     "commentsURL": {"type": "string"}
#                 },
#                 "required": ["title", "points", "by", "commentsURL"]
#             },
#             "minItems": 5,
#             "maxItems": 5,
#             "description": "Top 5 stories on Hacker News"
#         }
#     },
#     "required": ["top"]
# }

# llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
#     'extractorOptions': {
#         'extractionSchema': json_schema,
#         'mode': 'llm-extraction'
#     },
#     'pageOptions': {
#         'onlyMainContent': True
#     }
# })

# print(llm_extraction_result['llm_extraction'])

# Map a website:
map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
print(map_result)

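# The map response typically lists the discovered URLs under 'links'
# (an assumption about the response shape; print map_result to confirm):
# for link in map_result.get('links', []):
#     print(link)
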
# Crawl a website with WebSockets:
# inside an async function...
nest_asyncio.apply()

# Define event handlers
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

# Function to start the crawl and watch process
async def start_crawl_and_watch():
    # Initiate the crawl job and get the watcher
    watcher = app.crawl_url_and_watch('firecrawl.dev', { 'excludePaths': ['blog/*'], 'limit': 5 })

    # Add event listeners
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)

    # Start the watcher
    await watcher.connect()

# Run the event loop
await start_crawl_and_watch()
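
# In a plain script (outside a notebook), top-level 'await' is a syntax error;
# run the coroutine through asyncio instead:
# import asyncio
# asyncio.run(start_crawl_and_watch())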