mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-04 11:24:40 +08:00
141 lines
3.9 KiB
Python
141 lines
3.9 KiB
Python
from firecrawl import JsonConfig, FirecrawlApp
|
|
from pydantic import BaseModel, Field
|
|
from typing import List
|
|
import time
|
|
# Client pointed at the hosted Firecrawl API; the examples in this script
# all go through this one object.
app = FirecrawlApp(api_url="https://api.firecrawl.dev")

# Scrape a single page, asking for both Markdown and HTML, then show the
# Markdown rendering.
scrape_result = app.scrape_url(
    'example.com',
    formats=["markdown", "html"],
)
print(scrape_result.markdown)
|
|
|
|
|
|
# Batch scrape: the same URL list is submitted through both batch entry points.
urls = ['https://example.com', 'https://docs.firecrawl.dev']

# Synchronous batch scrape; print the Markdown of the first returned document.
batch_result = app.batch_scrape_urls(
    urls,
    formats=["markdown", "html"],
)
print("Synchronous Batch Scrape Result:")
first_document = batch_result.data[0]
print(first_document.markdown)

# Asynchronous batch scrape; whatever the call returns is printed as-is.
async_batch_result = app.async_batch_scrape_urls(
    urls,
    formats=["markdown", "html"],
)
print("\nAsynchronous Batch Scrape Result:")
print(async_batch_result)
|
|
|
|
# Crawl a website, excluding paths that match 'blog/*', and show the
# Markdown of the first entry in the result's data list.
crawl_result = app.crawl_url(
    'firecrawl.dev',
    exclude_paths=['blog/*'],
)
first_page = crawl_result.data[0]
print(first_page.markdown)
|
|
|
|
# Asynchronous crawl: submit the job and print what the call hands back.
# Its `id` attribute is what the status checks further down are keyed on.
async_result = app.async_crawl_url(
    'firecrawl.dev',
    exclude_paths=['blog/*'],
)
print(async_result)
|
|
|
|
# Fetch and show the job status once before polling starts.
crawl_status = app.check_crawl_status(async_result.id)
print(crawl_status)

# Poll at most 15 times, stopping early once the status reads 'completed'.
# Each round prints the status it already holds, fetches a fresh one, and
# then pauses for one second.
attempts = 15
while not (attempts <= 0 or crawl_status.status == 'completed'):
    print(crawl_status)
    crawl_status = app.check_crawl_status(async_result.id)
    attempts -= 1
    time.sleep(1)

# One more status fetch and print, made regardless of how the loop ended.
crawl_status = app.check_crawl_status(async_result.id)
print(crawl_status)
|
|
|
|
# JSON format:
# The extraction target is described with JSON Schema, built here from the
# inside out.

# One story entry: four fields, all of them required.
story_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "points": {"type": "number"},
        "by": {"type": "string"},
        "commentsURL": {"type": "string"},
    },
    "required": ["title", "points", "by", "commentsURL"],
}

# Top-level object: a single required "top" array of story entries.
json_schema = {
    "type": "object",
    "properties": {
        "top": {
            "type": "array",
            "items": story_schema,
            "description": "Top 5 stories on Hacker News",
        },
    },
    "required": ["top"],
}
|
|
|
|
# Wrap the schema in a JsonConfig and request the "json" format when
# scraping Hacker News, then print the result's `json` attribute.
extract_config = JsonConfig(schema=json_schema)
llm_extraction_result = app.scrape_url(
    'https://news.ycombinator.com',
    formats=["json"],
    json_options=extract_config,
)
print(llm_extraction_result.json)
|
|
|
|
# Map a website, passing "blog" as the search term, and print the result.
map_result = app.map_url(
    'https://firecrawl.dev',
    search="blog",
)
print(map_result)
|
|
|
|
# Extract URLs:
# Model describing the data requested from the extract call that follows:
# a title, a description, and a list of link strings.
# NOTE(review): comments are used instead of a class docstring because
# Pydantic may carry a docstring into the generated schema -- confirm.
class ExtractSchema(BaseModel):
    title: str
    description: str
    links: List[str]
|
|
|
|
# Define the schema using Pydantic.
# model_json_schema() is the Pydantic v2 spelling and is what the final
# JSON example in this script already uses; the older .schema() method is
# deprecated in v2, so it is not used here.
extract_schema = ExtractSchema.model_json_schema()

# Perform the extraction against a single URL, supplying both a natural
# language prompt and the schema, then print whatever comes back.
extract_result = app.extract(
    ['https://firecrawl.dev'],
    prompt="Extract the title, description, and links from the website",
    schema=extract_schema,
)
print(extract_result)
|
|
|
|
|
|
# Deep research example: pose one question with max_urls set to 4 and
# print the result after a label.
research_question = "What are the latest developments in large language models?"
research_result = app.deep_research(research_question, max_urls=4)
print("Research Results:", research_result)
|
|
|
|
# Generate LLMs.txt example: run the generator for one site URL and print
# the result after a label.
llms_target = "https://firecrawl.dev"
llms_result = app.generate_llms_text(llms_target)
print("LLMs.txt Results:", llms_result)
|
|
|
|
|
|
# Crawl a website with WebSockets.
# The watcher below has to be awaited from inside an async function.
# NOTE(review): nest_asyncio is applied first, presumably so that await can
# be used where an event loop is already running (e.g. a notebook) --
# confirm against the nest_asyncio documentation.
import nest_asyncio

nest_asyncio.apply()
|
|
|
|
# Define event handlers
|
|
def on_document(detail):
    """Print the event payload, prefixed with the DOC tag.

    Registered further down as the listener for "document" events.
    """
    tag = "DOC"
    print(tag, detail)
|
|
|
|
def on_error(detail):
    """Print the payload's 'error' entry, prefixed with the ERR tag.

    Registered further down as the listener for "error" events. A payload
    without an 'error' key raises KeyError.
    """
    message = detail['error']
    print("ERR", message)
|
|
|
|
def on_done(detail):
    """Print the payload's 'status' entry, prefixed with the DONE tag.

    Registered further down as the listener for "done" events. A payload
    without a 'status' key raises KeyError.
    """
    final_status = detail['status']
    print("DONE", final_status)
|
|
|
|
# Function to start the crawl and watch process
|
|
async def start_crawl_and_watch():
    """Start a crawl of firecrawl.dev and route its events to the handlers.

    Creates a watcher for the crawl, registers on_document, on_error and
    on_done for the "document", "error" and "done" events, then awaits the
    watcher's connect() call. Returns None.
    """
    # Crawl options are given as snake_case keyword arguments, the same way
    # the crawl_url / async_crawl_url calls earlier in this script pass
    # exclude_paths, instead of a positional camelCase dict.
    watcher = app.crawl_url_and_watch(
        'firecrawl.dev',
        exclude_paths=['blog/*'],
        limit=5,
    )

    # Register one listener per event type before connecting.
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)

    # Start the watcher.
    await watcher.connect()
|
|
|
|
|
|
# Model for the structured scrape that follows: one free-text field and
# three boolean flags.
# NOTE: this rebinds the ExtractSchema name defined earlier in the script.
class ExtractSchema(BaseModel):
    company_mission: str
    supports_sso: bool
    is_open_source: bool
    is_in_yc: bool
|
|
|
|
# Turn the model into a JSON schema, wrap it in a JsonConfig, and request
# the 'json' format when scraping the docs site; then print the result's
# `json` attribute.
docs_schema = ExtractSchema.model_json_schema()
extract_config = JsonConfig(schema=docs_schema)
data = app.scrape_url(
    'https://docs.firecrawl.dev/',
    formats=['json'],
    json_options=extract_config,
)
print(data.json)