mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 01:59:04 +08:00
Nick: examples
This commit is contained in:
parent
4e67803863
commit
16439b1c7e
@ -1,27 +1,27 @@
|
||||
from firecrawl.firecrawl import ExtractConfig, FirecrawlApp
|
||||
from firecrawl import JsonConfig, FirecrawlApp
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List
|
||||
import time
|
||||
app = FirecrawlApp(api_url="https://api.firecrawl.dev")
|
||||
|
||||
# # Scrape a website:
|
||||
# Scrape a website:
|
||||
scrape_result = app.scrape_url('example.com', formats=["markdown", "html"])
|
||||
print(scrape_result.markdown)
|
||||
|
||||
|
||||
# # Test batch scrapeq
|
||||
# # # Test batch scrape
|
||||
urls = ['https://example.com', 'https://docs.firecrawl.dev']
|
||||
# Synchronous batch scrape
|
||||
# # Synchronous batch scrape
|
||||
batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"])
|
||||
print("Synchronous Batch Scrape Result:")
|
||||
print(batch_result.data[0].markdown)
|
||||
|
||||
# # Asynchronous batch scrape
|
||||
# # # Asynchronous batch scrape
|
||||
async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
|
||||
print("\nAsynchronous Batch Scrape Result:")
|
||||
print(async_batch_result)
|
||||
|
||||
# Crawl a website:
|
||||
# # Crawl a website:
|
||||
crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
|
||||
print(crawl_result.data[0].markdown)
|
||||
|
||||
@ -53,13 +53,13 @@ class ArticleSchema(BaseModel):
|
||||
class TopArticlesSchema(BaseModel):
|
||||
top: List[ArticleSchema] = Field(..., description="Top 5 stories")
|
||||
|
||||
extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
|
||||
extract_config = JsonConfig(schema=TopArticlesSchema.model_json_schema())
|
||||
|
||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
|
||||
|
||||
print(llm_extraction_result.extract)
|
||||
|
||||
# # Define schema to extract contents into using json schema
|
||||
# Define schema to extract contents into using json schema
|
||||
json_schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@ -75,20 +75,18 @@ json_schema = {
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True})
|
||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
|
||||
extract_config = JsonConfig(schema=json_schema)
|
||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["json"], json_options=extract_config)
|
||||
|
||||
print(llm_extraction_result.extract)
|
||||
print(llm_extraction_result.json)
|
||||
|
||||
# print(llm_extraction_result['llm_extraction'])
|
||||
print(llm_extraction_result['llm_extraction'])
|
||||
|
||||
|
||||
# Map a website:
|
||||
@ -108,6 +106,20 @@ extract_schema = ExtractSchema.schema()
|
||||
extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
|
||||
print(extract_result)
|
||||
|
||||
|
||||
# Deep research example
|
||||
research_result = app.deep_research(
|
||||
"What are the latest developments in large language models?",
|
||||
max_urls=4
|
||||
)
|
||||
print("Research Results:", research_result)
|
||||
|
||||
# Generate LLMs.txt example
|
||||
llms_result = app.generate_llms_text(
|
||||
"https://firecrawl.dev")
|
||||
print("LLMs.txt Results:", llms_result)
|
||||
|
||||
|
||||
# Crawl a website with WebSockets:
|
||||
# inside an async function...
|
||||
import nest_asyncio
|
||||
@ -134,4 +146,15 @@ async def start_crawl_and_watch():
|
||||
watcher.add_event_listener("done", on_done)
|
||||
|
||||
# Start the watcher
|
||||
await watcher.connect()
|
||||
await watcher.connect()
|
||||
|
||||
|
||||
class ExtractSchema(BaseModel):
|
||||
company_mission: str
|
||||
supports_sso: bool
|
||||
is_open_source: bool
|
||||
is_in_yc: bool
|
||||
|
||||
extract_config = JsonConfig(schema=ExtractSchema.model_json_schema())
|
||||
data = app.scrape_url('https://docs.firecrawl.dev/', formats=['json'], json_options=extract_config)
|
||||
print(data.json)
|
Loading…
x
Reference in New Issue
Block a user