Nick: new examples

Nicolas 2025-04-18 01:13:53 -07:00
parent 0001d6ea25
commit f3522666db

@@ -1,53 +1,45 @@
-import time
-import uuid
-from firecrawl.firecrawl import FirecrawlApp
+from firecrawl.firecrawl import ExtractConfig, FirecrawlApp
+import nest_asyncio
 from pydantic import BaseModel, Field
 from typing import List
+import time

-app = FirecrawlApp(api_key="fc-")
+app = FirecrawlApp(api_url="https://api.firecrawl.dev")

 # Scrape a website:
-scrape_result = app.scrape_url('firecrawl.dev')
-print(scrape_result['markdown'])
+scrape_result = app.scrape_url('example.com', formats=["markdown", "html"])
+print(scrape_result.markdown)

 # Test batch scrape
 urls = ['https://example.com', 'https://docs.firecrawl.dev']
-batch_scrape_params = {
-    'formats': ['markdown', 'html'],
-}

 # Synchronous batch scrape
-batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
+batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"])
 print("Synchronous Batch Scrape Result:")
-print(batch_result['data'][0]['markdown'])
+print(batch_result.data[0].markdown)

 # Asynchronous batch scrape
-async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
+async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
 print("\nAsynchronous Batch Scrape Result:")
 print(async_batch_result)

 # Crawl a website:
-idempotency_key = str(uuid.uuid4())  # optional idempotency key
-crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
-print(crawl_result)
+crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
+print(crawl_result.data[0].markdown)

 # Asynchronous Crawl a website:
-async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+async_result = app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
 print(async_result)

-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)

 attempts = 15
-while attempts > 0 and crawl_status['status'] != 'completed':
+while attempts > 0 and crawl_status.status != 'completed':
     print(crawl_status)
-    crawl_status = app.check_crawl_status(async_result['id'])
+    crawl_status = app.check_crawl_status(async_result.id)
     attempts -= 1
     time.sleep(1)

-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)

 # LLM Extraction:
@@ -61,14 +53,11 @@ class ArticleSchema(BaseModel):
 class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., description="Top 5 stories")

-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
-        'schema': TopArticlesSchema.model_json_schema()
-    }
-})
-print(llm_extraction_result['extract'])
+extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+print(llm_extraction_result.extract)

 # Define schema to extract contents into using json schema
 json_schema = {
@@ -94,24 +83,16 @@ json_schema = {
     "required": ["top"]
 }

-app2 = FirecrawlApp(api_key="fc-", version="v0")
-
-llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': json_schema,
-        'mode': 'llm-extraction'
-    },
-    'pageOptions': {
-        'onlyMainContent': True
-    }
-})
+extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True})
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+print(llm_extraction_result.extract)
 # print(llm_extraction_result['llm_extraction'])

 # Map a website:
-map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
+map_result = app.map_url('https://firecrawl.dev', search="blog")
 print(map_result)

 # Extract URLs:
@@ -124,14 +105,12 @@ class ExtractSchema(BaseModel):
 extract_schema = ExtractSchema.schema()

 # Perform the extraction
-extract_result = app.extract(['https://firecrawl.dev'], {
-    'prompt': "Extract the title, description, and links from the website",
-    'schema': extract_schema
-})
+extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
 print(extract_result)

 # Crawl a website with WebSockets:
 # inside an async function...
-import nest_asyncio
 nest_asyncio.apply()

 # Define event handlers
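
The hunk cuts off right at the event handlers. For context, a minimal sketch of how the WebSocket example typically continues, following the crawl_url_and_watch / CrawlWatcher pattern from the firecrawl-py README and the keyword-argument style this commit adopts; exact method names and signatures vary by SDK version, so treat this as an illustration rather than part of the commit:

# Define event handlers for documents, errors, and completion
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

async def start_crawl_and_watch():
    # Start the crawl job and get a watcher for its WebSocket events
    # (crawl_url_and_watch and the limit parameter are assumptions from the SDK README)
    watcher = app.crawl_url_and_watch('firecrawl.dev', exclude_paths=['blog/*'], limit=5)

    # Register the handlers defined above
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)

    # Open the connection and stream events until the crawl finishes
    await watcher.connect()

# Run inside the nest_asyncio-patched event loop
await start_crawl_and_watch()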