mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-02 03:10:39 +08:00
Nick: new examples
This commit is contained in:
parent 0001d6ea25
commit f3522666db
@@ -1,53 +1,45 @@
-import time
-import nest_asyncio
-import uuid
-from firecrawl.firecrawl import FirecrawlApp
+from firecrawl.firecrawl import ExtractConfig, FirecrawlApp
 from pydantic import BaseModel, Field
 from typing import List
+import time
 
-app = FirecrawlApp(api_url="https://api.firecrawl.dev")
+app = FirecrawlApp(api_key="fc-")
 
 # Scrape a website:
-scrape_result = app.scrape_url('firecrawl.dev')
-print(scrape_result['markdown'])
+scrape_result = app.scrape_url('example.com', formats=["markdown", "html"])
+print(scrape_result.markdown)
 
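Since both formats are requested above, the HTML rendering should be reachable the same way as the markdown. A one-line sketch, not part of the commit, assuming the response object exposes one attribute per requested format:

print(scrape_result.html)  # assumption: .html mirrors the requested "html" format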
 # Test batch scrape
 urls = ['https://example.com', 'https://docs.firecrawl.dev']
-batch_scrape_params = {
-    'formats': ['markdown', 'html'],
-}
 
 # Synchronous batch scrape
-batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
+batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"])
 print("Synchronous Batch Scrape Result:")
-print(batch_result['data'][0]['markdown'])
+print(batch_result.data[0].markdown)
 
 # Asynchronous batch scrape
-async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
+async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
 print("\nAsynchronous Batch Scrape Result:")
 print(async_batch_result)
 
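The asynchronous call returns a job handle rather than documents. A minimal polling sketch, not part of the commit, assuming the SDK exposes check_batch_scrape_status analogous to the check_crawl_status call used in the crawl section below:

# Hypothetical polling loop for the async batch job; check_batch_scrape_status
# is an assumption modeled on check_crawl_status below.
attempts = 15
batch_status = app.check_batch_scrape_status(async_batch_result.id)
while attempts > 0 and batch_status.status != 'completed':
    time.sleep(1)
    batch_status = app.check_batch_scrape_status(async_batch_result.id)
    attempts -= 1
print(batch_status.data[0].markdown)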
 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
-print(crawl_result)
+crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
+print(crawl_result.data[0].markdown)
 
 # Asynchronous Crawl a website:
-async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+async_result = app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
 print(async_result)
 
-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)
 
 attempts = 15
-while attempts > 0 and crawl_status['status'] != 'completed':
+while attempts > 0 and crawl_status.status != 'completed':
     print(crawl_status)
-    crawl_status = app.check_crawl_status(async_result['id'])
+    crawl_status = app.check_crawl_status(async_result.id)
     attempts -= 1
     time.sleep(1)
 
-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)
 
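The loop above retries up to 15 times with a one-second sleep. The same pattern, factored into a reusable helper; a sketch built only from the calls shown in this diff, not anything the SDK ships:

def wait_for_crawl(app, job_id, attempts=15, delay=1.0):
    # Poll check_crawl_status until the job completes or attempts run out;
    # returns the last status either way. The helper name is hypothetical.
    status = app.check_crawl_status(job_id)
    while attempts > 0 and status.status != 'completed':
        time.sleep(delay)
        status = app.check_crawl_status(job_id)
        attempts -= 1
    return status

crawl_status = wait_for_crawl(app, async_result.id)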
 # LLM Extraction:
@@ -61,14 +53,11 @@ class ArticleSchema(BaseModel):
 
 class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., description="Top 5 stories")
 
-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
-        'schema': TopArticlesSchema.model_json_schema()
-    }
-})
+extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
 
-print(llm_extraction_result['extract'])
+print(llm_extraction_result.extract)
 
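ArticleSchema itself sits outside the visible hunk (only the hunk header mentions it). An illustrative definition of the shape the example implies; the exact fields are an assumption, not the commit's content:

class ArticleSchema(BaseModel):
    # Hypothetical fields for a Hacker News story; the real class body
    # is not shown in this diff.
    title: str
    points: int
    by: str
    commentsURL: str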
 # Define schema to extract contents into using json schema
 json_schema = {
@@ -94,24 +83,16 @@ json_schema = {
     "required": ["top"]
 }
 
-app2 = FirecrawlApp(api_key="fc-", version="v0")
-
-llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': json_schema,
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
-    }
-})
-
-# print(llm_extraction_result['llm_extraction'])
+extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True})
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+
+print(llm_extraction_result.extract)
 
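The middle of json_schema also falls outside the hunk. Given the TopArticlesSchema model above, the elided body plausibly mirrors its JSON Schema; a hedged reconstruction, not the commit's actual content:

# Assumed shape only: a hand-written JSON Schema equivalent of TopArticlesSchema.
json_schema = {
    "type": "object",
    "properties": {
        "top": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "points": {"type": "number"},
                    "by": {"type": "string"},
                    "commentsURL": {"type": "string"}
                },
                "required": ["title", "points", "by", "commentsURL"]
            },
            "description": "Top 5 stories"
        }
    },
    "required": ["top"]
}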
 # Map a website:
-map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
+map_result = app.map_url('https://firecrawl.dev', search="blog")
 print(map_result)
 
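The mapped URLs chain naturally into the batch scrape shown earlier. A follow-on sketch, not in the commit, assuming the map result exposes a links list (the v1 map response shape):

# Assumption: map_result.links holds the matched URLs.
blog_pages = app.batch_scrape_urls(map_result.links[:5], formats=["markdown"])
print(blog_pages.data[0].markdown)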
 # Extract URLs:
@@ -124,14 +105,12 @@ class ExtractSchema(BaseModel):
 
 extract_schema = ExtractSchema.schema()
 
 # Perform the extraction
-extract_result = app.extract(['https://firecrawl.dev'], {
-    'prompt': "Extract the title, description, and links from the website",
-    'schema': extract_schema
-})
+extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
 print(extract_result)
 
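As with ArticleSchema, the body of ExtractSchema is outside the hunk. An illustrative model matching the prompt's title, description, and links wording; the fields are assumptions:

class ExtractSchema(BaseModel):
    # Hypothetical fields inferred from the extraction prompt above.
    title: str
    description: str
    links: List[str]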
 # Crawl a website with WebSockets:
 # inside an async function...
 import nest_asyncio
 nest_asyncio.apply()
 
 # Define event handlers
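The diff view is truncated after the event-handler comment. The firecrawl-py README documents this WebSocket pattern around crawl_url_and_watch; a sketch of how the section plausibly continues, hedged since none of it is in the visible hunk:

# Handlers for the watcher's events (names match the README pattern,
# but this continuation is an assumption, not the commit's content).
def on_document(detail):
    print("DOC", detail)

def on_error(detail):
    print("ERR", detail['error'])

def on_done(detail):
    print("DONE", detail['status'])

async def start_crawl_and_watch():
    # crawl_url_and_watch returns a watcher; listeners fire per document.
    watcher = app.crawl_url_and_watch('firecrawl.dev', exclude_paths=['blog/*'], limit=5)
    watcher.add_event_listener("document", on_document)
    watcher.add_event_listener("error", on_error)
    watcher.add_event_listener("done", on_done)
    await watcher.connect()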