diff --git a/apps/python-sdk/example.py b/apps/python-sdk/example.py
index ae4258f7..705d2e0c 100644
--- a/apps/python-sdk/example.py
+++ b/apps/python-sdk/example.py
@@ -1,53 +1,45 @@
-import time
-import nest_asyncio
-import uuid
-from firecrawl.firecrawl import FirecrawlApp
+from firecrawl.firecrawl import ExtractConfig, FirecrawlApp
 from pydantic import BaseModel, Field
 from typing import List
+import time

+app = FirecrawlApp(api_url="https://api.firecrawl.dev")

-app = FirecrawlApp(api_key="fc-")
-
-# Scrape a website:
-scrape_result = app.scrape_url('firecrawl.dev')
-print(scrape_result['markdown'])
+# # Scrape a website:
+scrape_result = app.scrape_url('example.com', formats=["markdown", "html"])
+print(scrape_result.markdown)

-# Test batch scrape
+# # Test batch scrape
 urls = ['https://example.com', 'https://docs.firecrawl.dev']
-batch_scrape_params = {
-    'formats': ['markdown', 'html'],
-}
-
 # Synchronous batch scrape
-batch_result = app.batch_scrape_urls(urls, batch_scrape_params)
+batch_result = app.batch_scrape_urls(urls, formats=["markdown", "html"])
 print("Synchronous Batch Scrape Result:")
-print(batch_result['data'][0]['markdown'])
+print(batch_result.data[0].markdown)

-# Asynchronous batch scrape
-async_batch_result = app.async_batch_scrape_urls(urls, batch_scrape_params)
+# # Asynchronous batch scrape
+async_batch_result = app.async_batch_scrape_urls(urls, formats=["markdown", "html"])
 print("\nAsynchronous Batch Scrape Result:")
 print(async_batch_result)

 # Crawl a website:
-idempotency_key = str(uuid.uuid4()) # optional idempotency key
-crawl_result = app.crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, 2, idempotency_key)
-print(crawl_result)
+crawl_result = app.crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
+print(crawl_result.data[0].markdown)

-# Asynchronous Crawl a website:
-async_result = app.async_crawl_url('firecrawl.dev', {'excludePaths': ['blog/*']}, "")
+# # Asynchronous Crawl a website:
+async_result = app.async_crawl_url('firecrawl.dev', exclude_paths=['blog/*'])
 print(async_result)

-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)

 attempts = 15
-while attempts > 0 and crawl_status['status'] != 'completed':
+while attempts > 0 and crawl_status.status != 'completed':
     print(crawl_status)
-    crawl_status = app.check_crawl_status(async_result['id'])
+    crawl_status = app.check_crawl_status(async_result.id)
     attempts -= 1
     time.sleep(1)

-crawl_status = app.check_crawl_status(async_result['id'])
+crawl_status = app.check_crawl_status(async_result.id)
 print(crawl_status)

 # LLM Extraction:
@@ -61,14 +53,11 @@ class ArticleSchema(BaseModel):
 class TopArticlesSchema(BaseModel):
     top: List[ArticleSchema] = Field(..., description="Top 5 stories")

-llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
-    'formats': ['extract'],
-    'extract': {
-        'schema': TopArticlesSchema.model_json_schema()
-    }
-})
+extract_config = ExtractConfig(schema=TopArticlesSchema.model_json_schema())

-print(llm_extraction_result['extract'])
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)
+
+print(llm_extraction_result.extract)

 # # Define schema to extract contents into using json schema
 json_schema = {
@@ -94,24 +83,16 @@ json_schema = {
     "required": ["top"]
 }

-app2 = FirecrawlApp(api_key="fc-", version="v0")
+extract_config = ExtractConfig(extractionSchema=json_schema, mode="llm-extraction", pageOptions={"onlyMainContent": True})
+llm_extraction_result = app.scrape_url('https://news.ycombinator.com', formats=["extract"], extract=extract_config)

-
-llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': json_schema,
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
-    }
-})
+print(llm_extraction_result.extract)

 # print(llm_extraction_result['llm_extraction'])

 # Map a website:
-map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
+map_result = app.map_url('https://firecrawl.dev', search="blog")
 print(map_result)


 # Extract URLs:
@@ -124,14 +105,12 @@ class ExtractSchema(BaseModel):
 extract_schema = ExtractSchema.schema()

 # Perform the extraction
-extract_result = app.extract(['https://firecrawl.dev'], {
-    'prompt': "Extract the title, description, and links from the website",
-    'schema': extract_schema
-})
+extract_result = app.extract(['https://firecrawl.dev'], prompt="Extract the title, description, and links from the website", schema=extract_schema)
 print(extract_result)

 # Crawl a website with WebSockets:
 # inside an async function...
+import nest_asyncio
 nest_asyncio.apply()

 # Define event handlers