firecrawl/examples/hacker_news_scraper/firecrawl_scraper.py
2024-11-28 15:02:29 +05:00

62 lines
1.6 KiB
Python

# firecrawl_scraper.py
import json
from firecrawl import FirecrawlApp
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List
from datetime import datetime
# Load variables from a .env file into the process environment at import time.
# NOTE(review): presumably this is how FirecrawlApp() obtains its API key --
# confirm against the Firecrawl SDK configuration.
load_dotenv()
# URL handed to Firecrawl for scraping (the Hacker News root page).
BASE_URL = "https://news.ycombinator.com/"
# Schema for a single extracted Hacker News entry.
#
# NOTE: documented with comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into the generated JSON schema, which
# would change the schema this script sends as the extraction request.
# The Field descriptions below are part of that schema, so they are runtime
# text, not mere documentation.
class NewsItem(BaseModel):
    title: str = Field(
        description="The title of the news item",
    )
    source_url: str = Field(
        description="The URL of the news item",
    )
    author: str = Field(
        description="The URL of the post author's profile concatenated with the base URL.",
    )
    # rank and upvotes are declared as strings, not integers.
    rank: str = Field(
        description="The rank of the news item",
    )
    upvotes: str = Field(
        description="The number of upvotes of the news item",
    )
    date: str = Field(
        description="The date of the news item.",
    )
# Top-level extraction schema: a container holding the list of NewsItem
# entries under the "news_items" key.
# (Comment instead of a docstring so the generated JSON schema is unchanged.)
class NewsData(BaseModel):
    news_items: List[NewsItem]
def get_firecrawl_news_data():
    """Scrape ``BASE_URL`` with Firecrawl using the ``NewsData`` schema.

    Requests the ``"extract"`` format and passes the JSON schema generated
    from ``NewsData`` as the extraction schema.

    Returns:
        Whatever ``FirecrawlApp.scrape_url`` returns, unmodified. The caller
        in this file reads ``["extract"]["news_items"]`` from it.
    """
    client = FirecrawlApp()
    scrape_options = {
        "formats": ["extract"],
        "extract": {"schema": NewsData.model_json_schema()},
    }
    return client.scrape_url(BASE_URL, params=scrape_options)
def save_firecrawl_news_data():
    """Scrape the news data and write the extracted items to a JSON file.

    The file is written relative to the current working directory and is
    named ``firecrawl_hacker_news_data_<YYYY_MM_DD_HH_MM>.json``, using the
    local date and time (to the minute) at which the function runs. Two runs
    within the same minute therefore overwrite the same file.

    Returns:
        None. A timestamped confirmation line is printed on success.

    Raises:
        KeyError: If the scrape response lacks the ``"extract"`` or
            ``"news_items"`` key. Other errors from the scrape call or from
            writing the file propagate unchanged.
    """
    data = get_firecrawl_news_data()

    # Resolve the payload BEFORE opening the output file. Opening with "w"
    # creates/truncates the file immediately, so indexing inside the `with`
    # block would leave an empty JSON file behind whenever the response is
    # missing the expected keys.
    news_items = data["extract"]["news_items"]

    # Timestamp with minute resolution for the filename.
    date_str = datetime.now().strftime("%Y_%m_%d_%H_%M")
    filename = f"firecrawl_hacker_news_data_{date_str}.json"

    # Explicit encoding so the output does not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(news_items, f, indent=4)

    print(f"{datetime.now()}: Successfully saved the news data.")
# Script entry point: perform one scrape-and-save run when executed directly;
# importing this module does not trigger a scrape.
if __name__ == "__main__":
    save_firecrawl_news_data()