diff --git a/examples/hacker_news_scraper/bs4_scraper.py b/examples/hacker_news_scraper/bs4_scraper.py
new file mode 100644
index 00000000..3c215d62
--- /dev/null
+++ b/examples/hacker_news_scraper/bs4_scraper.py
@@ -0,0 +1,105 @@
+import json
+import requests
+
+from bs4 import BeautifulSoup
+from pydantic import BaseModel
+from datetime import datetime
+
+
+class NewsItem(BaseModel):
+    title: str
+    source_url: str
+    author: str
+    rank: str
+    upvotes: str
+    date: str
+
+
+BASE_URL = "https://news.ycombinator.com/"
+
+
+def get_page_content():
+    """
+    Send a GET request to the Hacker News homepage and return the HTML content.
+    """
+    response = requests.get(BASE_URL)
+    response.raise_for_status()
+    return response.text
+
+
+def get_title_rows(html_content, class_name):
+    """
+    Parse the HTML content and return all story rows matching the given class.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+    title_rows = soup.find("table").find_all("tr", {"class": class_name})
+    return title_rows
+
+
+def get_subtext_rows(html_content):
+    """
+    Parse the HTML content and return the subtext rows.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+    subtext_rows = soup.find("table").find_all("td", {"class": "subtext"})
+    return subtext_rows
+
+
+def get_news_data():
+    """
+    Extract the news data from the title and subtext rows.
+    """
+    # Fetch the page once and reuse the HTML for both queries.
+    html_content = get_page_content()
+    title_rows = get_title_rows(html_content, "athing submission")
+    subtext_rows = get_subtext_rows(html_content)
+
+    news_data = []
+
+    for title_row, subtext_row in zip(title_rows, subtext_rows):
+        # Extract title information from the title row
+        title_span = title_row.find("span", {"class": "titleline"})
+        title = title_span.a.text
+        url = title_span.a["href"]
+        rank = title_row.find("span", {"class": "rank"}).text
+
+        # Extract metadata from the subtext row; rows without a score or
+        # author link (e.g. job postings) are skipped.
+        score_span = subtext_row.find("span", {"class": "score"})
+        user_link = subtext_row.find("a", {"class": "hnuser"})
+        if score_span is None or user_link is None:
+            continue
+        author = BASE_URL + user_link["href"]
+        upvotes = score_span.text
+        date = subtext_row.find("span", {"class": "age"}).get("title").split(" ")[0]
+
+        news_data.append(
+            NewsItem(
+                title=title,
+                source_url=url,
+                author=author,
+                rank=rank,
+                upvotes=upvotes,
+                date=date,
+            )
+        )
+
+    return news_data
+
+
+def save_news_data():
+    """
+    Save the scraped news data to a JSON file with the current date in the filename.
+    """
+    news_data = get_news_data()
+    current_date = datetime.now().strftime("%Y_%m_%d_%H_%M")
+    filename = f"hacker_news_data_{current_date}.json"
+
+    with open(filename, "w") as f:
+        json.dump([item.model_dump() for item in news_data], f, indent=4)
+
+    return filename
+
+
+if __name__ == "__main__":
+    save_news_data()
diff --git a/examples/hacker_news_scraper/firecrawl_scraper.py b/examples/hacker_news_scraper/firecrawl_scraper.py
new file mode 100644
index 00000000..d53621b6
--- /dev/null
+++ b/examples/hacker_news_scraper/firecrawl_scraper.py
@@ -0,0 +1,64 @@
+# firecrawl_scraper.py
+import json
+from firecrawl import FirecrawlApp
+from dotenv import load_dotenv
+from pydantic import BaseModel, Field
+from typing import List
+from datetime import datetime
+
+load_dotenv()
+
+BASE_URL = "https://news.ycombinator.com/"
+
+
+class NewsItem(BaseModel):
+    title: str = Field(description="The title of the news item")
+    source_url: str = Field(description="The URL of the news item")
+    author: str = Field(
+        description="The URL of the post author's profile concatenated with the base URL."
+    )
+    rank: str = Field(description="The rank of the news item")
+    upvotes: str = Field(description="The number of upvotes of the news item")
+    date: str = Field(description="The date of the news item")
+
+
+class NewsData(BaseModel):
+    news_items: List[NewsItem]
+
+
+def get_firecrawl_news_data():
+    """
+    Scrape the Hacker News homepage with Firecrawl and return the extracted data.
+    """
+    app = FirecrawlApp()
+
+    data = app.scrape_url(
+        BASE_URL,
+        params={
+            "formats": ["extract"],
+            "extract": {"schema": NewsData.model_json_schema()},
+        },
+    )
+
+    return data
+
+
+def save_firecrawl_news_data():
+    """
+    Save the scraped news data to a JSON file with the current date in the filename.
+    """
+    # Get the data
+    data = get_firecrawl_news_data()
+    # Format current date for filename
+    date_str = datetime.now().strftime("%Y_%m_%d_%H_%M")
+    filename = f"firecrawl_hacker_news_data_{date_str}.json"
+
+    # Save the news items to JSON file
+    with open(filename, "w") as f:
+        json.dump(data["extract"]["news_items"], f, indent=4)
+
+    return filename
+
+
+if __name__ == "__main__":
+    save_firecrawl_news_data()
diff --git a/examples/hacker_news_scraper/requirements.txt b/examples/hacker_news_scraper/requirements.txt
new file mode 100644
index 00000000..e9d71878
--- /dev/null
+++ b/examples/hacker_news_scraper/requirements.txt
@@ -0,0 +1,5 @@
+requests
+beautifulsoup4
+pydantic
+python-dotenv
+firecrawl-py
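
For a quick end-to-end check of the example, a minimal sketch along these lines could re-load the saved JSON and validate every record against the shared schema. The `check_output.py` name is hypothetical (it is assumed to sit next to the two scrapers in `examples/hacker_news_scraper/`), and running it performs a live request against news.ycombinator.com:

```python
# check_output.py -- hypothetical helper, assumed to live alongside the
# two scrapers in examples/hacker_news_scraper/.
import json

from bs4_scraper import save_news_data
from firecrawl_scraper import NewsItem

# Run the BeautifulSoup scraper; swap in
# firecrawl_scraper.save_firecrawl_news_data to exercise the Firecrawl path.
filename = save_news_data()

# Re-load the file and validate each record against the NewsItem schema;
# pydantic raises a ValidationError if any field is missing or mistyped.
with open(filename) as f:
    items = [NewsItem(**record) for record in json.load(f)]

print(f"{filename}: {len(items)} items; top story: {items[0].title!r}")
```

Because both scripts emit the same six string fields, either scraper's output should validate against the Firecrawl `NewsItem` model, which is what makes the two implementations directly comparable.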