2024-11-28 15:02:29 +05:00

99 lines
2.6 KiB
Python

import json
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel
from datetime import datetime
class NewsItem(BaseModel):
    """
    One scraped Hacker News front-page story.

    All fields are kept as raw strings, exactly as extracted from the page.
    """

    title: str  # headline text of the story link
    source_url: str  # href of the headline link (may be relative for self posts — TODO confirm)
    author: str  # NOTE(review): holds BASE_URL + profile href, i.e. a URL, not the username
    rank: str  # front-page position text, e.g. "1."
    upvotes: str  # score text, e.g. "123 points"
    date: str  # first token of the age span's "title" attribute (ISO-like timestamp)
# Hacker News front page; also used as the prefix for relative links (user profiles).
BASE_URL = "https://news.ycombinator.com/"
def get_page_content():
    """
    Fetch the Hacker News homepage and return its HTML.

    Returns:
        str: The raw HTML of the front page.

    Raises:
        requests.Timeout: If the request exceeds the timeout.
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    # A timeout stops the scraper from hanging forever on a stalled
    # connection; raise_for_status surfaces HTTP errors instead of
    # silently handing an error page to the parser downstream.
    response = requests.get(BASE_URL, timeout=10)
    response.raise_for_status()
    return response.text
def get_title_rows(html_content, class_name):
    """
    Parse *html_content* and return every <tr> inside the first <table>
    whose class attribute matches *class_name*.
    """
    document = BeautifulSoup(html_content, "html.parser")
    table = document.find("table")
    return table.find_all("tr", {"class": class_name})
def get_subtext_rows(html_content):
    """
    Parse *html_content* and return every <td class="subtext"> cell
    found inside the first <table> (one per story, holding its metadata).
    """
    document = BeautifulSoup(html_content, "html.parser")
    table = document.find("table")
    return table.find_all("td", {"class": "subtext"})
def get_news_data():
    """
    Scrape the Hacker News front page and return a list of NewsItem objects.

    Returns:
        list[NewsItem]: One item per story, pairing each title row with its
        matching subtext (metadata) row, in page order.
    """
    # Fetch the page ONCE and reuse the HTML for both parses. The original
    # called get_page_content() twice, doubling the HTTP traffic and risking
    # a mismatch: two fetches can return different page snapshots, silently
    # pairing a title row with the wrong subtext row.
    html_content = get_page_content()
    title_rows = get_title_rows(html_content, "athing submission")
    subtext_rows = get_subtext_rows(html_content)
    news_data = []
    for title_row, subtext_row in zip(title_rows, subtext_rows):
        # Title, link and rank live in the title row.
        title_span = title_row.find("span", {"class": "titleline"})
        title = title_span.a.text
        url = title_span.a["href"]
        rank = title_row.find("span", {"class": "rank"}).text
        # Author link, score and timestamp live in the subtext row.
        # NOTE(review): despite the name, "author" is the submitter's profile
        # URL (BASE_URL + relative href), not the username.
        author = BASE_URL + subtext_row.find("a", {"class": "hnuser"})["href"]
        # NOTE(review): job/ad rows have no score or hnuser link; if one ever
        # matches the row filter this will raise — confirm against live pages.
        upvotes = subtext_row.find("span", {"class": "score"}).text
        # The age span's "title" attribute looks like "<timestamp> <epoch>";
        # keep only the first (timestamp) token.
        date = subtext_row.find("span", {"class": "age"}).get("title").split(" ")[0]
        news_data.append(
            NewsItem(
                title=title,
                source_url=url,
                author=author,
                rank=rank,
                upvotes=upvotes,
                date=date,
            )
        )
    return news_data
def save_news_data():
    """
    Scrape the front page and save the result to a timestamped JSON file.

    The file is named ``hacker_news_data_YYYY_MM_DD_HH_MM.json`` and written
    to the current working directory; a confirmation line is printed.
    """
    news_data = get_news_data()
    current_date = datetime.now().strftime("%Y_%m_%d_%H_%M")
    filename = f"hacker_news_data_{current_date}.json"
    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) can fail
    # on non-ASCII characters in story titles.
    with open(filename, "w", encoding="utf-8") as f:
        # NOTE(review): .dict() is the pydantic v1 API (deprecated in v2 in
        # favor of .model_dump()) — confirm the pinned pydantic version.
        json.dump([item.dict() for item in news_data], f, indent=4)
    print(f"{datetime.now()}: Successfully saved the news data.")
if __name__ == "__main__":
    # Run the scraper once when executed as a script (not on import).
    save_news_data()