mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-04 11:24:40 +08:00
99 lines
2.6 KiB
Python
99 lines
2.6 KiB
Python
import json
|
|
import requests
|
|
|
|
from bs4 import BeautifulSoup
|
|
from pydantic import BaseModel
|
|
from datetime import datetime
|
|
|
|
|
|
class NewsItem(BaseModel):
    """
    One story scraped from the Hacker News front page.

    Every field is kept as the raw string taken from the page markup; no
    numeric or date parsing is applied.
    """

    # Text of the story's headline link.
    title: str
    # The ``href`` of the headline link, exactly as it appears in the page.
    source_url: str
    # Link to the submitter's profile (site base URL + the user link's href).
    author: str
    # Text of the rank element shown next to the story.
    rank: str
    # Text of the score element shown under the story.
    upvotes: str
    # First space-separated token of the age element's ``title`` attribute.
    date: str
|
|
|
|
|
|
# Root URL of the site being scraped; also used as the prefix when building
# absolute links from hrefs found in the page.
BASE_URL = "https://news.ycombinator.com/"
|
|
|
|
|
|
def get_page_content():
    """
    Send a GET request to the Hacker News homepage and return the HTML content.

    Returns:
        str: The decoded body of the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not respond within
            the timeout.
    """
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the script forever.
    response = requests.get(BASE_URL, timeout=30)
    # Fail here on an error response rather than handing an error page to
    # the HTML parser and crashing later with an unrelated exception.
    response.raise_for_status()
    return response.text
|
|
|
|
|
|
def get_title_rows(html_content, class_name):
    """
    Return every ``<tr>`` with the given class found inside the first table.

    Args:
        html_content: HTML markup of the page.
        class_name: Class filter handed to BeautifulSoup's ``find_all``.

    Returns:
        The collection of matching ``<tr>`` elements produced by ``find_all``.
    """
    parsed_page = BeautifulSoup(html_content, "html.parser")
    first_table = parsed_page.find("table")
    return first_table.find_all("tr", attrs={"class": class_name})
|
|
|
|
|
|
def get_subtext_rows(html_content):
    """
    Return every ``<td class="subtext">`` cell found inside the first table.

    Args:
        html_content: HTML markup of the page.

    Returns:
        The collection of matching ``<td>`` elements produced by ``find_all``.
    """
    parsed_page = BeautifulSoup(html_content, "html.parser")
    first_table = parsed_page.find("table")
    return first_table.find_all("td", attrs={"class": "subtext"})
|
|
|
|
|
|
def get_news_data():
    """
    Scrape the Hacker News front page and return its stories.

    The page is downloaded once, and both the title rows and the subtext
    cells are taken from that single snapshot so that each title is paired
    with its own metadata.

    Returns:
        list[NewsItem]: One item per (title row, subtext cell) pair, in page
        order. ``author``, ``upvotes`` and ``date`` are empty strings when
        the corresponding element is absent from the subtext cell.
    """
    # Fetch a single snapshot: two separate requests may see different
    # rankings, which would zip a title with another story's metadata.
    html_content = get_page_content()
    title_rows = get_title_rows(html_content, "athing submission")
    subtext_rows = get_subtext_rows(html_content)

    news_data = []

    for title_row, subtext_row in zip(title_rows, subtext_rows):
        # Extract title information from the title row
        title_span = title_row.find("span", {"class": "titleline"})
        title = title_span.a.text
        url = title_span.a["href"]
        rank = title_row.find("span", {"class": "rank"}).text

        # Extract metadata from the subtext row. find() returns None when an
        # element is missing (presumably the case for job listings, which
        # seem to carry no submitter or score -- confirm against live
        # markup), so fall back to "" instead of crashing the whole scrape.
        user_link = subtext_row.find("a", {"class": "hnuser"})
        score_span = subtext_row.find("span", {"class": "score"})
        age_span = subtext_row.find("span", {"class": "age"})

        author = BASE_URL + user_link["href"] if user_link is not None else ""
        upvotes = score_span.text if score_span is not None else ""
        age_title = age_span.get("title") if age_span is not None else None
        # Only the first space-separated token of the title attribute is kept.
        date = age_title.split(" ")[0] if age_title else ""

        news_data.append(
            NewsItem(
                title=title,
                source_url=url,
                author=author,
                rank=rank,
                upvotes=upvotes,
                date=date,
            )
        )

    return news_data
|
|
|
|
|
|
def save_news_data():
    """
    Scrape the news data and dump it to a timestamped JSON file.

    The file is written to the current working directory and named
    ``hacker_news_data_<YYYY_MM_DD_HH_MM>.json``. A confirmation line is
    printed once the file has been written.
    """
    items = get_news_data()
    stamp = datetime.now().strftime("%Y_%m_%d_%H_%M")
    filename = f"hacker_news_data_{stamp}.json"

    with open(filename, "w") as out_file:
        # Convert each model to a plain dict so json can serialise it.
        records = []
        for item in items:
            records.append(item.dict())
        json.dump(records, out_file, indent=4)

    print(f"{datetime.now()}: Successfully saved the news data.")
|
|
|
|
|
|
# Script entry point: scrape and save only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    save_news_data()
|