Add a new project to examples that shows how to scrape the Hacker News website

parent ce6d3e21e1
commit d777633b30
examples/hacker_news_scraper/bs4_scraper.py (new file)
import json
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel


class NewsItem(BaseModel):
    title: str
    source_url: str
    author: str
    rank: str
    upvotes: str
    date: str


BASE_URL = "https://news.ycombinator.com/"


def get_page_content():
    """
    Send a GET request to the Hacker News homepage and return the HTML content.
    """
    response = requests.get(BASE_URL)
    response.raise_for_status()
    return response.text


def get_title_rows(html_content, class_name):
    """
    Parse the HTML content and return all title rows matching the given class.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    title_rows = soup.find("table").find_all("tr", {"class": class_name})
    return title_rows


def get_subtext_rows(html_content):
    """
    Parse the HTML content and return the subtext cells holding each item's metadata.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    subtext_rows = soup.find("table").find_all("td", {"class": "subtext"})
    return subtext_rows


def get_news_data():
    """
    Extract the news data by pairing each title row with its subtext row.
    """
    # Fetch the page once and reuse the HTML for both parses
    html_content = get_page_content()
    title_rows = get_title_rows(html_content, "athing submission")
    subtext_rows = get_subtext_rows(html_content)

    news_data = []

    for title_row, subtext_row in zip(title_rows, subtext_rows):
        # Extract title information from the title row
        title_span = title_row.find("span", {"class": "titleline"})
        title = title_span.a.text
        url = title_span.a["href"]
        rank = title_row.find("span", {"class": "rank"}).text

        # Extract metadata from the subtext row
        author = BASE_URL + subtext_row.find("a", {"class": "hnuser"})["href"]
        upvotes = subtext_row.find("span", {"class": "score"}).text
        date = subtext_row.find("span", {"class": "age"}).get("title").split(" ")[0]

        news_data.append(
            NewsItem(
                title=title,
                source_url=url,
                author=author,
                rank=rank,
                upvotes=upvotes,
                date=date,
            )
        )

    return news_data


def save_news_data():
    """
    Save the scraped news data to a JSON file with the current date in the filename.
    """
    news_data = get_news_data()
    current_date = datetime.now().strftime("%Y_%m_%d_%H_%M")
    filename = f"hacker_news_data_{current_date}.json"

    with open(filename, "w") as f:
        # model_dump() replaces the deprecated dict() in pydantic v2
        json.dump([item.model_dump() for item in news_data], f, indent=4)

    return filename


if __name__ == "__main__":
    save_news_data()
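A minimal sketch for sanity-checking the output: it runs the scraper (one live request to news.ycombinator.com), then reloads the JSON it wrote and re-validates each record against the NewsItem model.

import json

from bs4_scraper import NewsItem, save_news_data

# save_news_data() scrapes the homepage and returns the filename it wrote
filename = save_news_data()
with open(filename) as f:
    items = [NewsItem(**record) for record in json.load(f)]
print(f"{len(items)} items saved to {filename}")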
examples/hacker_news_scraper/firecrawl_scraper.py (new file)
# firecrawl_scraper.py
import json
from datetime import datetime
from typing import List

from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field

# Load environment variables (including the Firecrawl API key) from a local .env file
load_dotenv()

BASE_URL = "https://news.ycombinator.com/"


class NewsItem(BaseModel):
    title: str = Field(description="The title of the news item")
    source_url: str = Field(description="The URL of the news item")
    author: str = Field(
        description="The URL of the post author's profile concatenated with the base URL."
    )
    rank: str = Field(description="The rank of the news item")
    upvotes: str = Field(description="The number of upvotes of the news item")
    date: str = Field(description="The date of the news item.")


class NewsData(BaseModel):
    news_items: List[NewsItem]


def get_firecrawl_news_data():
    """
    Scrape the Hacker News homepage with Firecrawl, using the NewsData schema
    to drive structured extraction.
    """
    app = FirecrawlApp()

    data = app.scrape_url(
        BASE_URL,
        params={
            "formats": ["extract"],
            "extract": {"schema": NewsData.model_json_schema()},
        },
    )

    return data


def save_firecrawl_news_data():
    """
    Save the scraped news data to a JSON file with the current date in the filename.
    """
    # Get the data
    data = get_firecrawl_news_data()
    # Format current date for filename
    date_str = datetime.now().strftime("%Y_%m_%d_%H_%M")
    filename = f"firecrawl_hacker_news_data_{date_str}.json"

    # Save the news items to JSON file
    with open(filename, "w") as f:
        json.dump(data["extract"]["news_items"], f, indent=4)

    return filename


if __name__ == "__main__":
    save_firecrawl_news_data()
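FirecrawlApp() reads its credentials from the environment, so running this script assumes a .env file next to it; a minimal sketch (the key value is a placeholder):

# .env
FIRECRAWL_API_KEY=fc-your-api-key

The load_dotenv() call at the top of the script loads this file before the client is constructed.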
examples/hacker_news_scraper/requirements.txt (new file)

requests
beautifulsoup4
pydantic
python-dotenv
firecrawl-py
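To run the examples locally, the usual pip workflow applies (a sketch, run from examples/hacker_news_scraper):

pip install -r requirements.txt
python bs4_scraper.py        # BeautifulSoup version
python firecrawl_scraper.py  # Firecrawl structured-extraction version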