diff --git a/examples/automated_price_tracking/.github/workflows/check_prices.yml b/examples/automated_price_tracking/.github/workflows/check_prices.yml new file mode 100644 index 00000000..5bd0e671 --- /dev/null +++ b/examples/automated_price_tracking/.github/workflows/check_prices.yml @@ -0,0 +1,33 @@ +name: Price Check + +on: + schedule: + # Runs every 6 hours + - cron: "0 0,6,12,18 * * *" + workflow_dispatch: # Allows manual triggering + +jobs: + check-prices: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run price checker + env: + FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} + POSTGRES_URL: ${{ secrets.POSTGRES_URL }} + DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} + run: python check_prices.py diff --git a/examples/automated_price_tracking/.gitignore b/examples/automated_price_tracking/.gitignore new file mode 100644 index 00000000..1d17dae1 --- /dev/null +++ b/examples/automated_price_tracking/.gitignore @@ -0,0 +1 @@ +.venv diff --git a/examples/automated_price_tracking/README.md b/examples/automated_price_tracking/README.md new file mode 100644 index 00000000..9ab50dbe --- /dev/null +++ b/examples/automated_price_tracking/README.md @@ -0,0 +1,31 @@ +# Automated Price Tracking System + +A robust price tracking system that monitors product prices across e-commerce websites and notifies users of price changes through Discord. + +## Features + +- Automated price checking every 6 hours +- Support for multiple e-commerce platforms through Firecrawl API +- Discord notifications for price changes +- Historical price data storage in PostgreSQL database +- Interactive price history visualization with Streamlit + +## Setup + +1. Clone the repository +2. Install dependencies: + + ```bash + pip install -r requirements.txt + ``` + +3. 
Configure environment variables: + + ```bash + cp .env.example .env + ``` + + Then edit `.env` with your: + - Discord webhook URL + - Database credentials + - Firecrawl API key diff --git a/examples/automated_price_tracking/check_prices.py b/examples/automated_price_tracking/check_prices.py new file mode 100644 index 00000000..33a48843 --- /dev/null +++ b/examples/automated_price_tracking/check_prices.py @@ -0,0 +1,49 @@ +import os +import asyncio +from database import Database +from dotenv import load_dotenv +from firecrawl import FirecrawlApp +from scraper import scrape_product +from notifications import send_price_alert + +load_dotenv() + +db = Database(os.getenv("POSTGRES_URL")) +app = FirecrawlApp() + +# Threshold percentage for price drop alerts (e.g., 5% = 0.05) +PRICE_DROP_THRESHOLD = 0.05 + + +async def check_prices(): + products = db.get_all_products() + product_urls = set(product.url for product in products) + + for product_url in product_urls: + # Get the price history + price_history = db.get_price_history(product_url) + if not price_history: + continue + + # Get the earliest recorded price + earliest_price = price_history[-1].price + + # Retrieve updated product data + updated_product = scrape_product(product_url) + current_price = updated_product["price"] + + # Add the price to the database + db.add_price(updated_product) + print(f"Added new price entry for {updated_product['name']}") + + # Check if price dropped below threshold + if earliest_price > 0: # Avoid division by zero + price_drop = (earliest_price - current_price) / earliest_price + if price_drop >= PRICE_DROP_THRESHOLD: + await send_price_alert( + updated_product["name"], earliest_price, current_price, product_url + ) + + +if __name__ == "__main__": + asyncio.run(check_prices()) diff --git a/examples/automated_price_tracking/database.py b/examples/automated_price_tracking/database.py new file mode 100644 index 00000000..2aec92a8 --- /dev/null +++ b/examples/automated_price_tracking/database.py @@ -0,0 +1,134 @@ +from sqlalchemy import create_engine, Column, String, Float, DateTime, ForeignKey +from sqlalchemy.orm import sessionmaker, relationship, declarative_base +from datetime import datetime + +Base = declarative_base() + + +class Product(Base): + __tablename__ = "products" + + url = Column(String, primary_key=True) + prices = relationship( + "PriceHistory", back_populates="product", cascade="all, delete-orphan" + ) + + +class PriceHistory(Base): + __tablename__ = "price_histories" + + id = Column(String, primary_key=True) + product_url = Column(String, ForeignKey("products.url")) + name = Column(String, nullable=False) + price = Column(Float, nullable=False) + currency = Column(String, nullable=False) + main_image_url = Column(String) + timestamp = Column(DateTime, nullable=False) + product = relationship("Product", back_populates="prices") + + +class Database: + def __init__(self, connection_string): + self.engine = create_engine(connection_string) + Base.metadata.create_all(self.engine) + self.Session = sessionmaker(bind=self.engine) + + def add_product(self, url): + session = self.Session() + try: + # Create the product entry + product = Product(url=url) + session.merge(product) # merge will update if exists, insert if not + session.commit() + finally: + session.close() + + def product_exists(self, url): + session = self.Session() + try: + return session.query(Product).filter(Product.url == url).first() is not None + finally: + session.close() + + def add_price(self, product_data): + session = 
self.Session() + try: + # First ensure the product exists + if not self.product_exists(product_data["url"]): + # Create the product if it doesn't exist + product = Product(url=product_data["url"]) + session.add(product) + session.flush() # Flush to ensure the product is created before adding price + + # Convert timestamp string to datetime if it's a string + timestamp = product_data["timestamp"] + if isinstance(timestamp, str): + timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H-%M") + + price_history = PriceHistory( + id=f"{product_data['url']}_{timestamp.strftime('%Y%m%d%H%M%S')}", + product_url=product_data["url"], + name=product_data["name"], + price=product_data["price"], + currency=product_data["currency"], + main_image_url=product_data["main_image_url"], + timestamp=timestamp, + ) + session.add(price_history) + session.commit() + finally: + session.close() + + def get_all_products(self): + session = self.Session() + try: + return session.query(Product).all() + finally: + session.close() + + def get_price_history(self, url): + """Get price history for a product""" + session = self.Session() + try: + return ( + session.query(PriceHistory) + .filter(PriceHistory.product_url == url) + .order_by(PriceHistory.timestamp.desc()) + .all() + ) + finally: + session.close() + + def remove_all_products(self): + session = self.Session() + try: + # First delete all price histories + session.query(PriceHistory).delete() + # Then delete all products + session.query(Product).delete() + session.commit() + finally: + session.close() + + # def remove_product(self, url): + # """Remove a product and its price history""" + # session = self.Session() + # try: + # product = session.query(Product).filter(Product.url == url).first() + # if product: + # session.delete( + # product + # ) # This will also delete associated price history due to cascade + # session.commit() + # finally: + # session.close() + + +if __name__ == "__main__": + from dotenv import load_dotenv + import os + + load_dotenv() + + db = Database(os.getenv("POSTGRES_URL")) + db.remove_all_products() diff --git a/examples/automated_price_tracking/notifications.py b/examples/automated_price_tracking/notifications.py new file mode 100644 index 00000000..2837fb70 --- /dev/null +++ b/examples/automated_price_tracking/notifications.py @@ -0,0 +1,36 @@ +from dotenv import load_dotenv +import os +import aiohttp +import asyncio + +load_dotenv() + + +async def send_price_alert( + product_name: str, old_price: float, new_price: float, url: str +): + """Send a price drop alert to Discord""" + drop_percentage = ((old_price - new_price) / old_price) * 100 + + message = { + "embeds": [ + { + "title": "Price Drop Alert! 
๐ŸŽ‰", + "description": f"**{product_name}**\nPrice dropped by {drop_percentage:.1f}%!\n" + f"Old price: ${old_price:.2f}\n" + f"New price: ${new_price:.2f}\n" + f"[View Product]({url})", + "color": 3066993, + } + ] + } + + try: + async with aiohttp.ClientSession() as session: + await session.post(os.getenv("DISCORD_WEBHOOK_URL"), json=message) + except Exception as e: + print(f"Error sending Discord notification: {e}") + + +if __name__ == "__main__": + asyncio.run(send_price_alert("Test Product", 100, 90, "https://www.google.com")) diff --git a/examples/automated_price_tracking/requirements.txt b/examples/automated_price_tracking/requirements.txt new file mode 100644 index 00000000..52f0541b --- /dev/null +++ b/examples/automated_price_tracking/requirements.txt @@ -0,0 +1,9 @@ +streamlit +firecrawl-py +pydantic +psycopg2-binary +python-dotenv +sqlalchemy==2.0.35 +pandas +plotly +aiohttp \ No newline at end of file diff --git a/examples/automated_price_tracking/scraper.py b/examples/automated_price_tracking/scraper.py new file mode 100644 index 00000000..fc06b73e --- /dev/null +++ b/examples/automated_price_tracking/scraper.py @@ -0,0 +1,38 @@ +from firecrawl import FirecrawlApp +from pydantic import BaseModel, Field +from datetime import datetime +from dotenv import load_dotenv + +load_dotenv() +app = FirecrawlApp() + + +class Product(BaseModel): + """Schema for creating a new product""" + + url: str = Field(description="The URL of the product") + name: str = Field(description="The product name/title") + price: float = Field(description="The current price of the product") + currency: str = Field(description="Currency code (USD, EUR, etc)") + main_image_url: str = Field(description="The URL of the main image of the product") + + +def scrape_product(url: str): + extracted_data = app.scrape_url( + url, + params={ + "formats": ["extract"], + "extract": {"schema": Product.model_json_schema()}, + }, + ) + + # Add the scraping date to the extracted data + extracted_data["extract"]["timestamp"] = datetime.utcnow() + + return extracted_data["extract"] + + +if __name__ == "__main__": + product = "https://www.amazon.com/gp/product/B002U21ZZK/" + + print(scrape_product(product)) diff --git a/examples/automated_price_tracking/ui.py b/examples/automated_price_tracking/ui.py new file mode 100644 index 00000000..11969897 --- /dev/null +++ b/examples/automated_price_tracking/ui.py @@ -0,0 +1,86 @@ +import os +import streamlit as st +import pandas as pd +import plotly.express as px + +from utils import is_valid_url +from database import Database +from dotenv import load_dotenv +from scraper import scrape_product + +load_dotenv() + +st.set_page_config(page_title="Price Tracker", page_icon="๐Ÿ“Š", layout="wide") + +with st.spinner("Loading database..."): + db = Database(os.getenv("POSTGRES_URL")) + + +# Set up sidebar +with st.sidebar: + st.title("Add New Product") + product_url = st.text_input("Product URL") + add_button = st.button("Add Product") + + if add_button: + if not product_url: + st.error("Please enter a product URL") + elif not is_valid_url(product_url): + st.error("Please enter a valid URL") + else: + db.add_product(product_url) + with st.spinner("Added product to database. 
Scraping product data..."): + product_data = scrape_product(product_url) + db.add_price(product_data) + st.success("Product is now being tracked!") + +# Main content +st.title("Price Tracker Dashboard") +st.markdown("## Tracked Products") + +# Get all products and their price histories +products = db.get_all_products() + +# Create a card for each product +for product in products: + price_history = db.get_price_history(product.url) + if price_history: + # Create DataFrame for plotting + df = pd.DataFrame( + [ + {"timestamp": ph.timestamp, "price": ph.price, "name": ph.name} + for ph in price_history + ] + ) + + # Create a card-like container for each product + with st.expander(df["name"][0], expanded=False): + st.markdown("---") + col1, col2 = st.columns([1, 3]) + + with col1: + if price_history[0].main_image_url: + st.image(price_history[0].main_image_url, width=200) + st.metric( + label="Current Price", + value=f"{price_history[0].price} {price_history[0].currency}", + ) + + with col2: + # Create price history plot + fig = px.line( + df, + x="timestamp", + y="price", + title=None, + ) + fig.update_layout( + xaxis_title=None, + yaxis_title="Price ($)", + showlegend=False, + margin=dict(l=0, r=0, t=0, b=0), + height=300, + ) + fig.update_xaxes(tickformat="%Y-%m-%d %H:%M", tickangle=45) + fig.update_yaxes(tickprefix="$", tickformat=".2f") + st.plotly_chart(fig, use_container_width=True) diff --git a/examples/automated_price_tracking/utils.py b/examples/automated_price_tracking/utils.py new file mode 100644 index 00000000..c7af0a94 --- /dev/null +++ b/examples/automated_price_tracking/utils.py @@ -0,0 +1,28 @@ +from urllib.parse import urlparse +import re + + +def is_valid_url(url: str) -> bool: + try: + # Parse the URL + result = urlparse(url) + + # Check if scheme and netloc are present + if not all([result.scheme, result.netloc]): + return False + + # Check if scheme is http or https + if result.scheme not in ["http", "https"]: + return False + + # Basic regex pattern for domain validation + domain_pattern = ( + r"^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z]{2,})+$" + ) + if not re.match(domain_pattern, result.netloc): + return False + + return True + + except Exception: + return False diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/actions.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/actions.png new file mode 100644 index 00000000..87524177 Binary files /dev/null and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/actions.png differ diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/alert.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/alert.png new file mode 100644 index 00000000..34bb5234 Binary files /dev/null and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/alert.png differ diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/discord.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/discord.png new file mode 100644 index 00000000..aa845f68 Binary files /dev/null and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/discord.png differ diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/finished.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/finished.png new file mode 100644 index 00000000..f172000b Binary files /dev/null 
and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/finished.png differ diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/linechart.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/linechart.png new file mode 100644 index 00000000..9871e629 Binary files /dev/null and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/linechart.png differ diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/new-alert.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/new-alert.png new file mode 100644 index 00000000..9cf734bd Binary files /dev/null and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/new-alert.png differ diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/new-server.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/new-server.png new file mode 100644 index 00000000..48ac7469 Binary files /dev/null and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/new-server.png differ diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/sneak-peek.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/sneak-peek.png new file mode 100644 index 00000000..deff8823 Binary files /dev/null and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/sneak-peek.png differ diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/supabase_connect.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/supabase_connect.png new file mode 100644 index 00000000..6ce0ea89 Binary files /dev/null and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/supabase_connect.png differ diff --git a/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/webhook.png b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/webhook.png new file mode 100644 index 00000000..68a89a37 Binary files /dev/null and b/examples/blog-articles/amazon-price-tracking/amazon-price-tracking-images/webhook.png differ diff --git a/examples/blog-articles/amazon-price-tracking/notebook.ipynb b/examples/blog-articles/amazon-price-tracking/notebook.ipynb new file mode 100644 index 00000000..cec05a05 --- /dev/null +++ b/examples/blog-articles/amazon-price-tracking/notebook.ipynb @@ -0,0 +1,1753 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# How to Build an Automated Amazon Price Tracking Tool in Python For Free\n", + "## That sends alerts to your phone and keeps price history" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What Shall We Build in This Tutorial?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is a lot to be said about the psychology of discounts. For example, buying a discounted item even though we don't need it isn't saving money at all. That's walking into the oldest trap sellers use to increase sales. However, there are legitimate cases where waiting for a price drop on items you actually need makes perfect sense.\n", + "\n", + "The challenge is that e-commerce websites run flash sales and temporary discounts constantly, but these deals often disappear as quickly as they appear. 
Missing these brief windows of opportunity can be frustrating.\n", + "\n", + "That's where automation comes in. In this guide, we'll build a Python application that monitors product prices across any e-commerce website and instantly notifies you when prices drop on items you're actually interested in. Here is a sneak peak of the app:\n", + "\n", + "![](images/sneak-peek.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The app looks pretty dull, doesn't it? Well, no worries because it is fully functional:\n", + "- It has a minimalistic UI to add or remove products from the tracker\n", + "- A simple dashboard to display price history for each product\n", + "- Controls for setting the price drop threshold in percentages\n", + "- A notification system that sends Discord alerts when a tracked item's price drops\n", + "- A scheduling system that updates the product prices on an interval you specify\n", + "- Runs for free for as long as you want\n", + "\n", + "Even though the title says \"Amazon price tracker\" (full disclosure: I was forced to write that for SEO purposes), the app will work for any e-commerce website you can imagine (except Ebay, for some reason). \n", + "\n", + "So, let's get started building this Amazon price tracker. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Toolstack We Will Use" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The app's code will be written fully in Python and its libraries:\n", + "\n", + "- [Streamlit](streamlit.io) for the UI\n", + "- [Firecrawl](firecrawl.dev) for AI-based scraping of e-commerce websites\n", + "- [SQLAlchemy](https://www.sqlalchemy.org/) for database management\n", + "\n", + "Apart from Python, we will use these platforms:\n", + "\n", + "- Discord for notifications\n", + "- GitHub for hosting the app\n", + "- GitHub Actions for running the app on a schedule\n", + "- Supabase for hosting a free Postgres database instance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building an Amazon Price Tracker App Step-by-step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since this project involves multiple components working together, we'll take a top-down approach rather than building individual pieces first. This approach makes it easier to understand how everything fits together, since we'll introduce each tool only when it's needed. The benefits of this strategy will become clear as we progress through the tutorial." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: Setting up the environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's create a dedicated environment on our machines to work on the project:\n", + "\n", + "```bash\n", + "mkdir automated-price-tracker\n", + "cd automated-price-tracker\n", + "python -m venv .venv\n", + "source .venv/bin/activate\n", + "```\n", + "\n", + "These commands create a working directory and activate a virtual environment. 
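+    "\n",
+    "If you are working on Windows, the activation command differs slightly (a small aside; the rest of the tutorial assumes a Unix-like shell):\n",
+    "\n",
+    "```bash\n",
+    ".venv\\Scripts\\activate\n",
+    "```\n",
+    "\n",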
Next, create a new script called `ui.py` for designing the user interface with Streamlit.\n", + "\n", + "```bash\n", + "touch ui.py\n", + "```\n", + "\n", + "Then, install Streamlit:\n", + "\n", + "```bash\n", + "pip install streamlit\n", + "```\n", + "\n", + "Next, create a `requirements.txt` file and add Streamlit as the first dependency:\n", + "\n", + "```bash\n", + "touch requirements.txt\n", + "echo \"streamlit\" >> requirements.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the code will be hosted on GitHub, we need to initialize Git and create a `.gitignore` file:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "git init\n", + "touch .gitignore\n", + "echo \".venv\" >> .gitignore # Add the virtual env folder\n", + "git commit -m \"Initial commit\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Add a sidebar to the UI for product input" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the final product one more time:\n", + "\n", + "![](images/sneak-peek.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It has two sections: the sidebar and the main dashboard. Since the first thing you do when launching this app is adding products, we will start building the sidebar first. Open `ui.py` and paste the following code:\n", + "\n", + "```python\n", + "import streamlit as st\n", + "\n", + "# Set up sidebar\n", + "with st.sidebar:\n", + " st.title(\"Add New Product\")\n", + " product_url = st.text_input(\"Product URL\")\n", + " add_button = st.button(\"Add Product\")\n", + "\n", + "# Main content\n", + "st.title(\"Price Tracker Dashboard\")\n", + "st.markdown(\"## Tracked Products\")\n", + "```\n", + "\n", + "The code snippet above sets up a basic Streamlit web application with two main sections. In the sidebar, it creates a form for adding new products with a text input field for the product URL and an \"Add Product\" button. The main content area contains a dashboard title and a section header for tracked products. The code uses Streamlit's `st.sidebar` context manager to create the sidebar layout and basic Streamlit components like `st.title`, `st.text_input`, and `st.button` to build the user interface elements.\n", + "\n", + "To see how this app looks like, run the following command:\n", + "\n", + "```bash\n", + "streamlit run ui.py\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's add a commit to save our progress:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add a sidebar to the basic UI\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Add a feature to check if input URL is valid\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next step, we want to add some restrictions to the input field like checking if the passed URL is valid. 
For this, create a new file called `utils.py` where we write additional utility functions for our app:\n", + "\n", + "```bash\n", + "touch utils.py\n", + "```\n", + "\n", + "Inside the script, paste following code:\n", + "\n", + "```bash\n", + "# utils.py\n", + "from urllib.parse import urlparse\n", + "import re\n", + "\n", + "\n", + "def is_valid_url(url: str) -> bool:\n", + " try:\n", + " # Parse the URL\n", + " result = urlparse(url)\n", + "\n", + " # Check if scheme and netloc are present\n", + " if not all([result.scheme, result.netloc]):\n", + " return False\n", + "\n", + " # Check if scheme is http or https\n", + " if result.scheme not in [\"http\", \"https\"]:\n", + " return False\n", + "\n", + " # Basic regex pattern for domain validation\n", + " domain_pattern = (\n", + " r\"^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\\.[a-zA-Z]{2,})+$\"\n", + " )\n", + " if not re.match(domain_pattern, result.netloc):\n", + " return False\n", + "\n", + " return True\n", + "\n", + " except Exception:\n", + " return False\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above function `is_valid_url()` validates URLs by checking several criteria:\n", + "\n", + "1. It verifies the URL has both a scheme (`http`/`https`) and domain name\n", + "2. It ensures the scheme is specifically `http` or `https`\n", + "3. It validates the domain name format using regex to check for valid characters and TLD\n", + "4. It returns True only if all checks pass, False otherwise\n", + "\n", + "Let's use this function in our `ui.py` file. Here is the modified code:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "import streamlit as st\n", + "from utils import is_valid_url\n", + "\n", + "\n", + "# Set up sidebar\n", + "with st.sidebar:\n", + " st.title(\"Add New Product\")\n", + " product_url = st.text_input(\"Product URL\")\n", + " add_button = st.button(\"Add Product\")\n", + "\n", + " if add_button:\n", + " if not product_url:\n", + " st.error(\"Please enter a product URL\")\n", + " elif not is_valid_url(product_url):\n", + " st.error(\"Please enter a valid URL\")\n", + " else:\n", + " st.success(\"Product is now being tracked!\")\n", + "\n", + "# Main content\n", + "...\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is what's new:\n", + "\n", + "1. We added URL validation using the `is_valid_url()` function from `utils.py`\n", + "2. When the button is clicked, we perform validation:\n", + " - Check if URL is empty\n", + " - Validate URL format using `is_valid_url()`\n", + "3. User feedback is provided through error/success messages:\n", + " - Error shown for empty URL\n", + " - Error shown for invalid URL format \n", + " - Success message when URL passes validation\n", + "\n", + "Rerun the Streamlit app again and see if our validation works. Then, return to your terminal to commit the changes we've made:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add a feature to check URL validity\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: Scrape the input URL for product details" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When a valid URL is entered and the add button is clicked, we need to implement product scraping functionality instead of just showing a success message. The system should:\n", + "\n", + "1. 
Immediately scrape the product URL to extract key details:\n", + " - Product name\n", + " - Current price\n", + " - Main product image\n", + " - Brand name\n", + " - Other relevant attributes\n", + "\n", + "2. Store these details in a database to enable:\n", + " - Regular price monitoring\n", + " - Historical price tracking\n", + " - Price change alerts\n", + " - Product status updates\n", + "\n", + "For the scraper, we will use [Firecrawl](firecrawl.dev), an AI-based scraping API for extracting webpage data without HTML parsing. This solution provides several advantages:\n", + "\n", + "1. No website HTML code analysis required for element selection\n", + "2. Resilient to HTML structure changes through AI-based element detection\n", + "3. Universal compatibility with product webpages due to structure-agnostic approach \n", + "4. Reliable website blocker bypass via robust API infrastructure\n", + "\n", + "First, create a new file called `scraper.py`:\n", + "\n", + "```bash\n", + "touch scraper.py\n", + "```\n", + "\n", + "Then, install these three libraries:\n", + "\n", + "```bash\n", + "pip install firecrawl-py pydantic python-dotenv\n", + "echo \"firecrawl-py\\npydantic\\npython-dotenv\\n\" >> requirements.txt # Add them to dependencies\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`firecrawl-py` is the Python SDK for Firecrawl scraping engine, `pydantic` is a data validation library that helps enforce data types and structure through Python class definitions, and `python-dotenv` is a library that loads environment variables from a `.env` file into your Python application.\n", + "\n", + "With that said, head over to the Firecrawl website and [sign up for a free account](https://www.firecrawl.dev/) (the free plan will work fine). You will be given an API key, which you should copy. \n", + "\n", + "Then, create a `.env` file in your terminal and add the API key as an environment variable:\n", + "\n", + "```bash\n", + "touch .env\n", + "echo \"FIRECRAWL_API_KEY='YOUR-API-KEY-HERE' >> .env\"\n", + "echo \".env\" >> .gitignore # Ignore .env files in Git\n", + "```\n", + "\n", + "The `.env` file is used to securely store sensitive configuration values like API keys that shouldn't be committed to version control. By storing the Firecrawl API key in `.env` and adding it to `.gitignore`, we ensure it stays private while still being accessible to our application code. This is a security best practice to avoid exposing credentials in source control.\n", + "\n", + "Now, we can start writing the `scraper.py`:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "from firecrawl import FirecrawlApp\n", + "from pydantic import BaseModel, Field\n", + "from dotenv import load_dotenv\n", + "from datetime import datetime\n", + "\n", + "load_dotenv()\n", + "\n", + "app = FirecrawlApp()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, `load_dotenv()` function reads the `.env` file you have in your working directory and loads the environment variables inside, including the Firecrawl API key. 
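+    "\n",
+    "A quick way to confirm the key was picked up, and to pass it explicitly if you prefer, is a snippet like the one below (for debugging only; it assumes the `firecrawl-py` constructor's `api_key` argument and the `FIRECRAWL_API_KEY` variable we added to `.env`):\n",
+    "\n",
+    "```python\n",
+    "import os\n",
+    "from dotenv import load_dotenv\n",
+    "from firecrawl import FirecrawlApp\n",
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "# Confirm the variable made it into the environment\n",
+    "print(bool(os.getenv(\"FIRECRAWL_API_KEY\")))  # should print True\n",
+    "\n",
+    "# Same as FirecrawlApp() with auto-detection, just explicit\n",
+    "app = FirecrawlApp(api_key=os.getenv(\"FIRECRAWL_API_KEY\"))\n",
+    "```\n",
+    "\n",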
When you create an instance of `FirecrawlApp` class, the API key is automatically detected to establish a connection between your script and the scraping engine in the form of the `app` variable.\n", + "\n", + "Now, we create a Pydantic class (usually called a model) that defines the details we want to scrape from each product:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "class Product(BaseModel):\n", + " \"\"\"Schema for creating a new product\"\"\"\n", + "\n", + " url: str = Field(description=\"The URL of the product\")\n", + " name: str = Field(description=\"The product name/title\")\n", + " price: float = Field(description=\"The current price of the product\")\n", + " currency: str = Field(description=\"Currency code (USD, EUR, etc)\")\n", + " main_image_url: str = Field(description=\"The URL of the main image of the product\")\n", + "```\n", + "\n", + "Pydantic models may be completely new to you, so let's break down the `Product` model:\n", + "\n", + "- The `url` field stores the product page URL we want to track\n", + "- The `name` field stores the product title/name that will be scraped\n", + "- The `price` field stores the current price as a float number\n", + "- The `currency` field stores the 3-letter currency code (e.g. USD, EUR)\n", + "- The `main_image_url` field stores the URL of the product's main image\n", + "\n", + "Each field is typed and has a description that documents its purpose. The `Field` class from Pydantic allows us to add metadata like descriptions to each field. These descriptions are especially important for Firecrawl since it uses them to automatically locate the relevant HTML elements containing the data we want. \n", + "\n", + "Now, let's create a function to call the engine to scrape URL's based on the schema above:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "def scrape_product(url: str):\n", + " extracted_data = app.scrape_url(\n", + " url,\n", + " params={\n", + " \"formats\": [\"extract\"],\n", + " \"extract\": {\"schema\": Product.model_json_schema()},\n", + " },\n", + " )\n", + "\n", + " # Add the scraping date to the extracted data\n", + " extracted_data[\"extract\"][\"timestamp\"] = datetime.utcnow()\n", + "\n", + " return extracted_data[\"extract\"]\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " product = \"https://www.amazon.com/gp/product/B002U21ZZK/\"\n", + "\n", + " print(scrape_product(product))\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The code above defines a function called `scrape_product` that takes a URL as input and uses it to scrape product information. Here's how it works:\n", + "\n", + "The function calls `app.scrape_url` with two parameters:\n", + "1. The product URL to scrape\n", + "2. A params dictionary that configures the scraping:\n", + " - It specifies we want to use the \"extract\" format\n", + " - It provides our `Product` Pydantic model schema as the extraction template as a JSON object\n", + "\n", + "The scraper will attempt to find and extract data that matches our Product schema fields - the URL, name, price, currency, and image URL.\n", + "\n", + "The function returns just the \"extract\" portion of the scraped data, which contains the structured product information. 
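+    "\n",
+    "If you are curious what Firecrawl actually receives as the extraction template, you can print the JSON Schema that Pydantic generates from the `Product` model (a quick inspection snippet, not part of the final script):\n",
+    "\n",
+    "```python\n",
+    "import json\n",
+    "\n",
+    "# Product is the Pydantic model defined earlier in scraper.py\n",
+    "print(json.dumps(Product.model_json_schema(), indent=2))\n",
+    "# The output is a plain JSON Schema object whose \"properties\" mirror the model\n",
+    "# fields, including the descriptions Firecrawl uses to locate the right data.\n",
+    "```\n",
+    "\n",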
`extract` returns a dictionary to which we add the date of the scraping as it will be important later on.\n", + "\n", + "Let's test the script by running it:\n", + "\n", + "```bash\n", + "python scraper.py\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should get an output like this:\n", + "\n", + "```python\n", + "{\n", + " 'url': 'https://www.amazon.com/dp/B002U21ZZK', \n", + " 'name': 'MOVA Globe Earth with Clouds 4.5\"', \n", + " 'price': 212, \n", + " 'currency': 'USD', \n", + " 'main_image_url': 'https://m.media-amazon.com/images/I/41bQ3Y58y3L._AC_.jpg', \n", + " 'timestamp': '2024-12-05 13-20'\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output shows that a [MOVA Globe](https://www.amazon.com/dp/B002U21ZZK) costs $212 USD on Amazon at the time of writing this article. You can test the script for any other website that contains the information we are looking (except Ebay):\n", + "\n", + "- Price\n", + "- Product name/title\n", + "- Main image URL\n", + "\n", + "One key advantage of using Firecrawl is that it returns data in a consistent dictionary format across all websites. Unlike HTML-based scrapers like BeautifulSoup or Scrapy which require custom code for each site and can break when website layouts change, Firecrawl uses AI to understand and extract the requested data fields regardless of the underlying HTML structure. \n", + "\n", + "Finish this step by committing the new changes to Git:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Implement a Firecrawl scraper for products\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5: Storing new products in a PostgreSQL database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we want to check product prices regularly, we need to have an online database. In this case, Postgres is the best option since it's reliable, scalable, and has great support for storing time-series data like price histories.\n", + "\n", + "There are many platforms for hosting Postgres instances but the one I find the easiest and fastest to set up is Supabase. So, please head over to [the Supabase website](https://supabase.com) and create your free account. During the sign-up process, you will be given a password, which you should save somewhere safe on your machine. \n", + "\n", + "\n", + "Then, in a few minutes, your free Postgres instance comes online. To connect to this instance, click on Home in the left sidebar and then, \"Connect\":\n", + "\n", + "![](images/supabase_connect.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will be shown your database connection string with a placeholder for the password you copied. You should paste this string in your `.env` file with your password added to the `.env` file:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "echo POSTGRES_URL=\"THE-SUPABASE-URL-STRING-WITH-YOUR-PASSWORD-ADDED\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, the easiest way to interact with this database is through SQLAlchemy. 
Let's install it:\n", + "\n", + "```bash\n", + "pip install \"sqlalchemy==2.0.35\" psycopg2-binary\n", + "echo \"psycopg2-binary\\nsqlalchemy==2.0.35\" >> requirements.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Note: [SQLAlchemy](https://sqlalchemy.org) is a Python SQL toolkit and Object-Relational Mapping (ORM) library that lets us interact with databases using Python code instead of raw SQL. For our price tracking project, it provides essential features like database connection management, schema definition through Python classes, and efficient querying capabilities. This makes it much easier to store and retrieve product information and price histories in our Postgres database.\n", + "\n", + "After the installation, create a new `database.py` file for storing database-related functions:\n", + "\n", + "```bash\n", + "touch database.py\n", + "```\n", + "\n", + "Let's populate this script:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "from sqlalchemy import create_engine, Column, String, Float, DateTime, ForeignKey\n", + "from sqlalchemy.orm import sessionmaker, relationship, declarative_base\n", + "from datetime import datetime\n", + "\n", + "Base = declarative_base()\n", + "\n", + "\n", + "class Product(Base):\n", + " __tablename__ = \"products\"\n", + "\n", + " url = Column(String, primary_key=True)\n", + " prices = relationship(\n", + " \"PriceHistory\", back_populates=\"product\", cascade=\"all, delete-orphan\"\n", + " )\n", + "\n", + "\n", + "class PriceHistory(Base):\n", + " __tablename__ = \"price_histories\"\n", + "\n", + " id = Column(String, primary_key=True)\n", + " product_url = Column(String, ForeignKey(\"products.url\"))\n", + " name = Column(String, nullable=False)\n", + " price = Column(Float, nullable=False)\n", + " currency = Column(String, nullable=False)\n", + " main_image_url = Column(String)\n", + " timestamp = Column(DateTime, nullable=False)\n", + " product = relationship(\"Product\", back_populates=\"prices\")\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The code above defines two SQLAlchemy models for our price tracking database:\n", + "\n", + "The Product model represents items we want to track, with the product URL as the primary key. It has a one-to-many relationship with price histories (which means each product in `products` can have multiple price history entry in `price_histories`).\n", + "\n", + "The `PriceHistory` model stores individual price points over time. Each record contains:\n", + "- A unique ID as primary key\n", + "- The product URL as a foreign key linking to the `Product`\n", + "- The product name\n", + "- The price value and currency\n", + "- The main product image URL\n", + "- A timestamp of when the price was recorded\n", + "\n", + "The relationship between `Product` and `PriceHistory` is bidirectional, allowing easy navigation between related records. The `cascade` setting ensures price histories are deleted when their product is deleted.\n", + "\n", + "These models provide the structure for storing and querying our price tracking data in a PostgreSQL database using SQLAlchemy's ORM capabilities." 
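+    "\n",
+    "\n",
+    "If you want to sanity-check these models before touching the hosted database, you can point SQLAlchemy at a throwaway SQLite database first (a minimal sketch; SQLite is only used here for a local test, the app itself talks to Postgres):\n",
+    "\n",
+    "```python\n",
+    "from sqlalchemy import create_engine\n",
+    "from sqlalchemy.orm import sessionmaker\n",
+    "\n",
+    "# In-memory database that disappears when the script exits\n",
+    "engine = create_engine(\"sqlite:///:memory:\")\n",
+    "Base.metadata.create_all(engine)  # creates products and price_histories\n",
+    "\n",
+    "Session = sessionmaker(bind=engine)\n",
+    "with Session() as session:\n",
+    "    session.add(Product(url=\"https://example.com/item\"))\n",
+    "    session.commit()\n",
+    "    print(session.query(Product).count())  # 1\n",
+    "```"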
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we define a `Database` class with a singe `add_product` method:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "class Database:\n", + " def __init__(self, connection_string):\n", + " self.engine = create_engine(connection_string)\n", + " Base.metadata.create_all(self.engine)\n", + " self.Session = sessionmaker(bind=self.engine)\n", + "\n", + " def add_product(self, url):\n", + " session = self.Session()\n", + " try:\n", + " # Create the product entry\n", + " product = Product(url=url)\n", + " session.merge(product) # merge will update if exists, insert if not\n", + " session.commit()\n", + " finally:\n", + " session.close()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The `Database` class above provides core functionality for managing product data in our PostgreSQL database. It takes a connection string in its constructor to establish the database connection using SQLAlchemy.\n", + "\n", + "The `add_product` method allows us to store new product URLs in the database. It uses SQLAlchemy's `merge` functionality which intelligently handles both inserting new products and updating existing ones, preventing duplicate entries.\n", + "\n", + "The method carefully manages database sessions, ensuring proper resource cleanup by using `try`/`finally` blocks. This prevents resource leaks and maintains database connection stability." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use this method inside the sidebar of our UI. Switch to `ui.py` and make the following adjustments:\n", + "\n", + "First, update the imports to load the Database class and initialize it:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "import os\n", + "import streamlit as st\n", + "\n", + "from utils import is_valid_url\n", + "from database import Database\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", + "with st.spinner(\"Loading database...\"):\n", + " db = Database(os.getenv(\"POSTGRES_URL\"))\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The code integrates the `Database` class into the Streamlit UI by importing required dependencies and establishing a database connection. The database URL is loaded securely from environment variables using `python-dotenv`. The `Database` class creates or updates the tables we specified in `database.py` after being initialized.\n", + "\n", + "The database initialization process is wrapped in a Streamlit spinner component to maintain responsiveness while establishing the connection. This provides visual feedback during the connection setup period, which typically requires a brief initialization time." 
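+    "\n",
+    "\n",
+    "For reference, the value of `POSTGRES_URL` is a standard SQLAlchemy connection string. With Supabase it looks roughly like the line below (hypothetical project ID and a placeholder password, shown only to illustrate the format):\n",
+    "\n",
+    "```bash\n",
+    "POSTGRES_URL=\"postgresql://postgres:YOUR-PASSWORD@db.abcdefghijkl.supabase.co:5432/postgres\"\n",
+    "```"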
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, in the sidebar code, we only need to add a single line of code to add the product to the database if the URL is valid:\n", + "\n", + "```python\n", + "# Set up sidebar\n", + "with st.sidebar:\n", + " st.title(\"Add New Product\")\n", + " product_url = st.text_input(\"Product URL\")\n", + " add_button = st.button(\"Add Product\")\n", + "\n", + " if add_button:\n", + " if not product_url:\n", + " st.error(\"Please enter a product URL\")\n", + " elif not is_valid_url(product_url):\n", + " st.error(\"Please enter a valid URL\")\n", + " else:\n", + " db.add_product(product_url) # This is the new line\n", + " st.success(\"Product is now being tracked!\")\n", + "```\n", + "\n", + "In the final `else` block that runs when the product URL is valid, we call the `add_product` method to store the product in the database.\n", + "\n", + "Let's commit everything:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add a Postgres database integration for tracking product URLs\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6: Storing price histories for new products" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, after the product is added to the `products` table, we want to add its details and its scraped price to the `price_histories` table. \n", + "\n", + "First, switch to `database.py` and add a new method for creating entries in the `PriceHistories` table:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "class Database:\n", + " ... # the rest of the class\n", + "\n", + " def add_price(self, product_data):\n", + " session = self.Session()\n", + " try:\n", + " price_history = PriceHistory(\n", + " id=f\"{product_data['url']}_{product_data['timestamp']}\",\n", + " product_url=product_data[\"url\"],\n", + " name=product_data[\"name\"],\n", + " price=product_data[\"price\"],\n", + " currency=product_data[\"currency\"],\n", + " main_image_url=product_data[\"main_image_url\"],\n", + " timestamp=product_data[\"timestamp\"],\n", + " )\n", + " session.add(price_history)\n", + " session.commit()\n", + " finally:\n", + " session.close()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `add_price` method takes a dictionary containing product data (which is returned by our scraper) and creates a new entry in the `PriceHistory` table. The entry's ID is generated by combining the product URL with a timestamp. The method stores essential product information like name, price, currency, image URL, and the timestamp of when the price was recorded. It uses SQLAlchemy's session management to safely commit the new price history entry to the database.\n", + "\n", + "Now, we need to add this functionality to the sidebar as well. In `ui.py`, add a new import statement that loads the `scrape_product` function from `scraper.py`:\n", + "\n", + "```python\n", + "... 
# The rest of the imports\n", + "from scraper import scrape_product\n", + "```\n", + "\n", + "Then, update the `else` block in the sidebar again:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "with st.sidebar:\n", + " st.title(\"Add New Product\")\n", + " product_url = st.text_input(\"Product URL\")\n", + " add_button = st.button(\"Add Product\")\n", + "\n", + " if add_button:\n", + " if not product_url:\n", + " st.error(\"Please enter a product URL\")\n", + " elif not is_valid_url(product_url):\n", + " st.error(\"Please enter a valid URL\")\n", + " else:\n", + " db.add_product(product_url)\n", + " with st.spinner(\"Added product to database. Scraping product data...\"):\n", + " product_data = scrape_product(product_url)\n", + " db.add_price(product_data)\n", + " st.success(\"Product is now being tracked!\")\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now when a user enters a product URL and clicks the \"Add Product\" button, several things happen:\n", + "\n", + "1. The URL is validated to ensure it's not empty and is properly formatted.\n", + "2. If valid, the URL is added to the products table via `add_product()`.\n", + "3. The product page is scraped immediately to get current price data.\n", + "4. This initial price data is stored in the price history table via `add_price()`.\n", + "5. The user sees loading spinners and success messages throughout the process.\n", + "\n", + "This gives us a complete workflow for adding new products to track, including capturing their initial price point. The UI provides clear feedback at each step and handles errors gracefully.\n", + "\n", + "Check that everything is working the way we want it and then, commit the new changes:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add a feature to track product prices after they are added\"\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7: Displaying each product's price history in the main dashboard" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the final product shown in the introduction once again:\n", + "\n", + "![](images/sneak-peek.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apart from the sidebar, the main dashboard shows each product's price history visualized with a Plotly line plot where the X axis is the timestamp while the Y axis is the prices. Each line plot is wrapped in a Streamlit component that includes buttons for removing the product from the database or visiting its source URL. \n", + "\n", + "In this step, we will implement the plotting feature and leave the two buttons for a later section. First, add a new method to the `Database` class for retrieving the price history for each product:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "class Database:\n", + " ... 
# The rest of the code\n", + "\n", + " def get_price_history(self, url):\n", + " \"\"\"Get price history for a product\"\"\"\n", + " session = self.Session()\n", + " try:\n", + " return (\n", + " session.query(PriceHistory)\n", + " .filter(PriceHistory.product_url == url)\n", + " .order_by(PriceHistory.timestamp.desc())\n", + " .all()\n", + " )\n", + " finally:\n", + " session.close()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The method queries the price histories table based on product URL, orders the rows in descending order (oldest first) and returns the results. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, add another method for retrieving all products from the `products` table:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "class Database:\n", + " ...\n", + " \n", + " def get_all_products(self):\n", + " session = self.Session()\n", + " try:\n", + " return session.query(Product).all()\n", + " finally:\n", + " session.close()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The idea is that every time our Streamlit app is opened, the main dashboard queries all existing products from the database and render their price histories with line charts in dedicated components. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To create the line charts, we need Plotly and Pandas, so install them in your environment:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "pip install pandas plotly\n", + "echo \"pandas\\nplotly\" >> requirements.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Afterward, import them at the top of `ui.py` along with other existing imports:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "import pandas as pd\n", + "import plotly.express as px\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, switch to `ui.py` and paste the following snippet of code after the Main content section:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "# Main content\n", + "st.title(\"Price Tracker Dashboard\")\n", + "st.markdown(\"## Tracked Products\")\n", + "\n", + "# Get all products\n", + "products = db.get_all_products()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, after the page title and subtitle is shown, we are retrieving all products from the database. Let's loop over them:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "# Create a card for each product\n", + "for product in products:\n", + " price_history = db.get_price_history(product.url)\n", + " if price_history:\n", + " # Create DataFrame for plotting\n", + " df = pd.DataFrame(\n", + " [\n", + " {\"timestamp\": ph.timestamp, \"price\": ph.price, \"name\": ph.name}\n", + " for ph in price_history\n", + " ]\n", + " )\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For each product, we get their price history with `db.get_price_history` and then, convert this data into a dataframe with three columns:\n", + "\n", + "- Timestamp\n", + "- Price\n", + "- Product name\n", + "\n", + "This makes plotting easier with Plotly. 
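+    "\n",
+    "For a single tracked product, `df` looks something like this (illustrative values based on the MOVA Globe example from earlier; the newest entry comes first because `get_price_history` sorts by timestamp in descending order):\n",
+    "\n",
+    "```python\n",
+    "print(df.head())\n",
+    "#             timestamp  price                          name\n",
+    "# 0 2024-12-06 13:20:00  212.0  MOVA Globe Earth with Clouds\n",
+    "# 1 2024-12-05 13:20:00  212.0  MOVA Globe Earth with Clouds\n",
+    "```\n",
+    "\n",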
Next, we create a Streamlit expander component for each product:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "# Create a card for each product\n", + "for product in products:\n", + " price_history = db.get_price_history(product.url)\n", + " if price_history:\n", + " ...\n", + " # Create a card-like container for each product\n", + " with st.expander(df[\"name\"][0], expanded=False):\n", + " st.markdown(\"---\")\n", + " col1, col2 = st.columns([1, 3])\n", + "\n", + " with col1:\n", + " if price_history[0].main_image_url:\n", + " st.image(price_history[0].main_image_url, width=200)\n", + " st.metric(\n", + " label=\"Current Price\",\n", + " value=f\"{price_history[0].price} {price_history[0].currency}\",\n", + " )\n", + "```\n", + "\n", + "The expander shows the product name as its title and contains:\n", + "\n", + "1. A divider line\n", + "2. Two columns:\n", + " - Left column: Product image (if available) and current price metric\n", + " - Right column (shown in next section)\n", + "\n", + "The price is displayed using Streamlit's metric component which shows the current price and currency.\n", + "\n", + "Here is the rest of the code:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + " ...\n", + " \n", + " with col2:\n", + " # Create price history plot\n", + " fig = px.line(\n", + " df,\n", + " x=\"timestamp\",\n", + " y=\"price\",\n", + " title=None,\n", + " )\n", + " fig.update_layout(\n", + " xaxis_title=None,\n", + " yaxis_title=\"Price ($)\",\n", + " showlegend=False,\n", + " margin=dict(l=0, r=0, t=0, b=0),\n", + " height=300,\n", + " )\n", + " fig.update_xaxes(tickformat=\"%Y-%m-%d %H:%M\", tickangle=45)\n", + " fig.update_yaxes(tickprefix=\"$\", tickformat=\".2f\")\n", + " st.plotly_chart(fig, use_container_width=True)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the right column, we create an interactive line plot using Plotly Express to visualize the price history over time. The plot shows price on the y-axis and timestamp on the x-axis. The layout is customized to remove the title, adjust axis labels and formatting, and optimize the display size. The timestamps are formatted to show date and time, with angled labels for better readability. Prices are displayed with 2 decimal places and a dollar sign prefix. The plot is rendered using Streamlit's `plotly_chart` component and automatically adjusts its width to fill the container.\n", + "\n", + "After this step, the UI must be fully functional and ready to track products. For example, here is what mine looks like after adding a couple of products:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](images/finished.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But notice how the price history chart doesn't show anything. That's because we haven't populated it by checking the product price in regular intervals. Let's do that in the next couple of steps. For now, commit the latest changes we've made:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Display product price histories for each product in the dashboard\"\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "------------\n", + "\n", + "Let's take a brief moment to summarize the steps we took so far and what's next. 
So far, we've built a Streamlit interface that allows users to add product URLs and displays their current prices and basic information. We've implemented the database schema, created functions to scrape product data, and designed a clean UI with price history visualization. The next step is to set up automated price checking to populate our history charts and enable proper price tracking over time.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 8: Adding new price entries for existing products" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we want to write a script that adds new price entries in the `price_histories` table for each product in `products` table. We call this script `check_prices.py`:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "import os\n", + "from database import Database\n", + "from dotenv import load_dotenv\n", + "from firecrawl import FirecrawlApp\n", + "from scraper import scrape_product\n", + "\n", + "load_dotenv()\n", + "\n", + "db = Database(os.getenv(\"POSTGRES_URL\"))\n", + "app = FirecrawlApp()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At the top, we are importing the functions and packages and initializing the database and a Firecrawl app. Then, we define a simple `check_prices` function:\n", + "\n", + "```python\n", + "def check_prices():\n", + " products = db.get_all_products()\n", + "\n", + " for product in products:\n", + " # Retrieve updated product data\n", + " updated_product = scrape_product(product.url)\n", + "\n", + " # Add the price to the database\n", + " db.add_price(updated_product)\n", + " print(f\"Added new price entry for {updated_product['name']}\")\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " check_prices()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the function body, we retrieve all products URLs, retrieve their new price data with `scrape_product` function from `scraper.py` and then, add a new price entry for the product with `db.add_price`. \n", + "\n", + "If you run the function once and refresh the Streamlit app, you must see a line chart appear for each product you are tracking:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](images/linechart.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's commit the changes in this step:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add a script for checking prices of existing products\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 9: Check prices regularly with GitHub actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "GitHub Actions is a continuous integration and continuous delivery (CI/CD) platform that allows you to automate various software workflows directly from your GitHub repository. In our case, it's particularly useful because we can set up automated price checks to run the `check_prices.py` script at regular intervals (e.g., daily or hourly) without manual intervention. 
This ensures we consistently track price changes and maintain an up-to-date database of historical prices for our tracked products.\n", + "\n", + "So, the first step is creating a new GitHub repository for our project and pushing existing code to it:\n", + "\n", + "```bash\n", + "git remote add origin https://github.com/yourusername/price-tracker.git\n", + "git push origin main\n", + "```\n", + "\n", + "Then, return to your terminal and create this directory structure:\n", + "\n", + "```bash\n", + "mkdir -p .github/workflows\n", + "touch .github/workflows/check_prices.yml\n", + "```\n", + "\n", + "The first command creates a new directory structure `.github/workflows` using the `-p` flag to create parent directories if they don't exist.\n", + "\n", + "The second command creates an empty YAML file called `check_prices.yml` inside the workflows directory. GitHub Actions looks for workflow files in this specific location - any YAML files in the `.github/workflows` directory will be automatically detected and processed as workflow configurations. These YAML files define when and how your automated tasks should run, what environment they need, and what commands to execute. In our case, this file will contain instructions for GitHub Actions to periodically run our price checking script. Let's write it:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```yaml\n", + "name: Price Check\n", + "\n", + "on:\n", + " schedule:\n", + " # Runs every 3 minutes\n", + " - cron: \"*/3 * * * *\"\n", + " workflow_dispatch: # Allows manual triggering\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's break down this first part of the YAML file:\n", + "\n", + "The `name: Price Check` line gives our workflow a descriptive name that will appear in the GitHub Actions interface.\n", + "\n", + "The `on:` section defines when this workflow should be triggered. We've configured two triggers:\n", + "\n", + "1. A schedule using cron syntax `*/3 * * * *` which runs the workflow every 3 minutes. The five asterisks represent minute, hour, day of month, month, and day of week respectively. The `*/3` means \"every 3rd minute\". The 3-minute interval is for debugging purposes, we will need to choose a wider interval later on to respect the free limits of GitHub actions. \n", + "\n", + "2. `workflow_dispatch` enables manual triggering of the workflow through the GitHub Actions UI, which is useful for testing or running the check on-demand.\n", + "\n", + "Now, let's add the rest:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```yaml\n", + "jobs:\n", + " check-prices:\n", + " runs-on: ubuntu-latest\n", + "\n", + " steps:\n", + " - name: Checkout code\n", + " uses: actions/checkout@v4\n", + "\n", + " - name: Set up Python\n", + " uses: actions/setup-python@v5\n", + " with:\n", + " python-version: \"3.10\"\n", + " cache: \"pip\"\n", + "\n", + " - name: Install dependencies\n", + " run: |\n", + " python -m pip install --upgrade pip\n", + " pip install -r automated_price_tracking/requirements.txt\n", + "\n", + " - name: Run price checker\n", + " env:\n", + " FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }}\n", + " POSTGRES_URL: ${{ secrets.POSTGRES_URL }}\n", + " run: python automated_price_tracking/check_prices.py\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's break down this second part of the YAML file:\n", + "\n", + "The `jobs:` section defines the actual work to be performed. 
We have one job named `check-prices` that runs on an Ubuntu virtual machine (`runs-on: ubuntu-latest`).\n", + "\n", + "Under `steps:`, we define the sequence of actions:\n", + "\n", + "1. First, we checkout our repository code using the standard `actions/checkout@v4` action\n", + "\n", + "2. Then we set up Python 3.10 using `actions/setup-python@v5`, enabling pip caching to speed up dependency installation\n", + "\n", + "3. Next, we install our Python dependencies by upgrading `pip` and installing requirements from our `requirements.txt` file. At this point, it is essential that you were keeping a complete dependency file based on the installs we made in the project. \n", + "\n", + "4. Finally, we run our price checker script, providing two environment variables:\n", + " - `FIRECRAWL_API_KEY`: For accessing the web scraping service\n", + " - `POSTGRES_URL`: For connecting to our database\n", + "\n", + "Both variables must be stored in our GitHub repository as secrets for this workflow file to run without errors. So, navigate to the repository you've created for the project and open its Settings. Under \"Secrets and variables\" > \"Actions\", click on \"New repository secret\" button to add the environment variables we have in the `.env` file one-by-one. \n", + "\n", + "Then, return to your terminal, commit the changes and push:\n", + "\n", + "```bash\n", + "git add . \n", + "git commit -m \"Add a workflow to check prices regularly\"\n", + "git push origin main\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, navigate to your GitHub repository again and click on the \"Actions\" tab:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](images/actions.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From there, you can run the workflow manually (click \"Run workflow\" and refresh the page). If it is executed successfully, you can return to the Streamlit app and refresh to see the new price added to the chart." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 10: Setting up Discord for notifications" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we know our scheduling workflow works, the first order of business is setting a wider check interval in the workflow file. Even though our first workflow run was manually, the rest happen automatically." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "on:\n", + " schedule:\n", + " # Runs every 6 hours\n", + " - cron: \"0 0,6,12,18 * * *\"\n", + " workflow_dispatch: # Allows manual triggering\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the workflow file, change the cron field to the syntax you see above, which runs the workflow at the first minute of 12am, 6am, 12pm and 6pm UTC. Then, commit and push the changes:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Set a wider check interval in the workflow file\"\n", + "git push origin main\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now comes the interesting part. Each time the workflow is run, we want to compare the current price of the product to its original price when we started tracking it. If the difference between these two prices is below a certain threshold like 5%, this means there is a discount happening for the product and we want to send a notification. 
\n", + "\n", + "The easiest way to set this up is by using Discord webhooks. So, if you haven't got one already, go to Discord.com and create a new account (optionally, download the desktop app as well). Then, log in to your account and you will find a \"Plus\" button in the bottom-left corner. Click on it to create your own Discord server:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](images/discord.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After pressing \"Plus\", choose \"Create my own\" and \"For me and my friends\". Then, give a new name to your server and you will be presented with an empty channel:\n", + "\n", + "![](images/new-server.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Right click on \"general\" and choose \"Edit channel\". Switch to the integrations tab and click on \"Create webhook\". Discord immediately generates a new webhook with a random name and you should copy its URL. \n", + "\n", + "![](images/webhook.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Webhooks are automated messages sent from apps to other apps in real-time. They work like a notification system - when something happens in one app, it automatically sends data to another app through a unique URL. In our case, we'll use Discord webhooks to automatically notify us when there's a price drop. Whenever our price tracking script detects a significant discount, it will send a message to our Discord channel through the webhook URL, ensuring we never miss a good deal.\n", + "\n", + "After copying the webhook URL, you should save it as environment variable to your `.env` file:\n", + "\n", + "```python\n", + "echo \"DISCORD_WEBHOOK_URL='THE-URL-YOU-COPIED'\" >> .env\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, create a new file called `notifications.py` and paste the following contents:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "from dotenv import load_dotenv\n", + "import os\n", + "import aiohttp\n", + "import asyncio\n", + "\n", + "load_dotenv()\n", + "\n", + "\n", + "async def send_price_alert(\n", + " product_name: str, old_price: float, new_price: float, url: str\n", + "):\n", + " \"\"\"Send a price drop alert to Discord\"\"\"\n", + " drop_percentage = ((old_price - new_price) / old_price) * 100\n", + "\n", + " message = {\n", + " \"embeds\": [\n", + " {\n", + " \"title\": \"Price Drop Alert! ๐ŸŽ‰\",\n", + " \"description\": f\"**{product_name}**\\nPrice dropped by {drop_percentage:.1f}%!\\n\"\n", + " f\"Old price: ${old_price:.2f}\\n\"\n", + " f\"New price: ${new_price:.2f}\\n\"\n", + " f\"[View Product]({url})\",\n", + " \"color\": 3066993,\n", + " }\n", + " ]\n", + " }\n", + "\n", + " try:\n", + " async with aiohttp.ClientSession() as session:\n", + " await session.post(os.getenv(\"DISCORD_WEBHOOK_URL\"), json=message)\n", + " except Exception as e:\n", + " print(f\"Error sending Discord notification: {e}\")\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `send_price_alert` function above is responsible for sending price drop notifications to Discord using webhooks. Let's break down what's new:\n", + "\n", + "1. 
The function takes 4 parameters:\n", + " - `product_name`: The name of the product that dropped in price\n", + " - `old_price`: The previous price before the drop\n", + " - `new_price`: The current lower price\n", + " - `url`: Link to view the product\n", + "\n", + "2. It calculates the percentage drop in price using the formula: `((old_price - new_price) / old_price) * 100`\n", + "\n", + "3. The notification is formatted as a Discord embed - a rich message format that includes:\n", + " - A title with a celebration emoji\n", + " - A description showing the product name, price drop percentage, old and new prices\n", + " - A link to view the product\n", + " - A green color (3066993 in decimal)\n", + "\n", + "4. The message is sent asynchronously using `aiohttp` to post to the Discord webhook URL stored in the environment variables\n", + "\n", + "5. Error handling is included to catch and print any issues that occur during the HTTP request" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "This provides a clean way to notify users through Discord whenever we detect a price drop for tracked products.\n", + "\n", + "To check the notification system works, add this main block to the end of the script:\n", + "\n", + "```python\n", + "if __name__ == \"__main__\":\n", + " asyncio.run(send_price_alert(\"Test Product\", 100, 90, \"https://www.google.com\"))\n", + "```\n", + "\n", + "`asyncio.run()` is used here because `send_price_alert` is an async function that needs to be executed in an event loop. `asyncio.run()` creates and manages this event loop, allowing the async HTTP request to be made properly. Without it, we wouldn't be able to use the `await` keyword inside `send_price_alert`.\n", + "\n", + "\n", + "To run the script, install `aiohttp`:\n", + "\n", + "```python\n", + "pip install aiohttp\n", + "echo \"aiohttp\\n\" >> requirements.txt\n", + "python notifications.py\n", + "```\n", + "\n", + "If all is well, you should get a Discord message in your server that looks like this:\n", + "\n", + "![](images/alert.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's commit the changes we have again:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Set up Discord alert system\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 11: Sending Discord alerts when prices drop" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, the only step left is adding a price comparison logic to `check_prices.py`. In other words, we want to use the `send_price_alert` function if the new scraped price is lower than the original. 
This requires a revamped `check_prices.py` script:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "import os\n", + "import asyncio\n", + "from database import Database\n", + "from dotenv import load_dotenv\n", + "from firecrawl import FirecrawlApp\n", + "from scraper import scrape_product\n", + "from notifications import send_price_alert\n", + "\n", + "load_dotenv()\n", + "\n", + "db = Database(os.getenv(\"POSTGRES_URL\"))\n", + "app = FirecrawlApp()\n", + "\n", + "# Threshold percentage for price drop alerts (e.g., 5% = 0.05)\n", + "PRICE_DROP_THRESHOLD = 0.05\n", + "\n", + "\n", + "async def check_prices():\n", + " products = db.get_all_products()\n", + " product_urls = set(product.url for product in products)\n", + "\n", + " for product_url in product_urls:\n", + " # Get the price history\n", + " price_history = db.get_price_history(product_url)\n", + " if not price_history:\n", + " continue\n", + "\n", + " # Get the earliest recorded price\n", + " earliest_price = price_history[-1].price\n", + "\n", + " # Retrieve updated product data\n", + " updated_product = scrape_product(product_url)\n", + " current_price = updated_product[\"price\"]\n", + "\n", + " # Add the price to the database\n", + " db.add_price(updated_product)\n", + " print(f\"Added new price entry for {updated_product['name']}\")\n", + "\n", + " # Check if price dropped below threshold\n", + " if earliest_price > 0: # Avoid division by zero\n", + " price_drop = (earliest_price - current_price) / earliest_price\n", + " if price_drop >= PRICE_DROP_THRESHOLD:\n", + " await send_price_alert(\n", + " updated_product[\"name\"], earliest_price, current_price, product_url\n", + " )\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " asyncio.run(check_prices())\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's examine the key changes in this enhanced version of `check_prices.py`:\n", + "\n", + "1. New imports and setup\n", + " - Added `asyncio` for `async`/`await` support\n", + " - Imported `send_price_alert` from `notifications.py`\n", + " - Defined `PRICE_DROP_THRESHOLD = 0.05` (5% threshold for alerts)\n", + "\n", + "2. Async function conversion\n", + " - Converted `check_prices()` to async function\n", + " - Gets unique product URLs using set comprehension to avoid duplicates\n", + " \n", + "3. Price history analysis\n", + " - Retrieves full price history for each product\n", + " - Gets `earliest_price` from `history[-1]` (works because we ordered by timestamp DESC)\n", + " - Skips products with no price history using `continue`\n", + " \n", + "4. Price drop detection logic\n", + " - Calculates drop percentage: `(earliest_price - current_price) / earliest_price`\n", + " - Checks if drop exceeds 5% threshold\n", + " - Sends Discord alert if threshold exceeded using `await send_price_alert()`\n", + " \n", + "5. 
Async main block\n", + " - Uses `asyncio.run()` to execute async `check_prices()` in event loop\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When I tested this new version of the script, I immediately got an alert:\n", + "\n", + "![](images/new-alert.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's commit everything and push to GitHub so that our workflow is supercharged with our notification system:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add notification system to price drops\"\n", + "git push origin main\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion and Next Steps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congratulations for making it to the end of this extremely long tutorial! We've just covered how to implement an end-to-end Python project you can proudly showcase on your portfolio. We built a complete price tracking system that scrapes product data from e-commerce websites, stores it in a Postgres database, analyzes price histories, and sends automated Discord notifications when prices drop significantly. Along the way, we learned about web scraping with Firecrawl, database management with SQLAlchemy, asynchronous programming with asyncio, building interactive UIs with Streamlit, automating with GitHub actions and integrating external webhooks.\n", + "\n", + "However, the project is far from perfect. Since we took a top-down approach to building this app, our project code is scattered across multiple files and doesn't conform to programming best practices most of the time. For this reason, I've recreated the same project in a much more sophisticated matter with production-level features. [This new version on GitHub](https://github.com/BexTuychiev/automated-price-tracking) implements proper database session management, faster operations and overall smoother user experience. \n", + "\n", + "If you decide to stick with the basic version, you can find the full project code and the notebook from the official Firecrawl GitHub repository example projects. I also recommend that you deploy your Streamlit app to Streamlit Cloud so that you have a function app accessible everywhere you go. \n", + "\n", + "Here are some more guides from our blog if you are interested:\n", + "\n", + "- [How to Run Web Scrapers on Schedule](https://www.firecrawl.dev/blog/automated-web-scraping-free-2025)\n", + "- [More about using Firecrawl's `scrape_url` function](https://www.firecrawl.dev/blog/mastering-firecrawl-scrape-endpoint)\n", + "- [Scraping entire websites with Firecrawl in a single command - the /crawl endpoint](https://www.firecrawl.dev/blog/mastering-the-crawl-endpoint-in-firecrawl)\n", + "\n", + "Thank you for reading!" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/blog-articles/amazon-price-tracking/notebook.md b/examples/blog-articles/amazon-price-tracking/notebook.md new file mode 100644 index 00000000..59ca47a5 --- /dev/null +++ b/examples/blog-articles/amazon-price-tracking/notebook.md @@ -0,0 +1,1237 @@ +--- +title: How to Build an Automated Amazon Price Tracking Tool in Python For Free +description: Learn how to build a free automated price tracking tool in Python that monitors Amazon and other e-commerce sites, sends Discord alerts for price drops, and maintains price history using Firecrawl, Streamlit, and GitHub Actions. +slug: amazon-price-tracker-in-python-for-free +date: Dec 6, 2024 +author: bex_tuychiev +image: /images/blog/amazon-price-tracking/amazon-price-tracker-in-python-for-free.jpg +categories: [tutorials] +keywords: [amazon price tracker, amazon price history tracker, amazon price tracker app, amazon web scraper, amazon web scraper python, ecommerce web scraping, web scraping python] +--- + +## That sends alerts to your phone and keeps price history + +## What Shall We Build in This Tutorial? + +There is a lot to be said about the psychology of discounts. For example, buying a discounted item we don't need isn't saving money at all - it's falling for one of the oldest sales tactics. However, there are legitimate cases where waiting for a price drop on items you actually need makes perfect sense. + +The challenge is that e-commerce websites run flash sales and temporary discounts constantly, but these deals often disappear as quickly as they appear. Missing these brief windows of opportunity can be frustrating. + +That's where automation comes in. In this guide, we'll build a Python application that monitors product prices across any e-commerce website and instantly notifies you when prices drop on items you're actually interested in. Here is a sneak peek of the app: + +![Screenshot of a minimalist price tracking application showing product listings, price history charts, and notification controls for monitoring e-commerce deals using Firecrawl](amazon-price-tracking-images/sneak-peek.png) + +The app has a simple appearance but provides complete functionality: + +- It has a minimalistic UI to add or remove products from the tracker +- A simple dashboard to display price history for each product +- Controls for setting the price drop threshold in percentages +- A notification system that sends Discord alerts when a tracked item's price drops +- A scheduling system that updates the product prices on an interval you specify +- Runs for free for as long as you want + +Even though the title says "Amazon price tracker" (full disclosure: I was forced to write that for SEO purposes), the app will work for any e-commerce website you can imagine (except Ebay, for some reason). + +So, let's get started building this Amazon price tracker. 
+ +## The Toolstack We Will Use + +The app will be built using Python and these libraries:: + +- [Streamlit](streamlit.io) for the UI +- [Firecrawl](firecrawl.dev) for AI-based scraping of e-commerce websites +- [SQLAlchemy](https://www.sqlalchemy.org/) for database management + +In addition to Python, we will use these platforms: + +- Discord for notifications +- GitHub for hosting the app +- GitHub Actions for running the app on a schedule +- Supabase for hosting a free Postgres database instance + +## Building an Amazon Price Tracker App Step-by-step + +Since this project involves multiple components working together, we'll take a top-down approach rather than building individual pieces first. This approach makes it easier to understand how everything fits together, since we'll introduce each tool only when it's needed. The benefits of this strategy will become clear as we progress through the tutorial. + +### Step 1: Setting up the environment + +First, let's create a dedicated environment on our machines to work on the project: + +```bash +mkdir automated-price-tracker +cd automated-price-tracker +python -m venv .venv +source .venv/bin/activate +``` + +These commands create a working directory and activate a virtual environment. Next, create a new script called `ui.py` for designing the user interface with Streamlit. + +```bash +touch ui.py +``` + +Then, install Streamlit: + +```bash +pip install streamlit +``` + +Next, create a `requirements.txt` file and add Streamlit as the first dependency: + +```bash +touch requirements.txt +echo "streamlit\n" >> requirements.txt +``` + +Since the code will be hosted on GitHub, we need to initialize Git and create a `.gitignore` file: + +```bash +git init +touch .gitignore +echo ".venv" >> .gitignore # Add the virtual env folder +git commit -m "Initial commit" +``` + +### Step 2: Add a sidebar to the UI for product input + +Let's take a look at the final product one more time: + +![A screenshot of an Amazon price tracker web application showing a sidebar for adding product URLs and a main dashboard displaying tracked products with price history charts. Created with streamlit and firecrawl](amazon-price-tracking-images/sneak-peek.png) + +It has two sections: the sidebar and the main dashboard. Since the first thing you do when launching this app is adding products, we will start building the sidebar first. Open `ui.py` and paste the following code: + +```python +import streamlit as st + +# Set up sidebar +with st.sidebar: + st.title("Add New Product") + product_url = st.text_input("Product URL") + add_button = st.button("Add Product") + +# Main content +st.title("Price Tracker Dashboard") +st.markdown("## Tracked Products") +``` + +The code snippet above sets up a basic Streamlit web application with two main sections. In the sidebar, it creates a form for adding new products with a text input field for the product URL and an "Add Product" button. The main content area contains a dashboard title and a section header for tracked products. The code uses Streamlit's `st.sidebar` context manager to create the sidebar layout and basic Streamlit components like `st.title`, `st.text_input`, and `st.button` to build the user interface elements. + +To see how this app looks like, run the following command: + +```bash +streamlit run ui.py +``` + +Now, let's add a commit to save our progress: + +```bash +git add . 
+git commit -m "Add a sidebar to the basic UI" +``` + +### Step 3: Add a feature to check if input URL is valid + +In the next step, we want to add some restrictions to the input field like checking if the passed URL is valid. For this, create a new file called `utils.py` where we write additional utility functions for our app: + +```bash +touch utils.py +``` + +Inside the script, paste following code: + +```bash +# utils.py +from urllib.parse import urlparse +import re + + +def is_valid_url(url: str) -> bool: + try: + # Parse the URL + result = urlparse(url) + + # Check if scheme and netloc are present + if not all([result.scheme, result.netloc]): + return False + + # Check if scheme is http or https + if result.scheme not in ["http", "https"]: + return False + + # Basic regex pattern for domain validation + domain_pattern = ( + r"^[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z]{2,})+$" + ) + if not re.match(domain_pattern, result.netloc): + return False + + return True + + except Exception: + return False +``` + +The above function `is_valid_url()` validates URLs by checking several criteria: + +1. It verifies the URL has both a scheme (`http`/`https`) and domain name +2. It ensures the scheme is specifically `http` or `https` +3. It validates the domain name format using regex to check for valid characters and TLD +4. It returns True only if all checks pass, False otherwise + +Let's use this function in our `ui.py` file. Here is the modified code: + +```python +import streamlit as st +from utils import is_valid_url + + +# Set up sidebar +with st.sidebar: + st.title("Add New Product") + product_url = st.text_input("Product URL") + add_button = st.button("Add Product") + + if add_button: + if not product_url: + st.error("Please enter a product URL") + elif not is_valid_url(product_url): + st.error("Please enter a valid URL") + else: + st.success("Product is now being tracked!") + +# Main content +... +``` + +Here is what's new: + +1. We added URL validation using the `is_valid_url()` function from `utils.py` +2. When the button is clicked, we perform validation: + - Check if URL is empty + - Validate URL format using `is_valid_url()` +3. User feedback is provided through error/success messages: + - Error shown for empty URL + - Error shown for invalid URL format + - Success message when URL passes validation + +Rerun the Streamlit app again and see if our validation works. Then, return to your terminal to commit the changes we've made: + +```bash +git add . +git commit -m "Add a feature to check URL validity" +``` + +### Step 4: Scrape the input URL for product details + +When a valid URL is entered and the add button is clicked, we need to implement product scraping functionality instead of just showing a success message. The system should: + +1. Immediately scrape the product URL to extract key details: + - Product name + - Current price + - Main product image + - Brand name + - Other relevant attributes + +2. Store these details in a database to enable: + - Regular price monitoring + - Historical price tracking + - Price change alerts + - Product status updates + +For the scraper, we will use [Firecrawl](firecrawl.dev), an AI-based scraping API for extracting webpage data without HTML parsing. This solution provides several advantages: + +1. No website HTML code analysis required for element selection +2. Resilient to HTML structure changes through AI-based element detection +3. Universal compatibility with product webpages due to structure-agnostic approach +4. 
Reliable website blocker bypass via robust API infrastructure

First, create a new file called `scraper.py`:

```bash
touch scraper.py
```

Then, install these three libraries:

```bash
pip install firecrawl-py pydantic python-dotenv
echo "firecrawl-py\npydantic\npython-dotenv\n" >> requirements.txt # Add them to dependencies
```

`firecrawl-py` is the Python SDK for the Firecrawl scraping engine, `pydantic` is a data validation library that helps enforce data types and structure through Python class definitions, and `python-dotenv` is a library that loads environment variables from a `.env` file into your Python application.

With that said, head over to the Firecrawl website and [sign up for a free account](https://www.firecrawl.dev/) (the free plan will work fine). You will be given an API key, which you should copy.

Then, create a `.env` file in your terminal and add the API key as an environment variable:

```bash
touch .env
echo "FIRECRAWL_API_KEY='YOUR-API-KEY-HERE'" >> .env
echo ".env" >> .gitignore # Ignore .env files in Git
```

The `.env` file is used to securely store sensitive configuration values like API keys that shouldn't be committed to version control. By storing the Firecrawl API key in `.env` and adding it to `.gitignore`, we ensure it stays private while still being accessible to our application code. This is a security best practice to avoid exposing credentials in source control.

Now, we can start writing `scraper.py`:

```python
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from datetime import datetime

load_dotenv()

app = FirecrawlApp()
```

Here, the `load_dotenv()` function reads the `.env` file in your working directory and loads the environment variables inside, including the Firecrawl API key. When you create an instance of the `FirecrawlApp` class, the API key is automatically detected to establish a connection between your script and the scraping engine in the form of the `app` variable.

Now, we create a Pydantic class (usually called a model) that defines the details we want to scrape from each product:

```python
class Product(BaseModel):
    """Schema for creating a new product"""

    url: str = Field(description="The URL of the product")
    name: str = Field(description="The product name/title")
    price: float = Field(description="The current price of the product")
    currency: str = Field(description="Currency code (USD, EUR, etc)")
    main_image_url: str = Field(description="The URL of the main image of the product")
```

Pydantic models may be completely new to you, so let's break down the `Product` model:

- The `url` field stores the product page URL we want to track
- The `name` field stores the product title/name that will be scraped
- The `price` field stores the current price as a float number
- The `currency` field stores the 3-letter currency code (e.g. USD, EUR)
- The `main_image_url` field stores the URL of the product's main image

Each field is typed and has a description that documents its purpose. The `Field` class from Pydantic allows us to add metadata like descriptions to each field. These descriptions are especially important for Firecrawl since it uses them to automatically locate the relevant HTML elements containing the data we want.
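If you want to see exactly what the extractor is given to work with, you can print the JSON Schema that Pydantic generates from the model; the `description` strings are the part Firecrawl keys on, so it pays to keep them specific. The `brand` field below is only an illustration of how you could extend the schema with extra attributes; the rest of the tutorial does not rely on it:

```python
# Optional: inspect the JSON Schema sent to Firecrawl. The "description"
# strings are what guide the AI extraction. The `brand` field is purely
# illustrative and is not used elsewhere in this tutorial.
from typing import Optional

from pydantic import BaseModel, Field


class ProductWithBrand(BaseModel):
    url: str = Field(description="The URL of the product")
    name: str = Field(description="The product name/title")
    price: float = Field(description="The current price of the product")
    currency: str = Field(description="Currency code (USD, EUR, etc)")
    main_image_url: str = Field(description="The URL of the main image of the product")
    brand: Optional[str] = Field(None, description="The brand or manufacturer, if shown")


print(ProductWithBrand.model_json_schema())
```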
+ +Now, let's create a function to call the engine to scrape URL's based on the schema above: + +```python +def scrape_product(url: str): + extracted_data = app.scrape_url( + url, + params={ + "formats": ["extract"], + "extract": {"schema": Product.model_json_schema()}, + }, + ) + + # Add the scraping date to the extracted data + extracted_data["extract"]["timestamp"] = datetime.utcnow() + + return extracted_data["extract"] + + +if __name__ == "__main__": + product = "https://www.amazon.com/gp/product/B002U21ZZK/" + + print(scrape_product(product)) +``` + +The code above defines a function called `scrape_product` that takes a URL as input and uses it to scrape product information. Here's how it works: + +The function calls `app.scrape_url` with two parameters: + +1. The product URL to scrape +2. A params dictionary that configures the scraping: + - It specifies we want to use the "extract" format + - It provides our `Product` Pydantic model schema as the extraction template as a JSON object + +The scraper will attempt to find and extract data that matches our Product schema fields - the URL, name, price, currency, and image URL. + +The function returns just the "extract" portion of the scraped data, which contains the structured product information. `extract` returns a dictionary to which we add the date of the scraping as it will be important later on. + +Let's test the script by running it: + +```bash +python scraper.py +``` + +You should get an output like this: + +```python +{ + 'url': 'https://www.amazon.com/dp/B002U21ZZK', + 'name': 'MOVA Globe Earth with Clouds 4.5"', + 'price': 212, + 'currency': 'USD', + 'main_image_url': 'https://m.media-amazon.com/images/I/41bQ3Y58y3L._AC_.jpg', + 'timestamp': '2024-12-05 13-20' +} +``` + +The output shows that a [MOVA Globe](https://www.amazon.com/dp/B002U21ZZK) costs $212 USD on Amazon at the time of writing this article. You can test the script for any other website that contains the information we are looking (except Ebay): + +- Price +- Product name/title +- Main image URL + +One key advantage of using Firecrawl is that it returns data in a consistent dictionary format across all websites. Unlike HTML-based scrapers like BeautifulSoup or Scrapy which require custom code for each site and can break when website layouts change, Firecrawl uses AI to understand and extract the requested data fields regardless of the underlying HTML structure. + +Finish this step by committing the new changes to Git: + +```bash +git add . +git commit -m "Implement a Firecrawl scraper for products" +``` + +### Step 5: Storing new products in a PostgreSQL database + +If we want to check product prices regularly, we need to have an online database. In this case, Postgres is the best option since it's reliable, scalable, and has great support for storing time-series data like price histories. + +There are many platforms for hosting Postgres instances but the one I find the easiest and fastest to set up is Supabase. So, please head over to [the Supabase website](https://supabase.com) and create your free account. During the sign-up process, you will be given a password, which you should save somewhere safe on your machine. + +Then, in a few minutes, your free Postgres instance comes online. 
To connect to this instance, click on Home in the left sidebar and then "Connect":

![Screenshot of Supabase dashboard showing database connection settings and credentials for connecting to a PostgreSQL database instance](amazon-price-tracking-images/supabase_connect.png)

You will be shown your database connection string with a placeholder for the password you copied. Paste this string, with your password filled in, into your `.env` file:

```bash
echo POSTGRES_URL="THE-SUPABASE-URL-STRING-WITH-YOUR-PASSWORD-ADDED" >> .env
```

Now, the easiest way to interact with this database is through SQLAlchemy. Let's install it:

```bash
pip install "sqlalchemy==2.0.35" psycopg2-binary
echo "psycopg2-binary\nsqlalchemy==2.0.35\n" >> requirements.txt
```

> Note: [SQLAlchemy](https://sqlalchemy.org) is a Python SQL toolkit and Object-Relational Mapping (ORM) library that lets us interact with databases using Python code instead of raw SQL. For our price tracking project, it provides essential features like database connection management, schema definition through Python classes, and efficient querying capabilities. This makes it much easier to store and retrieve product information and price histories in our Postgres database.

After the installation, create a new `database.py` file for storing database-related functions:

```bash
touch database.py
```

Let's populate this script:

```python
from sqlalchemy import create_engine, Column, String, Float, DateTime, ForeignKey
from sqlalchemy.orm import sessionmaker, relationship, declarative_base
from datetime import datetime

Base = declarative_base()


class Product(Base):
    __tablename__ = "products"

    url = Column(String, primary_key=True)
    prices = relationship(
        "PriceHistory", back_populates="product", cascade="all, delete-orphan"
    )


class PriceHistory(Base):
    __tablename__ = "price_histories"

    id = Column(String, primary_key=True)
    product_url = Column(String, ForeignKey("products.url"))
    name = Column(String, nullable=False)
    price = Column(Float, nullable=False)
    currency = Column(String, nullable=False)
    main_image_url = Column(String)
    timestamp = Column(DateTime, nullable=False)
    product = relationship("Product", back_populates="prices")
```

The code above defines two SQLAlchemy models for our price tracking database:

The `Product` model acts as a registry of all items we want to track. It's kept simple with just the URL as we don't want to duplicate data that changes over time.

The `PriceHistory` model stores the actual price data points and product details at specific moments in time. This separation allows us to:

- Track how product details (name, price, image) change over time
- Maintain a clean historical record for each product
- Efficiently query price trends without loading unnecessary data

Each record in `PriceHistory` contains:

- A unique ID as primary key
- The product URL as a foreign key linking to the `Product`
- The product name
- The price value and currency
- The main product image URL
- A timestamp of when the price was recorded

The relationship between `Product` and `PriceHistory` is bidirectional, allowing easy navigation between related records. The `cascade` setting ensures price histories are deleted when their product is deleted.

These models provide the structure for storing and querying our price tracking data in a PostgreSQL database using SQLAlchemy's ORM capabilities.
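Before wiring these models up to Supabase, you can optionally verify that they are declared correctly by creating the tables against a throwaway SQLite database. This is just a quick local sanity check (SQLite is an assumption here, not part of the project); the app itself always connects through `POSTGRES_URL`:

```python
# Quick local check that the models create cleanly (optional).
# Uses an in-memory SQLite database instead of the real Postgres instance.
from sqlalchemy import create_engine, inspect

from database import Base  # the declarative Base defined above

engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
print(inspect(engine).get_table_names())  # should list 'products' and 'price_histories'
```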
+ +Now, we define a `Database` class with a singe `add_product` method: + +```python +class Database: + def __init__(self, connection_string): + self.engine = create_engine(connection_string) + Base.metadata.create_all(self.engine) + self.Session = sessionmaker(bind=self.engine) + + def add_product(self, url): + session = self.Session() + try: + # Create the product entry + product = Product(url=url) + session.merge(product) # merge will update if exists, insert if not + session.commit() + finally: + session.close() +``` + +The `Database` class above provides core functionality for managing product data in our PostgreSQL database. It takes a connection string in its constructor to establish the database connection using SQLAlchemy. + +The `add_product` method allows us to store new product URLs in the database. It uses SQLAlchemy's `merge` functionality which intelligently handles both inserting new products and updating existing ones, preventing duplicate entries. + +The method carefully manages database sessions, ensuring proper resource cleanup by using `try`/`finally` blocks. This prevents resource leaks and maintains database connection stability. + +Let's use this method inside the sidebar of our UI. Switch to `ui.py` and make the following adjustments: + +First, update the imports to load the Database class and initialize it: + +```python +import os +import streamlit as st + +from utils import is_valid_url +from database import Database +from dotenv import load_dotenv + +load_dotenv() + +with st.spinner("Loading database..."): + db = Database(os.getenv("POSTGRES_URL")) +``` + +The code integrates the `Database` class into the Streamlit UI by importing required dependencies and establishing a database connection. The database URL is loaded securely from environment variables using `python-dotenv`. The `Database` class creates or updates the tables we specified in `database.py` after being initialized. + +The database initialization process is wrapped in a Streamlit spinner component to maintain responsiveness while establishing the connection. This provides visual feedback during the connection setup period, which typically requires a brief initialization time. + +Then, in the sidebar code, we only need to add a single line of code to add the product to the database if the URL is valid: + +```python +# Set up sidebar +with st.sidebar: + st.title("Add New Product") + product_url = st.text_input("Product URL") + add_button = st.button("Add Product") + + if add_button: + if not product_url: + st.error("Please enter a product URL") + elif not is_valid_url(product_url): + st.error("Please enter a valid URL") + else: + db.add_product(product_url) # This is the new line + st.success("Product is now being tracked!") +``` + +In the final `else` block that runs when the product URL is valid, we call the `add_product` method to store the product in the database. + +Let's commit everything: + +```bash +git add . +git commit -m "Add a Postgres database integration for tracking product URLs" +``` + +### Step 6: Storing price histories for new products + +Now, after the product is added to the `products` table, we want to add its details and its scraped price to the `price_histories` table. + +First, switch to `database.py` and add a new method for creating entries in the `PriceHistories` table: + +```python +class Database: + ... 
# the rest of the class + + def add_price(self, product_data): + session = self.Session() + try: + price_history = PriceHistory( + id=f"{product_data['url']}_{product_data['timestamp']}", + product_url=product_data["url"], + name=product_data["name"], + price=product_data["price"], + currency=product_data["currency"], + main_image_url=product_data["main_image_url"], + timestamp=product_data["timestamp"], + ) + session.add(price_history) + session.commit() + finally: + session.close() +``` + +The `add_price` method takes a dictionary containing product data (which is returned by our scraper) and creates a new entry in the `PriceHistory` table. The entry's ID is generated by combining the product URL with a timestamp. The method stores essential product information like name, price, currency, image URL, and the timestamp of when the price was recorded. It uses SQLAlchemy's session management to safely commit the new price history entry to the database. + +Now, we need to add this functionality to the sidebar as well. In `ui.py`, add a new import statement that loads the `scrape_product` function from `scraper.py`: + +```python +... # The rest of the imports +from scraper import scrape_product +``` + +Then, update the `else` block in the sidebar again: + +```python +with st.sidebar: + st.title("Add New Product") + product_url = st.text_input("Product URL") + add_button = st.button("Add Product") + + if add_button: + if not product_url: + st.error("Please enter a product URL") + elif not is_valid_url(product_url): + st.error("Please enter a valid URL") + else: + db.add_product(product_url) + with st.spinner("Added product to database. Scraping product data..."): + product_data = scrape_product(product_url) + db.add_price(product_data) + st.success("Product is now being tracked!") +``` + +Now when a user enters a product URL and clicks the "Add Product" button, several things happen: + +1. The URL is validated to ensure it's not empty and is properly formatted. +2. If valid, the URL is added to the products table via `add_product()`. +3. The product page is scraped immediately to get current price data. +4. This initial price data is stored in the price history table via `add_price()`. +5. The user sees loading spinners and success messages throughout the process. + +This gives us a complete workflow for adding new products to track, including capturing their initial price point. The UI provides clear feedback at each step and handles errors gracefully. + +Check that everything is working the way we want it and then, commit the new changes: + +```bash +git add . +git commit -m "Add a feature to track product prices after they are added" +``` + +### Step 7: Displaying each product's price history in the main dashboard + +Let's take a look at the final product shown in the introduction once again: + +![Screenshot of a minimalist price tracking dashboard showing product price history charts, add/remove product controls, and notification settings for monitoring e-commerce deals and price drops](amazon-price-tracking-images/sneak-peek.png) + +Apart from the sidebar, the main dashboard shows each product's price history visualized with a Plotly line plot where the X axis is the timestamp while the Y axis is the prices. Each line plot is wrapped in a Streamlit component that includes buttons for removing the product from the database or visiting its source URL. + +In this step, we will implement the plotting feature and leave the two buttons for a later section. 
First, add a new method to the `Database` class for retrieving the price history for each product: + +```python +class Database: + ... # The rest of the code + + def get_price_history(self, url): + """Get price history for a product""" + session = self.Session() + try: + return ( + session.query(PriceHistory) + .filter(PriceHistory.product_url == url) + .order_by(PriceHistory.timestamp.desc()) + .all() + ) + finally: + session.close() +``` + +The method queries the price histories table based on product URL, orders the rows in descending order (oldest first) and returns the results. + +Then, add another method for retrieving all products from the `products` table: + +```python +class Database: + ... + + def get_all_products(self): + session = self.Session() + try: + return session.query(Product).all() + finally: + session.close() +``` + +The idea is that every time our Streamlit app is opened, the main dashboard queries all existing products from the database and render their price histories with line charts in dedicated components. + +To create the line charts, we need Plotly and Pandas, so install them in your environment: + +```bash +pip install pandas plotly +echo "pandas\nplotly\n" >> requirements.txt +``` + +Afterward, import them at the top of `ui.py` along with other existing imports: + +```python +import pandas as pd +import plotly.express as px +``` + +Then, switch to `ui.py` and paste the following snippet of code after the Main content section: + +```python +# Main content +st.title("Price Tracker Dashboard") +st.markdown("## Tracked Products") + +# Get all products +products = db.get_all_products() +``` + +Here, after the page title and subtitle is shown, we are retrieving all products from the database. Let's loop over them: + +```python +# Create a card for each product +for product in products: + price_history = db.get_price_history(product.url) + if price_history: + # Create DataFrame for plotting + df = pd.DataFrame( + [ + {"timestamp": ph.timestamp, "price": ph.price, "name": ph.name} + for ph in price_history + ] + ) +``` + +For each product, we get their price history with `db.get_price_history` and then, convert this data into a dataframe with three columns: + +- Timestamp +- Price +- Product name + +This makes plotting easier with Plotly. Next, we create a Streamlit expander component for each product: + +```python +# Create a card for each product +for product in products: + price_history = db.get_price_history(product.url) + if price_history: + ... + # Create a card-like container for each product + with st.expander(df["name"][0], expanded=False): + st.markdown("---") + col1, col2 = st.columns([1, 3]) + + with col1: + if price_history[0].main_image_url: + st.image(price_history[0].main_image_url, width=200) + st.metric( + label="Current Price", + value=f"{price_history[0].price} {price_history[0].currency}", + ) +``` + +The expander shows the product name as its title and contains: + +1. A divider line +2. Two columns: + - Left column: Product image (if available) and current price metric + - Right column (shown in next section) + +The price is displayed using Streamlit's metric component which shows the current price and currency. + +Here is the rest of the code: + +```python + ... 
+ + with col2: + # Create price history plot + fig = px.line( + df, + x="timestamp", + y="price", + title=None, + ) + fig.update_layout( + xaxis_title=None, + yaxis_title="Price", + showlegend=False, + margin=dict(l=0, r=0, t=0, b=0), + height=300, + ) + fig.update_xaxes(tickformat="%Y-%m-%d %H:%M", tickangle=45) + fig.update_yaxes(tickprefix=f"{price_history[0].currency} ", tickformat=".2f") + st.plotly_chart(fig, use_container_width=True) +``` + +In the right column, we create an interactive line plot using Plotly Express to visualize the price history over time. The plot shows price on the y-axis and timestamp on the x-axis. The layout is customized to remove the title, adjust axis labels and formatting, and optimize the display size. The timestamps are formatted to show date and time, with angled labels for better readability. Prices are displayed with 2 decimal places and a dollar sign prefix. The plot is rendered using Streamlit's `plotly_chart` component and automatically adjusts its width to fill the container. + +After this step, the UI must be fully functional and ready to track products. For example, here is what mine looks like after adding a couple of products: + +![Screenshot of a price tracking dashboard showing multiple product listings with price history charts, product images, and current prices for Amazon items](amazon-price-tracking-images/finished.png) + +But notice how the price history chart doesn't show anything. That's because we haven't populated it by checking the product price in regular intervals. Let's do that in the next couple of steps. For now, commit the latest changes we've made: + +```bash +git add . +git commit -m "Display product price histories for each product in the dashboard" +``` + +------------ + +Let's take a brief moment to summarize the steps we took so far and what's next. So far, we've built a Streamlit interface that allows users to add product URLs and displays their current prices and basic information. We've implemented the database schema, created functions to scrape product data, and designed a clean UI with price history visualization. The next step is to set up automated price checking to populate our history charts and enable proper price tracking over time. + +### Step 8: Adding new price entries for existing products + +Now, we want to write a script that adds new price entries in the `price_histories` table for each product in `products` table. We call this script `check_prices.py`: + +```python +import os +from database import Database +from dotenv import load_dotenv +from firecrawl import FirecrawlApp +from scraper import scrape_product + +load_dotenv() + +db = Database(os.getenv("POSTGRES_URL")) +app = FirecrawlApp() +``` + +At the top, we are importing the functions and packages and initializing the database and a Firecrawl app. Then, we define a simple `check_prices` function: + +```python +def check_prices(): + products = db.get_all_products() + + for product in products: + try: + updated_product = scrape_product(product.url) + db.add_price(updated_product) + print(f"Added new price entry for {updated_product['name']}") + except Exception as e: + print(f"Error processing {product.url}: {e}") + + +if __name__ == "__main__": + check_prices() +``` + +In the function body, we retrieve all products URLs, retrieve their new price data with `scrape_product` function from `scraper.py` and then, add a new price entry for the product with `db.add_price`. 
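One thing to keep in mind is that scraping can fail intermittently (network timeouts, rate limits, a page that temporarily doesn't load). The `try`/`except` above already keeps one bad product from stopping the whole run; if you want to be a bit more resilient, you could also retry before giving up. The helper below is only a sketch of that idea, not part of the tutorial's code:

```python
# Optional helper: retry a flaky scrape a few times before giving up.
# This is a sketch, not part of check_prices.py as written in this tutorial.
import time

from scraper import scrape_product


def scrape_with_retries(url: str, retries: int = 3, delay: float = 5.0) -> dict:
    """Call scrape_product, retrying on failure with a short pause in between."""
    for attempt in range(1, retries + 1):
        try:
            return scrape_product(url)
        except Exception as e:
            print(f"Attempt {attempt}/{retries} failed for {url}: {e}")
            if attempt == retries:
                raise
            time.sleep(delay)
```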
+ +If you run the function once and refresh the Streamlit app, you must see a line chart appear for each product you are tracking: + +![Screenshot of a price tracking dashboard showing a line chart visualization of product price history over time, with price on the y-axis and dates on the x-axis](amazon-price-tracking-images/linechart.png) + +Let's commit the changes in this step: + +```bash +git add . +git commit -m "Add a script for checking prices of existing products" +``` + +### Step 9: Check prices regularly with GitHub actions + +GitHub Actions is a continuous integration and continuous delivery (CI/CD) platform that allows you to automate various software workflows directly from your GitHub repository. In our case, it's particularly useful because we can set up automated price checks to run the `check_prices.py` script at regular intervals (e.g., daily or hourly) without manual intervention. This ensures we consistently track price changes and maintain an up-to-date database of historical prices for our tracked products. + +So, the first step is creating a new GitHub repository for our project and pushing existing code to it: + +```bash +git remote add origin https://github.com/yourusername/price-tracker.git +git push origin main +``` + +Then, return to your terminal and create this directory structure: + +```bash +mkdir -p .github/workflows +touch .github/workflows/check_prices.yml +``` + +The first command creates a new directory structure `.github/workflows` using the `-p` flag to create parent directories if they don't exist. + +The second command creates an empty YAML file called `check_prices.yml` inside the workflows directory. GitHub Actions looks for workflow files in this specific location - any YAML files in the `.github/workflows` directory will be automatically detected and processed as workflow configurations. These YAML files define when and how your automated tasks should run, what environment they need, and what commands to execute. In our case, this file will contain instructions for GitHub Actions to periodically run our price checking script. Let's write it: + +```yaml +name: Price Check + +on: + schedule: + # Runs every 3 minutes + - cron: "*/3 * * * *" + workflow_dispatch: # Allows manual triggering +``` + +Let's break down this first part of the YAML file: + +The `name: Price Check` line gives our workflow a descriptive name that will appear in the GitHub Actions interface. + +The `on:` section defines when this workflow should be triggered. We've configured two triggers: + +1. A schedule using cron syntax `*/3 * * * *` which runs the workflow every 3 minutes. The five asterisks represent minute, hour, day of month, month, and day of week respectively. The `*/3` means "every 3rd minute". The 3-minute interval is for debugging purposes, we will need to choose a wider interval later on to respect the free limits of GitHub actions. + +2. `workflow_dispatch` enables manual triggering of the workflow through the GitHub Actions UI, which is useful for testing or running the check on-demand. 
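If you're ever unsure when a cron expression will actually fire, you can preview the upcoming run times locally. The snippet below uses the third-party `croniter` package (`pip install croniter`); it's purely a convenience for sanity-checking schedules and isn't part of the project:

```python
# Optional: preview the next few run times of a cron expression.
# Requires the third-party croniter package: pip install croniter
from datetime import datetime, timezone

from croniter import croniter

schedule = croniter("*/3 * * * *", datetime.now(timezone.utc))
for _ in range(3):
    print(schedule.get_next(datetime))
```

Keep in mind that GitHub Actions evaluates these schedules in UTC, which is why the preview above uses a UTC timestamp.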
+
+Now, let's add the rest:
+
+```yaml
+jobs:
+  check-prices:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+          cache: "pip"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run price checker
+        env:
+          FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }}
+          POSTGRES_URL: ${{ secrets.POSTGRES_URL }}
+        run: python check_prices.py
+```
+
+Let's break down this second part of the YAML file:
+
+The `jobs:` section defines the actual work to be performed. We have one job named `check-prices` that runs on an Ubuntu virtual machine (`runs-on: ubuntu-latest`).
+
+Under `steps:`, we define the sequence of actions:
+
+1. First, we check out our repository code using the standard `actions/checkout@v4` action.
+
+2. Then we set up Python 3.10 using `actions/setup-python@v5`, enabling pip caching to speed up dependency installation.
+
+3. Next, we install our Python dependencies by upgrading `pip` and installing requirements from our `requirements.txt` file. At this point, it is essential that you have kept `requirements.txt` up to date with every package we installed during the project.
+
+4. Finally, we run our price checker script, providing two environment variables:
+   - `FIRECRAWL_API_KEY`: For accessing the web scraping service
+   - `POSTGRES_URL`: For connecting to our database
+
+Both variables must be stored in our GitHub repository as secrets for this workflow file to run without errors. So, navigate to the repository you've created for the project and open its Settings. Under "Secrets and variables" > "Actions", click the "New repository secret" button to add the environment variables we have in the `.env` file one by one.
+
+Then, return to your terminal, commit the changes and push:
+
+```bash
+git add .
+git commit -m "Add a workflow to check prices regularly"
+git push origin main
+```
+
+Next, navigate to your GitHub repository again and click on the "Actions" tab:
+
+![Screenshot of GitHub Actions interface showing workflow runs and manual trigger button for automated price tracking application](amazon-price-tracking-images/actions.png)
+
+From there, you can run the workflow manually (click "Run workflow" and refresh the page). If it executes successfully, you can return to the Streamlit app and refresh to see the new price added to the chart.
+
+### Step 10: Setting up Discord for notifications
+
+Now that we know our scheduling workflow works, the first order of business is setting a wider check interval in the workflow file. Even though our first workflow run was triggered manually, the rest will happen automatically on schedule.
+
+```yaml
+on:
+  schedule:
+    # Runs every 6 hours
+    - cron: "0 0,6,12,18 * * *"
+  workflow_dispatch: # Allows manual triggering
+```
+
+The cron syntax `0 0,6,12,18 * * *` can be broken down as follows:
+
+- First `0`: Run at minute 0
+- `0,6,12,18`: Run at hours 0 (midnight), 6 AM, 12 PM (noon), and 6 PM
+- First `*`: Run every day of the month
+- Second `*`: Run every month
+- Third `*`: Run every day of the week
+
+So this schedule will check prices four times daily: at midnight, 6 AM, noon, and 6 PM (UTC time). This spacing helps stay within GitHub Actions' free tier limits while still catching most price changes.
+
+Now, commit and push the changes:
+
+```bash
+git add .
+git commit -m "Set a wider check interval in the workflow file"
+git push origin main
+```
+
+Now comes the interesting part. Each time the workflow runs, we want to compare the current price of the product to its original price when we started tracking it. If the difference between these two prices exceeds a certain threshold, like 5%, it means the product is discounted and we want to send a notification.
+
+The easiest way to set this up is by using Discord webhooks. So, if you don't have one already, go to discord.com and create a new account (optionally, download the desktop app as well). Then, setting up Discord notifications requires a few careful steps:
+
+1. **Create a Discord server**
+   - Click the "+" button in the bottom-left corner of Discord
+   - Choose "Create My Own" → "For me and my friends"
+   - Give your server a name (e.g., "Price Alerts")
+
+2. **Create a channel for alerts**
+   - Your server comes with a #general channel by default
+   - You can use this or create a new channel called #price-alerts
+   - Right-click the channel you want to use
+
+3. **Set up the webhook**
+   - Select "Edit Channel" from the right-click menu
+   - Go to the "Integrations" tab
+   - Click "Create Webhook"
+   - Give it a name like "Price Alert Bot"
+   - The webhook URL will be generated automatically
+   - Click "Copy Webhook URL" - this is your unique notification endpoint
+
+4. **Secure the webhook URL**
+   - Never share or commit your webhook URL directly
+   - Add it to your `.env` file as `DISCORD_WEBHOOK_URL`
+   - Add it to your GitHub repository secrets
+   - The URL should look something like: `https://discord.com/api/webhooks/...`
+
+This webhook will serve as a secure endpoint that our price tracker can use to send notifications directly to your Discord channel.
+
+Webhooks are automated messages sent from apps to other apps in real time. They work like a notification system - when something happens in one app, it automatically sends data to another app through a unique URL. In our case, we'll use Discord webhooks to automatically notify us when there's a price drop. Whenever our price tracking script detects a significant discount, it will send a message to our Discord channel through the webhook URL, ensuring we never miss a good deal.
+
+After copying the webhook URL, save it as an environment variable in your `.env` file:
+
+```bash
+echo "DISCORD_WEBHOOK_URL='THE-URL-YOU-COPIED'" >> .env
+```
+
+Now, create a new file called `notifications.py` and paste the following contents:
+
+```python
+from dotenv import load_dotenv
+import os
+import aiohttp
+import asyncio
+
+load_dotenv()
+
+
+async def send_price_alert(
+    product_name: str, old_price: float, new_price: float, url: str
+):
+    """Send a price drop alert to Discord"""
+    drop_percentage = ((old_price - new_price) / old_price) * 100
+
+    message = {
+        "embeds": [
+            {
+                "title": "Price Drop Alert! 🎉",
+                "description": f"**{product_name}**\nPrice dropped by {drop_percentage:.1f}%!\n"
+                f"Old price: ${old_price:.2f}\n"
+                f"New price: ${new_price:.2f}\n"
+                f"[View Product]({url})",
+                "color": 3066993,
+            }
+        ]
+    }
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            await session.post(os.getenv("DISCORD_WEBHOOK_URL"), json=message)
+    except Exception as e:
+        print(f"Error sending Discord notification: {e}")
+```
+
+The `send_price_alert` function above is responsible for sending price drop notifications to Discord using webhooks. Let's break down what's new:
+
+1. 
The function takes four parameters:
+   - `product_name`: The name of the product that dropped in price
+   - `old_price`: The previous price before the drop
+   - `new_price`: The current lower price
+   - `url`: Link to view the product
+
+2. It calculates the percentage drop in price using the formula: `((old_price - new_price) / old_price) * 100`
+
+3. The notification is formatted as a Discord embed - a rich message format that includes:
+   - A title with a celebration emoji
+   - A description showing the product name, price drop percentage, old and new prices
+   - A link to view the product
+   - A green color (3066993 in decimal)
+
+4. The message is sent asynchronously using `aiohttp` to post to the Discord webhook URL stored in the environment variables
+
+5. Error handling is included to catch and print any issues that occur during the HTTP request
+
+This provides a clean way to notify users through Discord whenever we detect a price drop for tracked products.
+
+To check that the notification system works, add this main block to the end of the script:
+
+```python
+if __name__ == "__main__":
+    asyncio.run(send_price_alert("Test Product", 100, 90, "https://www.google.com"))
+```
+
+`asyncio.run()` is used here because `send_price_alert` is an async function that needs to be executed in an event loop. `asyncio.run()` creates and manages this event loop, allowing the async HTTP request to be made properly. Without it, calling `send_price_alert` would only create a coroutine object and the request would never actually be sent.
+
+To run the script, install `aiohttp` and add it to your requirements file:
+
+```bash
+pip install aiohttp
+echo "aiohttp" >> requirements.txt
+python notifications.py
+```
+
+If all is well, you should get a Discord message in your server that looks like this:
+
+![Screenshot of a Discord notification showing a price drop alert with product details, original price, new discounted price and percentage savings](amazon-price-tracking-images/alert.png)
+
+Let's commit the changes we have:
+
+```bash
+git add .
+git commit -m "Set up Discord alert system"
+```
+
+Also, don't forget to add the Discord webhook URL to your GitHub repository secrets!
+
+### Step 11: Sending Discord alerts when prices drop
+
+Now, the only step left is adding price comparison logic to `check_prices.py`. In other words, we want to use the `send_price_alert` function if the new scraped price is lower than the original. 
This requires a revamped `check_prices.py` script:
+
+```python
+import os
+import asyncio
+from database import Database
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+from scraper import scrape_product
+from notifications import send_price_alert
+
+load_dotenv()
+
+db = Database(os.getenv("POSTGRES_URL"))
+app = FirecrawlApp()
+
+# Threshold percentage for price drop alerts (e.g., 5% = 0.05)
+PRICE_DROP_THRESHOLD = 0.05
+
+
+async def check_prices():
+    products = db.get_all_products()
+    product_urls = set(product.url for product in products)
+
+    for product_url in product_urls:
+        # Get the price history
+        price_history = db.get_price_history(product_url)
+        if not price_history:
+            continue
+
+        # Get the earliest recorded price
+        earliest_price = price_history[-1].price
+
+        # Retrieve updated product data
+        updated_product = scrape_product(product_url)
+        current_price = updated_product["price"]
+
+        # Add the price to the database
+        db.add_price(updated_product)
+        print(f"Added new price entry for {updated_product['name']}")
+
+        # Check if price dropped below threshold
+        if earliest_price > 0:  # Avoid division by zero
+            price_drop = (earliest_price - current_price) / earliest_price
+            if price_drop >= PRICE_DROP_THRESHOLD:
+                await send_price_alert(
+                    updated_product["name"], earliest_price, current_price, product_url
+                )
+
+
+if __name__ == "__main__":
+    asyncio.run(check_prices())
+```
+
+Let's examine the key changes in this enhanced version of `check_prices.py`:
+
+1. New imports and setup
+   - Added `asyncio` for `async`/`await` support
+   - Imported `send_price_alert` from `notifications.py`
+   - Defined `PRICE_DROP_THRESHOLD = 0.05` (5% threshold for alerts)
+
+2. Async function conversion
+   - Converted `check_prices()` to an async function
+   - Builds a set of unique product URLs to avoid scraping duplicates
+
+3. Price history analysis
+   - Retrieves the full price history for each product
+   - Gets `earliest_price` from `price_history[-1]` (works because we ordered by timestamp DESC)
+   - Skips products with no price history using `continue`
+
+4. Price drop detection logic
+   - Calculates the drop percentage: `(earliest_price - current_price) / earliest_price`
+   - Checks if the drop exceeds the 5% threshold
+   - Sends a Discord alert with `await send_price_alert()` if the threshold is exceeded
+
+5. Async main block
+   - Uses `asyncio.run()` to execute the async `check_prices()` in an event loop
+
+When I tested this new version of the script, I immediately got an alert:
+
+![Screenshot of a Discord notification showing a price drop alert for an Amazon product, displaying the original and discounted prices with percentage savings](amazon-price-tracking-images/new-alert.png)
+
+Before we supercharge our workflow with the new notification system, you should add this line of code to your `check_prices.yml` workflow file to read the Discord webhook URL from your GitHub secrets:
+
+```yaml
+...
+      - name: Run price checker
+        env:
+          FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }}
+          POSTGRES_URL: ${{ secrets.POSTGRES_URL }}
+          DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }}
+        run: python check_prices.py
+```
+
+Finally, let's commit everything and push to GitHub so that our workflow is supercharged with our notification system:
+
+```bash
+git add .
+git commit -m "Add notification system to price drops"
+git push origin main
+```
+
+## Limitations of Free Tier Tools Used in the Tutorial
+
+Before wrapping up, let's quickly review the limitations of the free tools we used in this tutorial:
+
+- GitHub Actions: Limited to 2,000 minutes per month for free accounts. Consider increasing the cron interval to stay within limits.
+- Supabase: Free tier includes 500MB database storage and limited row count. Monitor usage if tracking many products.
+- Firecrawl: Free API tier allows 500 requests per month. At 6-hour intervals, that means you can track up to four products on the free plan.
+- Streamlit Cloud: Free hosting tier has some memory/compute restrictions and goes to sleep after inactivity.
+
+While these limitations exist, they're quite generous for personal use and learning. The app will work well for tracking a reasonable number of products with daily price checks.
+
+## Conclusion and Next Steps
+
+Congratulations on making it to the end of this extremely long tutorial! We've just covered how to implement an end-to-end Python project you can proudly showcase on your portfolio. We built a complete price tracking system that scrapes product data from e-commerce websites, stores it in a Postgres database, analyzes price histories, and sends automated Discord notifications when prices drop significantly. Along the way, we learned about web scraping with Firecrawl, database management with SQLAlchemy, asynchronous programming with asyncio, building interactive UIs with Streamlit, automating with GitHub Actions, and integrating external webhooks.
+
+However, the project is far from perfect. Since we took a top-down approach to building this app, our project code is scattered across multiple files and often doesn't follow programming best practices. For this reason, I've recreated the same project in a much more sophisticated manner with production-level features. [This new version on GitHub](https://github.com/BexTuychiev/automated-price-tracking) implements proper database session management, faster operations and an overall smoother user experience. This version also includes buttons for removing products from the database and visiting them through the app.
+
+If you decide to stick with the basic version, you can find the full project code and notebook in [the official Firecrawl GitHub repository's example projects](https://github.com/mendableai/firecrawl/tree/main/examples/automated_price_tracking). I also recommend that you [deploy your Streamlit app to Streamlit Cloud](https://share.streamlit.io) so that you have a functional app accessible everywhere you go.
+
+Here are some further improvements you might consider for the app:
+
+- Improve the price comparison logic: the app compares the current price to the oldest recorded price, which might not be ideal. You may want to compare against recent price trends instead.
+- Add currency conversion handling for products listed in different currencies.
+- Make the Discord notification system handle rate limits and webhook failures gracefully.
+- Add error handling for the Firecrawl scraper - what happens if scraping fails?
+- Use logging consistently to make issues easier to track in production.
+- Sanitize and validate input URLs before scraping (see the short sketch below).
+
+Some of these features are implemented in [the advanced version of the project](https://github.com/BexTuychiev/automated-price-tracking), so definitely check it out!
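+
+As a small illustration of that last point, here is one way you could validate URLs before handing them to the scraper. This is only a sketch - the helper name `is_valid_product_url` and the allowed-scheme check are assumptions for the example, not part of the project code:
+
+```python
+from urllib.parse import urlparse
+
+
+def is_valid_product_url(url: str) -> bool:
+    """Return True if the URL looks like something we can safely scrape."""
+    try:
+        parsed = urlparse(url.strip())
+    except ValueError:
+        return False
+    # Require an http(s) scheme and a non-empty hostname
+    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
+
+
+# Example usage in the Streamlit form handler (hypothetical placement):
+# if not is_valid_product_url(url):
+#     st.error("Please enter a valid product URL starting with http:// or https://")
+```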
+ +Here are some more guides from our blog if you are interested: + +- [How to Run Web Scrapers on Schedule](https://www.firecrawl.dev/blog/automated-web-scraping-free-2025) +- [More about using Firecrawl's `scrape_url` function](https://www.firecrawl.dev/blog/mastering-firecrawl-scrape-endpoint) +- [Scraping entire websites with Firecrawl in a single command - the /crawl endpoint](https://www.firecrawl.dev/blog/mastering-the-crawl-endpoint-in-firecrawl) + +Thank you for reading! diff --git a/examples/blog-articles/github-actions-tutorial/github-actions-tutorial-images/cron-syntax.png b/examples/blog-articles/github-actions-tutorial/github-actions-tutorial-images/cron-syntax.png new file mode 100644 index 00000000..d790c1ef Binary files /dev/null and b/examples/blog-articles/github-actions-tutorial/github-actions-tutorial-images/cron-syntax.png differ diff --git a/examples/blog-articles/github-actions-tutorial/notebook.ipynb b/examples/blog-articles/github-actions-tutorial/notebook.ipynb new file mode 100644 index 00000000..42041b34 --- /dev/null +++ b/examples/blog-articles/github-actions-tutorial/notebook.ipynb @@ -0,0 +1,1630 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Comprehensive GitHub Actions Tutorial For Beginners With Examples in Python" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "GitHub Actions is a powerful automation platform that helps developers automate tedious, time-wasting software development workflows. Instead of running tests, executing scripts at intervals, or doing any programmable task manually, you can let GitHub Actions take the wheel when certain events happen in your repository. In this tutorial, you will learn how to use this critical feature of GitHub and design your own workflows for several real-world use cases." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What are GitHub Actions?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At its core, [GitHub Actions](https://docs.github.com/en/actions) is a continuous integration and continuous delivery (CI/CD) platform that lets you automate various tasks directly from your GitHub repository. Think of it as your personal robot assistant that can:\n", + "\n", + "- Run your Python tests automatically when you push code\n", + "- Deploy your application when you create a new release\n", + "- Send notifications when issues are created\n", + "- Schedule tasks to run at specific times\n", + "- And much more..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Why automate with GitHub Actions?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at a common scenario: You are building a Python application that scrapes product prices from various e-commerce websites. Without GitHub actions, you would need to:\n", + "\n", + "1. Manually run your tests after each code change\n", + "2. Remember to execute the scraper at regular intervals\n", + "3. Deploy updates to your production environment\n", + "4. 
Keep track of environment variables and secrets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "With GitHub actions, all of these tasks can be automated through workflows, usually written in YAML files like below:\n", + "\n", + "```yaml\n", + "name: Run Price Scraper\n", + "\n", + "on:\n", + " schedule:\n", + " - cron: '0 */12 * * *' # Runs every 12 hours\n", + " workflow_dispatch: # Allows manual triggers\n", + "\n", + "jobs:\n", + " scrape:\n", + " runs-on: ubuntu-latest\n", + " \n", + " steps:\n", + " - uses: actions/checkout@v3\n", + " - name: Set up Python\n", + " uses: actions/setup-python@v4\n", + " with:\n", + " python-version: '3.9'\n", + " \n", + " - name: Run scraper\n", + " env:\n", + " API_KEY: ${{ secrets.FIRECRAWL_API_KEY }}\n", + " run: python scraper.py\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This workflow automatically runs a scraper every 12 hours, handles Python version setup, and securely manages API keys - all without manual intervention." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What we'll build in this tutorial" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Throughout this tutorial, we'll build several practical GitHub Actions workflows for Python applications. You will learn how to:\n", + "\n", + "1. Create basic and advanced workflow configurations.\n", + "2. Work with environment variables and secrets.\n", + "3. Set up automated testing pipelines.\n", + "4. Build a real-world example: an automated scraping system app [Firecrawl](https://firecrawl.dev) in Python.\n", + "5. Implement best practices for security and efficiency. \n", + "\n", + "By the end, you will have hands-on experience with GitHub Actions and be able to automate your own Python projects effectively. \n", + "\n", + "> Note: Even though code examples are Python, the concepts and hands-on experience you will gain from the tutorial will apply to any programming language. \n", + "\n", + "Let's start by understanding the core concepts that make GitHub Actions work." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding GitHub Actions Core Concepts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To write your own GitHub Actions workflows, you need to understand how its different components work together. Let's break down these core concepts using a practical example: automating tests for a simple Python script." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GitHub Actions workflows and their components" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A workflow is an automated process that you define in a YAML file within your repository's `.github/workflows` directory. Think of it as a recipe that tells GitHub exactly what to do, how and when to do it. 
You can transform virtually any programmable task into a GitHub workflow as long as it can be executed in a Linux, Windows, or macOS environment and doesn't require direct user interaction.\n", + "\n", + "Here is a basic workflow structure:\n", + "\n", + "```yaml\n", + "# test.yaml\n", + "name: Python Tests\n", + "on: [push, pull_request]\n", + "\n", + "jobs:\n", + " test:\n", + " runs-on: ubuntu-latest\n", + " steps:\n", + " - name: Check out repository\n", + " uses: actions/checkout@v3\n", + " \n", + " - name: Set up Python\n", + " uses: actions/setup-python@v4\n", + " with:\n", + " python-version: '3.9'\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The YAML file starts by specifying the name of the workflow with the `name` field. Immediately after, we specify the events that triggers this workflow. In this example, the workflow automatically executes on each `git push` command and pull request. We will learn more about events and triggers in a later section. \n", + "\n", + "Next, we define jobs, which are the building blocks of workflows. Each job:\n", + "\n", + "- Runs on a fresh virtual machine (called a runner) that is specified using the `runs-on` field.\n", + "- Can execute multiple steps in sequence\n", + "- Can run in parallel with other jobs\n", + "- Has access to shared workflow data\n", + "\n", + "For example, you might have separate jobs for testing and deployment:\n", + "\n", + "```yaml\n", + "jobs:\n", + " test:\n", + " runs-on: ubuntu-latest\n", + " ...\n", + " deploy:\n", + " runs-on: macos-latest\n", + " ...\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Each job can contain one or more `steps` that are executed sequentially. Steps are individual tasks that make up your job. They can:\n", + "\n", + "- Run commands or shell scripts\n", + "- Execute actions (reusable units of code)\n", + "- Run commands in Docker containers\n", + "- Reference other GitHub repositories\n", + "\n", + "For example, a typical test job might have steps to:\n", + "\n", + "1. Check out (clone) code from your GitHub repository\n", + "2. Set up dependencies\n", + "3. Run tests\n", + "4. Upload test results\n", + "\n", + "Each step can specify:\n", + "\n", + "- `name`: A display name for the step\n", + "- `uses`: Reference to an action to run\n", + "- `run`: Any operating-system specific terminal command like `pip install package` or `python script.py`\n", + "- `with`: Input parameters for actions\n", + "- `env`: Environment variables for the step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we understand jobs and steps, let's look at Actions - the reusable building blocks that make GitHub Actions so powerful.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `test.yaml` file from earlier has a single `test` job that executes two steps:\n", + "\n", + "1. Checking out the repository code using a built-in `actions/checkout@v3` action.\n", + "2. 
Setting up a Python environment with `actions/setup-python@v4` and `python-version` as an input parameter for said action.\n", + "\n", + "```bash\n", + "# test.yaml\n", + "name: Python Tests\n", + "on: [push, pull_request]\n", + "\n", + "jobs:\n", + " test:\n", + " runs-on: ubuntu-latest\n", + " steps:\n", + " - name: Check out repository\n", + " uses: actions/checkout@v3\n", + " \n", + " - name: Set up Python\n", + " uses: actions/setup-python@v4\n", + " with:\n", + " python-version: '3.9'\n", + "```\n", + "\n", + "Actions are reusable units of code that can be shared across workflows (this is where GitHub Actions take its name). They are like pre-packaged functions that handle common tasks. For instance, instead of writing code to set up Node.js or caching dependencies, you can use the GitHub official actions like:\n", + "\n", + "- `actions/setup-node@v3` - Sets up Node.js environment\n", + "- `actions/cache@v3` - Caches dependencies and build outputs\n", + "- `actions/upload-artifact@v3` - Uploads workflow artifacts\n", + "- `actions/download-artifact@v3` - Downloads workflow artifacts\n", + "- `actions/labeler@v4` - Automatically labels pull requests\n", + "- `actions/stale@v8` - Marks and closes stale issues/PRs\n", + "- `actions/dependency-review-action@v3` - Reviews dependency changes\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Events and triggers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Events are specific activities that trigger a workflow. Common triggers include:\n", + "\n", + "- `push`: When code is pushed to the repository\n", + "- `pull_request`: When a PR is opened or updated\n", + "- `schedule`: At specified times using cron syntax\n", + "- `workflow_dispatch`: Manual trigger via GitHub UI\n", + "\n", + "Here is how you can configure multiple triggers:\n", + "\n", + "```yaml\n", + "name: Comprehensive Workflow\n", + "on:\n", + " push:\n", + " branches: [main]\n", + " pull_request:\n", + " branches: [main]\n", + " schedule:\n", + " - cron: '0 0 * * *' # Daily at midnight\n", + " workflow_dispatch: # Manual trigger\n", + "\n", + "jobs:\n", + " process:\n", + " runs-on: ubuntu-latest\n", + " steps:\n", + " - uses: actions/checkout@v3\n", + " - name: Run daily tasks\n", + " run: python daily_tasks.py\n", + " env:\n", + " API_KEY: ${{ secrets.FIRECRAWL_API_KEY }}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example shows how a single workflow can:\n", + "\n", + "- Run automatically on code changes on `git push`\n", + "- Execute daily scheduled tasks with cron\n", + "- Be triggered automatically when needed through the GitHub UI\n", + "- Handle sensitive data like API keys securely" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cron jobs in GitHub Actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the `schedule` trigger effectively in GitHub Actions, you'll need to understand cron syntax. This powerful scheduling format lets you automate workflows to run at precise times. 
The syntax uses five fields to specify when a job should run:\n", + "\n", + "![](images/cron-syntax.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are some common cron schedule examples:\n", + "\n", + "```yaml\n", + "# Daily at 3:30 AM UTC\n", + "- cron: '30 3 * * *'\n", + "\n", + "# Every Monday at 1:00 PM UTC\n", + "- cron: '0 13 * * 1'\n", + "\n", + "# Every 6 hours at the first minute\n", + "- cron: '0 */6 * * *'\n", + "\n", + "# At minute 15 of every hour\n", + "- cron: '15 * * * *'\n", + "\n", + "# Every weekday (Monday through Friday)\n", + "- cron: '0 0 * * 1-5'\n", + "\n", + "# Each day at 12am, 6am, 12pm, 6pm on Tuesday, Thursday, Saturday\n", + "- cron: '0 0,6,12,18 * * 1,3,5'\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is a sample workflow for a scraping job with four different schedules (multiple schedules are allowed):\n", + "\n", + "```yaml\n", + "name: Price Scraper Schedules\n", + "on:\n", + " schedule:\n", + " - cron: '0 */4 * * *' # Every 4 hours\n", + " - cron: '30 1 * * *' # Daily at 1:30 AM UTC\n", + " - cron: '0 9 * * 1-5' # Weekdays at 9 AM UTC\n", + "\n", + "jobs:\n", + " scrape:\n", + " runs-on: ubuntu-latest\n", + " steps:\n", + " - uses: actions/checkout@v3\n", + " - name: Run Firecrawl scraper\n", + " env:\n", + " API_KEY: ${{ secrets.FIRECRAWL_API_KEY }}\n", + " run: python scraper.py\n", + "```\n", + "\n", + "Remember that GitHub Actions runs on UTC time, and schedules might experience slight delays during peak GitHub usage. That's why it's helpful to combine `schedule` with `workflow_dispatch` as we saw earlier - giving you both automated and manual trigger options." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "---------------\n", + "\n", + "Understanding these core concepts allows you to create workflows that are efficient (running only when needed), secure (properly handling sensitive data), maintainable (using reusable actions) and scalable (running on different platforms). \n", + "\n", + "In the next section, we will put these concepts into practice by creating your first GitHub actions workflow." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating Your First GitHub Actions Workflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a practical GitHub Actions workflow from scratch. We'll build a workflow that automatically tests a Python script and runts it on a schedule - a universal task applicable to any programming language. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting up the environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start by creating a working directory for this mini-project:\n", + "\n", + "```bash\n", + "mkdir first-workflows\n", + "cd first-workflows\n", + "```\n", + "\n", + "Let's create the standard `.github/workflows` folder structure GitHub uses for detecting workflow files:\n", + "\n", + "```bash\n", + "mkdir -p .github/workflows\n", + "```\n", + "\n", + "The workflow files can have any name but must have a `.yml` extension:\n", + "\n", + "```bash\n", + "touch .github/workflows/system_monitor.yml\n", + "```\n", + "\n", + "In addition to the workflows folder, create a `tests` folder as well as a test file:\n", + "\n", + "```bash\n", + "mkdir tests\n", + "touch tests/test_main.py\n", + "```\n", + "\n", + "We should also create the `main.py` file along with a `requirements.txt`:\n", + "\n", + "```bash\n", + "touch main.py requirements.txt\n", + "```\n", + "\n", + "Then, add these two dependencies to `requirements.txt`:\n", + "\n", + "```text\n", + "psutil>=5.9.0\n", + "pytest>=7.0.0\n", + "```\n", + "\n", + "Finally, let's initialize git and make our first commit:\n", + "\n", + "```bash\n", + "git init \n", + "git add .\n", + "git commit -m \"Initial commit\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check out the [Git documentation](https://git-scm.com/doc) if you don't have it installed already." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Writing your first workflow file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's write the workflow logic first. Open `system_monitor.yml` and paste each code snippet we are about to define one after the other. \n", + "\n", + "1. Workflow name and triggers:\n", + "\n", + "```yaml\n", + "name: System Monitoring\n", + "on:\n", + " schedule:\n", + " - cron: '*/30 * * * *' # Run every 30 minutes\n", + " workflow_dispatch: # Enables manual trigger\n", + "```\n", + "\n", + "In this part, we give a descriptive name to the workflow that appears in GitHub's UI. Using the `on` field, we set the workflow to run every 30 minutes and through a manual trigger." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "2. Job definition:\n", + "\n", + "```yaml\n", + "jobs:\n", + " run_script:\n", + " runs-on: ubuntu-latest\n", + "```\n", + "\n", + "`jobs` contains all the jobs in this workflow and it has a `run_script` name, which is a unique identifier. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3. Steps:\n", + "\n", + "There are five steps that run sequentially in this workflow. They are given descriptive names that appear in the GitHub UI and uses official GitHub actions and custom terminal commands. 
\n", + "\n", + "```yaml\n", + "jobs:\n", + " monitor:\n", + " runs-on: ubuntu-latest\n", + " \n", + " steps:\n", + " - name: Check out repository\n", + " uses: actions/checkout@v3\n", + " \n", + " - name: Set up Python\n", + " uses: actions/setup-python@v4\n", + " with:\n", + " python-version: '3.9'\n", + " \n", + " - name: Install dependencies\n", + " run: |\n", + " python -m pip install --upgrade pip\n", + " pip install -r requirements.txt\n", + " \n", + " - name: Run tests\n", + " run: pytest tests/\n", + " \n", + " - name: Collect system metrics\n", + " run: python main.py\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is what each step does:\n", + "\n", + "1. Check out repository code with `actions/checkout@v3`.\n", + "2. Configures Python 3.9 environment.\n", + "3. Runs two terminal commands that:\n", + " - Install/upgrade `pip`\n", + " - Install `pytest` package\n", + "4. Runs the tests located in the `tests` directory using `pytest`.\n", + "5. Executes the main script with `python main.py`. \n", + "\n", + "Notice the use of `|` (pipe) operator for multi-line commands.\n", + "\n", + "After you complete writing the workflow, commit the changes to Git:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add a workflow file for monitoring system resources\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating the Python script" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's write the `main.py` file, which is a monitoring script that helps software developers track system resource usage over time, enabling them to identify performance bottlenecks and capacity issues in their development environment. \n", + "\n", + "```python\n", + "import psutil\n", + "import json\n", + "from datetime import datetime\n", + "from pathlib import Path\n", + "```\n", + "\n", + "This script collects and logs system metrics over time. It uses `psutil` to gather CPU usage, memory usage, disk usage, and active process counts. 
The metrics are timestamped and saved to JSON files organized by date.\n", + "\n", + "The script has three main functions:\n", + "\n", + "```python\n", + "def get_system_metrics():\n", + " \"\"\"Collect key system metrics\"\"\"\n", + " metrics = {\n", + " \"cpu_percent\": psutil.cpu_percent(interval=1),\n", + " \"memory_percent\": psutil.virtual_memory().percent,\n", + " \"disk_usage\": psutil.disk_usage('/').percent,\n", + " \"timestamp\": datetime.now().isoformat()\n", + " }\n", + " \n", + " # Add running processes count\n", + " metrics[\"active_processes\"] = len(psutil.pids())\n", + " \n", + " return metrics\n", + "```\n", + "\n", + "`get_system_metrics()` - Collects current system metrics including CPU percentage, memory usage percentage, disk usage percentage, timestamp, and count of active processes.\n", + "\n", + "```python\n", + "def save_metrics(metrics):\n", + " \"\"\"Save metrics to a JSON file with today's date\"\"\"\n", + " date_str = datetime.now().strftime(\"%Y-%m-%d\")\n", + " reports_dir = Path(\"system_metrics\")\n", + " reports_dir.mkdir(exist_ok=True)\n", + " \n", + " # Save to daily file\n", + " file_path = reports_dir / f\"metrics_{date_str}.json\"\n", + " \n", + " # Load existing metrics if file exists\n", + " if file_path.exists():\n", + " with open(file_path) as f:\n", + " daily_metrics = json.load(f)\n", + " else:\n", + " daily_metrics = []\n", + " \n", + " # Append new metrics\n", + " daily_metrics.append(metrics)\n", + " \n", + " # Save updated metrics\n", + " with open(file_path, 'w') as f:\n", + " json.dump(daily_metrics, f, indent=2)\n", + "```\n", + "\n", + "`save_metrics()` - Handles saving the metrics to JSON files. It creates a `system_metrics` directory if needed, and saves metrics to date-specific files (e.g. `metrics_2024-12-12.json`). If a file for the current date exists, it loads and appends to it, otherwise creates a new file.\n", + "\n", + "```python\n", + "def main():\n", + " try:\n", + " metrics = get_system_metrics()\n", + " save_metrics(metrics)\n", + " print(f\"System metrics collected at {metrics['timestamp']}\")\n", + " print(f\"CPU: {metrics['cpu_percent']}% | Memory: {metrics['memory_percent']}%\")\n", + " return True\n", + " except Exception as e:\n", + " print(f\"Error collecting metrics: {str(e)}\")\n", + " return False\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()\n", + "```\n", + "\n", + "`main()` - Orchestrates the metric collection and saving process. It calls `get_system_metrics()`, saves the data via `save_metrics()`, prints current CPU and memory usage to console, and handles any errors that occur during execution.\n", + "\n", + "The script can be run directly or imported as a module. When run directly (which is what happens in a GitHub Actions workflow), it executes the `main()` function which collects and saves one set of metrics." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Combine the code snippets above into the `main.py` file and commit the changes:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add the main.py functionality\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Adding tests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Testing is a critical part of software engineering workflows for several reasons:\n", + "\n", + "1. Reliability: Tests help ensure code behaves correctly and consistently across changes.\n", + "2. 
Regression prevention: Tests catch when new changes break existing functionality.\n", + "3. Documentation: Tests serve as executable documentation of expected behavior\n", + "4. Design feedback: Writing tests helps identify design issues early\n", + "5. Confidence: A good test suite gives confidence when refactoring or adding features\n", + "\n", + "For our system metrics collection script, tests would be valuable to verify:\n", + "\n", + "- The `get_system_metrics()` function returns data in the expected format with valid ranges\n", + "- The `save_metrics()` function properly handles file operations and JSON serialization\n", + "- Error handling works correctly for various failure scenarios\n", + "- The `main()` function orchestrates the workflow as intended\n", + "\n", + "With that said, let's work on the `tests/test_main.py` file:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "import json\n", + "from datetime import datetime\n", + "from pathlib import Path\n", + "from main import get_system_metrics, save_metrics\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The test file we are about to write demonstrates key principles of testing with `pytest`, a popular Python testing framework. Pytest makes it easy to write tests by using simple `assert` statements and providing a rich set of features for test organization and execution. The test functions are automatically discovered by `pytest` when their names start with `test_`, and each function tests a specific aspect of the system's functionality.\n", + "\n", + "```python\n", + "def test_get_system_metrics():\n", + " \"\"\"Test if system metrics are collected correctly\"\"\"\n", + " metrics = get_system_metrics()\n", + " \n", + " # Check if all required metrics exist and are valid\n", + " assert 0 <= metrics['cpu_percent'] <= 100\n", + " assert 0 <= metrics['memory_percent'] <= 100\n", + " assert metrics['active_processes'] > 0\n", + " assert 'timestamp' in metrics\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we have two test functions that verify different components of our metrics collection system. The first test, `test_get_system_metrics()`, checks if the metrics collection function returns data in the expected format and with valid ranges. It uses multiple assert statements to verify that CPU and memory percentages are between 0 and 100, that there are active processes, and that a timestamp is included. This demonstrates the practice of testing both the structure of returned data and the validity of its values.\n", + "\n", + "```python\n", + "def test_save_and_read_metrics():\n", + " \"\"\"Test if metrics are saved and can be read back\"\"\"\n", + " # Get and save metrics\n", + " metrics = get_system_metrics()\n", + " save_metrics(metrics)\n", + " \n", + " # Check if file exists and contains data\n", + " date_str = datetime.now().strftime(\"%Y-%m-%d\")\n", + " file_path = Path(\"system_metrics\") / f\"metrics_{date_str}.json\"\n", + " \n", + " assert file_path.exists()\n", + " with open(file_path) as f:\n", + " saved_data = json.load(f)\n", + " \n", + " assert isinstance(saved_data, list)\n", + " assert len(saved_data) > 0\n", + "```\n", + "\n", + "The second test, `test_save_and_read_metrics()`, showcases integration testing by verifying that metrics can be both saved to and read from a file. 
It follows a common testing pattern: arrange (setup the test conditions), act (perform the operations being tested), and assert (verify the results). The test ensures that the file is created in the expected location and that the saved data maintains the correct structure. This type of test is particularly valuable as it verifies that different components of the system work together correctly.\n", + "\n", + "Combine the above code snippets and commit the changes to GitHub:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```bash\n", + "git add .\n", + "git commit -m \"Write tests for main.py\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Running your first GitHub Actions workflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we've created our system monitoring workflow, let's set it up on GitHub and run it. First, push everything we have to a new GitHub repository:\n", + "\n", + "```bash\n", + "git remote add origin https://github.com/your-username/your-repository.git\n", + "git branch -M main\n", + "git push -u origin main\n", + "```\n", + "\n", + "Once the workflow file is pushed, GitHub automatically detects it and displays in the \"Actions\" tab of your repository. The workflow is scheduled so you don't need to do anything - the first workflow run will happen within 30 minutes (remember how we set the running interval to `*/30` with cron). Since the workflow also includes a `workflow_dispatch` field, you can trigger it manually by clicking on the \"Run workflow\" button. \n", + "\n", + "After clicking the button, a new run appears within a few seconds (refresh if you don't see it). To see the workflow run in real-time, click on it and expand the `monitor` job. You'll see each step executing:\n", + "\n", + "- Checking out repository\n", + "- Setting up Python\n", + "- Installing dependencies\n", + "- Running tests\n", + "- Collecting metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Committing changes made by GitHub Actions workflows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Right now, our workflow file has a problem - while it successfully collects metrics, it doesn't commit and push the changes back to the repository. This means that although metrics are being gathered, they aren't being saved in version control. 
Let's modify the workflow to automatically commit and push the collected metrics:\n", + "\n", + "```yaml\n", + "name: System Monitor\n", + "\n", + "on:\n", + " schedule:\n", + " - cron: '*/30 * * * *'\n", + " workflow_dispatch:\n", + "\n", + "permissions:\n", + " contents: write\n", + "\n", + "jobs:\n", + " monitor:\n", + " runs-on: ubuntu-latest\n", + " steps:\n", + " - uses: actions/checkout@v3\n", + " \n", + " - name: Set up Python\n", + " uses: actions/setup-python@v4\n", + " with:\n", + " python-version: '3.10'\n", + " \n", + " - name: Install dependencies\n", + " run: |\n", + " python -m pip install --upgrade pip\n", + " pip install -r requirements.txt\n", + " \n", + " - name: Run tests\n", + " run: python -m pytest\n", + " \n", + " - name: Collect metrics\n", + " run: python main.py\n", + " \n", + " - name: Commit and push changes\n", + " run: |\n", + " git config --global user.name 'github-actions[bot]'\n", + " git config --global user.email 'github-actions[bot]@users.noreply.github.com'\n", + " git add metrics.json\n", + " git commit -m \"Update metrics\" || exit 0\n", + " git push\n", + "```\n", + "\n", + "The key changes in this updated workflow are:\n", + "\n", + "1. Added `permissions` block with `contents: write` to allow the workflow to push changes back to the repository.\n", + "2. Added a new \"Commit and push changes\" step that:\n", + " - Configures git user identity as `github-actions` bot\n", + " - Stages the `metrics.json` file\n", + " - Creates a commit with message \"Update metrics\" \n", + " - Pushes the changes back to the repository\n", + " \n", + "The \"|| exit 0\" after `git commit` ensures the workflow doesn't fail if there are no changes to commit.\n", + "\n", + "This allows the workflow to automatically save and version control the metrics it collects. Let's commit the changes to the workflow and push:\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add a commit step to the workflow file\"\n", + "git push origin main\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After this change, try running the workflow manually and verify its success by navigating to the Actions tab in your GitHub repository. You should see the workflow run and the `metrics.json` file updated with new data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Managing Sensitive Data and Environment Variables" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When building automated workflows with GitHub Actions, proper handling of sensitive data like API keys, passwords, and access tokens is crucial for security. Let's explore best practices for managing these credentials." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Understanding environment variables in GitHub Actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Environment variables in GitHub Actions can be set at different levels:\n", + "\n", + "- Repository level (GitHub Secrets)\n", + "- Workflow level\n", + "- Job level\n", + "- Step level\n", + "\n", + "Here is how to properly configure and use them:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1. Setting up repository secrets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, store sensitive values as repository secrets:\n", + "\n", + "1. Navigate to your GitHub repository\n", + "2. 
Go to Settings โ†’ Secrets and variables โ†’ Actions\n", + "3. Click \"New repository secret\"\n", + "4. Add your secrets with descriptive names like:\n", + "\n", + "- `API_KEY`\n", + "- `DATABASE_URL`\n", + "- `AWS_ACCESS_KEY`\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Using secrets in workflows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Reference secrets in your workflow file using the `secrets` context:\n", + "\n", + "```yaml\n", + "name: Web Scraping Pipeline\n", + "# ... the rest of the file\n", + "\n", + "jobs:\n", + " scrape:\n", + " runs-on: ubuntu-latest\n", + " \n", + " steps:\n", + " # ... the rest of the steps\n", + " \n", + " - name: Run scraper\n", + " env:\n", + " API_KEY: ${{ secrets.API_KEY }}\n", + " DATABASE_URL: ${{ secrets.DATABASE_URL }}\n", + " run: python scraper.py\n", + "```\n", + "\n", + "Above, the \"Run scraper\" step executes `scraper.py` which relies on two environment variables configured through the `secrets` context. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Local development with .env files" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For local development, use `.env` files to manage environment variables:\n", + "\n", + "```bash\n", + "touch .env\n", + "echo \"API_KEY='your-api-key-here'\" >> .env\n", + "echo \"DATABASE_URL='postgresql://user:pass@localhost:5432/db'\" >> .env\n", + "```\n", + "\n", + "Create a `.gitignore` file to prevent committing sensitive data:\n", + "\n", + "```bash\n", + "echo \".env\" >> .gitignore\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Loading environment variables in Python" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `python-dotenv` to load variables from `.env` files:\n", + "\n", + "```python\n", + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "# Load environment variables from .env file\n", + "load_dotenv()\n", + "\n", + "# Access variables\n", + "api_key = os.getenv('API_KEY')\n", + "database_url = os.getenv('DATABASE_URL')\n", + "\n", + "if not api_key or not database_url:\n", + " raise ValueError(\"Missing required environment variables\")\n", + "```\n", + "\n", + "This code demonstrates loading environment variables from a `.env` file using `python-dotenv`. The `load_dotenv()` function reads the variables, which can then be accessed via `os.getenv()`. Basic validation ensures required variables exist." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5. Environment variable validation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a configuration class to validate environment variables:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "from pydantic import BaseSettings, SecretStr\n", + "\n", + "\n", + "class Settings(BaseSettings):\n", + " api_key: SecretStr\n", + " database_url: str\n", + " debug_mode: bool = False\n", + "\n", + " class Config:\n", + " env_file = \".env\"\n", + " env_file_encoding = \"utf-8\"\n", + "\n", + "\n", + "# Initialize settings\n", + "settings = Settings()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This approach using Pydantic provides several advantages for environment variable management:\n", + "\n", + "1. 
Type validation - Pydantic automatically validates types and converts values\n", + "2. Default values - The `debug_mode` demonstrates setting defaults\n", + "3. Secret handling - `SecretStr` provides secure handling of sensitive values\n", + "4. Centralized config - All environment variables are defined in one place\n", + "5. IDE support - Get autocomplete and type hints when using the settings object\n", + "\n", + "The `Settings` class inherits from `BaseSettings` which automatically loads from environment variables. The `Config` class specifies to also load from a `.env` file.\n", + "\n", + "Using `settings = Settings()` creates a validated configuration object that can be imported and used throughout the application. This is more robust than accessing `os.environ` directly.\n", + "\n", + "Example usage:\n", + "\n", + "```python\n", + "settings.api_key.get_secret_value() # Securely access API key\n", + "settings.database_url # Type-checked database URL\n", + "settings.debug_mode # Boolean with default value\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6. Handle different environments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Handle different environments (development, staging, production) using environment-specific files:\n", + "\n", + "```bash\n", + ".env # Default environment variables\n", + ".env.development # Development-specific variables\n", + ".env.staging # Staging-specific variables\n", + ".env.production # Production-specific variables\n", + "```\n", + "\n", + "Load the appropriate file based on the environment:\n", + "\n", + "```bash\n", + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "env = os.getenv('ENVIRONMENT', 'development')\n", + "env_file = f'.env.{env}'\n", + "\n", + "load_dotenv(env_file)\n", + "```\n", + "\n", + "This approach allows you to maintain separate configurations for different environments while keeping sensitive information secure. The environment-specific files can contain different values for the same variables, such as:\n", + "\n", + "- Development environment may use local services and dummy credentials\n", + "- Staging environment may use test services with restricted access\n", + "- Production environment contains real credentials and production service endpoints\n", + "\n", + "You can also combine this with the `Pydantic` settings approach shown earlier for robust configuration management across environments.\n", + "\n", + "For example, staging might use a test database while production uses the live database:\n", + "\n", + "```bash\n", + "# .env.staging:\n", + "DATABASE_URL=postgresql://test-db.example.com\n", + "API_KEY=test-key\n", + "```\n", + "\n", + "```bash\n", + ".env.production:\n", + "DATABASE_URL=postgresql://prod-db.example.com \n", + "API_KEY=live-key\n", + "```\n", + "\n", + "This separation helps prevent accidental use of production resources during development and testing." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building Real-World Python Workflows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's explore three practical examples of GitHub Actions workflows for common Python tasks: web scraping, package publishing, and container builds." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Scheduled web scraping with Firecrawl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Web scraping is a common use case for automated workflows. Let's build a workflow that scrapes [Hacker News](https://news.ycombinator.com/) on a schedule using [Firecrawl](https://docs.firecrawl.dev), which is a Python AI-based web scraping engine designed for large-scale data collection. Here are some key benefits that make Firecrawl an excellent choice for this task:\n", + "\n", + "1. **Enterprise-grade automation and scalability** - Firecrawl streamlines web scraping with powerful automation features.\n", + "2. **AI-powered content extraction** - Maintains scraper reliability over time by identifying and extracting data based on semantic descriptions instead of relying HTML elements and CSS selectors.\n", + "3. **Handles complex scraping challenges** - Automatically manages proxies, anti-bot mechanisms, and dynamic JavaScript content.\n", + "4. **Multiple output formats** - Supports scraping and converting data in markdown, tabular, screenshots, and HTML, making it versatile for various applications.\n", + "5. **Built-in rate limiting and request management** - Ensures efficient and compliant data extraction.\n", + "6. **Geographic location customization** - Avoids IP bans by customizing the geographic location of requests." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's build our web scraping workflow using Firecrawl to demonstrate these capabilities.\n", + "\n", + "```bash\n", + "# Create project directory and install dependencies\n", + "mkdir hacker-news-scraper && cd hacker-news-scraper\n", + "pip install firecrawl-py pydantic python-dotenv\n", + "\n", + "# Create necessary files\n", + "touch requirements.txt scraper.py .env\n", + "\n", + "# Add dependencies to requirements.txt\n", + "echo \"firecrawl-py\\npydantic\\npython-dotenv\" > requirements.txt\n", + "\n", + "# Add Firecrawl API key to .env (get your key at firecrawl.dev/signin/signup)\n", + "echo \"FIRECRAWL_API_KEY='your_api_key_here'\" > .env\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Open the scraper script where we define our scraping logic:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# scraper.py\n", + "import json\n", + "from firecrawl import FirecrawlApp\n", + "from dotenv import load_dotenv\n", + "from pydantic import BaseModel, Field\n", + "from typing import List\n", + "from datetime import datetime\n", + "\n", + "load_dotenv()\n", + "\n", + "BASE_URL = \"https://news.ycombinator.com/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we import necessary libraries and packages, also defining a base URL we are going to scrape." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class NewsItem(BaseModel):\n", + " title: str = Field(description=\"The title of the news item\")\n", + " source_url: str = Field(description=\"The URL of the news item\")\n", + " author: str = Field(\n", + " description=\"The URL of the post author's profile concatenated with the base URL.\"\n", + " )\n", + " rank: str = Field(description=\"The rank of the news item\")\n", + " upvotes: str = Field(description=\"The number of upvotes of the news item\")\n", + " date: str = Field(description=\"The date of the news item.\")\n", + "\n", + "\n", + "class NewsData(BaseModel):\n", + " news_items: List[NewsItem]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define two Pydantic models to structure our scraped data:\n", + "\n", + "1. `NewsItem` - Represents a single news item with fields for title, URL, author, rank, upvotes and date\n", + "2. `NewsData` - Contains a list of `NewsItem` objects\n", + "\n", + "These models help validate the scraped data and ensure it matches our expected schema. They also make it easier to serialize/deserialize the data when saving to JSON. Using `Field` with a detailed description is crucial because Firecrawl uses these definitions to automatically detect the HTMl elements and CSS selectors we are looking for.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_news_data():\n", + " app = FirecrawlApp()\n", + "\n", + " data = app.scrape_url(\n", + " BASE_URL,\n", + " params={\n", + " \"formats\": [\"extract\"],\n", + " \"extract\": {\"schema\": NewsData.model_json_schema()},\n", + " },\n", + " )\n", + "\n", + " return data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `get_news_data()` function uses Firecrawl to scrape Hacker News. It creates a `FirecrawlApp` instance and calls `scrape_url()` with the `BASE_URL` and parameters specifying we want to extract data according to our `NewsData` schema. The schema helps Firecrawl automatically identify and extract the relevant HTML elements. The function returns the scraped data containing news items with their titles, URLs, authors, ranks, upvotes and dates." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def save_firecrawl_news_data():\n", + " \"\"\"\n", + " Save the scraped news data to a JSON file with the current date in the filename.\n", + " \"\"\"\n", + " # Get the data\n", + " data = get_news_data()\n", + " # Format current date for filename\n", + " date_str = datetime.now().strftime(\"%Y_%m_%d_%H_%M\")\n", + " filename = f\"firecrawl_hacker_news_data_{date_str}.json\"\n", + "\n", + " # Save the news items to JSON file\n", + " with open(filename, \"w\") as f:\n", + " json.dump(data[\"extract\"][\"news_items\"], f, indent=4)\n", + "\n", + " print(f\"{datetime.now()}: Successfully saved the news data.\")\n", + " \n", + "if __name__ == \"__main__\":\n", + " save_firecrawl_news_data()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `save_firecrawl_news_data()` function handles saving the scraped Hacker News data to a JSON file. It first calls `get_news_data()` to fetch the latest data from Hacker News. Then it generates a filename using the current timestamp to ensure uniqueness. 
The data is saved to a JSON file with that filename, with the news items formatted with proper indentation for readability. Finally, it prints a confirmation message with the current timestamp when the save is complete. This function provides a convenient way to store snapshots of Hacker News data that can be analyzed later.\n", + "\n", + "Combine these snippets into the `scraper.py` script. Then, we can write a workflow that executes it on schedule:\n", + "\n", + "```bash\n", + "cd .. # Change back to the project root directory\n", + "touch .github/workflows/hacker-news-scraper.py # Create the workflow file\n", + "```\n", + "\n", + "Here is what the workflow file must look like:\n", + "\n", + "```yaml\n", + "name: Run Hacker News Scraper\n", + "\n", + "permissions:\n", + " contents: write\n", + "\n", + "on:\n", + " schedule:\n", + " - cron: \"0 */6 * * *\"\n", + " workflow_dispatch:\n", + "\n", + "jobs:\n", + " scrape:\n", + " runs-on: ubuntu-latest\n", + " \n", + " steps:\n", + " - uses: actions/checkout@v3\n", + " \n", + " - name: Set up Python\n", + " uses: actions/setup-python@v4\n", + " with:\n", + " python-version: \"3.10\"\n", + " \n", + " - name: Install dependencies\n", + " run: |\n", + " python -m pip install --upgrade pip\n", + " pip install -r hacker-news-scraper/requirements.txt\n", + " \n", + " - name: Run scraper\n", + " run: python hacker-news-scraper/scraper.py\n", + " \n", + " - name: Commit and push if changes\n", + " run: |\n", + " git config --local user.email \"github-actions[bot]@users.noreply.github.com\"\n", + " git config --local user.name \"github-actions[bot]\"\n", + " git add .\n", + " git commit -m \"Update scraped data\" -a || exit 0\n", + " git push\n", + "```\n", + "\n", + "This workflow runs our Hacker News scraper every 6 hours using GitHub Actions. It sets up Python, installs dependencies, executes the scraper, and automatically commits any new data to the repository. The workflow can also be triggered manually using the `workflow_dispatch` event. One important note about the paths specified in the workflow file is that they must match your repository's directory structure exactly, including the requirements.txt location and the path to your scraper script.\n", + "\n", + "To enable the workflow, simply push all the changes to GitHub and test it through the UI. The next runs will be automatic.\n", + "\n", + "```bash\n", + "git add .\n", + "git commit -m \"Add a scraping workflow\"\n", + "git push origin main\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Package publishing workflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Publishing Python packages to PyPI (Python Package Index) typically involves several steps. First, developers need to prepare their package by creating a proper directory structure, writing setup files, and ensuring all metadata is correct. Then, the package needs to be built into distribution formats - both source distributions (`sdist`) and wheel distributions (`bdist_wheel`). Finally, these distribution files are uploaded to PyPI using tools like `twine`. This process often requires careful version management and proper credentials for the package repository. 
While this can be done manually, automating it with CI/CD pipelines like GitHub Actions ensures consistency and reduces human error in the release process.\n", + "\n", + "For example, the following workflow publishes a new version of a package when you create a new release:\n", + "\n", + "```yaml\n", + "name: Publish Python Package\n", + "on:\n", + " release:\n", + " types: [created]\n", + "\n", + "jobs:\n", + " deploy:\n", + " runs-on: ubuntu-latest\n", + " steps:\n", + " - uses: actions/checkout@v3\n", + " \n", + " - name: Set up Python\n", + " uses: actions/setup-python@v4\n", + " with:\n", + " python-version: '3.10'\n", + " \n", + " - name: Install dependencies\n", + " run: |\n", + " python -m pip install --upgrade pip\n", + " pip install build twine\n", + " \n", + " - name: Build package\n", + " run: python -m build\n", + " \n", + " - name: Publish to PyPI\n", + " env:\n", + " TWINE_USERNAME: __token__\n", + " TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}\n", + " run: |\n", + " python -m twine upload dist/*\n", + "```\n", + "\n", + "The workflow automates publishing Python packages to PyPI when GitHub releases are created. \n", + "\n", + "Required setup steps:\n", + "1. Package must have `setup.py` or `pyproject.toml` configured\n", + "2. Create PyPI account at `pypi.org`\n", + "3. Generate PyPI API token with upload permissions\n", + "4. Store token as `PYPI_API_TOKEN` in repository secrets\n", + "\n", + "The workflow process:\n", + "1. Triggers on new GitHub release\n", + "2. Checks out code and sets up Python\n", + "3. Installs build tools\n", + "4. Creates distribution packages\n", + "5. Uploads to PyPI using stored token\n", + "\n", + "The `__token__` username is used with PyPI's token authentication, while the actual token is accessed securely through GitHub secrets." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Container build and push workflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "GitHub Actions can also automate building and pushing Docker containers to container registries like Docker Hub or GitHub Container Registry (GHCR). This workflow is useful for maintaining containerized applications and ensuring your latest code changes are packaged into updated container images.\n", + "\n", + "The process typically involves:\n", + "\n", + "1. Building a Docker image from your `Dockerfile`\n", + "2. Tagging the image with version/metadata\n", + "3. Authenticating with the container registry\n", + "4. Pushing the tagged image to the registry\n", + "\n", + "This automation ensures your container images stay in sync with code changes and are readily available for deployment. 
Here is a sample workflow containing these steps:\n", + "\n", + "```yaml\n", + "name: Build and Push Container\n", + "on:\n", + " push:\n", + " branches: [main]\n", + " paths:\n", + " - 'Dockerfile'\n", + " - 'src/**'\n", + " workflow_dispatch:\n", + "\n", + "jobs:\n", + " build:\n", + " runs-on: ubuntu-latest\n", + " steps:\n", + " - uses: actions/checkout@v3\n", + " \n", + " - name: Set up Docker Buildx\n", + " uses: docker/setup-buildx-action@v2\n", + " \n", + " - name: Login to Docker Hub\n", + " uses: docker/login-action@v2\n", + " with:\n", + " username: ${{ secrets.DOCKERHUB_USERNAME }}\n", + " password: ${{ secrets.DOCKERHUB_TOKEN }}\n", + " \n", + " - name: Build and push\n", + " uses: docker/build-push-action@v4\n", + " with:\n", + " context: .\n", + " push: true\n", + " tags: |\n", + " user/app:latest\n", + " user/app:${{ github.sha }}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This workflow introduces a few new GitHub Actions concepts and syntax:\n", + "\n", + "The `paths` trigger filter ensures the workflow only runs when changes are made to the Dockerfile or files in the `src` directory, preventing unnecessary builds.\n", + "\n", + "`docker/setup-buildx-action` configures Docker Buildx, which provides enhanced build capabilities including multi-platform builds and build caching.\n", + "\n", + "`docker/login-action` handles registry authentication. Before using this, you must:\n", + "\n", + "1. [Create a Docker Hub account](https://app.docker.com/signup)\n", + "2. Generate an access token in Docker Hub settings\n", + "3. Add `DOCKERHUB_USERNAME` and `DOCKERHUB_TOKEN` as repository secrets in GitHub\n", + "\n", + "`docker/build-push-action` is a specialized action for building and pushing Docker images. The configuration shows:\n", + "- `context: .` (builds from current directory)\n", + "- `push: true` (automatically pushes after building)\n", + "- `tags:` specifies multiple tags including:\n", + " - `latest:` rolling tag for most recent version\n", + " - `github.sha:` unique tag using commit hash for versioning\n", + "\n", + "The workflow assumes you have:\n", + "- A valid Dockerfile in your repository\n", + "- Required application code in `src` directory\n", + "- Docker Hub repository permissions\n", + "- Properly configured repository secrets\n", + "\n", + "When this workflow runs successfully, it produces a containerized version of your application that is automatically published to Docker Hub and can be pulled with either the latest tag or specific commit hash." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Throughout this tutorial, we've explored the fundamentals and practical applications of GitHub Actions for Python development. From understanding core concepts like workflows, jobs, and actions, to implementing real-world examples including automated testing, web scraping, package publishing, and container builds, you've gained hands-on experience with this powerful automation platform. We've also covered critical aspects like managing sensitive data through environment variables and secrets, ensuring your automated workflows are both secure and maintainable.\n", + "\n", + "As you continue your journey with GitHub Actions, remember that automation is an iterative process. Start small with basic workflows, test thoroughly, and gradually add complexity as needed. 
The examples provided here serve as templates that you can adapt and expand for your specific use cases. For further learning, explore the [GitHub Actions documentation](https://docs.github.com/en/actions), join the [GitHub Community Forum](https://github.community/), and experiment with the vast ecosystem of pre-built actions available in the [GitHub Marketplace](https://github.com/marketplace?type=actions). Whether you're building a personal project or managing enterprise applications, GitHub Actions provides the tools you need to streamline your development workflow and focus on what matters most - writing great code.\n", + "\n", + "If you want to learn more about Firecrawl, the web scraping API we used today, you can read the following posts:\n", + "\n", + "- [Guide to Scheduling Web Scrapers in Python](https://www.firecrawl.dev/blog/automated-web-scraping-free-2025)\n", + "- [Mastering Firecrawl's Scrape Endpoint](https://www.firecrawl.dev/blog/mastering-firecrawl-scrape-endpoint)\n", + "- [Getting Started With Predicted Outputs in OpenAI](https://www.firecrawl.dev/blog/getting-started-with-predicted-outputs-openai)\n", + "\n", + "Thank you for reading!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/blog-articles/github-actions-tutorial/notebook.md b/examples/blog-articles/github-actions-tutorial/notebook.md new file mode 100644 index 00000000..332a1c57 --- /dev/null +++ b/examples/blog-articles/github-actions-tutorial/notebook.md @@ -0,0 +1,1187 @@ +--- +title: Comprehensive GitHub Actions Tutorial For Beginners With Examples in Python +description: Learn how to automate software development workflows with GitHub Actions. This beginner-friendly tutorial covers workflow creation, CI/CD pipelines, scheduled tasks, and practical Python examples to help you streamline your development process. +slug: github-actions-tutorial-for-beginners-with-python-examples +date: Dec 9, 2024 +author: bex_tuychiev +image: /images/blog/github-actions-tutorial/github-actions-tutorial-for-beginners-with-python-examples.jpg +categories: [tutorials] +keywords: [github actions, github actions tutorial, github actions environment variables, github actions secrets, github actions workflow, github actions run, github actions jobs] +--- + +## Introduction + +GitHub Actions is a powerful automation platform that helps developers automate repetitive, time-consuming software development workflows. Instead of manually running tests, executing scripts at intervals, or performing any programmable task, you can let GitHub Actions handle those operations when specific events occur in your repository. In this tutorial, you will learn how to use this critical feature of GitHub and design your own workflows for several real-world use cases. + +### What are GitHub Actions? + +At its core, [GitHub Actions](https://docs.github.com/en/actions) is a continuous integration and continuous delivery (CI/CD) platform that lets you automate various tasks directly from your GitHub repository. 
Think of it as your personal automation assistant, which can: + +- Run your Python tests automatically when you push code +- Deploy your application when you create a new release +- Send notifications when issues are created +- Schedule tasks to run at specific times +- And much more... + +### Why automate with GitHub Actions? + +Consider this common scenario: You are building a Python application that scrapes product prices from various e-commerce websites. Without GitHub Actions, you would need to: + +1. Manually run your tests after each code change +2. Remember to execute the scraper at regular intervals +3. Deploy updates to your production environment +4. Keep track of environment variables and secrets + +With GitHub Actions, all of these tasks can be automated through workflows, typically written in YAML files like this: + +```yaml +name: Run Price Scraper + +on: + schedule: + - cron: '0 */12 * * *' # Runs every 12 hours + workflow_dispatch: # Allows manual triggers + +jobs: + scrape: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Run scraper + env: + API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} + run: python scraper.py +``` + +This workflow automatically runs a scraper every 12 hours, handles Python version setup, and securely manages API keysโ€”all without manual intervention. + +### What we'll build in this tutorial + +Throughout this tutorial, we'll build several practical GitHub Actions workflows for Python applications. You will learn how to: + +1. Create basic and advanced workflow configurations +2. Work with environment variables and secrets +3. Set up automated testing pipelines +4. Build a real-world example: an automated scraping system using [Firecrawl](https://firecrawl.dev) in Python +5. Implement best practices for security and efficiency + +By the end, you will have hands-on experience with GitHub Actions and be able to automate your own Python projects effectively. + +> Note: Even though code examples are Python, the concepts and hands-on experience you will gain from the tutorial will apply to any programming language. + +Let's start by understanding the core concepts that make GitHub Actions work. + +## How to Use This GitHub Actions Tutorial + +Before diving into the technical details, here's how to get the most from this GitHub Actions tutorial: + +1. Follow the examples sequentially - each builds on previous concepts +2. Try running the workflows yourself - hands-on practice is crucial +3. Refer back to earlier sections as needed +4. Use the provided code samples as templates for your own projects + +## Understanding GitHub Actions Core Concepts + +To write your own GitHub Actions workflows, you need to understand how its different components work together. Let's break down these core concepts using a practical example: automating tests for a simple Python script. + +### GitHub Actions workflows and their components + +A workflow is an automated process that you define in a YAML file within your repository's `.github/workflows` directory. Think of it as a recipe that tells GitHub exactly what to do, how and when to do it. You can transform virtually any programmable task into a GitHub workflow as long as it can be executed in a Linux, Windows, or macOS environment and doesn't require direct user interaction. 
+ +Here is a basic workflow structure: + +```yaml +# test.yaml +name: Python Tests +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' +``` + +The YAML file starts by specifying the name of the workflow with the `name` field. Immediately after, we specify the events that triggers this workflow. In this example, the workflow automatically executes on each `git push` command and pull request. We will learn more about events and triggers in a later section. + +Next, we define jobs, which are the building blocks of workflows. Each job: + +- Runs on a fresh virtual machine (called a runner) that is specified using the `runs-on` field. +- Can execute multiple steps in sequence +- Can run in parallel with other jobs +- Has access to shared workflow data + +For example, you might have separate jobs for testing and deployment: + +```yaml +jobs: + test: + runs-on: ubuntu-latest + ... + deploy: + runs-on: macos-latest + ... +``` + +Each job can contain one or more `steps` that are executed sequentially. Steps are individual tasks that make up your job. They can: + +- Run commands or shell scripts +- Execute actions (reusable units of code) +- Run commands in Docker containers +- Reference other GitHub repositories + +For example, a typical test job might have steps to: + +1. Check out (clone) code from your GitHub repository +2. Set up dependencies +3. Run tests +4. Upload test results + +Each step can specify: + +- `name`: A display name for the step +- `uses`: Reference to an action to run +- `run`: Any operating-system specific terminal command like `pip install package` or `python script.py` +- `with`: Input parameters for actions +- `env`: Environment variables for the step + +Now that we understand jobs and steps, let's look at Actions - the reusable building blocks that make GitHub Actions so powerful. + +### Actions + +The `test.yaml` file from earlier has a single `test` job that executes two steps: + +1. Checking out the repository code using a built-in `actions/checkout@v3` action. +2. Setting up a Python environment with `actions/setup-python@v4` and `python-version` as an input parameter for said action. + +```bash +# test.yaml +name: Python Tests +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' +``` + +Actions are reusable units of code that can be shared across workflows (this is where GitHub Actions take its name). They are like pre-packaged functions that handle common tasks. For instance, instead of writing code to set up Node.js or caching dependencies, you can use the GitHub official actions like: + +- `actions/setup-node@v3` - Sets up Node.js environment +- `actions/cache@v3` - Caches dependencies and build outputs +- `actions/upload-artifact@v3` - Uploads workflow artifacts +- `actions/download-artifact@v3` - Downloads workflow artifacts +- `actions/labeler@v4` - Automatically labels pull requests +- `actions/stale@v8` - Marks and closes stale issues/PRs +- `actions/dependency-review-action@v3` - Reviews dependency changes + +### Events and triggers + +Events are specific activities that trigger a workflow. 
Common triggers include:
+
+- `push`: When code is pushed to the repository
+- `pull_request`: When a PR is opened or updated
+- `schedule`: At specified times using cron syntax
+- `workflow_dispatch`: Manual trigger via GitHub UI
+
+Here is how you can configure multiple triggers:
+
+```yaml
+name: Comprehensive Workflow
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  schedule:
+    - cron: '0 0 * * *'  # Daily at midnight
+  workflow_dispatch:  # Manual trigger
+
+jobs:
+  process:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Run daily tasks
+        run: python daily_tasks.py
+        env:
+          API_KEY: ${{ secrets.FIRECRAWL_API_KEY }}
+```
+
+This example shows how a single workflow can:
+
+- Run automatically on code changes via `git push` or pull requests
+- Execute daily scheduled tasks with cron
+- Be triggered manually when needed through the GitHub UI
+- Handle sensitive data like API keys securely
+
+### Cron jobs in GitHub Actions
+
+To use the `schedule` trigger effectively in GitHub Actions, you'll need to understand cron syntax. This powerful scheduling format lets you automate workflows to run at precise times. The syntax uses five fields to specify when a job should run:
+
+![Cron syntax diagram showing minute, hour, day of month, month, and day of week fields with examples and explanations for GitHub Actions scheduling](github-actions-tutorial-images/cron-syntax.png)
+
+Here are some common cron schedule examples:
+
+```yaml
+# Daily at 3:30 AM UTC
+- cron: '30 3 * * *'
+
+# Every Monday at 1:00 PM UTC
+- cron: '0 13 * * 1'
+
+# Every 6 hours at the first minute
+- cron: '0 */6 * * *'
+
+# At minute 15 of every hour
+- cron: '15 * * * *'
+
+# Every weekday (Monday through Friday)
+- cron: '0 0 * * 1-5'
+
+# At 12am, 6am, 12pm, and 6pm on Monday, Wednesday, and Friday
+- cron: '0 0,6,12,18 * * 1,3,5'
+```
+
+Here is a sample workflow for a scraping job with three different schedules (multiple schedules are allowed):
+
+```yaml
+name: Price Scraper Schedules
+on:
+  schedule:
+    - cron: '0 */4 * * *'  # Every 4 hours
+    - cron: '30 1 * * *'   # Daily at 1:30 AM UTC
+    - cron: '0 9 * * 1-5'  # Weekdays at 9 AM UTC
+
+jobs:
+  scrape:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Run Firecrawl scraper
+        env:
+          API_KEY: ${{ secrets.FIRECRAWL_API_KEY }}
+        run: python scraper.py
+```
+
+Remember that GitHub Actions runs on UTC time, and schedules might experience slight delays during peak GitHub usage. That's why it's helpful to combine `schedule` with `workflow_dispatch` as we saw earlier - giving you both automated and manual trigger options.
+
+---------------
+
+Understanding these core concepts allows you to create workflows that are efficient (running only when needed), secure (properly handling sensitive data), maintainable (using reusable actions), and scalable (running on different platforms).
+
+In the next section, we will put these concepts into practice by creating your first GitHub Actions workflow.
+
+## Creating Your First GitHub Actions Workflow
+
+Let's create a practical GitHub Actions workflow from scratch. We'll build a workflow that automatically tests a Python script and runs it on a schedule - a universal task applicable to any programming language.
+ +### Setting up the environment + +Let's start by creating a working directory for this mini-project: + +```bash +mkdir first-workflows +cd first-workflows +``` + +Let's create the standard `.github/workflows` folder structure GitHub uses for detecting workflow files: + +```bash +mkdir -p .github/workflows +``` + +The workflow files can have any name but must have a `.yml` extension: + +```bash +touch .github/workflows/system_monitor.yml +``` + +In addition to the workflows folder, create a `tests` folder as well as a test file: + +```bash +mkdir tests +touch tests/test_main.py +``` + +We should also create the `main.py` file along with a `requirements.txt`: + +```bash +touch main.py requirements.txt +``` + +Then, add these two dependencies to `requirements.txt`: + +```text +psutil>=5.9.0 +pytest>=7.0.0 +``` + +Finally, let's initialize git and make our first commit: + +```bash +git init +git add . +git commit -m "Initial commit" +``` + +Check out the [Git documentation](https://git-scm.com/doc) if you don't have it installed already. + +### Writing your first workflow file + +Let's write the workflow logic first. Open `system_monitor.yml` and paste each code snippet we are about to define one after the other. + +- Workflow name and triggers: + +```yaml +name: System Monitoring +on: + schedule: + - cron: '*/30 * * * *' # Run every 30 minutes + workflow_dispatch: # Enables manual trigger +``` + +In this part, we give a descriptive name to the workflow that appears in GitHub's UI. Using the `on` field, we set the workflow to run every 30 minutes and through a manual trigger. + +- Job definition: + +```yaml +jobs: + run_script: + runs-on: ubuntu-latest +``` + +`jobs` contains all the jobs in this workflow and it has a `run_script` name, which is a unique identifier. + +- Steps: + +There are five steps that run sequentially in this workflow. They are given descriptive names that appear in the GitHub UI and uses official GitHub actions and custom terminal commands. + +```yaml +jobs: + monitor: + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run tests + run: pytest tests/ + + - name: Collect system metrics + run: python main.py +``` + +Here is what each step does: + +1. Check out repository code with `actions/checkout@v3`. +2. Configures Python 3.9 environment. +3. Runs two terminal commands that: + - Install/upgrade `pip` + - Install `pytest` package +4. Runs the tests located in the `tests` directory using `pytest`. +5. Executes the main script with `python main.py`. + +Notice the use of `|` (pipe) operator for multi-line commands. + +After you complete writing the workflow, commit the changes to Git: + +```bash +git add . +git commit -m "Add a workflow file for monitoring system resources" +``` + +### Creating the Python script + +Now, let's write the `main.py` file, which is a monitoring script that helps software developers track system resource usage over time, enabling them to identify performance bottlenecks and capacity issues in their development environment. + +```python +import psutil +import json +from datetime import datetime +from pathlib import Path +``` + +This script collects and logs system metrics over time. It uses `psutil` to gather CPU usage, memory usage, disk usage, and active process counts. 
The metrics are timestamped and saved to JSON files organized by date. + +The script has three main functions: + +```python +def get_system_metrics(): + """Collect key system metrics""" + metrics = { + "cpu_percent": psutil.cpu_percent(interval=1), + "memory_percent": psutil.virtual_memory().percent, + "disk_usage": psutil.disk_usage('/').percent, + "timestamp": datetime.now().isoformat() + } + + # Add running processes count + metrics["active_processes"] = len(psutil.pids()) + + return metrics +``` + +`get_system_metrics()` - Collects current system metrics including CPU percentage, memory usage percentage, disk usage percentage, timestamp, and count of active processes. + +```python +def save_metrics(metrics): + """Save metrics to a JSON file with today's date""" + date_str = datetime.now().strftime("%Y-%m-%d") + reports_dir = Path("system_metrics") + reports_dir.mkdir(exist_ok=True) + + # Save to daily file + file_path = reports_dir / f"metrics_{date_str}.json" + + # Load existing metrics if file exists + if file_path.exists(): + with open(file_path) as f: + daily_metrics = json.load(f) + else: + daily_metrics = [] + + # Append new metrics + daily_metrics.append(metrics) + + # Save updated metrics + with open(file_path, 'w') as f: + json.dump(daily_metrics, f, indent=2) +``` + +`save_metrics()` - Handles saving the metrics to JSON files. It creates a `system_metrics` directory if needed, and saves metrics to date-specific files (e.g. `metrics_2024-12-12.json`). If a file for the current date exists, it loads and appends to it, otherwise creates a new file. + +```python +def main(): + try: + metrics = get_system_metrics() + save_metrics(metrics) + print(f"System metrics collected at {metrics['timestamp']}") + print(f"CPU: {metrics['cpu_percent']}% | Memory: {metrics['memory_percent']}%") + return True + except Exception as e: + print(f"Error collecting metrics: {str(e)}") + return False + +if __name__ == "__main__": + main() +``` + +`main()` - Orchestrates the metric collection and saving process. It calls `get_system_metrics()`, saves the data via `save_metrics()`, prints current CPU and memory usage to console, and handles any errors that occur during execution. + +The script can be run directly or imported as a module. When run directly (which is what happens in a GitHub Actions workflow), it executes the `main()` function which collects and saves one set of metrics. + +Combine the code snippets above into the `main.py` file and commit the changes: + +```bash +git add . +git commit -m "Add the main.py functionality" +``` + +### Adding tests + +Testing is a critical part of software engineering workflows for several reasons: + +1. Reliability: Tests help ensure code behaves correctly and consistently across changes. +2. Regression prevention: Tests catch when new changes break existing functionality. +3. Documentation: Tests serve as executable documentation of expected behavior +4. Design feedback: Writing tests helps identify design issues early +5. 
Confidence: A good test suite gives confidence when refactoring or adding features + +For our system metrics collection script, tests would be valuable to verify: + +- The `get_system_metrics()` function returns data in the expected format with valid ranges +- The `save_metrics()` function properly handles file operations and JSON serialization +- Error handling works correctly for various failure scenarios +- The `main()` function orchestrates the workflow as intended + +With that said, let's work on the `tests/test_main.py` file: + +```python +import json +from datetime import datetime +from pathlib import Path +from main import get_system_metrics, save_metrics +``` + +The test file we are about to write demonstrates key principles of testing with `pytest`, a popular Python testing framework. Pytest makes it easy to write tests by using simple `assert` statements and providing a rich set of features for test organization and execution. The test functions are automatically discovered by `pytest` when their names start with `test_`, and each function tests a specific aspect of the system's functionality. + +```python +def test_get_system_metrics(): + """Test if system metrics are collected correctly""" + metrics = get_system_metrics() + + # Check if all required metrics exist and are valid + assert 0 <= metrics['cpu_percent'] <= 100 + assert 0 <= metrics['memory_percent'] <= 100 + assert metrics['active_processes'] > 0 + assert 'timestamp' in metrics +``` + +In this example, we have two test functions that verify different components of our metrics collection system. The first test, `test_get_system_metrics()`, checks if the metrics collection function returns data in the expected format and with valid ranges. It uses multiple assert statements to verify that CPU and memory percentages are between 0 and 100, that there are active processes, and that a timestamp is included. This demonstrates the practice of testing both the structure of returned data and the validity of its values. + +```python +def test_save_and_read_metrics(): + """Test if metrics are saved and can be read back""" + # Get and save metrics + metrics = get_system_metrics() + save_metrics(metrics) + + # Check if file exists and contains data + date_str = datetime.now().strftime("%Y-%m-%d") + file_path = Path("system_metrics") / f"metrics_{date_str}.json" + + assert file_path.exists() + with open(file_path) as f: + saved_data = json.load(f) + + assert isinstance(saved_data, list) + assert len(saved_data) > 0 +``` + +The second test, `test_save_and_read_metrics()`, showcases integration testing by verifying that metrics can be both saved to and read from a file. It follows a common testing pattern: arrange (setup the test conditions), act (perform the operations being tested), and assert (verify the results). The test ensures that the file is created in the expected location and that the saved data maintains the correct structure. This type of test is particularly valuable as it verifies that different components of the system work together correctly. + +Combine the above code snippets and commit the changes to GitHub: + +```bash +git add . +git commit -m "Write tests for main.py" +``` + +### Running your first GitHub Actions workflow + +Now that we've created our system monitoring workflow, let's set it up on GitHub and run it. 
First, push everything we have to a new GitHub repository:
+
+```bash
+git remote add origin https://github.com/your-username/your-repository.git
+git branch -M main
+git push -u origin main
+```
+
+Once the workflow file is pushed, GitHub automatically detects it and displays it in the "Actions" tab of your repository. The workflow is scheduled, so you don't need to do anything - the first workflow run will happen within 30 minutes (remember how we set the running interval to `*/30` with cron). Since the workflow also includes a `workflow_dispatch` field, you can trigger it manually by clicking on the "Run workflow" button.
+
+After clicking the button, a new run appears within a few seconds (refresh if you don't see it). To see the workflow run in real-time, click on it and expand the `monitor` job. You'll see each step executing:
+
+- Checking out repository
+- Setting up Python
+- Installing dependencies
+- Running tests
+- Collecting metrics
+
+### Committing changes made by GitHub Actions workflows
+
+Right now, our workflow file has a problem - while it successfully collects metrics, it doesn't commit and push the changes back to the repository. This means that although metrics are being gathered, they aren't being saved in version control. Let's modify the workflow to automatically commit and push the collected metrics:
+
+```yaml
+name: System Monitor
+
+on:
+  schedule:
+    - cron: '*/30 * * * *'
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+jobs:
+  monitor:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run tests
+        run: python -m pytest
+
+      - name: Collect metrics
+        run: python main.py
+
+      - name: Commit and push changes
+        run: |
+          git config --global user.name 'github-actions[bot]'
+          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
+          git add system_metrics/
+          git commit -m "Update metrics" || exit 0
+          git push
+```
+
+The key changes in this updated workflow are:
+
+1. Added a `permissions` block with `contents: write` to allow the workflow to push changes back to the repository.
+2. Added a new "Commit and push changes" step that:
+   - Configures the git user identity as the `github-actions` bot
+   - Stages the JSON files generated in the `system_metrics/` directory (the folder `main.py` writes to)
+   - Creates a commit with the message "Update metrics"
+   - Pushes the changes back to the repository
+
+The `|| exit 0` after `git commit` ensures the workflow doesn't fail if there are no changes to commit.
+
+This allows the workflow to automatically save and version control the metrics it collects. Let's commit the changes to the workflow and push:
+
+```bash
+git add .
+git commit -m "Add a commit step to the workflow file"
+git push origin main
+```
+
+After this change, try running the workflow manually and verify its success by navigating to the Actions tab in your GitHub repository. You should see the workflow run and the `system_metrics/` directory updated with a new metrics file.
+
+## Managing Sensitive Data and Environment Variables
+
+When building automated workflows with GitHub Actions, proper handling of sensitive data like API keys, passwords, and access tokens is crucial for security. Let's explore best practices for managing these credentials.
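+
+The next subsection walks through adding repository secrets in the GitHub web UI, but the same secrets can also be managed from the terminal with the GitHub CLI. The following is only a minimal sketch - it assumes `gh` is installed and authenticated for your repository, and the secret names and values are illustrative:
+
+```bash
+# Create or update repository secrets without leaving the terminal
+gh secret set FIRECRAWL_API_KEY --body "your_api_key_here"
+gh secret set DATABASE_URL --body "postgresql://user:pass@host:5432/db"
+
+# Confirm which secrets exist (values are never displayed)
+gh secret list
+```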
+ +### Understanding environment variables in GitHub Actions + +Environment variables in GitHub Actions can be set at different levels: + +- Repository level (GitHub Secrets) +- Workflow level +- Job level +- Step level + +Here is how to properly configure and use them: + +#### 1. Setting up the repository secrets + +First, store sensitive values as repository secrets: + +1. Navigate to your GitHub repository +2. Go to Settings โ†’ Secrets and variables โ†’ Actions +3. Click "New repository secret" +4. Add your secrets with descriptive names like: + +- `API_KEY` +- `DATABASE_URL` +- `AWS_ACCESS_KEY` + +#### 2. Using secrets in workflows + +Reference secrets in your workflow file using the `secrets` context: + +```yaml +name: Web Scraping Pipeline +# ... the rest of the file + +jobs: + scrape: + runs-on: ubuntu-latest + + steps: + # ... the rest of the steps + + - name: Run scraper + env: + API_KEY: ${{ secrets.API_KEY }} + DATABASE_URL: ${{ secrets.DATABASE_URL }} + run: python scraper.py +``` + +Above, the "Run scraper" step executes `scraper.py` which relies on two environment variables configured through the `secrets` context. + +#### 3. Local development with .env files + +For local development, use `.env` files to manage environment variables: + +```bash +touch .env +echo "API_KEY='your-api-key-here'" >> .env +echo "DATABASE_URL='postgresql://user:pass@localhost:5432/db'" >> .env +``` + +Create a `.gitignore` file to prevent committing sensitive data: + +```bash +echo ".env" >> .gitignore +``` + +#### 4. Loading environment variables in Python + +Use `python-dotenv` to load variables from `.env` files: + +```python +from dotenv import load_dotenv +import os + +# Load environment variables from .env file +load_dotenv() + +# Access variables +api_key = os.getenv('API_KEY') +database_url = os.getenv('DATABASE_URL') + +if not api_key or not database_url: + raise ValueError("Missing required environment variables") +``` + +This code demonstrates loading environment variables from a `.env` file using `python-dotenv`. The `load_dotenv()` function reads the variables, which can then be accessed via `os.getenv()`. Basic validation ensures required variables exist. + +#### 5. Environment variable validation + +Create a configuration class to validate environment variables: + +```python +from pydantic import BaseSettings, SecretStr + + +class Settings(BaseSettings): + api_key: SecretStr + database_url: str + debug_mode: bool = False + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + + +# Initialize settings +settings = Settings() +``` + +This approach using Pydantic provides several advantages for environment variable management: + +1. Type validation - Pydantic automatically validates types and converts values +2. Default values - The `debug_mode` demonstrates setting defaults +3. Secret handling - `SecretStr` provides secure handling of sensitive values +4. Centralized config - All environment variables are defined in one place +5. IDE support - Get autocomplete and type hints when using the settings object + +The `Settings` class inherits from `BaseSettings` which automatically loads from environment variables. The `Config` class specifies to also load from a `.env` file. + +Using `settings = Settings()` creates a validated configuration object that can be imported and used throughout the application. This is more robust than accessing `os.environ` directly. 
+ +Example usage: + +```python +settings.api_key.get_secret_value() # Securely access API key +settings.database_url # Type-checked database URL +settings.debug_mode # Boolean with default value +``` + +#### 6. Handle different environments + +Handle different environments (development, staging, production) using environment-specific files: + +```bash +.env # Default environment variables +.env.development # Development-specific variables +.env.staging # Staging-specific variables +.env.production # Production-specific variables +``` + +Load the appropriate file based on the environment: + +```bash +from dotenv import load_dotenv +import os + +env = os.getenv('ENVIRONMENT', 'development') +env_file = f'.env.{env}' + +load_dotenv(env_file) +``` + +This approach allows you to maintain separate configurations for different environments while keeping sensitive information secure. The environment-specific files can contain different values for the same variables, such as: + +- Development environment may use local services and dummy credentials +- Staging environment may use test services with restricted access +- Production environment contains real credentials and production service endpoints + +You can also combine this with the `Pydantic` settings approach shown earlier for robust configuration management across environments. + +For example, staging might use a test database while production uses the live database: + +```bash +# .env.staging: +DATABASE_URL=postgresql://test-db.example.com +API_KEY=test-key +``` + +```bash +.env.production: +DATABASE_URL=postgresql://prod-db.example.com +API_KEY=live-key +``` + +This separation helps prevent accidental use of production resources during development and testing. + +## Building Real-World Python Workflows + +Let's explore three practical examples of GitHub Actions workflows for common Python tasks: web scraping, package publishing, and container builds. + +### 1. Scheduled web scraping with Firecrawl + +Web scraping is a common use case for automated workflows. Let's build a workflow that scrapes [Hacker News](https://news.ycombinator.com/) on a schedule using [Firecrawl](https://docs.firecrawl.dev), which is a Python AI-based web scraping engine designed for large-scale data collection. Here are some key benefits that make Firecrawl an excellent choice for this task: + +1. **Enterprise-grade automation and scalability** - Firecrawl streamlines web scraping with powerful automation features. +2. **AI-powered content extraction** - Maintains scraper reliability over time by identifying and extracting data based on semantic descriptions instead of relying HTML elements and CSS selectors. +3. **Handles complex scraping challenges** - Automatically manages proxies, anti-bot mechanisms, and dynamic JavaScript content. +4. **Multiple output formats** - Supports scraping and converting data in markdown, tabular, screenshots, and HTML, making it versatile for various applications. +5. **Built-in rate limiting and request management** - Ensures efficient and compliant data extraction. +6. **Geographic location customization** - Avoids IP bans by customizing the geographic location of requests. + +Let's build our web scraping workflow using Firecrawl to demonstrate these capabilities. 
+ +```bash +# Create project directory and install dependencies +mkdir hacker-news-scraper && cd hacker-news-scraper +pip install firecrawl-py pydantic python-dotenv + +# Create necessary files +touch requirements.txt scraper.py .env + +# Add dependencies to requirements.txt +echo "firecrawl-py\npydantic\npython-dotenv" > requirements.txt + +# Add Firecrawl API key to .env (get your key at firecrawl.dev/signin/signup) +echo "FIRECRAWL_API_KEY='your_api_key_here'" > .env +``` + +Open the scraper script where we define our scraping logic: + +```python +# scraper.py +import json +from firecrawl import FirecrawlApp +from dotenv import load_dotenv +from pydantic import BaseModel, Field +from typing import List +from datetime import datetime + +load_dotenv() + +BASE_URL = "https://news.ycombinator.com/" +``` + +First, we import necessary libraries and packages, also defining a base URL we are going to scrape. + +```python +class NewsItem(BaseModel): + title: str = Field(description="The title of the news item") + source_url: str = Field(description="The URL of the news item") + author: str = Field( + description="The URL of the post author's profile concatenated with the base URL." + ) + rank: str = Field(description="The rank of the news item") + upvotes: str = Field(description="The number of upvotes of the news item") + date: str = Field(description="The date of the news item.") + + +class NewsData(BaseModel): + news_items: List[NewsItem] +``` + +We define two Pydantic models to structure our scraped data: + +1. `NewsItem` - Represents a single news item with fields for title, URL, author, rank, upvotes and date +2. `NewsData` - Contains a list of `NewsItem` objects + +These models help validate the scraped data and ensure it matches our expected schema. They also make it easier to serialize/deserialize the data when saving to JSON. Using `Field` with a detailed description is crucial because Firecrawl uses these definitions to automatically detect the HTMl elements and CSS selectors we are looking for. + +```python +def get_news_data(): + app = FirecrawlApp() + + data = app.scrape_url( + BASE_URL, + params={ + "formats": ["extract"], + "extract": {"schema": NewsData.model_json_schema()}, + }, + ) + + return data +``` + +The `get_news_data()` function uses Firecrawl to scrape Hacker News. It creates a `FirecrawlApp` instance and calls `scrape_url()` with the `BASE_URL` and parameters specifying we want to extract data according to our `NewsData` schema. The schema helps Firecrawl automatically identify and extract the relevant HTML elements. The function returns the scraped data containing news items with their titles, URLs, authors, ranks, upvotes and dates. + +```python +def save_firecrawl_news_data(): + """ + Save the scraped news data to a JSON file with the current date in the filename. + """ + # Get the data + data = get_news_data() + # Format current date for filename + date_str = datetime.now().strftime("%Y_%m_%d_%H_%M") + filename = f"firecrawl_hacker_news_data_{date_str}.json" + + # Save the news items to JSON file + with open(filename, "w") as f: + json.dump(data["extract"]["news_items"], f, indent=4) + + print(f"{datetime.now()}: Successfully saved the news data.") + +if __name__ == "__main__": + save_firecrawl_news_data() + +``` + +The `save_firecrawl_news_data()` function handles saving the scraped Hacker News data to a JSON file. It first calls `get_news_data()` to fetch the latest data from Hacker News. 
Then it generates a filename using the current timestamp to ensure uniqueness. The data is saved to a JSON file with that filename, with the news items formatted with proper indentation for readability. Finally, it prints a confirmation message with the current timestamp when the save is complete. This function provides a convenient way to store snapshots of Hacker News data that can be analyzed later.
+
+Combine these snippets into the `scraper.py` script. Then, we can write a workflow that executes it on a schedule:
+
+```bash
+cd ..  # Change back to the project root directory
+mkdir -p .github/workflows  # Make sure the workflows folder exists
+touch .github/workflows/hacker-news-scraper.yml  # Create the workflow file
+```
+
+Here is what the workflow file must look like:
+
+```yaml
+name: Run Hacker News Scraper
+
+permissions:
+  contents: write
+
+on:
+  schedule:
+    - cron: "0 */6 * * *"
+  workflow_dispatch:
+
+jobs:
+  scrape:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r hacker-news-scraper/requirements.txt
+
+      - name: Run scraper
+        run: python hacker-news-scraper/scraper.py
+
+      - name: Commit and push if changes
+        run: |
+          git config --local user.email "github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git add .
+          git commit -m "Update scraped data" -a || exit 0
+          git push
+```
+
+This workflow runs our Hacker News scraper every 6 hours using GitHub Actions. It sets up Python, installs dependencies, executes the scraper, and automatically commits any new data to the repository. The workflow can also be triggered manually using the `workflow_dispatch` event. One important note: the paths specified in the workflow file must match your repository's directory structure exactly, including the location of `requirements.txt` and the path to your scraper script.
+
+To enable the workflow, simply push all the changes to GitHub and test it through the UI. The next runs will be automatic.
+
+```bash
+git add .
+git commit -m "Add a scraping workflow"
+git push origin main
+```
+
+### 2. Package publishing workflow
+
+Publishing Python packages to PyPI (Python Package Index) typically involves several steps. First, developers need to prepare their package by creating a proper directory structure, writing setup files, and ensuring all metadata is correct. Then, the package needs to be built into distribution formats - both source distributions (`sdist`) and wheel distributions (`bdist_wheel`). Finally, these distribution files are uploaded to PyPI using tools like `twine`. This process often requires careful version management and proper credentials for the package repository. While this can be done manually, automating it with CI/CD pipelines like GitHub Actions ensures consistency and reduces human error in the release process.
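+
+For a sense of what the workflow below automates, a manual release from a local machine looks roughly like this. This is only a sketch - it assumes your package metadata already lives in `setup.py` or `pyproject.toml` and that you have a PyPI API token at hand:
+
+```bash
+# Build and upload a release by hand (the same steps the workflow automates)
+python -m pip install --upgrade build twine  # install the build and upload tools
+python -m build                              # writes the sdist and wheel into dist/
+python -m twine check dist/*                 # optional: validate the package metadata
+python -m twine upload dist/*                # prompts for the __token__ username and API token
+```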
+ +For example, the following workflow publishes a new version of a package when you create a new release: + +```yaml +name: Publish Python Package +on: + release: + types: [created] + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build twine + + - name: Build package + run: python -m build + + - name: Publish to PyPI + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} + run: | + python -m twine upload dist/* +``` + +The workflow automates publishing Python packages to PyPI when GitHub releases are created. + +Required setup steps: + +1. Package must have `setup.py` or `pyproject.toml` configured +2. Create PyPI account at `pypi.org` +3. Generate PyPI API token with upload permissions +4. Store token as `PYPI_API_TOKEN` in repository secrets + +The workflow process: + +1. Triggers on new GitHub release +2. Checks out code and sets up Python +3. Installs build tools +4. Creates distribution packages +5. Uploads to PyPI using stored token + +The `__token__` username is used with PyPI's token authentication, while the actual token is accessed securely through GitHub secrets. + +### 3. Container build and push workflow + +GitHub Actions can also automate building and pushing Docker containers to container registries like Docker Hub or GitHub Container Registry (GHCR). This workflow is useful for maintaining containerized applications and ensuring your latest code changes are packaged into updated container images. + +The process typically involves: + +1. Building a Docker image from your `Dockerfile` +2. Tagging the image with version/metadata +3. Authenticating with the container registry +4. Pushing the tagged image to the registry + +This automation ensures your container images stay in sync with code changes and are readily available for deployment. Here is a sample workflow containing these steps: + +```yaml +name: Build and Push Container +on: + push: + branches: [main] + paths: + - 'Dockerfile' + - 'src/**' + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v4 + with: + context: . + push: true + tags: | + user/app:latest + user/app:${{ github.sha }} +``` + +This workflow introduces a few new GitHub Actions concepts and syntax: + +The `paths` trigger filter ensures the workflow only runs when changes are made to the Dockerfile or files in the `src` directory, preventing unnecessary builds. + +`docker/setup-buildx-action` configures Docker Buildx, which provides enhanced build capabilities including multi-platform builds and build caching. + +`docker/login-action` handles registry authentication. Before using this, you must: + +1. [Create a Docker Hub account](https://app.docker.com/signup) +2. Generate an access token in Docker Hub settings +3. Add `DOCKERHUB_USERNAME` and `DOCKERHUB_TOKEN` as repository secrets in GitHub + +`docker/build-push-action` is a specialized action for building and pushing Docker images. 
The configuration shows: + +- `context: .` (builds from current directory) +- `push: true` (automatically pushes after building) +- `tags:` specifies multiple tags including: + - `latest:` rolling tag for most recent version + - `github.sha:` unique tag using commit hash for versioning + +The workflow assumes you have: + +- A valid Dockerfile in your repository +- Required application code in `src` directory +- Docker Hub repository permissions +- Properly configured repository secrets + +When this workflow runs successfully, it produces a containerized version of your application that is automatically published to Docker Hub and can be pulled with either the latest tag or specific commit hash. + +## Understanding How GitHub Actions Run + +When you trigger a GitHub Actions run, whether manually or through automated events, the platform: + +1. Provisions a fresh virtual machine (runner) +2. Executes your workflow steps sequentially +3. Reports results back to GitHub +4. Tears down the runner environment + +This isolated execution model ensures consistency and security for each GitHub Actions run. + +## Glossary of GitHub Actions Terms + +- **GitHub Actions**: GitHub's built-in automation platform for software development workflows +- **GitHub Actions Workflow**: A configurable automated process made up of one or more jobs +- **GitHub Actions Jobs**: Individual units of work that can run sequentially or in parallel +- **GitHub Actions Run**: A single execution instance of a workflow +- **GitHub Actions Environment Variables**: Configuration values available during workflow execution +- **GitHub Actions Secrets**: Encrypted environment variables for sensitive data +- **GitHub Actions Tutorial**: A guide teaching the fundamentals of GitHub Actions (like this one!) + +## Conclusion + +Throughout this tutorial, we've explored the fundamentals and practical applications of GitHub Actions for Python development. From understanding core concepts like workflows, jobs, and actions, to implementing real-world examples including automated testing, web scraping, package publishing, and container builds, you've gained hands-on experience with this powerful automation platform. We've also covered critical aspects like managing sensitive data through environment variables and secrets, ensuring your automated workflows are both secure and maintainable. + +As you continue your journey with GitHub Actions, remember that automation is an iterative process. Start small with basic workflows, test thoroughly, and gradually add complexity as needed. The examples provided here serve as templates that you can adapt and expand for your specific use cases. For further learning, explore the [GitHub Actions documentation](https://docs.github.com/en/actions), join the [GitHub Community Forum](https://github.community/), and experiment with the vast ecosystem of pre-built actions available in the [GitHub Marketplace](https://github.com/marketplace?type=actions). Whether you're building a personal project or managing enterprise applications, GitHub Actions provides the tools you need to streamline your development workflow and focus on what matters most - writing great code. 
+ +If you want to learn more about Firecrawl, the web scraping API we used today, you can read the following posts: + +- [Guide to Scheduling Web Scrapers in Python](https://www.firecrawl.dev/blog/automated-web-scraping-free-2025) +- [Mastering Firecrawl's Scrape Endpoint](https://www.firecrawl.dev/blog/mastering-firecrawl-scrape-endpoint) +- [Getting Started With Predicted Outputs in OpenAI](https://www.firecrawl.dev/blog/getting-started-with-predicted-outputs-openai) + +Thank you for reading!