mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-04 17:20:36 +08:00
39 lines
1.1 KiB
Python
39 lines
1.1 KiB
Python
from firecrawl import FirecrawlApp
|
|
from pydantic import BaseModel, Field
|
|
from datetime import datetime
|
|
from dotenv import load_dotenv
|
|
|
|
# Load variables from a local .env file into the process environment. This must
# run before the client below is constructed.
load_dotenv()

# Shared Firecrawl client used by scrape_product().
# NOTE(review): no API key is passed here, so the client presumably reads it
# from the environment (e.g. FIRECRAWL_API_KEY) - confirm against the SDK.
app = FirecrawlApp()
|
|
|
|
|
|
# Extraction schema for a single product page. scrape_product() converts this
# model with Product.model_json_schema() and sends the result as the "extract"
# schema of the scrape request, so the field names, types and descriptions
# below define what the extraction is asked to return.
# NOTE(review): pydantic also emits the class docstring as the schema's
# top-level "description", so rewording it changes the request payload -
# confirm before editing any of these strings.
class Product(BaseModel):
    """Schema for creating a new product"""

    # Address of the product page.
    url: str = Field(description="The URL of the product")
    # Display name / title of the product.
    name: str = Field(description="The product name/title")
    # Numeric price only; the unit is carried separately by `currency`.
    price: float = Field(description="The current price of the product")
    currency: str = Field(description="Currency code (USD, EUR, etc)")
    main_image_url: str = Field(description="The URL of the main image of the product")
|
|
|
|
|
|
def scrape_product(url: str):
    """Scrape a product page and return its extracted fields plus a timestamp.

    The module-level Firecrawl client is asked for the "extract" format only,
    with the JSON schema generated from ``Product`` describing the wanted
    fields.

    Args:
        url: Address of the product page to scrape.

    Returns:
        The ``"extract"`` entry of the scrape response, with an added
        ``"timestamp"`` key holding the scrape time as a naive ``datetime``
        expressed in UTC.

    Raises:
        KeyError: Presumably, when the response is a dict without an
            ``"extract"`` entry (depends on the SDK's response type).
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    extracted_data = app.scrape_url(
        url,
        params={
            "formats": ["extract"],
            "extract": {"schema": Product.model_json_schema()},
        },
    )

    extract = extracted_data["extract"]

    # Add the scraping date to the extracted data.
    # datetime.utcnow() is deprecated since Python 3.12; this is the documented
    # replacement. tzinfo is stripped so callers keep receiving the same
    # naive-UTC value as before.
    extract["timestamp"] = datetime.now(timezone.utc).replace(tzinfo=None)

    return extract
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: scrape one sample product page and show the result.
    product = "https://www.amazon.com/gp/product/B002U21ZZK/"
    scraped = scrape_product(product)
    print(scraped)
|