diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py
new file mode 100644
index 00000000..ae14fc62
--- /dev/null
+++ b/examples/sales_web_crawler/app.py
@@ -0,0 +1,100 @@
+import os
+import csv
+import json
+
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+from openai import OpenAI
+from serpapi import GoogleSearch
+
+load_dotenv()
+
+# Initialize the Firecrawl and OpenAI clients
+app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+def search_google(query, objective):
+    """Search Google using SerpAPI."""
+    print(f"Parameters: query={query}, objective={objective}")
+    search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
+    results = search.get_dict().get("organic_results", [])
+    return {"objective": objective, "results": results}
+
+def scrape_url(url, objective):
+    """Scrape a website using Firecrawl."""
+    print(f"Parameters: url={url}, objective={objective}")
+    scrape_status = app.scrape_url(
+        url,
+        params={'formats': ['markdown']}
+    )
+    return {"objective": objective, "results": scrape_status}
+
+def crawl_url(url, objective):
+    """Crawl a website using Firecrawl."""
+    print(f"Parameters: url={url}, objective={objective}")
+    # To reuse an existing crawl, pass its job ID to check_crawl_status instead:
+    # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
+    # scrape_status['results'] = scrape_status['data']
+    scrape_status = app.crawl_url(
+        url,
+        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
+    )
+    return {"objective": objective, "results": scrape_status}
+
+def analyze_website_content(content, objective):
+    """Analyze the scraped website content using OpenAI."""
+    print(f"Parameters: content={content[:50]}..., objective={objective}")
+    analysis = generate_completion(
+        "website data extractor",
+        "Analyze the following website content and extract a JSON object based on the objective. Do not wrap the response in ```json fences.",
+        "Objective: " + objective + "\nContent: " + content
+    )
+    return {"objective": objective, "results": analysis}
+
+def generate_completion(role, task, content):
+    """Generate a completion using OpenAI."""
+    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    response = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[
+            {"role": "system", "content": f"You are a {role}. {task}"},
+            {"role": "user", "content": content}
+        ]
+    )
+    return response.choices[0].message.content
+
+def read_websites_from_csv(file_path):
+    """Read websites from a CSV file."""
+    websites = []
+    with open(file_path, mode='r') as file:
+        csv_reader = csv.DictReader(file)
+        for row in csv_reader:
+            websites.append(row['website'])
+    return websites
+
+def write_results_to_json(results, file_path):
+    """Write results to a JSON file."""
+    with open(file_path, mode='w') as file:
+        json.dump(results, file, indent=4)
+
+def process_websites(file_path):
+    """Process websites from a CSV file and write the enriched results to a JSON file."""
+    results = []
+    websites = read_websites_from_csv(file_path)
+    for website in websites:
+        search_results = search_google(website, "Search website")
+        if search_results['results']:
+            top_result = search_results['results'][0]
+            url = top_result['link']
+            crawl_results = crawl_url(url, "Crawl website")
+            if crawl_results['results']:
+                # Analyze only the first two crawled pages to limit token usage
                for each_result in crawl_results['results']['data'][:2]:
+                    analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.")
+                    print(analysis_results['results'])
+                    results.append(json.loads(analysis_results['results']))
+    write_results_to_json(results, 'enriched_data.json')
+
+if __name__ == "__main__":
+    # Process websites from the CSV file
+    process_websites('websites.csv')