From 7d8519218ae2ed674fd7aa6995fe94221ad0de73 Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain
Date: Sat, 19 Oct 2024 02:27:39 +0530
Subject: [PATCH] Update app.py

---
 examples/sales_web_crawler/app.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py
index ae14fc62..f76280e9 100644
--- a/examples/sales_web_crawler/app.py
+++ b/examples/sales_web_crawler/app.py
@@ -1,11 +1,13 @@
-import os
 import csv
 import json
+import os
+import uuid
 
 from dotenv import load_dotenv
 from firecrawl import FirecrawlApp
 from openai import OpenAI
 from serpapi import GoogleSearch
+from tqdm import tqdm
 
 load_dotenv()
 
@@ -15,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 def search_google(query, objective):
     """Search Google using SerpAPI."""
-    print(f"Parameters: query={query}, objective={objective}")
+    # print(f"Parameters: query={query}, objective={objective}")
     search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
     results = search.get_dict().get("organic_results", [])
     return {"objective": objective, "results": results}
 
 def scrape_url(url, objective):
     """Scrape a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     scrape_status = app.scrape_url(
         url,
         params={'formats': ['markdown']}
@@ -31,19 +33,19 @@ def scrape_url(url, objective):
 
 def crawl_url(url, objective):
     """Crawl a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     # If using a crawled url set, pass the ID in the function call below
     # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
     # scrape_status['results'] = scrape_status['data']
     scrape_status = app.crawl_url(
         url,
-        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
+        params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}}
     )
     return {"objective": objective, "results": scrape_status}
 
 def analyze_website_content(content, objective):
     """Analyze the scraped website content using OpenAI."""
-    print(f"Parameters: content={content[:50]}..., objective={objective}")
+    # print(f"Parameters: content={content[:50]}..., objective={objective}")
     analysis = generate_completion(
         "website data extractor",
         f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
@@ -53,7 +55,7 @@ def analyze_website_content(content, objective):
 
 def generate_completion(role, task, content):
     """Generate a completion using OpenAI."""
-    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=[
@@ -86,13 +88,18 @@ def process_websites(file_path):
             if search_results['results']:
                 top_result = search_results['results'][0]
                 url = top_result['link']
+                unique_filename = f'output_{uuid.uuid4()}.json'
                 crawl_results = crawl_url(url, "Crawl website")
                 if crawl_results['results']:
-                    for each_result in crawl_results['results']['data'][:2]:
-                        analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.")
-                        print(analysis_results['results'])
-                        results.append(json.loads(analysis_results['results']))
-                    write_results_to_json(results, 'enriched_data.json')
+                    for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
+                        analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.")
+                        try:
+                            result = json.loads(analysis_results['results'])
+                            if result:
+                                results.append(result)
+                                write_results_to_json(results, unique_filename)
+                        except json.JSONDecodeError:
+                            continue
 
 if __name__ == "__main__":
     # Process websites from the CSV file
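
Note on the error handling introduced above: the model is prompted to return bare JSON, but it can still reply with prose or an empty object, so json.loads() is wrapped in a try/except and empty results are dropped instead of crashing the crawl. A minimal standalone sketch of that pattern (parse_model_json and the sample responses are hypothetical, not part of this repo):

import json

def parse_model_json(raw):
    """Parse a model reply that should be bare JSON; return None if it is not."""
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # The model replied with prose instead of JSON; skip this page.
        return None
    return parsed or None  # treat an empty object/list as "nothing found"

responses = ['{"name": "Ada", "email": "ada@example.com"}', "No contacts found.", "{}"]
contacts = [r for r in (parse_model_json(x) for x in responses) if r]
print(contacts)  # [{'name': 'Ada', 'email': 'ada@example.com'}]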
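Note on the per-run output file: unique_filename is generated once per company with uuid.uuid4() and rewritten inside the crawl loop, so concurrent runs don't clobber each other and partial results survive if a later page fails. write_results_to_json is not shown in this hunk; the sketch below assumes it simply rewrites the accumulated list as JSON (that assumption is marked in the comments):

import json
import uuid

# Assumed behavior of write_results_to_json: dump the whole list to disk.
def write_results_to_json(results, file_path):
    with open(file_path, 'w') as f:
        json.dump(results, f, indent=4)

output_file = f'output_{uuid.uuid4()}.json'  # one file per run/company
results = []
for record in [{'name': 'Ada'}, {'name': 'Grace'}]:
    results.append(record)
    # Rewriting after every record keeps the file valid JSON at all times,
    # so a crash mid-crawl still leaves usable partial output.
    write_results_to_json(results, output_file)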