From 7d8519218ae2ed674fd7aa6995fe94221ad0de73 Mon Sep 17 00:00:00 2001
From: Rishi Raj Jain
Date: Sat, 19 Oct 2024 02:27:39 +0530
Subject: [PATCH] Update app.py

---
 examples/sales_web_crawler/app.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/examples/sales_web_crawler/app.py b/examples/sales_web_crawler/app.py
index ae14fc62..f76280e9 100644
--- a/examples/sales_web_crawler/app.py
+++ b/examples/sales_web_crawler/app.py
@@ -1,11 +1,13 @@
-import os
 import csv
 import json
+import os
+import uuid
 
 from dotenv import load_dotenv
 from firecrawl import FirecrawlApp
 from openai import OpenAI
 from serpapi import GoogleSearch
+from tqdm import tqdm
 
 load_dotenv()
 
@@ -15,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 def search_google(query, objective):
     """Search Google using SerpAPI."""
-    print(f"Parameters: query={query}, objective={objective}")
+    # print(f"Parameters: query={query}, objective={objective}")
     search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
     results = search.get_dict().get("organic_results", [])
     return {"objective": objective, "results": results}
 
 def scrape_url(url, objective):
     """Scrape a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     scrape_status = app.scrape_url(
         url,
         params={'formats': ['markdown']}
@@ -31,19 +33,19 @@ def scrape_url(url, objective):
 
 def crawl_url(url, objective):
     """Crawl a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     # If using a crawled url set, pass the ID in the function call below
     # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
     # scrape_status['results'] = scrape_status['data']
     scrape_status = app.crawl_url(
         url,
-        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
+        params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}}
     )
     return {"objective": objective, "results": scrape_status}
 
 def analyze_website_content(content, objective):
     """Analyze the scraped website content using OpenAI."""
-    print(f"Parameters: content={content[:50]}..., objective={objective}")
+    # print(f"Parameters: content={content[:50]}..., objective={objective}")
     analysis = generate_completion(
         "website data extractor",
         f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
@@ -53,7 +55,7 @@ def analyze_website_content(content, objective):
 
 def generate_completion(role, task, content):
     """Generate a completion using OpenAI."""
-    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=[
@@ -86,13 +88,18 @@ def process_websites(file_path):
             if search_results['results']:
                 top_result = search_results['results'][0]
                 url = top_result['link']
+                unique_filename = f'output_{uuid.uuid4()}.json'
                 crawl_results = crawl_url(url, "Crawl website")
                 if crawl_results['results']:
-                    for each_result in crawl_results['results']['data'][:2]:
-                        analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.")
-                        print(analysis_results['results'])
-                        results.append(json.loads(analysis_results['results']))
-                    write_results_to_json(results, 'enriched_data.json')
+                    for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
+                        analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.")
+                        try:
+                            result = json.loads(analysis_results['results'])
+                            if result:
+                                results.append(result)
+                                write_results_to_json(results, unique_filename)
+                        except json.JSONDecodeError:
+                            continue
 
 if __name__ == "__main__":
     # Process websites from the CSV file
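
Note on the error handling introduced above: the model is prompted to return bare JSON, but it can still reply with prose or an empty object, so json.loads() is wrapped in a try/except and empty results are dropped instead of crashing the crawl. A minimal standalone sketch of that pattern (parse_model_json and the sample responses are hypothetical, not part of this repo):

import json

def parse_model_json(raw):
    """Parse a model reply that should be bare JSON; return None if it is not."""
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # The model replied with prose instead of JSON; skip this page.
        return None
    return parsed or None  # treat an empty object/list as "nothing found"

responses = ['{"name": "Ada", "email": "ada@example.com"}', "No contacts found.", "{}"]
contacts = [r for r in (parse_model_json(x) for x in responses) if r]
print(contacts)  # [{'name': 'Ada', 'email': 'ada@example.com'}]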
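Note on the per-run output file: unique_filename is generated once per company with uuid.uuid4() and rewritten inside the crawl loop, so concurrent runs don't clobber each other and partial results survive if a later page fails. write_results_to_json is not shown in this hunk; the sketch below assumes it simply rewrites the accumulated list as JSON (that assumption is marked in the comments):

import json
import uuid

# Assumed behavior of write_results_to_json: dump the whole list to disk.
def write_results_to_json(results, file_path):
    with open(file_path, 'w') as f:
        json.dump(results, f, indent=4)

output_file = f'output_{uuid.uuid4()}.json'  # one file per run/company
results = []
for record in [{'name': 'Ada'}, {'name': 'Grace'}]:
    results.append(record)
    # Rewriting after every record keeps the file valid JSON at all times,
    # so a crash mid-crawl still leaves usable partial output.
    write_results_to_json(results, output_file)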