Update app.py

Rishi Raj Jain 2024-10-19 02:27:39 +05:30 committed by GitHub
parent 2022db7f0a
commit 7d8519218a

@@ -1,11 +1,13 @@
-import os
 import csv
 import json
+import os
+import uuid
 from dotenv import load_dotenv
 from firecrawl import FirecrawlApp
 from openai import OpenAI
 from serpapi import GoogleSearch
+from tqdm import tqdm
 
 load_dotenv()
@@ -15,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 def search_google(query, objective):
     """Search Google using SerpAPI."""
-    print(f"Parameters: query={query}, objective={objective}")
+    # print(f"Parameters: query={query}, objective={objective}")
     search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
     results = search.get_dict().get("organic_results", [])
     return {"objective": objective, "results": results}
 
 def scrape_url(url, objective):
     """Scrape a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     scrape_status = app.scrape_url(
         url,
         params={'formats': ['markdown']}
@@ -31,19 +33,19 @@ def scrape_url(url, objective):
 def crawl_url(url, objective):
     """Crawl a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     # If using a crawled url set, pass the ID in the function call below
     # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
     # scrape_status['results'] = scrape_status['data']
     scrape_status = app.crawl_url(
         url,
-        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
+        params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}}
     )
     return {"objective": objective, "results": scrape_status}
 
 def analyze_website_content(content, objective):
     """Analyze the scraped website content using OpenAI."""
-    print(f"Parameters: content={content[:50]}..., objective={objective}")
+    # print(f"Parameters: content={content[:50]}..., objective={objective}")
     analysis = generate_completion(
         "website data extractor",
         f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
@@ -53,7 +55,7 @@ def analyze_website_content(content, objective):
 def generate_completion(role, task, content):
     """Generate a completion using OpenAI."""
-    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=[
@@ -86,13 +88,18 @@ def process_websites(file_path):
         if search_results['results']:
             top_result = search_results['results'][0]
             url = top_result['link']
+            unique_filename = f'output_{uuid.uuid4()}.json'
             crawl_results = crawl_url(url, "Crawl website")
             if crawl_results['results']:
-                for each_result in crawl_results['results']['data'][:2]:
-                    analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.")
-                    print(analysis_results['results'])
-                    results.append(json.loads(analysis_results['results']))
-    write_results_to_json(results, 'enriched_data.json')
+                for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
+                    analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.")
+                    try:
+                        result = json.loads(analysis_results['results'])
+                        if result:
+                            results.append(result)
+                            write_results_to_json(results, unique_filename)
+                    except:
+                        continue
 
 if __name__ == "__main__":
     # Process websites from the CSV file
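
For reference, the net effect of the process_websites change above is a parse-then-persist loop: any model response that fails json.loads is skipped instead of aborting the run, and the accumulated results are rewritten to a uuid-named file after each successful parse. A minimal standalone sketch of that pattern, assuming nothing beyond the standard library (the helper name safe_append_result is hypothetical, and it narrows the commit's bare except: to json.JSONDecodeError):

import json
import uuid

def safe_append_result(raw_response, results, output_path):
    # Parse the model's response; skip anything that is not valid JSON
    # rather than letting one bad response abort the whole crawl.
    try:
        result = json.loads(raw_response)
    except json.JSONDecodeError:
        return False
    if not result:
        return False
    results.append(result)
    # Rewrite the full results list after every successful parse so a
    # crash mid-run still leaves a usable file on disk.
    with open(output_path, 'w') as f:
        json.dump(results, f, indent=2)
    return True

if __name__ == "__main__":
    output_path = f'output_{uuid.uuid4()}.json'
    results = []
    for raw in ['{"name": "Ada"}', 'not json', 'null']:
        safe_append_result(raw, results, output_path)
    print(f'kept {len(results)} of 3 responses -> {output_path}')

Catching json.JSONDecodeError specifically keeps genuine I/O errors visible, which the committed bare except: would silently swallow.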