Update app.py

This commit is contained in:
parent 2022db7f0a
commit 7d8519218a
@@ -1,11 +1,13 @@
-import os
+import csv
 import json
+import os
+import uuid
 
 from dotenv import load_dotenv
 from firecrawl import FirecrawlApp
 from openai import OpenAI
 from serpapi import GoogleSearch
+from tqdm import tqdm
 
 load_dotenv()
 
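The new imports line up with the changes further down: csv for reading the input file, uuid for a collision-free output filename per run, and tqdm for a progress bar over the crawl results. A minimal sketch of the two new pieces in isolation (the CSV filename and row layout are assumptions; the diff never shows how the file is parsed):

    import csv
    import uuid

    # One output file per run, e.g. output_9b1c....json, so repeated runs
    # never overwrite each other's results.
    unique_filename = f'output_{uuid.uuid4()}.json'

    # Hypothetical input layout: one company or website per row, first column.
    with open('websites.csv', newline='') as f:
        rows = [row[0] for row in csv.reader(f) if row]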
@@ -15,14 +17,14 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
 def search_google(query, objective):
     """Search Google using SerpAPI."""
-    print(f"Parameters: query={query}, objective={objective}")
+    # print(f"Parameters: query={query}, objective={objective}")
     search = GoogleSearch({"q": query, "api_key": os.getenv("SERP_API_KEY")})
     results = search.get_dict().get("organic_results", [])
     return {"objective": objective, "results": results}
 
 def scrape_url(url, objective):
     """Scrape a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     scrape_status = app.scrape_url(
         url,
         params={'formats': ['markdown']}
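search_google stays a thin wrapper around SerpAPI: it returns the raw organic_results list, and the caller (see the last hunk) takes the first hit's link field. A sketch of that handoff, under the same SERP_API_KEY environment variable the file already uses (the query string is illustrative):

    import os
    from serpapi import GoogleSearch

    search = GoogleSearch({"q": "firecrawl", "api_key": os.getenv("SERP_API_KEY")})
    results = search.get_dict().get("organic_results", [])
    if results:
        url = results[0]["link"]  # the top result's URL, exactly what process_websites picks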
@@ -31,19 +33,19 @@ def scrape_url(url, objective):
 
 def crawl_url(url, objective):
     """Crawl a website using Firecrawl."""
-    print(f"Parameters: url={url}, objective={objective}")
+    # print(f"Parameters: url={url}, objective={objective}")
     # If using a crawled url set, pass the ID in the function call below
     # scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
     # scrape_status['results'] = scrape_status['data']
     scrape_status = app.crawl_url(
         url,
-        params={'limit': 10, 'scrapeOptions': {'formats': ['markdown']}}
+        params={'limit': 5, 'scrapeOptions': {'formats': ['markdown']}}
     )
     return {"objective": objective, "results": scrape_status}
 
 def analyze_website_content(content, objective):
     """Analyze the scraped website content using OpenAI."""
-    print(f"Parameters: content={content[:50]}..., objective={objective}")
+    # print(f"Parameters: content={content[:50]}..., objective={objective}")
     analysis = generate_completion(
         "website data extractor",
         f"Analyze the following website content and extract a JSON object based on the objective. Do not write the ```json and ``` to denote a JSON when returning a response",
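Two substantive changes in this hunk: the crawl budget drops from 10 pages to 5, and the comments record a cheaper path for reruns, fetching an already-finished crawl by its ID instead of crawling again. Uncommented, that path looks like this (the ID is the placeholder from the diff; both lines are lifted from the comment, not a new API):

    # Reuse a completed crawl instead of starting a new one.
    scrape_status = app.check_crawl_status("c99c9598-5a21-46d3-bced-3444a8b1942d")
    scrape_status['results'] = scrape_status['data']  # normalize to the shape crawl_url returns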
@@ -53,7 +55,7 @@ def analyze_website_content(content, objective):
 
 def generate_completion(role, task, content):
     """Generate a completion using OpenAI."""
-    print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
+    # print(f"Parameters: role={role}, task={task[:50]}..., content={content[:50]}...")
     response = client.chat.completions.create(
         model="gpt-4o",
         messages=[
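The hunk cuts off at the messages list, but the call follows the standard OpenAI chat-completions shape: a system message framing the role and task, a user message carrying the content. A sketch of the likely remainder (the message wording is an assumption; only the model name and the create call appear in the diff):

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"You are a {role}. {task}"},  # assumed framing
            {"role": "user", "content": content},
        ],
    )
    return response.choices[0].message.content  # plain text the caller can json.loads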
@@ -86,13 +88,18 @@ def process_websites(file_path):
     if search_results['results']:
         top_result = search_results['results'][0]
         url = top_result['link']
+        unique_filename = f'output_{uuid.uuid4()}.json'
         crawl_results = crawl_url(url, "Crawl website")
         if crawl_results['results']:
-            for each_result in crawl_results['results']['data'][:2]:
-                analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people found.")
-                print(analysis_results['results'])
-                results.append(json.loads(analysis_results['results']))
-                write_results_to_json(results, 'enriched_data.json')
+            for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
+                analysis_results = analyze_website_content(each_result['markdown'], "Extract emails, names, and titles of the people and companies found.")
+                try:
+                    result = json.loads(analysis_results['results'])
+                    if result:
+                        results.append(result)
+                        write_results_to_json(results, unique_filename)
+                except:
+                    continue
 
 if __name__ == "__main__":
     # Process websites from the CSV file
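The reworked loop is the point of the commit: every crawled page is analyzed rather than only the first two, tqdm shows progress, results are checkpointed to the per-run file after each page, and a malformed model response now skips that page instead of crashing the run. The guard pattern in isolation (names follow the diff; the committed code uses a bare except, though catching json.JSONDecodeError as below is tighter):

    for each_result in tqdm(crawl_results['results']['data'], desc="Analyzing crawl results"):
        analysis_results = analyze_website_content(
            each_result['markdown'],
            "Extract emails, names, and titles of the people and companies found.",
        )
        try:
            result = json.loads(analysis_results['results'])
        except json.JSONDecodeError:
            continue  # model returned non-JSON for this page; skip it
        if result:
            results.append(result)
            write_results_to_json(results, unique_filename)  # checkpoint after every page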