mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-04-19 04:29:46 +08:00
Add groq_web_crawler example and dependencies
This commit is contained in:
parent
bf1a79588e
commit
75ac980fe4
248
examples/groq_web_crawler/groq_website_analyzer.py
Normal file
248
examples/groq_web_crawler/groq_website_analyzer.py
Normal file
@ -0,0 +1,248 @@
|
||||
import os
|
||||
from firecrawl import FirecrawlApp
|
||||
from groq import Groq
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# ANSI color codes for pretty terminal output
|
||||
class Colors:
    """Namespace of ANSI escape sequences for colorizing terminal text.

    Each attribute is a plain string meant to be placed before the text it
    should color; ``RESET`` is emitted afterwards to restore the default.
    """

    # Foreground colors.
    CYAN = "\033[96m"
    YELLOW = "\033[93m"
    GREEN = "\033[92m"
    RED = "\033[91m"
    MAGENTA = "\033[95m"
    BLUE = "\033[94m"

    # Clears every attribute set by the sequences above.
    RESET = "\033[0m"
|
||||
|
||||
# Let python-dotenv populate the process environment before it is read below.
load_dotenv()

# Credentials come from the environment; each value is None when unset.
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")

# Module-level clients shared by every function in this script.
app = FirecrawlApp(api_key=firecrawl_api_key)
groq_client = Groq(api_key=groq_api_key)
|
||||
|
||||
def scrape_website(url):
    """
    Fetch one page through the module-level Firecrawl client as markdown.

    Progress and failures are reported on stdout; exceptions are not
    propagated to the caller.

    Args:
        url (str): Address of the page to scrape.

    Returns:
        Whatever ``app.scrape_url`` returned, or None if an exception was
        raised while scraping.
    """
    request_params = {'formats': ['markdown']}
    try:
        print(f"{Colors.YELLOW}Scraping website: {url}{Colors.RESET}")
        page_data = app.scrape_url(url, params=request_params)
        print(f"{Colors.GREEN}Website scraped successfully.{Colors.RESET}")
        return page_data
    except Exception as exc:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"{Colors.RED}Error scraping website: {exc!s}{Colors.RESET}")
        return None
|
||||
|
||||
def summarize_content(content, model="deepseek-r1-distill-llama-70b"):
    """
    Request a short summary of website text from a Groq chat model.

    Progress and failures are reported on stdout; exceptions are not
    propagated to the caller.

    Args:
        content (str): Website text to condense.
        model (str): Identifier of the Groq model to query.

    Returns:
        The message content of the first completion choice, or None if any
        step raised an exception.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that specializes in creating concise website summaries.",
    }
    try:
        print(f"{Colors.YELLOW}Generating summary using Groq's {model} model...{Colors.RESET}")

        # The scraped text is embedded verbatim after the instructions.
        user_message = {
            "role": "user",
            "content": f"""
        Please provide a concise summary of the following website content.
        The summary should:
        - Be around 3-5 paragraphs
        - Highlight the main purpose of the website
        - Include key features or offerings
        - Mention any unique selling points

        Content:
        {content}
        """,
        }

        response = groq_client.chat.completions.create(
            model=model,
            messages=[system_message, user_message],
            temperature=0.5,
            max_tokens=1000,
        )

        text = response.choices[0].message.content
        print(f"{Colors.GREEN}Summary generated successfully.{Colors.RESET}")
        return text
    except Exception as exc:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"{Colors.RED}Error generating summary: {exc!s}{Colors.RESET}")
        return None
|
||||
|
||||
def _extract_json_object(text):
    """Return the first JSON object embedded in *text* as a dict, or None if there is none."""
    import json
    import re

    # NOTE(review): reasoning-style models may wrap their chain of thought in
    # <think>...</think>; drop such a block so braces inside it cannot be
    # mistaken for the answer. Harmless when no such block is present.
    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

    decoder = json.JSONDecoder()
    start = cleaned.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # trailing text, so prose after the object does not break parsing.
            candidate, _ = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next opening brace.
            start = cleaned.find('{', start + 1)
        else:
            return candidate
    return None


def analyze_website_sentiment(content, model="deepseek-r1-distill-llama-70b"):
    """
    Analyze the sentiment and tone of the website content using Groq's API.

    The model is asked to answer with a JSON object. The first valid JSON
    object found in the reply is returned, so surrounding prose, code fences,
    or stray braces elsewhere in the reply do not break parsing.

    Args:
        content (str): The content to analyze
        model (str): The model to use for analysis

    Returns:
        dict: The parsed analysis on success. If the reply contains no valid
        JSON object, a dict with an "error" message and the unparsed
        "raw_response". None if the request itself raised an exception.
    """
    try:
        print(f"{Colors.YELLOW}Analyzing website sentiment using Groq's {model} model...{Colors.RESET}")

        prompt = f"""
        Please analyze the sentiment and tone of the following website content.
        Return your analysis as a JSON object with the following fields:
        - sentiment: the overall sentiment (positive, neutral, negative)
        - tone_descriptors: an array of 3-5 adjectives describing the tone
        - formality_level: an estimate of how formal the language is (1-10 scale)
        - target_audience: your estimate of who the content is aimed at

        Content:
        {content}
        """

        completion = groq_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that specializes in content and sentiment analysis."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2,
            max_tokens=800
        )

        analysis_text = completion.choices[0].message.content
        print(f"{Colors.GREEN}Sentiment analysis completed.{Colors.RESET}")

        # Treat a missing reply as empty text so the failure is reported as a
        # parse error instead of surfacing as a TypeError.
        analysis = _extract_json_object(analysis_text or "")
        if analysis is None:
            print(f"{Colors.RED}Error parsing JSON response: no valid JSON object found{Colors.RESET}")
            # Always hand back the raw text so the caller can still show it.
            return {"error": "Could not parse JSON from response", "raw_response": analysis_text}
        return analysis
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"{Colors.RED}Error analyzing sentiment: {str(e)}{Colors.RESET}")
        return None
|
||||
|
||||
def extract_key_topics(content, model="deepseek-r1-distill-llama-70b"):
    """
    Ask a Groq chat model to list the main topics of website text.

    Progress and failures are reported on stdout; exceptions are not
    propagated to the caller.

    Args:
        content (str): Website text to examine.
        model (str): Identifier of the Groq model to query.

    Returns:
        The message content of the first completion choice (the model is
        asked for a numbered "name: description" list, returned as text, not
        parsed), or None if any step raised an exception.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that specializes in extracting key topics from content.",
    }
    try:
        print(f"{Colors.YELLOW}Extracting key topics using Groq's {model} model...{Colors.RESET}")

        # The scraped text is embedded verbatim after the instructions.
        user_message = {
            "role": "user",
            "content": f"""
        Extract the 5-8 most important topics or concepts from the following website content.
        For each topic, provide:
        1. A short name (1-3 words)
        2. A brief description (10-15 words)

        Return your response as a simple list in the following format:
        1. [Topic name]: [Brief description]
        2. [Topic name]: [Brief description]

        Content:
        {content}
        """,
        }

        response = groq_client.chat.completions.create(
            model=model,
            messages=[system_message, user_message],
            temperature=0.3,
            max_tokens=800,
        )

        listing = response.choices[0].message.content
        print(f"{Colors.GREEN}Key topics extracted successfully.{Colors.RESET}")
        return listing
    except Exception as exc:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"{Colors.RED}Error extracting key topics: {exc!s}{Colors.RESET}")
        return None
|
||||
|
||||
def _normalize_url(raw_url):
    """Trim whitespace from raw_url and prepend https:// when it has no http(s) scheme."""
    url = raw_url.strip()
    # Match the full scheme prefix: a bare 'http' test would wrongly accept
    # hosts such as "httpbin.org" as already carrying a scheme.
    if url and not url.lower().startswith(('http://', 'https://')):
        url = 'https://' + url
    return url


def main():
    """
    Main function to run the website analysis.

    Prompts for a URL, scrapes it, shows a menu of analyses, runs the chosen
    one(s) on the scraped markdown, and prints each non-empty result. Exits
    early with a message when no URL is entered, when scraping yields no
    markdown, or when the menu choice is not one of 1-4.
    """
    # Get user input; whitespace is trimmed and a missing scheme defaults to https://
    url = _normalize_url(input(f"{Colors.BLUE}Enter the website URL to analyze: {Colors.RESET}"))

    if not url:
        print(f"{Colors.RED}No URL entered. Exiting.{Colors.RESET}")
        return

    # Scrape the website
    scrape_result = scrape_website(url)

    if not scrape_result or 'markdown' not in scrape_result:
        print(f"{Colors.RED}Failed to scrape website. Exiting.{Colors.RESET}")
        return

    content = scrape_result['markdown']

    # Ask user which analysis to perform
    print(f"\n{Colors.BLUE}Select an analysis option:{Colors.RESET}")
    print("1. Generate a concise summary of the website")
    print("2. Analyze the sentiment and tone of the website")
    print("3. Extract key topics from the website")
    print("4. Perform all analyses")

    option = input(f"{Colors.BLUE}Enter your choice (1-4): {Colors.RESET}").strip()

    # Reject anything outside the menu instead of silently doing nothing and
    # then reporting success.
    if option not in {'1', '2', '3', '4'}:
        print(f"{Colors.RED}Invalid choice. Please enter a number from 1 to 4.{Colors.RESET}")
        return

    # (menu key, heading, analysis function); option '4' runs every entry in order.
    analyses = (
        ('1', "Website Summary:", summarize_content),
        ('2', "Sentiment Analysis:", analyze_website_sentiment),
        ('3', "Key Topics:", extract_key_topics),
    )

    # Perform the selected analysis
    for key, heading, analyze in analyses:
        if option not in (key, '4'):
            continue
        result = analyze(content)
        # The analysis functions return None on failure; skip empty results.
        if result:
            print(f"\n{Colors.CYAN}{heading}{Colors.RESET}")
            print(f"{Colors.MAGENTA}{result}{Colors.RESET}")
            print("\n")

    print(f"{Colors.GREEN}Analysis complete!{Colors.RESET}")


if __name__ == "__main__":
    main()
|
3
examples/groq_web_crawler/requirements.txt
Normal file
3
examples/groq_web_crawler/requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
firecrawl-py
|
||||
groq
|
||||
python-dotenv
|
Loading…
x
Reference in New Issue
Block a user