feature/gemini-2.5-company-extractor

Aparup Ganguly 2025-03-26 17:51:29 +05:30
parent 42236ef0f0
commit be43598071
5 changed files with 344 additions and 0 deletions

.env.example
@@ -0,0 +1,8 @@
# Google Gemini API Key
GOOGLE_API_KEY=your_google_api_key_here

# Firecrawl API Key
FIRECRAWL_API_KEY=your_firecrawl_api_key_here

# SerpAPI Key
SERP_API_KEY=your_serp_api_key_here

.gitignore
@@ -0,0 +1,34 @@
# Environment variables
.env

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# IDE
.idea/
.vscode/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

README.md
@@ -0,0 +1,85 @@
# Gemini 2.5 Web Extractor

A powerful web information extraction tool that combines Google's Gemini 2.5 Pro (Experimental) model with Firecrawl's web extraction capabilities to gather structured information about companies from the web.

## Features

- Uses Google Search (via SerpAPI) to find relevant web pages
- Leverages Gemini 2.5 Pro (Experimental) to intelligently select the most relevant URLs
- Extracts structured information using Firecrawl's advanced web extraction
- Real-time progress monitoring and colorized console output
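Taken together, these features form a three-stage pipeline. As a rough sketch, using the function names defined in `gemini-2.5-web-extractor.py` in this commit (the company and objective values are purely illustrative):

```python
# Hypothetical walk-through of the pipeline in gemini-2.5-web-extractor.py
results = search_google("Tesla")  # 1. SerpAPI returns organic search results
urls = select_urls_with_gemini("Tesla", "latest EV models", results)  # 2. Gemini picks the best URLs
data = extract_company_info(urls, "latest EV models", "Tesla", firecrawl_api_key)  # 3. Firecrawl extracts structured data
```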
## Prerequisites

- Python 3.8 or higher
- Google API Key (Gemini)
- Firecrawl API Key
- SerpAPI Key

## Setup

1. Clone the repository:

   ```bash
   git clone <repository-url>
   cd gemini-2.5-web-extractor
   ```

2. Install dependencies:

   ```bash
   pip install -r requirements.txt
   ```

3. Set up environment variables:
   - Copy `.env.example` to `.env`
   - Fill in your API keys in the `.env` file:
     - `GOOGLE_API_KEY`: Your Google API key for Gemini
     - `FIRECRAWL_API_KEY`: Your Firecrawl API key
     - `SERP_API_KEY`: Your SerpAPI key
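The script loads these keys at startup with `python-dotenv`, so no further configuration is needed. In short:

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from .env into the environment
serp_api_key = os.getenv("SERP_API_KEY")  # returns None if the key is missing
```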
## Usage

Run the script:

```bash
python gemini-2.5-web-extractor.py
```

The script will:

1. Prompt you for a company name
2. Ask what information you want to extract about the company
3. Search for relevant web pages
4. Use Gemini to select the most relevant URLs
5. Extract structured information using Firecrawl
6. Display the results in a formatted JSON output

## Example

```bash
Enter the company name: Tesla
Enter what information you want about the company: latest electric vehicle models and their specifications
```

The script will then:

1. Search for relevant Tesla information
2. Select the most informative URLs about Tesla's current EV lineup
3. Extract and structure the vehicle specifications
4. Present the data in a clean, organized format
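The exact fields depend on what Firecrawl finds on the selected pages; a purely hypothetical result for the prompt above might be shaped like:

```json
{
  "models": [
    { "name": "Model 3", "range": "...", "starting_price": "..." },
    { "name": "Model Y", "range": "...", "starting_price": "..." }
  ]
}
```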
## Error Handling

The script includes comprehensive error handling for:

- API failures
- Network issues
- Invalid responses
- Timeout scenarios

All errors are clearly displayed with colored output for better visibility.
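For example, every network call in the script is wrapped in a guard like this one (taken from `poll_firecrawl_result`):

```python
try:
    response = requests.get(url, headers=headers, timeout=30)  # timeout scenarios
    response.raise_for_status()                                # network / HTTP failures
    data = response.json()                                     # invalid (non-JSON) responses
except requests.exceptions.RequestException as e:
    print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}")
except json.JSONDecodeError as e:
    print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}")
```
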
## License
[Add your license information here]

gemini-2.5-web-extractor.py
@@ -0,0 +1,213 @@
import os
import json
import time

import requests
from dotenv import load_dotenv
from serpapi.google_search import GoogleSearch  # from the google-search-results package
from google import genai  # from the google-genai package


# ANSI color codes for colorized console output
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'


# Load environment variables from .env
load_dotenv()

# Initialize clients
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
serp_api_key = os.getenv("SERP_API_KEY")

if not firecrawl_api_key:
    print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}")


def search_google(query):
    """Search Google via SerpAPI and return the organic results."""
    print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
    search = GoogleSearch({"q": query, "api_key": serp_api_key})
    return search.get_dict().get("organic_results", [])


def select_urls_with_gemini(company, objective, serp_results):
    """
    Use Gemini 2.5 Pro (Experimental) to select URLs from SERP results.
    Returns a list of URLs.
    """
    try:
        # Keep only the fields the model needs from each search result
        serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
                     for r in serp_results if r.get("link")]

        prompt = (
            "Task: Select relevant URLs from search results.\n\n"
            "Instructions:\n"
            "1. Analyze the search results for information about the specified company\n"
            "2. Select URLs that are most likely to contain the requested information\n"
            "3. Return ONLY a JSON object with the following structure: {\"selected_urls\": [\"url1\", \"url2\"]}\n"
            "4. Do not include social media links\n\n"
            f"Company: {company}\n"
            f"Information Needed: {objective}\n"
            f"Search Results: {json.dumps(serp_data, indent=2)}\n\n"
            "Response Format: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}"
        )

        response = client.models.generate_content(
            model="gemini-2.5-pro-exp-03-25",
            contents=prompt
        )

        # Strip a Markdown code fence (```json ... ```) if the model added one
        cleaned_response = response.text.strip()
        if cleaned_response.startswith('```'):
            cleaned_response = cleaned_response.split('```')[1]
            if cleaned_response.startswith('json'):
                cleaned_response = cleaned_response[4:]
        cleaned_response = cleaned_response.strip()

        try:
            # Parse the JSON response
            result = json.loads(cleaned_response)
            if isinstance(result, dict) and "selected_urls" in result:
                urls = result["selected_urls"]
            else:
                # Fallback: pull URLs line by line out of the raw text
                urls = [line.strip() for line in cleaned_response.split('\n')
                        if line.strip().startswith(('http://', 'https://'))]
        except json.JSONDecodeError:
            # Fallback: pull URLs line by line out of the raw text
            urls = [line.strip() for line in cleaned_response.split('\n')
                    if line.strip().startswith(('http://', 'https://'))]

        # Clean up URLs: drop wildcard suffixes and trailing slashes
        cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls]
        cleaned_urls = [url for url in cleaned_urls if url]

        if not cleaned_urls:
            print(f"{Colors.YELLOW}No valid URLs found in response.{Colors.RESET}")
            return []

        print(f"{Colors.CYAN}Selected URLs for extraction:{Colors.RESET}")
        for url in cleaned_urls:
            print(f"- {url}")

        return cleaned_urls
    except Exception as e:
        print(f"{Colors.RED}Error selecting URLs: {str(e)}{Colors.RESET}")
        return []


def extract_company_info(urls, prompt, company, api_key):
    """Use requests to call Firecrawl's extract endpoint with the selected URLs."""
    print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }

    payload = {
        "urls": urls,
        "prompt": prompt + " for " + company,
        "enableWebSearch": True
    }

    try:
        # Kick off an extraction job
        response = requests.post(
            "https://api.firecrawl.dev/v1/extract",
            headers=headers,
            json=payload,
            timeout=30
        )
        data = response.json()

        if not data.get('success'):
            print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}")
            return None

        extraction_id = data.get('id')
        if not extraction_id:
            print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}")
            return None

        # The extract endpoint is asynchronous: poll until the job finishes
        return poll_firecrawl_result(extraction_id, api_key)
    except requests.exceptions.RequestException as e:
        print(f"{Colors.RED}Request failed: {e}{Colors.RESET}")
        return None
    except json.JSONDecodeError as e:
        print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}")
        return None
    except Exception as e:
        print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
        return None


def poll_firecrawl_result(extraction_id, api_key, interval=10, max_attempts=60):
    """Poll the Firecrawl API until the extraction result is ready."""
    url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}"
    headers = {
        'Authorization': f'Bearer {api_key}'
    }

    print(f"{Colors.YELLOW}Waiting for extraction to complete...{Colors.RESET}")

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()

            if data.get('success') and data.get('data'):
                print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")
                print(json.dumps(data['data'], indent=2))
                return data['data']
            elif data.get('success') and not data.get('data'):
                # Job accepted but still running; log progress every 6th attempt (about once a minute)
                if attempt % 6 == 0:
                    print(f"{Colors.YELLOW}Still processing... (attempt {attempt}/{max_attempts}){Colors.RESET}")
                time.sleep(interval)
            else:
                print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}")
            return None
        except json.JSONDecodeError as e:
            print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}")
            return None
        except Exception as e:
            print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}")
            return None

    print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}")
    return None


def main():
    company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")

    serp_results = search_google(company)
    if not serp_results:
        print(f"{Colors.RED}No search results found.{Colors.RESET}")
        return

    selected_urls = select_urls_with_gemini(company, objective, serp_results)
    if not selected_urls:
        print(f"{Colors.RED}No URLs were selected.{Colors.RESET}")
        return

    data = extract_company_info(selected_urls, objective, company, firecrawl_api_key)
    if data:
        print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}")
    else:
        print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")


if __name__ == "__main__":
    main()

requirements.txt
@@ -0,0 +1,4 @@
python-dotenv==1.0.0
google-genai==1.0.0
requests==2.31.0
google-search-results==2.4.2