Merge pull request #1380 from aparupganguly/feature/gemini-2.5-crawler

Add examples/gemini-2.5-pro crawler
Eric Ciarla 2025-03-27 09:45:40 -04:00 committed by GitHub
commit 56d23cc6ac
4 changed files with 474 additions and 0 deletions

.env.example

@@ -0,0 +1,6 @@
# Firecrawl API key from your Firecrawl account
FIRECRAWL_API_KEY=your_firecrawl_api_key_here
# Google Cloud API key with Gemini API access
# Get this from Google Cloud Console: https://console.cloud.google.com/
GEMINI_API_KEY=your_gemini_api_key_here

README.md

@@ -0,0 +1,89 @@
# Gemini 2.5 Web Crawler
A powerful web crawler that uses Google's Gemini 2.5 Pro model to intelligently analyze web content, PDFs, and images based on user-defined objectives.
## Features
- Intelligent URL mapping and ranking based on relevance to search objective
- PDF content extraction and analysis
- Image content analysis and description
- Smart content filtering based on user objectives
- Support for multiple content types (markdown, PDFs, images)
- Color-coded console output for better readability
## Prerequisites
- Python 3.8+
- Google Cloud API key with Gemini API access
- Firecrawl API key
## Installation
1. Clone the repository:
```bash
git clone <your-repo-url>
cd <your-repo-directory>
```
2. Install the required dependencies:
```bash
pip install -r requirements.txt
```
3. Create a `.env` file based on `.env.example`:
```bash
cp .env.example .env
```
4. Add your API keys to the `.env` file:
```
FIRECRAWL_API_KEY=your_firecrawl_api_key
GEMINI_API_KEY=your_gemini_api_key
```
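The script loads both keys at startup with `python-dotenv`, exactly as at the top of `gemini-2.5-crawler.py`, so nothing needs to be exported in your shell:
```python
# From the top of gemini-2.5-crawler.py: keys are read from .env at startup
import os
from dotenv import load_dotenv

load_dotenv()
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")
```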
## Usage
Run the script:
```bash
python gemini-2.5-crawler.py
```
The script will prompt you for:
1. The website URL to crawl
2. Your search objective
The crawler will then (see the sketch below):
1. Map the website and find relevant pages
2. Analyze the content using Gemini 2.5 Pro
3. Extract and analyze any PDFs or images found
4. Return structured information related to your objective
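Condensed from the script's `main()`, the whole pipeline is two function calls:
```python
# Condensed from main() in gemini-2.5-crawler.py
map_website = find_relevant_page_via_map(objective, url, app)  # step 1: map + rank URLs
if map_website:
    # steps 2-4: scrape top pages, analyze PDFs/images, return structured JSON
    result = find_objective_in_top_pages(map_website, objective, app)
```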
## Output
The script provides color-coded console output for:
- Process steps and progress
- Debug information
- Success and error messages
- Final results in JSON format
## Error Handling
The script includes comprehensive error handling for:
- API failures
- Content extraction issues
- Invalid URLs
- Timeouts
- JSON parsing errors
## Note
This script uses the experimental Gemini 2.5 Pro model (`gemini-2.5-pro-exp-03-25`). Make sure you have appropriate access and quota for using this model.
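To confirm access before running a full crawl, a one-off request with the same client the script constructs works as a sanity check (a minimal sketch; the prompt text is arbitrary):
```python
# Sanity check for model access/quota (sketch; any short prompt works)
import os
from dotenv import load_dotenv
import google.genai as genai

load_dotenv()
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
resp = client.models.generate_content(
    model="gemini-2.5-pro-exp-03-25",
    contents=["Reply with OK if you can read this."],
)
print(resp.text)  # an error here usually means missing access or quota
```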

gemini-2.5-crawler.py

@@ -0,0 +1,374 @@
import os
from firecrawl import FirecrawlApp
import json
import re
import requests
from requests.exceptions import RequestException
from dotenv import load_dotenv
import google.genai as genai

# Load environment variables
load_dotenv()

# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
gemini_api_key = os.getenv("GEMINI_API_KEY")

# Initialize the FirecrawlApp and Gemini client
app = FirecrawlApp(api_key=firecrawl_api_key)
client = genai.Client(api_key=gemini_api_key)  # Create Gemini client
model_name = "gemini-2.5-pro-exp-03-25"
types = genai.types


# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'


def pdf_size_in_mb(data: bytes) -> float:
    """Utility function to estimate PDF size in MB from raw bytes."""
    return len(data) / (1024 * 1024)


def gemini_extract_pdf_content(pdf_url, objective):
    """
    Downloads a PDF from pdf_url, then calls Gemini to extract text.
    Returns a string with the extracted text only.
    """
    try:
        pdf_data = requests.get(pdf_url, timeout=15).content
        size_mb = pdf_size_in_mb(pdf_data)
        # Skip very large PDFs to stay within request-size limits
        if size_mb > 15:
            print(
                f"{Colors.YELLOW}Warning: PDF size is {size_mb:.2f} MB. Skipping PDF extraction.{Colors.RESET}")
            return ""
        prompt = f"""
        The objective is: {objective}.
        From this PDF, extract only the text that helps address this objective.
        If it contains no relevant info, return an empty string.
        """
        response = client.models.generate_content(
            model=model_name,
            contents=[
                types.Part.from_bytes(
                    data=pdf_data, mime_type="application/pdf"),
                prompt
            ]
        )
        return response.text.strip()
    except Exception as e:
        print(f"Error using Gemini to process PDF '{pdf_url}': {str(e)}")
        return ""


def gemini_extract_image_data(image_url):
    """
    Downloads an image from image_url, then calls Gemini to summarize
    what's in the image. Returns a string with the summary.
    """
    try:
        print(f"Gemini IMAGE extraction from: {image_url}")
        image_data = requests.get(image_url, timeout=15).content
        # Summarize the image contents
        resp_summary = client.models.generate_content(
            model=model_name,
            contents=[
                "Describe the contents of this image in a short paragraph.",
                types.Part.from_bytes(data=image_data, mime_type="image/jpeg"),
            ]
        )
        summary_text = resp_summary.text.strip()
        return f"**Image Summary**:\n{summary_text}"
    except Exception as e:
        print(f"Error using Gemini to process Image '{image_url}': {str(e)}")
        return ""


def extract_urls_from_markdown(markdown_text):
    """
    Simple regex-based approach to extract potential URLs from a markdown string.
    We look for http(s)://someurl up until a space, parenthesis, or quote.
    """
    pattern = r'(https?://[^\s\'")]+)'
    found = re.findall(pattern, markdown_text)
    return list(set(found))  # de-duplicate
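# A quick illustration of the regex above (hypothetical input/output):
#   extract_urls_from_markdown("see [report](https://example.com/a.pdf) here")
#   -> ['https://example.com/a.pdf']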


def detect_mime_type(url, timeout=8):
    """
    Attempt a HEAD request to detect the Content-Type. Return 'pdf', 'image',
    or None if undetermined. Also validates image extensions for supported formats.
    """
    try:
        resp = requests.head(url, timeout=timeout, allow_redirects=True)
        ctype = resp.headers.get('Content-Type', '').lower()
        exts = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.heic', '.heif']
        if 'pdf' in ctype:
            return 'pdf'
        elif ctype.startswith('image/') and any(url.lower().endswith(ext) for ext in exts):
            return 'image'
        else:
            return None
    except RequestException as e:
        print(f"Warning: HEAD request failed for {url}. Error: {e}")
        return None


def find_relevant_page_via_map(objective, url, app):
    try:
        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")
        map_prompt = f"""
        Based on the objective of: {objective}, provide a 1-2 word search parameter that will help find the information.
        Respond with ONLY 1-2 words, no other text or formatting.
        """
        print(
            f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
        response = client.models.generate_content(
            model=model_name,
            contents=[map_prompt]
        )
        map_search_parameter = response.text.strip()
        print(
            f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")
        print(
            f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        map_website = app.map_url(url, params={"search": map_search_parameter})
        print(f"{Colors.MAGENTA}Debug - Map response structure: {json.dumps(map_website, indent=2)}{Colors.RESET}")
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
        # Normalize the map response (dict, JSON string, or list) into a flat list of URLs
        if isinstance(map_website, dict):
            links = map_website.get('urls', []) or map_website.get('links', [])
        elif isinstance(map_website, str):
            try:
                parsed = json.loads(map_website)
                links = parsed.get('urls', []) or parsed.get('links', [])
            except json.JSONDecodeError:
                links = []
        else:
            links = map_website if isinstance(map_website, list) else []
        if not links:
            print(f"{Colors.RED}No links found in map response.{Colors.RESET}")
            return None
        rank_prompt = f"""RESPOND ONLY WITH JSON.
Analyze these URLs and rank the top 3 most relevant ones for finding information about: {objective}
Return ONLY a JSON array in this exact format - no other text or explanation:
[
    {{
        "url": "http://example.com",
        "relevance_score": 95,
        "reason": "Main about page with company information"
    }},
    {{
        "url": "http://example2.com",
        "relevance_score": 85,
        "reason": "Team page with details"
    }},
    {{
        "url": "http://example3.com",
        "relevance_score": 75,
        "reason": "Blog post about company"
    }}
]
URLs to analyze:
{json.dumps(links, indent=2)}"""
        print(f"{Colors.YELLOW}Ranking URLs by relevance to objective...{Colors.RESET}")
        response = client.models.generate_content(
            model=model_name,
            contents=[rank_prompt]
        )
        print(f"{Colors.MAGENTA}Debug - Raw Gemini response:{Colors.RESET}")
        print(response.text)
        try:
            response_text = response.text.strip()
            print(f"{Colors.MAGENTA}Debug - Cleaned response:{Colors.RESET}")
            print(response_text)
            # Pull the JSON array out of the (possibly fenced) model response
            if '[' in response_text and ']' in response_text:
                start_idx = response_text.find('[')
                end_idx = response_text.rfind(']') + 1
                json_str = response_text[start_idx:end_idx]
                print(
                    f"{Colors.MAGENTA}Debug - Extracted JSON string:{Colors.RESET}")
                print(json_str)
                ranked_results = json.loads(json_str)
            else:
                print(f"{Colors.RED}No JSON array found in response{Colors.RESET}")
                return None
            links = [result["url"] for result in ranked_results]
            print(f"{Colors.CYAN}Top 3 ranked URLs:{Colors.RESET}")
            for result in ranked_results:
                print(f"{Colors.GREEN}URL: {result['url']}{Colors.RESET}")
                print(
                    f"{Colors.YELLOW}Relevance Score: {result['relevance_score']}{Colors.RESET}")
                print(f"{Colors.BLUE}Reason: {result['reason']}{Colors.RESET}")
                print("---")
            if not links:
                print(f"{Colors.RED}No relevant links identified.{Colors.RESET}")
                return None
        except json.JSONDecodeError as e:
            print(f"{Colors.RED}Error parsing ranked results: {str(e)}{Colors.RESET}")
            print(f"{Colors.RED}Failed JSON string: {response_text}{Colors.RESET}")
            return None
        except Exception as e:
            print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}")
            return None
        print(f"{Colors.GREEN}Located {len(links)} relevant links.{Colors.RESET}")
        return links
    except Exception as e:
        print(
            f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None


def find_objective_in_top_pages(map_website, objective, app):
    try:
        if not map_website:
            print(f"{Colors.RED}No links found to analyze.{Colors.RESET}")
            return None
        top_links = map_website[:3]
        print(
            f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")
        for link in top_links:
            print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}")
            scrape_result = app.scrape_url(
                link, params={'formats': ['markdown']})
            print(
                f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}")
            # Now detect any PDF or image URLs in the Markdown text
            page_markdown = scrape_result.get('markdown', '')
            if not page_markdown:
                print(
                    f"{Colors.RED}No markdown returned for {link}, skipping...{Colors.RESET}")
                continue
            found_urls = extract_urls_from_markdown(page_markdown)
            pdf_image_append = ""
            # Extract content from any linked PDFs/images with Gemini
            for sub_url in found_urls:
                mime_type_short = detect_mime_type(sub_url)
                if mime_type_short == 'pdf':
                    print(
                        f"{Colors.YELLOW} Detected PDF: {sub_url}. Extracting content...{Colors.RESET}")
                    pdf_content = gemini_extract_pdf_content(sub_url, objective)
                    if pdf_content:
                        pdf_image_append += f"\n\n---\n[PDF from {sub_url}]:\n{pdf_content}"
                elif mime_type_short == 'image':
                    print(
                        f"{Colors.YELLOW} Detected Image: {sub_url}. Extracting content...{Colors.RESET}")
                    image_content = gemini_extract_image_data(sub_url)
                    if image_content:
                        pdf_image_append += f"\n\n---\n[Image from {sub_url}]:\n{image_content}"
            # Append extracted PDF/image text to the main markdown for the page
            if pdf_image_append:
                scrape_result['markdown'] += f"\n\n---\n**Additional Gemini Extraction:**\n{pdf_image_append}\n"
            check_prompt = f"""
Analyze this content to find: {objective}
If found, return ONLY a JSON object with information related to the objective. If not found, respond EXACTLY with: Objective not met
Content to analyze:
{scrape_result['markdown']}
Remember:
- Return valid JSON if information is found
- Return EXACTLY "Objective not met" if not found
- No other text or explanations
"""
            response = client.models.generate_content(
                model=model_name,
                contents=[check_prompt]
            )
            result = response.text.strip()
            print(f"{Colors.MAGENTA}Debug - Check response:{Colors.RESET}")
            print(result)
            if result != "Objective not met":
                print(
                    f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
                try:
                    # Extract the JSON object from the (possibly fenced) model response
                    if '{' in result and '}' in result:
                        start_idx = result.find('{')
                        end_idx = result.rfind('}') + 1
                        json_str = result[start_idx:end_idx]
                        return json.loads(json_str)
                    else:
                        print(
                            f"{Colors.RED}No JSON object found in response{Colors.RESET}")
                except json.JSONDecodeError:
                    print(
                        f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
            else:
                print(
                    f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")
        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
        return None
    except Exception as e:
        print(
            f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
        return None


def main():
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")
    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    map_website = find_relevant_page_via_map(objective, url, app)
    if map_website:
        print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis using Gemini 2.5 Pro...{Colors.RESET}")
        result = find_objective_in_top_pages(map_website, objective, app)
        if result:
            print(
                f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
            print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
        else:
            print(
                f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
    else:
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")


if __name__ == "__main__":
    main()

requirements.txt

@@ -0,0 +1,5 @@
google-genai>=1.0.0
python-dotenv>=1.0.0
requests>=2.31.0
firecrawl>=0.1.0