Add examples/mistral 3.1 company researcher

This commit is contained in:
Aparup Ganguly 2025-03-21 14:03:31 +05:30
parent 2fb29ee46e
commit 6a6199eb4b

@@ -0,0 +1,376 @@
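"""Company researcher example.

Pipeline: search Google via SerpAPI, have Mistral Small 3.1 select and
cross-verify the most reliable URLs, then extract structured data from those
URLs with Firecrawl's /v1/extract endpoint.

Requires MISTRAL_API_KEY, FIRECRAWL_API_KEY, and SERP_API_KEY, e.g. in a
local .env file loaded by python-dotenv.
"""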
import os
import json
import time
import requests
from dotenv import load_dotenv
from serpapi.google_search import GoogleSearch
from mistralai import Mistral
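
# Third-party dependencies (PyPI package names can differ from import names):
#   pip install python-dotenv google-search-results mistralai requests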

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Initialize clients
mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
serp_api_key = os.getenv("SERP_API_KEY")

if not firecrawl_api_key:
    print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}")
if not os.getenv("MISTRAL_API_KEY"):
    print(f"{Colors.RED}Warning: MISTRAL_API_KEY not found in environment variables{Colors.RESET}")
if not serp_api_key:
    print(f"{Colors.RED}Warning: SERP_API_KEY not found in environment variables{Colors.RESET}")


def search_google(query):
    """Search Google via SerpAPI and return the organic results."""
    print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
    search = GoogleSearch({"q": query, "api_key": serp_api_key})
    return search.get_dict().get("organic_results", [])
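
# Example (hypothetical query): search_google("Acme Corp latest funding round")
# returns a list of result dicts; the fields used downstream are "title",
# "link", and "snippet".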


def select_urls_with_mistral(company, objective, serp_results):
    """
    Use Mistral Small 3.1 to select URLs from SERP results with enhanced criteria.
    Returns a list of up to five cleaned URLs; the confidence scores and
    justifications the model assigns are printed for transparency.
    """
    try:
        serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
                     for r in serp_results if r.get("link")]
        prompt = (
            "Task: Select the MOST RELIABLE and RELEVANT URLs that contain VERIFIABLE information about the specified company.\n\n"
            "Instructions:\n"
            "1. Analyze the search results for information SPECIFICALLY about the requested objective\n"
            "2. Select ONLY official and highly reliable URLs that DIRECTLY address the requested information\n"
            "3. Prioritize in this exact order:\n"
            "   a. The company's official website sections that specifically address the requested information\n"
            "   b. Official company documents (annual reports, SEC filings, press releases) that contain verifiable data\n"
            "   c. Government databases or regulatory filings that contain verified information\n"
            "   d. Trusted industry databases with cited sources (e.g., Bloomberg, Reuters, industry associations)\n"
            "4. EXCLUDE any sources that:\n"
            "   a. Contain primarily opinions or analysis rather than facts\n"
            "   b. Are outdated (older than 1 year unless historical information is requested)\n"
            "   c. Are from general news sites without specific expertise in the topic\n"
            "   d. Do not cite their sources or methodology\n"
            "   e. Are social media links or user-generated content\n"
            "5. For each URL selected, provide a confidence score (1-10) and brief justification\n"
            "6. Limit selection to 3-5 of the MOST RELIABLE and RELEVANT sources only\n"
            "7. Return a JSON object with the following structure: {\"selected_urls\": [{\"url\": \"url1\", \"confidence\": 9, \"justification\": \"Official company annual report with audited figures\"}]}\n\n"
            f"Company: {company}\n"
            f"Information Needed: {objective}\n"
            f"Search Results: {json.dumps(serp_data, indent=2)}\n\n"
            "Response Format: {\"selected_urls\": [{\"url\": \"https://example.com\", \"confidence\": 9, \"justification\": \"Reason this is reliable\"}]}"
        )
        response = mistral_client.chat.complete(
            model="mistral-small-latest",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        # Strip a surrounding Markdown code fence (``` or ```json) if the model added one
        cleaned_response = response.choices[0].message.content.strip()
        if cleaned_response.startswith('```'):
            cleaned_response = cleaned_response.split('```')[1]
            if cleaned_response.startswith('json'):
                cleaned_response = cleaned_response[4:]
        cleaned_response = cleaned_response.strip()
        try:
            # Parse JSON response
            result = json.loads(cleaned_response)
            if isinstance(result, dict) and "selected_urls" in result:
                url_data = result["selected_urls"]
                # Extract just the URLs for compatibility with existing code
                urls = [item["url"] for item in url_data if "url" in item]
                # Print detailed information about selected URLs
                print(f"{Colors.CYAN}Selected URLs with confidence scores:{Colors.RESET}")
                for item in url_data:
                    if "url" in item and "confidence" in item and "justification" in item:
                        print(f"- {item['url']} (Confidence: {item['confidence']}/10)")
                        print(f"  Justification: {item['justification']}")
            else:
                # Fallback to text parsing
                urls = [line.strip() for line in cleaned_response.split('\n')
                        if line.strip().startswith(('http://', 'https://'))]
        except json.JSONDecodeError:
            # Fallback to text parsing
            urls = [line.strip() for line in cleaned_response.split('\n')
                    if line.strip().startswith(('http://', 'https://'))]
        # Clean up URLs
        cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls]
        cleaned_urls = [url for url in cleaned_urls if url]
        # Limit to top 5 URLs to ensure quality over quantity
        cleaned_urls = cleaned_urls[:5]
        if not cleaned_urls:
            print(f"{Colors.YELLOW}No valid URLs found in response.{Colors.RESET}")
            return []
        # Return the URLs for cross-verification
        return cleaned_urls
    except Exception as e:
        print(f"{Colors.RED}Error selecting URLs: {str(e)}{Colors.RESET}")
        return []


def cross_verify_sources(urls, company, objective):
    """Use Mistral to cross-verify information across selected sources."""
    print(f"{Colors.YELLOW}Cross-verifying selected sources...{Colors.RESET}")
    verification_prompt = (
        f"Task: Evaluate the reliability and consistency of these sources for information about {company}.\n\n"
        f"Objective: {objective}\n\n"
        f"URLs to evaluate: {json.dumps(urls)}\n\n"
        "Instructions:\n"
        "1. For each URL, identify what makes it reliable or unreliable for the specific objective\n"
        "2. Assess whether these sources are likely to provide consistent or contradictory information\n"
        "3. Identify any potential biases in these sources (e.g., company's own website may present favorable information)\n"
        "4. Recommend the final set of URLs that, when used together, will provide the most accurate and complete information\n"
        "5. IMPORTANT: Only include URLs that are DIRECTLY relevant to the specific objective\n"
        "6. Exclude any URLs that contain primarily general information about the company not related to the objective\n"
        "7. Return a JSON object with: {\"verified_urls\": [\"url1\", \"url2\"], \"verification_notes\": \"explanation\"}\n"
    )
    try:
        response = mistral_client.chat.complete(
            model="mistral-small-latest",
            messages=[
                {"role": "user", "content": verification_prompt}
            ]
        )
        # Strip a surrounding Markdown code fence (``` or ```json) if present
        cleaned_response = response.choices[0].message.content.strip()
        if cleaned_response.startswith('```'):
            cleaned_response = cleaned_response.split('```')[1]
            if cleaned_response.startswith('json'):
                cleaned_response = cleaned_response[4:]
        cleaned_response = cleaned_response.strip()
        try:
            # Parse JSON response
            result = json.loads(cleaned_response)
            if isinstance(result, dict) and "verified_urls" in result:
                verified_urls = result["verified_urls"]
                verification_notes = result.get("verification_notes", "")
                print(f"{Colors.CYAN}Cross-verification complete:{Colors.RESET}")
                print(f"{Colors.CYAN}Notes: {verification_notes}{Colors.RESET}")
                print(f"{Colors.CYAN}Final verified URLs:{Colors.RESET}")
                for url in verified_urls:
                    print(f"- {url}")
                return verified_urls
            else:
                # JSON parsed but lacked "verified_urls"; fall back to the original URLs
                print(f"{Colors.YELLOW}Could not parse cross-verification result. Using original URLs.{Colors.RESET}")
                return urls
        except json.JSONDecodeError:
            # If JSON parsing fails, return original URLs
            print(f"{Colors.YELLOW}Could not parse cross-verification result. Using original URLs.{Colors.RESET}")
            return urls
    except Exception as e:
        print(f"{Colors.RED}Error during cross-verification: {str(e)}{Colors.RESET}")
        return urls  # Return original URLs if cross-verification fails


def extract_company_info(urls, prompt, company, api_key):
    """Use requests to call Firecrawl's extract endpoint with selected URLs."""
    print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")
    # Enhanced prompt for better data quality
    enhanced_prompt = (
        f"Extract accurate and verified information about {company}. "
        f"Specifically focus on: {prompt}. "
        f"IMPORTANT INSTRUCTIONS:\n"
        f"1. Only include information that is EXPLICITLY stated in the source material\n"
        f"2. Do NOT include any speculative information\n"
        f"3. If information conflicts between sources, prioritize information from the company's official website\n"
        f"4. For each piece of information, cite the specific source URL\n"
        f"5. Assign a confidence score (1-10) to each piece of information based on source reliability\n"
        f"6. ONLY include information that is DIRECTLY relevant to the specific request\n"
        f"7. EXCLUDE any tangential or general information about the company not related to the specific request\n"
        f"8. Format the response as a structured JSON with clear categories related to the request\n"
        f"9. For each data point, include both the information and its source in this format: {{\"value\": \"information\", \"source\": \"url\", \"confidence\": 8}}\n"
        f"10. If multiple sources confirm the same information, cite all sources and increase the confidence score\n"
        f"11. If you cannot find specific information requested, explicitly state that it was not found in the sources rather than providing general information"
    )
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }
    payload = {
        "urls": urls,
        "prompt": enhanced_prompt,
        "enableWebSearch": False  # Rely only on the verified URLs; no supplemental web search
    }
    try:
        # Print the payload for debugging
        print(f"{Colors.YELLOW}Request payload:{Colors.RESET}")
        print(json.dumps(payload, indent=2))
        response = requests.post(
            "https://api.firecrawl.dev/v1/extract",
            headers=headers,
            json=payload,
            timeout=30
        )
        # Print detailed response for debugging
        print(f"{Colors.YELLOW}Response status code: {response.status_code}{Colors.RESET}")
        print(f"{Colors.YELLOW}Response headers: {response.headers}{Colors.RESET}")
        data = response.json()
        print(f"{Colors.YELLOW}Response body:{Colors.RESET}")
        print(json.dumps(data, indent=2))
        if not data.get('success'):
            print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}")
            return None
        extraction_id = data.get('id')
        if not extraction_id:
            print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}")
            return None
        return poll_firecrawl_result(extraction_id, api_key)
    except requests.exceptions.RequestException as e:
        print(f"{Colors.RED}Request failed: {e}{Colors.RESET}")
        return None
    except json.JSONDecodeError as e:
        print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}")
        return None
    except Exception as e:
        print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
        return None


def poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=60):
    """Poll Firecrawl API to get the extraction result."""
    url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}"
    headers = {
        'Authorization': f'Bearer {api_key}'
    }
    print(f"{Colors.YELLOW}Waiting for extraction to complete...{Colors.RESET}")
    # Show a simple progress indicator instead of "still processing" messages
    print(f"{Colors.YELLOW}[", end="", flush=True)
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
            if data.get('success') and data.get('data'):
                print(f"]{Colors.RESET}")  # Close the progress indicator
                print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")
                # Validate and clean the extracted data
                validated_data = validate_extracted_data(data['data'])
                print(json.dumps(validated_data, indent=2))
                return validated_data
            elif data.get('success') and not data.get('data'):
                # Still processing: extend the progress indicator and wait
                print(f"{Colors.YELLOW}.", end="", flush=True)
                time.sleep(interval)
            else:
                print(f"]{Colors.RESET}")  # Close the progress indicator
                print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"]{Colors.RESET}")  # Close the progress indicator
            print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}")
            return None
        except json.JSONDecodeError as e:
            print(f"]{Colors.RESET}")  # Close the progress indicator
            print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}")
            return None
        except Exception as e:
            print(f"]{Colors.RESET}")  # Close the progress indicator
            print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}")
            return None
    print(f"]{Colors.RESET}")  # Close the progress indicator
    print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}")
    return None
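
# With the defaults above (interval=5 seconds, max_attempts=60), polling gives
# the extraction job up to roughly five minutes before giving up.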


def validate_extracted_data(data):
    """Validate and clean the extracted data to reduce misinformation."""
    if not data or not isinstance(data, dict):
        return data
    # Look for confidence scores or source information if available
    validated_data = {}
    for key, value in data.items():
        # Skip entries that indicate uncertainty
        if isinstance(value, str) and any(term in value.lower() for term in ["unknown", "unclear", "not specified", "not found", "couldn't find"]):
            continue
        # Keep entries with clear information
        validated_data[key] = value
    return validated_data
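
# Example (hypothetical data): {"ceo": "Jane Doe", "revenue": "not found in sources"}
# is reduced to {"ceo": "Jane Doe"}, because the "revenue" value contains an
# uncertainty phrase. Only top-level string values are checked; nested objects
# such as {"value": ..., "source": ..., "confidence": ...} pass through as-is.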


def main():
    company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")

    # Combine the company name and objective into a more specific search query
    search_query = f"{company} {objective}"
    serp_results = search_google(search_query)
    if not serp_results:
        # Fall back to searching the company name alone
        print(f"{Colors.YELLOW}No results found. Trying broader search...{Colors.RESET}")
        serp_results = search_google(company)
        if not serp_results:
            print(f"{Colors.RED}No search results found.{Colors.RESET}")
            return

    # Select URLs with Mistral
    selected_urls = select_urls_with_mistral(company, objective, serp_results)
    if not selected_urls:
        print(f"{Colors.RED}No URLs were selected.{Colors.RESET}")
        return

    # Cross-verify the selected sources
    verified_urls = cross_verify_sources(selected_urls, company, objective)
    if not verified_urls:
        print(f"{Colors.YELLOW}No URLs were verified. Using original selected URLs.{Colors.RESET}")
        verified_urls = selected_urls

    data = extract_company_info(verified_urls, objective, company, firecrawl_api_key)
    if data:
        print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}")
    else:
        print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")


if __name__ == "__main__":
    main()
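
# Example session (hypothetical input; the script name is illustrative and all
# three API keys are assumed to be set in .env):
#   $ python company_researcher.py
#   Enter the company name: Acme Corp
#   Enter what information you want about the company: latest funding round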