Add examples/mistral 3.1 company researcher

parent 2fb29ee46e
commit 6a6199eb4b

@@ -0,0 +1,376 @@
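"""Example: company researcher that searches Google via SerpAPI, has Mistral
Small 3.1 select and cross-verify source URLs, and extracts structured company
data with Firecrawl's extract endpoint. (Summary docstring added for context;
derived from the code below.)"""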
import os
import json
import time

import requests
from dotenv import load_dotenv
from serpapi.google_search import GoogleSearch
from mistralai import Mistral
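# Dependency note (package names are an assumption, not pinned by this diff):
# these imports most likely map to the PyPI packages
#   pip install requests python-dotenv google-search-results mistralai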

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Initialize clients
mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
serp_api_key = os.getenv("SERP_API_KEY")

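# A minimal .env file for this script would look like (placeholder values):
#   MISTRAL_API_KEY=your-mistral-key
#   FIRECRAWL_API_KEY=your-firecrawl-key
#   SERP_API_KEY=your-serpapi-key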
if not firecrawl_api_key:
    print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}")

if not os.getenv("MISTRAL_API_KEY"):
    print(f"{Colors.RED}Warning: MISTRAL_API_KEY not found in environment variables{Colors.RESET}")

if not serp_api_key:
    print(f"{Colors.RED}Warning: SERP_API_KEY not found in environment variables{Colors.RESET}")

def search_google(query):
    """Search Google using SerpAPI and return top results."""
    print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
    search = GoogleSearch({"q": query, "api_key": serp_api_key})
    return search.get_dict().get("organic_results", [])

def select_urls_with_mistral(company, objective, serp_results):
    """
    Use Mistral Small 3.1 to select URLs from SERP results with enhanced criteria.
    Returns a list of URLs with confidence scores and justifications.
    """
    try:
        serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
                     for r in serp_results if r.get("link")]

        prompt = (
            "Task: Select the MOST RELIABLE and RELEVANT URLs that contain VERIFIABLE information about the specified company.\n\n"
            "Instructions:\n"
            "1. Analyze the search results for information SPECIFICALLY about the requested objective\n"
            "2. Select ONLY official and highly reliable URLs that DIRECTLY address the requested information\n"
            "3. Prioritize in this exact order:\n"
            "   a. The company's official website sections that specifically address the requested information\n"
            "   b. Official company documents (annual reports, SEC filings, press releases) that contain verifiable data\n"
            "   c. Government databases or regulatory filings that contain verified information\n"
            "   d. Trusted industry databases with cited sources (e.g., Bloomberg, Reuters, industry associations)\n"
            "4. EXCLUDE any sources that:\n"
            "   a. Contain primarily opinions or analysis rather than facts\n"
            "   b. Are outdated (older than 1 year unless historical information is requested)\n"
            "   c. Are from general news sites without specific expertise in the topic\n"
            "   d. Do not cite their sources or methodology\n"
            "   e. Are social media links or user-generated content\n"
            "5. For each URL selected, provide a confidence score (1-10) and brief justification\n"
            "6. Limit selection to 3-5 of the MOST RELIABLE and RELEVANT sources only\n"
            "7. Return a JSON object with the following structure: {\"selected_urls\": [{\"url\": \"url1\", \"confidence\": 9, \"justification\": \"Official company annual report with audited figures\"}]}\n\n"
            f"Company: {company}\n"
            f"Information Needed: {objective}\n"
            f"Search Results: {json.dumps(serp_data, indent=2)}\n\n"
            "Response Format: {\"selected_urls\": [{\"url\": \"https://example.com\", \"confidence\": 9, \"justification\": \"Reason this is reliable\"}]}"
        )

        response = mistral_client.chat.complete(
            model="mistral-small-latest",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        # Clean the response text (strip any markdown code fences around the JSON)
        cleaned_response = response.choices[0].message.content.strip()
        if cleaned_response.startswith('```'):
            cleaned_response = cleaned_response.split('```')[1]
            if cleaned_response.startswith('json'):
                cleaned_response = cleaned_response[4:]
        cleaned_response = cleaned_response.strip()

        try:
            # Parse JSON response
            result = json.loads(cleaned_response)
            if isinstance(result, dict) and "selected_urls" in result:
                url_data = result["selected_urls"]
                # Extract just the URLs for compatibility with existing code
                urls = [item["url"] for item in url_data if "url" in item]

                # Print detailed information about selected URLs
                print(f"{Colors.CYAN}Selected URLs with confidence scores:{Colors.RESET}")
                for item in url_data:
                    if "url" in item and "confidence" in item and "justification" in item:
                        print(f"- {item['url']} (Confidence: {item['confidence']}/10)")
                        print(f"  Justification: {item['justification']}")
            else:
                # Fallback to text parsing
                urls = [line.strip() for line in cleaned_response.split('\n')
                        if line.strip().startswith(('http://', 'https://'))]
        except json.JSONDecodeError:
            # Fallback to text parsing
            urls = [line.strip() for line in cleaned_response.split('\n')
                    if line.strip().startswith(('http://', 'https://'))]

        # Clean up URLs
        cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls]
        cleaned_urls = [url for url in cleaned_urls if url]

        # Limit to top 5 URLs to ensure quality over quantity
        cleaned_urls = cleaned_urls[:5]

        if not cleaned_urls:
            print(f"{Colors.YELLOW}No valid URLs found in response.{Colors.RESET}")
            return []

        # Return the URLs for cross-verification
        return cleaned_urls

    except Exception as e:
        print(f"{Colors.RED}Error selecting URLs: {str(e)}{Colors.RESET}")
        return []
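# Shape the selection prompt asks the model to return (values illustrative):
#   {"selected_urls": [{"url": "https://example.com/about",
#                       "confidence": 9,
#                       "justification": "Official company page"}]}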

def cross_verify_sources(urls, company, objective):
    """Use Mistral to cross-verify information across selected sources."""

    print(f"{Colors.YELLOW}Cross-verifying selected sources...{Colors.RESET}")

    verification_prompt = (
        f"Task: Evaluate the reliability and consistency of these sources for information about {company}.\n\n"
        f"Objective: {objective}\n\n"
        f"URLs to evaluate: {json.dumps(urls)}\n\n"
        "Instructions:\n"
        "1. For each URL, identify what makes it reliable or unreliable for the specific objective\n"
        "2. Assess whether these sources are likely to provide consistent or contradictory information\n"
        "3. Identify any potential biases in these sources (e.g., company's own website may present favorable information)\n"
        "4. Recommend the final set of URLs that, when used together, will provide the most accurate and complete information\n"
        "5. IMPORTANT: Only include URLs that are DIRECTLY relevant to the specific objective\n"
        "6. Exclude any URLs that contain primarily general information about the company not related to the objective\n"
        "7. Return a JSON object with: {\"verified_urls\": [\"url1\", \"url2\"], \"verification_notes\": \"explanation\"}\n"
    )

    try:
        response = mistral_client.chat.complete(
            model="mistral-small-latest",
            messages=[
                {"role": "user", "content": verification_prompt}
            ]
        )

        # Clean the response text (strip any markdown code fences around the JSON)
        cleaned_response = response.choices[0].message.content.strip()
        if cleaned_response.startswith('```'):
            cleaned_response = cleaned_response.split('```')[1]
            if cleaned_response.startswith('json'):
                cleaned_response = cleaned_response[4:]
        cleaned_response = cleaned_response.strip()

        try:
            # Parse JSON response
            result = json.loads(cleaned_response)
            if isinstance(result, dict) and "verified_urls" in result:
                verified_urls = result["verified_urls"]
                verification_notes = result.get("verification_notes", "")

                print(f"{Colors.CYAN}Cross-verification complete:{Colors.RESET}")
                print(f"{Colors.CYAN}Notes: {verification_notes}{Colors.RESET}")
                print(f"{Colors.CYAN}Final verified URLs:{Colors.RESET}")
                for url in verified_urls:
                    print(f"- {url}")

                return verified_urls
            else:
                # Response JSON did not contain "verified_urls"; keep the original URLs
                print(f"{Colors.YELLOW}Could not parse cross-verification result. Using original URLs.{Colors.RESET}")
                return urls
        except json.JSONDecodeError:
            # If JSON parsing fails, return original URLs
            print(f"{Colors.YELLOW}Could not parse cross-verification result. Using original URLs.{Colors.RESET}")
            return urls

    except Exception as e:
        print(f"{Colors.RED}Error during cross-verification: {str(e)}{Colors.RESET}")
        return urls  # Return original URLs if cross-verification fails
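# Shape the verification prompt asks the model to return (values illustrative):
#   {"verified_urls": ["https://example.com/about"],
#    "verification_notes": "Official source; directly addresses the objective"}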

def extract_company_info(urls, prompt, company, api_key):
    """Use requests to call Firecrawl's extract endpoint with selected URLs."""
    print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")

    # Enhanced prompt for better data quality
    enhanced_prompt = (
        f"Extract accurate and verified information about {company}. "
        f"Specifically focus on: {prompt}. "
        f"IMPORTANT INSTRUCTIONS:\n"
        f"1. Only include information that is EXPLICITLY stated in the source material\n"
        f"2. Do NOT include any speculative information\n"
        f"3. If information conflicts between sources, prioritize information from the company's official website\n"
        f"4. For each piece of information, cite the specific source URL\n"
        f"5. Assign a confidence score (1-10) to each piece of information based on source reliability\n"
        f"6. ONLY include information that is DIRECTLY relevant to the specific request\n"
        f"7. EXCLUDE any tangential or general information about the company not related to the specific request\n"
        f"8. Format the response as a structured JSON with clear categories related to the request\n"
        f"9. For each data point, include both the information and its source in this format: {{\"value\": \"information\", \"source\": \"url\", \"confidence\": 8}}\n"
        f"10. If multiple sources confirm the same information, cite all sources and increase the confidence score\n"
        f"11. If you cannot find specific information requested, explicitly state that it was not found in the sources rather than providing general information"
    )

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }

    payload = {
        "urls": urls,
        "prompt": enhanced_prompt,
        "enableWebSearch": False  # Changed to False to rely only on verified URLs
    }

    try:
        # Print the payload for debugging
        print(f"{Colors.YELLOW}Request payload:{Colors.RESET}")
        print(json.dumps(payload, indent=2))

        response = requests.post(
            "https://api.firecrawl.dev/v1/extract",
            headers=headers,
            json=payload,
            timeout=30
        )

        # Print detailed response for debugging
        print(f"{Colors.YELLOW}Response status code: {response.status_code}{Colors.RESET}")
        print(f"{Colors.YELLOW}Response headers: {response.headers}{Colors.RESET}")

        data = response.json()
        print(f"{Colors.YELLOW}Response body:{Colors.RESET}")
        print(json.dumps(data, indent=2))

        if not data.get('success'):
            print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}")
            return None

        extraction_id = data.get('id')
        if not extraction_id:
            print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}")
            return None

        return poll_firecrawl_result(extraction_id, api_key)

    except requests.exceptions.RequestException as e:
        print(f"{Colors.RED}Request failed: {e}{Colors.RESET}")
        return None
    except json.JSONDecodeError as e:
        print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}")
        return None
    except Exception as e:
        print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
        return None
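# Note: as consumed here, the extract endpoint responds asynchronously -- the
# initial response is expected to carry a job id (illustrative shape only):
#   {"success": true, "id": "<extraction-id>"}
# The actual data is then fetched by polling, as implemented below.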

def poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=60):
    """Poll Firecrawl API to get the extraction result."""
    url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}"
    headers = {
        'Authorization': f'Bearer {api_key}'
    }

    print(f"{Colors.YELLOW}Waiting for extraction to complete...{Colors.RESET}")

    # Show a simple progress indicator instead of "still processing" messages
    print(f"{Colors.YELLOW}[", end="", flush=True)

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()

            if data.get('success') and data.get('data'):
                print(f"]{Colors.RESET}")  # Close the progress indicator
                print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")

                # Validate and clean the extracted data
                validated_data = validate_extracted_data(data['data'])
                print(json.dumps(validated_data, indent=2))
                return validated_data
            elif data.get('success') and not data.get('data'):
                # Show a simple progress indicator
                print(f"{Colors.YELLOW}.", end="", flush=True)
                time.sleep(interval)
            else:
                print(f"]{Colors.RESET}")  # Close the progress indicator
                print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}")
                return None

        except requests.exceptions.RequestException as e:
            print(f"]{Colors.RESET}")  # Close the progress indicator
            print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}")
            return None
        except json.JSONDecodeError as e:
            print(f"]{Colors.RESET}")  # Close the progress indicator
            print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}")
            return None
        except Exception as e:
            print(f"]{Colors.RESET}")  # Close the progress indicator
            print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}")
            return None

    print(f"]{Colors.RESET}")  # Close the progress indicator
    print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}")
    return None
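# With the defaults above (interval=5, max_attempts=60), polling waits up to
# roughly 5 minutes (60 attempts x 5 s) before giving up.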

def validate_extracted_data(data):
    """Validate and clean the extracted data to reduce misinformation."""
    if not data or not isinstance(data, dict):
        return data

    # Look for confidence scores or source information if available
    validated_data = {}

    for key, value in data.items():
        # Skip entries that indicate uncertainty
        if isinstance(value, str) and any(term in value.lower() for term in ["unknown", "unclear", "not specified", "not found", "couldn't find"]):
            continue

        # Keep entries with clear information
        validated_data[key] = value

    return validated_data
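# Example with hypothetical data: entries whose string values signal
# uncertainty are dropped:
#   validate_extracted_data({"ceo": "Jane Doe", "revenue": "not specified"})
#   -> {"ceo": "Jane Doe"}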

def main():
    company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")

    # Combine company and objective into a more specific search query
    search_query = f"{company} {objective}"
    serp_results = search_google(search_query)

    if not serp_results:
        # Fall back to just the company name
        print(f"{Colors.YELLOW}No results found. Trying broader search...{Colors.RESET}")
        serp_results = search_google(company)

        if not serp_results:
            print(f"{Colors.RED}No search results found.{Colors.RESET}")
            return

    # Select URLs with Mistral
    selected_urls = select_urls_with_mistral(company, objective, serp_results)

    if not selected_urls:
        print(f"{Colors.RED}No URLs were selected.{Colors.RESET}")
        return

    # Cross-verify the selected sources
    verified_urls = cross_verify_sources(selected_urls, company, objective)

    if not verified_urls:
        print(f"{Colors.YELLOW}No URLs were verified. Using original selected URLs.{Colors.RESET}")
        verified_urls = selected_urls

    data = extract_company_info(verified_urls, objective, company, firecrawl_api_key)

    if data:
        print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}")
    else:
        print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")

if __name__ == "__main__":
    main()
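To try the example, set the three API keys above (for instance in a .env file) and run the script with Python 3. The filename below is hypothetical; this diff does not show the file's actual path:

    python mistral_company_researcher.py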