Add examples/mistral 3.1 company researcher

parent 2fb29ee46e
commit 6a6199eb4b

@@ -0,0 +1,376 @@
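"""Example: company researcher that searches Google via SerpAPI, has Mistral
Small 3.1 select and cross-verify source URLs, and extracts structured company
data with Firecrawl's extract endpoint. (Summary docstring added for context;
derived from the code below.)"""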
import os
import json
import time

import requests
from dotenv import load_dotenv
from serpapi.google_search import GoogleSearch
from mistralai import Mistral
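# Dependency note (package names are an assumption, not pinned by this diff):
# these imports most likely map to the PyPI packages
#   pip install requests python-dotenv google-search-results mistralai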

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Initialize clients
mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
serp_api_key = os.getenv("SERP_API_KEY")

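# A minimal .env file for this script would look like (placeholder values):
#   MISTRAL_API_KEY=your-mistral-key
#   FIRECRAWL_API_KEY=your-firecrawl-key
#   SERP_API_KEY=your-serpapi-key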
if not firecrawl_api_key:
    print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}")

if not os.getenv("MISTRAL_API_KEY"):
    print(f"{Colors.RED}Warning: MISTRAL_API_KEY not found in environment variables{Colors.RESET}")

if not serp_api_key:
    print(f"{Colors.RED}Warning: SERP_API_KEY not found in environment variables{Colors.RESET}")

def search_google(query):
    """Search Google using SerpAPI and return top results."""
    print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
    search = GoogleSearch({"q": query, "api_key": serp_api_key})
    return search.get_dict().get("organic_results", [])

def select_urls_with_mistral(company, objective, serp_results):
    """
    Use Mistral Small 3.1 to select URLs from SERP results with enhanced criteria.
    Returns a list of URLs with confidence scores and justifications.
    """
    try:
        serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
                     for r in serp_results if r.get("link")]

        prompt = (
            "Task: Select the MOST RELIABLE and RELEVANT URLs that contain VERIFIABLE information about the specified company.\n\n"
            "Instructions:\n"
            "1. Analyze the search results for information SPECIFICALLY about the requested objective\n"
            "2. Select ONLY official and highly reliable URLs that DIRECTLY address the requested information\n"
            "3. Prioritize in this exact order:\n"
            "   a. The company's official website sections that specifically address the requested information\n"
            "   b. Official company documents (annual reports, SEC filings, press releases) that contain verifiable data\n"
            "   c. Government databases or regulatory filings that contain verified information\n"
            "   d. Trusted industry databases with cited sources (e.g., Bloomberg, Reuters, industry associations)\n"
            "4. EXCLUDE any sources that:\n"
            "   a. Contain primarily opinions or analysis rather than facts\n"
            "   b. Are outdated (older than 1 year unless historical information is requested)\n"
            "   c. Are from general news sites without specific expertise in the topic\n"
            "   d. Do not cite their sources or methodology\n"
            "   e. Are social media links or user-generated content\n"
            "5. For each URL selected, provide a confidence score (1-10) and brief justification\n"
            "6. Limit selection to 3-5 of the MOST RELIABLE and RELEVANT sources only\n"
            "7. Return a JSON object with the following structure: {\"selected_urls\": [{\"url\": \"url1\", \"confidence\": 9, \"justification\": \"Official company annual report with audited figures\"}]}\n\n"
            f"Company: {company}\n"
            f"Information Needed: {objective}\n"
            f"Search Results: {json.dumps(serp_data, indent=2)}\n\n"
            "Response Format: {\"selected_urls\": [{\"url\": \"https://example.com\", \"confidence\": 9, \"justification\": \"Reason this is reliable\"}]}"
        )

        response = mistral_client.chat.complete(
            model="mistral-small-latest",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        # Clean the response text (strip any markdown code fences around the JSON)
        cleaned_response = response.choices[0].message.content.strip()
        if cleaned_response.startswith('```'):
            cleaned_response = cleaned_response.split('```')[1]
            if cleaned_response.startswith('json'):
                cleaned_response = cleaned_response[4:]
        cleaned_response = cleaned_response.strip()

        try:
            # Parse JSON response
            result = json.loads(cleaned_response)
            if isinstance(result, dict) and "selected_urls" in result:
                url_data = result["selected_urls"]
                # Extract just the URLs for compatibility with existing code
                urls = [item["url"] for item in url_data if "url" in item]

                # Print detailed information about selected URLs
                print(f"{Colors.CYAN}Selected URLs with confidence scores:{Colors.RESET}")
                for item in url_data:
                    if "url" in item and "confidence" in item and "justification" in item:
                        print(f"- {item['url']} (Confidence: {item['confidence']}/10)")
                        print(f"  Justification: {item['justification']}")
            else:
                # Fallback to text parsing
                urls = [line.strip() for line in cleaned_response.split('\n')
                        if line.strip().startswith(('http://', 'https://'))]
        except json.JSONDecodeError:
            # Fallback to text parsing
            urls = [line.strip() for line in cleaned_response.split('\n')
                    if line.strip().startswith(('http://', 'https://'))]

        # Clean up URLs
        cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls]
        cleaned_urls = [url for url in cleaned_urls if url]

        # Limit to top 5 URLs to ensure quality over quantity
        cleaned_urls = cleaned_urls[:5]

        if not cleaned_urls:
            print(f"{Colors.YELLOW}No valid URLs found in response.{Colors.RESET}")
            return []

        # Return the URLs for cross-verification
        return cleaned_urls

    except Exception as e:
        print(f"{Colors.RED}Error selecting URLs: {str(e)}{Colors.RESET}")
        return []
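# Shape the selection prompt asks the model to return (values illustrative):
#   {"selected_urls": [{"url": "https://example.com/about",
#                       "confidence": 9,
#                       "justification": "Official company page"}]}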

def cross_verify_sources(urls, company, objective):
    """Use Mistral to cross-verify information across selected sources."""

    print(f"{Colors.YELLOW}Cross-verifying selected sources...{Colors.RESET}")

    verification_prompt = (
        f"Task: Evaluate the reliability and consistency of these sources for information about {company}.\n\n"
        f"Objective: {objective}\n\n"
        f"URLs to evaluate: {json.dumps(urls)}\n\n"
        "Instructions:\n"
        "1. For each URL, identify what makes it reliable or unreliable for the specific objective\n"
        "2. Assess whether these sources are likely to provide consistent or contradictory information\n"
        "3. Identify any potential biases in these sources (e.g., company's own website may present favorable information)\n"
        "4. Recommend the final set of URLs that, when used together, will provide the most accurate and complete information\n"
        "5. IMPORTANT: Only include URLs that are DIRECTLY relevant to the specific objective\n"
        "6. Exclude any URLs that contain primarily general information about the company not related to the objective\n"
        "7. Return a JSON object with: {\"verified_urls\": [\"url1\", \"url2\"], \"verification_notes\": \"explanation\"}\n"
    )

    try:
        response = mistral_client.chat.complete(
            model="mistral-small-latest",
            messages=[
                {"role": "user", "content": verification_prompt}
            ]
        )

        # Clean the response text (strip any markdown code fences around the JSON)
        cleaned_response = response.choices[0].message.content.strip()
        if cleaned_response.startswith('```'):
            cleaned_response = cleaned_response.split('```')[1]
            if cleaned_response.startswith('json'):
                cleaned_response = cleaned_response[4:]
        cleaned_response = cleaned_response.strip()

        try:
            # Parse JSON response
            result = json.loads(cleaned_response)
            if isinstance(result, dict) and "verified_urls" in result:
                verified_urls = result["verified_urls"]
                verification_notes = result.get("verification_notes", "")

                print(f"{Colors.CYAN}Cross-verification complete:{Colors.RESET}")
                print(f"{Colors.CYAN}Notes: {verification_notes}{Colors.RESET}")
                print(f"{Colors.CYAN}Final verified URLs:{Colors.RESET}")
                for url in verified_urls:
                    print(f"- {url}")

                return verified_urls
            else:
                # Response JSON did not contain "verified_urls"; keep the original URLs
                print(f"{Colors.YELLOW}Could not parse cross-verification result. Using original URLs.{Colors.RESET}")
                return urls
        except json.JSONDecodeError:
            # If JSON parsing fails, return original URLs
            print(f"{Colors.YELLOW}Could not parse cross-verification result. Using original URLs.{Colors.RESET}")
            return urls

    except Exception as e:
        print(f"{Colors.RED}Error during cross-verification: {str(e)}{Colors.RESET}")
        return urls  # Return original URLs if cross-verification fails
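# Shape the verification prompt asks the model to return (values illustrative):
#   {"verified_urls": ["https://example.com/about"],
#    "verification_notes": "Official source; directly addresses the objective"}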

def extract_company_info(urls, prompt, company, api_key):
    """Use requests to call Firecrawl's extract endpoint with selected URLs."""
    print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")

    # Enhanced prompt for better data quality
    enhanced_prompt = (
        f"Extract accurate and verified information about {company}. "
        f"Specifically focus on: {prompt}. "
        f"IMPORTANT INSTRUCTIONS:\n"
        f"1. Only include information that is EXPLICITLY stated in the source material\n"
        f"2. Do NOT include any speculative information\n"
        f"3. If information conflicts between sources, prioritize information from the company's official website\n"
        f"4. For each piece of information, cite the specific source URL\n"
        f"5. Assign a confidence score (1-10) to each piece of information based on source reliability\n"
        f"6. ONLY include information that is DIRECTLY relevant to the specific request\n"
        f"7. EXCLUDE any tangential or general information about the company not related to the specific request\n"
        f"8. Format the response as a structured JSON with clear categories related to the request\n"
        f"9. For each data point, include both the information and its source in this format: {{\"value\": \"information\", \"source\": \"url\", \"confidence\": 8}}\n"
        f"10. If multiple sources confirm the same information, cite all sources and increase the confidence score\n"
        f"11. If you cannot find specific information requested, explicitly state that it was not found in the sources rather than providing general information"
    )

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }

    payload = {
        "urls": urls,
        "prompt": enhanced_prompt,
        "enableWebSearch": False  # Changed to False to rely only on verified URLs
    }

    try:
        # Print the payload for debugging
        print(f"{Colors.YELLOW}Request payload:{Colors.RESET}")
        print(json.dumps(payload, indent=2))

        response = requests.post(
            "https://api.firecrawl.dev/v1/extract",
            headers=headers,
            json=payload,
            timeout=30
        )

        # Print detailed response for debugging
        print(f"{Colors.YELLOW}Response status code: {response.status_code}{Colors.RESET}")
        print(f"{Colors.YELLOW}Response headers: {response.headers}{Colors.RESET}")

        data = response.json()
        print(f"{Colors.YELLOW}Response body:{Colors.RESET}")
        print(json.dumps(data, indent=2))

        if not data.get('success'):
            print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}")
            return None

        extraction_id = data.get('id')
        if not extraction_id:
            print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}")
            return None

        return poll_firecrawl_result(extraction_id, api_key)

    except requests.exceptions.RequestException as e:
        print(f"{Colors.RED}Request failed: {e}{Colors.RESET}")
        return None
    except json.JSONDecodeError as e:
        print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}")
        return None
    except Exception as e:
        print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
        return None
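# Note: as consumed here, the extract endpoint responds asynchronously -- the
# initial response is expected to carry a job id (illustrative shape only):
#   {"success": true, "id": "<extraction-id>"}
# The actual data is then fetched by polling, as implemented below.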

def poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=60):
    """Poll Firecrawl API to get the extraction result."""
    url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}"
    headers = {
        'Authorization': f'Bearer {api_key}'
    }

    print(f"{Colors.YELLOW}Waiting for extraction to complete...{Colors.RESET}")

    # Show a simple progress indicator instead of "still processing" messages
    print(f"{Colors.YELLOW}[", end="", flush=True)

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()

            if data.get('success') and data.get('data'):
                print(f"]{Colors.RESET}")  # Close the progress indicator
                print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")

                # Validate and clean the extracted data
                validated_data = validate_extracted_data(data['data'])
                print(json.dumps(validated_data, indent=2))
                return validated_data
            elif data.get('success') and not data.get('data'):
                # Show a simple progress indicator
                print(f"{Colors.YELLOW}.", end="", flush=True)
                time.sleep(interval)
            else:
                print(f"]{Colors.RESET}")  # Close the progress indicator
                print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}")
                return None

        except requests.exceptions.RequestException as e:
            print(f"]{Colors.RESET}")  # Close the progress indicator
            print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}")
            return None
        except json.JSONDecodeError as e:
            print(f"]{Colors.RESET}")  # Close the progress indicator
            print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}")
            return None
        except Exception as e:
            print(f"]{Colors.RESET}")  # Close the progress indicator
            print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}")
            return None

    print(f"]{Colors.RESET}")  # Close the progress indicator
    print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}")
    return None
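# With the defaults above (interval=5, max_attempts=60), polling waits up to
# roughly 5 minutes (60 attempts x 5 s) before giving up.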

def validate_extracted_data(data):
    """Validate and clean the extracted data to reduce misinformation."""
    if not data or not isinstance(data, dict):
        return data

    # Look for confidence scores or source information if available
    validated_data = {}

    for key, value in data.items():
        # Skip entries that indicate uncertainty
        if isinstance(value, str) and any(term in value.lower() for term in ["unknown", "unclear", "not specified", "not found", "couldn't find"]):
            continue

        # Keep entries with clear information
        validated_data[key] = value

    return validated_data
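# Example with hypothetical data: entries whose string values signal
# uncertainty are dropped:
#   validate_extracted_data({"ceo": "Jane Doe", "revenue": "not specified"})
#   -> {"ceo": "Jane Doe"}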

def main():
    company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")

    # Combine company and objective into a more specific search query
    search_query = f"{company} {objective}"
    serp_results = search_google(search_query)

    if not serp_results:
        # Fall back to just the company name
        print(f"{Colors.YELLOW}No results found. Trying broader search...{Colors.RESET}")
        serp_results = search_google(company)

        if not serp_results:
            print(f"{Colors.RED}No search results found.{Colors.RESET}")
            return

    # Select URLs with Mistral
    selected_urls = select_urls_with_mistral(company, objective, serp_results)

    if not selected_urls:
        print(f"{Colors.RED}No URLs were selected.{Colors.RESET}")
        return

    # Cross-verify the selected sources
    verified_urls = cross_verify_sources(selected_urls, company, objective)

    if not verified_urls:
        print(f"{Colors.YELLOW}No URLs were verified. Using original selected URLs.{Colors.RESET}")
        verified_urls = selected_urls

    data = extract_company_info(verified_urls, objective, company, firecrawl_api_key)

    if data:
        print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}")
    else:
        print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")

if __name__ == "__main__":
    main()
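To try the example, set the three API keys above (for instance in a .env file) and run the script with Python 3. The filename below is hypothetical; this diff does not show the file's actual path:

    python mistral_company_researcher.py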