Update Llama 4 Maverick extractor implementation

commit 1321275102
parent 2f037fa1a7
Author: Aparup Ganguly
Date:   2025-04-08 20:52:06 +05:30


@@ -21,26 +21,23 @@ load_dotenv()
 # Initialize clients
 together_api_key = os.getenv("TOGETHER_API_KEY")
+if not together_api_key:
+    print(f"{Colors.RED}Error: TOGETHER_API_KEY not found in environment variables{Colors.RESET}")
+client = Together(api_key=together_api_key)
 firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
 serp_api_key = os.getenv("SERP_API_KEY")
-if not together_api_key:
-    print(f"{Colors.RED}Warning: TOGETHER_API_KEY not found in environment variables{Colors.RESET}")
 if not firecrawl_api_key:
     print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}")

-# Initialize Together AI client
-together_client = Together(api_key=together_api_key)
-
 def search_google(query):
     """Search Google using SerpAPI and return top results."""
     print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
     search = GoogleSearch({"q": query, "api_key": serp_api_key})
-    results = search.get_dict().get("organic_results", [])
-    print(f"{Colors.CYAN}Found {len(results)} search results{Colors.RESET}")
-    return results
+    return search.get_dict().get("organic_results", [])

-def select_urls_with_llama(company, objective, serp_results):
+def select_urls_with_gemini(company, objective, serp_results):
     """
     Use Llama 4 Maverick to select URLs from SERP results.
     Returns a list of URLs.
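Note on the new key check: the updated code prints an error when TOGETHER_API_KEY is missing but still constructs the client, so the failure only surfaces on the first API call. A minimal fail-fast sketch (not part of this commit; the sys.exit variant is an assumption):

import os
import sys

from together import Together

together_api_key = os.getenv("TOGETHER_API_KEY")
if not together_api_key:
    # Hypothetical hardening: abort immediately instead of failing later
    # on the first chat.completions.create() call.
    sys.exit("Error: TOGETHER_API_KEY not found in environment variables")
client = Together(api_key=together_api_key)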
@@ -49,33 +46,47 @@ def select_urls_with_llama(company, objective, serp_results):
         serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
                      for r in serp_results if r.get("link")]
-        print(f"{Colors.CYAN}Processing {len(serp_data)} valid search results{Colors.RESET}")
+        print(f"{Colors.CYAN}Found {len(serp_data)} search results to analyze{Colors.RESET}")
+
+        if not serp_data:
+            print(f"{Colors.YELLOW}No search results found to analyze{Colors.RESET}")
+            return []

         prompt = (
-            "You are a URL selection assistant. Your task is to analyze search results and select relevant URLs.\n\n"
-            "IMPORTANT: You must respond ONLY with a JSON object containing selected URLs. Do not include any explanation or additional text.\n\n"
+            "Task: Select the most relevant URLs from search results, prioritizing official sources.\n\n"
             "Instructions:\n"
-            "1. Analyze the search results for information about the specified company\n"
-            "2. Select URLs that are most likely to contain the requested information\n"
-            "3. Return EXACTLY in this format: {\"selected_urls\": [\"url1\", \"url2\"]}\n"
-            "4. Do not include social media links\n"
-            "5. DO NOT include any explanation or analysis in your response\n"
-            "6. ONLY output the JSON object\n\n"
+            "1. PRIORITIZE official company websites, documentation, and press releases first\n"
+            "2. Select ONLY URLs that directly contain information about the requested topic\n"
+            "3. Return ONLY a JSON object with the following structure: {\"selected_urls\": [\"url1\", \"url2\"]}\n"
+            "4. Do not include social media links (Twitter, LinkedIn, Facebook, etc.)\n"
+            "5. Exclude any LinkedIn URLs as they cannot be accessed\n"
+            "6. Select a MAXIMUM of 3 most relevant URLs\n"
+            "7. Order URLs by relevance: official sources first, then trusted news/industry sources\n"
+            "8. IMPORTANT: Only output the JSON object, no other text or explanation\n\n"
             f"Company: {company}\n"
             f"Information Needed: {objective}\n"
             f"Search Results: {json.dumps(serp_data, indent=2)}\n\n"
-            "YOUR RESPONSE MUST BE ONLY THE JSON OBJECT. NO OTHER TEXT."
+            "Response Format: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}\n\n"
+            "Remember: Prioritize OFFICIAL sources and limit to 3 MOST RELEVANT URLs only."
         )

         try:
-            print(f"{Colors.YELLOW}Asking Llama to analyze URLs...{Colors.RESET}")
-            response = together_client.chat.completions.create(
+            print(f"{Colors.YELLOW}Calling Together AI model...{Colors.RESET}")
+            response = client.chat.completions.create(
                 model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
                 messages=[{"role": "user", "content": prompt}],
+                temperature=0.1  # Lower temperature for more focused responses
             )
+            print(f"{Colors.GREEN}Got response from Together AI{Colors.RESET}")
+            print(f"{Colors.CYAN}Raw response: {response.choices[0].message.content}{Colors.RESET}")
+
             cleaned_response = response.choices[0].message.content.strip()
-            print(f"{Colors.MAGENTA}Llama response: {cleaned_response}{Colors.RESET}")
+
+            # Find the JSON object in the response
+            import re
+            json_match = re.search(r'\{[\s\S]*"selected_urls"[\s\S]*\}', cleaned_response)
+            if json_match:
+                cleaned_response = json_match.group(0)
+                print(f"{Colors.CYAN}Extracted JSON: {cleaned_response}{Colors.RESET}")

             # Clean the response text
             if cleaned_response.startswith('```'):
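Note on the new regex extraction: the greedy [\s\S]* pattern grabs everything from the first { through the last } in the output, which is fine for a single JSON object but can over-match if the model emits stray braces. A non-greedy variant that validates each candidate with json.loads might look like this (illustrative sketch; extract_selected_urls is not a function in this file):

import json
import re

def extract_selected_urls(raw: str) -> list[str]:
    """Illustrative helper: find the first parseable JSON object that
    contains a "selected_urls" list, using a non-greedy match."""
    for match in re.finditer(r'\{.*?"selected_urls".*?\}', raw, re.DOTALL):
        try:
            obj = json.loads(match.group(0))
        except json.JSONDecodeError:
            continue  # candidate was cut short; keep scanning
        if isinstance(obj, dict) and isinstance(obj.get("selected_urls"), list):
            return obj["selected_urls"]
    return []

# Example: extract_selected_urls('Sure! {"selected_urls": ["https://a.com"]}')
# -> ["https://a.com"]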
@@ -84,24 +95,16 @@ def select_urls_with_llama(company, objective, serp_results):
                 cleaned_response = cleaned_response[4:]
             cleaned_response = cleaned_response.strip()

-            # Try to find JSON object in the response
-            json_start = cleaned_response.find('{')
-            json_end = cleaned_response.rfind('}') + 1
-            if json_start != -1 and json_end != -1:
-                cleaned_response = cleaned_response[json_start:json_end]
-
             try:
                 # Parse JSON response
                 result = json.loads(cleaned_response)
                 if isinstance(result, dict) and "selected_urls" in result:
                     urls = result["selected_urls"]
                 else:
-                    print(f"{Colors.YELLOW}Response not in expected format. Falling back to text parsing...{Colors.RESET}")
-                    # Fallback to text parsing
-                    urls = [line.strip() for line in cleaned_response.split('\n')
-                            if line.strip().startswith(('http://', 'https://'))]
-            except json.JSONDecodeError:
-                print(f"{Colors.YELLOW}Could not parse JSON response. Falling back to text parsing...{Colors.RESET}")
+                    print(f"{Colors.YELLOW}Response did not contain the expected 'selected_urls' key{Colors.RESET}")
+                    urls = []
+            except json.JSONDecodeError as e:
+                print(f"{Colors.YELLOW}Failed to parse JSON: {str(e)}{Colors.RESET}")
                 # Fallback to text parsing
                 urls = [line.strip() for line in cleaned_response.split('\n')
                         if line.strip().startswith(('http://', 'https://'))]
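The fallback parser keeps only lines that begin with http:// or https://, so URLs embedded mid-sentence are dropped. A regex-based fallback would also catch those (hedged sketch; urls_from_text is illustrative, not part of the script):

import re

def urls_from_text(text: str) -> list[str]:
    """Illustrative fallback: pull URLs out of free-form model output
    even when they appear mid-line."""
    found = re.findall(r'https?://\S+', text)
    # Trim punctuation that often trails a URL in prose.
    return [u.rstrip('.,;:)]}\'"') for u in found]

# Example: urls_from_text("See https://example.com/docs, then stop.")
# -> ["https://example.com/docs"]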
@@ -121,7 +124,7 @@ def select_urls_with_llama(company, objective, serp_results):
             return cleaned_urls

         except Exception as e:
-            print(f"{Colors.RED}Error with Together AI API call: {str(e)}{Colors.RESET}")
+            print(f"{Colors.RED}Error calling Together AI: {str(e)}{Colors.RESET}")
             return []

     except Exception as e:
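cleaned_urls is produced by deduplication code that sits outside this diff. A hypothetical version of that step, consistent with the prompt's 3-URL cap, could be:

def clean_urls(urls, limit=3):
    """Hypothetical sketch of the clean-up behind `return cleaned_urls`
    above; the actual implementation is outside this diff."""
    seen = set()
    cleaned = []
    for url in urls:
        url = url.strip().rstrip('/')
        if url.startswith(('http://', 'https://')) and url not in seen:
            seen.add(url)
            cleaned.append(url)
    return cleaned[:limit]  # matches the prompt's 3-URL cap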
@@ -144,13 +147,18 @@ def extract_company_info(urls, prompt, company, api_key):
     }

     try:
+        print(f"{Colors.CYAN}Making request to Firecrawl API...{Colors.RESET}")
         response = requests.post(
             "https://api.firecrawl.dev/v1/extract",
             headers=headers,
             json=payload,
-            timeout=30
+            timeout=120  # Increased timeout to 120 seconds
         )
+
+        if response.status_code != 200:
+            print(f"{Colors.RED}API returned status code {response.status_code}: {response.text}{Colors.RESET}")
+            return None

         data = response.json()
         if not data.get('success'):
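The new status-code guard is equivalent to requests' built-in check: raise_for_status() turns any 4xx/5xx into an HTTPError, a subclass of RequestException that the existing except block already catches. A minimal sketch, with placeholder header and payload values since those are defined outside this hunk:

import requests

response = requests.post(
    "https://api.firecrawl.dev/v1/extract",
    headers={"Authorization": "Bearer fc-YOUR_KEY"},           # placeholder auth header
    json={"urls": ["https://example.com"], "prompt": "..."},   # placeholder payload
    timeout=120,
)
response.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
data = response.json()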
@@ -162,8 +170,12 @@ def extract_company_info(urls, prompt, company, api_key):
             print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}")
             return None

-        return poll_firecrawl_result(extraction_id, api_key)
+        return poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=120)  # Increased polling attempts

+    except requests.exceptions.Timeout:
+        print(f"{Colors.RED}Request timed out. The operation might still be processing in the background.{Colors.RESET}")
+        print(f"{Colors.YELLOW}You may want to try again with fewer URLs or a more specific prompt.{Colors.RESET}")
+        return None
     except requests.exceptions.RequestException as e:
         print(f"{Colors.RED}Request failed: {e}{Colors.RESET}")
         return None
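poll_firecrawl_result is defined elsewhere in this file; with interval=5 and max_attempts=120 the new call waits up to 10 minutes (5 s × 120). A sketch of what such a poller typically looks like, assuming Firecrawl's GET /v1/extract/{id} status endpoint and its status/data response fields:

import time
import requests

def poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=120):
    """Sketch of a poller matching the call above (not this file's actual
    body): check the extract job every `interval` seconds, up to
    `max_attempts` times."""
    url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}"
    headers = {"Authorization": f"Bearer {api_key}"}
    for _ in range(max_attempts):
        data = requests.get(url, headers=headers, timeout=30).json()
        if data.get("status") == "completed":
            return data.get("data")
        if data.get("status") == "failed":
            return None
        time.sleep(interval)  # still processing; wait before the next check
    return None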
@@ -223,7 +235,7 @@ def main():
         print(f"{Colors.RED}No search results found.{Colors.RESET}")
         return

-    selected_urls = select_urls_with_llama(company, objective, serp_results)
+    selected_urls = select_urls_with_gemini(company, objective, serp_results)

     if not selected_urls:
         print(f"{Colors.RED}No URLs were selected.{Colors.RESET}")