From 2f037fa1a7f8d84257442610582c572631a09b74 Mon Sep 17 00:00:00 2001
From: Aparup Ganguly
Date: Mon, 7 Apr 2025 19:00:10 +0530
Subject: [PATCH 1/2] Add examples/llama4-maverick-web-extractor

---
 .../.env.example                              |  11 +
 .../llama-4-maverick-web-extractor/.gitignore |   1 +
 .../llama-4-maverick-web-extractor/README.md  |  94 ++++++
 .../llama-4-maverick-extractor.py             | 240 ++++++++++++++++++
 .../requirements.txt                          |   4 +
 5 files changed, 350 insertions(+)
 create mode 100644 examples/llama-4-maverick-web-extractor/.env.example
 create mode 100644 examples/llama-4-maverick-web-extractor/.gitignore
 create mode 100644 examples/llama-4-maverick-web-extractor/README.md
 create mode 100644 examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py
 create mode 100644 examples/llama-4-maverick-web-extractor/requirements.txt

diff --git a/examples/llama-4-maverick-web-extractor/.env.example b/examples/llama-4-maverick-web-extractor/.env.example
new file mode 100644
index 00000000..55db5bce
--- /dev/null
+++ b/examples/llama-4-maverick-web-extractor/.env.example
@@ -0,0 +1,11 @@
+# Together AI API Key (Required)
+# Get it from: https://www.together.ai/
+TOGETHER_API_KEY=your_together_ai_key_here
+
+# SerpAPI Key (Required)
+# Get it from: https://serpapi.com/
+SERP_API_KEY=your_serpapi_key_here
+
+# Firecrawl API Key (Required)
+# Get it from: https://firecrawl.dev/
+FIRECRAWL_API_KEY=your_firecrawl_key_here
\ No newline at end of file
diff --git a/examples/llama-4-maverick-web-extractor/.gitignore b/examples/llama-4-maverick-web-extractor/.gitignore
new file mode 100644
index 00000000..0519ecba
--- /dev/null
+++ b/examples/llama-4-maverick-web-extractor/.gitignore
@@ -0,0 +1 @@
+ 
\ No newline at end of file
diff --git a/examples/llama-4-maverick-web-extractor/README.md b/examples/llama-4-maverick-web-extractor/README.md
new file mode 100644
index 00000000..c2be6744
--- /dev/null
+++ b/examples/llama-4-maverick-web-extractor/README.md
@@ -0,0 +1,94 @@
+# Web Information Extractor with Llama 4 Maverick
+
+This tool uses Llama 4 Maverick (via Together AI), SerpAPI, and Firecrawl to automatically extract structured information about companies from the web. Given a company name and a research objective, it searches Google, has the model select the most relevant result URLs, and extracts the requested information from those pages.
+
+## Features
+
+- Automated Google search using SerpAPI
+- Intelligent URL selection using Llama 4 Maverick
+- Structured data extraction using Firecrawl
+- Color-coded console output for better readability
+
+## Prerequisites
+
+- Python 3.8+
+- Together AI API key
+- SerpAPI API key
+- Firecrawl API key
+
+## Installation
+
+1. Clone the repository:
+
+```bash
+git clone
+cd
+```
+
+2. Install dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+3. Copy the example environment file and fill in your API keys:
+
+```bash
+cp .env.example .env
+```
+
+4. Edit the `.env` file with your API keys:
+
+```
+TOGETHER_API_KEY=your_together_ai_key
+SERP_API_KEY=your_serpapi_key
+FIRECRAWL_API_KEY=your_firecrawl_key
+```
+
+## Usage
+
+Run the script:
+
+```bash
+python llama-4-maverick-extractor.py
+```
+
+The script will:
+
+1. Prompt you for a company name
+2. Ask what information you want to extract
+3. Search for relevant URLs
+4. Extract and structure the requested information
+5. Display the results
+
+## Example
+
+```bash
+$ python llama-4-maverick-extractor.py
+Enter the company name: Tesla
+Enter what information you want about the company: latest electric vehicle models and their prices
+```
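+
+Firecrawl infers the JSON structure from your prompt (the script passes no explicit schema), so field names vary from run to run. A run like the one above might print something of roughly this shape, with placeholders standing in for real output:
+
+```json
+{
+  "vehicle_models": [
+    {"name": "<model name>", "price": "<price>"}
+  ]
+}
+```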
+
+## Error Handling
+
+The script includes comprehensive error handling for:
+
+- Missing API keys
+- API rate limits
+- Network issues
+- Invalid responses
+- JSON parsing errors
+
+## License
+
+MIT License - feel free to use and modify as needed.
diff --git a/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py b/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py
new file mode 100644
index 00000000..a2e5fcb9
--- /dev/null
+++ b/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py
@@ -0,0 +1,240 @@
+import os
+import json
+import time
+import requests
+from dotenv import load_dotenv
+from serpapi.google_search import GoogleSearch
+from together import Together
+
+# ANSI color codes
+class Colors:
+    CYAN = '\033[96m'
+    YELLOW = '\033[93m'
+    GREEN = '\033[92m'
+    RED = '\033[91m'
+    MAGENTA = '\033[95m'
+    BLUE = '\033[94m'
+    RESET = '\033[0m'
+
+# Load environment variables
+load_dotenv()
+
+# Initialize clients
+together_api_key = os.getenv("TOGETHER_API_KEY")
+firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
+serp_api_key = os.getenv("SERP_API_KEY")
+
+if not together_api_key:
+    print(f"{Colors.RED}Warning: TOGETHER_API_KEY not found in environment variables{Colors.RESET}")
+if not firecrawl_api_key:
+    print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}")
+
+# Initialize Together AI client
+together_client = Together(api_key=together_api_key)
+
+def search_google(query):
+    """Search Google using SerpAPI and return top results."""
+    print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
+    search = GoogleSearch({"q": query, "api_key": serp_api_key})
+    results = search.get_dict().get("organic_results", [])
+    print(f"{Colors.CYAN}Found {len(results)} search results{Colors.RESET}")
+    return results
+
+def select_urls_with_llama(company, objective, serp_results):
+    """
+    Use Llama 4 Maverick to select URLs from SERP results.
+    Returns a list of URLs.
+    """
+    try:
+        serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
+                     for r in serp_results if r.get("link")]
+
+        print(f"{Colors.CYAN}Processing {len(serp_data)} valid search results{Colors.RESET}")
+
+        prompt = (
+            "You are a URL selection assistant. Your task is to analyze search results and select relevant URLs.\n\n"
+            "IMPORTANT: You must respond ONLY with a JSON object containing selected URLs. Do not include any explanation or additional text.\n\n"
+            "Instructions:\n"
+            "1. Analyze the search results for information about the specified company\n"
+            "2. Select URLs that are most likely to contain the requested information\n"
+            "3. Return EXACTLY in this format: {\"selected_urls\": [\"url1\", \"url2\"]}\n"
+            "4. Do not include social media links\n"
+            "5. DO NOT include any explanation or analysis in your response\n"
+            "6. ONLY output the JSON object\n\n"
+            f"Company: {company}\n"
+            f"Information Needed: {objective}\n"
+            f"Search Results: {json.dumps(serp_data, indent=2)}\n\n"
+            "YOUR RESPONSE MUST BE ONLY THE JSON OBJECT. NO OTHER TEXT."
+        )
+
+        try:
+            print(f"{Colors.YELLOW}Asking Llama to analyze URLs...{Colors.RESET}")
+            response = together_client.chat.completions.create(
+                model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.1 # Lower temperature for more focused responses
+            )
+            cleaned_response = response.choices[0].message.content.strip()
+            print(f"{Colors.MAGENTA}Llama response: {cleaned_response}{Colors.RESET}")
+
+            # Clean the response text
+            if cleaned_response.startswith('```'):
+                cleaned_response = cleaned_response.split('```')[1]
+            if cleaned_response.startswith('json'):
+                cleaned_response = cleaned_response[4:]
+            cleaned_response = cleaned_response.strip()
+
+            # Try to find JSON object in the response
+            json_start = cleaned_response.find('{')
+            json_end = cleaned_response.rfind('}') + 1
+            if json_start != -1 and json_end > json_start:
+                cleaned_response = cleaned_response[json_start:json_end]
+
+            try:
+                # Parse JSON response
+                result = json.loads(cleaned_response)
+                if isinstance(result, dict) and "selected_urls" in result:
+                    urls = result["selected_urls"]
+                else:
+                    print(f"{Colors.YELLOW}Response not in expected format. Falling back to text parsing...{Colors.RESET}")
+                    # Fallback to text parsing
+                    urls = [line.strip() for line in cleaned_response.split('\n')
+                            if line.strip().startswith(('http://', 'https://'))]
+            except json.JSONDecodeError:
+                print(f"{Colors.YELLOW}Could not parse JSON response. Falling back to text parsing...{Colors.RESET}")
+                # Fallback to text parsing
+                urls = [line.strip() for line in cleaned_response.split('\n')
+                        if line.strip().startswith(('http://', 'https://'))]
+
+            # Clean up URLs
+            cleaned_urls = [url.replace('/*', '').rstrip('/') for url in urls]
+            cleaned_urls = [url for url in cleaned_urls if url]
+
+            if not cleaned_urls:
+                print(f"{Colors.YELLOW}No valid URLs found in response.{Colors.RESET}")
+                return []
+
+            print(f"{Colors.CYAN}Selected URLs for extraction:{Colors.RESET}")
+            for url in cleaned_urls:
+                print(f"- {url}")
+
+            return cleaned_urls
+
+        except Exception as e:
+            print(f"{Colors.RED}Error with Together AI API call: {str(e)}{Colors.RESET}")
+            return []
+
+    except Exception as e:
+        print(f"{Colors.RED}Error selecting URLs: {str(e)}{Colors.RESET}")
+        return []
+
+def extract_company_info(urls, prompt, company, api_key):
+    """Use requests to call Firecrawl's extract endpoint with selected URLs."""
+    print(f"{Colors.YELLOW}Extracting structured data from the provided URLs using Firecrawl...{Colors.RESET}")
+
+    headers = {
+        'Content-Type': 'application/json',
+        'Authorization': f'Bearer {api_key}'
+    }
+
+    payload = {
+        "urls": urls,
+        "prompt": prompt + " for " + company,
+        "enableWebSearch": True
+    }
+
+    try:
+        response = requests.post(
+            "https://api.firecrawl.dev/v1/extract",
+            headers=headers,
+            json=payload,
+            timeout=30
+        )
+
+        data = response.json()
+
+        if not data.get('success'):
+            print(f"{Colors.RED}API returned error: {data.get('error', 'No error message')}{Colors.RESET}")
+            return None
+
+        extraction_id = data.get('id')
+        if not extraction_id:
+            print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}")
+            return None
+
+        return poll_firecrawl_result(extraction_id, api_key)
+
+    except requests.exceptions.RequestException as e:
+        print(f"{Colors.RED}Request failed: {e}{Colors.RESET}")
+        return None
+    except json.JSONDecodeError as e:
+        print(f"{Colors.RED}Failed to parse response: {e}{Colors.RESET}")
+        return None
+    except Exception as e:
+        print(f"{Colors.RED}Failed to extract data: {e}{Colors.RESET}")
+        return None
+
+def poll_firecrawl_result(extraction_id, api_key, interval=10, max_attempts=60):
+    """Poll Firecrawl API to get the extraction result."""
+    url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}"
+    headers = {
+        'Authorization': f'Bearer {api_key}'
+    }
+
+    print(f"{Colors.YELLOW}Waiting for extraction to complete...{Colors.RESET}")
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            response = requests.get(url, headers=headers, timeout=30)
+            response.raise_for_status()
+            data = response.json()
+
+            if data.get('success') and data.get('data'):
+                print(f"{Colors.GREEN}Data successfully extracted:{Colors.RESET}")
+                print(json.dumps(data['data'], indent=2))
+                return data['data']
+            elif data.get('success') and not data.get('data'):
+                if attempt % 6 == 0:
+                    print(f"{Colors.YELLOW}Still processing... (attempt {attempt}/{max_attempts}){Colors.RESET}")
+                time.sleep(interval)
+            else:
+                print(f"{Colors.RED}API Error: {data.get('error', 'No error message provided')}{Colors.RESET}")
+                return None
+
+        except requests.exceptions.RequestException as e:
+            print(f"{Colors.RED}Request error: {str(e)}{Colors.RESET}")
+            return None
+        except json.JSONDecodeError as e:
+            print(f"{Colors.RED}JSON parsing error: {str(e)}{Colors.RESET}")
+            return None
+        except Exception as e:
+            print(f"{Colors.RED}Unexpected error: {str(e)}{Colors.RESET}")
+            return None
+
+    print(f"{Colors.RED}Max polling attempts reached. Extraction did not complete in time.{Colors.RESET}")
+    return None
+
+def main():
+    company = input(f"{Colors.BLUE}Enter the company name: {Colors.RESET}")
+    objective = input(f"{Colors.BLUE}Enter what information you want about the company: {Colors.RESET}")
+
+    serp_results = search_google(f"{company}")
+    if not serp_results:
+        print(f"{Colors.RED}No search results found.{Colors.RESET}")
+        return
+
+    selected_urls = select_urls_with_llama(company, objective, serp_results)
+
+    if not selected_urls:
+        print(f"{Colors.RED}No URLs were selected.{Colors.RESET}")
+        return
+
+    data = extract_company_info(selected_urls, objective, company, firecrawl_api_key)
+
+    if data:
+        print(f"{Colors.GREEN}Extraction completed successfully.{Colors.RESET}")
+    else:
+        print(f"{Colors.RED}Failed to extract the requested information. Try refining your prompt or choosing a different company.{Colors.RESET}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/llama-4-maverick-web-extractor/requirements.txt b/examples/llama-4-maverick-web-extractor/requirements.txt
new file mode 100644
index 00000000..33272430
--- /dev/null
+++ b/examples/llama-4-maverick-web-extractor/requirements.txt
@@ -0,0 +1,4 @@
+together>=0.2.5
+python-dotenv>=1.0.0
+requests>=2.31.0
+google-search-results>=2.4.2
\ No newline at end of file

From 132127510209183ad224d23380fed0cf2f628bad Mon Sep 17 00:00:00 2001
From: Aparup Ganguly
Date: Tue, 8 Apr 2025 20:52:06 +0530
Subject: [PATCH 2/2] Update Llama 4 Maverick extractor implementation

---
 .../llama-4-maverick-extractor.py             | 92 +++++++++++--------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py b/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py
index a2e5fcb9..eb87f871 100644
--- a/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py
+++ b/examples/llama-4-maverick-web-extractor/llama-4-maverick-extractor.py
@@ -21,26 +21,23 @@ load_dotenv()
 # Initialize clients
 together_api_key = os.getenv("TOGETHER_API_KEY")
+if not together_api_key:
+    print(f"{Colors.RED}Error: TOGETHER_API_KEY not found in environment variables{Colors.RESET}")
+
+client = Together(api_key=together_api_key)
 firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
 serp_api_key = os.getenv("SERP_API_KEY")
 
-if not together_api_key:
-    print(f"{Colors.RED}Warning: TOGETHER_API_KEY not found in environment variables{Colors.RESET}")
 if not firecrawl_api_key:
     print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}")
 
-# Initialize Together AI client
-together_client = Together(api_key=together_api_key)
-
 def search_google(query):
     """Search Google using SerpAPI and return top results."""
     print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
     search = GoogleSearch({"q": query, "api_key": serp_api_key})
-    results = search.get_dict().get("organic_results", [])
-    print(f"{Colors.CYAN}Found {len(results)} search results{Colors.RESET}")
-    return results
+    return search.get_dict().get("organic_results", [])
 
 def select_urls_with_llama(company, objective, serp_results):
     """
     Use Llama 4 Maverick to select URLs from SERP results.
     Returns a list of URLs.
@@ -49,33 +46,47 @@
     try:
         serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
                      for r in serp_results if r.get("link")]
 
-        print(f"{Colors.CYAN}Processing {len(serp_data)} valid search results{Colors.RESET}")
+        print(f"{Colors.CYAN}Found {len(serp_data)} search results to analyze{Colors.RESET}")
+
+        if not serp_data:
+            print(f"{Colors.YELLOW}No search results found to analyze{Colors.RESET}")
+            return []
 
         prompt = (
-            "You are a URL selection assistant. Your task is to analyze search results and select relevant URLs.\n\n"
-            "IMPORTANT: You must respond ONLY with a JSON object containing selected URLs. Do not include any explanation or additional text.\n\n"
+            "Task: Select the most relevant URLs from search results, prioritizing official sources.\n\n"
             "Instructions:\n"
-            "1. Analyze the search results for information about the specified company\n"
-            "2. Select URLs that are most likely to contain the requested information\n"
-            "3. Return EXACTLY in this format: {\"selected_urls\": [\"url1\", \"url2\"]}\n"
-            "4. Do not include social media links\n"
-            "5. DO NOT include any explanation or analysis in your response\n"
-            "6. ONLY output the JSON object\n\n"
+            "1. PRIORITIZE official company websites, documentation, and press releases first\n"
+            "2. Select ONLY URLs that directly contain information about the requested topic\n"
+            "3. Return ONLY a JSON object with the following structure: {\"selected_urls\": [\"url1\", \"url2\"]}\n"
+            "4. Do not include social media links (Twitter, LinkedIn, Facebook, etc.)\n"
+            "5. Exclude any LinkedIn URLs as they cannot be accessed\n"
+            "6. Select a MAXIMUM of 3 most relevant URLs\n"
+            "7. Order URLs by relevance: official sources first, then trusted news/industry sources\n"
+            "8. IMPORTANT: Only output the JSON object, no other text or explanation\n\n"
             f"Company: {company}\n"
             f"Information Needed: {objective}\n"
             f"Search Results: {json.dumps(serp_data, indent=2)}\n\n"
-            "YOUR RESPONSE MUST BE ONLY THE JSON OBJECT. NO OTHER TEXT."
+            "Response Format: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}\n\n"
+            "Remember: Prioritize OFFICIAL sources and limit to 3 MOST RELEVANT URLs only."
         )
-
+
         try:
-            print(f"{Colors.YELLOW}Asking Llama to analyze URLs...{Colors.RESET}")
-            response = together_client.chat.completions.create(
+            print(f"{Colors.YELLOW}Calling Together AI model...{Colors.RESET}")
+            response = client.chat.completions.create(
                 model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
                 messages=[{"role": "user", "content": prompt}],
-                temperature=0.1 # Lower temperature for more focused responses
             )
+            print(f"{Colors.GREEN}Got response from Together AI{Colors.RESET}")
+            print(f"{Colors.CYAN}Raw response: {response.choices[0].message.content}{Colors.RESET}")
+
             cleaned_response = response.choices[0].message.content.strip()
-            print(f"{Colors.MAGENTA}Llama response: {cleaned_response}{Colors.RESET}")
+
+            # Find the JSON object in the response
+            import re
+            json_match = re.search(r'\{[\s\S]*"selected_urls"[\s\S]*\}', cleaned_response)
+            if json_match:
+                cleaned_response = json_match.group(0)
+                print(f"{Colors.CYAN}Extracted JSON: {cleaned_response}{Colors.RESET}")
 
             # Clean the response text
             if cleaned_response.startswith('```'):
@@ -84,24 +95,16 @@
                 cleaned_response = cleaned_response[4:]
             cleaned_response = cleaned_response.strip()
 
-            # Try to find JSON object in the response
-            json_start = cleaned_response.find('{')
-            json_end = cleaned_response.rfind('}') + 1
-            if json_start != -1 and json_end > json_start:
-                cleaned_response = cleaned_response[json_start:json_end]
-
             try:
                 # Parse JSON response
                 result = json.loads(cleaned_response)
                 if isinstance(result, dict) and "selected_urls" in result:
                     urls = result["selected_urls"]
                 else:
-                    print(f"{Colors.YELLOW}Response not in expected format. Falling back to text parsing...{Colors.RESET}")
-                    # Fallback to text parsing
-                    urls = [line.strip() for line in cleaned_response.split('\n')
-                            if line.strip().startswith(('http://', 'https://'))]
-            except json.JSONDecodeError:
-                print(f"{Colors.YELLOW}Could not parse JSON response. Falling back to text parsing...{Colors.RESET}")
+                    print(f"{Colors.YELLOW}Response did not contain the expected 'selected_urls' key{Colors.RESET}")
+                    urls = []
+            except json.JSONDecodeError as e:
+                print(f"{Colors.YELLOW}Failed to parse JSON: {str(e)}{Colors.RESET}")
                 # Fallback to text parsing
                 urls = [line.strip() for line in cleaned_response.split('\n')
                         if line.strip().startswith(('http://', 'https://'))]
@@ -121,7 +124,7 @@
             return cleaned_urls
 
         except Exception as e:
-            print(f"{Colors.RED}Error with Together AI API call: {str(e)}{Colors.RESET}")
+            print(f"{Colors.RED}Error calling Together AI: {str(e)}{Colors.RESET}")
             return []
 
     except Exception as e:
@@ -144,13 +147,18 @@
     }
 
     try:
+        print(f"{Colors.CYAN}Making request to Firecrawl API...{Colors.RESET}")
         response = requests.post(
             "https://api.firecrawl.dev/v1/extract",
             headers=headers,
             json=payload,
-            timeout=30
+            timeout=120 # Increased timeout to 120 seconds
         )
 
+        if response.status_code != 200:
+            print(f"{Colors.RED}API returned status code {response.status_code}: {response.text}{Colors.RESET}")
+            return None
+
         data = response.json()
 
         if not data.get('success'):
@@ -162,8 +170,12 @@
             print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}")
             return None
 
-        return poll_firecrawl_result(extraction_id, api_key)
+        return poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=120) # Increased polling attempts
 
+    except requests.exceptions.Timeout:
+        print(f"{Colors.RED}Request timed out. The operation might still be processing in the background.{Colors.RESET}")
+        print(f"{Colors.YELLOW}You may want to try again with fewer URLs or a more specific prompt.{Colors.RESET}")
+        return None
     except requests.exceptions.RequestException as e:
         print(f"{Colors.RED}Request failed: {e}{Colors.RESET}")
         return None
@@ -223,7 +235,7 @@ def main():
         print(f"{Colors.RED}No search results found.{Colors.RESET}")
         return
 
     selected_urls = select_urls_with_llama(company, objective, serp_results)
 
     if not selected_urls:
         print(f"{Colors.RED}No URLs were selected.{Colors.RESET}")