Update Llama 4 Maverick extractor implementation
This commit is contained in:
parent 2f037fa1a7
commit 1321275102
@@ -21,26 +21,23 @@ load_dotenv()
 
 # Initialize clients
 together_api_key = os.getenv("TOGETHER_API_KEY")
+if not together_api_key:
+    print(f"{Colors.RED}Error: TOGETHER_API_KEY not found in environment variables{Colors.RESET}")
+
+client = Together(api_key=together_api_key)
 firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
 serp_api_key = os.getenv("SERP_API_KEY")
 
-if not together_api_key:
-    print(f"{Colors.RED}Warning: TOGETHER_API_KEY not found in environment variables{Colors.RESET}")
 if not firecrawl_api_key:
     print(f"{Colors.RED}Warning: FIRECRAWL_API_KEY not found in environment variables{Colors.RESET}")
 
-# Initialize Together AI client
-together_client = Together(api_key=together_api_key)
-
 def search_google(query):
     """Search Google using SerpAPI and return top results."""
     print(f"{Colors.YELLOW}Searching Google for '{query}'...{Colors.RESET}")
     search = GoogleSearch({"q": query, "api_key": serp_api_key})
-    results = search.get_dict().get("organic_results", [])
-    print(f"{Colors.CYAN}Found {len(results)} search results{Colors.RESET}")
-    return results
+    return search.get_dict().get("organic_results", [])
 
-def select_urls_with_llama(company, objective, serp_results):
+def select_urls_with_gemini(company, objective, serp_results):
     """
     Use Llama 4 Maverick to select URLs from SERP results.
     Returns a list of URLs.
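Note on the hunk above: the warning that used to fire after client construction becomes an up-front error check, and the client variable is renamed to `client`. A minimal sketch of the same fail-fast pattern, assuming one also wants to stop on a missing key (the commit itself only prints and continues):

import os
import sys

from together import Together

together_api_key = os.getenv("TOGETHER_API_KEY")
if not together_api_key:
    print("Error: TOGETHER_API_KEY not found in environment variables")
    sys.exit(1)  # exiting here is an illustrative hardening, not in the commit

client = Together(api_key=together_api_key)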
@@ -49,33 +46,47 @@ def select_urls_with_llama(company, objective, serp_results):
     serp_data = [{"title": r.get("title"), "link": r.get("link"), "snippet": r.get("snippet")}
                  for r in serp_results if r.get("link")]
 
-    print(f"{Colors.CYAN}Processing {len(serp_data)} valid search results{Colors.RESET}")
+    print(f"{Colors.CYAN}Found {len(serp_data)} search results to analyze{Colors.RESET}")
 
+    if not serp_data:
+        print(f"{Colors.YELLOW}No search results found to analyze{Colors.RESET}")
+        return []
+
     prompt = (
-        "You are a URL selection assistant. Your task is to analyze search results and select relevant URLs.\n\n"
-        "IMPORTANT: You must respond ONLY with a JSON object containing selected URLs. Do not include any explanation or additional text.\n\n"
+        "Task: Select the most relevant URLs from search results, prioritizing official sources.\n\n"
         "Instructions:\n"
-        "1. Analyze the search results for information about the specified company\n"
-        "2. Select URLs that are most likely to contain the requested information\n"
-        "3. Return EXACTLY in this format: {\"selected_urls\": [\"url1\", \"url2\"]}\n"
-        "4. Do not include social media links\n"
-        "5. DO NOT include any explanation or analysis in your response\n"
-        "6. ONLY output the JSON object\n\n"
+        "1. PRIORITIZE official company websites, documentation, and press releases first\n"
+        "2. Select ONLY URLs that directly contain information about the requested topic\n"
+        "3. Return ONLY a JSON object with the following structure: {\"selected_urls\": [\"url1\", \"url2\"]}\n"
+        "4. Do not include social media links (Twitter, LinkedIn, Facebook, etc.)\n"
+        "5. Exclude any LinkedIn URLs as they cannot be accessed\n"
+        "6. Select a MAXIMUM of 3 most relevant URLs\n"
+        "7. Order URLs by relevance: official sources first, then trusted news/industry sources\n"
+        "8. IMPORTANT: Only output the JSON object, no other text or explanation\n\n"
         f"Company: {company}\n"
         f"Information Needed: {objective}\n"
         f"Search Results: {json.dumps(serp_data, indent=2)}\n\n"
-        "YOUR RESPONSE MUST BE ONLY THE JSON OBJECT. NO OTHER TEXT."
+        "Response Format: {\"selected_urls\": [\"https://example.com\", \"https://example2.com\"]}\n\n"
+        "Remember: Prioritize OFFICIAL sources and limit to 3 MOST RELEVANT URLs only."
     )
 
     try:
-        print(f"{Colors.YELLOW}Asking Llama to analyze URLs...{Colors.RESET}")
-        response = together_client.chat.completions.create(
+        print(f"{Colors.YELLOW}Calling Together AI model...{Colors.RESET}")
+        response = client.chat.completions.create(
             model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
             messages=[{"role": "user", "content": prompt}],
-            temperature=0.1 # Lower temperature for more focused responses
         )
+        print(f"{Colors.GREEN}Got response from Together AI{Colors.RESET}")
+        print(f"{Colors.CYAN}Raw response: {response.choices[0].message.content}{Colors.RESET}")
 
         cleaned_response = response.choices[0].message.content.strip()
-        print(f"{Colors.MAGENTA}Llama response: {cleaned_response}{Colors.RESET}")
+        # Find the JSON object in the response
+        import re
+        json_match = re.search(r'\{[\s\S]*"selected_urls"[\s\S]*\}', cleaned_response)
+        if json_match:
+            cleaned_response = json_match.group(0)
+            print(f"{Colors.CYAN}Extracted JSON: {cleaned_response}{Colors.RESET}")
 
         # Clean the response text
         if cleaned_response.startswith('```'):
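The added `re.search` grabs the span from the first `{` to the last `}` that brackets a "selected_urls" key before any fence stripping. A standalone sketch of that extraction, with an invented sample reply:

import json
import re

# Hypothetical model reply wrapped in prose, as LLMs often produce.
reply = 'Sure! Here are the URLs:\n{"selected_urls": ["https://example.com/about"]}\nHope that helps.'

# Same pattern as the commit: everything between the first '{' and the last
# '}' surrounding a "selected_urls" key. Note [\s\S]* is greedy, so a reply
# containing two JSON objects would match across both of them.
match = re.search(r'\{[\s\S]*"selected_urls"[\s\S]*\}', reply)
if match:
    urls = json.loads(match.group(0))["selected_urls"]
    print(urls)  # ['https://example.com/about']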
@@ -84,24 +95,16 @@ def select_urls_with_llama(company, objective, serp_results):
             cleaned_response = cleaned_response[4:]
         cleaned_response = cleaned_response.strip()
 
-        # Try to find JSON object in the response
-        json_start = cleaned_response.find('{')
-        json_end = cleaned_response.rfind('}') + 1
-        if json_start != -1 and json_end != -1:
-            cleaned_response = cleaned_response[json_start:json_end]
-
         try:
             # Parse JSON response
             result = json.loads(cleaned_response)
             if isinstance(result, dict) and "selected_urls" in result:
                 urls = result["selected_urls"]
             else:
-                print(f"{Colors.YELLOW}Response not in expected format. Falling back to text parsing...{Colors.RESET}")
-                # Fallback to text parsing
-                urls = [line.strip() for line in cleaned_response.split('\n')
-                        if line.strip().startswith(('http://', 'https://'))]
-        except json.JSONDecodeError:
-            print(f"{Colors.YELLOW}Could not parse JSON response. Falling back to text parsing...{Colors.RESET}")
+                print(f"{Colors.YELLOW}Response did not contain the expected 'selected_urls' key{Colors.RESET}")
+                urls = []
+        except json.JSONDecodeError as e:
+            print(f"{Colors.YELLOW}Failed to parse JSON: {str(e)}{Colors.RESET}")
             # Fallback to text parsing
             urls = [line.strip() for line in cleaned_response.split('\n')
                     if line.strip().startswith(('http://', 'https://'))]
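With this change, valid JSON that lacks the expected key now yields an empty list, and the line-based URL scan runs only on a JSONDecodeError. A self-contained sketch of both paths; the helper name and sample inputs are hypothetical:

import json

def parse_selected_urls(cleaned_response):
    # Hypothetical helper mirroring the logic above: structured parse first,
    # plain-text URL scan only when the reply is not valid JSON.
    try:
        result = json.loads(cleaned_response)
        if isinstance(result, dict) and "selected_urls" in result:
            return result["selected_urls"]
        return []  # valid JSON but missing the expected key
    except json.JSONDecodeError:
        # Fallback: keep any line that looks like a bare URL.
        return [line.strip() for line in cleaned_response.split('\n')
                if line.strip().startswith(('http://', 'https://'))]

print(parse_selected_urls('{"selected_urls": ["https://example.com"]}'))
print(parse_selected_urls('https://example.com\nnot a url'))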
@@ -121,7 +124,7 @@ def select_urls_with_llama(company, objective, serp_results):
         return cleaned_urls
 
     except Exception as e:
-        print(f"{Colors.RED}Error with Together AI API call: {str(e)}{Colors.RESET}")
+        print(f"{Colors.RED}Error calling Together AI: {str(e)}{Colors.RESET}")
         return []
 
     except Exception as e:
@@ -144,13 +147,18 @@ def extract_company_info(urls, prompt, company, api_key):
     }
 
     try:
+        print(f"{Colors.CYAN}Making request to Firecrawl API...{Colors.RESET}")
         response = requests.post(
             "https://api.firecrawl.dev/v1/extract",
             headers=headers,
             json=payload,
-            timeout=30
+            timeout=120 # Increased timeout to 120 seconds
         )
 
+        if response.status_code != 200:
+            print(f"{Colors.RED}API returned status code {response.status_code}: {response.text}{Colors.RESET}")
+            return None
+
         data = response.json()
 
         if not data.get('success'):
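For reference, the request shape around the new status check as a runnable sketch. The endpoint, timeout, and status handling come from this diff; the payload fields and Bearer authorization are assumptions about how the script builds them:

import requests

api_key = "fc-..."  # placeholder key
headers = {
    "Authorization": f"Bearer {api_key}",  # assumed header shape
    "Content-Type": "application/json",
}
payload = {
    # Assumed fields; the real payload is built earlier in the script.
    "urls": ["https://example.com/about"],
    "prompt": "Extract the company mission statement",
}

response = requests.post(
    "https://api.firecrawl.dev/v1/extract",
    headers=headers,
    json=payload,
    timeout=120,  # matches the increased timeout in this commit
)
if response.status_code != 200:
    raise RuntimeError(f"Firecrawl returned {response.status_code}: {response.text}")
data = response.json()
print(data.get("success"))  # field names beyond 'success' are not shown in the diff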
@@ -162,8 +170,12 @@ def extract_company_info(urls, prompt, company, api_key):
             print(f"{Colors.RED}No extraction ID found in response.{Colors.RESET}")
             return None
 
-        return poll_firecrawl_result(extraction_id, api_key)
+        return poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=120) # Increased polling attempts
 
+    except requests.exceptions.Timeout:
+        print(f"{Colors.RED}Request timed out. The operation might still be processing in the background.{Colors.RESET}")
+        print(f"{Colors.YELLOW}You may want to try again with fewer URLs or a more specific prompt.{Colors.RESET}")
+        return None
     except requests.exceptions.RequestException as e:
         print(f"{Colors.RED}Request failed: {e}{Colors.RESET}")
         return None
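`poll_firecrawl_result` itself is outside this diff, so the new `interval=5, max_attempts=120` arguments are only visible at the call site. A plausible shape for such a helper, where the status endpoint and response fields are assumptions:

import time

import requests

def poll_firecrawl_result(extraction_id, api_key, interval=5, max_attempts=120):
    # Sketch only: the endpoint and field names are assumptions, not taken
    # from this diff. Polls until the job completes or attempts run out.
    url = f"https://api.firecrawl.dev/v1/extract/{extraction_id}"  # assumed
    headers = {"Authorization": f"Bearer {api_key}"}
    for _ in range(max_attempts):
        data = requests.get(url, headers=headers, timeout=30).json()
        if data.get("status") == "completed":  # assumed status field
            return data.get("data")
        if data.get("status") == "failed":
            return None
        time.sleep(interval)
    return None  # gave up after max_attempts * interval seconds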
@@ -223,7 +235,7 @@ def main():
         print(f"{Colors.RED}No search results found.{Colors.RESET}")
         return
 
-    selected_urls = select_urls_with_llama(company, objective, serp_results)
+    selected_urls = select_urls_with_gemini(company, objective, serp_results)
 
     if not selected_urls:
         print(f"{Colors.RED}No URLs were selected.{Colors.RESET}")