Feat(tools) add tavily extract tool and enhance tavily search implementation (#10786)

This commit is contained in:
Kalo Chin 2024-11-18 10:51:34 +09:00 committed by GitHub
parent 6d532bfc02
commit 6de1f8c770
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 375 additions and 151 deletions

View File

@ -1,14 +1,12 @@
identity: identity:
author: Yash Parmar author: Yash Parmar, Kalo Chin
name: tavily name: tavily
label: label:
en_US: Tavily en_US: Tavily Search & Extract
zh_Hans: Tavily zh_Hans: Tavily 搜索和提取
pt_BR: Tavily
description: description:
en_US: Tavily en_US: A powerful AI-native search engine and web content extraction tool that provides highly relevant search results and raw content extraction from web pages.
zh_Hans: Tavily zh_Hans: 一个强大的原生AI搜索引擎和网页内容提取工具,提供高度相关的搜索结果和网页原始内容提取。
pt_BR: Tavily
icon: icon.png icon: icon.png
tags: tags:
- search - search
@ -19,13 +17,10 @@ credentials_for_provider:
label: label:
en_US: Tavily API key en_US: Tavily API key
zh_Hans: Tavily API key zh_Hans: Tavily API key
pt_BR: Tavily API key
placeholder: placeholder:
en_US: Please input your Tavily API key en_US: Please input your Tavily API key
zh_Hans: 请输入你的 Tavily API key zh_Hans: 请输入你的 Tavily API key
pt_BR: Please input your Tavily API key
help: help:
en_US: Get your Tavily API key from Tavily en_US: Get your Tavily API key from Tavily
zh_Hans: 从 TavilyApi 获取您的 Tavily API key zh_Hans: 从 TavilyApi 获取您的 Tavily API key
pt_BR: Get your Tavily API key from Tavily url: https://app.tavily.com/home
url: https://docs.tavily.com/docs/welcome

View File

@ -0,0 +1,145 @@
from typing import Any
import requests
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool
TAVILY_API_URL = "https://api.tavily.com"
class TavilyExtract:
    """
    A class for extracting content from web pages using the Tavily Extract API.

    Args:
        api_key (str): The API key for accessing the Tavily Extract API.

    Methods:
        extract_content: Retrieves extracted content from the Tavily Extract API.
    """

    def __init__(self, api_key: str) -> None:
        self.api_key = api_key

    def extract_content(self, params: dict[str, Any]) -> dict:
        """
        Retrieves extracted content from the Tavily Extract API.

        The request payload is built from scratch, so ``params`` is never
        modified (in particular the API key is not written back into it).

        Args:
            params (Dict[str, Any]): The extraction parameters. Must contain
                'urls'; may contain 'api_key' to override the instance key.

        Returns:
            dict: The decoded JSON body of the API response.

        Raises:
            ValueError: If 'urls' is missing or has an unsupported type.
            requests.HTTPError: If the API answers with an error status.
        """
        payload = self._process_params(params)

        response = requests.post(f"{TAVILY_API_URL}/extract", json=payload)
        response.raise_for_status()
        return response.json()

    def _process_params(self, params: dict[str, Any]) -> dict:
        """
        Processes and validates the extraction parameters.

        Args:
            params (Dict[str, Any]): The extraction parameters.

        Returns:
            dict: A new dict holding only 'urls' (as a list) and 'api_key'.

        Raises:
            ValueError: If 'urls' is missing, None, or neither a string nor a
                list/tuple of URLs.
        """
        urls = params.get("urls")
        if urls is None:
            raise ValueError("The 'urls' parameter is required.")

        if isinstance(urls, str):
            # Accept comma- and/or whitespace-separated URLs in one string.
            url_list = urls.replace(",", " ").split()
        elif isinstance(urls, (list, tuple)):
            url_list = list(urls)
        else:
            # Previously an unsupported type was dropped silently and the
            # request went out without any URLs at all.
            raise ValueError("The 'urls' parameter must be a string or a list of URLs.")

        # Only 'urls' and 'api_key' are forwarded; an explicit 'api_key' in
        # params takes precedence over the key given at construction time.
        return {
            "urls": url_list,
            "api_key": params.get("api_key", self.api_key),
        }
class TavilyExtractTool(BuiltinTool):
    """
    A tool for extracting content from web pages using Tavily Extract.
    """

    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage | list[ToolInvokeMessage]:
        """
        Invokes the Tavily Extract tool with the given user ID and tool parameters.

        Args:
            user_id (str): The ID of the user invoking the tool.
            tool_parameters (Dict[str, Any]): The parameters for the Tavily Extract tool.

        Returns:
            ToolInvokeMessage | list[ToolInvokeMessage]: A single text message on
            missing input, request failure or empty results; otherwise a JSON
            message with the full response followed by a markdown text message.
        """
        urls = tool_parameters.get("urls", "")
        api_key = self.runtime.credentials.get("tavily_api_key")
        if not api_key:
            return self.create_text_message(
                "Tavily API key is missing. Please set the 'tavily_api_key' in credentials."
            )
        if not urls:
            return self.create_text_message("Please input at least one URL to extract.")

        tavily_extract = TavilyExtract(api_key)
        try:
            # Pass a copy so the client can never write the API key into the
            # caller's tool_parameters.
            raw_results = tavily_extract.extract_content(dict(tool_parameters))
        except (requests.RequestException, ValueError) as e:
            # RequestException covers HTTP error statuses as well as connection
            # failures and timeouts; ValueError covers rejected parameters.
            return self.create_text_message(f"Error occurred while extracting content: {str(e)}")

        if not raw_results.get("results"):
            return self.create_text_message("No content could be extracted from the provided URLs.")

        # Always return JSON message with all data
        json_message = self.create_json_message(raw_results)
        # Human-readable markdown rendering of the same data
        text_message_content = self._format_results_as_text(raw_results)
        text_message = self.create_text_message(text=text_message_content)
        return [json_message, text_message]

    def _format_results_as_text(self, raw_results: dict) -> str:
        """
        Formats the raw extraction results into markdown text.

        Args:
            raw_results (dict): The raw extraction results; the 'results' and
                'failed_results' entries are rendered.

        Returns:
            str: The formatted markdown text.
        """
        output_lines = []
        for idx, result in enumerate(raw_results.get("results", []), 1):
            url = result.get("url", "")
            raw_content = result.get("raw_content", "")
            output_lines.append(f"## Extracted Content {idx}: {url}\n")
            output_lines.append(f"**Raw Content:**\n{raw_content}\n")
            output_lines.append("---\n")

        # List URLs the API reported as failed, with the reason when given.
        if raw_results.get("failed_results"):
            output_lines.append("## Failed URLs:\n")
            for failed in raw_results["failed_results"]:
                url = failed.get("url", "")
                error = failed.get("error", "Unknown error")
                output_lines.append(f"- {url}: {error}\n")

        return "\n".join(output_lines)

View File

@ -0,0 +1,23 @@
identity:
name: tavily_extract
author: Kalo Chin
label:
en_US: Tavily Extract
zh_Hans: Tavily Extract
description:
human:
en_US: A web extraction tool built specifically for AI agents (LLMs), delivering raw content from web pages.
zh_Hans: 专为人工智能代理 (LLM) 构建的网页提取工具,提供网页的原始内容。
llm: A tool for extracting raw content from web pages, designed for AI agents (LLMs).
parameters:
- name: urls
type: string
required: true
label:
en_US: URLs
zh_Hans: URLs
human_description:
en_US: A comma-separated list of URLs to extract content from.
zh_Hans: 要从中提取内容的 URL 的逗号分隔列表。
llm_description: A comma-separated list of URLs to extract content from.
form: llm

View File

@ -17,8 +17,6 @@ class TavilySearch:
Methods: Methods:
raw_results: Retrieves raw search results from the Tavily Search API. raw_results: Retrieves raw search results from the Tavily Search API.
results: Retrieves cleaned search results from the Tavily Search API.
clean_results: Cleans the raw search results.
""" """
def __init__(self, api_key: str) -> None: def __init__(self, api_key: str) -> None:
@ -35,63 +33,62 @@ class TavilySearch:
dict: The raw search results. dict: The raw search results.
""" """
# Ensure required parameters are set
params["api_key"] = self.api_key params["api_key"] = self.api_key
if (
"exclude_domains" in params
and isinstance(params["exclude_domains"], str)
and params["exclude_domains"] != "None"
):
params["exclude_domains"] = params["exclude_domains"].split()
else:
params["exclude_domains"] = []
if (
"include_domains" in params
and isinstance(params["include_domains"], str)
and params["include_domains"] != "None"
):
params["include_domains"] = params["include_domains"].split()
else:
params["include_domains"] = []
response = requests.post(f"{TAVILY_API_URL}/search", json=params) # Process parameters to ensure correct types
processed_params = self._process_params(params)
response = requests.post(f"{TAVILY_API_URL}/search", json=processed_params)
response.raise_for_status() response.raise_for_status()
return response.json() return response.json()
def results(self, params: dict[str, Any]) -> list[dict]: def _process_params(self, params: dict[str, Any]) -> dict:
""" """
Retrieves cleaned search results from the Tavily Search API. Processes and validates the search parameters.
Args: Args:
params (Dict[str, Any]): The search parameters. params (Dict[str, Any]): The search parameters.
Returns: Returns:
list: The cleaned search results. dict: The processed parameters.
""" """
raw_search_results = self.raw_results(params) processed_params = {}
return self.clean_results(raw_search_results["results"])
def clean_results(self, results: list[dict]) -> list[dict]: for key, value in params.items():
""" if value is None or value == "None":
Cleans the raw search results. continue
if key in ["include_domains", "exclude_domains"]:
if isinstance(value, str):
# Split the string by commas or spaces and strip whitespace
processed_params[key] = [domain.strip() for domain in value.replace(",", " ").split()]
elif key in ["include_images", "include_image_descriptions", "include_answer", "include_raw_content"]:
# Ensure boolean type
if isinstance(value, str):
processed_params[key] = value.lower() == "true"
else:
processed_params[key] = bool(value)
elif key in ["max_results", "days"]:
if isinstance(value, str):
processed_params[key] = int(value)
else:
processed_params[key] = value
elif key in ["search_depth", "topic", "query", "api_key"]:
processed_params[key] = value
else:
# Unrecognized parameter
pass
Args: # Set defaults if not present
results (list): The raw search results. processed_params.setdefault("search_depth", "basic")
processed_params.setdefault("topic", "general")
processed_params.setdefault("max_results", 5)
Returns: # If topic is 'news', ensure 'days' is set
list: The cleaned search results. if processed_params.get("topic") == "news":
processed_params.setdefault("days", 3)
""" return processed_params
clean_results = []
for result in results:
clean_results.append(
{
"url": result["url"],
"content": result["content"],
}
)
# return clean results as a string
return "\n".join([f"{res['url']}\n{res['content']}" for res in clean_results])
class TavilySearchTool(BuiltinTool): class TavilySearchTool(BuiltinTool):
@ -111,14 +108,88 @@ class TavilySearchTool(BuiltinTool):
ToolInvokeMessage | list[ToolInvokeMessage]: The result of the Tavily search tool invocation. ToolInvokeMessage | list[ToolInvokeMessage]: The result of the Tavily search tool invocation.
""" """
query = tool_parameters.get("query", "") query = tool_parameters.get("query", "")
api_key = self.runtime.credentials.get("tavily_api_key")
api_key = self.runtime.credentials["tavily_api_key"] if not api_key:
return self.create_text_message(
"Tavily API key is missing. Please set the 'tavily_api_key' in credentials."
)
if not query: if not query:
return self.create_text_message("Please input query") return self.create_text_message("Please input a query.")
tavily_search = TavilySearch(api_key) tavily_search = TavilySearch(api_key)
results = tavily_search.results(tool_parameters) try:
print(results) raw_results = tavily_search.raw_results(tool_parameters)
if not results: except requests.HTTPError as e:
return self.create_text_message(f"No results found for '{query}' in Tavily") return self.create_text_message(f"Error occurred while searching: {str(e)}")
if not raw_results.get("results"):
return self.create_text_message(f"No results found for '{query}' in Tavily.")
else: else:
return self.create_text_message(text=results) # Always return JSON message with all data
json_message = self.create_json_message(raw_results)
# Create text message based on user-selected parameters
text_message_content = self._format_results_as_text(raw_results, tool_parameters)
text_message = self.create_text_message(text=text_message_content)
return [json_message, text_message]
def _format_results_as_text(self, raw_results: dict, tool_parameters: dict[str, Any]) -> str:
"""
Formats the raw results into a markdown text based on user-selected parameters.
Args:
raw_results (dict): The raw search results.
tool_parameters (dict): The tool parameters selected by the user.
Returns:
str: The formatted markdown text.
"""
output_lines = []
# Include answer if requested
if tool_parameters.get("include_answer", False) and raw_results.get("answer"):
output_lines.append(f"**Answer:** {raw_results['answer']}\n")
# Include images if requested
if tool_parameters.get("include_images", False) and raw_results.get("images"):
output_lines.append("**Images:**\n")
for image in raw_results["images"]:
if tool_parameters.get("include_image_descriptions", False) and "description" in image:
output_lines.append(f"![{image['description']}]({image['url']})\n")
else:
output_lines.append(f"![]({image['url']})\n")
# Process each result
if "results" in raw_results:
for idx, result in enumerate(raw_results["results"], 1):
title = result.get("title", "No Title")
url = result.get("url", "")
content = result.get("content", "")
published_date = result.get("published_date", "")
score = result.get("score", "")
output_lines.append(f"### Result {idx}: [{title}]({url})\n")
# Include published date if available and topic is 'news'
if tool_parameters.get("topic") == "news" and published_date:
output_lines.append(f"**Published Date:** {published_date}\n")
output_lines.append(f"**URL:** {url}\n")
# Include score (relevance)
if score:
output_lines.append(f"**Relevance Score:** {score}\n")
# Include content
if content:
output_lines.append(f"**Content:**\n{content}\n")
# Include raw content if requested
if tool_parameters.get("include_raw_content", False) and result.get("raw_content"):
output_lines.append(f"**Raw Content:**\n{result['raw_content']}\n")
# Add a separator
output_lines.append("---\n")
return "\n".join(output_lines)

View File

@ -2,28 +2,24 @@ identity:
name: tavily_search name: tavily_search
author: Yash Parmar author: Yash Parmar
label: label:
en_US: TavilySearch en_US: Tavily Search
zh_Hans: TavilySearch zh_Hans: Tavily Search
pt_BR: TavilySearch
description: description:
human: human:
en_US: A tool for search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed. en_US: A search engine tool built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
zh_Hans: 专为人工智能代理 (LLM) 构建的搜索引擎工具,可快速提供实时、准确和真实的结果。 zh_Hans: 专为人工智能代理 (LLM) 构建的搜索引擎工具,可快速提供实时、准确和真实的结果。
pt_BR: A tool for search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
llm: A tool for search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed. llm: A tool for search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
parameters: parameters:
- name: query - name: query
type: string type: string
required: true required: true
label: label:
en_US: Query string en_US: Query
zh_Hans: 查询语句 zh_Hans: 查询
pt_BR: Query string
human_description: human_description:
en_US: used for searching en_US: The search query you want to execute with Tavily.
zh_Hans: 用于搜索网页内容 zh_Hans: 您想用 Tavily 执行的搜索查询。
pt_BR: used for searching llm_description: The search query.
llm_description: key words for searching
form: llm form: llm
- name: search_depth - name: search_depth
type: select type: select
@ -31,122 +27,118 @@ parameters:
label: label:
en_US: Search Depth en_US: Search Depth
zh_Hans: 搜索深度 zh_Hans: 搜索深度
pt_BR: Search Depth
human_description: human_description:
en_US: The depth of search results en_US: The depth of the search.
zh_Hans: 搜索结果的深度 zh_Hans: 搜索的深度。
pt_BR: The depth of search results
form: form form: form
options: options:
- value: basic - value: basic
label: label:
en_US: Basic en_US: Basic
zh_Hans: 基本 zh_Hans: 基本
pt_BR: Basic
- value: advanced - value: advanced
label: label:
en_US: Advanced en_US: Advanced
zh_Hans: 高级 zh_Hans: 高级
pt_BR: Advanced
default: basic default: basic
- name: topic
type: select
required: false
label:
en_US: Topic
zh_Hans: 主题
human_description:
en_US: The category of the search.
zh_Hans: 搜索的类别。
form: form
options:
- value: general
label:
en_US: General
zh_Hans: 一般
- value: news
label:
en_US: News
zh_Hans: 新闻
default: general
- name: days
type: number
required: false
label:
en_US: Days
zh_Hans: 天数
human_description:
en_US: The number of days back from the current date to include in the search results (only applicable when "topic" is "news").
zh_Hans: 从当前日期起向前追溯的天数,以包含在搜索结果中(仅当“topic”为“news”时适用)。
form: form
min: 1
default: 3
- name: max_results
type: number
required: false
label:
en_US: Max Results
zh_Hans: 最大结果数
human_description:
en_US: The maximum number of search results to return.
zh_Hans: 要返回的最大搜索结果数。
form: form
min: 1
max: 20
default: 5
- name: include_images - name: include_images
type: boolean type: boolean
required: false required: false
label: label:
en_US: Include Images en_US: Include Images
zh_Hans: 包含图片 zh_Hans: 包含图片
pt_BR: Include Images
human_description: human_description:
en_US: Include images in the search results en_US: Include a list of query-related images in the response.
zh_Hans: 在搜索结果中包含图片 zh_Hans: 在响应中包含与查询相关的图片列表。
pt_BR: Include images in the search results
form: form form: form
options: default: false
- value: 'true' - name: include_image_descriptions
label: type: boolean
en_US: 'Yes' required: false
zh_Hans: label:
pt_BR: 'Yes' en_US: Include Image Descriptions
- value: 'false' zh_Hans: 包含图片描述
label: human_description:
en_US: 'No' en_US: When include_images is True, adds descriptive text for each image.
zh_Hans: zh_Hans: 当 include_images 为 True 时,为每个图像添加描述文本。
pt_BR: 'No' form: form
default: 'false' default: false
- name: include_answer - name: include_answer
type: boolean type: boolean
required: false required: false
label: label:
en_US: Include Answer en_US: Include Answer
zh_Hans: 包含答案 zh_Hans: 包含答案
pt_BR: Include Answer
human_description: human_description:
en_US: Include answers in the search results en_US: Include a short answer to the original query in the response.
zh_Hans: 在搜索结果中包含答案 zh_Hans: 在响应中包含对原始查询的简短回答。
pt_BR: Include answers in the search results
form: form form: form
options: default: false
- value: 'true'
label:
en_US: 'Yes'
zh_Hans:
pt_BR: 'Yes'
- value: 'false'
label:
en_US: 'No'
zh_Hans:
pt_BR: 'No'
default: 'false'
- name: include_raw_content - name: include_raw_content
type: boolean type: boolean
required: false required: false
label: label:
en_US: Include Raw Content en_US: Include Raw Content
zh_Hans: 包含原始内容 zh_Hans: 包含原始内容
pt_BR: Include Raw Content
human_description: human_description:
en_US: Include raw content in the search results en_US: Include the cleaned and parsed HTML content of each search result.
zh_Hans: 在搜索结果中包含原始内容 zh_Hans: 包含每个搜索结果的已清理和解析的HTML内容。
pt_BR: Include raw content in the search results
form: form form: form
options: default: false
- value: 'true'
label:
en_US: 'Yes'
zh_Hans:
pt_BR: 'Yes'
- value: 'false'
label:
en_US: 'No'
zh_Hans:
pt_BR: 'No'
default: 'false'
- name: max_results
type: number
required: false
label:
en_US: Max Results
zh_Hans: 最大结果
pt_BR: Max Results
human_description:
en_US: The number of maximum search results to return
zh_Hans: 返回的最大搜索结果数
pt_BR: The number of maximum search results to return
form: form
min: 1
max: 20
default: 5
- name: include_domains - name: include_domains
type: string type: string
required: false required: false
label: label:
en_US: Include Domains en_US: Include Domains
zh_Hans: 包含域 zh_Hans: 包含域
pt_BR: Include Domains
human_description: human_description:
en_US: A list of domains to specifically include in the search results en_US: A comma-separated list of domains to specifically include in the search results.
zh_Hans: 在搜索结果中特别包含的域名列表 zh_Hans: 要在搜索结果中特别包含的域的逗号分隔列表。
pt_BR: A list of domains to specifically include in the search results
form: form form: form
- name: exclude_domains - name: exclude_domains
type: string type: string
@ -154,9 +146,7 @@ parameters:
label: label:
en_US: Exclude Domains en_US: Exclude Domains
zh_Hans: 排除域 zh_Hans: 排除域
pt_BR: Exclude Domains
human_description: human_description:
en_US: A list of domains to specifically exclude from the search results en_US: A comma-separated list of domains to specifically exclude from the search results.
zh_Hans: 从搜索结果中特别排除的域名列表 zh_Hans: 要从搜索结果中特别排除的域的逗号分隔列表。
pt_BR: A list of domains to specifically exclude from the search results
form: form form: form