Add query_prefix + Return TED Transcript URL for Downstream Scraping Tasks (#11090)

This commit is contained in:
Tao Wang 2024-11-25 17:32:37 -08:00 committed by GitHub
parent 60c1549771
commit af2461cccc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 83 additions and 6 deletions

View File

@ -18,6 +18,12 @@ class DuckDuckGoImageSearchTool(BuiltinTool):
"size": tool_parameters.get("size"),
"max_results": tool_parameters.get("max_results"),
}
# Prepend the optional query_prefix (e.g. "site:unsplash.com") to the search keywords.
query_prefix = tool_parameters.get("query_prefix", "").strip()
final_query = f"{query_prefix} {query_dict['keywords']}".strip()
query_dict["keywords"] = final_query
response = DDGS().images(**query_dict)
markdown_result = "\n\n"
json_result = []

View File

@ -86,3 +86,14 @@ parameters:
en_US: The size of the image to be searched.
zh_Hans: 要搜索的图片的大小
form: form
- name: query_prefix
label:
en_US: Query Prefix
zh_Hans: 查询前缀
type: string
required: false
default: ""
form: form
human_description:
en_US: Specific Search e.g. "site:unsplash.com"
zh_Hans: 定向搜索 e.g. "site:unsplash.com"

View File

@ -7,7 +7,7 @@ from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool
SUMMARY_PROMPT = """
User's query:
User's query:
{query}
Here are the news results:
@ -30,6 +30,12 @@ class DuckDuckGoNewsSearchTool(BuiltinTool):
"safesearch": "moderate",
"region": "wt-wt",
}
# Prepend the optional query_prefix (e.g. "site:msn.com") to the search keywords.
query_prefix = tool_parameters.get("query_prefix", "").strip()
final_query = f"{query_prefix} {query_dict['keywords']}".strip()
query_dict["keywords"] = final_query
try:
response = list(DDGS().news(**query_dict))
if not response:

View File

@ -69,3 +69,14 @@ parameters:
en_US: Whether to pass the news results to llm for summarization.
zh_Hans: 是否需要将新闻结果传给大模型总结
form: form
- name: query_prefix
label:
en_US: Query Prefix
zh_Hans: 查询前缀
type: string
required: false
default: ""
form: form
human_description:
en_US: Specific Search e.g. "site:msn.com"
zh_Hans: 定向搜索 e.g. "site:msn.com"

View File

@ -7,7 +7,7 @@ from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool
SUMMARY_PROMPT = """
User's query:
User's query:
{query}
Here is the search engine result:
@ -26,7 +26,12 @@ class DuckDuckGoSearchTool(BuiltinTool):
query = tool_parameters.get("query")
max_results = tool_parameters.get("max_results", 5)
require_summary = tool_parameters.get("require_summary", False)
response = DDGS().text(query, max_results=max_results)
# Prepend the optional query_prefix to the search text; summarization below still receives the original query.
query_prefix = tool_parameters.get("query_prefix", "").strip()
final_query = f"{query_prefix} {query}".strip()
response = DDGS().text(final_query, max_results=max_results)
if require_summary:
results = "\n".join([res.get("body") for res in response])
results = self.summary_results(user_id=user_id, content=results, query=query)

View File

@ -39,3 +39,14 @@ parameters:
en_US: Whether to pass the search results to llm for summarization.
zh_Hans: 是否需要将搜索结果传给大模型总结
form: form
- name: query_prefix
label:
en_US: Query Prefix
zh_Hans: 查询前缀
type: string
required: false
default: ""
form: form
human_description:
en_US: Specific Search e.g. "site:wikipedia.org"
zh_Hans: 定向搜索 e.g. "site:wikipedia.org"

View File

@ -24,7 +24,7 @@ max-width: 100%; border-radius: 8px;">
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInvokeMessage]:
query_dict = {
"keywords": tool_parameters.get("query"),
"keywords": tool_parameters.get("query"), # LLM's query
"region": tool_parameters.get("region", "wt-wt"),
"safesearch": tool_parameters.get("safesearch", "moderate"),
"timelimit": tool_parameters.get("timelimit"),
@ -40,6 +40,12 @@ max-width: 100%; border-radius: 8px;">
# Get proxy URL from parameters
proxy_url = tool_parameters.get("proxy_url", "").strip()
query_prefix = tool_parameters.get("query_prefix", "").strip()
final_query = f"{query_prefix} {query_dict['keywords']}".strip()
# Update the keywords in query_dict with the final_query
query_dict["keywords"] = final_query
response = DDGS().videos(**query_dict)
# Create HTML result with embedded iframes
@ -51,9 +57,13 @@ max-width: 100%; border-radius: 8px;">
embed_html = res.get("embed_html", "")
description = res.get("description", "")
content_url = res.get("content", "")
transcript_url = None
# Handle TED.com videos
if not embed_html and "ted.com/talks" in content_url:
if "ted.com/talks" in content_url:
# Create transcript URL
transcript_url = f"{content_url}/transcript"
# Create embed URL
embed_url = content_url.replace("www.ted.com", "embed.ted.com")
if proxy_url:
embed_url = f"{proxy_url}{embed_url}"
@ -68,8 +78,14 @@ max-width: 100%; border-radius: 8px;">
markdown_result += f"{title}\n\n"
markdown_result += f"{embed_html}\n\n"
if description:
markdown_result += f"{description}\n\n"
markdown_result += "---\n\n"
json_result.append(self.create_json_message(res))
# Add transcript_url to the JSON result if available
result_dict = res.copy()
if transcript_url:
result_dict["transcript_url"] = transcript_url
json_result.append(self.create_json_message(result_dict))
return [self.create_text_message(markdown_result)] + json_result

View File

@ -95,3 +95,14 @@ parameters:
en_US: Proxy URL
zh_Hans: 视频代理地址
form: form
- name: query_prefix
label:
en_US: Query Prefix
zh_Hans: 查询前缀
type: string
required: false
default: ""
form: form
human_description:
en_US: Specific Search e.g. "site:www.ted.com"
zh_Hans: 定向搜索 e.g. "site:www.ted.com"