From af2461cccc7ff0caf09ae73c7384022e584004ef Mon Sep 17 00:00:00 2001 From: Tao Wang <74752235+taowang1993@users.noreply.github.com> Date: Mon, 25 Nov 2024 17:32:37 -0800 Subject: [PATCH] Add query_prefix + Return TED Transcript URL for Downstream Scraping Tasks (#11090) --- .../builtin/duckduckgo/tools/ddgo_img.py | 6 +++++ .../builtin/duckduckgo/tools/ddgo_img.yaml | 11 ++++++++++ .../builtin/duckduckgo/tools/ddgo_news.py | 8 ++++++- .../builtin/duckduckgo/tools/ddgo_news.yaml | 11 ++++++++++ .../builtin/duckduckgo/tools/ddgo_search.py | 9 ++++++-- .../builtin/duckduckgo/tools/ddgo_search.yaml | 11 ++++++++++ .../builtin/duckduckgo/tools/ddgo_video.py | 22 ++++++++++++++++--- .../builtin/duckduckgo/tools/ddgo_video.yaml | 11 ++++++++++ 8 files changed, 83 insertions(+), 6 deletions(-) diff --git a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.py b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.py index 54bb38755a..b3c630878f 100644 --- a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.py +++ b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.py @@ -18,6 +18,12 @@ class DuckDuckGoImageSearchTool(BuiltinTool): "size": tool_parameters.get("size"), "max_results": tool_parameters.get("max_results"), } + + # Add query_prefix handling + query_prefix = tool_parameters.get("query_prefix", "").strip() + final_query = f"{query_prefix} {query_dict['keywords']}".strip() + query_dict["keywords"] = final_query + response = DDGS().images(**query_dict) markdown_result = "\n\n" json_result = [] diff --git a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.yaml b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.yaml index 168cface22..a543d1e218 100644 --- a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.yaml +++ b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.yaml @@ -86,3 +86,14 @@ parameters: en_US: The size of the image to be searched. zh_Hans: 要搜索的图片的大小 form: form + - name: query_prefix + label: + en_US: Query Prefix + zh_Hans: 查询前缀 + type: string + required: false + default: "" + form: form + human_description: + en_US: Specific Search e.g. "site:unsplash.com" + zh_Hans: 定向搜索 e.g. "site:unsplash.com" diff --git a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.py b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.py index 3a6fd394a8..11da6f5cf7 100644 --- a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.py +++ b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.py @@ -7,7 +7,7 @@ from core.tools.entities.tool_entities import ToolInvokeMessage from core.tools.tool.builtin_tool import BuiltinTool SUMMARY_PROMPT = """ -User's query: +User's query: {query} Here are the news results: @@ -30,6 +30,12 @@ class DuckDuckGoNewsSearchTool(BuiltinTool): "safesearch": "moderate", "region": "wt-wt", } + + # Add query_prefix handling + query_prefix = tool_parameters.get("query_prefix", "").strip() + final_query = f"{query_prefix} {query_dict['keywords']}".strip() + query_dict["keywords"] = final_query + try: response = list(DDGS().news(**query_dict)) if not response: diff --git a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.yaml b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.yaml index eb2b67b7c9..6e181e0f41 100644 --- a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.yaml +++ b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.yaml @@ -69,3 +69,14 @@ parameters: en_US: Whether to pass the news results to llm for summarization. zh_Hans: 是否需要将新闻结果传给大模型总结 form: form + - name: query_prefix + label: + en_US: Query Prefix + zh_Hans: 查询前缀 + type: string + required: false + default: "" + form: form + human_description: + en_US: Specific Search e.g. "site:msn.com" + zh_Hans: 定向搜索 e.g. "site:msn.com" diff --git a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.py b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.py index cbd65d2e77..3cd35d16a6 100644 --- a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.py +++ b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.py @@ -7,7 +7,7 @@ from core.tools.entities.tool_entities import ToolInvokeMessage from core.tools.tool.builtin_tool import BuiltinTool SUMMARY_PROMPT = """ -User's query: +User's query: {query} Here is the search engine result: @@ -26,7 +26,12 @@ class DuckDuckGoSearchTool(BuiltinTool): query = tool_parameters.get("query") max_results = tool_parameters.get("max_results", 5) require_summary = tool_parameters.get("require_summary", False) - response = DDGS().text(query, max_results=max_results) + + # Add query_prefix handling + query_prefix = tool_parameters.get("query_prefix", "").strip() + final_query = f"{query_prefix} {query}".strip() + + response = DDGS().text(final_query, max_results=max_results) if require_summary: results = "\n".join([res.get("body") for res in response]) results = self.summary_results(user_id=user_id, content=results, query=query) diff --git a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.yaml b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.yaml index 333c0cb093..54e27d9905 100644 --- a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.yaml +++ b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.yaml @@ -39,3 +39,14 @@ parameters: en_US: Whether to pass the search results to llm for summarization. zh_Hans: 是否需要将搜索结果传给大模型总结 form: form + - name: query_prefix + label: + en_US: Query Prefix + zh_Hans: 查询前缀 + type: string + required: false + default: "" + form: form + human_description: + en_US: Specific Search e.g. "site:wikipedia.org" + zh_Hans: 定向搜索 e.g. "site:wikipedia.org" diff --git a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.py b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.py index 4b74b223c1..1eef0b1ba2 100644 --- a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.py +++ b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.py @@ -24,7 +24,7 @@ max-width: 100%; border-radius: 8px;"> def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInvokeMessage]: query_dict = { - "keywords": tool_parameters.get("query"), + "keywords": tool_parameters.get("query"), # LLM's query "region": tool_parameters.get("region", "wt-wt"), "safesearch": tool_parameters.get("safesearch", "moderate"), "timelimit": tool_parameters.get("timelimit"), @@ -40,6 +40,12 @@ max-width: 100%; border-radius: 8px;"> # Get proxy URL from parameters proxy_url = tool_parameters.get("proxy_url", "").strip() + query_prefix = tool_parameters.get("query_prefix", "").strip() + final_query = f"{query_prefix} {query_dict['keywords']}".strip() + + # Update the keywords in query_dict with the final_query + query_dict["keywords"] = final_query + response = DDGS().videos(**query_dict) # Create HTML result with embedded iframes @@ -51,9 +57,13 @@ max-width: 100%; border-radius: 8px;"> embed_html = res.get("embed_html", "") description = res.get("description", "") content_url = res.get("content", "") + transcript_url = None # Handle TED.com videos - if not embed_html and "ted.com/talks" in content_url: + if "ted.com/talks" in content_url: + # Create transcript URL + transcript_url = f"{content_url}/transcript" + # Create embed URL embed_url = content_url.replace("www.ted.com", "embed.ted.com") if proxy_url: embed_url = f"{proxy_url}{embed_url}" @@ -68,8 +78,14 @@ max-width: 100%; border-radius: 8px;"> markdown_result += f"{title}\n\n" markdown_result += f"{embed_html}\n\n" + if description: + markdown_result += f"{description}\n\n" markdown_result += "---\n\n" - json_result.append(self.create_json_message(res)) + # Add transcript_url to the JSON result if available + result_dict = res.copy() + if transcript_url: + result_dict["transcript_url"] = transcript_url + json_result.append(self.create_json_message(result_dict)) return [self.create_text_message(markdown_result)] + json_result diff --git a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.yaml b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.yaml index a516d3cb98..d846244e3d 100644 --- a/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.yaml +++ b/api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.yaml @@ -95,3 +95,14 @@ parameters: en_US: Proxy URL zh_Hans: 视频代理地址 form: form + - name: query_prefix + label: + en_US: Query Prefix + zh_Hans: 查询前缀 + type: string + required: false + default: "" + form: form + human_description: + en_US: Specific Search e.g. "site:www.ted.com" + zh_Hans: 定向搜索 e.g. "site:www.ted.com"