From 839ba22c90991dc4d72810aa6f6a527664db80c2 Mon Sep 17 00:00:00 2001 From: tth37 Date: Mon, 14 Apr 2025 01:49:05 +0800 Subject: [PATCH 1/5] feat: Backend for Self-Hosted/External Web Search/Loader Engines --- backend/open_webui/config.py | 23 ++++++++ backend/open_webui/main.py | 8 +++ .../open_webui/retrieval/loaders/external.py | 56 +++++++++++++++++++ backend/open_webui/retrieval/web/external.py | 45 +++++++++++++++ backend/open_webui/retrieval/web/utils.py | 8 +++ backend/open_webui/routers/retrieval.py | 9 +++ 6 files changed, 149 insertions(+) create mode 100644 backend/open_webui/retrieval/loaders/external.py create mode 100644 backend/open_webui/retrieval/web/external.py diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index bd822d06d..434b6403f 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2236,6 +2236,29 @@ FIRECRAWL_API_BASE_URL = PersistentConfig( os.environ.get("FIRECRAWL_API_BASE_URL", "https://api.firecrawl.dev"), ) +EXTERNAL_WEB_SEARCH_URL = PersistentConfig( + "EXTERNAL_WEB_SEARCH_URL", + "rag.web.search.external_web_search_url", + os.environ.get("EXTERNAL_WEB_SEARCH_URL", ""), +) + +EXTERNAL_WEB_SEARCH_API_KEY = PersistentConfig( + "EXTERNAL_WEB_SEARCH_API_KEY", + "rag.web.search.external_web_search_api_key", + os.environ.get("EXTERNAL_WEB_SEARCH_API_KEY", ""), +) + +EXTERNAL_WEB_LOADER_URL = PersistentConfig( + "EXTERNAL_WEB_LOADER_URL", + "rag.web.loader.external_web_loader_url", + os.environ.get("EXTERNAL_WEB_LOADER_URL", ""), +) + +EXTERNAL_WEB_LOADER_API_KEY = PersistentConfig( + "EXTERNAL_WEB_LOADER_API_KEY", + "rag.web.loader.external_web_loader_api_key", + os.environ.get("EXTERNAL_WEB_LOADER_API_KEY", ""), +) #################################### # Images diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 1d1efc5df..b18313f5c 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -245,6 +245,10 @@ from open_webui.config import ( ENABLE_GOOGLE_DRIVE_INTEGRATION, ENABLE_ONEDRIVE_INTEGRATION, UPLOAD_DIR, + EXTERNAL_WEB_SEARCH_URL, + EXTERNAL_WEB_SEARCH_API_KEY, + EXTERNAL_WEB_LOADER_URL, + EXTERNAL_WEB_LOADER_API_KEY, # WebUI WEBUI_AUTH, WEBUI_NAME, @@ -667,6 +671,10 @@ app.state.config.EXA_API_KEY = EXA_API_KEY app.state.config.PERPLEXITY_API_KEY = PERPLEXITY_API_KEY app.state.config.SOUGOU_API_SID = SOUGOU_API_SID app.state.config.SOUGOU_API_SK = SOUGOU_API_SK +app.state.config.EXTERNAL_WEB_SEARCH_URL = EXTERNAL_WEB_SEARCH_URL +app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = EXTERNAL_WEB_SEARCH_API_KEY +app.state.config.EXTERNAL_WEB_LOADER_URL = EXTERNAL_WEB_LOADER_URL +app.state.config.EXTERNAL_WEB_LOADER_API_KEY = EXTERNAL_WEB_LOADER_API_KEY app.state.config.PLAYWRIGHT_WS_URL = PLAYWRIGHT_WS_URL diff --git a/backend/open_webui/retrieval/loaders/external.py b/backend/open_webui/retrieval/loaders/external.py new file mode 100644 index 000000000..8ff2e617e --- /dev/null +++ b/backend/open_webui/retrieval/loaders/external.py @@ -0,0 +1,56 @@ +import requests +import logging +from typing import Iterator, List, Union + +from langchain_core.document_loaders import BaseLoader +from langchain_core.documents import Document +from open_webui.env import SRC_LOG_LEVELS + +log = logging.getLogger(__name__) +log.setLevel(SRC_LOG_LEVELS["RAG"]) + + +class ExternalLoader(BaseLoader): + def __init__( + self, + web_paths: Union[str, List[str]], + external_url: str, + external_api_key: str, + continue_on_failure: bool = True, + **kwargs, + ) -> None: + if not web_paths: + raise ValueError("At least one URL must be provided.") + + self.external_url = external_url + self.external_api_key = external_api_key + self.urls = web_paths if isinstance(web_paths, list) else [web_paths] + self.continue_on_failure = continue_on_failure + + def lazy_load(self) -> Iterator[Document]: + batch_size = 20 + for i in range(0, len(self.urls), batch_size): + urls = self.urls[i : i + batch_size] + try: + response = requests.get( + self.external_url, + headers={ + "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot", + "Authorization": f"Bearer {self.external_api_key}", + }, + params={ + "urls": urls, + } + ) + response.raise_for_status() + results = response.json() + for result in results: + yield Document( + page_content=result.get("page_content", ""), + metadata=result.get("metadata", {}), + ) + except Exception as e: + if self.continue_on_failure: + log.error(f"Error extracting content from batch {urls}: {e}") + else: + raise e diff --git a/backend/open_webui/retrieval/web/external.py b/backend/open_webui/retrieval/web/external.py new file mode 100644 index 000000000..29e914f84 --- /dev/null +++ b/backend/open_webui/retrieval/web/external.py @@ -0,0 +1,45 @@ +import logging +from typing import Optional, List + +import requests +from open_webui.retrieval.web.main import SearchResult, get_filtered_results +from open_webui.env import SRC_LOG_LEVELS + +log = logging.getLogger(__name__) +log.setLevel(SRC_LOG_LEVELS["RAG"]) + + +def search_external( + external_url: str, + external_api_key: str, + query: str, + count: int, + filter_list: Optional[List[str]] = None, +) -> List[SearchResult]: + try: + response = requests.get( + external_url, + headers={ + "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot", + "Authorization": f"Bearer {external_api_key}", + }, + params={ + "query": query, + "count": count, + } + ) + response.raise_for_status() + results = response.json() + if filter_list: + results = get_filtered_results(results, filter_list) + return [ + SearchResult( + link=result.get("link"), + title=result.get("title"), + snippet=result.get("snippet"), + ) + for result in results[:count] + ] + except Exception as e: + log.error(f"Error in External search: {e}") + return [] diff --git a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py index 718cfe52f..fc46d78c4 100644 --- a/backend/open_webui/retrieval/web/utils.py +++ b/backend/open_webui/retrieval/web/utils.py @@ -25,6 +25,7 @@ from langchain_community.document_loaders.firecrawl import FireCrawlLoader from langchain_community.document_loaders.base import BaseLoader from langchain_core.documents import Document from open_webui.retrieval.loaders.tavily import TavilyLoader +from open_webui.retrieval.loaders.external import ExternalLoader from open_webui.constants import ERROR_MESSAGES from open_webui.config import ( ENABLE_RAG_LOCAL_WEB_FETCH, @@ -35,6 +36,8 @@ from open_webui.config import ( FIRECRAWL_API_KEY, TAVILY_API_KEY, TAVILY_EXTRACT_DEPTH, + EXTERNAL_WEB_LOADER_URL, + EXTERNAL_WEB_LOADER_API_KEY, ) from open_webui.env import SRC_LOG_LEVELS @@ -619,6 +622,11 @@ def get_web_loader( web_loader_args["api_key"] = TAVILY_API_KEY.value web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value + if WEB_LOADER_ENGINE.value == "external": + WebLoaderClass = ExternalLoader + web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value + web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value + if WebLoaderClass: web_loader = WebLoaderClass(**web_loader_args) diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 8f89351ac..72d10245d 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -61,6 +61,7 @@ from open_webui.retrieval.web.bing import search_bing from open_webui.retrieval.web.exa import search_exa from open_webui.retrieval.web.perplexity import search_perplexity from open_webui.retrieval.web.sougou import search_sougou +from open_webui.retrieval.web.external import search_external from open_webui.retrieval.utils import ( get_embedding_function, @@ -1465,6 +1466,14 @@ def search_web(request: Request, engine: str, query: str) -> list[SearchResult]: raise Exception( "No SOUGOU_API_SID or SOUGOU_API_SK found in environment variables" ) + elif engine == "external": + return search_external( + request.app.state.config.EXTERNAL_WEB_SEARCH_URL, + request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY, + query, + request.app.state.config.WEB_SEARCH_RESULT_COUNT, + request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST, + ) else: raise Exception("No search engine API key found in environment variables") From 22f0365cef0d366632be48e20b25fbf09330f9ec Mon Sep 17 00:00:00 2001 From: tth37 Date: Mon, 14 Apr 2025 02:05:58 +0800 Subject: [PATCH 2/5] format --- backend/open_webui/retrieval/loaders/external.py | 2 +- backend/open_webui/retrieval/web/external.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/external.py b/backend/open_webui/retrieval/loaders/external.py index 8ff2e617e..dd67035fd 100644 --- a/backend/open_webui/retrieval/loaders/external.py +++ b/backend/open_webui/retrieval/loaders/external.py @@ -40,7 +40,7 @@ class ExternalLoader(BaseLoader): }, params={ "urls": urls, - } + }, ) response.raise_for_status() results = response.json() diff --git a/backend/open_webui/retrieval/web/external.py b/backend/open_webui/retrieval/web/external.py index 29e914f84..f0a97fcfd 100644 --- a/backend/open_webui/retrieval/web/external.py +++ b/backend/open_webui/retrieval/web/external.py @@ -26,7 +26,7 @@ def search_external( params={ "query": query, "count": count, - } + }, ) response.raise_for_status() results = response.json() From 008fec80c10b911068205a1782ea28507e3dea6b Mon Sep 17 00:00:00 2001 From: tth37 Date: Mon, 14 Apr 2025 18:17:27 +0800 Subject: [PATCH 3/5] fix: Update external search/loader method to POST --- backend/open_webui/retrieval/loaders/external.py | 7 ++----- backend/open_webui/retrieval/web/external.py | 8 +++++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/external.py b/backend/open_webui/retrieval/loaders/external.py index dd67035fd..642cfd3a5 100644 --- a/backend/open_webui/retrieval/loaders/external.py +++ b/backend/open_webui/retrieval/loaders/external.py @@ -19,9 +19,6 @@ class ExternalLoader(BaseLoader): continue_on_failure: bool = True, **kwargs, ) -> None: - if not web_paths: - raise ValueError("At least one URL must be provided.") - self.external_url = external_url self.external_api_key = external_api_key self.urls = web_paths if isinstance(web_paths, list) else [web_paths] @@ -32,13 +29,13 @@ class ExternalLoader(BaseLoader): for i in range(0, len(self.urls), batch_size): urls = self.urls[i : i + batch_size] try: - response = requests.get( + response = requests.post( self.external_url, headers={ "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot", "Authorization": f"Bearer {self.external_api_key}", }, - params={ + json={ "urls": urls, }, ) diff --git a/backend/open_webui/retrieval/web/external.py b/backend/open_webui/retrieval/web/external.py index f0a97fcfd..a5c8003e4 100644 --- a/backend/open_webui/retrieval/web/external.py +++ b/backend/open_webui/retrieval/web/external.py @@ -17,13 +17,13 @@ def search_external( filter_list: Optional[List[str]] = None, ) -> List[SearchResult]: try: - response = requests.get( + response = requests.post( external_url, headers={ "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot", "Authorization": f"Bearer {external_api_key}", }, - params={ + json={ "query": query, "count": count, }, @@ -32,7 +32,7 @@ def search_external( results = response.json() if filter_list: results = get_filtered_results(results, filter_list) - return [ + results = [ SearchResult( link=result.get("link"), title=result.get("title"), @@ -40,6 +40,8 @@ def search_external( ) for result in results[:count] ] + log.info(f"External search results: {results}") + return results except Exception as e: log.error(f"Error in External search: {e}") return [] From 85f8e91288c201d68cece8f3ec3d08968a6d5fb1 Mon Sep 17 00:00:00 2001 From: tth37 Date: Mon, 14 Apr 2025 18:19:26 +0800 Subject: [PATCH 4/5] feat: Allow admin editing external search/loader settings --- backend/open_webui/routers/retrieval.py | 24 +++++++ .../admin/Settings/WebSearch.svelte | 66 ++++++++++++++++++- 2 files changed, 88 insertions(+), 2 deletions(-) diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 72d10245d..91e3fa7ae 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -419,6 +419,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY, "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL, "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH, + "EXTERNAL_WEB_SEARCH_URL": request.app.state.config.EXTERNAL_WEB_SEARCH_URL, + "EXTERNAL_WEB_SEARCH_API_KEY": request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY, + "EXTERNAL_WEB_LOADER_URL": request.app.state.config.EXTERNAL_WEB_LOADER_URL, + "EXTERNAL_WEB_LOADER_API_KEY": request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY, "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION, @@ -464,6 +468,10 @@ class WebConfig(BaseModel): FIRECRAWL_API_KEY: Optional[str] = None FIRECRAWL_API_BASE_URL: Optional[str] = None TAVILY_EXTRACT_DEPTH: Optional[str] = None + EXTERNAL_WEB_SEARCH_URL: Optional[str] = None + EXTERNAL_WEB_SEARCH_API_KEY: Optional[str] = None + EXTERNAL_WEB_LOADER_URL: Optional[str] = None + EXTERNAL_WEB_LOADER_API_KEY: Optional[str] = None YOUTUBE_LOADER_LANGUAGE: Optional[List[str]] = None YOUTUBE_LOADER_PROXY_URL: Optional[str] = None YOUTUBE_LOADER_TRANSLATION: Optional[str] = None @@ -698,6 +706,18 @@ async def update_rag_config( request.app.state.config.FIRECRAWL_API_BASE_URL = ( form_data.web.FIRECRAWL_API_BASE_URL ) + request.app.state.config.EXTERNAL_WEB_SEARCH_URL = ( + form_data.web.EXTERNAL_WEB_SEARCH_URL + ) + request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = ( + form_data.web.EXTERNAL_WEB_SEARCH_API_KEY + ) + request.app.state.config.EXTERNAL_WEB_LOADER_URL = ( + form_data.web.EXTERNAL_WEB_LOADER_URL + ) + request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY = ( + form_data.web.EXTERNAL_WEB_LOADER_API_KEY + ) request.app.state.config.TAVILY_EXTRACT_DEPTH = ( form_data.web.TAVILY_EXTRACT_DEPTH ) @@ -779,6 +799,10 @@ async def update_rag_config( "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY, "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL, "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH, + "EXTERNAL_WEB_SEARCH_URL": request.app.state.config.EXTERNAL_WEB_SEARCH_URL, + "EXTERNAL_WEB_SEARCH_API_KEY": request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY, + "EXTERNAL_WEB_LOADER_URL": request.app.state.config.EXTERNAL_WEB_LOADER_URL, + "EXTERNAL_WEB_LOADER_API_KEY": request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY, "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION, diff --git a/src/lib/components/admin/Settings/WebSearch.svelte b/src/lib/components/admin/Settings/WebSearch.svelte index d9771f835..82bd67814 100644 --- a/src/lib/components/admin/Settings/WebSearch.svelte +++ b/src/lib/components/admin/Settings/WebSearch.svelte @@ -30,9 +30,10 @@ 'bing', 'exa', 'perplexity', - 'sougou' + 'sougou', + 'external' ]; - let webLoaderEngines = ['playwright', 'firecrawl', 'tavily']; + let webLoaderEngines = ['playwright', 'firecrawl', 'tavily', 'external']; let webConfig = null; @@ -431,6 +432,36 @@ /> + {:else if webConfig.WEB_SEARCH_ENGINE === 'external'} +
+
+ {$i18n.t('External Web Search URL')} +
+ +
+
+ +
+
+
+
+
+
+ {$i18n.t('External Web Search API Key')} +
+ + +
+
{/if} {/if} @@ -652,6 +683,37 @@ {/if} + {:else if webConfig.WEB_LOADER_ENGINE === 'external'} +
+
+
+ {$i18n.t('External Web Loader URL')} +
+ +
+
+ +
+
+
+ +
+
+ {$i18n.t('External Web Loader API Key')} +
+ + +
+
{/if}
From dcb3f18e1e4e342c736dc1c86f337467883bc1bb Mon Sep 17 00:00:00 2001 From: tth37 Date: Mon, 14 Apr 2025 18:56:47 +0800 Subject: [PATCH 5/5] refac: styling --- .../admin/Settings/WebSearch.svelte | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/lib/components/admin/Settings/WebSearch.svelte b/src/lib/components/admin/Settings/WebSearch.svelte index 82bd67814..96b887405 100644 --- a/src/lib/components/admin/Settings/WebSearch.svelte +++ b/src/lib/components/admin/Settings/WebSearch.svelte @@ -433,25 +433,26 @@
{:else if webConfig.WEB_SEARCH_ENGINE === 'external'} -
-
- {$i18n.t('External Web Search URL')} -
- -
-
- -
-
-
+
+ {$i18n.t('External Web Search URL')} +
+ +
+
+ +
+
+
+ +
{$i18n.t('External Web Search API Key')}