diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 952a53530..8d2cf0638 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -2270,6 +2270,29 @@ FIRECRAWL_API_BASE_URL = PersistentConfig( os.environ.get("FIRECRAWL_API_BASE_URL", "https://api.firecrawl.dev"), ) +EXTERNAL_WEB_SEARCH_URL = PersistentConfig( + "EXTERNAL_WEB_SEARCH_URL", + "rag.web.search.external_web_search_url", + os.environ.get("EXTERNAL_WEB_SEARCH_URL", ""), +) + +EXTERNAL_WEB_SEARCH_API_KEY = PersistentConfig( + "EXTERNAL_WEB_SEARCH_API_KEY", + "rag.web.search.external_web_search_api_key", + os.environ.get("EXTERNAL_WEB_SEARCH_API_KEY", ""), +) + +EXTERNAL_WEB_LOADER_URL = PersistentConfig( + "EXTERNAL_WEB_LOADER_URL", + "rag.web.loader.external_web_loader_url", + os.environ.get("EXTERNAL_WEB_LOADER_URL", ""), +) + +EXTERNAL_WEB_LOADER_API_KEY = PersistentConfig( + "EXTERNAL_WEB_LOADER_API_KEY", + "rag.web.loader.external_web_loader_api_key", + os.environ.get("EXTERNAL_WEB_LOADER_API_KEY", ""), +) #################################### # Images diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index 87fa68d02..ef443a66d 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -251,6 +251,10 @@ from open_webui.config import ( ENABLE_GOOGLE_DRIVE_INTEGRATION, ENABLE_ONEDRIVE_INTEGRATION, UPLOAD_DIR, + EXTERNAL_WEB_SEARCH_URL, + EXTERNAL_WEB_SEARCH_API_KEY, + EXTERNAL_WEB_LOADER_URL, + EXTERNAL_WEB_LOADER_API_KEY, # WebUI WEBUI_AUTH, WEBUI_NAME, @@ -678,6 +682,10 @@ app.state.config.EXA_API_KEY = EXA_API_KEY app.state.config.PERPLEXITY_API_KEY = PERPLEXITY_API_KEY app.state.config.SOUGOU_API_SID = SOUGOU_API_SID app.state.config.SOUGOU_API_SK = SOUGOU_API_SK +app.state.config.EXTERNAL_WEB_SEARCH_URL = EXTERNAL_WEB_SEARCH_URL +app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = EXTERNAL_WEB_SEARCH_API_KEY +app.state.config.EXTERNAL_WEB_LOADER_URL = EXTERNAL_WEB_LOADER_URL +app.state.config.EXTERNAL_WEB_LOADER_API_KEY 
class ExternalLoader(BaseLoader):
    """Document loader that delegates page extraction to an external HTTP service.

    URLs are sent to ``external_url`` in batches as a JSON body of the form
    ``{"urls": [...]}`` with a ``Bearer`` authorization header. Each item of
    the JSON response is turned into a ``Document`` from its ``page_content``
    and ``metadata`` keys (missing keys fall back to ``""`` and ``{}``).
    """

    def __init__(
        self,
        web_paths: Union[str, List[str]],
        external_url: str,
        external_api_key: str,
        continue_on_failure: bool = True,
        timeout: Union[float, None] = 300,
        **kwargs,
    ) -> None:
        """Store the endpoint settings and normalise ``web_paths`` to a list.

        Args:
            web_paths: A single URL or an iterable of URLs to load.
            external_url: Endpoint of the external loader service.
            external_api_key: Token sent as ``Authorization: Bearer <key>``.
            continue_on_failure: When true, a failing batch is logged and
                skipped; when false, the exception is re-raised.
            timeout: Seconds passed to ``requests.post`` as its ``timeout``
                argument; ``None`` disables the timeout (the previous
                behaviour).
            **kwargs: Accepted and ignored, so callers can pass a shared
                loader-argument dict without filtering it first.
        """
        self.external_url = external_url
        self.external_api_key = external_api_key
        # Only a bare string is a single URL; any other iterable (list, tuple,
        # ...) is copied into a list so it serialises as a flat JSON array.
        self.urls = [web_paths] if isinstance(web_paths, str) else list(web_paths)
        self.continue_on_failure = continue_on_failure
        self.timeout = timeout

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per result returned by the external service.

        URLs are posted in batches of 20. Any exception raised while
        requesting or decoding a batch is either logged (and the batch
        skipped) or re-raised, depending on ``continue_on_failure``.
        """
        batch_size = 20
        for start in range(0, len(self.urls), batch_size):
            urls = self.urls[start : start + batch_size]
            try:
                response = requests.post(
                    self.external_url,
                    headers={
                        "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
                        "Authorization": f"Bearer {self.external_api_key}",
                    },
                    json={
                        "urls": urls,
                    },
                    # Without a timeout a stalled service would block this
                    # generator (and its caller) indefinitely.
                    timeout=self.timeout,
                )
                response.raise_for_status()
                results = response.json()
                for result in results:
                    yield Document(
                        page_content=result.get("page_content", ""),
                        metadata=result.get("metadata", {}),
                    )
            except Exception as e:
                if not self.continue_on_failure:
                    # Bare raise keeps the original traceback intact.
                    raise
                log.error(f"Error extracting content from batch {urls}: {e}")
def search_external(
    external_url: str,
    external_api_key: str,
    query: str,
    count: int,
    filter_list: Optional[List[str]] = None,
    timeout: Optional[float] = 60,
) -> List[SearchResult]:
    """Run a web search through a user-configured external HTTP endpoint.

    The query is POSTed as ``{"query": ..., "count": ...}`` with a ``Bearer``
    authorization header. Each item of the JSON response is mapped to a
    ``SearchResult`` from its ``link``, ``title`` and ``snippet`` keys.

    Args:
        external_url: Endpoint of the external search service.
        external_api_key: Token sent as ``Authorization: Bearer <key>``.
        query: Search query text.
        count: Maximum number of results requested and returned.
        filter_list: Optional filter passed to ``get_filtered_results``
            together with the raw results; skipped when empty or ``None``.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``
            argument; ``None`` disables the timeout (the previous behaviour).

    Returns:
        At most ``count`` results, or an empty list if the request, the JSON
        decoding or the result mapping raised (the error is logged).
    """
    try:
        response = requests.post(
            external_url,
            headers={
                "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot",
                "Authorization": f"Bearer {external_api_key}",
            },
            json={
                "query": query,
                "count": count,
            },
            # Without a timeout an unresponsive endpoint would hang the
            # calling request indefinitely.
            timeout=timeout,
        )
        response.raise_for_status()
        results = response.json()
        if filter_list:
            results = get_filtered_results(results, filter_list)
        results = [
            SearchResult(
                link=result.get("link"),
                title=result.get("title"),
                snippet=result.get("snippet"),
            )
            for result in results[:count]
        ]
        log.info(f"External search results: {results}")
        return results
    except Exception as e:
        # Deliberate best-effort: a failing search yields no results rather
        # than propagating to the caller.
        log.error(f"Error in External search: {e}")
        return []
SRC_LOG_LEVELS @@ -619,6 +622,11 @@ def get_web_loader( web_loader_args["api_key"] = TAVILY_API_KEY.value web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value + if WEB_LOADER_ENGINE.value == "external": + WebLoaderClass = ExternalLoader + web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value + web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value + if WebLoaderClass: web_loader = WebLoaderClass(**web_loader_args) diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index ce79503db..0343d9300 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -61,6 +61,7 @@ from open_webui.retrieval.web.bing import search_bing from open_webui.retrieval.web.exa import search_exa from open_webui.retrieval.web.perplexity import search_perplexity from open_webui.retrieval.web.sougou import search_sougou +from open_webui.retrieval.web.external import search_external from open_webui.retrieval.utils import ( get_embedding_function, @@ -418,6 +419,10 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY, "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL, "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH, + "EXTERNAL_WEB_SEARCH_URL": request.app.state.config.EXTERNAL_WEB_SEARCH_URL, + "EXTERNAL_WEB_SEARCH_API_KEY": request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY, + "EXTERNAL_WEB_LOADER_URL": request.app.state.config.EXTERNAL_WEB_LOADER_URL, + "EXTERNAL_WEB_LOADER_API_KEY": request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY, "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION, @@ -463,6 +468,10 @@ class WebConfig(BaseModel): FIRECRAWL_API_KEY: 
Optional[str] = None FIRECRAWL_API_BASE_URL: Optional[str] = None TAVILY_EXTRACT_DEPTH: Optional[str] = None + EXTERNAL_WEB_SEARCH_URL: Optional[str] = None + EXTERNAL_WEB_SEARCH_API_KEY: Optional[str] = None + EXTERNAL_WEB_LOADER_URL: Optional[str] = None + EXTERNAL_WEB_LOADER_API_KEY: Optional[str] = None YOUTUBE_LOADER_LANGUAGE: Optional[List[str]] = None YOUTUBE_LOADER_PROXY_URL: Optional[str] = None YOUTUBE_LOADER_TRANSLATION: Optional[str] = None @@ -697,6 +706,18 @@ async def update_rag_config( request.app.state.config.FIRECRAWL_API_BASE_URL = ( form_data.web.FIRECRAWL_API_BASE_URL ) + request.app.state.config.EXTERNAL_WEB_SEARCH_URL = ( + form_data.web.EXTERNAL_WEB_SEARCH_URL + ) + request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY = ( + form_data.web.EXTERNAL_WEB_SEARCH_API_KEY + ) + request.app.state.config.EXTERNAL_WEB_LOADER_URL = ( + form_data.web.EXTERNAL_WEB_LOADER_URL + ) + request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY = ( + form_data.web.EXTERNAL_WEB_LOADER_API_KEY + ) request.app.state.config.TAVILY_EXTRACT_DEPTH = ( form_data.web.TAVILY_EXTRACT_DEPTH ) @@ -778,6 +799,10 @@ async def update_rag_config( "FIRECRAWL_API_KEY": request.app.state.config.FIRECRAWL_API_KEY, "FIRECRAWL_API_BASE_URL": request.app.state.config.FIRECRAWL_API_BASE_URL, "TAVILY_EXTRACT_DEPTH": request.app.state.config.TAVILY_EXTRACT_DEPTH, + "EXTERNAL_WEB_SEARCH_URL": request.app.state.config.EXTERNAL_WEB_SEARCH_URL, + "EXTERNAL_WEB_SEARCH_API_KEY": request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY, + "EXTERNAL_WEB_LOADER_URL": request.app.state.config.EXTERNAL_WEB_LOADER_URL, + "EXTERNAL_WEB_LOADER_API_KEY": request.app.state.config.EXTERNAL_WEB_LOADER_API_KEY, "YOUTUBE_LOADER_LANGUAGE": request.app.state.config.YOUTUBE_LOADER_LANGUAGE, "YOUTUBE_LOADER_PROXY_URL": request.app.state.config.YOUTUBE_LOADER_PROXY_URL, "YOUTUBE_LOADER_TRANSLATION": request.app.state.YOUTUBE_LOADER_TRANSLATION, @@ -1465,6 +1490,14 @@ def search_web(request: Request, engine: str, query: 
str) -> list[SearchResult]: raise Exception( "No SOUGOU_API_SID or SOUGOU_API_SK found in environment variables" ) + elif engine == "external": + return search_external( + request.app.state.config.EXTERNAL_WEB_SEARCH_URL, + request.app.state.config.EXTERNAL_WEB_SEARCH_API_KEY, + query, + request.app.state.config.WEB_SEARCH_RESULT_COUNT, + request.app.state.config.WEB_SEARCH_DOMAIN_FILTER_LIST, + ) else: raise Exception("No search engine API key found in environment variables") diff --git a/src/lib/components/admin/Settings/WebSearch.svelte b/src/lib/components/admin/Settings/WebSearch.svelte index d9771f835..96b887405 100644 --- a/src/lib/components/admin/Settings/WebSearch.svelte +++ b/src/lib/components/admin/Settings/WebSearch.svelte @@ -30,9 +30,10 @@ 'bing', 'exa', 'perplexity', - 'sougou' + 'sougou', + 'external' ]; - let webLoaderEngines = ['playwright', 'firecrawl', 'tavily']; + let webLoaderEngines = ['playwright', 'firecrawl', 'tavily', 'external']; let webConfig = null; @@ -431,6 +432,37 @@ /> + {:else if webConfig.WEB_SEARCH_ENGINE === 'external'} +