diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 7a776d132..5aae07e3c 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1838,6 +1838,18 @@ CONTENT_EXTRACTION_ENGINE = PersistentConfig( os.environ.get("CONTENT_EXTRACTION_ENGINE", "").lower(), ) +EXTERNAL_DOCUMENT_LOADER_URL = PersistentConfig( + "EXTERNAL_DOCUMENT_LOADER_URL", + "rag.external_document_loader_url", + os.environ.get("EXTERNAL_DOCUMENT_LOADER_URL", ""), +) + +EXTERNAL_DOCUMENT_LOADER_API_KEY = PersistentConfig( + "EXTERNAL_DOCUMENT_LOADER_API_KEY", + "rag.external_document_loader_api_key", + os.environ.get("EXTERNAL_DOCUMENT_LOADER_API_KEY", ""), +) + TIKA_SERVER_URL = PersistentConfig( "TIKA_SERVER_URL", "rag.tika_server_url", diff --git a/backend/open_webui/main.py b/backend/open_webui/main.py index c4929aff1..3d1036785 100644 --- a/backend/open_webui/main.py +++ b/backend/open_webui/main.py @@ -206,6 +206,8 @@ from open_webui.config import ( CHUNK_OVERLAP, CHUNK_SIZE, CONTENT_EXTRACTION_ENGINE, + EXTERNAL_DOCUMENT_LOADER_URL, + EXTERNAL_DOCUMENT_LOADER_API_KEY, TIKA_SERVER_URL, DOCLING_SERVER_URL, DOCLING_OCR_ENGINE, @@ -646,6 +648,8 @@ app.state.config.ENABLE_RAG_HYBRID_SEARCH = ENABLE_RAG_HYBRID_SEARCH app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERIFICATION app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE +app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL +app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY = EXTERNAL_DOCUMENT_LOADER_API_KEY app.state.config.TIKA_SERVER_URL = TIKA_SERVER_URL app.state.config.DOCLING_SERVER_URL = DOCLING_SERVER_URL app.state.config.DOCLING_OCR_ENGINE = DOCLING_OCR_ENGINE diff --git a/backend/open_webui/retrieval/loaders/external_document.py b/backend/open_webui/retrieval/loaders/external_document.py new file mode 100644 index 000000000..6119da379 --- /dev/null +++ b/backend/open_webui/retrieval/loaders/external_document.py @@ 
-0,0 +1,84 @@
+import requests
+import logging
+from typing import Iterator, List, Union
+
+from langchain_core.document_loaders import BaseLoader
+from langchain_core.documents import Document
+from open_webui.env import SRC_LOG_LEVELS
+
+log = logging.getLogger(__name__)
+log.setLevel(SRC_LOG_LEVELS["RAG"])
+
+
+class ExternalDocumentLoader(BaseLoader):
+    """Extract a file's text by uploading it to an external HTTP service.
+
+    The raw file bytes are sent with ``PUT {url}/process``. The service is
+    expected to answer with a JSON object, or a list of such objects, each
+    carrying ``page_content`` and optionally ``metadata``.
+    """
+
+    def __init__(
+        self,
+        file_path,
+        url: str,
+        api_key: str,
+        mime_type=None,
+        **kwargs,
+    ) -> None:
+        self.url = url
+        self.api_key = api_key
+
+        self.file_path = file_path
+        self.mime_type = mime_type
+
+    @staticmethod
+    def _to_document(item) -> Document:
+        """Build a Document from one JSON object returned by the service."""
+        if not isinstance(item, dict):
+            raise Exception("Error loading document: Unexpected response format")
+
+        content = item.get("page_content")
+        if content is None:
+            raise Exception("Error loading document: No content returned")
+
+        # A missing or null "metadata" becomes an empty dict.
+        return Document(page_content=content, metadata=item.get("metadata") or {})
+
+    def load(self) -> list[Document]:
+        """Upload the file and return the extracted document(s).
+
+        Raises:
+            Exception: if the service replies with an error status, a body
+                that is not JSON, or an empty or malformed payload.
+        """
+        with open(self.file_path, "rb") as f:
+            data = f.read()
+
+        # Send only headers that carry a value; an empty API key would
+        # otherwise yield a malformed "Bearer " credential.
+        headers = {}
+        if self.mime_type:
+            headers["Content-Type"] = self.mime_type
+        if self.api_key:
+            headers["Authorization"] = f"Bearer {self.api_key}"
+
+        # Tolerate any number of trailing slashes in the configured base URL.
+        url = self.url.rstrip("/")
+
+        # NOTE(review): no timeout is set, so a stalled service blocks here.
+        r = requests.put(f"{url}/process", data=data, headers=headers)
+
+        if not r.ok:
+            raise Exception(f"Error loading document: {r.status_code} {r.text}")
+
+        try:
+            res = r.json()
+        except ValueError as e:
+            raise Exception("Error loading document: Invalid JSON response") from e
+
+        if not res:
+            raise Exception("Error loading document: No content returned")
+
+        items = res if isinstance(res, list) else [res]
+        return [self._to_document(item) for item in items]
diff --git a/backend/open_webui/retrieval/loaders/external.py b/backend/open_webui/retrieval/loaders/external_web.py similarity index 95% rename from backend/open_webui/retrieval/loaders/external.py rename to backend/open_webui/retrieval/loaders/external_web.py index 642cfd3a5..68ed66162 100644 --- a/backend/open_webui/retrieval/loaders/external.py +++ b/backend/open_webui/retrieval/loaders/external_web.py @@ -10,7 +10,7 @@ log = logging.getLogger(__name__) log.setLevel(SRC_LOG_LEVELS["RAG"]) -class ExternalLoader(BaseLoader): +class ExternalWebLoader(BaseLoader): def __init__( self, web_paths: Union[str, List[str]], @@ -32,7 +32,7 @@ class ExternalLoader(BaseLoader): response = requests.post( self.external_url, headers={ 
- "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) RAG Bot", + "User-Agent": "Open WebUI (https://github.com/open-webui/open-webui) External Web Loader", "Authorization": f"Bearer {self.external_api_key}", }, json={ diff --git a/backend/open_webui/retrieval/loaders/main.py b/backend/open_webui/retrieval/loaders/main.py index 1f34c9f79..c5f0b4e5e 100644 --- a/backend/open_webui/retrieval/loaders/main.py +++ b/backend/open_webui/retrieval/loaders/main.py @@ -21,6 +21,8 @@ from langchain_community.document_loaders import ( ) from langchain_core.documents import Document + +from open_webui.retrieval.loaders.external_document import ExternalDocumentLoader from open_webui.retrieval.loaders.mistral import MistralLoader from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL @@ -213,6 +215,17 @@ class Loader: def _get_loader(self, filename: str, file_content_type: str, file_path: str): file_ext = filename.split(".")[-1].lower() + if ( + self.engine == "external" + and self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL") + and self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY") + ): + loader = ExternalDocumentLoader( + file_path=file_path, + url=self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_URL"), + api_key=self.kwargs.get("EXTERNAL_DOCUMENT_LOADER_API_KEY"), + mime_type=file_content_type, + ) - if self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"): + elif self.engine == "tika" and self.kwargs.get("TIKA_SERVER_URL"): if self._is_text_file(file_ext, file_content_type): loader = TextLoader(file_path, autodetect_encoding=True) @@ -269,6 +282,15 @@ class Loader: loader = MistralLoader( api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"), file_path=file_path ) + elif ( + self.engine == "external" + and self.kwargs.get("MISTRAL_OCR_API_KEY") != "" + and file_ext + in ["pdf"] # Mistral OCR currently only supports PDF and images + ): + loader = MistralLoader( + api_key=self.kwargs.get("MISTRAL_OCR_API_KEY"), file_path=file_path + ) else: if file_ext == "pdf": loader = PyPDFLoader( diff --git 
a/backend/open_webui/retrieval/web/utils.py b/backend/open_webui/retrieval/web/utils.py index 78c962f15..8bf872fb4 100644 --- a/backend/open_webui/retrieval/web/utils.py +++ b/backend/open_webui/retrieval/web/utils.py @@ -25,7 +25,7 @@ from langchain_community.document_loaders.firecrawl import FireCrawlLoader from langchain_community.document_loaders.base import BaseLoader from langchain_core.documents import Document from open_webui.retrieval.loaders.tavily import TavilyLoader -from open_webui.retrieval.loaders.external import ExternalLoader +from open_webui.retrieval.loaders.external_web import ExternalWebLoader from open_webui.constants import ERROR_MESSAGES from open_webui.config import ( ENABLE_RAG_LOCAL_WEB_FETCH, @@ -628,7 +628,7 @@ def get_web_loader( web_loader_args["extract_depth"] = TAVILY_EXTRACT_DEPTH.value if WEB_LOADER_ENGINE.value == "external": - WebLoaderClass = ExternalLoader + WebLoaderClass = ExternalWebLoader web_loader_args["external_url"] = EXTERNAL_WEB_LOADER_URL.value web_loader_args["external_api_key"] = EXTERNAL_WEB_LOADER_API_KEY.value diff --git a/backend/open_webui/routers/retrieval.py b/backend/open_webui/routers/retrieval.py index 5f867dadf..1d0af3029 100644 --- a/backend/open_webui/routers/retrieval.py +++ b/backend/open_webui/routers/retrieval.py @@ -352,6 +352,8 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)): # Content extraction settings "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, + "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, + "EXTERNAL_DOCUMENT_LOADER_API_KEY": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY, "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, @@ -493,6 +495,9 @@ class 
ConfigForm(BaseModel): # Content extraction settings CONTENT_EXTRACTION_ENGINE: Optional[str] = None PDF_EXTRACT_IMAGES: Optional[bool] = None + EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None + EXTERNAL_DOCUMENT_LOADER_API_KEY: Optional[str] = None + TIKA_SERVER_URL: Optional[str] = None DOCLING_SERVER_URL: Optional[str] = None DOCLING_OCR_ENGINE: Optional[str] = None @@ -583,6 +588,16 @@ async def update_rag_config( if form_data.PDF_EXTRACT_IMAGES is not None else request.app.state.config.PDF_EXTRACT_IMAGES ) + request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = ( + form_data.EXTERNAL_DOCUMENT_LOADER_URL + if form_data.EXTERNAL_DOCUMENT_LOADER_URL is not None + else request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL + ) + request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY = ( + form_data.EXTERNAL_DOCUMENT_LOADER_API_KEY + if form_data.EXTERNAL_DOCUMENT_LOADER_API_KEY is not None + else request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY + ) request.app.state.config.TIKA_SERVER_URL = ( form_data.TIKA_SERVER_URL if form_data.TIKA_SERVER_URL is not None @@ -818,6 +833,8 @@ async def update_rag_config( # Content extraction settings "CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE, "PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES, + "EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, + "EXTERNAL_DOCUMENT_LOADER_API_KEY": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY, "TIKA_SERVER_URL": request.app.state.config.TIKA_SERVER_URL, "DOCLING_SERVER_URL": request.app.state.config.DOCLING_SERVER_URL, "DOCLING_OCR_ENGINE": request.app.state.config.DOCLING_OCR_ENGINE, @@ -1139,6 +1156,8 @@ def process_file( file_path = Storage.get_file(file_path) loader = Loader( engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE, + EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL, + 
EXTERNAL_DOCUMENT_LOADER_API_KEY=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_API_KEY, TIKA_SERVER_URL=request.app.state.config.TIKA_SERVER_URL, DOCLING_SERVER_URL=request.app.state.config.DOCLING_SERVER_URL, DOCLING_OCR_ENGINE=request.app.state.config.DOCLING_OCR_ENGINE, diff --git a/src/lib/components/admin/Settings/Documents.svelte b/src/lib/components/admin/Settings/Documents.svelte index 8f6a68cf4..498ff2c1e 100644 --- a/src/lib/components/admin/Settings/Documents.svelte +++ b/src/lib/components/admin/Settings/Documents.svelte @@ -124,6 +124,13 @@ }; const submitHandler = async () => { + if ( + RAGConfig.CONTENT_EXTRACTION_ENGINE === 'external' && + RAGConfig.EXTERNAL_DOCUMENT_LOADER_URL === '' + ) { + toast.error($i18n.t('External Document Loader URL required.')); + return; + } if (RAGConfig.CONTENT_EXTRACTION_ENGINE === 'tika' && RAGConfig.TIKA_SERVER_URL === '') { toast.error($i18n.t('Tika Server URL required.')); return; @@ -256,6 +263,7 @@ bind:value={RAGConfig.CONTENT_EXTRACTION_ENGINE} > + @@ -275,6 +283,19 @@ + {:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'external'} +