diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 48ca61666..da7bb307d 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -21,6 +21,7 @@ from langchain_community.document_loaders import ( TextLoader, PyPDFLoader, CSVLoader, + BSHTMLLoader, Docx2txtLoader, UnstructuredEPubLoader, UnstructuredWordDocumentLoader, @@ -114,6 +115,7 @@ class CollectionNameForm(BaseModel): class StoreWebForm(CollectionNameForm): url: str + @app.get("/") async def get_status(): return { @@ -296,13 +298,18 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)): def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool: + text_splitter = RecursiveCharacterTextSplitter( chunk_size=app.state.CHUNK_SIZE, chunk_overlap=app.state.CHUNK_OVERLAP, add_start_index=True, ) docs = text_splitter.split_documents(data) - return store_docs_in_vector_db(docs, collection_name, overwrite) + + if len(docs) > 0: + return store_docs_in_vector_db(docs, collection_name, overwrite), None + else: + raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) def store_text_in_vector_db( @@ -318,6 +325,7 @@ def store_text_in_vector_db( def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool: + texts = [doc.page_content for doc in docs] metadatas = [doc.metadata for doc in docs] @@ -402,6 +410,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str): loader = UnstructuredRSTLoader(file_path, mode="elements") elif file_ext == "xml": loader = UnstructuredXMLLoader(file_path) + elif file_ext in ["htm", "html"]: + loader = BSHTMLLoader(file_path, open_encoding="unicode_escape") elif file_ext == "md": loader = UnstructuredMarkdownLoader(file_path) elif file_content_type == "application/epub+zip": @@ -452,19 +462,21 @@ def store_doc( loader, known_type = get_loader(file.filename, file.content_type, file_path) data = loader.load() - result = store_data_in_vector_db(data, collection_name) - if result: - return { - "status": True, - "collection_name": collection_name, - "filename": filename, - "known_type": known_type, - } - else: + try: + result = store_data_in_vector_db(data, collection_name) + + if result: + return { + "status": True, + "collection_name": collection_name, + "filename": filename, + "known_type": known_type, + } + except Exception as e: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - detail=ERROR_MESSAGES.DEFAULT(), + detail=e, ) except Exception as e: log.exception(e) @@ -529,38 +541,42 @@ def scan_docs_dir(user=Depends(get_admin_user)): ) data = loader.load() - result = store_data_in_vector_db(data, collection_name) + try: + result = store_data_in_vector_db(data, collection_name) - if result: - sanitized_filename = sanitize_filename(filename) - doc = Documents.get_doc_by_name(sanitized_filename) + if result: + sanitized_filename = sanitize_filename(filename) + doc = Documents.get_doc_by_name(sanitized_filename) - if doc == None: - doc = Documents.insert_new_doc( - user.id, - DocumentForm( - **{ - "name": sanitized_filename, - "title": filename, - "collection_name": collection_name, - "filename": filename, - "content": ( - json.dumps( - { - "tags": list( - map( - lambda name: {"name": name}, - tags, + if doc == None: + doc = Documents.insert_new_doc( + user.id, + DocumentForm( + **{ + "name": sanitized_filename, + "title": filename, + "collection_name": collection_name, + "filename": filename, + "content": ( + json.dumps( + { + "tags": list( + map( + lambda name: {"name": name}, + tags, + ) ) - ) - } - ) - if len(tags) - else "{}" - ), - } - ), - ) + } + ) + if len(tags) + else "{}" + ), + } + ), + ) + except Exception as e: + print(e) + pass except Exception as e: log.exception(e) diff --git a/backend/constants.py b/backend/constants.py index 42c5c85eb..8bcdd0789 100644 --- a/backend/constants.py +++ b/backend/constants.py @@ -60,3 +60,5 @@ class ERROR_MESSAGES(str, Enum): MODEL_NOT_FOUND = lambda name="": f"Model '{name}' was not found" OPENAI_NOT_FOUND = lambda name="": f"OpenAI API was not found" OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama" + + EMPTY_CONTENT = "The content provided is empty. Please ensure that there is text or data present before proceeding." diff --git a/src/lib/constants.ts b/src/lib/constants.ts index bdd9c64e9..5adefe561 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -22,6 +22,7 @@ export const SUPPORTED_FILE_TYPE = [ 'text/plain', 'text/csv', 'text/xml', + 'text/html', 'text/x-python', 'text/css', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', @@ -50,6 +51,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [ 'h', 'c', 'cs', + 'htm', + 'html', 'sql', 'log', 'ini',