Parse base64 eml file (#1796)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM> Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
2025-08-12 16:59:04 +08:00 · 2023-12-21 13:18:58 +08:00 · 2023-12-21 13:18:58 +08:00 · 64642fabc4
commit 64642fabc4
parent 7083a05a25
2 changed files with 21 additions and 10 deletions
--- a/api/core/data_loader/loader/unstructured/unstructured_eml.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_eml.py
@ -1,9 +1,8 @@
 import logging
-import re
+import base64
-from typing import Optional, List, Tuple, cast
+from typing import List
-
+from bs4 import BeautifulSoup
 from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 logger = logging.getLogger(__name__)
@ -11,8 +10,6 @@ logger = logging.getLogger(__name__)
 class UnstructuredEmailLoader(BaseLoader):
    """Load eml files.
    Args:
        file_path: Path to the file to load.
    """
@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader):
        self._file_path = file_path
        self._api_url = api_url
    def load(self) -> List[Document]:
        """Partition the email file and return its content as chunked documents.

        Each element returned by ``partition_email`` is inspected on its own:
        when its text is valid base64 that decodes to UTF-8, the text is
        replaced by the plain text extracted from the decoded HTML; otherwise
        the element is left exactly as it was partitioned. The elements are
        then grouped with ``chunk_by_title``.

        Returns:
            One ``Document`` per chunk, with the stripped chunk text as
            ``page_content``.
        """
        from unstructured.chunking.title import chunk_by_title
        from unstructured.partition.email import partition_email

        elements = partition_email(filename=self._file_path, api_url=self._api_url)

        for element in elements:
            # A base64 body may be wrapped over several lines: join the lines,
            # but keep spaces inside a line so ordinary prose fails validation
            # instead of being "decoded" into garbage.
            encoded = ''.join(
                line.strip() for line in (element.text or '').splitlines()
            )
            if not encoded:
                continue

            # Pad to a multiple of four; adds nothing when already aligned.
            encoded += '=' * (-len(encoded) % 4)

            try:
                html = base64.b64decode(encoded, validate=True).decode('utf-8')
            except ValueError:
                # binascii.Error and UnicodeDecodeError are both ValueError
                # subclasses: the element is not base64-encoded UTF-8 text, so
                # keep its original text and carry on with the next element.
                logger.debug('Email element is not base64-encoded text; left unchanged.')
                continue

            element.text = BeautifulSoup(html, 'html.parser').get_text()

        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=chunk.text.strip()) for chunk in chunks]
--- a/api/requirements.txt
+++ b/api/requirements.txt
@ -56,3 +56,5 @@ qdrant-client==1.6.4
 cohere~=4.32
 unstructured~=0.10.27
 unstructured[docx,pptx]~=0.10.27
 bs4~=0.0.1
 markdown~=3.5.1