From 64642fabc45494face74361615e74b037fa87e39 Mon Sep 17 00:00:00 2001
From: "Charlie.Wei" <luowei@cvte.com>
Date: Thu, 21 Dec 2023 13:18:58 +0800
Subject: [PATCH] Parse base64 eml file (#1796)

Co-authored-by: luowei <luowei@cvte.com>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
---
 .../loader/unstructured/unstructured_eml.py   | 27 ++++++++++++-------
 api/requirements.txt                          |  4 ++-
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/api/core/data_loader/loader/unstructured/unstructured_eml.py b/api/core/data_loader/loader/unstructured/unstructured_eml.py
index f7a67be421..fa097ac37b 100644
--- a/api/core/data_loader/loader/unstructured/unstructured_eml.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_eml.py
@@ -1,9 +1,8 @@
 import logging
-import re
-from typing import Optional, List, Tuple, cast
-
+import base64
+from typing import List
+from bs4 import BeautifulSoup
 from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 
 logger = logging.getLogger(__name__)
@@ -11,8 +10,6 @@ logger = logging.getLogger(__name__)
 
 class UnstructuredEmailLoader(BaseLoader):
     """Load msg files.
-
-
     Args:
         file_path: Path to the file to load.
     """
@@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader):
         self._file_path = file_path
         self._api_url = api_url
 
-
     def load(self) -> List[Document]:
         from unstructured.partition.email import partition_email
-
         elements = partition_email(filename=self._file_path, api_url=self._api_url)
+
+        # Element text may still be base64-encoded. Try each element on its own
+        # so one plain-text element cannot stop later elements being decoded.
+        for element in elements:
+            # Drop line wraps, then pad to a multiple of 4 (0 if aligned).
+            element_text = ''.join(element.text.split())
+            element_text += '=' * (-len(element_text) % 4)
+            try:
+                # validate=True rejects text that is not pure base64.
+                element_decode = base64.b64decode(element_text, validate=True)
+                soup = BeautifulSoup(element_decode.decode('utf-8'), 'html.parser')
+            except ValueError:
+                continue  # not base64 / not UTF-8: keep the original text
+            element.text = soup.get_text()
+
         from unstructured.chunking.title import chunk_by_title
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
         documents = []
         for chunk in chunks:
             text = chunk.text.strip()
             documents.append(Document(page_content=text))
-
         return documents
diff --git a/api/requirements.txt b/api/requirements.txt
index b56c4c6c73..e8c9f03ec5 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -55,4 +55,6 @@ pymilvus==2.3.0
 qdrant-client==1.6.4
 cohere~=4.32
 unstructured~=0.10.27
-unstructured[docx,pptx]~=0.10.27
\ No newline at end of file
+unstructured[docx,pptx]~=0.10.27
+bs4~=0.0.1
+markdown~=3.5.1
\ No newline at end of file