From 64642fabc45494face74361615e74b037fa87e39 Mon Sep 17 00:00:00 2001
From: "Charlie.Wei"
Date: Thu, 21 Dec 2023 13:18:58 +0800
Subject: [PATCH] Parse base64 eml file (#1796)

Co-authored-by: luowei
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
---
 .../loader/unstructured/unstructured_eml.py   | 27 ++++++++++++-------
 api/requirements.txt                          |  4 ++-
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/api/core/data_loader/loader/unstructured/unstructured_eml.py b/api/core/data_loader/loader/unstructured/unstructured_eml.py
index f7a67be421..fa097ac37b 100644
--- a/api/core/data_loader/loader/unstructured/unstructured_eml.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_eml.py
@@ -1,9 +1,8 @@
 import logging
-import re
-from typing import Optional, List, Tuple, cast
-
+import base64
+from typing import List
+from bs4 import BeautifulSoup
 from langchain.document_loaders.base import BaseLoader
-from langchain.document_loaders.helpers import detect_file_encodings
 from langchain.schema import Document
 
 logger = logging.getLogger(__name__)
@@ -11,8 +10,6 @@ logger = logging.getLogger(__name__)
 
 class UnstructuredEmailLoader(BaseLoader):
     """Load msg files.
-
-
     Args:
         file_path: Path to the file to load.
     """
@@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader):
         self._file_path = file_path
         self._api_url = api_url
 
-
     def load(self) -> List[Document]:
         from unstructured.partition.email import partition_email
-
         elements = partition_email(filename=self._file_path, api_url=self._api_url)
+
+        # noinspection PyBroadException
+        try:
+            for element in elements:
+                element_text = element.text.strip()
+
+                padding_needed = 4 - len(element_text) % 4
+                element_text += '=' * padding_needed
+
+                element_decode = base64.b64decode(element_text)
+                soup = BeautifulSoup(element_decode.decode('utf-8'), 'html.parser')
+                element.text = soup.get_text()
+        except Exception:
+            pass
+
         from unstructured.chunking.title import chunk_by_title
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
         documents = []
         for chunk in chunks:
             text = chunk.text.strip()
             documents.append(Document(page_content=text))
-
         return documents
diff --git a/api/requirements.txt b/api/requirements.txt
index b56c4c6c73..e8c9f03ec5 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -55,4 +55,6 @@ pymilvus==2.3.0
 qdrant-client==1.6.4
 cohere~=4.32
 unstructured~=0.10.27
-unstructured[docx,pptx]~=0.10.27
\ No newline at end of file
+unstructured[docx,pptx]~=0.10.27
+bs4~=0.0.1
+markdown~=3.5.1
\ No newline at end of file