mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 16:59:04 +08:00
Parse base64 eml file (#1796)
Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM> Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
parent
7083a05a25
commit
64642fabc4
@ -1,9 +1,8 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import base64
|
||||||
from typing import Optional, List, Tuple, cast
|
from typing import List
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
from langchain.document_loaders.helpers import detect_file_encodings
|
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -11,8 +10,6 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class UnstructuredEmailLoader(BaseLoader):
|
class UnstructuredEmailLoader(BaseLoader):
|
||||||
"""Load msg files.
|
"""Load msg files.
|
||||||
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
@ -26,16 +23,28 @@ class UnstructuredEmailLoader(BaseLoader):
|
|||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
|
||||||
|
|
||||||
def load(self) -> List[Document]:
    """Partition the email file, decode base64 bodies, and chunk the result.

    The file at ``self._file_path`` is partitioned with
    ``unstructured.partition.email.partition_email`` (passing
    ``self._api_url`` through).  Each element whose text is valid base64
    wrapping UTF-8 content has its text replaced by the decoded content
    with HTML markup stripped.  Elements that are not base64 are left
    untouched.  The elements are then grouped with ``chunk_by_title``.

    Returns:
        One ``Document`` per chunk, with the chunk's stripped text as
        ``page_content``.
    """
    from unstructured.chunking.title import chunk_by_title
    from unstructured.partition.email import partition_email

    elements = partition_email(filename=self._file_path, api_url=self._api_url)

    # Handle every element independently so that a single plain-text
    # element cannot prevent later base64 elements from being decoded.
    for element in elements:
        raw_text = getattr(element, "text", None)
        if not isinstance(raw_text, str):
            continue

        # Base64 payloads may be wrapped across lines; whitespace carries
        # no data, so drop it before strict validation.
        candidate = "".join(raw_text.split())
        if not candidate:
            continue

        # Restore stripped padding: 0-3 '=' characters, never a full group.
        candidate += "=" * (-len(candidate) % 4)

        try:
            # validate=True rejects text containing non-base64 characters
            # instead of silently discarding them and decoding garbage.
            decoded_text = base64.b64decode(candidate, validate=True).decode("utf-8")
        except ValueError:
            # binascii.Error and UnicodeDecodeError both derive from
            # ValueError: the element is ordinary text, keep it as is.
            continue

        try:
            element.text = BeautifulSoup(decoded_text, "html.parser").get_text()
        except Exception:
            # Best effort: keep the original text, but leave a trace
            # rather than swallowing the failure silently.
            logger.exception("Failed to strip HTML from a decoded email element")

    chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
    return [Document(page_content=chunk.text.strip()) for chunk in chunks]
|
||||||
|
@ -56,3 +56,5 @@ qdrant-client==1.6.4
|
|||||||
cohere~=4.32
|
cohere~=4.32
|
||||||
unstructured~=0.10.27
|
unstructured~=0.10.27
|
||||||
unstructured[docx,pptx]~=0.10.27
|
unstructured[docx,pptx]~=0.10.27
|
||||||
|
bs4~=0.0.1
|
||||||
|
markdown~=3.5.1
|
Loading…
x
Reference in New Issue
Block a user