refactor(api): consolidate allowed extensions handling

- Unified allowed extensions into a single `DOCUMENT_EXTENSIONS` reference
- Adjusted checks and imports in controllers and services to use the new constant
- Enhanced text extraction to support additional file types (EPUB, EML, MSG)
This commit is contained in:
-LAN- 2024-09-29 18:15:22 +08:00
parent e7f425be91
commit 3ff88f4f3c
4 changed files with 46 additions and 15 deletions

View File

@ -1,3 +1,5 @@
from configs import dify_config
HIDDEN_VALUE = "[__HIDDEN__]"
UUID_NIL = "00000000-0000-0000-0000-000000000000"
@ -13,9 +15,7 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
ALLOWED_EXTENSIONS.extend([ext.upper() for ext in ALLOWED_EXTENSIONS])
UNSTRUCTURED_ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
UNSTRUCTURED_ALLOWED_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
UNSTRUCTURED_ALLOWED_EXTENSIONS.extend([ext.upper() for ext in UNSTRUCTURED_ALLOWED_EXTENSIONS])
if dify_config.ETL_TYPE == "Unstructured":
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])

View File

@ -6,7 +6,7 @@ from flask_restful import Resource, marshal_with
import services
from configs import dify_config
from constants import ALLOWED_EXTENSIONS, UNSTRUCTURED_ALLOWED_EXTENSIONS
from constants import DOCUMENT_EXTENSIONS
from controllers.console import api
from controllers.console.datasets.error import (
FileTooLargeError,
@ -79,9 +79,7 @@ class FileSupportTypeApi(Resource):
@login_required
@account_initialization_required
def get(self):
etl_type = dify_config.ETL_TYPE
allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS
return {"allowed_extensions": allowed_extensions}
return {"allowed_extensions": DOCUMENT_EXTENSIONS}
class RemoteFileInfoApi(Resource):

View File

@ -5,6 +5,9 @@ from typing import cast
import docx
import pandas as pd
import pypdfium2
from unstructured.partition.email import partition_email
from unstructured.partition.epub import partition_epub
from unstructured.partition.msg import partition_msg
from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx
@ -96,6 +99,12 @@ def _extract_text(*, file_content: bytes, mime_type: str) -> str:
return _extract_text_from_ppt(file_content)
elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
return _extract_text_from_pptx(file_content)
elif mime_type == "application/epub+zip":
return _extract_text_from_epub(file_content)
elif mime_type == "message/rfc822":
return _extract_text_from_eml(file_content)
elif mime_type == "application/vnd.ms-outlook":
return _extract_text_from_msg(file_content)
else:
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
@ -210,3 +219,30 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
return "\n".join([getattr(element, "text", "") for element in elements])
except Exception as e:
raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
def _extract_text_from_epub(file_content: bytes) -> str:
    """Return the text of an EPUB document given its raw bytes.

    The bytes are wrapped in an in-memory stream and handed to
    ``partition_epub``; the string form of every returned element is
    joined with newlines.

    Args:
        file_content: Raw bytes of the EPUB file.

    Returns:
        The newline-joined string representations of the partitioned elements.

    Raises:
        TextExtractionError: If anything fails while partitioning or
            joining; the original exception is chained as the cause.
    """
    try:
        with io.BytesIO(file_content) as stream:
            parts = partition_epub(file=stream)
            # Elements are stringified via str(); no attribute access is assumed.
            return "\n".join(str(part) for part in parts)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e
def _extract_text_from_eml(file_content: bytes) -> str:
    """Return the text of an EML e-mail message given its raw bytes.

    The bytes are wrapped in an in-memory stream and handed to
    ``partition_email``; the string form of every returned element is
    joined with newlines.

    Args:
        file_content: Raw bytes of the EML file.

    Returns:
        The newline-joined string representations of the partitioned elements.

    Raises:
        TextExtractionError: If anything fails while partitioning or
            joining; the original exception is chained as the cause.
    """
    try:
        with io.BytesIO(file_content) as stream:
            parts = partition_email(file=stream)
            # Elements are stringified via str(); no attribute access is assumed.
            return "\n".join(str(part) for part in parts)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from EML: {str(e)}") from e
def _extract_text_from_msg(file_content: bytes) -> str:
    """Return the text of an MSG message file given its raw bytes.

    The bytes are wrapped in an in-memory stream and handed to
    ``partition_msg``; the string form of every returned element is
    joined with newlines.

    Args:
        file_content: Raw bytes of the MSG file.

    Returns:
        The newline-joined string representations of the partitioned elements.

    Raises:
        TextExtractionError: If anything fails while partitioning or
            joining; the original exception is chained as the cause.
    """
    try:
        with io.BytesIO(file_content) as stream:
            parts = partition_msg(file=stream)
            # Elements are stringified via str(); no attribute access is assumed.
            return "\n".join(str(part) for part in parts)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from MSG: {str(e)}") from e

View File

@ -10,10 +10,9 @@ from werkzeug.exceptions import NotFound
from configs import dify_config
from constants import (
ALLOWED_EXTENSIONS,
AUDIO_EXTENSIONS,
DOCUMENT_EXTENSIONS,
IMAGE_EXTENSIONS,
UNSTRUCTURED_ALLOWED_EXTENSIONS,
VIDEO_EXTENSIONS,
)
from core.file import helpers as file_helpers
@ -147,9 +146,7 @@ class FileService:
# extract text from file
extension = upload_file.extension
etl_type = dify_config.ETL_TYPE
allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS
if extension.lower() not in allowed_extensions:
if extension.lower() not in DOCUMENT_EXTENSIONS:
raise UnsupportedFileTypeError()
text = ExtractProcessor.load_from_upload_file(upload_file, return_text=True)