mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-18 02:35:56 +08:00
refactor(api): consolidate allowed extensions handling
- Unified allowed extensions into a single `DOCUMENT_EXTENSIONS` reference - Adjusted checks and imports in controllers and services to use the new constant - Enhanced text extraction to support additional file types (EPUB, EML, MSG)
This commit is contained in:
parent
e7f425be91
commit
3ff88f4f3c
@ -1,3 +1,5 @@
|
||||
from configs import dify_config
|
||||
|
||||
HIDDEN_VALUE = "[__HIDDEN__]"
|
||||
UUID_NIL = "00000000-0000-0000-0000-000000000000"
|
||||
|
||||
@ -13,9 +15,7 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
|
||||
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
|
||||
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
||||
|
||||
ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
|
||||
ALLOWED_EXTENSIONS.extend([ext.upper() for ext in ALLOWED_EXTENSIONS])
|
||||
|
||||
UNSTRUCTURED_ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
|
||||
UNSTRUCTURED_ALLOWED_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
|
||||
UNSTRUCTURED_ALLOWED_EXTENSIONS.extend([ext.upper() for ext in UNSTRUCTURED_ALLOWED_EXTENSIONS])
|
||||
if dify_config.ETL_TYPE == "Unstructured":
|
||||
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
|
||||
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
|
||||
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
||||
|
@ -6,7 +6,7 @@ from flask_restful import Resource, marshal_with
|
||||
|
||||
import services
|
||||
from configs import dify_config
|
||||
from constants import ALLOWED_EXTENSIONS, UNSTRUCTURED_ALLOWED_EXTENSIONS
|
||||
from constants import DOCUMENT_EXTENSIONS
|
||||
from controllers.console import api
|
||||
from controllers.console.datasets.error import (
|
||||
FileTooLargeError,
|
||||
@ -79,9 +79,7 @@ class FileSupportTypeApi(Resource):
|
||||
@login_required
|
||||
@account_initialization_required
|
||||
def get(self):
|
||||
etl_type = dify_config.ETL_TYPE
|
||||
allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS
|
||||
return {"allowed_extensions": allowed_extensions}
|
||||
return {"allowed_extensions": DOCUMENT_EXTENSIONS}
|
||||
|
||||
|
||||
class RemoteFileInfoApi(Resource):
|
||||
|
@ -5,6 +5,9 @@ from typing import cast
|
||||
import docx
|
||||
import pandas as pd
|
||||
import pypdfium2
|
||||
from unstructured.partition.email import partition_email
|
||||
from unstructured.partition.epub import partition_epub
|
||||
from unstructured.partition.msg import partition_msg
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
@ -96,6 +99,12 @@ def _extract_text(*, file_content: bytes, mime_type: str) -> str:
|
||||
return _extract_text_from_ppt(file_content)
|
||||
elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
|
||||
return _extract_text_from_pptx(file_content)
|
||||
elif mime_type == "application/epub+zip":
|
||||
return _extract_text_from_epub(file_content)
|
||||
elif mime_type == "message/rfc822":
|
||||
return _extract_text_from_eml(file_content)
|
||||
elif mime_type == "application/vnd.ms-outlook":
|
||||
return _extract_text_from_msg(file_content)
|
||||
else:
|
||||
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
||||
|
||||
@ -210,3 +219,30 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
|
||||
return "\n".join([getattr(element, "text", "") for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_epub(file_content: bytes) -> str:
|
||||
try:
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_epub(file=file)
|
||||
return "\n".join([str(element) for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_eml(file_content: bytes) -> str:
|
||||
try:
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_email(file=file)
|
||||
return "\n".join([str(element) for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from EML: {str(e)}") from e
|
||||
|
||||
|
||||
def _extract_text_from_msg(file_content: bytes) -> str:
|
||||
try:
|
||||
with io.BytesIO(file_content) as file:
|
||||
elements = partition_msg(file=file)
|
||||
return "\n".join([str(element) for element in elements])
|
||||
except Exception as e:
|
||||
raise TextExtractionError(f"Failed to extract text from MSG: {str(e)}") from e
|
||||
|
@ -10,10 +10,9 @@ from werkzeug.exceptions import NotFound
|
||||
|
||||
from configs import dify_config
|
||||
from constants import (
|
||||
ALLOWED_EXTENSIONS,
|
||||
AUDIO_EXTENSIONS,
|
||||
DOCUMENT_EXTENSIONS,
|
||||
IMAGE_EXTENSIONS,
|
||||
UNSTRUCTURED_ALLOWED_EXTENSIONS,
|
||||
VIDEO_EXTENSIONS,
|
||||
)
|
||||
from core.file import helpers as file_helpers
|
||||
@ -147,9 +146,7 @@ class FileService:
|
||||
|
||||
# extract text from file
|
||||
extension = upload_file.extension
|
||||
etl_type = dify_config.ETL_TYPE
|
||||
allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS
|
||||
if extension.lower() not in allowed_extensions:
|
||||
if extension.lower() not in DOCUMENT_EXTENSIONS:
|
||||
raise UnsupportedFileTypeError()
|
||||
|
||||
text = ExtractProcessor.load_from_upload_file(upload_file, return_text=True)
|
||||
|
Loading…
x
Reference in New Issue
Block a user