mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-20 07:59:10 +08:00
refactor(api): consolidate allowed extensions handling
- Unified allowed extensions into a single `DOCUMENT_EXTENSIONS` reference
- Adjusted checks and imports in controllers and services to use the new constant
- Enhanced text extraction to support additional file types (EPUB, EML, MSG)
This commit is contained in:
parent
e7f425be91
commit
3ff88f4f3c
@ -1,3 +1,5 @@
|
|||||||
|
from configs import dify_config
|
||||||
|
|
||||||
HIDDEN_VALUE = "[__HIDDEN__]"
|
HIDDEN_VALUE = "[__HIDDEN__]"
|
||||||
UUID_NIL = "00000000-0000-0000-0000-000000000000"
|
UUID_NIL = "00000000-0000-0000-0000-000000000000"
|
||||||
|
|
||||||
@ -13,9 +15,7 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
|
|||||||
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
|
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
|
||||||
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
||||||
|
|
||||||
ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
|
if dify_config.ETL_TYPE == "Unstructured":
|
||||||
ALLOWED_EXTENSIONS.extend([ext.upper() for ext in ALLOWED_EXTENSIONS])
|
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
|
||||||
|
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
|
||||||
UNSTRUCTURED_ALLOWED_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
|
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
||||||
UNSTRUCTURED_ALLOWED_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
|
|
||||||
UNSTRUCTURED_ALLOWED_EXTENSIONS.extend([ext.upper() for ext in UNSTRUCTURED_ALLOWED_EXTENSIONS])
|
|
||||||
|
@ -6,7 +6,7 @@ from flask_restful import Resource, marshal_with
|
|||||||
|
|
||||||
import services
|
import services
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
from constants import ALLOWED_EXTENSIONS, UNSTRUCTURED_ALLOWED_EXTENSIONS
|
from constants import DOCUMENT_EXTENSIONS
|
||||||
from controllers.console import api
|
from controllers.console import api
|
||||||
from controllers.console.datasets.error import (
|
from controllers.console.datasets.error import (
|
||||||
FileTooLargeError,
|
FileTooLargeError,
|
||||||
@ -79,9 +79,7 @@ class FileSupportTypeApi(Resource):
|
|||||||
@login_required
|
@login_required
|
||||||
@account_initialization_required
|
@account_initialization_required
|
||||||
def get(self):
|
def get(self):
|
||||||
etl_type = dify_config.ETL_TYPE
|
return {"allowed_extensions": DOCUMENT_EXTENSIONS}
|
||||||
allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS
|
|
||||||
return {"allowed_extensions": allowed_extensions}
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteFileInfoApi(Resource):
|
class RemoteFileInfoApi(Resource):
|
||||||
|
@ -5,6 +5,9 @@ from typing import cast
|
|||||||
import docx
|
import docx
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pypdfium2
|
import pypdfium2
|
||||||
|
from unstructured.partition.email import partition_email
|
||||||
|
from unstructured.partition.epub import partition_epub
|
||||||
|
from unstructured.partition.msg import partition_msg
|
||||||
from unstructured.partition.ppt import partition_ppt
|
from unstructured.partition.ppt import partition_ppt
|
||||||
from unstructured.partition.pptx import partition_pptx
|
from unstructured.partition.pptx import partition_pptx
|
||||||
|
|
||||||
@ -96,6 +99,12 @@ def _extract_text(*, file_content: bytes, mime_type: str) -> str:
|
|||||||
return _extract_text_from_ppt(file_content)
|
return _extract_text_from_ppt(file_content)
|
||||||
elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
|
elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
|
||||||
return _extract_text_from_pptx(file_content)
|
return _extract_text_from_pptx(file_content)
|
||||||
|
elif mime_type == "application/epub+zip":
|
||||||
|
return _extract_text_from_epub(file_content)
|
||||||
|
elif mime_type == "message/rfc822":
|
||||||
|
return _extract_text_from_eml(file_content)
|
||||||
|
elif mime_type == "application/vnd.ms-outlook":
|
||||||
|
return _extract_text_from_msg(file_content)
|
||||||
else:
|
else:
|
||||||
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
||||||
|
|
||||||
@ -210,3 +219,30 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
|
|||||||
return "\n".join([getattr(element, "text", "") for element in elements])
|
return "\n".join([getattr(element, "text", "") for element in elements])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
|
raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_text_from_epub(file_content: bytes) -> str:
    """Return the text of an EPUB document, one partitioned element per line.

    Args:
        file_content: Raw bytes of the EPUB file.

    Returns:
        The string form of each element produced by ``partition_epub``,
        joined with newline characters.

    Raises:
        TextExtractionError: If any step fails; the underlying exception is
            chained as the cause.
    """
    try:
        # Wrap the bytes in an in-memory stream so they can be passed as ``file=``.
        with io.BytesIO(file_content) as stream:
            parts = partition_epub(file=stream)
        return "\n".join(map(str, parts))
    except Exception as exc:
        message = f"Failed to extract text from EPUB: {exc!s}"
        raise TextExtractionError(message) from exc
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_text_from_eml(file_content: bytes) -> str:
    """Return the text of an EML message, one partitioned element per line.

    Args:
        file_content: Raw bytes of the EML file.

    Returns:
        The string form of each element produced by ``partition_email``,
        joined with newline characters.

    Raises:
        TextExtractionError: If any step fails; the underlying exception is
            chained as the cause.
    """
    try:
        # Wrap the bytes in an in-memory stream so they can be passed as ``file=``.
        with io.BytesIO(file_content) as stream:
            parts = partition_email(file=stream)
        return "\n".join(map(str, parts))
    except Exception as exc:
        message = f"Failed to extract text from EML: {exc!s}"
        raise TextExtractionError(message) from exc
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_text_from_msg(file_content: bytes) -> str:
    """Return the text of an MSG message, one partitioned element per line.

    Args:
        file_content: Raw bytes of the MSG file.

    Returns:
        The string form of each element produced by ``partition_msg``,
        joined with newline characters.

    Raises:
        TextExtractionError: If any step fails; the underlying exception is
            chained as the cause.
    """
    try:
        # Wrap the bytes in an in-memory stream so they can be passed as ``file=``.
        with io.BytesIO(file_content) as stream:
            parts = partition_msg(file=stream)
        return "\n".join(map(str, parts))
    except Exception as exc:
        message = f"Failed to extract text from MSG: {exc!s}"
        raise TextExtractionError(message) from exc
|
||||||
|
@ -10,10 +10,9 @@ from werkzeug.exceptions import NotFound
|
|||||||
|
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
from constants import (
|
from constants import (
|
||||||
ALLOWED_EXTENSIONS,
|
|
||||||
AUDIO_EXTENSIONS,
|
AUDIO_EXTENSIONS,
|
||||||
|
DOCUMENT_EXTENSIONS,
|
||||||
IMAGE_EXTENSIONS,
|
IMAGE_EXTENSIONS,
|
||||||
UNSTRUCTURED_ALLOWED_EXTENSIONS,
|
|
||||||
VIDEO_EXTENSIONS,
|
VIDEO_EXTENSIONS,
|
||||||
)
|
)
|
||||||
from core.file import helpers as file_helpers
|
from core.file import helpers as file_helpers
|
||||||
@ -147,9 +146,7 @@ class FileService:
|
|||||||
|
|
||||||
# extract text from file
|
# extract text from file
|
||||||
extension = upload_file.extension
|
extension = upload_file.extension
|
||||||
etl_type = dify_config.ETL_TYPE
|
if extension.lower() not in DOCUMENT_EXTENSIONS:
|
||||||
allowed_extensions = UNSTRUCTURED_ALLOWED_EXTENSIONS if etl_type == "Unstructured" else ALLOWED_EXTENSIONS
|
|
||||||
if extension.lower() not in allowed_extensions:
|
|
||||||
raise UnsupportedFileTypeError()
|
raise UnsupportedFileTypeError()
|
||||||
|
|
||||||
text = ExtractProcessor.load_from_upload_file(upload_file, return_text=True)
|
text = ExtractProcessor.load_from_upload_file(upload_file, return_text=True)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user