diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py index f9fd7f92a1..e80d0f0bbe 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -18,6 +18,7 @@ from core.rag.extractor.markdown_extractor import MarkdownExtractor from core.rag.extractor.notion_extractor import NotionExtractor from core.rag.extractor.pdf_extractor import PdfExtractor from core.rag.extractor.text_extractor import TextExtractor +from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor @@ -104,7 +105,7 @@ class ExtractProcessor: etl_type = dify_config.ETL_TYPE extractor: Optional[BaseExtractor] = None if etl_type == "Unstructured": - unstructured_api_url = dify_config.UNSTRUCTURED_API_URL + unstructured_api_url = dify_config.UNSTRUCTURED_API_URL or "" unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or "" if file_extension in {".xlsx", ".xls"}: @@ -121,6 +122,8 @@ class ExtractProcessor: extractor = HtmlExtractor(file_path) elif file_extension == ".docx": extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by) + elif file_extension == ".doc": + extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key) elif file_extension == ".csv": extractor = CSVExtractor(file_path, autodetect_encoding=True) elif file_extension == ".msg": diff --git a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py index a525c9e9e3..5199208f70 100644 --- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py @@ -10,14 +10,11 @@ logger = logging.getLogger(__name__) class UnstructuredWordExtractor(BaseExtractor): """Loader that uses unstructured to load word documents.""" - def __init__( - self, - file_path: str, - api_url: str, - ): + def __init__(self, file_path: str, api_url: str, api_key: str = ""): """Initialize with file path.""" self._file_path = file_path self._api_url = api_url + self._api_key = api_key def extract(self) -> list[Document]: from unstructured.__version__ import __version__ as __unstructured_version__ @@ -41,9 +38,10 @@ class UnstructuredWordExtractor(BaseExtractor): ) if is_doc: - from unstructured.partition.doc import partition_doc + from unstructured.partition.api import partition_via_api + + elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key) - elements = partition_doc(filename=self._file_path) else: from unstructured.partition.docx import partition_docx