mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-15 01:05:59 +08:00
add doc support in knowledge base for unstructured (#17352)
This commit is contained in:
parent
33c8cb7b3b
commit
6104b91d3f
@ -18,6 +18,7 @@ from core.rag.extractor.markdown_extractor import MarkdownExtractor
|
|||||||
from core.rag.extractor.notion_extractor import NotionExtractor
|
from core.rag.extractor.notion_extractor import NotionExtractor
|
||||||
from core.rag.extractor.pdf_extractor import PdfExtractor
|
from core.rag.extractor.pdf_extractor import PdfExtractor
|
||||||
from core.rag.extractor.text_extractor import TextExtractor
|
from core.rag.extractor.text_extractor import TextExtractor
|
||||||
|
from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
|
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
|
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
|
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
|
||||||
@ -104,7 +105,7 @@ class ExtractProcessor:
|
|||||||
etl_type = dify_config.ETL_TYPE
|
etl_type = dify_config.ETL_TYPE
|
||||||
extractor: Optional[BaseExtractor] = None
|
extractor: Optional[BaseExtractor] = None
|
||||||
if etl_type == "Unstructured":
|
if etl_type == "Unstructured":
|
||||||
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
|
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL or ""
|
||||||
unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
|
unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
|
||||||
|
|
||||||
if file_extension in {".xlsx", ".xls"}:
|
if file_extension in {".xlsx", ".xls"}:
|
||||||
@ -121,6 +122,8 @@ class ExtractProcessor:
|
|||||||
extractor = HtmlExtractor(file_path)
|
extractor = HtmlExtractor(file_path)
|
||||||
elif file_extension == ".docx":
|
elif file_extension == ".docx":
|
||||||
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
|
extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
|
||||||
|
elif file_extension == ".doc":
|
||||||
|
extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||||
elif file_extension == ".csv":
|
elif file_extension == ".csv":
|
||||||
extractor = CSVExtractor(file_path, autodetect_encoding=True)
|
extractor = CSVExtractor(file_path, autodetect_encoding=True)
|
||||||
elif file_extension == ".msg":
|
elif file_extension == ".msg":
|
||||||
|
@ -10,14 +10,11 @@ logger = logging.getLogger(__name__)
|
|||||||
class UnstructuredWordExtractor(BaseExtractor):
|
class UnstructuredWordExtractor(BaseExtractor):
|
||||||
"""Loader that uses unstructured to load word documents."""
|
"""Loader that uses unstructured to load word documents."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, file_path: str, api_url: str, api_key: str = ""):
|
||||||
self,
|
|
||||||
file_path: str,
|
|
||||||
api_url: str,
|
|
||||||
):
|
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
self._api_key = api_key
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||||
@ -41,9 +38,10 @@ class UnstructuredWordExtractor(BaseExtractor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if is_doc:
|
if is_doc:
|
||||||
from unstructured.partition.doc import partition_doc
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||||
|
|
||||||
elements = partition_doc(filename=self._file_path)
|
|
||||||
else:
|
else:
|
||||||
from unstructured.partition.docx import partition_docx
|
from unstructured.partition.docx import partition_docx
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user