diff --git a/api/configs/feature/__init__.py b/api/configs/feature/__init__.py index 74cdf94486..f10252e455 100644 --- a/api/configs/feature/__init__.py +++ b/api/configs/feature/__init__.py @@ -601,7 +601,7 @@ class RagEtlConfig(BaseSettings): UNSTRUCTURED_API_KEY: Optional[str] = Field( description="API key for Unstructured.io service", - default=None, + default="", ) SCARF_NO_ANALYTICS: Optional[str] = Field( diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py index 23ccab63b8..f9fd7f92a1 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -102,12 +102,11 @@ class ExtractProcessor: input_file = Path(file_path) file_extension = input_file.suffix.lower() etl_type = dify_config.ETL_TYPE - unstructured_api_url = dify_config.UNSTRUCTURED_API_URL - unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY - assert unstructured_api_url is not None, "unstructured_api_url is required" - assert unstructured_api_key is not None, "unstructured_api_key is required" extractor: Optional[BaseExtractor] = None if etl_type == "Unstructured": + unstructured_api_url = dify_config.UNSTRUCTURED_API_URL + unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or "" + if file_extension in {".xlsx", ".xls"}: extractor = ExcelExtractor(file_path) elif file_extension == ".pdf": diff --git a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py index 9647dedfff..f1fa5dde5c 100644 --- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py @@ -1,5 +1,6 @@ import base64 import logging +from typing import Optional from bs4 import BeautifulSoup # type: ignore @@ -15,7 +16,7 @@ class UnstructuredEmailExtractor(BaseExtractor): file_path: Path to the file to load. """ - def __init__(self, file_path: str, api_url: str, api_key: str): + def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): """Initialize with file path.""" self._file_path = file_path self._api_url = api_url diff --git a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py index 80c29157aa..35ca686f62 100644 --- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py @@ -19,7 +19,7 @@ class UnstructuredEpubExtractor(BaseExtractor): self, file_path: str, api_url: Optional[str] = None, - api_key: Optional[str] = None, + api_key: str = "", ): """Initialize with file path.""" self._file_path = file_path @@ -30,9 +30,6 @@ class UnstructuredEpubExtractor(BaseExtractor): if self._api_url: from unstructured.partition.api import partition_via_api - if self._api_key is None: - raise ValueError("api_key is required") - elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key) else: from unstructured.partition.epub import partition_epub diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py index 4173d4d122..d5418e612a 100644 --- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -24,7 +25,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor): if the specified encoding fails. """ - def __init__(self, file_path: str, api_url: str, api_key: str): + def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): """Initialize with file path.""" self._file_path = file_path self._api_url = api_url diff --git a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py index 57affb8d36..d363449c29 100644 --- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -14,7 +15,7 @@ class UnstructuredMsgExtractor(BaseExtractor): file_path: Path to the file to load. """ - def __init__(self, file_path: str, api_url: str, api_key: str): + def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): """Initialize with file path.""" self._file_path = file_path self._api_url = api_url diff --git a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py index e504d4bc23..ecc272a2f0 100644 --- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -14,7 +15,7 @@ class UnstructuredPPTExtractor(BaseExtractor): file_path: Path to the file to load. """ - def __init__(self, file_path: str, api_url: str, api_key: str): + def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): """Initialize with file path.""" self._file_path = file_path self._api_url = api_url diff --git a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py index cefe72b290..e7bf6fd2e6 100644 --- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -14,7 +15,7 @@ class UnstructuredPPTXExtractor(BaseExtractor): file_path: Path to the file to load. """ - def __init__(self, file_path: str, api_url: str, api_key: str): + def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): """Initialize with file path.""" self._file_path = file_path self._api_url = api_url diff --git a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py index ef46ab0e70..916cdc3f2b 100644 --- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from core.rag.extractor.extractor_base import BaseExtractor from core.rag.models.document import Document @@ -14,7 +15,7 @@ class UnstructuredXmlExtractor(BaseExtractor): file_path: Path to the file to load. """ - def __init__(self, file_path: str, api_url: str, api_key: str): + def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): """Initialize with file path.""" self._file_path = file_path self._api_url = api_url