mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-14 05:36:02 +08:00
fix unstructured setting (#12116)
This commit is contained in:
parent
49feff082f
commit
811e4bd0cf
@ -601,7 +601,7 @@ class RagEtlConfig(BaseSettings):
|
|||||||
|
|
||||||
UNSTRUCTURED_API_KEY: Optional[str] = Field(
|
UNSTRUCTURED_API_KEY: Optional[str] = Field(
|
||||||
description="API key for Unstructured.io service",
|
description="API key for Unstructured.io service",
|
||||||
default=None,
|
default="",
|
||||||
)
|
)
|
||||||
|
|
||||||
SCARF_NO_ANALYTICS: Optional[str] = Field(
|
SCARF_NO_ANALYTICS: Optional[str] = Field(
|
||||||
|
@ -102,12 +102,11 @@ class ExtractProcessor:
|
|||||||
input_file = Path(file_path)
|
input_file = Path(file_path)
|
||||||
file_extension = input_file.suffix.lower()
|
file_extension = input_file.suffix.lower()
|
||||||
etl_type = dify_config.ETL_TYPE
|
etl_type = dify_config.ETL_TYPE
|
||||||
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
|
|
||||||
unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY
|
|
||||||
assert unstructured_api_url is not None, "unstructured_api_url is required"
|
|
||||||
assert unstructured_api_key is not None, "unstructured_api_key is required"
|
|
||||||
extractor: Optional[BaseExtractor] = None
|
extractor: Optional[BaseExtractor] = None
|
||||||
if etl_type == "Unstructured":
|
if etl_type == "Unstructured":
|
||||||
|
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
|
||||||
|
unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
|
||||||
|
|
||||||
if file_extension in {".xlsx", ".xls"}:
|
if file_extension in {".xlsx", ".xls"}:
|
||||||
extractor = ExcelExtractor(file_path)
|
extractor = ExcelExtractor(file_path)
|
||||||
elif file_extension == ".pdf":
|
elif file_extension == ".pdf":
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
import base64
|
import base64
|
||||||
import logging
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from bs4 import BeautifulSoup # type: ignore
|
from bs4 import BeautifulSoup # type: ignore
|
||||||
|
|
||||||
@ -15,7 +16,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
|
|||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, api_url: str, api_key: str):
|
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
@ -19,7 +19,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
|
|||||||
self,
|
self,
|
||||||
file_path: str,
|
file_path: str,
|
||||||
api_url: Optional[str] = None,
|
api_url: Optional[str] = None,
|
||||||
api_key: Optional[str] = None,
|
api_key: str = "",
|
||||||
):
|
):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
@ -30,9 +30,6 @@ class UnstructuredEpubExtractor(BaseExtractor):
|
|||||||
if self._api_url:
|
if self._api_url:
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
if self._api_key is None:
|
|
||||||
raise ValueError("api_key is required")
|
|
||||||
|
|
||||||
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||||
else:
|
else:
|
||||||
from unstructured.partition.epub import partition_epub
|
from unstructured.partition.epub import partition_epub
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -24,7 +25,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
|
|||||||
if the specified encoding fails.
|
if the specified encoding fails.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, api_url: str, api_key: str):
|
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -14,7 +15,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
|
|||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, api_url: str, api_key: str):
|
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -14,7 +15,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
|
|||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, api_url: str, api_key: str):
|
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -14,7 +15,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
|
|||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, api_url: str, api_key: str):
|
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -14,7 +15,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
|
|||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, api_url: str, api_key: str):
|
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
Loading…
x
Reference in New Issue
Block a user