mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-06-04 11:14:10 +08:00
nltk security issue and upgrade unstructured (#9558)
This commit is contained in:
parent
ecc8beef3f
commit
3e9d271b52
@ -15,7 +15,9 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
|
|||||||
|
|
||||||
if dify_config.ETL_TYPE == "Unstructured":
|
if dify_config.ETL_TYPE == "Unstructured":
|
||||||
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
|
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
|
||||||
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
|
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
|
||||||
|
if dify_config.UNSTRUCTURED_API_URL:
|
||||||
|
DOCUMENT_EXTENSIONS.append("ppt")
|
||||||
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
||||||
else:
|
else:
|
||||||
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
|
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
|
||||||
|
@ -21,6 +21,7 @@ from core.rag.extractor.unstructured.unstructured_eml_extractor import Unstructu
|
|||||||
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
|
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
|
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
|
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
|
||||||
|
from core.rag.extractor.unstructured.unstructured_pdf_extractor import UnstructuredPDFExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
|
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
|
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
|
from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
|
||||||
@ -102,10 +103,10 @@ class ExtractProcessor:
|
|||||||
if file_extension in {".xlsx", ".xls"}:
|
if file_extension in {".xlsx", ".xls"}:
|
||||||
extractor = ExcelExtractor(file_path)
|
extractor = ExcelExtractor(file_path)
|
||||||
elif file_extension == ".pdf":
|
elif file_extension == ".pdf":
|
||||||
extractor = PdfExtractor(file_path)
|
extractor = UnstructuredPDFExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||||
elif file_extension in {".md", ".markdown"}:
|
elif file_extension in {".md", ".markdown"}:
|
||||||
extractor = (
|
extractor = (
|
||||||
UnstructuredMarkdownExtractor(file_path, unstructured_api_url)
|
UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||||
if is_automatic
|
if is_automatic
|
||||||
else MarkdownExtractor(file_path, autodetect_encoding=True)
|
else MarkdownExtractor(file_path, autodetect_encoding=True)
|
||||||
)
|
)
|
||||||
@ -116,17 +117,17 @@ class ExtractProcessor:
|
|||||||
elif file_extension == ".csv":
|
elif file_extension == ".csv":
|
||||||
extractor = CSVExtractor(file_path, autodetect_encoding=True)
|
extractor = CSVExtractor(file_path, autodetect_encoding=True)
|
||||||
elif file_extension == ".msg":
|
elif file_extension == ".msg":
|
||||||
extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url)
|
extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||||
elif file_extension == ".eml":
|
elif file_extension == ".eml":
|
||||||
extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url)
|
extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||||
elif file_extension == ".ppt":
|
elif file_extension == ".ppt":
|
||||||
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||||
elif file_extension == ".pptx":
|
elif file_extension == ".pptx":
|
||||||
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
|
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||||
elif file_extension == ".xml":
|
elif file_extension == ".xml":
|
||||||
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url)
|
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||||
elif file_extension == ".epub":
|
elif file_extension == ".epub":
|
||||||
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url)
|
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
|
||||||
else:
|
else:
|
||||||
# txt
|
# txt
|
||||||
extractor = (
|
extractor = (
|
||||||
|
@ -10,21 +10,23 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredEmailExtractor(BaseExtractor):
|
class UnstructuredEmailExtractor(BaseExtractor):
|
||||||
"""Load msg files.
|
"""Load eml files.
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, file_path: str, api_url: str, api_key: str):
|
||||||
self,
|
|
||||||
file_path: str,
|
|
||||||
api_url: str,
|
|
||||||
):
|
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
self._api_key = api_key
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
|
if self._api_url:
|
||||||
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||||
|
else:
|
||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
|
|
||||||
elements = partition_email(filename=self._file_path)
|
elements = partition_email(filename=self._file_path)
|
||||||
|
@ -19,15 +19,23 @@ class UnstructuredEpubExtractor(BaseExtractor):
|
|||||||
self,
|
self,
|
||||||
file_path: str,
|
file_path: str,
|
||||||
api_url: Optional[str] = None,
|
api_url: Optional[str] = None,
|
||||||
|
api_key: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
self._api_key = api_key
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
|
if self._api_url:
|
||||||
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||||
|
else:
|
||||||
from unstructured.partition.epub import partition_epub
|
from unstructured.partition.epub import partition_epub
|
||||||
|
|
||||||
elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
|
elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
|
||||||
|
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
|
|
||||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||||
|
@ -24,16 +24,18 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
|
|||||||
if the specified encoding fails.
|
if the specified encoding fails.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, file_path: str, api_url: str, api_key: str):
|
||||||
self,
|
|
||||||
file_path: str,
|
|
||||||
api_url: str,
|
|
||||||
):
|
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
self._api_key = api_key
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
|
if self._api_url:
|
||||||
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||||
|
else:
|
||||||
from unstructured.partition.md import partition_md
|
from unstructured.partition.md import partition_md
|
||||||
|
|
||||||
elements = partition_md(filename=self._file_path)
|
elements = partition_md(filename=self._file_path)
|
||||||
|
@ -14,12 +14,18 @@ class UnstructuredMsgExtractor(BaseExtractor):
|
|||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, api_url: str):
|
def __init__(self, file_path: str, api_url: str, api_key: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
self._api_key = api_key
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
|
if self._api_url:
|
||||||
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||||
|
else:
|
||||||
from unstructured.partition.msg import partition_msg
|
from unstructured.partition.msg import partition_msg
|
||||||
|
|
||||||
elements = partition_msg(filename=self._file_path)
|
elements = partition_msg(filename=self._file_path)
|
||||||
|
@ -0,0 +1,47 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
|
from core.rag.models.document import Document
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredPDFExtractor(BaseExtractor):
    """Extract the text of a PDF file with the ``unstructured`` library.

    Args:
        file_path: Path to the PDF file to load.
        api_url: Unstructured API URL. When it is falsy, ``partition_pdf``
            is used instead of ``partition_via_api``.
        api_key: Unstructured API key, passed to ``partition_via_api``.
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
        """Store the file path and the Unstructured API settings."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        """Partition the PDF, chunk it by title and return one Document per chunk."""
        # Imports stay inside the branches so only the partitioner that is
        # actually used gets imported.
        if not self._api_url:
            from unstructured.partition.pdf import partition_pdf

            parsed = partition_pdf(filename=self._file_path, strategy="auto")
        else:
            from unstructured.partition.api import partition_via_api

            parsed = partition_via_api(
                filename=self._file_path,
                api_url=self._api_url,
                api_key=self._api_key,
                strategy="auto",
            )

        from unstructured.chunking.title import chunk_by_title

        pieces = chunk_by_title(parsed, max_characters=2000, combine_text_under_n_chars=2000)

        # Each chunk's text is stripped of surrounding whitespace before wrapping.
        return [Document(page_content=piece.text.strip()) for piece in pieces]
|
@ -7,7 +7,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredPPTExtractor(BaseExtractor):
|
class UnstructuredPPTExtractor(BaseExtractor):
|
||||||
"""Load msg files.
|
"""Load ppt files.
|
||||||
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -21,9 +21,12 @@ class UnstructuredPPTExtractor(BaseExtractor):
|
|||||||
self._api_key = api_key
|
self._api_key = api_key
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
|
if self._api_url:
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError("Unstructured API Url is not configured")
|
||||||
text_by_page = {}
|
text_by_page = {}
|
||||||
for element in elements:
|
for element in elements:
|
||||||
page = element.metadata.page_number
|
page = element.metadata.page_number
|
||||||
|
@ -7,19 +7,25 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredPPTXExtractor(BaseExtractor):
|
class UnstructuredPPTXExtractor(BaseExtractor):
|
||||||
"""Load msg files.
|
"""Load pptx files.
|
||||||
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, api_url: str):
|
def __init__(self, file_path: str, api_url: str, api_key: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
self._api_key = api_key
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
|
if self._api_url:
|
||||||
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||||
|
else:
|
||||||
from unstructured.partition.pptx import partition_pptx
|
from unstructured.partition.pptx import partition_pptx
|
||||||
|
|
||||||
elements = partition_pptx(filename=self._file_path)
|
elements = partition_pptx(filename=self._file_path)
|
||||||
|
@ -7,22 +7,29 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredXmlExtractor(BaseExtractor):
|
class UnstructuredXmlExtractor(BaseExtractor):
|
||||||
"""Load msg files.
|
"""Load xml files.
|
||||||
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the file to load.
|
file_path: Path to the file to load.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, api_url: str):
|
def __init__(self, file_path: str, api_url: str, api_key: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
self._file_path = file_path
|
self._file_path = file_path
|
||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
self._api_key = api_key
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
|
if self._api_url:
|
||||||
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
|
||||||
|
else:
|
||||||
from unstructured.partition.xml import partition_xml
|
from unstructured.partition.xml import partition_xml
|
||||||
|
|
||||||
elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
|
elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
|
||||||
|
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
|
|
||||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||||
|
2271
api/poetry.lock
generated
2271
api/poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -172,11 +172,12 @@ sagemaker = "2.231.0"
|
|||||||
scikit-learn = "~1.5.1"
|
scikit-learn = "~1.5.1"
|
||||||
sentry-sdk = { version = "~1.44.1", extras = ["flask"] }
|
sentry-sdk = { version = "~1.44.1", extras = ["flask"] }
|
||||||
sqlalchemy = "~2.0.29"
|
sqlalchemy = "~2.0.29"
|
||||||
|
starlette = "0.41.0"
|
||||||
tencentcloud-sdk-python-hunyuan = "~3.0.1158"
|
tencentcloud-sdk-python-hunyuan = "~3.0.1158"
|
||||||
tiktoken = "~0.8.0"
|
tiktoken = "~0.8.0"
|
||||||
tokenizers = "~0.15.0"
|
tokenizers = "~0.15.0"
|
||||||
transformers = "~4.35.0"
|
transformers = "~4.35.0"
|
||||||
unstructured = { version = "~0.10.27", extras = ["docx", "epub", "md", "msg", "ppt", "pptx"] }
|
unstructured = { version = "~0.15.7", extras = ["docx", "epub", "md", "msg", "ppt", "pptx", "pdf"] }
|
||||||
validators = "0.21.0"
|
validators = "0.21.0"
|
||||||
volcengine-python-sdk = {extras = ["ark"], version = "~1.0.98"}
|
volcengine-python-sdk = {extras = ["ark"], version = "~1.0.98"}
|
||||||
websocket-client = "~1.7.0"
|
websocket-client = "~1.7.0"
|
||||||
@ -206,7 +207,7 @@ duckduckgo-search = "~6.3.0"
|
|||||||
jsonpath-ng = "1.6.1"
|
jsonpath-ng = "1.6.1"
|
||||||
matplotlib = "~3.8.2"
|
matplotlib = "~3.8.2"
|
||||||
newspaper3k = "0.2.8"
|
newspaper3k = "0.2.8"
|
||||||
nltk = "3.8.1"
|
nltk = "3.9.1"
|
||||||
numexpr = "~2.9.0"
|
numexpr = "~2.9.0"
|
||||||
pydub = "~0.25.1"
|
pydub = "~0.25.1"
|
||||||
qrcode = "~7.4.2"
|
qrcode = "~7.4.2"
|
||||||
|
@ -6,9 +6,4 @@ pytest api/tests/integration_tests/vdb/chroma \
|
|||||||
api/tests/integration_tests/vdb/pgvecto_rs \
|
api/tests/integration_tests/vdb/pgvecto_rs \
|
||||||
api/tests/integration_tests/vdb/pgvector \
|
api/tests/integration_tests/vdb/pgvector \
|
||||||
api/tests/integration_tests/vdb/qdrant \
|
api/tests/integration_tests/vdb/qdrant \
|
||||||
api/tests/integration_tests/vdb/weaviate \
|
api/tests/integration_tests/vdb/weaviate
|
||||||
api/tests/integration_tests/vdb/elasticsearch \
|
|
||||||
api/tests/integration_tests/vdb/vikingdb \
|
|
||||||
api/tests/integration_tests/vdb/baidu \
|
|
||||||
api/tests/integration_tests/vdb/tcvectordb \
|
|
||||||
api/tests/integration_tests/vdb/upstash
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user