nltk security issue and upgrade unstructured (#9558)

This commit is contained in:
Jyong 2024-10-23 16:23:55 +08:00 committed by GitHub
parent ecc8beef3f
commit 3e9d271b52
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 1664 additions and 781 deletions

View File

@ -15,7 +15,9 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
if dify_config.ETL_TYPE == "Unstructured": if dify_config.ETL_TYPE == "Unstructured":
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"] DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub")) DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
if dify_config.UNSTRUCTURED_API_URL:
DOCUMENT_EXTENSIONS.append("ppt")
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
else: else:
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"] DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]

View File

@ -21,6 +21,7 @@ from core.rag.extractor.unstructured.unstructured_eml_extractor import Unstructu
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
from core.rag.extractor.unstructured.unstructured_pdf_extractor import UnstructuredPDFExtractor
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
@ -102,10 +103,10 @@ class ExtractProcessor:
if file_extension in {".xlsx", ".xls"}: if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path) extractor = ExcelExtractor(file_path)
elif file_extension == ".pdf": elif file_extension == ".pdf":
extractor = PdfExtractor(file_path) extractor = UnstructuredPDFExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension in {".md", ".markdown"}: elif file_extension in {".md", ".markdown"}:
extractor = ( extractor = (
UnstructuredMarkdownExtractor(file_path, unstructured_api_url) UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)
if is_automatic if is_automatic
else MarkdownExtractor(file_path, autodetect_encoding=True) else MarkdownExtractor(file_path, autodetect_encoding=True)
) )
@ -116,17 +117,17 @@ class ExtractProcessor:
elif file_extension == ".csv": elif file_extension == ".csv":
extractor = CSVExtractor(file_path, autodetect_encoding=True) extractor = CSVExtractor(file_path, autodetect_encoding=True)
elif file_extension == ".msg": elif file_extension == ".msg":
extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url) extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".eml": elif file_extension == ".eml":
extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url) extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".ppt": elif file_extension == ".ppt":
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key) extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".pptx": elif file_extension == ".pptx":
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url) extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".xml": elif file_extension == ".xml":
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url) extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".epub": elif file_extension == ".epub":
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url) extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
else: else:
# txt # txt
extractor = ( extractor = (

View File

@ -10,21 +10,23 @@ logger = logging.getLogger(__name__)
class UnstructuredEmailExtractor(BaseExtractor): class UnstructuredEmailExtractor(BaseExtractor):
"""Load msg files. """Load eml files.
Args: Args:
file_path: Path to the file to load. file_path: Path to the file to load.
""" """
def __init__( def __init__(self, file_path: str, api_url: str, api_key: str):
self,
file_path: str,
api_url: str,
):
"""Initialize with file path.""" """Initialize with file path."""
self._file_path = file_path self._file_path = file_path
self._api_url = api_url self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
elements = partition_email(filename=self._file_path) elements = partition_email(filename=self._file_path)

View File

@ -19,15 +19,23 @@ class UnstructuredEpubExtractor(BaseExtractor):
self, self,
file_path: str, file_path: str,
api_url: Optional[str] = None, api_url: Optional[str] = None,
api_key: Optional[str] = None,
): ):
"""Initialize with file path.""" """Initialize with file path."""
self._file_path = file_path self._file_path = file_path
self._api_url = api_url self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.epub import partition_epub from unstructured.partition.epub import partition_epub
elements = partition_epub(filename=self._file_path, xml_keep_tags=True) elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)

View File

@ -24,16 +24,18 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
if the specified encoding fails. if the specified encoding fails.
""" """
def __init__( def __init__(self, file_path: str, api_url: str, api_key: str):
self,
file_path: str,
api_url: str,
):
"""Initialize with file path.""" """Initialize with file path."""
self._file_path = file_path self._file_path = file_path
self._api_url = api_url self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.md import partition_md from unstructured.partition.md import partition_md
elements = partition_md(filename=self._file_path) elements = partition_md(filename=self._file_path)

View File

@ -14,12 +14,18 @@ class UnstructuredMsgExtractor(BaseExtractor):
file_path: Path to the file to load. file_path: Path to the file to load.
""" """
def __init__(self, file_path: str, api_url: str): def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path.""" """Initialize with file path."""
self._file_path = file_path self._file_path = file_path
self._api_url = api_url self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.msg import partition_msg from unstructured.partition.msg import partition_msg
elements = partition_msg(filename=self._file_path) elements = partition_msg(filename=self._file_path)

View File

@ -0,0 +1,47 @@
import logging
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
logger = logging.getLogger(__name__)
class UnstructuredPDFExtractor(BaseExtractor):
    """Extract text from a PDF file with the ``unstructured`` library.

    Args:
        file_path: Path to the PDF file to load.
        api_url: Unstructured API URL; when falsy the file is partitioned
            with the local ``partition_pdf`` instead of the API.
        api_key: Unstructured API key, passed to ``partition_via_api``.
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
        """Remember the file location and the API settings."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        """Partition the PDF, chunk it by title and wrap each chunk in a Document."""
        # Imports stay local to the branch that needs them, as in the original.
        if not self._api_url:
            from unstructured.partition.pdf import partition_pdf

            elements = partition_pdf(filename=self._file_path, strategy="auto")
        else:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(
                filename=self._file_path,
                api_url=self._api_url,
                api_key=self._api_key,
                strategy="auto",
            )

        from unstructured.chunking.title import chunk_by_title

        pieces = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        # One Document per chunk, with surrounding whitespace removed.
        return [Document(page_content=piece.text.strip()) for piece in pieces]

View File

@ -7,7 +7,7 @@ logger = logging.getLogger(__name__)
class UnstructuredPPTExtractor(BaseExtractor): class UnstructuredPPTExtractor(BaseExtractor):
"""Load msg files. """Load ppt files.
Args: Args:
@ -21,9 +21,12 @@ class UnstructuredPPTExtractor(BaseExtractor):
self._api_key = api_key self._api_key = api_key
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key) elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
raise NotImplementedError("Unstructured API Url is not configured")
text_by_page = {} text_by_page = {}
for element in elements: for element in elements:
page = element.metadata.page_number page = element.metadata.page_number

View File

@ -7,19 +7,25 @@ logger = logging.getLogger(__name__)
class UnstructuredPPTXExtractor(BaseExtractor): class UnstructuredPPTXExtractor(BaseExtractor):
"""Load msg files. """Load pptx files.
Args: Args:
file_path: Path to the file to load. file_path: Path to the file to load.
""" """
def __init__(self, file_path: str, api_url: str): def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path.""" """Initialize with file path."""
self._file_path = file_path self._file_path = file_path
self._api_url = api_url self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.pptx import partition_pptx from unstructured.partition.pptx import partition_pptx
elements = partition_pptx(filename=self._file_path) elements = partition_pptx(filename=self._file_path)

View File

@ -7,22 +7,29 @@ logger = logging.getLogger(__name__)
class UnstructuredXmlExtractor(BaseExtractor): class UnstructuredXmlExtractor(BaseExtractor):
"""Load msg files. """Load xml files.
Args: Args:
file_path: Path to the file to load. file_path: Path to the file to load.
""" """
def __init__(self, file_path: str, api_url: str): def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path.""" """Initialize with file path."""
self._file_path = file_path self._file_path = file_path
self._api_url = api_url self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
if self._api_url:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.xml import partition_xml from unstructured.partition.xml import partition_xml
elements = partition_xml(filename=self._file_path, xml_keep_tags=True) elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title from unstructured.chunking.title import chunk_by_title
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)

2271
api/poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -172,11 +172,12 @@ sagemaker = "2.231.0"
scikit-learn = "~1.5.1" scikit-learn = "~1.5.1"
sentry-sdk = { version = "~1.44.1", extras = ["flask"] } sentry-sdk = { version = "~1.44.1", extras = ["flask"] }
sqlalchemy = "~2.0.29" sqlalchemy = "~2.0.29"
starlette = "0.41.0"
tencentcloud-sdk-python-hunyuan = "~3.0.1158" tencentcloud-sdk-python-hunyuan = "~3.0.1158"
tiktoken = "~0.8.0" tiktoken = "~0.8.0"
tokenizers = "~0.15.0" tokenizers = "~0.15.0"
transformers = "~4.35.0" transformers = "~4.35.0"
unstructured = { version = "~0.10.27", extras = ["docx", "epub", "md", "msg", "ppt", "pptx"] } unstructured = { version = "~0.15.7", extras = ["docx", "epub", "md", "msg", "ppt", "pptx", "pdf"] }
validators = "0.21.0" validators = "0.21.0"
volcengine-python-sdk = {extras = ["ark"], version = "~1.0.98"} volcengine-python-sdk = {extras = ["ark"], version = "~1.0.98"}
websocket-client = "~1.7.0" websocket-client = "~1.7.0"
@ -206,7 +207,7 @@ duckduckgo-search = "~6.3.0"
jsonpath-ng = "1.6.1" jsonpath-ng = "1.6.1"
matplotlib = "~3.8.2" matplotlib = "~3.8.2"
newspaper3k = "0.2.8" newspaper3k = "0.2.8"
nltk = "3.8.1" nltk = "3.9.1"
numexpr = "~2.9.0" numexpr = "~2.9.0"
pydub = "~0.25.1" pydub = "~0.25.1"
qrcode = "~7.4.2" qrcode = "~7.4.2"

View File

@ -6,9 +6,4 @@ pytest api/tests/integration_tests/vdb/chroma \
api/tests/integration_tests/vdb/pgvecto_rs \ api/tests/integration_tests/vdb/pgvecto_rs \
api/tests/integration_tests/vdb/pgvector \ api/tests/integration_tests/vdb/pgvector \
api/tests/integration_tests/vdb/qdrant \ api/tests/integration_tests/vdb/qdrant \
api/tests/integration_tests/vdb/weaviate \ api/tests/integration_tests/vdb/weaviate
api/tests/integration_tests/vdb/elasticsearch \
api/tests/integration_tests/vdb/vikingdb \
api/tests/integration_tests/vdb/baidu \
api/tests/integration_tests/vdb/tcvectordb \
api/tests/integration_tests/vdb/upstash