mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 05:09:03 +08:00
fix unstructured api,remove unused parameters (#3056)
This commit is contained in:
parent
d241d66a69
commit
e4f686deb7
@ -26,7 +26,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
|
|||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
elements = partition_email(filename=self._file_path, api_url=self._api_url)
|
elements = partition_email(filename=self._file_path)
|
||||||
|
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
|
@ -36,7 +36,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
|
|||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
from unstructured.partition.md import partition_md
|
from unstructured.partition.md import partition_md
|
||||||
|
|
||||||
elements = partition_md(filename=self._file_path, api_url=self._api_url)
|
elements = partition_md(filename=self._file_path)
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||||
documents = []
|
documents = []
|
||||||
|
@ -26,7 +26,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
|
|||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
from unstructured.partition.msg import partition_msg
|
from unstructured.partition.msg import partition_msg
|
||||||
|
|
||||||
elements = partition_msg(filename=self._file_path, api_url=self._api_url)
|
elements = partition_msg(filename=self._file_path)
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||||
documents = []
|
documents = []
|
||||||
|
@ -24,9 +24,9 @@ class UnstructuredPPTExtractor(BaseExtractor):
|
|||||||
self._api_url = api_url
|
self._api_url = api_url
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
from unstructured.partition.ppt import partition_ppt
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
|
elements = partition_via_api(filename=self._file_path, api_url=self._api_url)
|
||||||
text_by_page = {}
|
text_by_page = {}
|
||||||
for element in elements:
|
for element in elements:
|
||||||
page = element.metadata.page_number
|
page = element.metadata.page_number
|
||||||
|
@ -26,7 +26,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
|
|||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
from unstructured.partition.pptx import partition_pptx
|
from unstructured.partition.pptx import partition_pptx
|
||||||
|
|
||||||
elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
|
elements = partition_pptx(filename=self._file_path)
|
||||||
text_by_page = {}
|
text_by_page = {}
|
||||||
for element in elements:
|
for element in elements:
|
||||||
page = element.metadata.page_number
|
page = element.metadata.page_number
|
||||||
|
@ -26,7 +26,7 @@ class UnstructuredTextExtractor(BaseExtractor):
|
|||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
from unstructured.partition.text import partition_text
|
from unstructured.partition.text import partition_text
|
||||||
|
|
||||||
elements = partition_text(filename=self._file_path, api_url=self._api_url)
|
elements = partition_text(filename=self._file_path)
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||||
documents = []
|
documents = []
|
||||||
|
@ -26,7 +26,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
|
|||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
|
||||||
from unstructured.partition.xml import partition_xml
|
from unstructured.partition.xml import partition_xml
|
||||||
|
|
||||||
elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
|
elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
|
||||||
from unstructured.chunking.title import chunk_by_title
|
from unstructured.chunking.title import chunk_by_title
|
||||||
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
|
||||||
documents = []
|
documents = []
|
||||||
|
Loading…
x
Reference in New Issue
Block a user