From e4f686deb71e59b8d36a6c31a5480a676f522a34 Mon Sep 17 00:00:00 2001
From: Vikey Chen <vikeytk@gmail.com>
Date: Wed, 3 Apr 2024 21:00:20 +0800
Subject: [PATCH] fix unstructured api, remove unused parameters (#3056)

---
 .../rag/extractor/unstructured/unstructured_eml_extractor.py  | 2 +-
 .../extractor/unstructured/unstructured_markdown_extractor.py | 2 +-
 .../rag/extractor/unstructured/unstructured_msg_extractor.py  | 2 +-
 .../rag/extractor/unstructured/unstructured_ppt_extractor.py  | 4 ++--
 .../rag/extractor/unstructured/unstructured_pptx_extractor.py | 2 +-
 .../rag/extractor/unstructured/unstructured_text_extractor.py | 2 +-
 .../rag/extractor/unstructured/unstructured_xml_extractor.py  | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
index f6ae8fad53..2e704f187d 100644
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
 
     def extract(self) -> list[Document]:
         from unstructured.partition.email import partition_email
-        elements = partition_email(filename=self._file_path, api_url=self._api_url)
+        elements = partition_email(filename=self._file_path)
 
         # noinspection PyBroadException
         try:
diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
index 3d63446fef..144b4e0c1d 100644
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@@ -36,7 +36,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
     def extract(self) -> list[Document]:
         from unstructured.partition.md import partition_md
 
-        elements = partition_md(filename=self._file_path, api_url=self._api_url)
+        elements = partition_md(filename=self._file_path)
         from unstructured.chunking.title import chunk_by_title
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []
diff --git a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
index 34d3e8021a..ad09b79eb0 100644
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
     def extract(self) -> list[Document]:
         from unstructured.partition.msg import partition_msg
 
-        elements = partition_msg(filename=self._file_path, api_url=self._api_url)
+        elements = partition_msg(filename=self._file_path)
         from unstructured.chunking.title import chunk_by_title
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []
diff --git a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
index cd3aba9866..6d3ffe6589 100644
--- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
@@ -24,9 +24,9 @@ class UnstructuredPPTExtractor(BaseExtractor):
         self._api_url = api_url
 
     def extract(self) -> list[Document]:
-        from unstructured.partition.ppt import partition_ppt
+        from unstructured.partition.api import partition_via_api
 
-        elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
+        elements = partition_via_api(filename=self._file_path, api_url=self._api_url)
         text_by_page = {}
         for element in elements:
             page = element.metadata.page_number
diff --git a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
index f9667d2527..6fcbb5feb9 100644
--- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
     def extract(self) -> list[Document]:
         from unstructured.partition.pptx import partition_pptx
 
-        elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
+        elements = partition_pptx(filename=self._file_path)
         text_by_page = {}
         for element in elements:
             page = element.metadata.page_number
diff --git a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
index cc67f2b866..f4a4adbc16 100644
--- a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredTextExtractor(BaseExtractor):
     def extract(self) -> list[Document]:
         from unstructured.partition.text import partition_text
 
-        elements = partition_text(filename=self._file_path, api_url=self._api_url)
+        elements = partition_text(filename=self._file_path)
         from unstructured.chunking.title import chunk_by_title
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []
diff --git a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
index 5600fb075d..6aef8e0f7e 100644
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
     def extract(self) -> list[Document]:
         from unstructured.partition.xml import partition_xml
 
-        elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
+        elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
         from unstructured.chunking.title import chunk_by_title
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
         documents = []