From 43335b5c871627692dfa8f8242b18323c12db044 Mon Sep 17 00:00:00 2001
From: Jyong <76649700+JohnJyong@users.noreply.github.com>
Date: Wed, 26 Jun 2024 12:51:50 +0800
Subject: [PATCH] delete the deprecated method (#5612)

---
 api/controllers/console/datasets/datasets.py     |  4 +++-
 .../console/datasets/datasets_document.py        |  5 +++++
 api/controllers/console/datasets/error.py        |  6 ++++++
 api/core/rag/datasource/keyword/jieba/jieba.py   | 17 +----------------
 api/core/rag/datasource/keyword/keyword_base.py  |  4 ----
 .../rag/datasource/keyword/keyword_factory.py    |  3 ---
 .../rag/datasource/vdb/milvus/milvus_vector.py   |  6 ------
 .../vdb/opensearch/opensearch_vector.py          |  5 -----
 .../rag/datasource/vdb/oracle/oraclevector.py    |  7 -------
 .../rag/datasource/vdb/pgvecto_rs/pgvecto_rs.py  |  8 --------
 .../rag/datasource/vdb/relyt/relyt_vector.py     |  5 -----
 .../datasource/vdb/tidb_vector/tidb_vector.py    |  5 -----
 api/core/rag/datasource/vdb/vector_base.py       |  3 ---
 api/core/rag/extractor/excel_extractor.py        | 10 +++++++++-
 .../integration_tests/vdb/milvus/test_milvus.py  |  3 ---
 .../vdb/opensearch/test_opensearch.py            |  9 ---------
 .../vdb/pgvecto_rs/test_pgvecto_rs.py            |  3 ---
 .../integration_tests/vdb/test_vector_store.py   |  5 -----
 .../vdb/tidb_vector/test_tidb_vector.py          |  3 ---
 19 files changed, 24 insertions(+), 87 deletions(-)

diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py
index 619ab4f7e2..d5196aae61 100644
--- a/api/controllers/console/datasets/datasets.py
+++ b/api/controllers/console/datasets/datasets.py
@@ -8,7 +8,7 @@ import services
 from controllers.console import api
 from controllers.console.apikey import api_key_fields, api_key_list
 from controllers.console.app.error import ProviderNotInitializeError
-from controllers.console.datasets.error import DatasetInUseError, DatasetNameDuplicateError
+from controllers.console.datasets.error import DatasetInUseError, DatasetNameDuplicateError, IndexingEstimateError
 from controllers.console.setup import setup_required
 from controllers.console.wraps import account_initialization_required
 from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
@@ -346,6 +346,8 @@ class DatasetIndexingEstimateApi(Resource):
                 "in the Settings -> Model Provider.")
         except ProviderTokenNotInitError as ex:
             raise ProviderNotInitializeError(ex.description)
+        except Exception as e:
+            raise IndexingEstimateError(str(e))

         return response, 200
diff --git a/api/controllers/console/datasets/datasets_document.py b/api/controllers/console/datasets/datasets_document.py
index 976b7df629..b3a253c167 100644
--- a/api/controllers/console/datasets/datasets_document.py
+++ b/api/controllers/console/datasets/datasets_document.py
@@ -20,6 +20,7 @@ from controllers.console.datasets.error import (
     ArchivedDocumentImmutableError,
     DocumentAlreadyFinishedError,
     DocumentIndexingError,
+    IndexingEstimateError,
     InvalidActionError,
     InvalidMetadataError,
 )
@@ -388,6 +389,8 @@ class DocumentIndexingEstimateApi(DocumentResource):
                 "in the Settings -> Model Provider.")
         except ProviderTokenNotInitError as ex:
             raise ProviderNotInitializeError(ex.description)
+        except Exception as e:
+            raise IndexingEstimateError(str(e))

         return response
@@ -493,6 +496,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
                 "in the Settings -> Model Provider.")
         except ProviderTokenNotInitError as ex:
             raise ProviderNotInitializeError(ex.description)
+        except Exception as e:
+            raise IndexingEstimateError(str(e))

         return response
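All three estimate endpoints above gain the same fallback: known provider problems keep their specific exception types, while anything unexpected is re-raised as the new IndexingEstimateError so the API answers with a structured 500 instead of an unhandled exception. A minimal sketch of that pattern in isolation follows; run_estimate and its keyword arguments are placeholders for the controller logic not shown in these hunks, not part of the patch:

# Hedged sketch of the wrapping pattern added above; run_estimate stands in
# for the indexing-estimate call each controller performs.
from controllers.console.app.error import ProviderNotInitializeError
from controllers.console.datasets.error import IndexingEstimateError
from core.errors.error import ProviderTokenNotInitError


def safe_estimate(run_estimate, **kwargs):
    try:
        return run_estimate(**kwargs)
    except ProviderTokenNotInitError as ex:
        # Known configuration problems keep their dedicated error type.
        raise ProviderNotInitializeError(ex.description)
    except Exception as e:
        # Everything else surfaces as a 500 with a readable message via the
        # IndexingEstimateError defined later in this patch.
        raise IndexingEstimateError(str(e))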
diff --git a/api/controllers/console/datasets/error.py b/api/controllers/console/datasets/error.py
index 71476764aa..9270b610c2 100644
--- a/api/controllers/console/datasets/error.py
+++ b/api/controllers/console/datasets/error.py
@@ -83,3 +83,9 @@ class DatasetInUseError(BaseHTTPException):
     error_code = 'dataset_in_use'
     description = "The dataset is being used by some apps. Please remove the dataset from the apps before deleting it."
     code = 409
+
+
+class IndexingEstimateError(BaseHTTPException):
+    error_code = 'indexing_estimate_error'
+    description = "Knowledge indexing estimate failed: {message}"
+    code = 500
diff --git a/api/core/rag/datasource/keyword/jieba/jieba.py b/api/core/rag/datasource/keyword/jieba/jieba.py
index 0f4cbccff7..1a5d3d11df 100644
--- a/api/core/rag/datasource/keyword/jieba/jieba.py
+++ b/api/core/rag/datasource/keyword/jieba/jieba.py
@@ -70,22 +70,6 @@ class Jieba(BaseKeyword):

             self._save_dataset_keyword_table(keyword_table)

-    def delete_by_document_id(self, document_id: str):
-        lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
-        with redis_client.lock(lock_name, timeout=600):
-            # get segment ids by document_id
-            segments = db.session.query(DocumentSegment).filter(
-                DocumentSegment.dataset_id == self.dataset.id,
-                DocumentSegment.document_id == document_id
-            ).all()
-
-            ids = [segment.index_node_id for segment in segments]
-
-            keyword_table = self._get_dataset_keyword_table()
-            keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
-
-            self._save_dataset_keyword_table(keyword_table)
-
     def search(
             self, query: str,
             **kwargs: Any
@@ -104,6 +88,7 @@
             ).first()

             if segment:
+
                 documents.append(Document(
                     page_content=segment.content,
                     metadata={
diff --git a/api/core/rag/datasource/keyword/keyword_base.py b/api/core/rag/datasource/keyword/keyword_base.py
index 84a5800855..02838cb1bd 100644
--- a/api/core/rag/datasource/keyword/keyword_base.py
+++ b/api/core/rag/datasource/keyword/keyword_base.py
@@ -28,10 +28,6 @@ class BaseKeyword(ABC):
     def delete_by_ids(self, ids: list[str]) -> None:
         raise NotImplementedError

-    @abstractmethod
-    def delete_by_document_id(self, document_id: str) -> None:
-        raise NotImplementedError
-
     def delete(self) -> None:
         raise NotImplementedError
diff --git a/api/core/rag/datasource/keyword/keyword_factory.py b/api/core/rag/datasource/keyword/keyword_factory.py
index f5e2bf0f83..beb3322aa6 100644
--- a/api/core/rag/datasource/keyword/keyword_factory.py
+++ b/api/core/rag/datasource/keyword/keyword_factory.py
@@ -39,9 +39,6 @@ class Keyword:
     def delete_by_ids(self, ids: list[str]) -> None:
         self._keyword_processor.delete_by_ids(ids)

-    def delete_by_document_id(self, document_id: str) -> None:
-        self._keyword_processor.delete_by_document_id(document_id)
-
     def delete(self) -> None:
         self._keyword_processor.delete()
diff --git a/api/core/rag/datasource/vdb/milvus/milvus_vector.py b/api/core/rag/datasource/vdb/milvus/milvus_vector.py
index 665a697e1a..02b715d768 100644
--- a/api/core/rag/datasource/vdb/milvus/milvus_vector.py
+++ b/api/core/rag/datasource/vdb/milvus/milvus_vector.py
@@ -100,12 +100,6 @@ class MilvusVector(BaseVector):
                 raise e
         return pks

-    def delete_by_document_id(self, document_id: str):
-
-        ids = self.get_ids_by_metadata_field('document_id', document_id)
-        if ids:
-            self._client.delete(collection_name=self._collection_name, pks=ids)
-
     def get_ids_by_metadata_field(self, key: str, value: str):
         result = self._client.query(collection_name=self._collection_name,
                                     filter=f'metadata["{key}"] == "{value}"',
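With delete_by_document_id removed from the keyword and vector interfaces, per-document deletion goes through the two methods every store still exposes in the hunks above: get_ids_by_metadata_field and delete_by_ids. A hypothetical helper illustrating the replacement call sequence (it is not part of this patch; the 'document_id' metadata key mirrors the one used by the removed implementations):

# Illustrative only: the equivalent of the removed delete_by_document_id,
# built from the methods the stores still expose.
from core.rag.datasource.vdb.vector_base import BaseVector


def delete_document_from_vector_store(vector: BaseVector, document_id: str) -> None:
    # Look up the node ids whose metadata carries this document id, then delete them.
    ids = vector.get_ids_by_metadata_field('document_id', document_id)
    if ids:
        vector.delete_by_ids(ids)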
"{value}"', diff --git a/api/core/rag/datasource/vdb/opensearch/opensearch_vector.py b/api/core/rag/datasource/vdb/opensearch/opensearch_vector.py index 52f8b41bae..744ff2d517 100644 --- a/api/core/rag/datasource/vdb/opensearch/opensearch_vector.py +++ b/api/core/rag/datasource/vdb/opensearch/opensearch_vector.py @@ -87,11 +87,6 @@ class OpenSearchVector(BaseVector): helpers.bulk(self._client, actions) - def delete_by_document_id(self, document_id: str): - ids = self.get_ids_by_metadata_field('document_id', document_id) - if ids: - self.delete_by_ids(ids) - def get_ids_by_metadata_field(self, key: str, value: str): query = {"query": {"term": {f"{Field.METADATA_KEY.value}.{key}": value}}} response = self._client.search(index=self._collection_name.lower(), body=query) diff --git a/api/core/rag/datasource/vdb/oracle/oraclevector.py b/api/core/rag/datasource/vdb/oracle/oraclevector.py index c087ed0cd8..5f7723508c 100644 --- a/api/core/rag/datasource/vdb/oracle/oraclevector.py +++ b/api/core/rag/datasource/vdb/oracle/oraclevector.py @@ -156,13 +156,6 @@ class OracleVector(BaseVector): # idss.append(record[0]) # return idss - #def delete_by_document_id(self, document_id: str): - # ids = self.get_ids_by_metadata_field('doc_id', document_id) - # if len(ids)>0: - # with self._get_cursor() as cur: - # cur.execute(f"delete FROM {self.table_name} d WHERE d.meta.doc_id in '%s'" % ("','".join(ids),)) - - def delete_by_ids(self, ids: list[str]) -> None: with self._get_cursor() as cur: cur.execute(f"DELETE FROM {self.table_name} WHERE id IN %s" % (tuple(ids),)) diff --git a/api/core/rag/datasource/vdb/pgvecto_rs/pgvecto_rs.py b/api/core/rag/datasource/vdb/pgvecto_rs/pgvecto_rs.py index 61cac4f3a3..63c8edfbc3 100644 --- a/api/core/rag/datasource/vdb/pgvecto_rs/pgvecto_rs.py +++ b/api/core/rag/datasource/vdb/pgvecto_rs/pgvecto_rs.py @@ -130,14 +130,6 @@ class PGVectoRS(BaseVector): return pks - def delete_by_document_id(self, document_id: str): - ids = self.get_ids_by_metadata_field('document_id', document_id) - if ids: - with Session(self._client) as session: - select_statement = sql_text(f"DELETE FROM {self._collection_name} WHERE id = ANY(:ids)") - session.execute(select_statement, {'ids': ids}) - session.commit() - def get_ids_by_metadata_field(self, key: str, value: str): result = None with Session(self._client) as session: diff --git a/api/core/rag/datasource/vdb/relyt/relyt_vector.py b/api/core/rag/datasource/vdb/relyt/relyt_vector.py index d2b32324a1..4fe1df717a 100644 --- a/api/core/rag/datasource/vdb/relyt/relyt_vector.py +++ b/api/core/rag/datasource/vdb/relyt/relyt_vector.py @@ -151,11 +151,6 @@ class RelytVector(BaseVector): return ids - def delete_by_document_id(self, document_id: str): - ids = self.get_ids_by_metadata_field('document_id', document_id) - if ids: - self.delete_by_uuids(ids) - def get_ids_by_metadata_field(self, key: str, value: str): result = None with Session(self.client) as session: diff --git a/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py b/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py index 1da0fd554f..5922db1176 100644 --- a/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py +++ b/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py @@ -161,11 +161,6 @@ class TiDBVector(BaseVector): print("Delete operation failed:", str(e)) return False - def delete_by_document_id(self, document_id: str): - ids = self.get_ids_by_metadata_field('document_id', document_id) - if ids: - self._delete_by_ids(ids) - def get_ids_by_metadata_field(self, key: str, value: 
diff --git a/api/core/rag/datasource/vdb/vector_base.py b/api/core/rag/datasource/vdb/vector_base.py
index 9b414e4e12..dbd8b6284b 100644
--- a/api/core/rag/datasource/vdb/vector_base.py
+++ b/api/core/rag/datasource/vdb/vector_base.py
@@ -31,9 +31,6 @@ class BaseVector(ABC):
     def delete_by_ids(self, ids: list[str]) -> None:
         raise NotImplementedError

-    def delete_by_document_id(self, document_id: str):
-        raise NotImplementedError
-
     def get_ids_by_metadata_field(self, key: str, value: str):
         raise NotImplementedError
diff --git a/api/core/rag/extractor/excel_extractor.py b/api/core/rag/extractor/excel_extractor.py
index 931297c95e..2b16275dc8 100644
--- a/api/core/rag/extractor/excel_extractor.py
+++ b/api/core/rag/extractor/excel_extractor.py
@@ -1,4 +1,5 @@
 """Abstract interface for document loader implementations."""
+import os
 from typing import Optional

 import pandas as pd
@@ -29,8 +30,15 @@ class ExcelExtractor(BaseExtractor):
     def extract(self) -> list[Document]:
         """ Load from Excel file in xls or xlsx format using Pandas."""
         documents = []
+        # Determine the file extension
+        file_extension = os.path.splitext(self._file_path)[-1].lower()
         # Read each worksheet of an Excel file using Pandas
-        excel_file = pd.ExcelFile(self._file_path)
+        if file_extension == '.xlsx':
+            excel_file = pd.ExcelFile(self._file_path, engine='openpyxl')
+        elif file_extension == '.xls':
+            excel_file = pd.ExcelFile(self._file_path, engine='xlrd')
+        else:
+            raise ValueError(f"Unsupported file extension: {file_extension}")

         for sheet_name in excel_file.sheet_names:
             df: pd.DataFrame = excel_file.parse(sheet_name=sheet_name)
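The extractor change above pins the pandas engine to the file type: openpyxl reads only .xlsx workbooks, and xlrd 2.x reads only legacy .xls files, so relying on auto-detection can pick the wrong backend or fail outright. A standalone sketch of the same selection logic, outside the ExcelExtractor class; the file paths in the usage comment are purely illustrative:

# Standalone sketch of the extension-to-engine mapping used above.
import os

import pandas as pd


def open_workbook(file_path: str) -> pd.ExcelFile:
    extension = os.path.splitext(file_path)[-1].lower()
    if extension == '.xlsx':
        return pd.ExcelFile(file_path, engine='openpyxl')  # openpyxl handles .xlsx only
    if extension == '.xls':
        return pd.ExcelFile(file_path, engine='xlrd')       # xlrd 2.x handles .xls only
    raise ValueError(f"Unsupported file extension: {extension}")


# Each worksheet can then be parsed into a DataFrame, as extract() does:
# workbook = open_workbook('report.xlsx')
# for sheet_name in workbook.sheet_names:
#     df = workbook.parse(sheet_name=sheet_name)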
diff --git a/api/tests/integration_tests/vdb/milvus/test_milvus.py b/api/tests/integration_tests/vdb/milvus/test_milvus.py
index 2ce85445fb..9c0917ef30 100644
--- a/api/tests/integration_tests/vdb/milvus/test_milvus.py
+++ b/api/tests/integration_tests/vdb/milvus/test_milvus.py
@@ -24,9 +24,6 @@ class MilvusVectorTest(AbstractVectorTest):
         hits_by_full_text = self.vector.search_by_full_text(query=get_example_text())
         assert len(hits_by_full_text) == 0

-    def delete_by_document_id(self):
-        self.vector.delete_by_document_id(document_id=self.example_doc_id)
-
     def get_ids_by_metadata_field(self):
         ids = self.vector.get_ids_by_metadata_field(key='document_id', value=self.example_doc_id)
         assert len(ids) == 1
diff --git a/api/tests/integration_tests/vdb/opensearch/test_opensearch.py b/api/tests/integration_tests/vdb/opensearch/test_opensearch.py
index e372c9b7ac..4a67c39a98 100644
--- a/api/tests/integration_tests/vdb/opensearch/test_opensearch.py
+++ b/api/tests/integration_tests/vdb/opensearch/test_opensearch.py
@@ -91,9 +91,6 @@ class TestOpenSearchVector:
         assert hits_by_vector[0].metadata['document_id'] == self.example_doc_id, \
             f"Expected document ID {self.example_doc_id}, got {hits_by_vector[0].metadata['document_id']}"

-    def test_delete_by_document_id(self):
-        self.vector._client.delete_by_query.return_value = {'deleted': 1}
-
         doc = Document(page_content="Test content to delete", metadata={"document_id": self.example_doc_id})
         embedding = [0.1] * 128

         with patch('opensearchpy.helpers.bulk') as mock_bulk:
             mock_bulk.return_value = ([], [])
             self.vector.add_texts([doc], [embedding])

-        self.vector.delete_by_document_id(document_id=self.example_doc_id)
-
         self.vector._client.search.return_value = {'hits': {'total': {'value': 0}, 'hits': []}}

         ids = self.vector.get_ids_by_metadata_field(key='document_id', value=self.example_doc_id)
@@ -169,10 +164,6 @@ class TestOpenSearchVectorWithRedis:
         expected_doc_id = "example_doc_id"
         self.tester.test_search_by_full_text(search_response, expected_length, expected_doc_id)

-    def test_delete_by_document_id(self):
-        self.tester.setup_method()
-        self.tester.test_delete_by_document_id()
-
     def test_get_ids_by_metadata_field(self):
         self.tester.setup_method()
         self.tester.test_get_ids_by_metadata_field()
diff --git a/api/tests/integration_tests/vdb/pgvecto_rs/test_pgvecto_rs.py b/api/tests/integration_tests/vdb/pgvecto_rs/test_pgvecto_rs.py
index 89a40a92be..e6ce8aab3d 100644
--- a/api/tests/integration_tests/vdb/pgvecto_rs/test_pgvecto_rs.py
+++ b/api/tests/integration_tests/vdb/pgvecto_rs/test_pgvecto_rs.py
@@ -26,9 +26,6 @@ class PGVectoRSVectorTest(AbstractVectorTest):
         hits_by_full_text = self.vector.search_by_full_text(query=get_example_text())
         assert len(hits_by_full_text) == 0

-    def delete_by_document_id(self):
-        self.vector.delete_by_document_id(document_id=self.example_doc_id)
-
     def get_ids_by_metadata_field(self):
         ids = self.vector.get_ids_by_metadata_field(key='document_id', value=self.example_doc_id)
         assert len(ids) == 1
diff --git a/api/tests/integration_tests/vdb/test_vector_store.py b/api/tests/integration_tests/vdb/test_vector_store.py
index 3930daf484..cb35822709 100644
--- a/api/tests/integration_tests/vdb/test_vector_store.py
+++ b/api/tests/integration_tests/vdb/test_vector_store.py
@@ -81,10 +81,6 @@ class AbstractVectorTest:
     def text_exists(self):
         assert self.vector.text_exists(self.example_doc_id)

-    def delete_by_document_id(self):
-        with pytest.raises(NotImplementedError):
-            self.vector.delete_by_document_id(document_id=self.example_doc_id)
-
     def get_ids_by_metadata_field(self):
         with pytest.raises(NotImplementedError):
             self.vector.get_ids_by_metadata_field(key='key', value='value')
@@ -95,7 +91,6 @@
         self.search_by_full_text()
         self.text_exists()
         self.get_ids_by_metadata_field()
-        self.delete_by_document_id()
         added_doc_ids = self.add_texts()
         self.delete_by_ids(added_doc_ids)
         self.delete_vector()
diff --git a/api/tests/integration_tests/vdb/tidb_vector/test_tidb_vector.py b/api/tests/integration_tests/vdb/tidb_vector/test_tidb_vector.py
index 7cd8d22e91..18e00dbedd 100644
--- a/api/tests/integration_tests/vdb/tidb_vector/test_tidb_vector.py
+++ b/api/tests/integration_tests/vdb/tidb_vector/test_tidb_vector.py
@@ -43,9 +43,6 @@ class TiDBVectorTest(AbstractVectorTest):
         ids = self.vector.get_ids_by_metadata_field(key='document_id', value=self.example_doc_id)
         assert len(ids) == 0

-    def delete_by_document_id(self):
-        self.vector.delete_by_document_id(document_id=self.example_doc_id)
-

 def test_tidb_vector(setup_mock_redis, setup_tidbvector_mock, tidb_vector, mock_session):
     TiDBVectorTest(vector=tidb_vector).run_all_tests()