From eb1ce3dd6bc7e8c2883e37da1812a1dac8382c92 Mon Sep 17 00:00:00 2001 From: lauding <719880851@qq.com> Date: Tue, 22 Apr 2025 13:03:35 +0800 Subject: [PATCH] feat: support huawei cloud vector database (#16141) --- api/configs/middleware/__init__.py | 2 + .../middleware/vdb/huawei_cloud_config.py | 25 ++ api/controllers/console/datasets/datasets.py | 2 + .../rag/datasource/vdb/huawei/__init__.py | 0 .../vdb/huawei/huawei_cloud_vector.py | 215 ++++++++++++++++++ api/core/rag/datasource/vdb/vector_factory.py | 4 + api/core/rag/datasource/vdb/vector_type.py | 1 + .../vdb/__mock/huaweicloudvectordb.py | 88 +++++++ .../integration_tests/vdb/huawei/__init__.py | 0 .../vdb/huawei/test_huawei_cloud.py | 28 +++ dev/pytest/pytest_vdb.sh | 1 + docker/.env.example | 5 + docker/docker-compose.yaml | 3 + 13 files changed, 374 insertions(+) create mode 100644 api/configs/middleware/vdb/huawei_cloud_config.py create mode 100644 api/core/rag/datasource/vdb/huawei/__init__.py create mode 100644 api/core/rag/datasource/vdb/huawei/huawei_cloud_vector.py create mode 100644 api/tests/integration_tests/vdb/__mock/huaweicloudvectordb.py create mode 100644 api/tests/integration_tests/vdb/huawei/__init__.py create mode 100644 api/tests/integration_tests/vdb/huawei/test_huawei_cloud.py diff --git a/api/configs/middleware/__init__.py b/api/configs/middleware/__init__.py index 15dfe0063b..c2ad24094a 100644 --- a/api/configs/middleware/__init__.py +++ b/api/configs/middleware/__init__.py @@ -22,6 +22,7 @@ from .vdb.baidu_vector_config import BaiduVectorDBConfig from .vdb.chroma_config import ChromaConfig from .vdb.couchbase_config import CouchbaseConfig from .vdb.elasticsearch_config import ElasticsearchConfig +from .vdb.huawei_cloud_config import HuaweiCloudConfig from .vdb.lindorm_config import LindormConfig from .vdb.milvus_config import MilvusConfig from .vdb.myscale_config import MyScaleConfig @@ -263,6 +264,7 @@ class MiddlewareConfig( VectorStoreConfig, AnalyticdbConfig, ChromaConfig, + HuaweiCloudConfig, MilvusConfig, MyScaleConfig, OpenSearchConfig, diff --git a/api/configs/middleware/vdb/huawei_cloud_config.py b/api/configs/middleware/vdb/huawei_cloud_config.py new file mode 100644 index 0000000000..2290c60499 --- /dev/null +++ b/api/configs/middleware/vdb/huawei_cloud_config.py @@ -0,0 +1,25 @@ +from typing import Optional + +from pydantic import Field +from pydantic_settings import BaseSettings + + +class HuaweiCloudConfig(BaseSettings): + """ + Configuration settings for Huawei cloud search service + """ + + HUAWEI_CLOUD_HOSTS: Optional[str] = Field( + description="Hostname or IP address of the Huawei cloud search service instance", + default=None, + ) + + HUAWEI_CLOUD_USER: Optional[str] = Field( + description="Username for authenticating with Huawei cloud search service", + default=None, + ) + + HUAWEI_CLOUD_PASSWORD: Optional[str] = Field( + description="Password for authenticating with Huawei cloud search service", + default=None, + ) diff --git a/api/controllers/console/datasets/datasets.py b/api/controllers/console/datasets/datasets.py index 4644ac6299..752d124735 100644 --- a/api/controllers/console/datasets/datasets.py +++ b/api/controllers/console/datasets/datasets.py @@ -664,6 +664,7 @@ class DatasetRetrievalSettingApi(Resource): | VectorType.OPENGAUSS | VectorType.OCEANBASE | VectorType.TABLESTORE + | VectorType.HUAWEI_CLOUD | VectorType.TENCENT ): return { @@ -710,6 +711,7 @@ class DatasetRetrievalSettingMockApi(Resource): | VectorType.OCEANBASE | VectorType.TABLESTORE | VectorType.TENCENT + | 
VectorType.HUAWEI_CLOUD ): return { "retrieval_method": [ diff --git a/api/core/rag/datasource/vdb/huawei/__init__.py b/api/core/rag/datasource/vdb/huawei/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/api/core/rag/datasource/vdb/huawei/huawei_cloud_vector.py b/api/core/rag/datasource/vdb/huawei/huawei_cloud_vector.py new file mode 100644 index 0000000000..89423eb160 --- /dev/null +++ b/api/core/rag/datasource/vdb/huawei/huawei_cloud_vector.py @@ -0,0 +1,215 @@ +import json +import logging +import ssl +from typing import Any, Optional + +from elasticsearch import Elasticsearch +from pydantic import BaseModel, model_validator + +from configs import dify_config +from core.rag.datasource.vdb.field import Field +from core.rag.datasource.vdb.vector_base import BaseVector +from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory +from core.rag.datasource.vdb.vector_type import VectorType +from core.rag.embedding.embedding_base import Embeddings +from core.rag.models.document import Document +from extensions.ext_redis import redis_client +from models.dataset import Dataset + +logger = logging.getLogger(__name__) + + +def create_ssl_context() -> ssl.SSLContext: + ssl_context = ssl.create_default_context() + ssl_context.check_hostname = False + ssl_context.verify_mode = ssl.CERT_NONE + return ssl_context + + +class HuaweiCloudVectorConfig(BaseModel): + hosts: str + username: str | None + password: str | None + + @model_validator(mode="before") + @classmethod + def validate_config(cls, values: dict) -> dict: + if not values["hosts"]: + raise ValueError("config HOSTS is required") + return values + + def to_elasticsearch_params(self) -> dict[str, Any]: + params = { + "hosts": self.hosts.split(","), + "verify_certs": False, + "ssl_show_warn": False, + "request_timeout": 30000, + "retry_on_timeout": True, + "max_retries": 10, + } + if self.username and self.password: + params["basic_auth"] = (self.username, self.password) + return params + + +class HuaweiCloudVector(BaseVector): + def __init__(self, index_name: str, config: HuaweiCloudVectorConfig): + super().__init__(index_name.lower()) + self._client = Elasticsearch(**config.to_elasticsearch_params()) + + def get_type(self) -> str: + return VectorType.HUAWEI_CLOUD + + def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs): + uuids = self._get_uuids(documents) + for i in range(len(documents)): + self._client.index( + index=self._collection_name, + id=uuids[i], + document={ + Field.CONTENT_KEY.value: documents[i].page_content, + Field.VECTOR.value: embeddings[i] or None, + Field.METADATA_KEY.value: documents[i].metadata or {}, + }, + ) + self._client.indices.refresh(index=self._collection_name) + return uuids + + def text_exists(self, id: str) -> bool: + return bool(self._client.exists(index=self._collection_name, id=id)) + + def delete_by_ids(self, ids: list[str]) -> None: + if not ids: + return + for id in ids: + self._client.delete(index=self._collection_name, id=id) + + def delete_by_metadata_field(self, key: str, value: str) -> None: + query_str = {"query": {"match": {f"metadata.{key}": f"{value}"}}} + results = self._client.search(index=self._collection_name, body=query_str) + ids = [hit["_id"] for hit in results["hits"]["hits"]] + if ids: + self.delete_by_ids(ids) + + def delete(self) -> None: + self._client.indices.delete(index=self._collection_name) + + def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: + top_k = 
kwargs.get("top_k", 4) + + query = { + "size": top_k, + "query": { + "vector": { + Field.VECTOR.value: { + "vector": query_vector, + "topk": top_k, + } + } + }, + } + + results = self._client.search(index=self._collection_name, body=query) + + docs_and_scores = [] + for hit in results["hits"]["hits"]: + docs_and_scores.append( + ( + Document( + page_content=hit["_source"][Field.CONTENT_KEY.value], + vector=hit["_source"][Field.VECTOR.value], + metadata=hit["_source"][Field.METADATA_KEY.value], + ), + hit["_score"], + ) + ) + + docs = [] + for doc, score in docs_and_scores: + score_threshold = float(kwargs.get("score_threshold") or 0.0) + if score > score_threshold: + if doc.metadata is not None: + doc.metadata["score"] = score + docs.append(doc) + + return docs + + def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]: + query_str = {"match": {Field.CONTENT_KEY.value: query}} + results = self._client.search(index=self._collection_name, query=query_str, size=kwargs.get("top_k", 4)) + docs = [] + for hit in results["hits"]["hits"]: + docs.append( + Document( + page_content=hit["_source"][Field.CONTENT_KEY.value], + vector=hit["_source"][Field.VECTOR.value], + metadata=hit["_source"][Field.METADATA_KEY.value], + ) + ) + + return docs + + def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs): + metadatas = [d.metadata if d.metadata is not None else {} for d in texts] + self.create_collection(embeddings, metadatas) + self.add_texts(texts, embeddings, **kwargs) + + def create_collection( + self, + embeddings: list[list[float]], + metadatas: Optional[list[dict[Any, Any]]] = None, + index_params: Optional[dict] = None, + ): + lock_name = f"vector_indexing_lock_{self._collection_name}" + with redis_client.lock(lock_name, timeout=20): + collection_exist_cache_key = f"vector_indexing_{self._collection_name}" + if redis_client.get(collection_exist_cache_key): + logger.info(f"Collection {self._collection_name} already exists.") + return + + if not self._client.indices.exists(index=self._collection_name): + dim = len(embeddings[0]) + mappings = { + "properties": { + Field.CONTENT_KEY.value: {"type": "text"}, + Field.VECTOR.value: { # Make sure the dimension is correct here + "type": "vector", + "dimension": dim, + "indexing": True, + "algorithm": "GRAPH", + "metric": "cosine", + "neighbors": 32, + "efc": 128, + }, + Field.METADATA_KEY.value: { + "type": "object", + "properties": { + "doc_id": {"type": "keyword"} # Map doc_id to keyword type + }, + }, + } + } + settings = {"index.vector": True} + self._client.indices.create(index=self._collection_name, mappings=mappings, settings=settings) + + redis_client.set(collection_exist_cache_key, 1, ex=3600) + + +class HuaweiCloudVectorFactory(AbstractVectorFactory): + def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> HuaweiCloudVector: + if dataset.index_struct_dict: + class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] + collection_name = class_prefix.lower() + else: + dataset_id = dataset.id + collection_name = Dataset.gen_collection_name_by_id(dataset_id).lower() + dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.HUAWEI_CLOUD, collection_name)) + + return HuaweiCloudVector( + index_name=collection_name, + config=HuaweiCloudVectorConfig( + hosts=dify_config.HUAWEI_CLOUD_HOSTS or "http://localhost:9200", + username=dify_config.HUAWEI_CLOUD_USER, + password=dify_config.HUAWEI_CLOUD_PASSWORD, + ), + ) diff --git 
a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 00601c38a1..05158cc7ca 100644 --- a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -156,6 +156,10 @@ class Vector: from core.rag.datasource.vdb.tablestore.tablestore_vector import TableStoreVectorFactory return TableStoreVectorFactory + case VectorType.HUAWEI_CLOUD: + from core.rag.datasource.vdb.huawei.huawei_cloud_vector import HuaweiCloudVectorFactory + + return HuaweiCloudVectorFactory case _: raise ValueError(f"Vector store {vector_type} is not supported.") diff --git a/api/core/rag/datasource/vdb/vector_type.py b/api/core/rag/datasource/vdb/vector_type.py index 940f12caef..0421be3458 100644 --- a/api/core/rag/datasource/vdb/vector_type.py +++ b/api/core/rag/datasource/vdb/vector_type.py @@ -26,3 +26,4 @@ class VectorType(StrEnum): OCEANBASE = "oceanbase" OPENGAUSS = "opengauss" TABLESTORE = "tablestore" + HUAWEI_CLOUD = "huawei_cloud" diff --git a/api/tests/integration_tests/vdb/__mock/huaweicloudvectordb.py b/api/tests/integration_tests/vdb/__mock/huaweicloudvectordb.py new file mode 100644 index 0000000000..e1aba4e2c1 --- /dev/null +++ b/api/tests/integration_tests/vdb/__mock/huaweicloudvectordb.py @@ -0,0 +1,88 @@ +import os + +import pytest +from _pytest.monkeypatch import MonkeyPatch +from api.core.rag.datasource.vdb.field import Field +from elasticsearch import Elasticsearch + + +class MockIndicesClient: + def __init__(self): + pass + + def create(self, index, mappings, settings): + return {"acknowledge": True} + + def refresh(self, index): + return {"acknowledge": True} + + def delete(self, index): + return {"acknowledge": True} + + def exists(self, index): + return True + + +class MockClient: + def __init__(self, **kwargs): + self.indices = MockIndicesClient() + + def index(self, **kwargs): + return {"acknowledge": True} + + def exists(self, **kwargs): + return True + + def delete(self, **kwargs): + return {"acknowledge": True} + + def search(self, **kwargs): + return { + "took": 1, + "hits": { + "hits": [ + { + "_source": { + Field.CONTENT_KEY.value: "abcdef", + Field.VECTOR.value: [1, 2], + Field.METADATA_KEY.value: {}, + }, + "_score": 1.0, + }, + { + "_source": { + Field.CONTENT_KEY.value: "123456", + Field.VECTOR.value: [2, 2], + Field.METADATA_KEY.value: {}, + }, + "_score": 0.9, + }, + { + "_source": { + Field.CONTENT_KEY.value: "a1b2c3", + Field.VECTOR.value: [3, 2], + Field.METADATA_KEY.value: {}, + }, + "_score": 0.8, + }, + ] + }, + } + + +MOCK = os.getenv("MOCK_SWITCH", "false").lower() == "true" + + +@pytest.fixture +def setup_client_mock(request, monkeypatch: MonkeyPatch): + if MOCK: + monkeypatch.setattr(Elasticsearch, "__init__", MockClient.__init__) + monkeypatch.setattr(Elasticsearch, "index", MockClient.index) + monkeypatch.setattr(Elasticsearch, "exists", MockClient.exists) + monkeypatch.setattr(Elasticsearch, "delete", MockClient.delete) + monkeypatch.setattr(Elasticsearch, "search", MockClient.search) + + yield + + if MOCK: + monkeypatch.undo() diff --git a/api/tests/integration_tests/vdb/huawei/__init__.py b/api/tests/integration_tests/vdb/huawei/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/api/tests/integration_tests/vdb/huawei/test_huawei_cloud.py b/api/tests/integration_tests/vdb/huawei/test_huawei_cloud.py new file mode 100644 index 0000000000..943b2bc877 --- /dev/null +++ b/api/tests/integration_tests/vdb/huawei/test_huawei_cloud.py @@ -0,0 +1,28 @@ +from 
core.rag.datasource.vdb.huawei.huawei_cloud_vector import HuaweiCloudVector, HuaweiCloudVectorConfig +from tests.integration_tests.vdb.__mock.huaweicloudvectordb import setup_client_mock +from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis + + +class HuaweiCloudVectorTest(AbstractVectorTest): + def __init__(self): + super().__init__() + self.vector = HuaweiCloudVector( + "dify", + HuaweiCloudVectorConfig( + hosts="https://127.0.0.1:9200", + username="dify", + password="dify", + ), + ) + + def search_by_vector(self): + hits_by_vector = self.vector.search_by_vector(query_vector=self.example_embedding) + assert len(hits_by_vector) == 3 + + def search_by_full_text(self): + hits_by_full_text = self.vector.search_by_full_text(query=get_example_text()) + assert len(hits_by_full_text) == 3 + + +def test_huawei_cloud_vector(setup_mock_redis, setup_client_mock): + HuaweiCloudVectorTest().run_all_tests() diff --git a/dev/pytest/pytest_vdb.sh b/dev/pytest/pytest_vdb.sh index c68a94c79b..dd03ca3514 100755 --- a/dev/pytest/pytest_vdb.sh +++ b/dev/pytest/pytest_vdb.sh @@ -15,3 +15,4 @@ pytest api/tests/integration_tests/vdb/chroma \ api/tests/integration_tests/vdb/couchbase \ api/tests/integration_tests/vdb/oceanbase \ api/tests/integration_tests/vdb/tidb_vector \ + api/tests/integration_tests/vdb/huawei \ diff --git a/docker/.env.example b/docker/.env.example index 82ef4174c2..f8310a10f1 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -574,6 +574,11 @@ OPENGAUSS_MIN_CONNECTION=1 OPENGAUSS_MAX_CONNECTION=5 OPENGAUSS_ENABLE_PQ=false +# huawei cloud search service vector configurations, only available when VECTOR_STORE is `huawei_cloud` +HUAWEI_CLOUD_HOSTS=https://127.0.0.1:9200 +HUAWEI_CLOUD_USER=admin +HUAWEI_CLOUD_PASSWORD=admin + # Upstash Vector configuration, only available when VECTOR_STORE is `upstash` UPSTASH_VECTOR_URL=https://xxx-vector.upstash.io UPSTASH_VECTOR_TOKEN=dify diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index def4b77c65..d8ff7d841a 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -266,6 +266,9 @@ x-shared-env: &shared-api-worker-env OPENGAUSS_MIN_CONNECTION: ${OPENGAUSS_MIN_CONNECTION:-1} OPENGAUSS_MAX_CONNECTION: ${OPENGAUSS_MAX_CONNECTION:-5} OPENGAUSS_ENABLE_PQ: ${OPENGAUSS_ENABLE_PQ:-false} + HUAWEI_CLOUD_HOSTS: ${HUAWEI_CLOUD_HOSTS:-https://127.0.0.1:9200} + HUAWEI_CLOUD_USER: ${HUAWEI_CLOUD_USER:-admin} + HUAWEI_CLOUD_PASSWORD: ${HUAWEI_CLOUD_PASSWORD:-admin} UPSTASH_VECTOR_URL: ${UPSTASH_VECTOR_URL:-https://xxx-vector.upstash.io} UPSTASH_VECTOR_TOKEN: ${UPSTASH_VECTOR_TOKEN:-dify} TABLESTORE_ENDPOINT: ${TABLESTORE_ENDPOINT:-https://instance-name.cn-hangzhou.ots.aliyuncs.com}
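
Usage sketch: a minimal, illustrative example of how the new HuaweiCloudVector class added by this patch could be exercised directly from within the api/ project, assuming VECTOR_STORE is set to huawei_cloud, a reachable Cloud Search Service (Elasticsearch-compatible) endpoint, and an initialized Dify application context (Redis is used for the index-creation lock). The endpoint URL, credentials, example documents, and 4-dimensional embeddings below are assumptions made for illustration only; in Dify the embeddings come from the dataset's configured embedding model.

    from core.rag.datasource.vdb.huawei.huawei_cloud_vector import (
        HuaweiCloudVector,
        HuaweiCloudVectorConfig,
    )
    from core.rag.models.document import Document

    # Hypothetical endpoint and credentials; in a deployment these come from
    # HUAWEI_CLOUD_HOSTS / HUAWEI_CLOUD_USER / HUAWEI_CLOUD_PASSWORD.
    config = HuaweiCloudVectorConfig(
        hosts="https://127.0.0.1:9200",
        username="admin",
        password="admin",
    )

    vector = HuaweiCloudVector(index_name="example_collection", config=config)

    # Toy documents with illustrative 4-dimensional embeddings.
    docs = [
        Document(page_content="Huawei Cloud CSS supports vector search.", metadata={"doc_id": "doc-1"}),
        Document(page_content="Dify stores chunk embeddings in the vector index.", metadata={"doc_id": "doc-2"}),
    ]
    embeddings = [[0.1, 0.2, 0.3, 0.4], [0.2, 0.1, 0.4, 0.3]]

    # create() builds the index (GRAPH algorithm, cosine metric, as defined in
    # create_collection above) and then indexes the documents.
    vector.create(texts=docs, embeddings=embeddings)

    # k-NN retrieval and full-text retrieval against the same index.
    hits = vector.search_by_vector([0.1, 0.2, 0.3, 0.4], top_k=2, score_threshold=0.0)
    keyword_hits = vector.search_by_full_text("vector search", top_k=2)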