mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-07-07 13:31:51 +08:00
Feat elasticsearch japanese (#12194)
This commit is contained in:
parent
6635c393e9
commit
d2586278d6
@ -640,6 +640,7 @@ class DatasetRetrievalSettingApi(Resource):
|
|||||||
| VectorType.MYSCALE
|
| VectorType.MYSCALE
|
||||||
| VectorType.ORACLE
|
| VectorType.ORACLE
|
||||||
| VectorType.ELASTICSEARCH
|
| VectorType.ELASTICSEARCH
|
||||||
|
| VectorType.ELASTICSEARCH_JA
|
||||||
| VectorType.PGVECTOR
|
| VectorType.PGVECTOR
|
||||||
| VectorType.TIDB_ON_QDRANT
|
| VectorType.TIDB_ON_QDRANT
|
||||||
| VectorType.LINDORM
|
| VectorType.LINDORM
|
||||||
@ -683,6 +684,7 @@ class DatasetRetrievalSettingMockApi(Resource):
|
|||||||
| VectorType.MYSCALE
|
| VectorType.MYSCALE
|
||||||
| VectorType.ORACLE
|
| VectorType.ORACLE
|
||||||
| VectorType.ELASTICSEARCH
|
| VectorType.ELASTICSEARCH
|
||||||
|
| VectorType.ELASTICSEARCH_JA
|
||||||
| VectorType.COUCHBASE
|
| VectorType.COUCHBASE
|
||||||
| VectorType.PGVECTOR
|
| VectorType.PGVECTOR
|
||||||
| VectorType.LINDORM
|
| VectorType.LINDORM
|
||||||
|
@ -0,0 +1,104 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from flask import current_app
|
||||||
|
|
||||||
|
from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import (
|
||||||
|
ElasticSearchConfig,
|
||||||
|
ElasticSearchVector,
|
||||||
|
ElasticSearchVectorFactory,
|
||||||
|
)
|
||||||
|
from core.rag.datasource.vdb.field import Field
|
||||||
|
from core.rag.datasource.vdb.vector_type import VectorType
|
||||||
|
from core.rag.embedding.embedding_base import Embeddings
|
||||||
|
from extensions.ext_redis import redis_client
|
||||||
|
from models.dataset import Dataset
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ElasticSearchJaVector(ElasticSearchVector):
    """ElasticSearch vector store variant tuned for Japanese text.

    Behaves exactly like ElasticSearchVector except that the backing index
    is created with a custom kuromoji/ICU analyzer on the content field.
    Requires the ``analysis-icu`` and ``analysis-kuromoji`` plugins to be
    installed on the cluster — TODO confirm against the deployment image.
    """

    def create_collection(
        self,
        embeddings: list[list[float]],
        metadatas: Optional[list[dict[Any, Any]]] = None,
        index_params: Optional[dict] = None,
    ):
        """Create the backing index with a Japanese analyzer, if absent.

        A Redis lock serializes concurrent creation attempts, and a Redis
        cache key short-circuits repeated calls for one hour.
        """
        lock_key = f"vector_indexing_lock_{self._collection_name}"
        with redis_client.lock(lock_key, timeout=20):
            created_cache_key = f"vector_indexing_{self._collection_name}"
            if redis_client.get(created_cache_key):
                logger.info(f"Collection {self._collection_name} already exists.")
                return

            if not self._client.indices.exists(index=self._collection_name):
                # Custom analyzer chain: ICU/kuromoji char filters, kuromoji
                # tokenizer, then the usual Japanese token filters.
                ja_analyzer = {
                    "type": "custom",
                    "char_filter": [
                        "icu_normalizer",
                        "kuromoji_iteration_mark",
                    ],
                    "tokenizer": "kuromoji_tokenizer",
                    "filter": [
                        "kuromoji_baseform",
                        "kuromoji_part_of_speech",
                        "ja_stop",
                        "kuromoji_number",
                        "kuromoji_stemmer",
                    ],
                }
                index_settings = {
                    "analysis": {
                        "analyzer": {
                            "ja_analyzer": ja_analyzer,
                        }
                    }
                }

                # Vector dimension is taken from the first embedding; it must
                # match the embedding model used for this dataset.
                vector_dims = len(embeddings[0])
                index_mappings = {
                    "properties": {
                        Field.CONTENT_KEY.value: {
                            "type": "text",
                            "analyzer": "ja_analyzer",
                            "search_analyzer": "ja_analyzer",
                        },
                        Field.VECTOR.value: {
                            "type": "dense_vector",
                            "dims": vector_dims,
                            "index": True,
                            "similarity": "cosine",
                        },
                        Field.METADATA_KEY.value: {
                            "type": "object",
                            "properties": {
                                # keyword type so doc_id supports exact-match filtering
                                "doc_id": {"type": "keyword"}
                            },
                        },
                    }
                }
                self._client.indices.create(
                    index=self._collection_name,
                    settings=index_settings,
                    mappings=index_mappings,
                )

            # Remember for an hour that the index exists.
            redis_client.set(created_cache_key, 1, ex=3600)
|
class ElasticSearchJaVectorFactory(ElasticSearchVectorFactory):
    """Factory producing Japanese-analyzer Elasticsearch vector stores."""

    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> ElasticSearchJaVector:
        """Build an ElasticSearchJaVector for *dataset*.

        Reuses the collection name recorded in an existing index struct;
        otherwise derives one from the dataset id and persists the index
        struct on the dataset. Connection settings come from the Flask app
        config (ELASTICSEARCH_HOST/PORT/USERNAME/PASSWORD).
        """
        if dataset.index_struct_dict:
            # Dataset was already indexed: keep its recorded collection name.
            class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
            collection_name = class_prefix
        else:
            dataset_id = dataset.id
            collection_name = Dataset.gen_collection_name_by_id(dataset_id)
            # Record the JA-specific vector type (not the generic
            # VectorType.ELASTICSEARCH) so the stored index struct matches
            # the factory that created it, consistent with other factories.
            dataset.index_struct = json.dumps(
                self.gen_index_struct_dict(VectorType.ELASTICSEARCH_JA, collection_name)
            )

        config = current_app.config
        return ElasticSearchJaVector(
            index_name=collection_name,
            config=ElasticSearchConfig(
                host=config.get("ELASTICSEARCH_HOST", "localhost"),
                port=config.get("ELASTICSEARCH_PORT", 9200),
                username=config.get("ELASTICSEARCH_USERNAME", ""),
                password=config.get("ELASTICSEARCH_PASSWORD", ""),
            ),
            attributes=[],
        )
|
@ -90,6 +90,12 @@ class Vector:
|
|||||||
from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory
|
from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory
|
||||||
|
|
||||||
return ElasticSearchVectorFactory
|
return ElasticSearchVectorFactory
|
||||||
|
case VectorType.ELASTICSEARCH_JA:
|
||||||
|
from core.rag.datasource.vdb.elasticsearch.elasticsearch_ja_vector import (
|
||||||
|
ElasticSearchJaVectorFactory,
|
||||||
|
)
|
||||||
|
|
||||||
|
return ElasticSearchJaVectorFactory
|
||||||
case VectorType.TIDB_VECTOR:
|
case VectorType.TIDB_VECTOR:
|
||||||
from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory
|
from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory
|
||||||
|
|
||||||
|
@ -16,6 +16,7 @@ class VectorType(StrEnum):
|
|||||||
TENCENT = "tencent"
|
TENCENT = "tencent"
|
||||||
ORACLE = "oracle"
|
ORACLE = "oracle"
|
||||||
ELASTICSEARCH = "elasticsearch"
|
ELASTICSEARCH = "elasticsearch"
|
||||||
|
ELASTICSEARCH_JA = "elasticsearch-ja"
|
||||||
LINDORM = "lindorm"
|
LINDORM = "lindorm"
|
||||||
COUCHBASE = "couchbase"
|
COUCHBASE = "couchbase"
|
||||||
BAIDU = "baidu"
|
BAIDU = "baidu"
|
||||||
|
@ -383,7 +383,7 @@ SUPABASE_URL=your-server-url
|
|||||||
# ------------------------------
|
# ------------------------------
|
||||||
|
|
||||||
# The type of vector store to use.
|
# The type of vector store to use.
|
||||||
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
|
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
|
||||||
VECTOR_STORE=weaviate
|
VECTOR_STORE=weaviate
|
||||||
|
|
||||||
# The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`.
|
# The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`.
|
||||||
@ -512,7 +512,7 @@ TENCENT_VECTOR_DB_SHARD=1
|
|||||||
TENCENT_VECTOR_DB_REPLICAS=2
|
TENCENT_VECTOR_DB_REPLICAS=2
|
||||||
|
|
||||||
# ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch`
|
# ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch`
|
||||||
ELASTICSEARCH_HOST=0.0.0.0
|
ELASTICSEARCH_HOST=elasticsearch
|
||||||
ELASTICSEARCH_PORT=9200
|
ELASTICSEARCH_PORT=9200
|
||||||
ELASTICSEARCH_USERNAME=elastic
|
ELASTICSEARCH_USERNAME=elastic
|
||||||
ELASTICSEARCH_PASSWORD=elastic
|
ELASTICSEARCH_PASSWORD=elastic
|
||||||
|
@ -883,20 +883,28 @@ services:
|
|||||||
container_name: elasticsearch
|
container_name: elasticsearch
|
||||||
profiles:
|
profiles:
|
||||||
- elasticsearch
|
- elasticsearch
|
||||||
|
- elasticsearch-ja
|
||||||
restart: always
|
restart: always
|
||||||
volumes:
|
volumes:
|
||||||
|
- ./elasticsearch/docker-entrypoint.sh:/docker-entrypoint-mount.sh
|
||||||
- dify_es01_data:/usr/share/elasticsearch/data
|
- dify_es01_data:/usr/share/elasticsearch/data
|
||||||
environment:
|
environment:
|
||||||
ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic}
|
ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic}
|
||||||
|
VECTOR_STORE: ${VECTOR_STORE:-}
|
||||||
cluster.name: dify-es-cluster
|
cluster.name: dify-es-cluster
|
||||||
node.name: dify-es0
|
node.name: dify-es0
|
||||||
discovery.type: single-node
|
discovery.type: single-node
|
||||||
xpack.license.self_generated.type: trial
|
xpack.license.self_generated.type: basic
|
||||||
xpack.security.enabled: 'true'
|
xpack.security.enabled: 'true'
|
||||||
xpack.security.enrollment.enabled: 'false'
|
xpack.security.enrollment.enabled: 'false'
|
||||||
xpack.security.http.ssl.enabled: 'false'
|
xpack.security.http.ssl.enabled: 'false'
|
||||||
ports:
|
ports:
|
||||||
- ${ELASTICSEARCH_PORT:-9200}:9200
|
- ${ELASTICSEARCH_PORT:-9200}:9200
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
memory: 2g
|
||||||
|
entrypoint: [ 'sh', '-c', "sh /docker-entrypoint-mount.sh" ]
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ]
|
test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
|
25
docker/elasticsearch/docker-entrypoint.sh
Executable file
25
docker/elasticsearch/docker-entrypoint.sh
Executable file
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
# Entrypoint wrapper for the Elasticsearch container: when the Japanese
# vector store is selected, ensure the required analysis plugins are
# installed before handing control to the stock entrypoint.
set -e

PLUGIN_BIN=/usr/share/elasticsearch/bin/elasticsearch-plugin

# ensure_plugin <plugin-name> <human-readable-label>
# Installs the plugin if `elasticsearch-plugin list` does not already show
# it; aborts the container start-up if installation fails.
ensure_plugin() {
    local plugin="$1"
    local label="$2"
    if ! "${PLUGIN_BIN}" list | grep -q "${plugin}"; then
        printf '%s\n' "Installing the ${label}"
        if ! "${PLUGIN_BIN}" install "${plugin}"; then
            printf '%s\n' "Failed to install the ${label}"
            exit 1
        fi
    fi
}

if [ "${VECTOR_STORE}" = "elasticsearch-ja" ]; then
    ensure_plugin analysis-icu "ICU tokenizer plugin"
    ensure_plugin analysis-kuromoji "Japanese language analyzer plugin"
fi

# Run the original entrypoint script under tini.
exec /bin/tini -- /usr/local/bin/docker-entrypoint.sh
|
Loading…
x
Reference in New Issue
Block a user