mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-07-07 13:31:51 +08:00
Feat elasticsearch japanese (#12194)
This commit is contained in:
parent
6635c393e9
commit
d2586278d6
@ -640,6 +640,7 @@ class DatasetRetrievalSettingApi(Resource):
|
|||||||
| VectorType.MYSCALE
|
| VectorType.MYSCALE
|
||||||
| VectorType.ORACLE
|
| VectorType.ORACLE
|
||||||
| VectorType.ELASTICSEARCH
|
| VectorType.ELASTICSEARCH
|
||||||
|
| VectorType.ELASTICSEARCH_JA
|
||||||
| VectorType.PGVECTOR
|
| VectorType.PGVECTOR
|
||||||
| VectorType.TIDB_ON_QDRANT
|
| VectorType.TIDB_ON_QDRANT
|
||||||
| VectorType.LINDORM
|
| VectorType.LINDORM
|
||||||
@ -683,6 +684,7 @@ class DatasetRetrievalSettingMockApi(Resource):
|
|||||||
| VectorType.MYSCALE
|
| VectorType.MYSCALE
|
||||||
| VectorType.ORACLE
|
| VectorType.ORACLE
|
||||||
| VectorType.ELASTICSEARCH
|
| VectorType.ELASTICSEARCH
|
||||||
|
| VectorType.ELASTICSEARCH_JA
|
||||||
| VectorType.COUCHBASE
|
| VectorType.COUCHBASE
|
||||||
| VectorType.PGVECTOR
|
| VectorType.PGVECTOR
|
||||||
| VectorType.LINDORM
|
| VectorType.LINDORM
|
||||||
|
@ -0,0 +1,104 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from flask import current_app
|
||||||
|
|
||||||
|
from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import (
|
||||||
|
ElasticSearchConfig,
|
||||||
|
ElasticSearchVector,
|
||||||
|
ElasticSearchVectorFactory,
|
||||||
|
)
|
||||||
|
from core.rag.datasource.vdb.field import Field
|
||||||
|
from core.rag.datasource.vdb.vector_type import VectorType
|
||||||
|
from core.rag.embedding.embedding_base import Embeddings
|
||||||
|
from extensions.ext_redis import redis_client
|
||||||
|
from models.dataset import Dataset
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ElasticSearchJaVector(ElasticSearchVector):
    """ElasticSearch vector store variant tuned for Japanese text.

    Behaves exactly like ElasticSearchVector except that the backing index
    is created with a custom kuromoji/ICU analyzer on the content field.
    Requires the ``analysis-icu`` and ``analysis-kuromoji`` plugins to be
    installed on the cluster — TODO confirm against the deployment image.
    """

    def create_collection(
        self,
        embeddings: list[list[float]],
        metadatas: Optional[list[dict[Any, Any]]] = None,
        index_params: Optional[dict] = None,
    ):
        """Create the backing index with a Japanese analyzer, if absent.

        A Redis lock serializes concurrent creation attempts, and a Redis
        cache key short-circuits repeated calls for one hour.
        """
        lock_key = f"vector_indexing_lock_{self._collection_name}"
        with redis_client.lock(lock_key, timeout=20):
            created_cache_key = f"vector_indexing_{self._collection_name}"
            if redis_client.get(created_cache_key):
                logger.info(f"Collection {self._collection_name} already exists.")
                return

            if not self._client.indices.exists(index=self._collection_name):
                # Custom analyzer chain: ICU/kuromoji char filters, kuromoji
                # tokenizer, then the usual Japanese token filters.
                ja_analyzer = {
                    "type": "custom",
                    "char_filter": [
                        "icu_normalizer",
                        "kuromoji_iteration_mark",
                    ],
                    "tokenizer": "kuromoji_tokenizer",
                    "filter": [
                        "kuromoji_baseform",
                        "kuromoji_part_of_speech",
                        "ja_stop",
                        "kuromoji_number",
                        "kuromoji_stemmer",
                    ],
                }
                index_settings = {
                    "analysis": {
                        "analyzer": {
                            "ja_analyzer": ja_analyzer,
                        }
                    }
                }

                # Vector dimension is taken from the first embedding; it must
                # match the embedding model used for this dataset.
                vector_dims = len(embeddings[0])
                index_mappings = {
                    "properties": {
                        Field.CONTENT_KEY.value: {
                            "type": "text",
                            "analyzer": "ja_analyzer",
                            "search_analyzer": "ja_analyzer",
                        },
                        Field.VECTOR.value: {
                            "type": "dense_vector",
                            "dims": vector_dims,
                            "index": True,
                            "similarity": "cosine",
                        },
                        Field.METADATA_KEY.value: {
                            "type": "object",
                            "properties": {
                                # keyword type so doc_id supports exact-match filtering
                                "doc_id": {"type": "keyword"}
                            },
                        },
                    }
                }
                self._client.indices.create(
                    index=self._collection_name,
                    settings=index_settings,
                    mappings=index_mappings,
                )

            # Remember for an hour that the index exists.
            redis_client.set(created_cache_key, 1, ex=3600)
|
class ElasticSearchJaVectorFactory(ElasticSearchVectorFactory):
    """Factory producing Japanese-analyzer Elasticsearch vector stores."""

    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> ElasticSearchJaVector:
        """Build an ElasticSearchJaVector for *dataset*.

        Reuses the collection name recorded in an existing index struct;
        otherwise derives one from the dataset id and persists the index
        struct on the dataset. Connection settings come from the Flask app
        config (ELASTICSEARCH_HOST/PORT/USERNAME/PASSWORD).
        """
        if dataset.index_struct_dict:
            # Dataset was already indexed: keep its recorded collection name.
            class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
            collection_name = class_prefix
        else:
            dataset_id = dataset.id
            collection_name = Dataset.gen_collection_name_by_id(dataset_id)
            # Record the JA-specific vector type (not the generic
            # VectorType.ELASTICSEARCH) so the stored index struct matches
            # the factory that created it, consistent with other factories.
            dataset.index_struct = json.dumps(
                self.gen_index_struct_dict(VectorType.ELASTICSEARCH_JA, collection_name)
            )

        config = current_app.config
        return ElasticSearchJaVector(
            index_name=collection_name,
            config=ElasticSearchConfig(
                host=config.get("ELASTICSEARCH_HOST", "localhost"),
                port=config.get("ELASTICSEARCH_PORT", 9200),
                username=config.get("ELASTICSEARCH_USERNAME", ""),
                password=config.get("ELASTICSEARCH_PASSWORD", ""),
            ),
            attributes=[],
        )
|
@ -90,6 +90,12 @@ class Vector:
|
|||||||
from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory
|
from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory
|
||||||
|
|
||||||
return ElasticSearchVectorFactory
|
return ElasticSearchVectorFactory
|
||||||
|
case VectorType.ELASTICSEARCH_JA:
|
||||||
|
from core.rag.datasource.vdb.elasticsearch.elasticsearch_ja_vector import (
|
||||||
|
ElasticSearchJaVectorFactory,
|
||||||
|
)
|
||||||
|
|
||||||
|
return ElasticSearchJaVectorFactory
|
||||||
case VectorType.TIDB_VECTOR:
|
case VectorType.TIDB_VECTOR:
|
||||||
from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory
|
from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory
|
||||||
|
|
||||||
|
@ -16,6 +16,7 @@ class VectorType(StrEnum):
|
|||||||
TENCENT = "tencent"
|
TENCENT = "tencent"
|
||||||
ORACLE = "oracle"
|
ORACLE = "oracle"
|
||||||
ELASTICSEARCH = "elasticsearch"
|
ELASTICSEARCH = "elasticsearch"
|
||||||
|
ELASTICSEARCH_JA = "elasticsearch-ja"
|
||||||
LINDORM = "lindorm"
|
LINDORM = "lindorm"
|
||||||
COUCHBASE = "couchbase"
|
COUCHBASE = "couchbase"
|
||||||
BAIDU = "baidu"
|
BAIDU = "baidu"
|
||||||
|
@ -383,7 +383,7 @@ SUPABASE_URL=your-server-url
|
|||||||
# ------------------------------
|
# ------------------------------
|
||||||
|
|
||||||
# The type of vector store to use.
|
# The type of vector store to use.
|
||||||
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
|
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
|
||||||
VECTOR_STORE=weaviate
|
VECTOR_STORE=weaviate
|
||||||
|
|
||||||
# The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`.
|
# The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`.
|
||||||
@ -512,7 +512,7 @@ TENCENT_VECTOR_DB_SHARD=1
|
|||||||
TENCENT_VECTOR_DB_REPLICAS=2
|
TENCENT_VECTOR_DB_REPLICAS=2
|
||||||
|
|
||||||
# ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch`
|
# ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch`
|
||||||
ELASTICSEARCH_HOST=0.0.0.0
|
ELASTICSEARCH_HOST=elasticsearch
|
||||||
ELASTICSEARCH_PORT=9200
|
ELASTICSEARCH_PORT=9200
|
||||||
ELASTICSEARCH_USERNAME=elastic
|
ELASTICSEARCH_USERNAME=elastic
|
||||||
ELASTICSEARCH_PASSWORD=elastic
|
ELASTICSEARCH_PASSWORD=elastic
|
||||||
|
@ -883,20 +883,28 @@ services:
|
|||||||
container_name: elasticsearch
|
container_name: elasticsearch
|
||||||
profiles:
|
profiles:
|
||||||
- elasticsearch
|
- elasticsearch
|
||||||
|
- elasticsearch-ja
|
||||||
restart: always
|
restart: always
|
||||||
volumes:
|
volumes:
|
||||||
|
- ./elasticsearch/docker-entrypoint.sh:/docker-entrypoint-mount.sh
|
||||||
- dify_es01_data:/usr/share/elasticsearch/data
|
- dify_es01_data:/usr/share/elasticsearch/data
|
||||||
environment:
|
environment:
|
||||||
ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic}
|
ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic}
|
||||||
|
VECTOR_STORE: ${VECTOR_STORE:-}
|
||||||
cluster.name: dify-es-cluster
|
cluster.name: dify-es-cluster
|
||||||
node.name: dify-es0
|
node.name: dify-es0
|
||||||
discovery.type: single-node
|
discovery.type: single-node
|
||||||
xpack.license.self_generated.type: trial
|
xpack.license.self_generated.type: basic
|
||||||
xpack.security.enabled: 'true'
|
xpack.security.enabled: 'true'
|
||||||
xpack.security.enrollment.enabled: 'false'
|
xpack.security.enrollment.enabled: 'false'
|
||||||
xpack.security.http.ssl.enabled: 'false'
|
xpack.security.http.ssl.enabled: 'false'
|
||||||
ports:
|
ports:
|
||||||
- ${ELASTICSEARCH_PORT:-9200}:9200
|
- ${ELASTICSEARCH_PORT:-9200}:9200
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
memory: 2g
|
||||||
|
entrypoint: [ 'sh', '-c', "sh /docker-entrypoint-mount.sh" ]
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ]
|
test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
|
25
docker/elasticsearch/docker-entrypoint.sh
Executable file
25
docker/elasticsearch/docker-entrypoint.sh
Executable file
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
# Entrypoint wrapper for the Elasticsearch container: when the Japanese
# vector store is selected, ensure the required analysis plugins are
# installed before handing control to the stock entrypoint.
set -e

PLUGIN_BIN=/usr/share/elasticsearch/bin/elasticsearch-plugin

# ensure_plugin <plugin-name> <human-readable-label>
# Installs the plugin if `elasticsearch-plugin list` does not already show
# it; aborts the container start-up if installation fails.
ensure_plugin() {
    local plugin="$1"
    local label="$2"
    if ! "${PLUGIN_BIN}" list | grep -q "${plugin}"; then
        printf '%s\n' "Installing the ${label}"
        if ! "${PLUGIN_BIN}" install "${plugin}"; then
            printf '%s\n' "Failed to install the ${label}"
            exit 1
        fi
    fi
}

if [ "${VECTOR_STORE}" = "elasticsearch-ja" ]; then
    ensure_plugin analysis-icu "ICU tokenizer plugin"
    ensure_plugin analysis-kuromoji "Japanese language analyzer plugin"
fi

# Run the original entrypoint script under tini.
exec /bin/tini -- /usr/local/bin/docker-entrypoint.sh
|
Loading…
x
Reference in New Issue
Block a user