mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 04:58:59 +08:00
Add search by full text when using Oracle23ai as vector DB (#6559)
This commit is contained in:
parent
093b8ca475
commit
06fc1bce9e
@ -543,13 +543,13 @@ class DatasetRetrievalSettingApi(Resource):
|
|||||||
def get(self):
|
def get(self):
|
||||||
vector_type = dify_config.VECTOR_STORE
|
vector_type = dify_config.VECTOR_STORE
|
||||||
match vector_type:
|
match vector_type:
|
||||||
case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCENT | VectorType.ORACLE:
|
case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCENT:
|
||||||
return {
|
return {
|
||||||
'retrieval_method': [
|
'retrieval_method': [
|
||||||
RetrievalMethod.SEMANTIC_SEARCH.value
|
RetrievalMethod.SEMANTIC_SEARCH.value
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH | VectorType.ANALYTICDB | VectorType.MYSCALE:
|
case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH | VectorType.ANALYTICDB | VectorType.MYSCALE | VectorType.ORACLE:
|
||||||
return {
|
return {
|
||||||
'retrieval_method': [
|
'retrieval_method': [
|
||||||
RetrievalMethod.SEMANTIC_SEARCH.value,
|
RetrievalMethod.SEMANTIC_SEARCH.value,
|
||||||
@ -567,13 +567,13 @@ class DatasetRetrievalSettingMockApi(Resource):
|
|||||||
@account_initialization_required
|
@account_initialization_required
|
||||||
def get(self, vector_type):
|
def get(self, vector_type):
|
||||||
match vector_type:
|
match vector_type:
|
||||||
case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCENT | VectorType.ORACLE:
|
case VectorType.MILVUS | VectorType.RELYT | VectorType.PGVECTOR | VectorType.TIDB_VECTOR | VectorType.CHROMA | VectorType.TENCENT:
|
||||||
return {
|
return {
|
||||||
'retrieval_method': [
|
'retrieval_method': [
|
||||||
RetrievalMethod.SEMANTIC_SEARCH.value
|
RetrievalMethod.SEMANTIC_SEARCH.value
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH| VectorType.ANALYTICDB | VectorType.MYSCALE:
|
case VectorType.QDRANT | VectorType.WEAVIATE | VectorType.OPENSEARCH| VectorType.ANALYTICDB | VectorType.MYSCALE | VectorType.ORACLE:
|
||||||
return {
|
return {
|
||||||
'retrieval_method': [
|
'retrieval_method': [
|
||||||
RetrievalMethod.SEMANTIC_SEARCH.value,
|
RetrievalMethod.SEMANTIC_SEARCH.value,
|
||||||
|
@ -1,11 +1,15 @@
|
|||||||
import array
|
import array
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
import uuid
|
import uuid
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
import jieba.posseg as pseg
|
||||||
|
import nltk
|
||||||
import numpy
|
import numpy
|
||||||
import oracledb
|
import oracledb
|
||||||
|
from nltk.corpus import stopwords
|
||||||
from pydantic import BaseModel, model_validator
|
from pydantic import BaseModel, model_validator
|
||||||
|
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
@ -50,6 +54,11 @@ CREATE TABLE IF NOT EXISTS {table_name} (
|
|||||||
,embedding vector NOT NULL
|
,embedding vector NOT NULL
|
||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
|
# Oracle Text (CONTEXT) full-text index over the `text` column, queried via
# CONTAINS(...) in search_by_full_text().  NULL_FILTER skips datastore format
# filtering, HTML_SECTION_GROUP handles HTML markup sections, and the lexer is
# the custom `my_chinese_vgram_lexer` preference (a CHINESE_VGRAM_LEXER created
# by the database init SQL) so Chinese text is tokenized correctly.
SQL_CREATE_INDEX = """
CREATE INDEX idx_docs_{table_name} ON {table_name}(text)
INDEXTYPE IS CTXSYS.CONTEXT PARAMETERS
('FILTER CTXSYS.NULL_FILTER SECTION GROUP CTXSYS.HTML_SECTION_GROUP LEXER sys.my_chinese_vgram_lexer')
"""
||||||
|
|
||||||
|
|
||||||
class OracleVector(BaseVector):
|
class OracleVector(BaseVector):
|
||||||
@ -188,7 +197,53 @@ class OracleVector(BaseVector):
|
|||||||
return docs
|
return docs
|
||||||
|
|
||||||
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
    """Full-text search backed by the Oracle Text CONTAINS index on `text`.

    The query is first split into search terms: if it contains CJK
    characters it is segmented with jieba's POS tagger, keeping runs of
    noun/verb-like tokens; otherwise it is tokenized with NLTK and filtered
    against the English stop-word list.  Terms are OR-combined with Oracle
    Text's ACCUM operator and rows are ranked by SCORE().

    :param query: free-text query; an empty query matches nothing.
    :param kwargs: ``top_k`` (int, default 5) caps the number of rows
        returned.  ``score_threshold`` is accepted but currently ignored
        (score filtering is not implemented for this backend yet).
    :return: matching documents, best score first; ``[]`` when the query is
        empty or yields no usable search terms.
    """
    top_k = kwargs.get("top_k", 5)
    if not query:
        return []

    # Pick the tokenizer by language: any CJK character in the query means
    # it is treated as Chinese and segmented with jieba.
    entities: list[str] = []
    if re.search(r"[\u4e00-\u9fa5]", query):
        # Keep consecutive words whose POS tags look content-bearing:
        # person names (nr), organisations (ORG), generic/special nouns
        # (n, nz, Ng), embedded English (eng) and verbs (v).
        content_pos = {"nr", "Ng", "eng", "nz", "n", "ORG", "v"}
        current_entity = ""
        for word, pos in pseg.cut(query):
            if pos in content_pos:
                current_entity += word
            else:
                if current_entity:
                    entities.append(current_entity)
                current_entity = ""
        if current_entity:
            entities.append(current_entity)
    else:
        # Fetch the NLTK corpora lazily, only the first time they are missing.
        try:
            nltk.data.find("tokenizers/punkt")
            nltk.data.find("corpora/stopwords")
        except LookupError:
            nltk.download("punkt")
            nltk.download("stopwords")
        stop_words = set(stopwords.words("english"))
        cleaned = re.sub(r"[^\w ]", "", query)
        entities = [token for token in nltk.word_tokenize(cleaned) if token not in stop_words]

    if not entities:
        # CONTAINS() raises on an empty search expression — nothing to match.
        return []

    with self._get_cursor() as cur:
        # The search expression is bound (:1); the table name is an internal
        # identifier, not user input, so the f-string is safe here.
        cur.execute(
            f"select meta, text FROM {self.table_name}"
            f" WHERE CONTAINS(text, :1, 1) > 0"
            f" order by score(1) desc fetch first {top_k} rows only",
            [" ACCUM ".join(entities)],
        )
        docs = [Document(page_content=text, metadata=metadata) for metadata, text in cur]
    return docs
|
||||||
|
|
||||||
def delete(self) -> None:
|
def delete(self) -> None:
|
||||||
@ -206,6 +261,8 @@ class OracleVector(BaseVector):
|
|||||||
with self._get_cursor() as cur:
|
with self._get_cursor() as cur:
|
||||||
cur.execute(SQL_CREATE_TABLE.format(table_name=self.table_name))
|
cur.execute(SQL_CREATE_TABLE.format(table_name=self.table_name))
|
||||||
redis_client.set(collection_exist_cache_key, 1, ex=3600)
|
redis_client.set(collection_exist_cache_key, 1, ex=3600)
|
||||||
|
with self._get_cursor() as cur:
|
||||||
|
cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name))
|
||||||
|
|
||||||
|
|
||||||
class OracleVectorFactory(AbstractVectorFactory):
|
class OracleVectorFactory(AbstractVectorFactory):
|
||||||
|
13
docker/startupscripts/init.sh
Executable file
13
docker/startupscripts/init.sh
Executable file
@ -0,0 +1,13 @@
|
|||||||
|
#!/usr/bin/env bash

# One-shot initialisation for the bundled Oracle container.
# A marker file records that the init SQL has already been run, so the
# user-creation script executes only on the first container start.

DB_INITIALISED="/opt/oracle/oradata/dbinit"

if [ -f "${DB_INITIALISED}" ]; then
    echo 'Database already initialised; skipping first-time setup.'
    exit
else
    echo 'First start-up detected; initialising database.'
    "$ORACLE_HOME"/bin/sqlplus -s "/ as sysdba" @"/opt/oracle/scripts/startup/init_user.script"
    touch "${DB_INITIALISED}"
fi
|
@ -3,3 +3,8 @@ ALTER SYSTEM SET PROCESSES=500 SCOPE=SPFILE;
|
|||||||
-- Switch into the pluggable database so the objects land in FREEPDB1.
alter session set container= freepdb1;
-- Application account used by Dify.
-- NOTE(review): hard-coded dev credentials — fine for the local container,
-- must not be reused in any shared environment.
create user dify identified by dify DEFAULT TABLESPACE users quota unlimited on users;
grant DB_DEVELOPER_ROLE to dify;

-- Oracle Text lexer preference referenced by the full-text index DDL
-- (LEXER sys.my_chinese_vgram_lexer): a VGRAM lexer for Chinese text.
BEGIN
CTX_DDL.CREATE_PREFERENCE('my_chinese_vgram_lexer','CHINESE_VGRAM_LEXER');
END;
/
|
Loading…
x
Reference in New Issue
Block a user