Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git (synced 2025-07-31 01:42:04 +08:00)

add vdb document id index (#16244)

Co-authored-by: crazywoola <427733928@qq.com>

Parent: cade0f65e2
Commit: d135677c25
@@ -196,7 +196,8 @@ class ElasticSearchVector(BaseVector):
                     Field.METADATA_KEY.value: {
                         "type": "object",
                         "properties": {
-                            "doc_id": {"type": "keyword"}  # Map doc_id to keyword type
+                            "doc_id": {"type": "keyword"},  # Map doc_id to keyword type
+                            "document_id": {"type": "keyword"},  # Map document_id to keyword type
                         },
                     },
                 }
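A minimal usage sketch for the new mapping, assuming an elasticsearch-py 8.x client and a hypothetical endpoint, index name, and text field; the metadata.document_id path follows the mapping above and the DOCUMENT_ID enum value added below. With document_id mapped as a keyword, a search can be restricted to whole documents with an exact-match terms filter:

from elasticsearch import Elasticsearch

client = Elasticsearch("http://localhost:9200")  # hypothetical endpoint
resp = client.search(
    index="dify_dataset_index",  # hypothetical index name
    query={
        "bool": {
            "must": {"match": {"page_content": "refund policy"}},  # hypothetical text field and query
            "filter": [{"terms": {"metadata.document_id": ["doc-1", "doc-2"]}}],  # hypothetical document ids
        }
    },
)
print(resp["hits"]["total"])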
@@ -11,3 +11,4 @@ class Field(Enum):
     TEXT_KEY = "text"
     PRIMARY_KEY = "id"
     DOC_ID = "metadata.doc_id"
+    DOCUMENT_ID = "metadata.document_id"
@@ -134,6 +134,10 @@ class QdrantVector(BaseVector):
             self._client.create_payload_index(
                 collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD
             )
+            # create document_id payload index
+            self._client.create_payload_index(
+                collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD
+            )
             # create full text index
             text_index_params = TextIndexParams(
                 type=TextIndexType.TEXT,
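A minimal sketch of the kind of query the new KEYWORD payload index serves, assuming qdrant-client is installed and using a hypothetical endpoint, collection name, embedding size, and document ids; the metadata.document_id key matches Field.DOCUMENT_ID above:

from qdrant_client import QdrantClient
from qdrant_client.http import models

client = QdrantClient(url="http://localhost:6333")  # hypothetical endpoint
hits = client.search(
    collection_name="dify_collection",  # hypothetical collection name
    query_vector=[0.1] * 768,  # hypothetical query embedding
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.document_id",
                match=models.MatchAny(any=["doc-1", "doc-2"]),  # hypothetical document ids
            )
        ]
    ),
    limit=4,
)
for hit in hits:
    print(hit.id, hit.score)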
@@ -144,6 +144,10 @@ class TidbOnQdrantVector(BaseVector):
             self._client.create_payload_index(
                 collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD
             )
+            # create document_id payload index
+            self._client.create_payload_index(
+                collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD
+            )
             # create full text index
             text_index_params = TextIndexParams(
                 type=TextIndexType.TEXT,
@@ -318,23 +322,17 @@ class TidbOnQdrantVector(BaseVector):
     def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
         from qdrant_client.http import models

-        filter = models.Filter(
-            must=[
-                models.FieldCondition(
-                    key="group_id",
-                    match=models.MatchValue(value=self._group_id),
-                ),
-            ],
-        )
+        filter = None
         document_ids_filter = kwargs.get("document_ids_filter")
         if document_ids_filter:
-            if filter.must:
-                filter.must.append(
-                    models.FieldCondition(
-                        key="metadata.document_id",
-                        match=models.MatchAny(any=document_ids_filter),
-                    )
-                )
+            filter = models.Filter(
+                must=[
+                    models.FieldCondition(
+                        key="metadata.document_id",
+                        match=models.MatchAny(any=document_ids_filter),
+                    )
+                ],
+            )
         results = self._client.search(
             collection_name=self._collection_name,
             query_vector=query_vector,
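This refactor, and the matching one in the full-text search hunk below, stops building an unconditional group_id filter and instead constructs a filter only when document_ids_filter is supplied; with no document ids the search runs unrestricted. A minimal sketch of that conditional construction, using a hypothetical helper name and assuming qdrant-client is installed:

from qdrant_client.http import models

def build_document_filter(document_ids_filter: list[str] | None) -> models.Filter | None:
    # Hypothetical helper mirroring the new logic: no ids -> no payload filter at all.
    if not document_ids_filter:
        return None
    return models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.document_id",
                match=models.MatchAny(any=document_ids_filter),
            )
        ]
    )

print(build_document_filter(None))       # None -> unrestricted search
print(build_document_filter(["doc-1"]))  # filter restricted to one document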
@@ -369,23 +367,17 @@ class TidbOnQdrantVector(BaseVector):
         """
         from qdrant_client.http import models

-        scroll_filter = models.Filter(
-            must=[
-                models.FieldCondition(
-                    key="page_content",
-                    match=models.MatchText(text=query),
-                )
-            ]
-        )
+        scroll_filter = None
         document_ids_filter = kwargs.get("document_ids_filter")
         if document_ids_filter:
-            if scroll_filter.must:
-                scroll_filter.must.append(
-                    models.FieldCondition(
-                        key="metadata.document_id",
-                        match=models.MatchAny(any=document_ids_filter),
-                    )
-                )
+            scroll_filter = models.Filter(
+                must=[
+                    models.FieldCondition(
+                        key="metadata.document_id",
+                        match=models.MatchAny(any=document_ids_filter),
+                    )
+                ]
+            )
         response = self._client.scroll(
             collection_name=self._collection_name,
             scroll_filter=scroll_filter,
@@ -105,10 +105,12 @@ class TiDBVector(BaseVector):
                     text TEXT NOT NULL,
                     meta JSON NOT NULL,
                     doc_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.doc_id'))) STORED,
+                    document_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.document_id'))) STORED,
                     vector VECTOR<FLOAT>({dimension}) NOT NULL,
                     create_time DATETIME DEFAULT CURRENT_TIMESTAMP,
                     update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
                     KEY (doc_id),
+                    KEY (document_id),
                     VECTOR INDEX idx_vector (({tidb_dist_func}(vector))) USING HNSW
                 );
             """)
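A minimal sketch of a lookup that benefits from the new generated column and key, assuming SQLAlchemy, a reachable TiDB instance, and a hypothetical DSN, table name, and document id; document_id is extracted from the meta JSON exactly as doc_id is above, so the exact-match predicate can use the index instead of scanning the JSON column:

from sqlalchemy import create_engine, text

engine = create_engine("mysql+pymysql://user:pass@tidb-host:4000/dify")  # hypothetical DSN
with engine.connect() as conn:
    rows = conn.execute(
        text("SELECT id, text FROM embeddings WHERE document_id = :doc_id LIMIT 10"),  # hypothetical table name
        {"doc_id": "doc-1"},  # hypothetical document id
    )
    for row in rows:
        print(row.id)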