From d135677c25aa632d13b6a4db8eca68365cd4b660 Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Thu, 20 Mar 2025 01:38:15 +0800 Subject: [PATCH] add vdb document id index (#16244) Co-authored-by: crazywoola <427733928@qq.com> --- .../vdb/elasticsearch/elasticsearch_vector.py | 3 +- api/core/rag/datasource/vdb/field.py | 1 + .../datasource/vdb/qdrant/qdrant_vector.py | 4 +++ .../tidb_on_qdrant/tidb_on_qdrant_vector.py | 36 ++++++++----------- .../datasource/vdb/tidb_vector/tidb_vector.py | 2 ++ 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py b/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py index 093368b0cc..033d05a077 100644 --- a/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py +++ b/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py @@ -196,7 +196,8 @@ class ElasticSearchVector(BaseVector): Field.METADATA_KEY.value: { "type": "object", "properties": { - "doc_id": {"type": "keyword"} # Map doc_id to keyword type + "doc_id": {"type": "keyword"}, # Map doc_id to keyword type + "document_id": {"type": "keyword"}, # Map document_id to keyword type }, }, } diff --git a/api/core/rag/datasource/vdb/field.py b/api/core/rag/datasource/vdb/field.py index a64407bce1..9887e21b7c 100644 --- a/api/core/rag/datasource/vdb/field.py +++ b/api/core/rag/datasource/vdb/field.py @@ -11,3 +11,4 @@ class Field(Enum): TEXT_KEY = "text" PRIMARY_KEY = "id" DOC_ID = "metadata.doc_id" + DOCUMENT_ID = "metadata.document_id" diff --git a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py index 73ce8201fd..4efd90667a 100644 --- a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py +++ b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py @@ -134,6 +134,10 @@ class QdrantVector(BaseVector): self._client.create_payload_index( collection_name, Field.DOC_ID.value, 
field_schema=PayloadSchemaType.KEYWORD ) + # create document_id payload index + self._client.create_payload_index( + collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD + ) # create full text index text_index_params = TextIndexParams( type=TextIndexType.TEXT, diff --git a/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py b/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py index ae4baeb17e..6a61fe9496 100644 --- a/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py +++ b/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py @@ -144,6 +144,10 @@ class TidbOnQdrantVector(BaseVector): self._client.create_payload_index( collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD ) + # create document_id payload index + self._client.create_payload_index( + collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD + ) # create full text index text_index_params = TextIndexParams( type=TextIndexType.TEXT, @@ -318,23 +322,17 @@ class TidbOnQdrantVector(BaseVector): def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: from qdrant_client.http import models - filter = models.Filter( - must=[ - models.FieldCondition( - key="group_id", - match=models.MatchValue(value=self._group_id), - ), - ], - ) + filter = None document_ids_filter = kwargs.get("document_ids_filter") if document_ids_filter: - if filter.must: - filter.must.append( + filter = models.Filter( + must=[ models.FieldCondition( key="metadata.document_id", match=models.MatchAny(any=document_ids_filter), ) - ) + ], + ) results = self._client.search( collection_name=self._collection_name, query_vector=query_vector, @@ -369,23 +367,17 @@ class TidbOnQdrantVector(BaseVector): """ from qdrant_client.http import models - scroll_filter = models.Filter( - must=[ - models.FieldCondition( - key="page_content", - match=models.MatchText(text=query), - ) - ] - ) + 
scroll_filter = None document_ids_filter = kwargs.get("document_ids_filter") if document_ids_filter: - if scroll_filter.must: - scroll_filter.must.append( + scroll_filter = models.Filter( + must=[ models.FieldCondition( key="metadata.document_id", match=models.MatchAny(any=document_ids_filter), ) - ) + ] + ) response = self._client.scroll( collection_name=self._collection_name, scroll_filter=scroll_filter, diff --git a/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py b/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py index 77c5786042..efa68059e5 100644 --- a/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py +++ b/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py @@ -105,10 +105,12 @@ class TiDBVector(BaseVector): text TEXT NOT NULL, meta JSON NOT NULL, doc_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.doc_id'))) STORED, + document_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.document_id'))) STORED, vector VECTOR({dimension}) NOT NULL, create_time DATETIME DEFAULT CURRENT_TIMESTAMP, update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, KEY (doc_id), + KEY (document_id), VECTOR INDEX idx_vector (({tidb_dist_func}(vector))) USING HNSW ); """)