Lindorm VDB bugfix (#17357)

Co-authored-by: jiangzhijie <jiangzhijie.jzj@alibaba-inc.com>
This commit is contained in:
Jiang 2025-04-02 21:31:59 +08:00 committed by GitHub
parent e2b8f40275
commit fd1e40d22e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -4,7 +4,8 @@ import logging
import time
from typing import Any, Optional
from opensearchpy import OpenSearch
from opensearchpy import OpenSearch, helpers
from opensearchpy.helpers import BulkIndexError
from pydantic import BaseModel, model_validator
from tenacity import retry, stop_after_attempt, wait_exponential
@ -135,14 +136,14 @@ class LindormVectorStore(BaseVector):
actions.append(action_header)
actions.append(action_values)
logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")
# logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")
try:
_bulk_with_retry(actions)
logger.info(f"Successfully processed batch {batch_num + 1}")
# logger.info(f"Successfully processed batch {batch_num + 1}")
# simple latency to avoid too many requests in a short time
if batch_num < num_batches - 1:
time.sleep(1)
time.sleep(0.5)
except Exception:
logger.exception(f"Failed to process batch {batch_num + 1}")
@ -166,18 +167,51 @@ class LindormVectorStore(BaseVector):
self.delete_by_ids(ids)
def delete_by_ids(self, ids: list[str]) -> None:
params = {}
if self._using_ugc:
params["routing"] = self._routing
"""Delete documents by their IDs in batch.
Args:
ids: List of document IDs to delete
"""
if not ids:
return
params = {"routing": self._routing} if self._using_ugc else {}
# 1. First check if collection exists
if not self._client.indices.exists(index=self._collection_name):
logger.warning(f"Collection {self._collection_name} does not exist")
return
# 2. Batch process deletions
actions = []
for id in ids:
if self._client.exists(index=self._collection_name, id=id, params=params):
params = {}
if self._using_ugc:
params["routing"] = self._routing
self._client.delete(index=self._collection_name, id=id, params=params)
actions.append(
{
"_op_type": "delete",
"_index": self._collection_name,
"_id": id,
**params, # Include routing if using UGC
}
)
else:
logger.warning(f"DELETE BY ID: ID {id} does not exist in the index.")
# 3. Perform bulk deletion if there are valid documents to delete
if actions:
try:
helpers.bulk(self._client, actions)
except BulkIndexError as e:
for error in e.errors:
delete_error = error.get("delete", {})
status = delete_error.get("status")
doc_id = delete_error.get("_id")
if status == 404:
logger.warning(f"Document not found for deletion: {doc_id}")
else:
logger.exception(f"Error deleting document: {error}")
def delete(self) -> None:
if self._using_ugc:
routing_filter_query = {
@ -213,7 +247,7 @@ class LindormVectorStore(BaseVector):
document_ids_filter = kwargs.get("document_ids_filter")
filters = []
if document_ids_filter:
filters.append({"terms": {"metadata.document_id": document_ids_filter}})
filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
query = default_vector_search_query(query_vector=query_vector, k=top_k, filters=filters, **kwargs)
try:
@ -256,7 +290,7 @@ class LindormVectorStore(BaseVector):
filters = kwargs.get("filter", [])
document_ids_filter = kwargs.get("document_ids_filter")
if document_ids_filter:
filters.append({"terms": {"metadata.document_id": document_ids_filter}})
filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
routing = self._routing
full_text_query = default_text_search_query(
query_text=query,
@ -270,6 +304,7 @@ class LindormVectorStore(BaseVector):
routing=routing,
routing_field=self._routing_field,
)
response = self._client.search(index=self._collection_name, body=full_text_query)
docs = []
for hit in response["hits"]["hits"]:
@ -479,7 +514,7 @@ def default_vector_search_query(
**kwargs,
) -> dict:
if filters is not None:
filter_type = "post_filter" if filter_type is None else filter_type
filter_type = "pre_filter" if filter_type is None else filter_type
if not isinstance(filters, list):
raise RuntimeError(f"unexpected filter with {type(filters)}")
final_ext: dict[str, Any] = {"lvector": {}}