Lindorm VDB bugfix (#17357)

Co-authored-by: jiangzhijie <jiangzhijie.jzj@alibaba-inc.com>
Authored by Jiang on 2025-04-02 21:31:59 +08:00; committed by GitHub
parent e2b8f40275
commit fd1e40d22e


@@ -4,7 +4,8 @@ import logging
 import time
 from typing import Any, Optional
 
-from opensearchpy import OpenSearch
+from opensearchpy import OpenSearch, helpers
+from opensearchpy.helpers import BulkIndexError
 from pydantic import BaseModel, model_validator
 from tenacity import retry, stop_after_attempt, wait_exponential
@@ -135,14 +136,14 @@ class LindormVectorStore(BaseVector):
                 actions.append(action_header)
                 actions.append(action_values)
 
-            logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")
+            # logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")
             try:
                 _bulk_with_retry(actions)
-                logger.info(f"Successfully processed batch {batch_num + 1}")
+                # logger.info(f"Successfully processed batch {batch_num + 1}")
                 # simple latency to avoid too many requests in a short time
                 if batch_num < num_batches - 1:
-                    time.sleep(1)
+                    time.sleep(0.5)
             except Exception:
                 logger.exception(f"Failed to process batch {batch_num + 1}")
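
The batching loop above relies on `_bulk_with_retry`, whose body lies outside this hunk. Below is a minimal sketch of such a helper, assuming the `tenacity` decorators imported at the top of the file and the raw header/values bulk format the loop builds; the endpoint, attempt count, and backoff bounds are illustrative assumptions, not taken from the file:

    # Sketch only: the real _bulk_with_retry is not shown in this diff.
    from opensearchpy import OpenSearch
    from tenacity import retry, stop_after_attempt, wait_exponential

    client = OpenSearch(hosts=["http://localhost:9200"])  # placeholder endpoint

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    def _bulk_with_retry(actions: list) -> None:
        # The loop appends alternating action-header/value dicts, i.e. the raw
        # body format accepted by the low-level bulk API.
        client.bulk(body=actions)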
@@ -166,18 +167,51 @@ class LindormVectorStore(BaseVector):
         self.delete_by_ids(ids)
 
     def delete_by_ids(self, ids: list[str]) -> None:
-        params = {}
-        if self._using_ugc:
-            params["routing"] = self._routing
+        """Delete documents by their IDs in batch.
+
+        Args:
+            ids: List of document IDs to delete
+        """
+        if not ids:
+            return
+
+        params = {"routing": self._routing} if self._using_ugc else {}
+
+        # 1. First check if collection exists
+        if not self._client.indices.exists(index=self._collection_name):
+            logger.warning(f"Collection {self._collection_name} does not exist")
+            return
+
+        # 2. Batch process deletions
+        actions = []
         for id in ids:
             if self._client.exists(index=self._collection_name, id=id, params=params):
-                params = {}
-                if self._using_ugc:
-                    params["routing"] = self._routing
-                self._client.delete(index=self._collection_name, id=id, params=params)
+                actions.append(
+                    {
+                        "_op_type": "delete",
+                        "_index": self._collection_name,
+                        "_id": id,
+                        **params,  # Include routing if using UGC
+                    }
+                )
             else:
                 logger.warning(f"DELETE BY ID: ID {id} does not exist in the index.")
+
+        # 3. Perform bulk deletion if there are valid documents to delete
+        if actions:
+            try:
+                helpers.bulk(self._client, actions)
+            except BulkIndexError as e:
+                for error in e.errors:
+                    delete_error = error.get("delete", {})
+                    status = delete_error.get("status")
+                    doc_id = delete_error.get("_id")
+
+                    if status == 404:
+                        logger.warning(f"Document not found for deletion: {doc_id}")
+                    else:
+                        logger.exception(f"Error deleting document: {error}")
 
     def delete(self) -> None:
         if self._using_ugc:
             routing_filter_query = {
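
For reference, the bulk-delete path introduced above can be exercised standalone roughly as follows; the endpoint, index name, and IDs are placeholders. `helpers.bulk` raises `BulkIndexError` when any action fails and collects the per-action failures on `e.errors`, which is what the except branch above walks:

    # Standalone sketch of the new bulk-delete pattern; names are placeholders.
    from opensearchpy import OpenSearch, helpers
    from opensearchpy.helpers import BulkIndexError

    client = OpenSearch(hosts=["http://localhost:9200"])
    actions = [
        {"_op_type": "delete", "_index": "my_collection", "_id": doc_id}
        for doc_id in ["id-1", "id-2"]
    ]

    try:
        helpers.bulk(client, actions)
    except BulkIndexError as e:
        # Each failed delete is reported as {"delete": {"_id": ..., "status": ...}}
        for error in e.errors:
            detail = error.get("delete", {})
            if detail.get("status") == 404:
                print(f"already gone: {detail.get('_id')}")
            else:
                print(f"delete failed: {error}")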
@@ -213,7 +247,7 @@ class LindormVectorStore(BaseVector):
         document_ids_filter = kwargs.get("document_ids_filter")
         filters = []
         if document_ids_filter:
-            filters.append({"terms": {"metadata.document_id": document_ids_filter}})
+            filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
         query = default_vector_search_query(query_vector=query_vector, k=top_k, filters=filters, **kwargs)
 
         try:
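
The switch to the `.keyword` sub-field is the substance of this fix: a `terms` filter compares exact stored values, and against the analyzed text field the document ID is tokenized, so a filter on the whole ID can silently match nothing. The keyword sub-field stores the raw, untokenized value. A minimal illustration with placeholder IDs:

    # Against an analyzed text field, "abc-123" is split into tokens (e.g.
    # ["abc", "123"]), so a terms filter on the full ID never matches; the
    # keyword sub-field keeps the raw value. IDs below are placeholders.
    filters = [{"terms": {"metadata.document_id.keyword": ["abc-123", "def-456"]}}]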
@@ -256,7 +290,7 @@ class LindormVectorStore(BaseVector):
         filters = kwargs.get("filter", [])
         document_ids_filter = kwargs.get("document_ids_filter")
         if document_ids_filter:
-            filters.append({"terms": {"metadata.document_id": document_ids_filter}})
+            filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
         routing = self._routing
         full_text_query = default_text_search_query(
             query_text=query,
@@ -270,6 +304,7 @@ class LindormVectorStore(BaseVector):
             routing=routing,
             routing_field=self._routing_field,
         )
+
         response = self._client.search(index=self._collection_name, body=full_text_query)
         docs = []
         for hit in response["hits"]["hits"]:
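
The same keyword-field fix applies on this full-text path. Below is a hedged sketch of the kind of body `default_text_search_query` plausibly assembles; the real builder lives elsewhere in this file, so every key beyond the terms filter visible in the diff is illustrative:

    # Illustrative only: the actual query shape is built by
    # default_text_search_query, which this diff does not show.
    full_text_query = {
        "size": 4,
        "query": {
            "bool": {
                "must": [{"match": {"page_content": "example query text"}}],
                "filter": [{"terms": {"metadata.document_id.keyword": ["abc-123"]}}],
            }
        },
    }
    # docs would then come from:
    # response = client.search(index="my_collection", body=full_text_query)
    # [hit["_source"] for hit in response["hits"]["hits"]]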
@@ -479,7 +514,7 @@ def default_vector_search_query(
     **kwargs,
 ) -> dict:
     if filters is not None:
-        filter_type = "post_filter" if filter_type is None else filter_type
+        filter_type = "pre_filter" if filter_type is None else filter_type
         if not isinstance(filters, list):
             raise RuntimeError(f"unexpected filter with {type(filters)}")
     final_ext: dict[str, Any] = {"lvector": {}}
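
Defaulting `filter_type` to `pre_filter` changes when the metadata filter runs: pre-filtering restricts the candidate set before the ANN search, so all top_k hits satisfy the filter, while post-filtering prunes after retrieval and can return far fewer than k hits under a selective filter. The value lands in the query's `ext.lvector` section; here is a sketch assuming a typical knn body, since the rest of the builder is outside this hunk:

    # Only the ext.lvector filter_type key is taken from the diff; the
    # surrounding knn body and field name are assumptions.
    query = {
        "size": 10,
        "query": {"knn": {"vector_field": {"vector": [0.1, 0.2, 0.3], "k": 10}}},
        "ext": {"lvector": {"filter_type": "pre_filter"}},
    }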