mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-06-04 03:04:11 +08:00
Lindorm VDB bugfix (#17357)
Co-authored-by: jiangzhijie <jiangzhijie.jzj@alibaba-inc.com>
This commit is contained in:
parent
e2b8f40275
commit
fd1e40d22e
@ -4,7 +4,8 @@ import logging
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from opensearchpy import OpenSearch
|
||||
from opensearchpy import OpenSearch, helpers
|
||||
from opensearchpy.helpers import BulkIndexError
|
||||
from pydantic import BaseModel, model_validator
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
|
||||
@ -135,14 +136,14 @@ class LindormVectorStore(BaseVector):
|
||||
actions.append(action_header)
|
||||
actions.append(action_values)
|
||||
|
||||
logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")
|
||||
# logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")
|
||||
|
||||
try:
|
||||
_bulk_with_retry(actions)
|
||||
logger.info(f"Successfully processed batch {batch_num + 1}")
|
||||
# logger.info(f"Successfully processed batch {batch_num + 1}")
|
||||
# simple latency to avoid too many requests in a short time
|
||||
if batch_num < num_batches - 1:
|
||||
time.sleep(1)
|
||||
time.sleep(0.5)
|
||||
|
||||
except Exception:
|
||||
logger.exception(f"Failed to process batch {batch_num + 1}")
|
||||
@ -166,18 +167,51 @@ class LindormVectorStore(BaseVector):
|
||||
self.delete_by_ids(ids)
|
||||
|
||||
def delete_by_ids(self, ids: list[str]) -> None:
|
||||
params = {}
|
||||
if self._using_ugc:
|
||||
params["routing"] = self._routing
|
||||
"""Delete documents by their IDs in batch.
|
||||
|
||||
Args:
|
||||
ids: List of document IDs to delete
|
||||
"""
|
||||
if not ids:
|
||||
return
|
||||
|
||||
params = {"routing": self._routing} if self._using_ugc else {}
|
||||
|
||||
# 1. First check if collection exists
|
||||
if not self._client.indices.exists(index=self._collection_name):
|
||||
logger.warning(f"Collection {self._collection_name} does not exist")
|
||||
return
|
||||
|
||||
# 2. Batch process deletions
|
||||
actions = []
|
||||
for id in ids:
|
||||
if self._client.exists(index=self._collection_name, id=id, params=params):
|
||||
params = {}
|
||||
if self._using_ugc:
|
||||
params["routing"] = self._routing
|
||||
self._client.delete(index=self._collection_name, id=id, params=params)
|
||||
actions.append(
|
||||
{
|
||||
"_op_type": "delete",
|
||||
"_index": self._collection_name,
|
||||
"_id": id,
|
||||
**params, # Include routing if using UGC
|
||||
}
|
||||
)
|
||||
else:
|
||||
logger.warning(f"DELETE BY ID: ID {id} does not exist in the index.")
|
||||
|
||||
# 3. Perform bulk deletion if there are valid documents to delete
|
||||
if actions:
|
||||
try:
|
||||
helpers.bulk(self._client, actions)
|
||||
except BulkIndexError as e:
|
||||
for error in e.errors:
|
||||
delete_error = error.get("delete", {})
|
||||
status = delete_error.get("status")
|
||||
doc_id = delete_error.get("_id")
|
||||
|
||||
if status == 404:
|
||||
logger.warning(f"Document not found for deletion: {doc_id}")
|
||||
else:
|
||||
logger.exception(f"Error deleting document: {error}")
|
||||
|
||||
def delete(self) -> None:
|
||||
if self._using_ugc:
|
||||
routing_filter_query = {
|
||||
@ -213,7 +247,7 @@ class LindormVectorStore(BaseVector):
|
||||
document_ids_filter = kwargs.get("document_ids_filter")
|
||||
filters = []
|
||||
if document_ids_filter:
|
||||
filters.append({"terms": {"metadata.document_id": document_ids_filter}})
|
||||
filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
|
||||
query = default_vector_search_query(query_vector=query_vector, k=top_k, filters=filters, **kwargs)
|
||||
|
||||
try:
|
||||
@ -256,7 +290,7 @@ class LindormVectorStore(BaseVector):
|
||||
filters = kwargs.get("filter", [])
|
||||
document_ids_filter = kwargs.get("document_ids_filter")
|
||||
if document_ids_filter:
|
||||
filters.append({"terms": {"metadata.document_id": document_ids_filter}})
|
||||
filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
|
||||
routing = self._routing
|
||||
full_text_query = default_text_search_query(
|
||||
query_text=query,
|
||||
@ -270,6 +304,7 @@ class LindormVectorStore(BaseVector):
|
||||
routing=routing,
|
||||
routing_field=self._routing_field,
|
||||
)
|
||||
|
||||
response = self._client.search(index=self._collection_name, body=full_text_query)
|
||||
docs = []
|
||||
for hit in response["hits"]["hits"]:
|
||||
@ -479,7 +514,7 @@ def default_vector_search_query(
|
||||
**kwargs,
|
||||
) -> dict:
|
||||
if filters is not None:
|
||||
filter_type = "post_filter" if filter_type is None else filter_type
|
||||
filter_type = "pre_filter" if filter_type is None else filter_type
|
||||
if not isinstance(filters, list):
|
||||
raise RuntimeError(f"unexpected filter with {type(filters)}")
|
||||
final_ext: dict[str, Any] = {"lvector": {}}
|
||||
|
Loading…
x
Reference in New Issue
Block a user