Lindorm VDB bugfix (#17357)
Co-authored-by: jiangzhijie <jiangzhijie.jzj@alibaba-inc.com>
parent e2b8f40275
commit fd1e40d22e
@@ -4,7 +4,8 @@ import logging
 import time
 from typing import Any, Optional
 
-from opensearchpy import OpenSearch
+from opensearchpy import OpenSearch, helpers
+from opensearchpy.helpers import BulkIndexError
 from pydantic import BaseModel, model_validator
 from tenacity import retry, stop_after_attempt, wait_exponential
 
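
Note: the tenacity imports above back the `_bulk_with_retry` helper that the batch loop in the next hunk calls. Its implementation is outside this diff, so the following is only a minimal sketch of what such a retry wrapper typically looks like (the client endpoint, retry counts, and backoff parameters are assumptions):

    from opensearchpy import OpenSearch, helpers
    from tenacity import retry, stop_after_attempt, wait_exponential

    client = OpenSearch(hosts=["http://localhost:9200"])  # illustrative endpoint

    # Retry the bulk request up to 3 times, backing off exponentially between attempts.
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    def _bulk_with_retry(actions: list[dict]) -> None:
        helpers.bulk(client, actions)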
@@ -135,14 +136,14 @@ class LindormVectorStore(BaseVector):
                 actions.append(action_header)
                 actions.append(action_values)
 
-            logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")
+            # logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")
 
             try:
                 _bulk_with_retry(actions)
-                logger.info(f"Successfully processed batch {batch_num + 1}")
+                # logger.info(f"Successfully processed batch {batch_num + 1}")
                 # simple latency to avoid too many requests in a short time
                 if batch_num < num_batches - 1:
-                    time.sleep(1)
+                    time.sleep(0.5)
 
             except Exception:
                 logger.exception(f"Failed to process batch {batch_num + 1}")
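
For orientation, the hunk above sits inside a batching loop; the change only quiets two per-batch info logs and halves the inter-batch pause. A rough sketch of how such a loop typically derives `num_batches`, `start_idx`, and `end_idx` (the variable names match the diff, but the surrounding method body is assumed):

    import math
    import time

    def process_in_batches(documents: list[dict], batch_size: int = 500) -> None:
        # Split the documents into fixed-size batches and throttle between bulk requests.
        num_batches = math.ceil(len(documents) / batch_size)
        for batch_num in range(num_batches):
            start_idx = batch_num * batch_size
            end_idx = min(start_idx + batch_size, len(documents))
            batch = documents[start_idx:end_idx]  # documents handled in this batch
            # ... build the bulk actions for this batch and call _bulk_with_retry(actions) ...
            if batch_num < num_batches - 1:
                time.sleep(0.5)  # simple latency between batches, as in the hunk above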
@@ -166,18 +167,51 @@ class LindormVectorStore(BaseVector):
         self.delete_by_ids(ids)
 
     def delete_by_ids(self, ids: list[str]) -> None:
-        params = {}
-        if self._using_ugc:
-            params["routing"] = self._routing
+        """Delete documents by their IDs in batch.
+
+        Args:
+            ids: List of document IDs to delete
+        """
+        if not ids:
+            return
+
+        params = {"routing": self._routing} if self._using_ugc else {}
+
+        # 1. First check if collection exists
+        if not self._client.indices.exists(index=self._collection_name):
+            logger.warning(f"Collection {self._collection_name} does not exist")
+            return
+
+        # 2. Batch process deletions
+        actions = []
         for id in ids:
             if self._client.exists(index=self._collection_name, id=id, params=params):
-                params = {}
-                if self._using_ugc:
-                    params["routing"] = self._routing
-                self._client.delete(index=self._collection_name, id=id, params=params)
+                actions.append(
+                    {
+                        "_op_type": "delete",
+                        "_index": self._collection_name,
+                        "_id": id,
+                        **params,  # Include routing if using UGC
+                    }
+                )
             else:
                 logger.warning(f"DELETE BY ID: ID {id} does not exist in the index.")
+
+        # 3. Perform bulk deletion if there are valid documents to delete
+        if actions:
+            try:
+                helpers.bulk(self._client, actions)
+            except BulkIndexError as e:
+                for error in e.errors:
+                    delete_error = error.get("delete", {})
+                    status = delete_error.get("status")
+                    doc_id = delete_error.get("_id")
+
+                    if status == 404:
+                        logger.warning(f"Document not found for deletion: {doc_id}")
+                    else:
+                        logger.exception(f"Error deleting document: {error}")
 
     def delete(self) -> None:
         if self._using_ugc:
             routing_filter_query = {
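
The new except branch depends on the shape of the failure items that helpers.bulk reports: each entry of BulkIndexError.errors mirrors a bulk response item keyed by the operation type. A hedged illustration of the structure the handler expects (all field values here are made up):

    # Illustrative only: one entry of BulkIndexError.errors for a failed delete action.
    error = {
        "delete": {
            "_index": "hypothetical_collection",
            "_id": "doc-123",
            "status": 404,          # 404 -> logged as a warning ("already gone")
            "result": "not_found",
        }
    }
    delete_error = error.get("delete", {})
    status = delete_error.get("status")   # 404
    doc_id = delete_error.get("_id")      # "doc-123"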
@@ -213,7 +247,7 @@ class LindormVectorStore(BaseVector):
         document_ids_filter = kwargs.get("document_ids_filter")
         filters = []
         if document_ids_filter:
-            filters.append({"terms": {"metadata.document_id": document_ids_filter}})
+            filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
         query = default_vector_search_query(query_vector=query_vector, k=top_k, filters=filters, **kwargs)
 
         try:
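
The `.keyword` suffix is the substance of this hunk and the next one: a terms filter matches exact values, and on a text-mapped metadata field the indexed tokens are analyzed, so exact document-ID matching has to target the keyword sub-field. A small sketch of the filter clause the change produces (the ID values are hypothetical):

    # Hypothetical document IDs passed in via kwargs["document_ids_filter"].
    document_ids_filter = ["doc-id-1", "doc-id-2"]

    # Exact matching against the un-analyzed keyword sub-field of metadata.document_id.
    filter_clause = {"terms": {"metadata.document_id.keyword": document_ids_filter}}
    filters = [filter_clause]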
@@ -256,7 +290,7 @@ class LindormVectorStore(BaseVector):
         filters = kwargs.get("filter", [])
         document_ids_filter = kwargs.get("document_ids_filter")
         if document_ids_filter:
-            filters.append({"terms": {"metadata.document_id": document_ids_filter}})
+            filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
         routing = self._routing
         full_text_query = default_text_search_query(
             query_text=query,
@@ -270,6 +304,7 @@ class LindormVectorStore(BaseVector):
             routing=routing,
             routing_field=self._routing_field,
         )
+
         response = self._client.search(index=self._collection_name, body=full_text_query)
         docs = []
         for hit in response["hits"]["hits"]:
@@ -479,7 +514,7 @@ def default_vector_search_query(
     **kwargs,
 ) -> dict:
     if filters is not None:
-        filter_type = "post_filter" if filter_type is None else filter_type
+        filter_type = "pre_filter" if filter_type is None else filter_type
         if not isinstance(filters, list):
             raise RuntimeError(f"unexpected filter with {type(filters)}")
     final_ext: dict[str, Any] = {"lvector": {}}
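
This last hunk flips the default filter mode from post_filter to pre_filter: a post filter prunes hits only after the k-NN search has already picked its top-k candidates, so a selective document-ID filter can leave few or no results, while a pre filter restricts the candidate set before the vector search runs. A hedged sketch of how the default might end up in the query's lvector extension (placing filter_type under the final_ext dict shown above is an assumption):

    from typing import Any

    filter_type = None  # caller did not specify a mode
    filters = [{"terms": {"metadata.document_id.keyword": ["doc-id-1"]}}]

    if filters is not None:
        # New default: filter before the k-NN search instead of after it.
        filter_type = "pre_filter" if filter_type is None else filter_type

    final_ext: dict[str, Any] = {"lvector": {}}
    if filter_type:
        final_ext["lvector"]["filter_type"] = filter_type  # assumed key under the lvector ext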