From d9266ed65addb5b8b28e922246dc377eb6617976 Mon Sep 17 00:00:00 2001 From: alulala <61424174+Yue-Lyu123@users.noreply.github.com> Date: Fri, 11 Apr 2025 12:31:36 +0800 Subject: [PATCH] Fix: incorrect total chunks count in retrieval function after similarity filtering (#6741) (#6932) ### Related Issue: https://github.com/infiniflow/ragflow/issues/6741 ### Environment: Using nightly version Commit version: [[6051abb](https://github.com/infiniflow/ragflow/commit/6051abb4a328176717fae3836e2ce933a980a894)] ### Bug Description: The retrieval function in rag/nlp/search.py returns the original, unfiltered total chunk count even after chunks are filtered by similarity_threshold. This creates an inconsistency between the chunks actually returned and the reported total. ### Changes Made: Added code to count how many search results actually meet or exceed the configured similarity threshold. Positioned the calculation after the doc_ids conditional logic, so that the doc_ids special case (where similarity_threshold is reset to 0) is counted correctly. Updated the ranks["total"] value to store this filtered count instead of the raw search result count. Used NumPy so that the count is computed with a vectorized, C-level batch operation rather than a Python loop. --- rag/nlp/search.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rag/nlp/search.py b/rag/nlp/search.py index e2128a26..187c2030 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -363,7 +363,6 @@ class Dealer: sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight, rank_feature=rank_feature) - ranks["total"] = sres.total if rerank_mdl and sres.total > 0: sim, tsim, vsim = self.rerank_by_model(rerank_mdl, @@ -383,6 +382,9 @@ class Dealer: if doc_ids: similarity_threshold = 0 page_size = 30 + sim_np = np.array(sim) + filtered_count = (sim_np >= similarity_threshold).sum() + ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error for i in idx: if sim[i] < similarity_threshold: break