From d9266ed65addb5b8b28e922246dc377eb6617976 Mon Sep 17 00:00:00 2001
From: alulala <61424174+Yue-Lyu123@users.noreply.github.com>
Date: Fri, 11 Apr 2025 12:31:36 +0800
Subject: [PATCH] Fix: incorrect total chunks count in retrieval function after
 similarity filtering (#6741)  (#6932)

### Related Issue:
https://github.com/infiniflow/ragflow/issues/6741

### Environment:
Using the nightly version.
Commit:
[6051abb](https://github.com/infiniflow/ragflow/commit/6051abb4a328176717fae3836e2ce933a980a894)

### Bug Description:
The retrieval function in rag/nlp/search.py returns the original total
chunk count even after chunks have been filtered by similarity_threshold.
This creates an inconsistency between the chunks actually returned and
the reported total.

### Changes Made:
- Added code to count how many search results actually meet or exceed
  the configured similarity threshold.
- Positioned the calculation after the doc_ids conditional logic, so the
  special case (similarity_threshold reset to 0 when doc_ids is given)
  is handled correctly.
- Updated ranks["total"] to store this filtered count instead of the raw
  search result count.
- Used NumPy for the count, since its optimized C-level batch operations
  keep the comparison fast.
---
 rag/nlp/search.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index e2128a26..187c2030 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -363,7 +363,6 @@ class Dealer:
 
         sres = self.search(req, [index_name(tid) for tid in tenant_ids],
                            kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
-        ranks["total"] = sres.total
 
         if rerank_mdl and sres.total > 0:
             sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
@@ -383,6 +382,9 @@ class Dealer:
         if doc_ids:
             similarity_threshold = 0
             page_size = 30
+        sim_np = np.array(sim)
+        filtered_count = (sim_np >= similarity_threshold).sum()    
+        ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error
         for i in idx:
             if sim[i] < similarity_threshold:
                 break