From 920b2c2b40a017cb5c25fae9def16b101896d5c3 Mon Sep 17 00:00:00 2001
From: Jyong <76649700+JohnJyong@users.noreply.github.com>
Date: Tue, 27 Feb 2024 17:30:52 +0800
Subject: [PATCH] Fix hit-testing t-SNE issue (#2581)

Co-authored-by: jyong <jyong@dify.ai>
---
 api/celerybeat-schedule.db                   | Bin 16384 -> 0 bytes
 api/core/features/annotation_reply.py        |   2 +-
 api/core/rag/datasource/retrieval_service.py |   4 ++--
 api/services/hit_testing_service.py          |   5 +++--
 4 files changed, 6 insertions(+), 5 deletions(-)
 delete mode 100644 api/celerybeat-schedule.db

diff --git a/api/celerybeat-schedule.db b/api/celerybeat-schedule.db
deleted file mode 100644
index b8c01de27bfe7ea04f1dd868cec4935ef336f2b5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16384
zcmeI%J#W)M7zglk>f!`QTojc7@c|m78dNo6>rydA=}>~icDmRP*lOxV@fkIhEM?+F
zx~<oR4}ka>ENo1C0EEPX#K4Zk%h}inp%k`MMgNm7_ax8R_xN|`DeS_kV2srmv)?kd
zVnTMAG0O~jXZ12L`QnGAa$GZ`oyR9}_Q8yK%je{M;jLbjy6|POAOs))0SG_<0uX=z
z1Rwwb2teT62-Mj(_lx`4{p7xP-?*>cL367)Xr7z$Q78l;009U<00Izz00bZafio4D
z*(VRmKSDFTrmp!T5;3R!Aq7DcKjgmfL*h~-ds<!cX+)E$HzXmoYTh-OARW=2SWc$;
z_+M9VCFOkY^fmsH(yAT^Niy1}-@{|QebM=7)ts9xtN%_bdpwm#X5jY*+oq<@u70d{
zb?=2*DT}YFNX%WT*h~x^)yVH1&`1jNy~lcoJ@&+0`)nuw>|{FA(gYPz8m9>b+|(oz
zlF3h^(Edd@H?RIgm^Z6Ln3vLFqkP=;mUX?Y!&dQh8(}+K?Xmv-+WeBQRvRb$J&FTf
zY(P5JdAX<N)~mJbPJ8KjR;{%hUQv!eIf$hg7MpI}(#qO2{e}*^VK+C*^sw+!4jqo8
z2_?G!>SdB=Qjg4Oi6}AWu2CL*wcPbKyxZF2{1Hu(=pg2NW<T{(>2r$3a74k(-tpwo
znZ77k90Cx400bZa0SG_<0uX=z1Rwx`g$R88js^h;KmY;|fB*y_009U<00Izz!2d4r
E0}denXaE2J

diff --git a/api/core/features/annotation_reply.py b/api/core/features/annotation_reply.py
index e1b64cf73f..fd516e465f 100644
--- a/api/core/features/annotation_reply.py
+++ b/api/core/features/annotation_reply.py
@@ -59,7 +59,7 @@ class AnnotationReplyFeature:
 
             documents = vector.search_by_vector(
                 query=query,
-                k=1,
+                top_k=1,
                 score_threshold=score_threshold,
                 filter={
                     'group_id': [dataset.id]
diff --git a/api/core/rag/datasource/retrieval_service.py b/api/core/rag/datasource/retrieval_service.py
index e295e58950..0f9c753056 100644
--- a/api/core/rag/datasource/retrieval_service.py
+++ b/api/core/rag/datasource/retrieval_service.py
@@ -101,7 +101,7 @@ class RetrievalService:
 
             documents = keyword.search(
                 query,
-                k=top_k
+                top_k=top_k
             )
             all_documents.extend(documents)
 
@@ -121,7 +121,7 @@ class RetrievalService:
             documents = vector.search_by_vector(
                 query,
                 search_type='similarity_score_threshold',
-                k=top_k,
+                top_k=top_k,
                 score_threshold=score_threshold,
                 filter={
                     'group_id': [dataset.id]
diff --git a/api/services/hit_testing_service.py b/api/services/hit_testing_service.py
index 568974b74f..6d5a0537d3 100644
--- a/api/services/hit_testing_service.py
+++ b/api/services/hit_testing_service.py
@@ -133,8 +133,9 @@ class HitTestingService:
         if embedding_length <= 1:
             return [{'x': 0, 'y': 0}]
 
-        concatenate_data = np.array(embeddings).reshape(embedding_length, -1)
-        # concatenate_data = np.concatenate(embeddings)
+        noise = np.random.normal(0, 1e-4, np.array(embeddings).shape)
+        concatenate_data = np.array(embeddings) + noise
+        concatenate_data = concatenate_data.reshape(embedding_length, -1)
 
         perplexity = embedding_length / 2 + 1
         if perplexity >= embedding_length: