From 1333d3c02a5a97a7ab9aab8405cf00f1e81430a3 Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Tue, 18 Mar 2025 11:13:44 +0800
Subject: [PATCH] Fix: float transfer exception. (#6197)

### What problem does this PR solve?

#6177

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 graphrag/search.py    | 14 +++++++-------
 rag/app/qa.py         |  6 ++++--
 rag/nlp/search.py     |  8 ++++----
 rag/utils/__init__.py | 10 ++++++++++
 rag/utils/es_conn.py  |  4 ++--
 5 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/graphrag/search.py b/graphrag/search.py
index e0df92235..6a0236a05 100644
--- a/graphrag/search.py
+++ b/graphrag/search.py
@@ -24,7 +24,7 @@ import trio
 from api.utils import get_uuid
 from graphrag.query_analyze_prompt import PROMPTS
 from graphrag.utils import get_entity_type2sampels, get_llm_cache, set_llm_cache, get_relation
-from rag.utils import num_tokens_from_string
+from rag.utils import num_tokens_from_string, get_float
 from rag.utils.doc_store_conn import OrderByExpr
 from rag.nlp.search import Dealer, index_name
 
@@ -72,13 +72,13 @@ class KGSearch(Dealer):
             for f in flds:
                 if f in ent and ent[f] is None:
                     del ent[f]
-            if float(ent.get("_score", 0)) < sim_thr:
+            if get_float(ent.get("_score", 0)) < sim_thr:
                 continue
             if isinstance(ent["entity_kwd"], list):
                 ent["entity_kwd"] = ent["entity_kwd"][0]
             res[ent["entity_kwd"]] = {
-                "sim": float(ent.get("_score", 0)),
-                "pagerank": float(ent.get("rank_flt", 0)),
+                "sim": get_float(ent.get("_score", 0)),
+                "pagerank": get_float(ent.get("rank_flt", 0)),
                 "n_hop_ents": json.loads(ent.get("n_hop_with_weight", "[]")),
                 "description": ent.get("content_with_weight", "{}")
             }
@@ -89,7 +89,7 @@ class KGSearch(Dealer):
         es_res = self.dataStore.getFields(es_res, ["content_with_weight", "_score", "from_entity_kwd", "to_entity_kwd",
                                                    "weight_int"])
         for _, ent in es_res.items():
-            if float(ent["_score"]) < sim_thr:
+            if get_float(ent["_score"]) < sim_thr:
                 continue
             f, t = sorted([ent["from_entity_kwd"], ent["to_entity_kwd"]])
             if isinstance(f, list):
@@ -97,8 +97,8 @@ class KGSearch(Dealer):
             if isinstance(t, list):
                 t = t[0]
             res[(f, t)] = {
-                "sim": float(ent["_score"]),
-                "pagerank": float(ent.get("weight_int", 0)),
+                "sim": get_float(ent["_score"]),
+                "pagerank": get_float(ent.get("weight_int", 0)),
                 "description": ent["content_with_weight"]
             }
         return res
diff --git a/rag/app/qa.py b/rag/app/qa.py
index ebc3677a2..986618989 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -30,6 +30,8 @@ from docx import Document
 from PIL import Image
 from markdown import markdown
 
+from rag.utils import get_float
+
 
 class Excel(ExcelParser):
     def __call__(self, fnm, binary=None, callback=None):
@@ -126,8 +128,8 @@ class Pdf(PdfParser):
             section, line_tag = box['text'], self._line_tag(box, zoomin)
             has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
             last_box, last_index, last_bull = box, index, has_bull
-            line_pn = float(line_tag.lstrip('@@').split('\t')[0])
-            line_top = float(line_tag.rstrip('##').split('\t')[3])
+            line_pn = get_float(line_tag.lstrip('@@').split('\t')[0])
+            line_top = get_float(line_tag.rstrip('##').split('\t')[3])
             tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
             if not has_bull:  # No question bullet
                 if not last_q:
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index b1801607b..ba2ee3ffe 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -18,7 +18,7 @@ import re
 from dataclasses import dataclass
 
 from rag.settings import TAG_FLD, PAGERANK_FLD
-from rag.utils import rmSpace
+from rag.utils import rmSpace, get_float
 from rag.nlp import rag_tokenizer, query
 import numpy as np
 from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr
@@ -49,7 +49,7 @@ class Dealer:
         if len(shape) > 1:
             raise Exception(
                 f"Dealer.get_vector returned array's shape {shape} doesn't match expectation(exact one dimension).")
-        embedding_data = [float(v) for v in qv]
+        embedding_data = [get_float(v) for v in qv]
         vector_column_name = f"q_{len(embedding_data)}_vec"
         return MatchDenseExpr(vector_column_name, embedding_data, 'float', 'cosine', topk, {"similarity": similarity})
 
@@ -153,7 +153,7 @@ class Dealer:
 
     @staticmethod
     def trans2floats(txt):
-        return [float(t) for t in txt.split("\t")]
+        return [get_float(t) for t in txt.split("\t")]
 
     def insert_citations(self, answer, chunks, chunk_v, embd_mdl,
                          tkweight=0.1, vtweight=0.9):
@@ -282,7 +282,7 @@ class Dealer:
         for chunk_id in sres.ids:
             vector = sres.field[chunk_id].get(vector_column, zero_vector)
             if isinstance(vector, str):
-                vector = [float(v) for v in vector.split("\t")]
+                vector = [get_float(v) for v in vector.split("\t")]
             ins_embd.append(vector)
         if not ins_embd:
             return [], [], []
diff --git a/rag/utils/__init__.py b/rag/utils/__init__.py
index 8a4c1e09f..f80d56b7d 100644
--- a/rag/utils/__init__.py
+++ b/rag/utils/__init__.py
@@ -19,6 +19,7 @@ import re
 import tiktoken
 from api.utils.file_utils import get_project_base_directory
 
+
 def singleton(cls, *args, **kw):
     instances = {}
 
@@ -89,3 +90,12 @@ def num_tokens_from_string(string: str) -> int:
 def truncate(string: str, max_len: int) -> str:
     """Returns truncated text if the length of text exceed max_len."""
     return encoder.decode(encoder.encode(string)[:max_len])
+
+
+def get_float(v: str | None):
+    if v is None:
+        return float('-inf')
+    try:
+        return float(v)
+    except Exception:
+        return float('-inf')
\ No newline at end of file
diff --git a/rag/utils/es_conn.py b/rag/utils/es_conn.py
index a8dd65317..9cba60899 100644
--- a/rag/utils/es_conn.py
+++ b/rag/utils/es_conn.py
@@ -26,7 +26,7 @@ from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
 from elastic_transport import ConnectionTimeout
 from rag import settings
 from rag.settings import TAG_FLD, PAGERANK_FLD
-from rag.utils import singleton
+from rag.utils import singleton, get_float
 from api.utils.file_utils import get_project_base_directory
 from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
     FusionExpr
@@ -178,7 +178,7 @@ class ESConnection(DocStoreConnection):
                                                         MatchDenseExpr) and isinstance(
                     matchExprs[2], FusionExpr)
                 weights = m.fusion_params["weights"]
-                vector_similarity_weight = float(weights.split(",")[1])
+                vector_similarity_weight = get_float(weights.split(",")[1])
         for m in matchExprs:
             if isinstance(m, MatchTextExpr):
                 minimum_should_match = m.extra_options.get("minimum_should_match", 0.0)
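
Note for reviewers: the snippet below is a minimal, self-contained sketch (not part of the patch) of the behaviour the new `rag.utils.get_float` helper provides over a bare `float()` call; the sample inputs are illustrative only.

```python
# Standalone copy of the helper added in rag/utils/__init__.py.
def get_float(v):
    if v is None:
        return float('-inf')
    try:
        return float(v)
    except Exception:
        return float('-inf')


# float() raises TypeError on None and ValueError on non-numeric strings,
# which is the kind of exception this patch guards against. get_float()
# degrades to -inf instead, so callers such as KGSearch that compare the
# result against a similarity threshold simply skip the record.
assert get_float("0.73") == 0.73
assert get_float(None) == float('-inf')
assert get_float("") == float('-inf')     # float("") -> ValueError
assert get_float("n/a") == float('-inf')  # float("n/a") -> ValueError
```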