From 1333d3c02a5a97a7ab9aab8405cf00f1e81430a3 Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Tue, 18 Mar 2025 11:13:44 +0800
Subject: [PATCH] Fix: float transfer exception. (#6197)

### What problem does this PR solve?

#6177

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 graphrag/search.py    | 14 +++++++-------
 rag/app/qa.py         |  6 ++++--
 rag/nlp/search.py     |  8 ++++----
 rag/utils/__init__.py | 10 ++++++++++
 rag/utils/es_conn.py  |  4 ++--
 5 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/graphrag/search.py b/graphrag/search.py
index e0df92235..6a0236a05 100644
--- a/graphrag/search.py
+++ b/graphrag/search.py
@@ -24,7 +24,7 @@ import trio
 from api.utils import get_uuid
 from graphrag.query_analyze_prompt import PROMPTS
 from graphrag.utils import get_entity_type2sampels, get_llm_cache, set_llm_cache, get_relation
-from rag.utils import num_tokens_from_string
+from rag.utils import num_tokens_from_string, get_float
 from rag.utils.doc_store_conn import OrderByExpr
 from rag.nlp.search import Dealer, index_name
 
@@ -72,13 +72,13 @@ class KGSearch(Dealer):
             for f in flds:
                 if f in ent and ent[f] is None:
                     del ent[f]
-            if float(ent.get("_score", 0)) < sim_thr:
+            if get_float(ent.get("_score", 0)) < sim_thr:
                 continue
             if isinstance(ent["entity_kwd"], list):
                 ent["entity_kwd"] = ent["entity_kwd"][0]
             res[ent["entity_kwd"]] = {
-                "sim": float(ent.get("_score", 0)),
-                "pagerank": float(ent.get("rank_flt", 0)),
+                "sim": get_float(ent.get("_score", 0)),
+                "pagerank": get_float(ent.get("rank_flt", 0)),
                 "n_hop_ents": json.loads(ent.get("n_hop_with_weight", "[]")),
                 "description": ent.get("content_with_weight", "{}")
             }
@@ -89,7 +89,7 @@ class KGSearch(Dealer):
         es_res = self.dataStore.getFields(es_res, ["content_with_weight", "_score", "from_entity_kwd", "to_entity_kwd",
                                                    "weight_int"])
         for _, ent in es_res.items():
-            if float(ent["_score"]) < sim_thr:
+            if get_float(ent["_score"]) < sim_thr:
                 continue
             f, t = sorted([ent["from_entity_kwd"], ent["to_entity_kwd"]])
             if isinstance(f, list):
@@ -97,8 +97,8 @@ class KGSearch(Dealer):
             if isinstance(t, list):
                 t = t[0]
             res[(f, t)] = {
-                "sim": float(ent["_score"]),
-                "pagerank": float(ent.get("weight_int", 0)),
+                "sim": get_float(ent["_score"]),
+                "pagerank": get_float(ent.get("weight_int", 0)),
                 "description": ent["content_with_weight"]
             }
         return res
diff --git a/rag/app/qa.py b/rag/app/qa.py
index ebc3677a2..986618989 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -30,6 +30,8 @@ from docx import Document
 from PIL import Image
 from markdown import markdown
 
+from rag.utils import get_float
+
 
 class Excel(ExcelParser):
     def __call__(self, fnm, binary=None, callback=None):
@@ -126,8 +128,8 @@ class Pdf(PdfParser):
             section, line_tag = box['text'], self._line_tag(box, zoomin)
             has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
             last_box, last_index, last_bull = box, index, has_bull
-            line_pn = float(line_tag.lstrip('@@').split('\t')[0])
-            line_top = float(line_tag.rstrip('##').split('\t')[3])
+            line_pn = get_float(line_tag.lstrip('@@').split('\t')[0])
+            line_top = get_float(line_tag.rstrip('##').split('\t')[3])
             tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
             if not has_bull:  # No question bullet
                 if not last_q:
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index b1801607b..ba2ee3ffe 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -18,7 +18,7 @@ import re
 from dataclasses import dataclass
 
 from rag.settings import TAG_FLD, PAGERANK_FLD
-from rag.utils import rmSpace
+from rag.utils import rmSpace, get_float
 from rag.nlp import rag_tokenizer, query
 import numpy as np
 from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr
@@ -49,7 +49,7 @@ class Dealer:
         if len(shape) > 1:
             raise Exception(
                 f"Dealer.get_vector returned array's shape {shape} doesn't match expectation(exact one dimension).")
-        embedding_data = [float(v) for v in qv]
+        embedding_data = [get_float(v) for v in qv]
         vector_column_name = f"q_{len(embedding_data)}_vec"
         return MatchDenseExpr(vector_column_name, embedding_data, 'float', 'cosine', topk, {"similarity": similarity})
 
@@ -153,7 +153,7 @@ class Dealer:
 
     @staticmethod
     def trans2floats(txt):
-        return [float(t) for t in txt.split("\t")]
+        return [get_float(t) for t in txt.split("\t")]
 
     def insert_citations(self, answer, chunks, chunk_v, embd_mdl,
                          tkweight=0.1, vtweight=0.9):
@@ -282,7 +282,7 @@ class Dealer:
         for chunk_id in sres.ids:
             vector = sres.field[chunk_id].get(vector_column, zero_vector)
             if isinstance(vector, str):
-                vector = [float(v) for v in vector.split("\t")]
+                vector = [get_float(v) for v in vector.split("\t")]
             ins_embd.append(vector)
         if not ins_embd:
             return [], [], []
diff --git a/rag/utils/__init__.py b/rag/utils/__init__.py
index 8a4c1e09f..f80d56b7d 100644
--- a/rag/utils/__init__.py
+++ b/rag/utils/__init__.py
@@ -19,6 +19,7 @@ import re
 import tiktoken
 from api.utils.file_utils import get_project_base_directory
 
+
 def singleton(cls, *args, **kw):
     instances = {}
 
@@ -89,3 +90,12 @@ def num_tokens_from_string(string: str) -> int:
 def truncate(string: str, max_len: int) -> str:
     """Returns truncated text if the length of text exceed max_len."""
     return encoder.decode(encoder.encode(string)[:max_len])
+
+
+def get_float(v: str | None):
+    if v is None:
+        return float('-inf')
+    try:
+        return float(v)
+    except Exception:
+        return float('-inf')
\ No newline at end of file
diff --git a/rag/utils/es_conn.py b/rag/utils/es_conn.py
index a8dd65317..9cba60899 100644
--- a/rag/utils/es_conn.py
+++ b/rag/utils/es_conn.py
@@ -26,7 +26,7 @@ from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
 from elastic_transport import ConnectionTimeout
 from rag import settings
 from rag.settings import TAG_FLD, PAGERANK_FLD
-from rag.utils import singleton
+from rag.utils import singleton, get_float
 from api.utils.file_utils import get_project_base_directory
 from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
     FusionExpr
@@ -178,7 +178,7 @@ class ESConnection(DocStoreConnection):
                                                         MatchDenseExpr) and isinstance(
                     matchExprs[2], FusionExpr)
                 weights = m.fusion_params["weights"]
-                vector_similarity_weight = float(weights.split(",")[1])
+                vector_similarity_weight = get_float(weights.split(",")[1])
         for m in matchExprs:
             if isinstance(m, MatchTextExpr):
                 minimum_should_match = m.extra_options.get("minimum_should_match", 0.0)
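
Note for reviewers: the snippet below is a minimal, self-contained sketch (not part of the patch) of the behaviour the new `rag.utils.get_float` helper provides over a bare `float()` call; the sample inputs are illustrative only.

```python
# Standalone copy of the helper added in rag/utils/__init__.py.
def get_float(v):
    if v is None:
        return float('-inf')
    try:
        return float(v)
    except Exception:
        return float('-inf')


# float() raises TypeError on None and ValueError on non-numeric strings,
# which is the kind of exception this patch guards against. get_float()
# degrades to -inf instead, so callers such as KGSearch that compare the
# result against a similarity threshold simply skip the record.
assert get_float("0.73") == 0.73
assert get_float(None) == float('-inf')
assert get_float("") == float('-inf')     # float("") -> ValueError
assert get_float("n/a") == float('-inf')  # float("n/a") -> ValueError
```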