Fix: float transfer exception. (#6197)

### What problem does this PR solve?

#6177

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Kevin Hu 2025-03-18 11:13:44 +08:00 committed by GitHub
parent 222a2c8fa5
commit 1333d3c02a
5 changed files with 27 additions and 15 deletions


@@ -24,7 +24,7 @@ import trio
from api.utils import get_uuid
from graphrag.query_analyze_prompt import PROMPTS
from graphrag.utils import get_entity_type2sampels, get_llm_cache, set_llm_cache, get_relation
-from rag.utils import num_tokens_from_string
+from rag.utils import num_tokens_from_string, get_float
from rag.utils.doc_store_conn import OrderByExpr
from rag.nlp.search import Dealer, index_name
@@ -72,13 +72,13 @@ class KGSearch(Dealer):
for f in flds:
if f in ent and ent[f] is None:
del ent[f]
-if float(ent.get("_score", 0)) < sim_thr:
+if get_float(ent.get("_score", 0)) < sim_thr:
continue
if isinstance(ent["entity_kwd"], list):
ent["entity_kwd"] = ent["entity_kwd"][0]
res[ent["entity_kwd"]] = {
"sim": float(ent.get("_score", 0)),
"pagerank": float(ent.get("rank_flt", 0)),
"sim": get_float(ent.get("_score", 0)),
"pagerank": get_float(ent.get("rank_flt", 0)),
"n_hop_ents": json.loads(ent.get("n_hop_with_weight", "[]")),
"description": ent.get("content_with_weight", "{}")
}
@@ -89,7 +89,7 @@ class KGSearch(Dealer):
es_res = self.dataStore.getFields(es_res, ["content_with_weight", "_score", "from_entity_kwd", "to_entity_kwd",
"weight_int"])
for _, ent in es_res.items():
-if float(ent["_score"]) < sim_thr:
+if get_float(ent["_score"]) < sim_thr:
continue
f, t = sorted([ent["from_entity_kwd"], ent["to_entity_kwd"]])
if isinstance(f, list):
@@ -97,8 +97,8 @@ class KGSearch(Dealer):
if isinstance(t, list):
t = t[0]
res[(f, t)] = {
"sim": float(ent["_score"]),
"pagerank": float(ent.get("weight_int", 0)),
"sim": get_float(ent["_score"]),
"pagerank": get_float(ent.get("weight_int", 0)),
"description": ent["content_with_weight"]
}
return res
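
The first file moves the entity and relation scoring in `KGSearch` from bare `float()` casts to the new `get_float()` helper. A minimal sketch of the failure mode being guarded against (the sample hit and threshold are invented):

```python
# Inlined copy of the get_float() helper this PR adds to rag/utils.
def get_float(v):
    if v is None:
        return float('-inf')
    try:
        return float(v)
    except Exception:
        return float('-inf')

# A hit whose "_score" comes back as None or as a non-numeric string used to
# blow up the retrieval path: float(None) raises TypeError, float("n/a") ValueError.
ent = {"entity_kwd": "RAGFlow", "_score": None}

sim_thr = 0.3
# With get_float() the bad score degrades to -inf and the entity is simply
# filtered out by the similarity threshold instead of raising.
print(get_float(ent.get("_score", 0)) < sim_thr)  # True -> skipped, no exception
```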


@@ -30,6 +30,8 @@ from docx import Document
from PIL import Image
from markdown import markdown
+from rag.utils import get_float
class Excel(ExcelParser):
def __call__(self, fnm, binary=None, callback=None):
@@ -126,8 +128,8 @@ class Pdf(PdfParser):
section, line_tag = box['text'], self._line_tag(box, zoomin)
has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
last_box, last_index, last_bull = box, index, has_bull
-line_pn = float(line_tag.lstrip('@@').split('\t')[0])
-line_top = float(line_tag.rstrip('##').split('\t')[3])
+line_pn = get_float(line_tag.lstrip('@@').split('\t')[0])
+line_top = get_float(line_tag.rstrip('##').split('\t')[3])
tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text = self.get_tbls_info(tbls, tbl_index)
if not has_bull: # No question bullet
if not last_q:
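
The Q&A PDF parser reads the page number and top coordinate out of a positional line tag. A rough sketch of the guarded parse, assuming the tag has the shape `@@<page>\t<x0>\t<x1>\t<top>\t<bottom>##`; the sample tag and its odd first field are invented:

```python
from rag.utils import get_float  # helper added in this PR

# Invented tag whose first field is not a plain number -- the kind of value the
# old bare float() call would choke on with a ValueError.
line_tag = "@@3-4\t56.0\t520.4\t112.7\t130.2##"

fields = line_tag.lstrip('@@').rstrip('##').split('\t')
line_pn = get_float(fields[0])   # -inf instead of ValueError on "3-4"
line_top = get_float(fields[3])  # 112.7
print(line_pn, line_top)
```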


@@ -18,7 +18,7 @@ import re
from dataclasses import dataclass
from rag.settings import TAG_FLD, PAGERANK_FLD
-from rag.utils import rmSpace
+from rag.utils import rmSpace, get_float
from rag.nlp import rag_tokenizer, query
import numpy as np
from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr
@@ -49,7 +49,7 @@ class Dealer:
if len(shape) > 1:
raise Exception(
f"Dealer.get_vector returned array's shape {shape} doesn't match expectation(exact one dimension).")
-embedding_data = [float(v) for v in qv]
+embedding_data = [get_float(v) for v in qv]
vector_column_name = f"q_{len(embedding_data)}_vec"
return MatchDenseExpr(vector_column_name, embedding_data, 'float', 'cosine', topk, {"similarity": similarity})
@@ -153,7 +153,7 @@ class Dealer:
@staticmethod
def trans2floats(txt):
-return [float(t) for t in txt.split("\t")]
+return [get_float(t) for t in txt.split("\t")]
def insert_citations(self, answer, chunks, chunk_v,
embd_mdl, tkweight=0.1, vtweight=0.9):
@@ -282,7 +282,7 @@ class Dealer:
for chunk_id in sres.ids:
vector = sres.field[chunk_id].get(vector_column, zero_vector)
if isinstance(vector, str):
-vector = [float(v) for v in vector.split("\t")]
+vector = [get_float(v) for v in vector.split("\t")]
ins_embd.append(vector)
if not ins_embd:
return [], [], []
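
`Dealer` handles embeddings that are stored and shipped as tab-separated strings (`get_vector`, `trans2floats`, and the re-ranking path above). A short sketch of the new behaviour on a corrupted element; the vector string is made up:

```python
from rag.utils import get_float  # helper added in this PR

# An empty or non-numeric element used to abort the whole query with ValueError;
# with get_float it collapses to -inf and the rest of the vector survives.
raw = "0.12\t0.98\t\t0.05"
vector = [get_float(v) for v in raw.split("\t")]
print(vector)  # [0.12, 0.98, -inf, 0.05]
```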


@@ -19,6 +19,7 @@ import re
import tiktoken
from api.utils.file_utils import get_project_base_directory
def singleton(cls, *args, **kw):
instances = {}
@@ -89,3 +90,12 @@ def num_tokens_from_string(string: str) -> int:
def truncate(string: str, max_len: int) -> str:
"""Returns truncated text if the length of text exceed max_len."""
return encoder.decode(encoder.encode(string)[:max_len])
+def get_float(v: str | None):
+    if v is None:
+        return float('-inf')
+    try:
+        return float(v)
+    except Exception:
+        return float('-inf')
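
The new helper in `rag/utils` is the core of the fix: it never raises, mapping anything `float()` cannot parse to `-inf`, which sorts below any real score or threshold. A few illustrative calls:

```python
from rag.utils import get_float

print(get_float("0.83"))  # 0.83
print(get_float(None))    # -inf
print(get_float("N/A"))   # -inf (the ValueError is swallowed)
print(get_float(3))       # 3.0  (numbers pass through float() despite the `str | None` hint)
```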


@@ -26,7 +26,7 @@ from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
from elastic_transport import ConnectionTimeout
from rag import settings
from rag.settings import TAG_FLD, PAGERANK_FLD
-from rag.utils import singleton
+from rag.utils import singleton, get_float
from api.utils.file_utils import get_project_base_directory
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
FusionExpr
@@ -178,7 +178,7 @@ class ESConnection(DocStoreConnection):
MatchDenseExpr) and isinstance(
matchExprs[2], FusionExpr)
weights = m.fusion_params["weights"]
-vector_similarity_weight = float(weights.split(",")[1])
+vector_similarity_weight = get_float(weights.split(",")[1])
for m in matchExprs:
if isinstance(m, MatchTextExpr):
minimum_should_match = m.extra_options.get("minimum_should_match", 0.0)
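
Lastly, the Elasticsearch connector pulls the dense-vector weight out of a comma-separated fusion-weights string. A small sketch, assuming the weights look like `"0.05,0.95"`; the malformed variant is invented:

```python
from rag.utils import get_float  # helper added in this PR

weights = "0.05,0.95"
print(get_float(weights.split(",")[1]))  # 0.95

# A malformed second entry now degrades to -inf instead of raising in the query builder.
broken = "0.05,"
print(get_float(broken.split(",")[1]))   # -inf
```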