diff --git a/rag/llm/__init__.py b/rag/llm/__init__.py index 11f248d17..b4a56f2ec 100644 --- a/rag/llm/__init__.py +++ b/rag/llm/__init__.py @@ -83,7 +83,6 @@ ChatModel = { "VolcEngine": VolcEngineChat, "BaiChuan": BaiChuanChat, "MiniMax": MiniMaxChat, - "Minimax": MiniMaxChat, "Mistral": MistralChat, "Gemini": GeminiChat, "Bedrock": BedrockChat, diff --git a/rag/nlp/query.py b/rag/nlp/query.py index c58c99c4c..79f730113 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -165,7 +165,7 @@ class EsQueryer: d = {} if isinstance(tks, str): tks = tks.split(" ") - for t, c in self.tw.weights(tks): + for t, c in self.tw.weights(tks, preprocess=False): if t not in d: d[t] = 0 d[t] += c @@ -177,9 +177,9 @@ class EsQueryer: def similarity(self, qtwt, dtwt): if isinstance(dtwt, type("")): - dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt))} + dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt), preprocess=False)} if isinstance(qtwt, type("")): - qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt))} + qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt), preprocess=False)} s = 1e-9 for k, v in qtwt.items(): if k in dtwt: diff --git a/rag/nlp/term_weight.py b/rag/nlp/term_weight.py index 34765feaf..1b22dc9e1 100644 --- a/rag/nlp/term_weight.py +++ b/rag/nlp/term_weight.py @@ -1,4 +1,4 @@ -# + # # Copyright 2024 The InfiniFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -158,7 +158,7 @@ class Dealer: tks.append(t) return tks - def weights(self, tks): + def weights(self, tks, preprocess=True): def skill(t): if t not in self.sk: return 1 @@ -222,14 +222,20 @@ class Dealer: def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5))) tw = [] - for tk in tks: - tt = self.tokenMerge(self.pretoken(tk, True)) - idf1 = np.array([idf(freq(t), 10000000) for t in tt]) - idf2 = np.array([idf(df(t), 1000000000) for t in tt]) + if not preprocess: + idf1 = np.array([idf(freq(t), 10000000) for t in tks]) + idf2 = np.array([idf(df(t), 1000000000) for t in tks]) wts = (0.3 * idf1 + 0.7 * idf2) * \ - np.array([ner(t) * postag(t) for t in tt]) - - tw.extend(zip(tt, wts)) + np.array([ner(t) * postag(t) for t in tks]) + tw = list(zip(tks, wts)) + else: + for tk in tks: + tt = self.tokenMerge(self.pretoken(tk, True)) + idf1 = np.array([idf(freq(t), 10000000) for t in tt]) + idf2 = np.array([idf(df(t), 1000000000) for t in tt]) + wts = (0.3 * idf1 + 0.7 * idf2) * \ + np.array([ner(t) * postag(t) for t in tt]) + tw.extend(zip(tt, wts)) S = np.sum([s for _, s in tw]) return [(t, s / S) for t, s in tw]