mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 21:28:59 +08:00
accelerate term weight calculation (#3206)
### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
This commit is contained in:
parent
677f02c2a7
commit
55953819c1
@ -83,7 +83,6 @@ ChatModel = {
|
|||||||
"VolcEngine": VolcEngineChat,
|
"VolcEngine": VolcEngineChat,
|
||||||
"BaiChuan": BaiChuanChat,
|
"BaiChuan": BaiChuanChat,
|
||||||
"MiniMax": MiniMaxChat,
|
"MiniMax": MiniMaxChat,
|
||||||
"Minimax": MiniMaxChat,
|
|
||||||
"Mistral": MistralChat,
|
"Mistral": MistralChat,
|
||||||
"Gemini": GeminiChat,
|
"Gemini": GeminiChat,
|
||||||
"Bedrock": BedrockChat,
|
"Bedrock": BedrockChat,
|
||||||
|
@ -165,7 +165,7 @@ class EsQueryer:
|
|||||||
d = {}
|
d = {}
|
||||||
if isinstance(tks, str):
|
if isinstance(tks, str):
|
||||||
tks = tks.split(" ")
|
tks = tks.split(" ")
|
||||||
for t, c in self.tw.weights(tks):
|
for t, c in self.tw.weights(tks, preprocess=False):
|
||||||
if t not in d:
|
if t not in d:
|
||||||
d[t] = 0
|
d[t] = 0
|
||||||
d[t] += c
|
d[t] += c
|
||||||
@ -177,9 +177,9 @@ class EsQueryer:
|
|||||||
|
|
||||||
def similarity(self, qtwt, dtwt):
|
def similarity(self, qtwt, dtwt):
|
||||||
if isinstance(dtwt, type("")):
|
if isinstance(dtwt, type("")):
|
||||||
dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt))}
|
dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt), preprocess=False)}
|
||||||
if isinstance(qtwt, type("")):
|
if isinstance(qtwt, type("")):
|
||||||
qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt))}
|
qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt), preprocess=False)}
|
||||||
s = 1e-9
|
s = 1e-9
|
||||||
for k, v in qtwt.items():
|
for k, v in qtwt.items():
|
||||||
if k in dtwt:
|
if k in dtwt:
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
#
|
#
|
||||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
@ -158,7 +158,7 @@ class Dealer:
|
|||||||
tks.append(t)
|
tks.append(t)
|
||||||
return tks
|
return tks
|
||||||
|
|
||||||
def weights(self, tks):
|
def weights(self, tks, preprocess=True):
|
||||||
def skill(t):
|
def skill(t):
|
||||||
if t not in self.sk:
|
if t not in self.sk:
|
||||||
return 1
|
return 1
|
||||||
@ -222,14 +222,20 @@ class Dealer:
|
|||||||
def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
|
def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
|
||||||
|
|
||||||
tw = []
|
tw = []
|
||||||
for tk in tks:
|
if not preprocess:
|
||||||
tt = self.tokenMerge(self.pretoken(tk, True))
|
idf1 = np.array([idf(freq(t), 10000000) for t in tks])
|
||||||
idf1 = np.array([idf(freq(t), 10000000) for t in tt])
|
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
||||||
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
|
||||||
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
||||||
np.array([ner(t) * postag(t) for t in tt])
|
np.array([ner(t) * postag(t) for t in tks])
|
||||||
|
tw = zip(tks, wts)
|
||||||
tw.extend(zip(tt, wts))
|
else:
|
||||||
|
for tk in tks:
|
||||||
|
tt = self.tokenMerge(self.pretoken(tk, True))
|
||||||
|
idf1 = np.array([idf(freq(t), 10000000) for t in tt])
|
||||||
|
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
||||||
|
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
||||||
|
np.array([ner(t) * postag(t) for t in tt])
|
||||||
|
tw.extend(zip(tt, wts))
|
||||||
|
|
||||||
S = np.sum([s for _, s in tw])
|
S = np.sum([s for _, s in tw])
|
||||||
return [(t, s / S) for t, s in tw]
|
return [(t, s / S) for t, s in tw]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user