accelerate term weight calculation (#3206)

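### What problem does this PR solve?

`Dealer.weights()` ran `pretoken`/`tokenMerge` on every input token before computing its weight. This PR adds a `preprocess` parameter (default `True`, which keeps the existing behavior); with `preprocess=False` the weights are computed directly over the tokens as given, in a single pass. `EsQueryer` now passes `preprocess=False` at its three `weights()` call sites. The PR also removes one of the two `MiniMaxChat` entries (`"MiniMax"` / `"Minimax"`) from `ChatModel`.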

### Type of change

- [x] Performance Improvement
This commit is contained in:
Kevin Hu 2024-11-05 13:11:26 +08:00 committed by GitHub
parent 677f02c2a7
commit 55953819c1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 18 additions and 13 deletions

View File

@ -83,7 +83,6 @@ ChatModel = {
"VolcEngine": VolcEngineChat,
"BaiChuan": BaiChuanChat,
"MiniMax": MiniMaxChat,
"Minimax": MiniMaxChat,
"Mistral": MistralChat,
"Gemini": GeminiChat,
"Bedrock": BedrockChat,

View File

@ -165,7 +165,7 @@ class EsQueryer:
d = {}
if isinstance(tks, str):
tks = tks.split(" ")
for t, c in self.tw.weights(tks):
for t, c in self.tw.weights(tks, preprocess=False):
if t not in d:
d[t] = 0
d[t] += c
@ -177,9 +177,9 @@ class EsQueryer:
def similarity(self, qtwt, dtwt):
if isinstance(dtwt, type("")):
dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt))}
dtwt = {t: w for t, w in self.tw.weights(self.tw.split(dtwt), preprocess=False)}
if isinstance(qtwt, type("")):
qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt))}
qtwt = {t: w for t, w in self.tw.weights(self.tw.split(qtwt), preprocess=False)}
s = 1e-9
for k, v in qtwt.items():
if k in dtwt:

View File

@ -1,4 +1,4 @@
#
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -158,7 +158,7 @@ class Dealer:
tks.append(t)
return tks
def weights(self, tks):
def weights(self, tks, preprocess=True):
def skill(t):
if t not in self.sk:
return 1
@ -222,13 +222,19 @@ class Dealer:
def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
tw = []
if not preprocess:
idf1 = np.array([idf(freq(t), 10000000) for t in tks])
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
wts = (0.3 * idf1 + 0.7 * idf2) * \
np.array([ner(t) * postag(t) for t in tks])
tw = zip(tks, wts)
else:
for tk in tks:
tt = self.tokenMerge(self.pretoken(tk, True))
idf1 = np.array([idf(freq(t), 10000000) for t in tt])
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
wts = (0.3 * idf1 + 0.7 * idf2) * \
np.array([ner(t) * postag(t) for t in tt])
tw.extend(zip(tt, wts))
S = np.sum([s for _, s in tw])