mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 05:38:58 +08:00
Refine synonym query. (#3855)
### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
This commit is contained in:
parent
1b589609a4
commit
1b817a5b4c
@ -140,13 +140,21 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"string": {
|
"rank_feature": {
|
||||||
"match": "*_fea",
|
"match": "*_fea",
|
||||||
"mapping": {
|
"mapping": {
|
||||||
"type": "rank_feature"
|
"type": "rank_feature"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"rank_features": {
|
||||||
|
"match": "*_feas",
|
||||||
|
"mapping": {
|
||||||
|
"type": "rank_features"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"dense_vector": {
|
"dense_vector": {
|
||||||
"match": "*_512_vec",
|
"match": "*_512_vec",
|
||||||
|
@ -120,7 +120,7 @@ class FulltextQueryer:
|
|||||||
keywords.append(tt)
|
keywords.append(tt)
|
||||||
twts = self.tw.weights([tt])
|
twts = self.tw.weights([tt])
|
||||||
syns = self.syn.lookup(tt)
|
syns = self.syn.lookup(tt)
|
||||||
if syns: keywords.extend(syns)
|
if syns and len(keywords) < 32: keywords.extend(syns)
|
||||||
logging.debug(json.dumps(twts, ensure_ascii=False))
|
logging.debug(json.dumps(twts, ensure_ascii=False))
|
||||||
tms = []
|
tms = []
|
||||||
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
|
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
|
||||||
@ -140,17 +140,24 @@ class FulltextQueryer:
|
|||||||
sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
|
sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
|
||||||
sm = [m for m in sm if len(m) > 1]
|
sm = [m for m in sm if len(m) > 1]
|
||||||
|
|
||||||
keywords.append(re.sub(r"[ \\\"']+", "", tk))
|
if len(keywords) < 32:
|
||||||
keywords.extend(sm)
|
keywords.append(re.sub(r"[ \\\"']+", "", tk))
|
||||||
if len(keywords) >= 12:
|
keywords.extend(sm)
|
||||||
break
|
|
||||||
|
|
||||||
tk_syns = self.syn.lookup(tk)
|
tk_syns = self.syn.lookup(tk)
|
||||||
|
tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
|
||||||
|
if len(keywords) < 32: keywords.extend([s for s in tk_syns if s])
|
||||||
|
tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
|
||||||
|
tk_syns = [f"\"{s}\"" if s.find(" ")>0 else s for s in tk_syns]
|
||||||
|
|
||||||
|
if len(keywords) >= 32:
|
||||||
|
break
|
||||||
|
|
||||||
tk = FulltextQueryer.subSpecialChar(tk)
|
tk = FulltextQueryer.subSpecialChar(tk)
|
||||||
if tk.find(" ") > 0:
|
if tk.find(" ") > 0:
|
||||||
tk = '"%s"' % tk
|
tk = '"%s"' % tk
|
||||||
if tk_syns:
|
if tk_syns:
|
||||||
tk = f"({tk} %s)" % " ".join(tk_syns)
|
tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns)
|
||||||
if sm:
|
if sm:
|
||||||
tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
|
tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
|
||||||
if tk.strip():
|
if tk.strip():
|
||||||
@ -159,14 +166,14 @@ class FulltextQueryer:
|
|||||||
tms = " ".join([f"({t})^{w}" for t, w in tms])
|
tms = " ".join([f"({t})^{w}" for t, w in tms])
|
||||||
|
|
||||||
if len(twts) > 1:
|
if len(twts) > 1:
|
||||||
tms += ' ("%s"~4)^1.5' % (" ".join([t for t, _ in twts]))
|
tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt)
|
||||||
if re.match(r"[0-9a-z ]+$", tt):
|
if re.match(r"[0-9a-z ]+$", tt):
|
||||||
tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt)
|
tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt)
|
||||||
|
|
||||||
syns = " OR ".join(
|
syns = " OR ".join(
|
||||||
[
|
[
|
||||||
'"%s"^0.7'
|
'"%s"'
|
||||||
% FulltextQueryer.subSpecialChar(rag_tokenizer.tokenize(s))
|
% rag_tokenizer.tokenize(FulltextQueryer.subSpecialChar(s))
|
||||||
for s in syns
|
for s in syns
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user