Refine synonym query. (#3855)

### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
This commit is contained in:
Kevin Hu 2024-12-04 17:20:12 +08:00 committed by GitHub
parent 1b589609a4
commit 1b817a5b4c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 25 additions and 10 deletions

View File

@@ -140,13 +140,21 @@
} }
}, },
{ {
"string": { "rank_feature": {
"match": "*_fea", "match": "*_fea",
"mapping": { "mapping": {
"type": "rank_feature" "type": "rank_feature"
} }
} }
}, },
{
"rank_features": {
"match": "*_feas",
"mapping": {
"type": "rank_features"
}
}
},
{ {
"dense_vector": { "dense_vector": {
"match": "*_512_vec", "match": "*_512_vec",

View File

@@ -120,7 +120,7 @@ class FulltextQueryer:
keywords.append(tt) keywords.append(tt)
twts = self.tw.weights([tt]) twts = self.tw.weights([tt])
syns = self.syn.lookup(tt) syns = self.syn.lookup(tt)
if syns: keywords.extend(syns) if syns and len(keywords) < 32: keywords.extend(syns)
logging.debug(json.dumps(twts, ensure_ascii=False)) logging.debug(json.dumps(twts, ensure_ascii=False))
tms = [] tms = []
for tk, w in sorted(twts, key=lambda x: x[1] * -1): for tk, w in sorted(twts, key=lambda x: x[1] * -1):
@@ -140,17 +140,24 @@ class FulltextQueryer:
sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1] sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
sm = [m for m in sm if len(m) > 1] sm = [m for m in sm if len(m) > 1]
keywords.append(re.sub(r"[ \\\"']+", "", tk)) if len(keywords) < 32:
keywords.extend(sm) keywords.append(re.sub(r"[ \\\"']+", "", tk))
if len(keywords) >= 12: keywords.extend(sm)
break
tk_syns = self.syn.lookup(tk) tk_syns = self.syn.lookup(tk)
tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
if len(keywords) < 32: keywords.extend([s for s in tk_syns if s])
tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
tk_syns = [f"\"{s}\"" if s.find(" ")>0 else s for s in tk_syns]
if len(keywords) >= 32:
break
tk = FulltextQueryer.subSpecialChar(tk) tk = FulltextQueryer.subSpecialChar(tk)
if tk.find(" ") > 0: if tk.find(" ") > 0:
tk = '"%s"' % tk tk = '"%s"' % tk
if tk_syns: if tk_syns:
tk = f"({tk} %s)" % " ".join(tk_syns) tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns)
if sm: if sm:
tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm)) tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm))
if tk.strip(): if tk.strip():
@@ -159,14 +166,14 @@ class FulltextQueryer:
tms = " ".join([f"({t})^{w}" for t, w in tms]) tms = " ".join([f"({t})^{w}" for t, w in tms])
if len(twts) > 1: if len(twts) > 1:
tms += ' ("%s"~4)^1.5' % (" ".join([t for t, _ in twts])) tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt)
if re.match(r"[0-9a-z ]+$", tt): if re.match(r"[0-9a-z ]+$", tt):
tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt) tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt)
syns = " OR ".join( syns = " OR ".join(
[ [
'"%s"^0.7' '"%s"'
% FulltextQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) % rag_tokenizer.tokenize(FulltextQueryer.subSpecialChar(s))
for s in syns for s in syns
] ]
) )