From 1b817a5b4cf0cfeec96c1e4e2c669c9d53f1f5cb Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Wed, 4 Dec 2024 17:20:12 +0800 Subject: [PATCH] Refine synonym query. (#3855) ### What problem does this PR solve? ### Type of change - [x] Performance Improvement --- conf/mapping.json | 10 +++++++++- rag/nlp/query.py | 25 ++++++++++++++++--------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/conf/mapping.json b/conf/mapping.json index 7f8c4e27e..f32acb02b 100644 --- a/conf/mapping.json +++ b/conf/mapping.json @@ -140,13 +140,21 @@ } }, { - "string": { + "rank_feature": { "match": "*_fea", "mapping": { "type": "rank_feature" } } }, + { + "rank_features": { + "match": "*_feas", + "mapping": { + "type": "rank_features" + } + } + }, { "dense_vector": { "match": "*_512_vec", diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 9a0ceafae..3243d2a80 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -120,7 +120,7 @@ class FulltextQueryer: keywords.append(tt) twts = self.tw.weights([tt]) syns = self.syn.lookup(tt) - if syns: keywords.extend(syns) + if syns and len(keywords) < 32: keywords.extend(syns) logging.debug(json.dumps(twts, ensure_ascii=False)) tms = [] for tk, w in sorted(twts, key=lambda x: x[1] * -1): @@ -140,17 +140,24 @@ class FulltextQueryer: sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1] sm = [m for m in sm if len(m) > 1] - keywords.append(re.sub(r"[ \\\"']+", "", tk)) - keywords.extend(sm) - if len(keywords) >= 12: - break + if len(keywords) < 32: + keywords.append(re.sub(r"[ \\\"']+", "", tk)) + keywords.extend(sm) tk_syns = self.syn.lookup(tk) + tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns] + if len(keywords) < 32: keywords.extend([s for s in tk_syns if s]) + tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s] + tk_syns = [f"\"{s}\"" if s.find(" ")>0 else s for s in tk_syns] + + if len(keywords) >= 32: + break + tk = FulltextQueryer.subSpecialChar(tk) if tk.find(" ") > 0: tk = '"%s"' % tk if tk_syns: - tk = f"({tk} %s)" % " ".join(tk_syns) + tk = f"({tk} OR (%s)^0.2)" % " ".join(tk_syns) if sm: tk = f'{tk} OR "%s" OR ("%s"~2)^0.5' % (" ".join(sm), " ".join(sm)) if tk.strip(): @@ -159,14 +166,14 @@ class FulltextQueryer: tms = " ".join([f"({t})^{w}" for t, w in tms]) if len(twts) > 1: - tms += ' ("%s"~4)^1.5' % (" ".join([t for t, _ in twts])) + tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt) if re.match(r"[0-9a-z ]+$", tt): tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt) syns = " OR ".join( [ - '"%s"^0.7' - % FulltextQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) + '"%s"' + % rag_tokenizer.tokenize(FulltextQueryer.subSpecialChar(s)) for s in syns ] )