From 0d7cfce6e1186c1d5f12cf8679ca593cc34ab41e Mon Sep 17 00:00:00 2001
From: Sol <55039727+Sol-Ghf@users.noreply.github.com>
Date: Fri, 23 May 2025 17:13:37 +0800
Subject: [PATCH] Update rag/nlp/query.py (#7816)

### What problem does this PR solve?

Fix tokenizer resulting in low recall. `FulltextQueryer.question` now
inserts a space at each boundary between English text (letters, or letters
followed by digits) and Chinese characters before the rest of the query
clean-up runs.

![37743d3a495f734aa69f1e173fa77457](https://github.com/user-attachments/assets/1394757e-8fcb-4f87-96af-a92716144884)
![4aba633a17f34269a4e17e84fafb34c4](https://github.com/user-attachments/assets/a1828e32-3e17-4394-a633-ba3f09bd506d)
![image](https://github.com/user-attachments/assets/61308f32-2a4f-44d5-a034-d65bbec554ef)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring

---------

Co-authored-by: Kevin Hu
---
 rag/nlp/query.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index 34333a350..55b4e9d32 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -71,7 +71,19 @@ class FulltextQueryer:
             txt = otxt
         return txt
 
+    @staticmethod
+    def add_space_between_eng_zh(txt):
+        # (ENG/ENG+NUM) + ZH
+        txt = re.sub(r'([A-Za-z]+[0-9]+)([\u4e00-\u9fa5]+)', r'\1 \2', txt)
+        # ENG + ZH
+        txt = re.sub(r'([A-Za-z])([\u4e00-\u9fa5]+)', r'\1 \2', txt)
+        # ZH + (ENG/ENG+NUM)
+        txt = re.sub(r'([\u4e00-\u9fa5]+)([A-Za-z]+[0-9]+)', r'\1 \2', txt)
+        txt = re.sub(r'([\u4e00-\u9fa5]+)([A-Za-z])', r'\1 \2', txt)
+        return txt
+
     def question(self, txt, tbl="qa", min_match: float = 0.6):
+        txt = FulltextQueryer.add_space_between_eng_zh(txt)
         txt = re.sub(
             r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+",
             " ",