diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 34333a350..55b4e9d32 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -71,7 +71,19 @@ class FulltextQueryer: txt = otxt return txt + @staticmethod + def add_space_between_eng_zh(txt): + # (ENG/ENG+NUM) + ZH + txt = re.sub(r'([A-Za-z]+[0-9]+)([\u4e00-\u9fa5]+)', r'\1 \2', txt) + # ENG + ZH + txt = re.sub(r'([A-Za-z])([\u4e00-\u9fa5]+)', r'\1 \2', txt) + # ZH + (ENG/ENG+NUM) + txt = re.sub(r'([\u4e00-\u9fa5]+)([A-Za-z]+[0-9]+)', r'\1 \2', txt) + txt = re.sub(r'([\u4e00-\u9fa5]+)([A-Za-z])', r'\1 \2', txt) + return txt + def question(self, txt, tbl="qa", min_match: float = 0.6): + txt = FulltextQueryer.add_space_between_eng_zh(txt) txt = re.sub( r"[ :|\r\n\t,,。??/`!!&^%%()\[\]{}<>]+", " ",