diff --git a/rag/nlp/rag_tokenizer.py b/rag/nlp/rag_tokenizer.py index 11881162d..eee013e61 100644 --- a/rag/nlp/rag_tokenizer.py +++ b/rag/nlp/rag_tokenizer.py @@ -64,7 +64,7 @@ class RagTokenizer: self.stemmer = PorterStemmer() self.lemmatizer = WordNetLemmatizer() - self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)" + self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)" try: self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie") return