Fix tokenizer bug (#2573)

### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
lidp 2024-09-25 10:30:49 +08:00 committed by GitHub
parent 7bb28ca2bd
commit 08d5637770
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -64,7 +64,7 @@ class RagTokenizer:
self.stemmer = PorterStemmer()
self.lemmatizer = WordNetLemmatizer()
self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
try:
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
return