use sub to operate all (#475)

This commit is contained in:
crazywoola 2023-06-28 14:58:40 +08:00 committed by GitHub
parent 6194b82752
commit 998f819b04
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -346,10 +346,10 @@ class IndexingRunner:
return text_docs
def filter_string(self, text):
    """Return *text* with `<|` / `|>` delimiters softened and control characters removed.

    Steps, applied in order:

    1. Every ``<|`` becomes ``<`` and every ``|>`` becomes ``>``
       (presumably so document text cannot be mistaken for special-token
       markers such as ``<|endoftext|>`` -- confirm against callers).
    2. C0 control characters except tab, line feed and carriage return,
       plus DEL and the C1 control range (U+007F-U+009F), are deleted.

    Printable Latin-1 characters (U+00A0-U+00FF, e.g. ``é``, ``ü``, ``£``)
    are preserved.

    :param text: the string to sanitise.
    :return: the sanitised string.
    """
    text = re.sub(r'<\|', '<', text)
    text = re.sub(r'\|>', '>', text)
    # Strip only genuine control characters. The upper bound stops at \x9F:
    # on a ``str`` the range \xA0-\xFF is printable text (accented letters,
    # currency signs, ...), and removing it would corrupt non-English input.
    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]', '', text)
    return text
def _get_splitter(self, processing_rule: DatasetProcessRule) -> TextSplitter:
"""