fix special code (#473)

This commit is contained in:
Jyong 2023-06-28 13:58:36 +08:00 committed by GitHub
parent 97e9ebd29a
commit 2eea114ac0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -235,7 +235,8 @@ class IndexingRunner:
if len(preview_texts) < 5: if len(preview_texts) < 5:
preview_texts.append(document.page_content) preview_texts.append(document.page_content)
tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, document.page_content) tokens += TokenCalculator.get_num_tokens(self.embedding_model_name,
self.filter_string(document.page_content))
return { return {
"total_segments": total_segments, "total_segments": total_segments,
@ -345,6 +346,8 @@ class IndexingRunner:
return text_docs return text_docs
def filter_string(self, text): def filter_string(self, text):
text = text.replace('<|', '<')
text = text.replace('|>', '>')
pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]') pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]')
return pattern.sub('', text) return pattern.sub('', text)
@ -425,7 +428,7 @@ class IndexingRunner:
return documents return documents
def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter, def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
processing_rule: DatasetProcessRule) -> List[Document]: processing_rule: DatasetProcessRule) -> List[Document]:
""" """
Split the text documents into nodes. Split the text documents into nodes.
""" """