From 2eea114ac0af2c64f4cd3fa636e9e9a3562523b9 Mon Sep 17 00:00:00 2001
From: Jyong <76649700+JohnJyong@users.noreply.github.com>
Date: Wed, 28 Jun 2023 13:58:36 +0800
Subject: [PATCH] fix special code (#473)

---
 api/core/indexing_runner.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index f78f21a157..f4f8b7d8de 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -235,7 +235,8 @@ class IndexingRunner:
             if len(preview_texts) < 5:
                 preview_texts.append(document.page_content)
 
-            tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, document.page_content)
+            tokens += TokenCalculator.get_num_tokens(self.embedding_model_name,
+                                                     self.filter_string(document.page_content))
 
         return {
             "total_segments": total_segments,
@@ -345,6 +346,8 @@ class IndexingRunner:
         return text_docs
 
     def filter_string(self, text):
+        text = text.replace('<|', '<')
+        text = text.replace('|>', '>')
         pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]')
         return pattern.sub('', text)
 
@@ -425,7 +428,7 @@ class IndexingRunner:
         return documents
 
     def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
-                            processing_rule: DatasetProcessRule) -> List[Document]:
+                             processing_rule: DatasetProcessRule) -> List[Document]:
         """
         Split the text documents into nodes.
        """
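
Illustration (not part of the patch): the sketch below restates the patched filter_string as a standalone function so its effect is easy to try in isolation. The function name and regex come from the diff above; the sample input and the reading that the '<|'/'|>' replacement is meant to defuse tokenizer special-token markers such as '<|endoftext|>' before TokenCalculator.get_num_tokens is called are assumptions.

import re

def filter_string(text: str) -> str:
    # Break up '<|...|>' sequences so the tokenizer does not treat them as
    # special tokens (assumed intent of the '<|'/'|>' replacement in the patch).
    text = text.replace('<|', '<')
    text = text.replace('|>', '>')
    # Drop ASCII control characters and stray high bytes, as in the patch.
    pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]')
    return pattern.sub('', text)

# Example (hypothetical input):
# filter_string('Hello<|endoftext|>\x07world') -> 'Hello<endoftext>world'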