diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index f78f21a157..f4f8b7d8de 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -235,7 +235,8 @@ class IndexingRunner: if len(preview_texts) < 5: preview_texts.append(document.page_content) - tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, document.page_content) + tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, + self.filter_string(document.page_content)) return { "total_segments": total_segments, @@ -345,6 +346,8 @@ class IndexingRunner: return text_docs def filter_string(self, text): + text = text.replace('<|', '<') + text = text.replace('|>', '>') pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]') return pattern.sub('', text) @@ -425,7 +428,7 @@ class IndexingRunner: return documents def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter, - processing_rule: DatasetProcessRule) -> List[Document]: + processing_rule: DatasetProcessRule) -> List[Document]: """ Split the text documents into nodes. """