From 571415d1a4cd01078b4f46cb00f49c8daf923b3d Mon Sep 17 00:00:00 2001 From: Sumkor Date: Wed, 4 Sep 2024 12:59:10 +0800 Subject: [PATCH] fix: split text keep separator (#7930) --- api/core/rag/splitter/text_splitter.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/api/core/rag/splitter/text_splitter.py b/api/core/rag/splitter/text_splitter.py index b3adcedc76..8ae5cbf7d1 100644 --- a/api/core/rag/splitter/text_splitter.py +++ b/api/core/rag/splitter/text_splitter.py @@ -30,15 +30,14 @@ def _split_text_with_regex( if keep_separator: # The parentheses in the pattern keep the delimiters in the result. _splits = re.split(f"({re.escape(separator)})", text) - splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] - if len(_splits) % 2 == 0: + splits = [_splits[i - 1] + _splits[i] for i in range(1, len(_splits), 2)] + if len(_splits) % 2 != 0: splits += _splits[-1:] - splits = [_splits[0]] + splits else: splits = re.split(separator, text) else: splits = list(text) - return [s for s in splits if s != ""] + return [s for s in splits if (s != "" and s != '\n')] class TextSplitter(BaseDocumentTransformer, ABC):