diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py index fcb06e5c84..509a1a189b 100644 --- a/api/core/rag/index_processor/index_processor_base.py +++ b/api/core/rag/index_processor/index_processor_base.py @@ -52,7 +52,7 @@ class BaseIndexProcessor(ABC): character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder( chunk_size=segmentation["max_tokens"], - chunk_overlap=0, + chunk_overlap=segmentation.get('chunk_overlap', 0), fixed_separator=separator, separators=["\n\n", "。", ".", " ", ""], embedding_model_instance=embedding_model_instance @@ -61,7 +61,7 @@ class BaseIndexProcessor(ABC): # Automatic segmentation character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder( chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'], - chunk_overlap=0, + chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'], separators=["\n\n", "。", ".", " ", ""], embedding_model_instance=embedding_model_instance ) diff --git a/api/core/splitter/text_splitter.py b/api/core/splitter/text_splitter.py index e3d43c0658..5eeb237a96 100644 --- a/api/core/splitter/text_splitter.py +++ b/api/core/splitter/text_splitter.py @@ -30,7 +30,7 @@ def _split_text_with_regex( if separator: if keep_separator: # The parentheses in the pattern keep the delimiters in the result. - _splits = re.split(f"({separator})", text) + _splits = re.split(f"({re.escape(separator)})", text) splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)] if len(_splits) % 2 == 0: splits += _splits[-1:] @@ -94,7 +94,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): documents.append(new_doc) return documents - def split_documents(self, documents: Iterable[Document]) -> list[Document]: + def split_documents(self, documents: Iterable[Document] ) -> list[Document]: """Split documents.""" texts, metadatas = [], [] for doc in documents: