fix overlap and splitter optimization (#2742)
Co-authored-by: jyong <jyong@dify.ai>
commit 8ba38e8e74
parent b163545771
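In short, this commit wires chunk overlap through to the text splitters instead of hard-coding it to 0, and regex-escapes the user-supplied fixed separator so that literal delimiters such as "." are no longer interpreted as regex metacharacters.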
```diff
@@ -52,7 +52,7 @@ class BaseIndexProcessor(ABC):
             character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
                 chunk_size=segmentation["max_tokens"],
-                chunk_overlap=0,
+                chunk_overlap=segmentation.get('chunk_overlap', 0),
                 fixed_separator=separator,
                 separators=["\n\n", "。", ".", " ", ""],
                 embedding_model_instance=embedding_model_instance
```
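With this change, a custom segmentation rule can specify its own overlap, falling back to 0 via `segmentation.get('chunk_overlap', 0)` when the key is absent. A minimal sketch of what a non-zero overlap does (not Dify's splitter; `chunk_with_overlap` is a hypothetical helper for illustration):

```python
def chunk_with_overlap(tokens: list[str], chunk_size: int, chunk_overlap: int) -> list[list[str]]:
    # Advance by chunk_size minus overlap so consecutive chunks share a tail.
    step = max(chunk_size - chunk_overlap, 1)
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), step)]

tokens = "the quick brown fox jumps over the lazy dog".split()
print(chunk_with_overlap(tokens, chunk_size=4, chunk_overlap=0))
# [['the', 'quick', 'brown', 'fox'], ['jumps', 'over', 'the', 'lazy'], ['dog']]
print(chunk_with_overlap(tokens, chunk_size=4, chunk_overlap=1))
# [['the', 'quick', 'brown', 'fox'], ['fox', 'jumps', 'over', 'the'], ['the', 'lazy', 'dog']]
```

Sharing boundary tokens keeps a sentence that is cut at a chunk edge retrievable intact from at least one chunk.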
```diff
@@ -61,7 +61,7 @@ class BaseIndexProcessor(ABC):
             # Automatic segmentation
             character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
                 chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
-                chunk_overlap=0,
+                chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
                 separators=["\n\n", "。", ".", " ", ""],
                 embedding_model_instance=embedding_model_instance
             )
```
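Note that the automatic path indexes the rule dict directly, so `AUTOMATIC_RULES['segmentation']` must define `chunk_overlap`, while the custom path above degrades gracefully via `.get()`. A small sketch of that difference, with assumed values (the real numbers live in `DatasetProcessRule.AUTOMATIC_RULES`):

```python
# Hypothetical rule values, for illustration only.
AUTOMATIC_RULES = {"segmentation": {"max_tokens": 500, "chunk_overlap": 50}}

# Direct indexing raises KeyError if the key were ever missing...
overlap = AUTOMATIC_RULES["segmentation"]["chunk_overlap"]  # 50

# ...whereas the custom-rule path falls back to 0 for rules that omit it.
user_rule = {"max_tokens": 800}  # no chunk_overlap key
overlap = user_rule.get("chunk_overlap", 0)                 # 0
```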
```diff
@@ -30,7 +30,7 @@ def _split_text_with_regex(
     if separator:
         if keep_separator:
             # The parentheses in the pattern keep the delimiters in the result.
-            _splits = re.split(f"({separator})", text)
+            _splits = re.split(f"({re.escape(separator)})", text)
             splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
             if len(_splits) % 2 == 0:
                 splits += _splits[-1:]
```
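The `re.escape` fix matters because user-chosen separators such as "." or "?" are regex metacharacters; left unescaped, they match far more than the literal delimiter. A quick standalone check:

```python
import re

text = "one.two.three"
separator = "."  # a regex metacharacter when left unescaped

# Before the fix: "." matches *any* character, so the text disintegrates
# into single characters interleaved with empty strings.
print(re.split(f"({separator})", text))
# ['', 'o', '', 'n', '', 'e', '', '.', '', 't', ... , '']

# After the fix: the separator is matched literally.
print(re.split(f"({re.escape(separator)})", text))
# ['one', '.', 'two', '.', 'three']
```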
```diff
@@ -94,7 +94,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
             documents.append(new_doc)
         return documents

-    def split_documents(self, documents: Iterable[Document]) -> list[Document]:
+    def split_documents(self, documents: Iterable[Document] ) -> list[Document]:
         """Split documents."""
         texts, metadatas = [], []
         for doc in documents:
```
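For context, `split_documents` collects each document's text and metadata in parallel lists and hands them to `create_documents`, which re-attaches the metadata to every chunk split from that text. A minimal sketch of that flow, assuming a LangChain-style `Document` and `create_documents` (neither is shown in this diff):

```python
from collections.abc import Iterable

class Document:
    """Simplified stand-in for the Document class used by the splitter."""
    def __init__(self, page_content: str, metadata: dict | None = None):
        self.page_content = page_content
        self.metadata = metadata or {}

def split_documents(splitter, documents: Iterable[Document]) -> list[Document]:
    texts, metadatas = [], []
    for doc in documents:
        texts.append(doc.page_content)  # raw text per source document
        metadatas.append(doc.metadata)  # kept aligned by index
    # create_documents() splits each text and copies its metadata onto each chunk.
    return splitter.create_documents(texts, metadatas=metadatas)
```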