From 95283b4dd3b34132050cb8b625daf2c463a4d1ff Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Wed, 16 Apr 2025 12:28:22 +0800 Subject: [PATCH] Feat/change split length method (#18097) Co-authored-by: JzoNg --- api/core/rag/splitter/fixed_text_splitter.py | 10 ++++++++-- api/services/dataset_service.py | 2 +- .../components/datasets/create/step-two/index.tsx | 14 +++++++------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/api/core/rag/splitter/fixed_text_splitter.py b/api/core/rag/splitter/fixed_text_splitter.py index 67f9b6384d..0fb1bcb2e0 100644 --- a/api/core/rag/splitter/fixed_text_splitter.py +++ b/api/core/rag/splitter/fixed_text_splitter.py @@ -39,6 +39,12 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter): else: return [GPT2Tokenizer.get_num_tokens(text) for text in texts] + def _character_encoder(texts: list[str]) -> list[int]: + if not texts: + return [] + + return [len(text) for text in texts] + if issubclass(cls, TokenTextSplitter): extra_kwargs = { "model_name": embedding_model_instance.model if embedding_model_instance else "gpt2", @@ -47,7 +53,7 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter): } kwargs = {**kwargs, **extra_kwargs} - return cls(length_function=_token_encoder, **kwargs) + return cls(length_function=_character_encoder, **kwargs) class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter): @@ -103,7 +109,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter) _good_splits_lengths = [] # cache the lengths of the splits _separator = "" if self._keep_separator else separator s_lens = self._length_function(splits) - if _separator != "": + if separator != "": for s, s_len in zip(splits, s_lens): if s_len < self._chunk_size: _good_splits.append(s) diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index 0301c8a584..deb6be5a43 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -553,7 +553,7 @@ class DocumentService: {"id": "remove_extra_spaces", "enabled": True}, {"id": "remove_urls_emails", "enabled": False}, ], - "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50}, + "segmentation": {"delimiter": "\n", "max_tokens": 1024, "chunk_overlap": 50}, }, "limits": { "indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH, diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index 12fd54d0fe..6b6580ae7e 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -97,7 +97,7 @@ export enum IndexingType { } const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' -const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500 +const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024 const DEFAULT_OVERLAP = 50 const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10) @@ -117,11 +117,11 @@ const defaultParentChildConfig: ParentChildConfig = { chunkForContext: 'paragraph', parent: { delimiter: '\\n\\n', - maxLength: 500, + maxLength: 1024, }, child: { delimiter: '\\n', - maxLength: 200, + maxLength: 512, }, } @@ -623,12 +623,12 @@ const StepTwo = ({ onChange={e => setSegmentIdentifier(e.target.value, true)} /> setParentChildConfig({ ...parentChildConfig, @@ -803,7 +803,7 @@ const StepTwo = ({ })} /> setParentChildConfig({ ...parentChildConfig,