From 95283b4dd3b34132050cb8b625daf2c463a4d1ff Mon Sep 17 00:00:00 2001
From: Jyong <76649700+JohnJyong@users.noreply.github.com>
Date: Wed, 16 Apr 2025 12:28:22 +0800
Subject: [PATCH] Feat/change split length method (#18097)

Co-authored-by: JzoNg <jzongcode@gmail.com>
---
 api/core/rag/splitter/fixed_text_splitter.py       | 10 ++++++++--
 api/services/dataset_service.py                    |  2 +-
 .../components/datasets/create/step-two/index.tsx  | 14 +++++++-------
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/api/core/rag/splitter/fixed_text_splitter.py b/api/core/rag/splitter/fixed_text_splitter.py
index 67f9b6384d..0fb1bcb2e0 100644
--- a/api/core/rag/splitter/fixed_text_splitter.py
+++ b/api/core/rag/splitter/fixed_text_splitter.py
@@ -39,6 +39,12 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
             else:
                 return [GPT2Tokenizer.get_num_tokens(text) for text in texts]
 
+        def _character_encoder(texts: list[str]) -> list[int]:
+            if not texts:
+                return []
+
+            return [len(text) for text in texts]
+
         if issubclass(cls, TokenTextSplitter):
             extra_kwargs = {
                 "model_name": embedding_model_instance.model if embedding_model_instance else "gpt2",
@@ -47,7 +53,7 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
             }
             kwargs = {**kwargs, **extra_kwargs}
 
-        return cls(length_function=_token_encoder, **kwargs)
+        return cls(length_function=_character_encoder, **kwargs)
 
 
 class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter):
@@ -103,7 +109,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
         _good_splits_lengths = []  # cache the lengths of the splits
         _separator = "" if self._keep_separator else separator
         s_lens = self._length_function(splits)
-        if _separator != "":
+        if separator != "":
             for s, s_len in zip(splits, s_lens):
                 if s_len < self._chunk_size:
                     _good_splits.append(s)
diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py
index 0301c8a584..deb6be5a43 100644
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -553,7 +553,7 @@ class DocumentService:
                 {"id": "remove_extra_spaces", "enabled": True},
                 {"id": "remove_urls_emails", "enabled": False},
             ],
-            "segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
+            "segmentation": {"delimiter": "\n", "max_tokens": 1024, "chunk_overlap": 50},
         },
         "limits": {
             "indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH,
diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx
index 12fd54d0fe..6b6580ae7e 100644
--- a/web/app/components/datasets/create/step-two/index.tsx
+++ b/web/app/components/datasets/create/step-two/index.tsx
@@ -97,7 +97,7 @@ export enum IndexingType {
 }
 
 const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
-const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
+const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024
 const DEFAULT_OVERLAP = 50
 const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)
 
@@ -117,11 +117,11 @@ const defaultParentChildConfig: ParentChildConfig = {
   chunkForContext: 'paragraph',
   parent: {
     delimiter: '\\n\\n',
-    maxLength: 500,
+    maxLength: 1024,
   },
   child: {
     delimiter: '\\n',
-    maxLength: 200,
+    maxLength: 512,
   },
 }
 
@@ -623,12 +623,12 @@ const StepTwo = ({
                   onChange={e => setSegmentIdentifier(e.target.value, true)}
                 />
                 <MaxLengthInput
-                  unit='tokens'
+                  unit='characters'
                   value={maxChunkLength}
                   onChange={setMaxChunkLength}
                 />
                 <OverlapInput
-                  unit='tokens'
+                  unit='characters'
                   value={overlap}
                   min={1}
                   onChange={setOverlap}
@@ -756,7 +756,7 @@ const StepTwo = ({
                         })}
                       />
                       <MaxLengthInput
-                        unit='tokens'
+                        unit='characters'
                         value={parentChildConfig.parent.maxLength}
                         onChange={value => setParentChildConfig({
                           ...parentChildConfig,
@@ -803,7 +803,7 @@ const StepTwo = ({
                     })}
                   />
                   <MaxLengthInput
-                    unit='tokens'
+                    unit='characters'
                     value={parentChildConfig.child.maxLength}
                     onChange={value => setParentChildConfig({
                       ...parentChildConfig,