Feat: change split length method from token count to character count (#18097)

Co-authored-by: JzoNg <jzongcode@gmail.com>
This commit is contained in:
Jyong 2025-04-16 12:28:22 +08:00 committed by GitHub
parent 2a0d7533d7
commit 95283b4dd3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 16 additions and 10 deletions

View File

@ -39,6 +39,12 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
else: else:
return [GPT2Tokenizer.get_num_tokens(text) for text in texts] return [GPT2Tokenizer.get_num_tokens(text) for text in texts]
def _character_encoder(texts: list[str]) -> list[int]:
    """Measure every text by its character count.

    Args:
        texts: Strings to measure; a falsy value yields an empty list.

    Returns:
        ``len(text)`` for each entry of ``texts``, in input order.
    """
    # Falsy input (e.g. an empty list or None) short-circuits to [].
    return list(map(len, texts)) if texts else []
if issubclass(cls, TokenTextSplitter): if issubclass(cls, TokenTextSplitter):
extra_kwargs = { extra_kwargs = {
"model_name": embedding_model_instance.model if embedding_model_instance else "gpt2", "model_name": embedding_model_instance.model if embedding_model_instance else "gpt2",
@ -47,7 +53,7 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
} }
kwargs = {**kwargs, **extra_kwargs} kwargs = {**kwargs, **extra_kwargs}
return cls(length_function=_token_encoder, **kwargs) return cls(length_function=_character_encoder, **kwargs)
class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter): class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter):
@ -103,7 +109,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
_good_splits_lengths = [] # cache the lengths of the splits _good_splits_lengths = [] # cache the lengths of the splits
_separator = "" if self._keep_separator else separator _separator = "" if self._keep_separator else separator
s_lens = self._length_function(splits) s_lens = self._length_function(splits)
if _separator != "": if separator != "":
for s, s_len in zip(splits, s_lens): for s, s_len in zip(splits, s_lens):
if s_len < self._chunk_size: if s_len < self._chunk_size:
_good_splits.append(s) _good_splits.append(s)

View File

@ -553,7 +553,7 @@ class DocumentService:
{"id": "remove_extra_spaces", "enabled": True}, {"id": "remove_extra_spaces", "enabled": True},
{"id": "remove_urls_emails", "enabled": False}, {"id": "remove_urls_emails", "enabled": False},
], ],
"segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50}, "segmentation": {"delimiter": "\n", "max_tokens": 1024, "chunk_overlap": 50},
}, },
"limits": { "limits": {
"indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH, "indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH,

View File

@ -97,7 +97,7 @@ export enum IndexingType {
} }
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500 const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024
const DEFAULT_OVERLAP = 50 const DEFAULT_OVERLAP = 50
const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10) const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)
@ -117,11 +117,11 @@ const defaultParentChildConfig: ParentChildConfig = {
chunkForContext: 'paragraph', chunkForContext: 'paragraph',
parent: { parent: {
delimiter: '\\n\\n', delimiter: '\\n\\n',
maxLength: 500, maxLength: 1024,
}, },
child: { child: {
delimiter: '\\n', delimiter: '\\n',
maxLength: 200, maxLength: 512,
}, },
} }
@ -623,12 +623,12 @@ const StepTwo = ({
onChange={e => setSegmentIdentifier(e.target.value, true)} onChange={e => setSegmentIdentifier(e.target.value, true)}
/> />
<MaxLengthInput <MaxLengthInput
unit='tokens' unit='characters'
value={maxChunkLength} value={maxChunkLength}
onChange={setMaxChunkLength} onChange={setMaxChunkLength}
/> />
<OverlapInput <OverlapInput
unit='tokens' unit='characters'
value={overlap} value={overlap}
min={1} min={1}
onChange={setOverlap} onChange={setOverlap}
@ -756,7 +756,7 @@ const StepTwo = ({
})} })}
/> />
<MaxLengthInput <MaxLengthInput
unit='tokens' unit='characters'
value={parentChildConfig.parent.maxLength} value={parentChildConfig.parent.maxLength}
onChange={value => setParentChildConfig({ onChange={value => setParentChildConfig({
...parentChildConfig, ...parentChildConfig,
@ -803,7 +803,7 @@ const StepTwo = ({
})} })}
/> />
<MaxLengthInput <MaxLengthInput
unit='tokens' unit='characters'
value={parentChildConfig.child.maxLength} value={parentChildConfig.child.maxLength}
onChange={value => setParentChildConfig({ onChange={value => setParentChildConfig({
...parentChildConfig, ...parentChildConfig,