mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-20 01:09:05 +08:00
Feat/change split length method (#18097)
Co-authored-by: JzoNg <jzongcode@gmail.com>
This commit is contained in:
parent
2a0d7533d7
commit
95283b4dd3
@ -39,6 +39,12 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
|
|||||||
else:
|
else:
|
||||||
return [GPT2Tokenizer.get_num_tokens(text) for text in texts]
|
return [GPT2Tokenizer.get_num_tokens(text) for text in texts]
|
||||||
|
|
||||||
|
def _character_encoder(texts: list[str]) -> list[int]:
|
||||||
|
if not texts:
|
||||||
|
return []
|
||||||
|
|
||||||
|
return [len(text) for text in texts]
|
||||||
|
|
||||||
if issubclass(cls, TokenTextSplitter):
|
if issubclass(cls, TokenTextSplitter):
|
||||||
extra_kwargs = {
|
extra_kwargs = {
|
||||||
"model_name": embedding_model_instance.model if embedding_model_instance else "gpt2",
|
"model_name": embedding_model_instance.model if embedding_model_instance else "gpt2",
|
||||||
@ -47,7 +53,7 @@ class EnhanceRecursiveCharacterTextSplitter(RecursiveCharacterTextSplitter):
|
|||||||
}
|
}
|
||||||
kwargs = {**kwargs, **extra_kwargs}
|
kwargs = {**kwargs, **extra_kwargs}
|
||||||
|
|
||||||
return cls(length_function=_token_encoder, **kwargs)
|
return cls(length_function=_character_encoder, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter):
|
class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter):
|
||||||
@ -103,7 +109,7 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
|
|||||||
_good_splits_lengths = [] # cache the lengths of the splits
|
_good_splits_lengths = [] # cache the lengths of the splits
|
||||||
_separator = "" if self._keep_separator else separator
|
_separator = "" if self._keep_separator else separator
|
||||||
s_lens = self._length_function(splits)
|
s_lens = self._length_function(splits)
|
||||||
if _separator != "":
|
if separator != "":
|
||||||
for s, s_len in zip(splits, s_lens):
|
for s, s_len in zip(splits, s_lens):
|
||||||
if s_len < self._chunk_size:
|
if s_len < self._chunk_size:
|
||||||
_good_splits.append(s)
|
_good_splits.append(s)
|
||||||
|
@ -553,7 +553,7 @@ class DocumentService:
|
|||||||
{"id": "remove_extra_spaces", "enabled": True},
|
{"id": "remove_extra_spaces", "enabled": True},
|
||||||
{"id": "remove_urls_emails", "enabled": False},
|
{"id": "remove_urls_emails", "enabled": False},
|
||||||
],
|
],
|
||||||
"segmentation": {"delimiter": "\n", "max_tokens": 500, "chunk_overlap": 50},
|
"segmentation": {"delimiter": "\n", "max_tokens": 1024, "chunk_overlap": 50},
|
||||||
},
|
},
|
||||||
"limits": {
|
"limits": {
|
||||||
"indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH,
|
"indexing_max_segmentation_tokens_length": dify_config.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH,
|
||||||
|
@ -97,7 +97,7 @@ export enum IndexingType {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
|
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
|
||||||
const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
|
const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024
|
||||||
const DEFAULT_OVERLAP = 50
|
const DEFAULT_OVERLAP = 50
|
||||||
const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)
|
const MAXIMUM_CHUNK_TOKEN_LENGTH = Number.parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)
|
||||||
|
|
||||||
@ -117,11 +117,11 @@ const defaultParentChildConfig: ParentChildConfig = {
|
|||||||
chunkForContext: 'paragraph',
|
chunkForContext: 'paragraph',
|
||||||
parent: {
|
parent: {
|
||||||
delimiter: '\\n\\n',
|
delimiter: '\\n\\n',
|
||||||
maxLength: 500,
|
maxLength: 1024,
|
||||||
},
|
},
|
||||||
child: {
|
child: {
|
||||||
delimiter: '\\n',
|
delimiter: '\\n',
|
||||||
maxLength: 200,
|
maxLength: 512,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -623,12 +623,12 @@ const StepTwo = ({
|
|||||||
onChange={e => setSegmentIdentifier(e.target.value, true)}
|
onChange={e => setSegmentIdentifier(e.target.value, true)}
|
||||||
/>
|
/>
|
||||||
<MaxLengthInput
|
<MaxLengthInput
|
||||||
unit='tokens'
|
unit='characters'
|
||||||
value={maxChunkLength}
|
value={maxChunkLength}
|
||||||
onChange={setMaxChunkLength}
|
onChange={setMaxChunkLength}
|
||||||
/>
|
/>
|
||||||
<OverlapInput
|
<OverlapInput
|
||||||
unit='tokens'
|
unit='characters'
|
||||||
value={overlap}
|
value={overlap}
|
||||||
min={1}
|
min={1}
|
||||||
onChange={setOverlap}
|
onChange={setOverlap}
|
||||||
@ -756,7 +756,7 @@ const StepTwo = ({
|
|||||||
})}
|
})}
|
||||||
/>
|
/>
|
||||||
<MaxLengthInput
|
<MaxLengthInput
|
||||||
unit='tokens'
|
unit='characters'
|
||||||
value={parentChildConfig.parent.maxLength}
|
value={parentChildConfig.parent.maxLength}
|
||||||
onChange={value => setParentChildConfig({
|
onChange={value => setParentChildConfig({
|
||||||
...parentChildConfig,
|
...parentChildConfig,
|
||||||
@ -803,7 +803,7 @@ const StepTwo = ({
|
|||||||
})}
|
})}
|
||||||
/>
|
/>
|
||||||
<MaxLengthInput
|
<MaxLengthInput
|
||||||
unit='tokens'
|
unit='characters'
|
||||||
value={parentChildConfig.child.maxLength}
|
value={parentChildConfig.child.maxLength}
|
||||||
onChange={value => setParentChildConfig({
|
onChange={value => setParentChildConfig({
|
||||||
...parentChildConfig,
|
...parentChildConfig,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user