mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-15 20:46:01 +08:00
fix splitter length missed (#7987)
This commit is contained in:
parent
f6b9982c23
commit
0e71f6db84
@ -93,17 +93,21 @@ class FixedRecursiveCharacterTextSplitter(EnhanceRecursiveCharacterTextSplitter)
|
|||||||
splits = list(text)
|
splits = list(text)
|
||||||
# Now go merging things, recursively splitting longer texts.
|
# Now go merging things, recursively splitting longer texts.
|
||||||
_good_splits = []
|
_good_splits = []
|
||||||
|
_good_splits_lengths = [] # cache the lengths of the splits
|
||||||
for s in splits:
|
for s in splits:
|
||||||
if self._length_function(s) < self._chunk_size:
|
s_len = self._length_function(s)
|
||||||
|
if s_len < self._chunk_size:
|
||||||
_good_splits.append(s)
|
_good_splits.append(s)
|
||||||
|
_good_splits_lengths.append(s_len)
|
||||||
else:
|
else:
|
||||||
if _good_splits:
|
if _good_splits:
|
||||||
merged_text = self._merge_splits(_good_splits, separator)
|
merged_text = self._merge_splits(_good_splits, separator, _good_splits_lengths)
|
||||||
final_chunks.extend(merged_text)
|
final_chunks.extend(merged_text)
|
||||||
_good_splits = []
|
_good_splits = []
|
||||||
|
_good_splits_lengths = []
|
||||||
other_info = self.recursive_split_text(s)
|
other_info = self.recursive_split_text(s)
|
||||||
final_chunks.extend(other_info)
|
final_chunks.extend(other_info)
|
||||||
if _good_splits:
|
if _good_splits:
|
||||||
merged_text = self._merge_splits(_good_splits, separator)
|
merged_text = self._merge_splits(_good_splits, separator, _good_splits_lengths)
|
||||||
final_chunks.extend(merged_text)
|
final_chunks.extend(merged_text)
|
||||||
return final_chunks
|
return final_chunks
|
||||||
|
@ -243,7 +243,10 @@ class CharacterTextSplitter(TextSplitter):
|
|||||||
# First we naively split the large input into a bunch of smaller ones.
|
# First we naively split the large input into a bunch of smaller ones.
|
||||||
splits = _split_text_with_regex(text, self._separator, self._keep_separator)
|
splits = _split_text_with_regex(text, self._separator, self._keep_separator)
|
||||||
_separator = "" if self._keep_separator else self._separator
|
_separator = "" if self._keep_separator else self._separator
|
||||||
return self._merge_splits(splits, _separator)
|
_good_splits_lengths = [] # cache the lengths of the splits
|
||||||
|
for split in splits:
|
||||||
|
_good_splits_lengths.append(self._length_function(split))
|
||||||
|
return self._merge_splits(splits, _separator, _good_splits_lengths)
|
||||||
|
|
||||||
|
|
||||||
class LineType(TypedDict):
|
class LineType(TypedDict):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user