mirror of
https://www.modelscope.cn/OpenBMB/MiniCPM-o-2_6.git
synced 2025-04-18 15:49:34 +08:00
Release interface in processing
This commit is contained in:
parent
64edf3d723
commit
4f835638ec
@ -102,6 +102,31 @@ class MiniCPMOProcessor(ProcessorMixin):
|
||||
|
||||
return MiniCPMOBatchFeature(data={**model_inputs})
|
||||
|
||||
def get_audio_placeholder(self, audio_lens, chunk_input, chunk_length):
    """Build the placeholder string that reserves token slots for one audio clip.

    The clip length is converted to a number of audio embeddings in three
    steps: ceil-divide by the feature extractor's hop length, apply a
    stride-2 reduction ``(n - 1) // 2 + 1``, then apply a pooling reduction
    with step 2. One ``"<unk>"`` is emitted per resulting embedding,
    wrapped in the tokenizer's ``audio_start`` / ``audio_end`` markers.

    Args:
        audio_lens: Length of the audio; it is divided by
            ``self.feature_extractor.hop_length``. The caller in this file
            passes ``len(a)`` of each audio array.
        chunk_input: When truthy, the placeholders are split into several
            ``audio_start ... audio_end`` groups, one per chunk; otherwise a
            single group holds every placeholder.
        chunk_length: Chunk size; ``int(chunk_length * 100)`` is used as the
            number of feature frames per chunk (presumably seconds at 100
            frames per second -- TODO confirm). Only read when
            ``chunk_input`` is truthy.

    Returns:
        The placeholder string.
    """
    pool_step = 2

    def _after_conv(n):
        # Length after the stride-2 reduction.
        return (n - 1) // 2 + 1

    def _after_pool(n):
        # Length after pooling with window == stride == pool_step.
        return (n - pool_step) // pool_step + 1

    n_frames = math.ceil(audio_lens / self.feature_extractor.hop_length)
    n_embeds = _after_pool(_after_conv(n_frames))

    if not chunk_input:
        # Single group covering the whole clip.
        return "".join((self.tokenizer.audio_start, "<unk>" * n_embeds, self.tokenizer.audio_end))

    # Number of embeddings produced by one full chunk of audio.
    per_chunk = _after_pool(_after_conv(int(chunk_length * 100)))
    # Ceil-division: how many chunks are needed to hold every embedding.
    n_chunks = (n_embeds + per_chunk - 1) // per_chunk

    groups = []
    emitted = 0
    for _ in range(n_chunks):
        # The last chunk may be shorter than a full one.
        take = min(per_chunk, n_embeds - emitted)
        groups.append(self.tokenizer.audio_start + "<unk>" * take + self.tokenizer.audio_end)
        emitted += take
    return "".join(groups)
|
||||
|
||||
def audio_feature_extract(
|
||||
self,
|
||||
audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
|
||||
@ -111,31 +136,6 @@ class MiniCPMOProcessor(ProcessorMixin):
|
||||
chunk_length: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
def get_audio_placeholder(audio_lens, chunk_input):
|
||||
pool_step = 2
|
||||
feature_lens = math.ceil(audio_lens / self.feature_extractor.hop_length)
|
||||
|
||||
feature_lens = (feature_lens - 1) // 2 + 1
|
||||
output_lens = (feature_lens - pool_step) // pool_step + 1
|
||||
|
||||
if chunk_input:
|
||||
fbank_feat_in_chunk = int(chunk_length * 100)
|
||||
cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
|
||||
audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
|
||||
num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
|
||||
|
||||
place_holders = ""
|
||||
total_unk_len = 0
|
||||
for _ in range(num_audio_chunks):
|
||||
unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
|
||||
place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
|
||||
total_unk_len += unk_len
|
||||
audio_placeholder = place_holders
|
||||
else:
|
||||
audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
|
||||
|
||||
return audio_placeholder
|
||||
|
||||
if isinstance(audios, np.ndarray):
|
||||
audios_list = [[audios]]
|
||||
elif isinstance(audios[0], np.ndarray):
|
||||
@ -156,7 +156,7 @@ class MiniCPMOProcessor(ProcessorMixin):
|
||||
# audio placeholder not dependent on audio_parts
|
||||
for audios in audios_list:
|
||||
if audios:
|
||||
audio_ph_list.append([get_audio_placeholder(len(a), chunk_input) for a in audios])
|
||||
audio_ph_list.append([self.get_audio_placeholder(len(a), chunk_input, chunk_length) for a in audios])
|
||||
else:
|
||||
audio_ph_list.append([])
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user