From 4f835638ec1860b9621f4d74cac04cf5f422cce4 Mon Sep 17 00:00:00 2001
From: Hongji Zhu
Date: Fri, 7 Feb 2025 15:00:29 +0800
Subject: [PATCH] Release interface in processing

---
 processing_minicpmo.py | 52 +++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/processing_minicpmo.py b/processing_minicpmo.py
index 65e775e..5a62b04 100644
--- a/processing_minicpmo.py
+++ b/processing_minicpmo.py
@@ -102,6 +102,31 @@ class MiniCPMOProcessor(ProcessorMixin):
 
         return MiniCPMOBatchFeature(data={**model_inputs})
 
+    def get_audio_placeholder(self, audio_lens, chunk_input, chunk_length):
+        pool_step = 2
+        feature_lens = math.ceil(audio_lens / self.feature_extractor.hop_length)
+
+        feature_lens = (feature_lens - 1) // 2 + 1
+        output_lens = (feature_lens - pool_step) // pool_step + 1
+
+        if chunk_input:
+            fbank_feat_in_chunk = int(chunk_length * 100)
+            cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
+            audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
+            num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
+
+            place_holders = ""
+            total_unk_len = 0
+            for _ in range(num_audio_chunks):
+                unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
+                place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
+                total_unk_len += unk_len
+            audio_placeholder = place_holders
+        else:
+            audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
+
+        return audio_placeholder
+
     def audio_feature_extract(
         self,
         audios: Union[np.ndarray, List[np.ndarray], List[List[np.ndarray]]],
@@ -111,31 +136,6 @@ class MiniCPMOProcessor(ProcessorMixin):
         chunk_length: Optional[int] = 1,
         **kwargs,
     ):
-        def get_audio_placeholder(audio_lens, chunk_input):
-            pool_step = 2
-            feature_lens = math.ceil(audio_lens / self.feature_extractor.hop_length)
-
-            feature_lens = (feature_lens - 1) // 2 + 1
-            output_lens = (feature_lens - pool_step) // pool_step + 1
-
-            if chunk_input:
-                fbank_feat_in_chunk = int(chunk_length * 100)
-                cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
-                audio_embeds_in_chunk = (cnn_feat_in_chunk - pool_step) // pool_step + 1
-                num_audio_chunks = (output_lens + audio_embeds_in_chunk - 1) // audio_embeds_in_chunk
-
-                place_holders = ""
-                total_unk_len = 0
-                for _ in range(num_audio_chunks):
-                    unk_len = min(audio_embeds_in_chunk, output_lens - total_unk_len)
-                    place_holders += self.tokenizer.audio_start + "<unk>" * unk_len + self.tokenizer.audio_end
-                    total_unk_len += unk_len
-                audio_placeholder = place_holders
-            else:
-                audio_placeholder = self.tokenizer.audio_start + "<unk>" * output_lens + self.tokenizer.audio_end
-
-            return audio_placeholder
-
         if isinstance(audios, np.ndarray):
             audios_list = [[audios]]
         elif isinstance(audios[0], np.ndarray):
@@ -156,7 +156,7 @@ class MiniCPMOProcessor(ProcessorMixin):
 
         # audio placeholder not dependent on audio_parts
         for audios in audios_list:
             if audios:
-                audio_ph_list.append([get_audio_placeholder(len(a), chunk_input) for a in audios])
+                audio_ph_list.append([self.get_audio_placeholder(len(a), chunk_input, chunk_length) for a in audios])
             else:
                 audio_ph_list.append([])