mirror of
https://www.modelscope.cn/OpenBMB/MiniCPM-o-2_6.git
synced 2025-08-18 22:55:57 +08:00
Support audio fine-tuning
This commit is contained in:
parent
161f4b7db3
commit
64edf3d723
@ -466,7 +466,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|||||||
else:
|
else:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def get_audio_embedding(self, data, chunk_length=-1):
|
def get_audio_embedding(self, data, chunk_length=-1, dummy=True):
|
||||||
r"""
|
r"""
|
||||||
Extract full audio embeddings with optional chunk-based attention.
|
Extract full audio embeddings with optional chunk-based attention.
|
||||||
|
|
||||||
@ -484,6 +484,8 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|||||||
Returns:
|
Returns:
|
||||||
List[List[torch.Tensor]]: audio embeddings
|
List[List[torch.Tensor]]: audio embeddings
|
||||||
"""
|
"""
|
||||||
|
dtype = self.apm.embed_positions.weight.dtype
|
||||||
|
device = self.apm.embed_positions.weight.device
|
||||||
|
|
||||||
wavforms = data.get("audio_features", []) # (bs, 80, frames) or [], multi audios need filled in advance
|
wavforms = data.get("audio_features", []) # (bs, 80, frames) or [], multi audios need filled in advance
|
||||||
audio_feature_lens_raw = data.get("audio_feature_lens", []) # list, [[x1, x2], [y1], [z1]]
|
audio_feature_lens_raw = data.get("audio_feature_lens", []) # list, [[x1, x2], [y1], [z1]]
|
||||||
@ -544,6 +546,17 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|||||||
idx += 1
|
idx += 1
|
||||||
final_audio_embeds.append(target_audio_embeds)
|
final_audio_embeds.append(target_audio_embeds)
|
||||||
return final_audio_embeds
|
return final_audio_embeds
|
||||||
|
elif self.training and dummy:
|
||||||
|
dummy_wavs = torch.zeros((1, 80, 100), device=device, dtype=dtype)
|
||||||
|
audio_states = self.apm(dummy_wavs, output_hidden_states=True).hidden_states[self.audio_encoder_layer]
|
||||||
|
|
||||||
|
audio_embeds = self.audio_projection_layer(audio_states)
|
||||||
|
|
||||||
|
audio_embeds = audio_embeds.transpose(1, 2)
|
||||||
|
audio_embeds = self.audio_avg_pooler(audio_embeds)
|
||||||
|
audio_embeds = audio_embeds.transpose(1, 2)
|
||||||
|
return [audio_embeds]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
@ -576,7 +589,7 @@ class MiniCPMO(MiniCPMOPreTrainedModel):
|
|||||||
audio_start_pos = 0
|
audio_start_pos = 0
|
||||||
for bound in audio_bounds[i]:
|
for bound in audio_bounds[i]:
|
||||||
audio_len = bound[1] - bound[0]
|
audio_len = bound[1] - bound[0]
|
||||||
input_embeddings[0, bound[0] : bound[1]] = audio_embs[
|
input_embeddings[i, bound[0] : bound[1]] = audio_embs[
|
||||||
audio_start_pos : audio_start_pos + audio_len, :
|
audio_start_pos : audio_start_pos + audio_len, :
|
||||||
]
|
]
|
||||||
audio_start_pos += audio_len
|
audio_start_pos += audio_len
|
||||||
|
Loading…
x
Reference in New Issue
Block a user