Mirror of https://www.modelscope.cn/iic/CosyVoice2-0.5B.git
Synced 2025-04-19 08:09:31 +08:00

Commit f3bcc1af05: add flow cache model
Parent: 9bd5b08fc0
.gitattributes (vendored): 1 line added

@@ -59,3 +59,4 @@ flow.decoder.estimator.fp16.l20.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.a10.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.v100.plan filter=lfs diff=lfs merge=lfs -text
 flow.encoder.fp16.zip filter=lfs diff=lfs merge=lfs -text
+flow.cache.pt filter=lfs diff=lfs merge=lfs -text
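The newly tracked flow.cache.pt follows the same Git LFS pattern as the existing plan/zip artifacts, so a fresh clone holds only a small pointer file until the LFS objects are pulled. A minimal sketch for sanity-checking the download, under the assumption that flow.cache.pt is an ordinary PyTorch checkpoint (this diff does not show how the runtime consumes it; if it were TorchScript, torch.jit.load would be needed instead):

    # Hypothetical sanity check for the LFS-fetched cache file.
    # Assumption: flow.cache.pt is a regular torch checkpoint (e.g. a dict of tensors).
    import os
    import torch

    path = "flow.cache.pt"
    size_mb = os.path.getsize(path) / 1e6
    print(f"{path}: {size_mb:.1f} MB")  # an un-pulled LFS pointer is only ~130 bytes

    obj = torch.load(path, map_location="cpu")
    if isinstance(obj, dict):
        for key, value in list(obj.items())[:10]:
            print(key, getattr(value, "shape", type(value)))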
Model config YAML:

@@ -10,6 +10,12 @@ llm_input_size: 896
 llm_output_size: 896
 spk_embed_dim: 192
 qwen_pretrain_path: ''
+token_frame_rate: 25
+token_mel_ratio: 2
+
+# stream related params
+chunk_size: 25 # streaming inference chunk size, in token
+num_decoding_left_chunks: 1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
 
 # model params
 # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
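The new stream-related keys pin down the granularity of streaming synthesis: at token_frame_rate 25, a chunk_size of 25 speech tokens corresponds to one second of audio, and with token_mel_ratio 2 each chunk maps to 50 mel frames. A quick back-of-the-envelope check using only the values shown above:

    # Back-of-the-envelope chunk sizing from the values added in this hunk.
    token_frame_rate = 25    # speech tokens per second
    token_mel_ratio = 2      # mel frames generated per speech token
    chunk_size = 25          # streaming chunk, in tokens

    chunk_seconds = chunk_size / token_frame_rate        # 1.0 s of audio per chunk
    mel_frames_per_chunk = chunk_size * token_mel_ratio  # 50 mel frames per chunk
    print(chunk_seconds, mel_frames_per_chunk)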
@@ -20,6 +26,7 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
     speech_token_size: 6561
     length_normalized_loss: True
     lsm_weight: 0
+    mix_ratio: [5, 15]
     llm: !new:cosyvoice.llm.llm.Qwen2Encoder
         pretrain_path: !ref <qwen_pretrain_path>
     sampling: !name:cosyvoice.utils.common.ras_sampling
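mix_ratio appears to set the text-to-speech-token interleaving used by Qwen2LM in streaming mode, i.e. blocks of 5 text tokens alternating with 15 speech tokens (the parameter is consumed inside cosyvoice.llm.llm, which this diff does not touch). A toy illustration of that 5:15 pattern, with hypothetical token lists:

    # Toy illustration of a 5:15 text/speech interleaving pattern.
    # Assumption: mix_ratio = [text_tokens_per_block, speech_tokens_per_block].
    mix_ratio = [5, 15]

    def interleave(text_tokens, speech_tokens, ratio):
        """Alternate blocks of ratio[0] text tokens and ratio[1] speech tokens."""
        t = s = 0
        out = []
        while t < len(text_tokens) or s < len(speech_tokens):
            out.extend(text_tokens[t:t + ratio[0]])
            t += ratio[0]
            out.extend(speech_tokens[s:s + ratio[1]])
            s += ratio[1]
        return out

    text = [f"T{i}" for i in range(12)]
    speech = [f"S{i}" for i in range(40)]
    print(interleave(text, speech, mix_ratio)[:25])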
@@ -34,9 +41,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
     spk_embed_dim: !ref <spk_embed_dim>
     output_type: 'mel'
     vocab_size: 6561
-    input_frame_rate: 25
+    input_frame_rate: !ref <token_frame_rate>
     only_mask_loss: True
-    token_mel_ratio: 2
+    token_mel_ratio: !ref <token_mel_ratio>
     pre_lookahead_len: 3
     encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
         output_size: 512
@@ -53,6 +60,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
         input_size: 512
         use_cnn_module: False
         macaron_style: False
+        static_chunk_size: !ref <chunk_size>
     decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
         in_channels: 240
         n_spks: 1
@@ -65,10 +73,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
                 training_cfg_rate: 0.2
                 inference_cfg_rate: 0.7
                 reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+        estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
            in_channels: 320
            out_channels: 80
-           causal: True
            channels: [256]
            dropout: 0.0
            attention_head_dim: 64
@@ -76,6 +83,8 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
            num_mid_blocks: 12
            num_heads: 8
            act_fn: 'gelu'
+           static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
+           num_decoding_left_chunks: !ref <num_decoding_left_chunks>
 
 hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
     in_channels: 80
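The config is read with HyperPyYAML, whose !ref tag resolves references and evaluates simple arithmetic, so the estimator's static_chunk_size works out to 25 * 2 = 50 mel frames, matching the encoder chunk of 25 tokens after 2x upsampling. A minimal, self-contained sketch of that resolution (toy YAML string, not the repo's loader):

    # Minimal demonstration of HyperPyYAML !ref resolution and arithmetic.
    # Toy YAML only; the real config carries many more keys.
    from hyperpyyaml import load_hyperpyyaml

    yaml_string = """
    chunk_size: 25
    token_mel_ratio: 2
    static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
    """

    cfg = load_hyperpyyaml(yaml_string)
    print(cfg["static_chunk_size"])  # -> 50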
@@ -101,6 +110,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
         in_channels: 80
         cond_channels: 512
 
+# gan related module
+mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1920
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 480
+    win_size: 1920
+    fmin: 0
+    fmax: null
+    center: False
+hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
+    generator: !ref <hift>
+    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
+        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
+        mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
+    mel_spec_transform: [
+        !ref <mel_spec_transform1>
+    ]
+
 # processor functions
 parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
 get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
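The GAN-side mel transform uses an n_fft and win_size of 1920 samples with a hop_size of 480; assuming the usual CosyVoice2 sample_rate of 24000 Hz (the <sample_rate> key is referenced here but defined outside this hunk), that is an 80 ms window with a 20 ms hop, i.e. 50 mel frames per second, consistent with token_frame_rate * token_mel_ratio = 25 * 2. A quick consistency check under that assumption:

    # Consistency check for the mel front-end, assuming sample_rate = 24000 Hz
    # (the sample_rate key is referenced via !ref but not shown in this hunk).
    sample_rate = 24000
    n_fft = 1920
    hop_size = 480
    token_frame_rate = 25
    token_mel_ratio = 2

    window_ms = 1000 * n_fft / sample_rate  # 80.0 ms analysis window
    hop_ms = 1000 * hop_size / sample_rate  # 20.0 ms hop
    mel_fps = sample_rate / hop_size        # 50 mel frames per second
    assert mel_fps == token_frame_rate * token_mel_ratio
    print(window_ms, hop_ms, mel_fps)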
@@ -112,11 +140,13 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
     allowed_special: !ref <allowed_special>
 filter: !name:cosyvoice.dataset.processor.filter
     max_length: 40960
-    min_length: 0
+    min_length: 100
     token_max_length: 200
     token_min_length: 1
 resample: !name:cosyvoice.dataset.processor.resample
     resample_rate: !ref <sample_rate>
+truncate: !name:cosyvoice.dataset.processor.truncate
+    truncate_length: 24480 # must be a multiplier of hop_size
 feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     n_fft: 1920
     num_mels: 80
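The new truncate processor feeds the GAN pipeline added further down; its truncate_length of 24480 samples satisfies the in-line comment, being exactly 51 hops of 480 samples (about 1.02 s at an assumed 24 kHz). Checked below:

    # Verify the "must be a multiplier of hop_size" comment on truncate_length.
    hop_size = 480
    truncate_length = 24480
    assert truncate_length % hop_size == 0
    print(truncate_length // hop_size)  # 51 mel/F0 frames per truncated segment
    print(truncate_length / 24000)      # 1.02 s, assuming a 24 kHz sample rate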
@@ -128,6 +158,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
     feat_extractor: !ref <feat_extractor>
+compute_f0: !name:cosyvoice.dataset.processor.compute_f0
+    sample_rate: !ref <sample_rate>
+    hop_size: 480
 parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
     normalize: True
 shuffle: !name:cosyvoice.dataset.processor.shuffle
@@ -137,4 +170,64 @@ sort: !name:cosyvoice.dataset.processor.sort
 batch: !name:cosyvoice.dataset.processor.batch
     batch_type: 'dynamic'
     max_frames_in_batch: 2000
 padding: !name:cosyvoice.dataset.processor.padding
+    use_spk_embedding: False # change to True during sft
+
+
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <compute_fbank>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+data_pipeline_gan: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <truncate>,
+    !ref <compute_fbank>,
+    !ref <compute_f0>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+
+# llm flow train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 1e-5 # change to 1e-5 during sft
+    scheduler: constantlr # change to constantlr during sft
+    scheduler_conf:
+        warmup_steps: 2500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: -1
+
+# gan train conf
+train_conf_gan:
+    optim: adam
+    optim_conf:
+        lr: 0.0002 # use small lr for gan training
+    scheduler: constantlr
+    optim_d: adam
+    optim_conf_d:
+        lr: 0.0002 # use small lr for gan training
+    scheduler_d: constantlr
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 1 # in gan training, accum_grad must be 1
+    log_interval: 100
+    save_per_step: -1
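data_pipeline and data_pipeline_gan are ordered lists of processor callables (each !name entry becomes a partially applied function), and the GAN variant simply inserts truncate and compute_f0 into the chain. A rough sketch of the chaining idea, illustrative only and not the repo's actual Dataset code:

    # Rough sketch of chaining an ordered list of processors over a sample stream.
    # Illustrative only; cosyvoice.dataset applies its processors successively,
    # but with its own Dataset/IterableDataset machinery.
    from functools import reduce

    def double(samples):
        for s in samples:
            yield s * 2

    def drop_odd(samples):
        for s in samples:
            if s % 2 == 0:
                yield s

    data_pipeline = [double, drop_odd]  # stands in for the !ref processor list
    source = iter(range(6))
    stream = reduce(lambda data, proc: proc(data), data_pipeline, source)
    print(list(stream))  # [0, 2, 4, 6, 8, 10]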
Binary files (stored with Git LFS; contents not shown):
  flow.cache.pt (new file)
  flow.decoder.estimator.fp16.a10.plan
  flow.decoder.estimator.fp16.l20.plan
  flow.decoder.estimator.fp16.v100.plan
  flow.decoder.estimator.fp32.onnx
  flow.encoder.fp16.zip
  flow.encoder.fp32.zip