From f3bcc1af05e4863340ebc6c4634dbaed05dbeaed Mon Sep 17 00:00:00 2001 From: "lyuxiang.lx" Date: Mon, 7 Apr 2025 21:48:19 +0800 Subject: [PATCH] add flow cache model --- .gitattributes | 1 + cosyvoice.yaml => cosyvoice2.yaml | 105 ++++++++++++++++++++++++-- flow.cache.pt | 3 + flow.decoder.estimator.fp16.a10.plan | 3 - flow.decoder.estimator.fp16.l20.plan | 3 - flow.decoder.estimator.fp16.v100.plan | 3 - flow.decoder.estimator.fp32.onnx | 4 +- flow.encoder.fp16.zip | 4 +- flow.encoder.fp32.zip | 4 +- 9 files changed, 109 insertions(+), 21 deletions(-) rename cosyvoice.yaml => cosyvoice2.yaml (61%) create mode 100644 flow.cache.pt delete mode 100644 flow.decoder.estimator.fp16.a10.plan delete mode 100644 flow.decoder.estimator.fp16.l20.plan delete mode 100644 flow.decoder.estimator.fp16.v100.plan diff --git a/.gitattributes b/.gitattributes index 4bc72bd..536468e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -59,3 +59,4 @@ flow.decoder.estimator.fp16.l20.plan filter=lfs diff=lfs merge=lfs -text flow.decoder.estimator.fp16.a10.plan filter=lfs diff=lfs merge=lfs -text flow.decoder.estimator.fp16.v100.plan filter=lfs diff=lfs merge=lfs -text flow.encoder.fp16.zip filter=lfs diff=lfs merge=lfs -text +flow.cache.pt filter=lfs diff=lfs merge=lfs -text diff --git a/cosyvoice.yaml b/cosyvoice2.yaml similarity index 61% rename from cosyvoice.yaml rename to cosyvoice2.yaml index c9a7848..d6bdeb6 100644 --- a/cosyvoice.yaml +++ b/cosyvoice2.yaml @@ -10,6 +10,12 @@ llm_input_size: 896 llm_output_size: 896 spk_embed_dim: 192 qwen_pretrain_path: '' +token_frame_rate: 25 +token_mel_ratio: 2 + +# stream related params +chunk_size: 25 # streaming inference chunk size, in token +num_decoding_left_chunks: 1 # streaming inference flow decoder left chunk size, <0 means use all left chunks # model params # for all class/function included in this repo, we use ! or ! for intialization, so that user may find all corresponding class/function according to one single yaml. @@ -20,6 +26,7 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM speech_token_size: 6561 length_normalized_loss: True lsm_weight: 0 + mix_ratio: [5, 15] llm: !new:cosyvoice.llm.llm.Qwen2Encoder pretrain_path: !ref sampling: !name:cosyvoice.utils.common.ras_sampling @@ -34,9 +41,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec spk_embed_dim: !ref output_type: 'mel' vocab_size: 6561 - input_frame_rate: 25 + input_frame_rate: !ref only_mask_loss: True - token_mel_ratio: 2 + token_mel_ratio: !ref pre_lookahead_len: 3 encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder output_size: 512 @@ -53,6 +60,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec input_size: 512 use_cnn_module: False macaron_style: False + static_chunk_size: !ref decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM in_channels: 240 n_spks: 1 @@ -65,10 +73,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec training_cfg_rate: 0.2 inference_cfg_rate: 0.7 reg_loss_type: 'l1' - estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder + estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder in_channels: 320 out_channels: 80 - causal: True channels: [256] dropout: 0.0 attention_head_dim: 64 @@ -76,6 +83,8 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec num_mid_blocks: 12 num_heads: 8 act_fn: 'gelu' + static_chunk_size: !ref * + num_decoding_left_chunks: !ref hift: !new:cosyvoice.hifigan.generator.HiFTGenerator in_channels: 80 @@ -101,6 +110,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator in_channels: 80 cond_channels: 512 +# gan related module +mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram + n_fft: 1920 + num_mels: 80 + sampling_rate: !ref + hop_size: 480 + win_size: 1920 + fmin: 0 + fmax: null + center: False +hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan + generator: !ref + discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator + mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator + mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator + mel_spec_transform: [ + !ref + ] + # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer @@ -112,11 +140,13 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize allowed_special: !ref filter: !name:cosyvoice.dataset.processor.filter max_length: 40960 - min_length: 0 + min_length: 100 token_max_length: 200 token_min_length: 1 resample: !name:cosyvoice.dataset.processor.resample resample_rate: !ref +truncate: !name:cosyvoice.dataset.processor.truncate + truncate_length: 24480 # must be a multiplier of hop_size feat_extractor: !name:matcha.utils.audio.mel_spectrogram n_fft: 1920 num_mels: 80 @@ -128,6 +158,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram center: False compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank feat_extractor: !ref +compute_f0: !name:cosyvoice.dataset.processor.compute_f0 + sample_rate: !ref + hop_size: 480 parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding normalize: True shuffle: !name:cosyvoice.dataset.processor.shuffle @@ -137,4 +170,64 @@ sort: !name:cosyvoice.dataset.processor.sort batch: !name:cosyvoice.dataset.processor.batch batch_type: 'dynamic' max_frames_in_batch: 2000 -padding: !name:cosyvoice.dataset.processor.padding \ No newline at end of file +padding: !name:cosyvoice.dataset.processor.padding + use_spk_embedding: False # change to True during sft + + +# dataset processor pipeline +data_pipeline: [ + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , +] +data_pipeline_gan: [ + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , + !ref , +] + +# llm flow train conf +train_conf: + optim: adam + optim_conf: + lr: 1e-5 # change to 1e-5 during sft + scheduler: constantlr # change to constantlr during sft + scheduler_conf: + warmup_steps: 2500 + max_epoch: 200 + grad_clip: 5 + accum_grad: 2 + log_interval: 100 + save_per_step: -1 + +# gan train conf +train_conf_gan: + optim: adam + optim_conf: + lr: 0.0002 # use small lr for gan training + scheduler: constantlr + optim_d: adam + optim_conf_d: + lr: 0.0002 # use small lr for gan training + scheduler_d: constantlr + max_epoch: 200 + grad_clip: 5 + accum_grad: 1 # in gan training, accum_grad must be 1 + log_interval: 100 + save_per_step: -1 \ No newline at end of file diff --git a/flow.cache.pt b/flow.cache.pt new file mode 100644 index 0000000..b87ad4c --- /dev/null +++ b/flow.cache.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ebde248652c6eed855e08bb6a263af3847039a1361f8019bdb27f5f680a1dc4 +size 450496991 diff --git a/flow.decoder.estimator.fp16.a10.plan b/flow.decoder.estimator.fp16.a10.plan deleted file mode 100644 index 5bf713e..0000000 --- a/flow.decoder.estimator.fp16.a10.plan +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f6b9073bd9e7b8ac5bef0a21431391cbc32376b9265ec73935d6f28a0d32d01 -size 168597292 diff --git a/flow.decoder.estimator.fp16.l20.plan b/flow.decoder.estimator.fp16.l20.plan deleted file mode 100644 index aa7bf51..0000000 --- a/flow.decoder.estimator.fp16.l20.plan +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:012df9e730e36e1cb61bf2780378c15ae92c536ae87518b7a54a90026cb99385 -size 166520788 diff --git a/flow.decoder.estimator.fp16.v100.plan b/flow.decoder.estimator.fp16.v100.plan deleted file mode 100644 index 1f6b504..0000000 --- a/flow.decoder.estimator.fp16.v100.plan +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f231edf01451fafbc3dc0498a51feb3a264afad43275536c8151fff954ef3c56 -size 161799540 diff --git a/flow.decoder.estimator.fp32.onnx b/flow.decoder.estimator.fp32.onnx index e2c9281..848ab3e 100644 --- a/flow.decoder.estimator.fp32.onnx +++ b/flow.decoder.estimator.fp32.onnx @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51aed3efa2c153898ea53a780893c920e968dab1d7aec25402bd6c9815d94702 -size 286521895 +oid sha256:8fdcbbc2b5bfbbe7fa50260818bdb7a91e7dd12cd53117574512d8fdebdece5b +size 286644900 diff --git a/flow.encoder.fp16.zip b/flow.encoder.fp16.zip index 1fa00a5..758189d 100644 --- a/flow.encoder.fp16.zip +++ b/flow.encoder.fp16.zip @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46d2539ad8bdb90026cd50cb42e45bd389f10108111d742b912feddca105aeb6 -size 116703414 +oid sha256:717970406ec450109f7f79ee591c66ae1b7f4ff5c8fc2cca4602699df59b086b +size 185950580 diff --git a/flow.encoder.fp32.zip b/flow.encoder.fp32.zip index 261114f..afeb0b4 100644 --- a/flow.encoder.fp32.zip +++ b/flow.encoder.fp32.zip @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32ac668741e1358123d3c066cfd1f68a81bd386028755be9831509e304bfd98c -size 192365750 +oid sha256:ecf7fcb9f4cef30029a901ff9da3808325edf3a24006368c4e6f95b6c6f030ec +size 330818868