From f3bcc1af05e4863340ebc6c4634dbaed05dbeaed Mon Sep 17 00:00:00 2001
From: "lyuxiang.lx" <lyuxiang.lx@alibaba-inc.com>
Date: Mon, 7 Apr 2025 21:48:19 +0800
Subject: [PATCH] add flow cache model

---
 .gitattributes                        |   1 +
 cosyvoice.yaml => cosyvoice2.yaml     | 105 ++++++++++++++++++++++++--
 flow.cache.pt                         |   3 +
 flow.decoder.estimator.fp16.a10.plan  |   3 -
 flow.decoder.estimator.fp16.l20.plan  |   3 -
 flow.decoder.estimator.fp16.v100.plan |   3 -
 flow.decoder.estimator.fp32.onnx      |   4 +-
 flow.encoder.fp16.zip                 |   4 +-
 flow.encoder.fp32.zip                 |   4 +-
 9 files changed, 109 insertions(+), 21 deletions(-)
 rename cosyvoice.yaml => cosyvoice2.yaml (61%)
 create mode 100644 flow.cache.pt
 delete mode 100644 flow.decoder.estimator.fp16.a10.plan
 delete mode 100644 flow.decoder.estimator.fp16.l20.plan
 delete mode 100644 flow.decoder.estimator.fp16.v100.plan
diff --git a/.gitattributes b/.gitattributes
index 4bc72bd..536468e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -59,3 +59,4 @@ flow.decoder.estimator.fp16.l20.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.a10.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.v100.plan filter=lfs diff=lfs merge=lfs -text
 flow.encoder.fp16.zip filter=lfs diff=lfs merge=lfs -text
+flow.cache.pt filter=lfs diff=lfs merge=lfs -text
diff --git a/cosyvoice.yaml b/cosyvoice2.yaml
similarity index 61%
rename from cosyvoice.yaml
rename to cosyvoice2.yaml
index c9a7848..d6bdeb6 100644
--- a/cosyvoice.yaml
+++ b/cosyvoice2.yaml
@@ -10,6 +10,12 @@ llm_input_size: 896
 llm_output_size: 896
 spk_embed_dim: 192
 qwen_pretrain_path: ''
+token_frame_rate: 25
+token_mel_ratio: 2
+
+# stream related params
+chunk_size: 25 # streaming inference chunk size, in tokens
+num_decoding_left_chunks: 1 # streaming inference: number of left chunks used by the flow decoder, <0 means use all left chunks
 
 # model params
 # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
@@ -20,6 +26,7 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
     speech_token_size: 6561
     length_normalized_loss: True
     lsm_weight: 0
+    mix_ratio: [5, 15]
     llm: !new:cosyvoice.llm.llm.Qwen2Encoder
         pretrain_path: !ref <qwen_pretrain_path>
     sampling: !name:cosyvoice.utils.common.ras_sampling
@@ -34,9 +41,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
     spk_embed_dim: !ref <spk_embed_dim>
     output_type: 'mel'
     vocab_size: 6561
-    input_frame_rate: 25
+    input_frame_rate: !ref <token_frame_rate>
     only_mask_loss: True
-    token_mel_ratio: 2
+    token_mel_ratio: !ref <token_mel_ratio>
     pre_lookahead_len: 3
     encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
         output_size: 512
@@ -53,6 +60,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
         input_size: 512
         use_cnn_module: False
         macaron_style: False
+        static_chunk_size: !ref <chunk_size>
     decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
         in_channels: 240
         n_spks: 1
@@ -65,10 +73,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
                 training_cfg_rate: 0.2
                 inference_cfg_rate: 0.7
                 reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+        estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
             in_channels: 320
             out_channels: 80
-            causal: True
             channels: [256]
             dropout: 0.0
             attention_head_dim: 64
@@ -76,6 +83,8 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
             num_mid_blocks: 12
             num_heads: 8
             act_fn: 'gelu'
+            static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
+            num_decoding_left_chunks: !ref <num_decoding_left_chunks>
 
 hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
     in_channels: 80
@@ -101,6 +110,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
         in_channels: 80
         cond_channels: 512
 
+# gan related module
+mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1920
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 480
+    win_size: 1920
+    fmin: 0
+    fmax: null
+    center: False
+hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
+    generator: !ref <hift>
+    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
+        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
+        mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
+    mel_spec_transform: [
+        !ref <mel_spec_transform1>
+    ]
+
 # processor functions
 parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
 get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
@@ -112,11 +140,13 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
     allowed_special: !ref <allowed_special>
 filter: !name:cosyvoice.dataset.processor.filter
     max_length: 40960
-    min_length: 0
+    min_length: 100
     token_max_length: 200
     token_min_length: 1
 resample: !name:cosyvoice.dataset.processor.resample
     resample_rate: !ref <sample_rate>
+truncate: !name:cosyvoice.dataset.processor.truncate
+    truncate_length: 24480 # must be a multiple of hop_size
 feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     n_fft: 1920
     num_mels: 80
@@ -128,6 +158,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
     feat_extractor: !ref <feat_extractor>
+compute_f0: !name:cosyvoice.dataset.processor.compute_f0
+    sample_rate: !ref <sample_rate>
+    hop_size: 480
 parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
     normalize: True
 shuffle: !name:cosyvoice.dataset.processor.shuffle
@@ -137,4 +170,64 @@ sort: !name:cosyvoice.dataset.processor.sort
 batch: !name:cosyvoice.dataset.processor.batch
     batch_type: 'dynamic'
     max_frames_in_batch: 2000
-padding: !name:cosyvoice.dataset.processor.padding
\ No newline at end of file
+padding: !name:cosyvoice.dataset.processor.padding
+    use_spk_embedding: False # change to True during sft
+
+
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <compute_fbank>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+data_pipeline_gan: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <truncate>,
+    !ref <compute_fbank>,
+    !ref <compute_f0>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+
+# llm flow train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 1e-5 # sft setting (already set to 1e-5 here)
+    scheduler: constantlr # sft setting (already set to constantlr here)
+    scheduler_conf:
+        warmup_steps: 2500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: -1
+
+# gan train conf
+train_conf_gan:
+    optim: adam
+    optim_conf:
+        lr: 0.0002 # use small lr for gan training
+    scheduler: constantlr
+    optim_d: adam
+    optim_conf_d:
+        lr: 0.0002 # use small lr for gan training
+    scheduler_d: constantlr
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 1 # in gan training, accum_grad must be 1
+    log_interval: 100
+    save_per_step: -1
\ No newline at end of file
diff --git a/flow.cache.pt b/flow.cache.pt
new file mode 100644
index 0000000..b87ad4c
--- /dev/null
+++ b/flow.cache.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ebde248652c6eed855e08bb6a263af3847039a1361f8019bdb27f5f680a1dc4
+size 450496991
diff --git a/flow.decoder.estimator.fp16.a10.plan b/flow.decoder.estimator.fp16.a10.plan
deleted file mode 100644
index 5bf713e..0000000
--- a/flow.decoder.estimator.fp16.a10.plan
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6f6b9073bd9e7b8ac5bef0a21431391cbc32376b9265ec73935d6f28a0d32d01
-size 168597292
diff --git a/flow.decoder.estimator.fp16.l20.plan b/flow.decoder.estimator.fp16.l20.plan
deleted file mode 100644
index aa7bf51..0000000
--- a/flow.decoder.estimator.fp16.l20.plan
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:012df9e730e36e1cb61bf2780378c15ae92c536ae87518b7a54a90026cb99385
-size 166520788
diff --git a/flow.decoder.estimator.fp16.v100.plan b/flow.decoder.estimator.fp16.v100.plan
deleted file mode 100644
index 1f6b504..0000000
--- a/flow.decoder.estimator.fp16.v100.plan
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f231edf01451fafbc3dc0498a51feb3a264afad43275536c8151fff954ef3c56
-size 161799540
diff --git a/flow.decoder.estimator.fp32.onnx b/flow.decoder.estimator.fp32.onnx
index e2c9281..848ab3e 100644
--- a/flow.decoder.estimator.fp32.onnx
+++ b/flow.decoder.estimator.fp32.onnx
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:51aed3efa2c153898ea53a780893c920e968dab1d7aec25402bd6c9815d94702
-size 286521895
+oid sha256:8fdcbbc2b5bfbbe7fa50260818bdb7a91e7dd12cd53117574512d8fdebdece5b
+size 286644900
diff --git a/flow.encoder.fp16.zip b/flow.encoder.fp16.zip
index 1fa00a5..758189d 100644
--- a/flow.encoder.fp16.zip
+++ b/flow.encoder.fp16.zip
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:46d2539ad8bdb90026cd50cb42e45bd389f10108111d742b912feddca105aeb6
-size 116703414
+oid sha256:717970406ec450109f7f79ee591c66ae1b7f4ff5c8fc2cca4602699df59b086b
+size 185950580
diff --git a/flow.encoder.fp32.zip b/flow.encoder.fp32.zip
index 261114f..afeb0b4 100644
--- a/flow.encoder.fp32.zip
+++ b/flow.encoder.fp32.zip
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:32ac668741e1358123d3c066cfd1f68a81bd386028755be9831509e304bfd98c
-size 192365750
+oid sha256:ecf7fcb9f4cef30029a901ff9da3808325edf3a24006368c4e6f95b6c6f030ec
+size 330818868