add flow cache model

2025-07-11 23:21:46 +08:00 · 2025-04-07 21:48:19 +08:00 · 2025-04-07 21:48:19 +08:00 · f3bcc1af05
commit f3bcc1af05
parent 9bd5b08fc0
9 changed files with 109 additions and 21 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -59,3 +59,4 @@ flow.decoder.estimator.fp16.l20.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.a10.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.v100.plan filter=lfs diff=lfs merge=lfs -text
 flow.encoder.fp16.zip filter=lfs diff=lfs merge=lfs -text
+flow.cache.pt filter=lfs diff=lfs merge=lfs -text
--- a/cosyvoice2.yaml
+++ b/cosyvoice2.yaml
@ -10,6 +10,12 @@ llm_input_size: 896
 llm_output_size: 896
 spk_embed_dim: 192
 qwen_pretrain_path: ''
+token_frame_rate: 25
+token_mel_ratio: 2
+
+# stream related params
+chunk_size: 25 # streaming inference chunk size, in tokens
+num_decoding_left_chunks: 1 # streaming inference flow decoder left chunk size, <0 means use all left chunks

 # model params
 # for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from one single yaml.
@ -20,6 +26,7 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
    speech_token_size: 6561
    length_normalized_loss: True
    lsm_weight: 0
+    mix_ratio: [5, 15]
    llm: !new:cosyvoice.llm.llm.Qwen2Encoder
        pretrain_path: !ref <qwen_pretrain_path>
    sampling: !name:cosyvoice.utils.common.ras_sampling
@ -34,9 +41,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
    spk_embed_dim: !ref <spk_embed_dim>
    output_type: 'mel'
    vocab_size: 6561
-    input_frame_rate: 25
+    input_frame_rate: !ref <token_frame_rate>
    only_mask_loss: True
-    token_mel_ratio: 2
+    token_mel_ratio: !ref <token_mel_ratio>
    pre_lookahead_len: 3
    encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
        output_size: 512
@ -53,6 +60,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
        input_size: 512
        use_cnn_module: False
        macaron_style: False
+        static_chunk_size: !ref <chunk_size>
    decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
        in_channels: 240
        n_spks: 1
@ -65,10 +73,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
                training_cfg_rate: 0.2
                inference_cfg_rate: 0.7
                reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+        estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
            in_channels: 320
            out_channels: 80
-            causal: True
            channels: [256]
            dropout: 0.0
            attention_head_dim: 64
@ -76,6 +83,8 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
            num_mid_blocks: 12
            num_heads: 8
            act_fn: 'gelu'
+            static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
+            num_decoding_left_chunks: !ref <num_decoding_left_chunks>

 hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
    in_channels: 80
@ -101,6 +110,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
        in_channels: 80
        cond_channels: 512

+# gan related module
+mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1920
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 480
+    win_size: 1920
+    fmin: 0
+    fmax: null
+    center: False
+hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
+    generator: !ref <hift>
+    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
+        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
+        mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
+    mel_spec_transform: [
+        !ref <mel_spec_transform1>
+    ]
+
 # processor functions
 parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
 get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
@ -112,11 +140,13 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
    allowed_special: !ref <allowed_special>
 filter: !name:cosyvoice.dataset.processor.filter
    max_length: 40960
-    min_length: 0
+    min_length: 100
    token_max_length: 200
    token_min_length: 1
 resample: !name:cosyvoice.dataset.processor.resample
    resample_rate: !ref <sample_rate>
+truncate: !name:cosyvoice.dataset.processor.truncate
+    truncate_length: 24480 # must be a multiple of hop_size
 feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1920
    num_mels: 80
@ -128,6 +158,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
+compute_f0: !name:cosyvoice.dataset.processor.compute_f0
+    sample_rate: !ref <sample_rate>
+    hop_size: 480
 parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
    normalize: True
 shuffle: !name:cosyvoice.dataset.processor.shuffle
@ -138,3 +171,63 @@ batch: !name:cosyvoice.dataset.processor.batch
    batch_type: 'dynamic'
    max_frames_in_batch: 2000
 padding: !name:cosyvoice.dataset.processor.padding
+    use_spk_embedding: False # change to True during sft
+
+
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <compute_fbank>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+data_pipeline_gan: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <truncate>,
+    !ref <compute_fbank>,
+    !ref <compute_f0>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+
+# llm flow train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 1e-5 # already set to the sft value (1e-5)
+    scheduler: constantlr # already set to the sft scheduler (constantlr)
+    scheduler_conf:
+        warmup_steps: 2500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: -1
+
+# gan train conf
+train_conf_gan:
+    optim: adam
+    optim_conf:
+        lr: 0.0002 # use small lr for gan training
+    scheduler: constantlr
+    optim_d: adam
+    optim_conf_d:
+        lr: 0.0002 # use small lr for gan training
+    scheduler_d: constantlr
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 1 # in gan training, accum_grad must be 1
+    log_interval: 100
+    save_per_step: -1
--- a/flow.cache.pt
+++ b/flow.cache.pt
--- a/flow.decoder.estimator.fp16.a10.plan
+++ b/flow.decoder.estimator.fp16.a10.plan
--- a/flow.decoder.estimator.fp16.l20.plan
+++ b/flow.decoder.estimator.fp16.l20.plan
--- a/flow.decoder.estimator.fp16.v100.plan
+++ b/flow.decoder.estimator.fp16.v100.plan
--- a/flow.decoder.estimator.fp32.onnx
+++ b/flow.decoder.estimator.fp32.onnx
--- a/flow.encoder.fp16.zip
+++ b/flow.encoder.fp16.zip
--- a/flow.encoder.fp32.zip
+++ b/flow.encoder.fp32.zip