add flow cache model

lyuxiang.lx 2025-04-07 21:48:19 +08:00
parent 9bd5b08fc0
commit f3bcc1af05
9 changed files with 109 additions and 21 deletions
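The headline addition is flow.cache.pt, presumably a cached flow model for the new streaming inference path. A minimal sketch for inspecting the file, assuming it is either a plain PyTorch checkpoint or a TorchScript archive (the commit itself does not say which):

import torch

path = "flow.cache.pt"
try:
    # If it is a plain checkpoint (e.g. a state dict or a dict of cached tensors),
    # torch.load can open it directly.
    obj = torch.load(path, map_location="cpu")
    print(type(obj))
    if isinstance(obj, dict):
        print(list(obj.keys())[:10])
except Exception:
    # Otherwise it may be a TorchScript archive, like the flow.encoder.*.zip exports.
    module = torch.jit.load(path, map_location="cpu")
    print(module)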

.gitattributes (1 change)

@@ -59,3 +59,4 @@ flow.decoder.estimator.fp16.l20.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.a10.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.v100.plan filter=lfs diff=lfs merge=lfs -text
 flow.encoder.fp16.zip filter=lfs diff=lfs merge=lfs -text
+flow.cache.pt filter=lfs diff=lfs merge=lfs -text

model config YAML

@@ -10,6 +10,12 @@ llm_input_size: 896
 llm_output_size: 896
 spk_embed_dim: 192
 qwen_pretrain_path: ''
+token_frame_rate: 25
+token_mel_ratio: 2
+# stream related params
+chunk_size: 25 # streaming inference chunk size, in token
+num_decoding_left_chunks: 1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
 # model params
 # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
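The new streaming parameters fix the chunk geometry: at token_frame_rate 25 and token_mel_ratio 2, a chunk_size of 25 tokens is one second of speech and 50 mel frames. A quick sanity check in Python, where sample_rate is assumed to be 24000 Hz (it only appears as <sample_rate> references in this file) and hop_size 480 is taken from the feat_extractor settings further down:

token_frame_rate = 25   # speech tokens per second
token_mel_ratio = 2     # mel frames per speech token
chunk_size = 25         # streaming chunk size, in tokens
hop_size = 480          # samples per mel frame (see feat_extractor below)
sample_rate = 24000     # assumed; the value is not shown in this diff

chunk_seconds = chunk_size / token_frame_rate      # 1.0 s of speech per chunk
chunk_mel_frames = chunk_size * token_mel_ratio    # 50 mel frames per chunk
chunk_samples = chunk_mel_frames * hop_size        # 24000 samples, i.e. 1.0 s at 24 kHz
print(chunk_seconds, chunk_mel_frames, chunk_samples)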
@@ -20,6 +26,7 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
     speech_token_size: 6561
     length_normalized_loss: True
     lsm_weight: 0
+    mix_ratio: [5, 15]
     llm: !new:cosyvoice.llm.llm.Qwen2Encoder
         pretrain_path: !ref <qwen_pretrain_path>
     sampling: !name:cosyvoice.utils.common.ras_sampling
@@ -34,9 +41,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
     spk_embed_dim: !ref <spk_embed_dim>
     output_type: 'mel'
     vocab_size: 6561
-    input_frame_rate: 25
+    input_frame_rate: !ref <token_frame_rate>
     only_mask_loss: True
-    token_mel_ratio: 2
+    token_mel_ratio: !ref <token_mel_ratio>
     pre_lookahead_len: 3
     encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
         output_size: 512
@@ -53,6 +60,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
         input_size: 512
         use_cnn_module: False
         macaron_style: False
+        static_chunk_size: !ref <chunk_size>
     decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
         in_channels: 240
         n_spks: 1
@@ -65,10 +73,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
                 training_cfg_rate: 0.2
                 inference_cfg_rate: 0.7
                 reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+        estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
             in_channels: 320
             out_channels: 80
-            causal: True
             channels: [256]
             dropout: 0.0
             attention_head_dim: 64
@@ -76,6 +83,8 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
             num_mid_blocks: 12
             num_heads: 8
             act_fn: 'gelu'
+            static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
+            num_decoding_left_chunks: !ref <num_decoding_left_chunks>
 hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
     in_channels: 80
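The estimator's static_chunk_size is written as an arithmetic !ref expression, so the causal decoder's chunk ends up as chunk_size * token_mel_ratio = 50 mel frames, matching the encoder's 25-token chunk after 2x upsampling. HyperPyYAML evaluates such expressions at load time; a small standalone sketch of that mechanism (not CosyVoice code, and assuming load_hyperpyyaml accepts a YAML string as in the HyperPyYAML examples):

from hyperpyyaml import load_hyperpyyaml

snippet = """
chunk_size: 25
token_mel_ratio: 2
static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
"""
conf = load_hyperpyyaml(snippet)
print(conf["static_chunk_size"])  # expected: 50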
@@ -101,6 +110,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
         in_channels: 80
         cond_channels: 512
+# gan related module
+mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1920
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 480
+    win_size: 1920
+    fmin: 0
+    fmax: null
+    center: False
+hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
+    generator: !ref <hift>
+    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
+        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
+        mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
+    mel_spec_transform: [
+        !ref <mel_spec_transform1>
+    ]
 # processor functions
 parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
 get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
@@ -112,11 +140,13 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
     allowed_special: !ref <allowed_special>
 filter: !name:cosyvoice.dataset.processor.filter
     max_length: 40960
-    min_length: 0
+    min_length: 100
     token_max_length: 200
     token_min_length: 1
 resample: !name:cosyvoice.dataset.processor.resample
     resample_rate: !ref <sample_rate>
+truncate: !name:cosyvoice.dataset.processor.truncate
+    truncate_length: 24480 # must be a multiplier of hop_size
 feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     n_fft: 1920
     num_mels: 80
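The comment on the new truncate processor asks for truncate_length to be a multiple of hop_size; 24480 = 51 * 480, roughly 1.02 s of audio at an assumed 24 kHz sample rate. A one-line check:

truncate_length, hop_size = 24480, 480
assert truncate_length % hop_size == 0, "truncate_length must be a multiple of hop_size"
print(truncate_length // hop_size)  # 51 mel/f0 frames per truncated segment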
@@ -128,6 +158,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
     feat_extractor: !ref <feat_extractor>
+compute_f0: !name:cosyvoice.dataset.processor.compute_f0
+    sample_rate: !ref <sample_rate>
+    hop_size: 480
 parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
     normalize: True
 shuffle: !name:cosyvoice.dataset.processor.shuffle
@@ -137,4 +170,64 @@ sort: !name:cosyvoice.dataset.processor.sort
 batch: !name:cosyvoice.dataset.processor.batch
     batch_type: 'dynamic'
     max_frames_in_batch: 2000
 padding: !name:cosyvoice.dataset.processor.padding
+    use_spk_embedding: False # change to True during sft
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <compute_fbank>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+data_pipeline_gan: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <truncate>,
+    !ref <compute_fbank>,
+    !ref <compute_f0>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# llm flow train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 1e-5 # change to 1e-5 during sft
+    scheduler: constantlr # change to constantlr during sft
+    scheduler_conf:
+        warmup_steps: 2500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: -1
+# gan train conf
+train_conf_gan:
+    optim: adam
+    optim_conf:
+        lr: 0.0002 # use small lr for gan training
+    scheduler: constantlr
+    optim_d: adam
+    optim_conf_d:
+        lr: 0.0002 # use small lr for gan training
+    scheduler_d: constantlr
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 1 # in gan training, accum_grad must be 1
+    log_interval: 100
+    save_per_step: -1
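For context, CosyVoice builds all of these modules by loading the YAML with HyperPyYAML, so the !new:/!name:/!ref entries above turn directly into Python objects. A hedged loading sketch; the model directory, the config file name, and the qwen_pretrain_path override below are assumptions for illustration, not something this diff shows:

import os
from hyperpyyaml import load_hyperpyyaml

model_dir = "pretrained_models/CosyVoice2-0.5B"            # assumed local model dir
config_path = os.path.join(model_dir, "cosyvoice2.yaml")   # assumed config file name

with open(config_path, "r") as f:
    # qwen_pretrain_path is '' in the yaml and is expected to be overridden;
    # the exact directory name used here is an assumption.
    configs = load_hyperpyyaml(
        f,
        overrides={"qwen_pretrain_path": os.path.join(model_dir, "CosyVoice-BlankEN")},
    )

flow = configs["flow"]    # CausalMaskedDiffWithXvec built from the !new: entry above
print(type(flow))

Loading this way instantiates the llm, flow, and hift modules, so it needs the cosyvoice package installed and the bundled model assets on disk.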

Binary files (stored with Git LFS; content not shown):

flow.cache.pt (new file)
flow.decoder.estimator.fp16.a10.plan
flow.decoder.estimator.fp16.l20.plan
flow.decoder.estimator.fp16.v100.plan
flow.decoder.estimator.fp32.onnx
flow.encoder.fp16.zip
flow.encoder.fp32.zip