add flow cache model

lyuxiang.lx 2025-04-07 21:48:19 +08:00
parent 9bd5b08fc0
commit f3bcc1af05
9 changed files with 109 additions and 21 deletions
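The headline addition is flow.cache.pt, presumably a cached flow model for the new streaming inference path. A minimal sketch for inspecting the file, assuming it is either a plain PyTorch checkpoint or a TorchScript archive (the commit itself does not say which):

import torch

path = "flow.cache.pt"
try:
    # If it is a plain checkpoint (e.g. a state dict or a dict of cached tensors),
    # torch.load can open it directly.
    obj = torch.load(path, map_location="cpu")
    print(type(obj))
    if isinstance(obj, dict):
        print(list(obj.keys())[:10])
except Exception:
    # Otherwise it may be a TorchScript archive, like the flow.encoder.*.zip exports.
    module = torch.jit.load(path, map_location="cpu")
    print(module)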

.gitattributes (1 change)

@@ -59,3 +59,4 @@ flow.decoder.estimator.fp16.l20.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.a10.plan filter=lfs diff=lfs merge=lfs -text
 flow.decoder.estimator.fp16.v100.plan filter=lfs diff=lfs merge=lfs -text
 flow.encoder.fp16.zip filter=lfs diff=lfs merge=lfs -text
+flow.cache.pt filter=lfs diff=lfs merge=lfs -text

model config YAML

@@ -10,6 +10,12 @@ llm_input_size: 896
 llm_output_size: 896
 spk_embed_dim: 192
 qwen_pretrain_path: ''
+token_frame_rate: 25
+token_mel_ratio: 2
+# stream related params
+chunk_size: 25 # streaming inference chunk size, in token
+num_decoding_left_chunks: 1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
 # model params
 # for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
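The new streaming parameters fix the chunk geometry: at token_frame_rate 25 and token_mel_ratio 2, a chunk_size of 25 tokens is one second of speech and 50 mel frames. A quick sanity check in Python, where sample_rate is assumed to be 24000 Hz (it only appears as <sample_rate> references in this file) and hop_size 480 is taken from the feat_extractor settings further down:

token_frame_rate = 25   # speech tokens per second
token_mel_ratio = 2     # mel frames per speech token
chunk_size = 25         # streaming chunk size, in tokens
hop_size = 480          # samples per mel frame (see feat_extractor below)
sample_rate = 24000     # assumed; the value is not shown in this diff

chunk_seconds = chunk_size / token_frame_rate      # 1.0 s of speech per chunk
chunk_mel_frames = chunk_size * token_mel_ratio    # 50 mel frames per chunk
chunk_samples = chunk_mel_frames * hop_size        # 24000 samples, i.e. 1.0 s at 24 kHz
print(chunk_seconds, chunk_mel_frames, chunk_samples)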
@@ -20,6 +26,7 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
     speech_token_size: 6561
     length_normalized_loss: True
     lsm_weight: 0
+    mix_ratio: [5, 15]
     llm: !new:cosyvoice.llm.llm.Qwen2Encoder
         pretrain_path: !ref <qwen_pretrain_path>
     sampling: !name:cosyvoice.utils.common.ras_sampling
@@ -34,9 +41,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
     spk_embed_dim: !ref <spk_embed_dim>
     output_type: 'mel'
     vocab_size: 6561
-    input_frame_rate: 25
+    input_frame_rate: !ref <token_frame_rate>
     only_mask_loss: True
-    token_mel_ratio: 2
+    token_mel_ratio: !ref <token_mel_ratio>
     pre_lookahead_len: 3
     encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
         output_size: 512
@@ -53,6 +60,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
         input_size: 512
         use_cnn_module: False
         macaron_style: False
+        static_chunk_size: !ref <chunk_size>
     decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
         in_channels: 240
         n_spks: 1
@@ -65,10 +73,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
                 training_cfg_rate: 0.2
                 inference_cfg_rate: 0.7
                 reg_loss_type: 'l1'
-        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+        estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
             in_channels: 320
             out_channels: 80
-            causal: True
             channels: [256]
             dropout: 0.0
             attention_head_dim: 64
@@ -76,6 +83,8 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
             num_mid_blocks: 12
             num_heads: 8
             act_fn: 'gelu'
+            static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
+            num_decoding_left_chunks: !ref <num_decoding_left_chunks>
 hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
     in_channels: 80
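The estimator's static_chunk_size is written as an arithmetic !ref expression, so the causal decoder's chunk ends up as chunk_size * token_mel_ratio = 50 mel frames, matching the encoder's 25-token chunk after 2x upsampling. HyperPyYAML evaluates such expressions at load time; a small standalone sketch of that mechanism (not CosyVoice code, and assuming load_hyperpyyaml accepts a YAML string as in the HyperPyYAML examples):

from hyperpyyaml import load_hyperpyyaml

snippet = """
chunk_size: 25
token_mel_ratio: 2
static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
"""
conf = load_hyperpyyaml(snippet)
print(conf["static_chunk_size"])  # expected: 50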
@@ -101,6 +110,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
         in_channels: 80
         cond_channels: 512
+# gan related module
+mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1920
+    num_mels: 80
+    sampling_rate: !ref <sample_rate>
+    hop_size: 480
+    win_size: 1920
+    fmin: 0
+    fmax: null
+    center: False
+hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
+    generator: !ref <hift>
+    discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
+        mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
+        mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
+    mel_spec_transform: [
+        !ref <mel_spec_transform1>
+    ]
 # processor functions
 parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
 get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
@@ -112,11 +140,13 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
     allowed_special: !ref <allowed_special>
 filter: !name:cosyvoice.dataset.processor.filter
     max_length: 40960
-    min_length: 0
+    min_length: 100
     token_max_length: 200
     token_min_length: 1
 resample: !name:cosyvoice.dataset.processor.resample
     resample_rate: !ref <sample_rate>
+truncate: !name:cosyvoice.dataset.processor.truncate
+    truncate_length: 24480 # must be a multiplier of hop_size
 feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     n_fft: 1920
     num_mels: 80
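The comment on the new truncate processor asks for truncate_length to be a multiple of hop_size; 24480 = 51 * 480, roughly 1.02 s of audio at an assumed 24 kHz sample rate. A one-line check:

truncate_length, hop_size = 24480, 480
assert truncate_length % hop_size == 0, "truncate_length must be a multiple of hop_size"
print(truncate_length // hop_size)  # 51 mel/f0 frames per truncated segment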
@@ -128,6 +158,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
     center: False
 compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
     feat_extractor: !ref <feat_extractor>
+compute_f0: !name:cosyvoice.dataset.processor.compute_f0
+    sample_rate: !ref <sample_rate>
+    hop_size: 480
 parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
     normalize: True
 shuffle: !name:cosyvoice.dataset.processor.shuffle
@@ -137,4 +170,64 @@ sort: !name:cosyvoice.dataset.processor.sort
 batch: !name:cosyvoice.dataset.processor.batch
     batch_type: 'dynamic'
     max_frames_in_batch: 2000
 padding: !name:cosyvoice.dataset.processor.padding
+    use_spk_embedding: False # change to True during sft
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <compute_fbank>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+data_pipeline_gan: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <filter>,
+    !ref <resample>,
+    !ref <truncate>,
+    !ref <compute_fbank>,
+    !ref <compute_f0>,
+    !ref <parse_embedding>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# llm flow train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 1e-5 # change to 1e-5 during sft
+    scheduler: constantlr # change to constantlr during sft
+    scheduler_conf:
+        warmup_steps: 2500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: -1
+# gan train conf
+train_conf_gan:
+    optim: adam
+    optim_conf:
+        lr: 0.0002 # use small lr for gan training
+    scheduler: constantlr
+    optim_d: adam
+    optim_conf_d:
+        lr: 0.0002 # use small lr for gan training
+    scheduler_d: constantlr
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 1 # in gan training, accum_grad must be 1
+    log_interval: 100
+    save_per_step: -1
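For context, CosyVoice builds all of these modules by loading the YAML with HyperPyYAML, so the !new:/!name:/!ref entries above turn directly into Python objects. A hedged loading sketch; the model directory, the config file name, and the qwen_pretrain_path override below are assumptions for illustration, not something this diff shows:

import os
from hyperpyyaml import load_hyperpyyaml

model_dir = "pretrained_models/CosyVoice2-0.5B"            # assumed local model dir
config_path = os.path.join(model_dir, "cosyvoice2.yaml")   # assumed config file name

with open(config_path, "r") as f:
    # qwen_pretrain_path is '' in the yaml and is expected to be overridden;
    # the exact directory name used here is an assumption.
    configs = load_hyperpyyaml(
        f,
        overrides={"qwen_pretrain_path": os.path.join(model_dir, "CosyVoice-BlankEN")},
    )

flow = configs["flow"]    # CausalMaskedDiffWithXvec built from the !new: entry above
print(type(flow))

Loading this way instantiates the llm, flow, and hift modules, so it needs the cosyvoice package installed and the bundled model assets on disk.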

Binary files (stored with Git LFS; content not shown):

flow.cache.pt (new file)
flow.decoder.estimator.fp16.a10.plan
flow.decoder.estimator.fp16.l20.plan
flow.decoder.estimator.fp16.v100.plan
flow.decoder.estimator.fp32.onnx
flow.encoder.fp16.zip
flow.encoder.fp32.zip