add flow cache model

This commit is contained in:
lyuxiang.lx 2025-04-07 21:48:19 +08:00
parent 9bd5b08fc0
commit f3bcc1af05
9 changed files with 109 additions and 21 deletions

1
.gitattributes vendored
View File

@ -59,3 +59,4 @@ flow.decoder.estimator.fp16.l20.plan filter=lfs diff=lfs merge=lfs -text
flow.decoder.estimator.fp16.a10.plan filter=lfs diff=lfs merge=lfs -text
flow.decoder.estimator.fp16.v100.plan filter=lfs diff=lfs merge=lfs -text
flow.encoder.fp16.zip filter=lfs diff=lfs merge=lfs -text
flow.cache.pt filter=lfs diff=lfs merge=lfs -text

View File

@ -10,6 +10,12 @@ llm_input_size: 896
llm_output_size: 896
spk_embed_dim: 192
qwen_pretrain_path: ''
token_frame_rate: 25
token_mel_ratio: 2
# stream related params
chunk_size: 25 # streaming inference chunk size, in token
num_decoding_left_chunks: 1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
# model params
# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
@ -20,6 +26,7 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
speech_token_size: 6561
length_normalized_loss: True
lsm_weight: 0
mix_ratio: [5, 15]
llm: !new:cosyvoice.llm.llm.Qwen2Encoder
pretrain_path: !ref <qwen_pretrain_path>
sampling: !name:cosyvoice.utils.common.ras_sampling
@ -34,9 +41,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel'
vocab_size: 6561
input_frame_rate: 25
input_frame_rate: !ref <token_frame_rate>
only_mask_loss: True
token_mel_ratio: 2
token_mel_ratio: !ref <token_mel_ratio>
pre_lookahead_len: 3
encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
output_size: 512
@ -53,6 +60,7 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
input_size: 512
use_cnn_module: False
macaron_style: False
static_chunk_size: !ref <chunk_size>
decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
in_channels: 240
n_spks: 1
@ -65,10 +73,9 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
training_cfg_rate: 0.2
inference_cfg_rate: 0.7
reg_loss_type: 'l1'
estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
in_channels: 320
out_channels: 80
causal: True
channels: [256]
dropout: 0.0
attention_head_dim: 64
@ -76,6 +83,8 @@ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
num_mid_blocks: 12
num_heads: 8
act_fn: 'gelu'
static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
num_decoding_left_chunks: !ref <num_decoding_left_chunks>
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
in_channels: 80
@ -101,6 +110,25 @@ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
in_channels: 80
cond_channels: 512
# gan related module
mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
n_fft: 1920
num_mels: 80
sampling_rate: !ref <sample_rate>
hop_size: 480
win_size: 1920
fmin: 0
fmax: null
center: False
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
generator: !ref <hift>
discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
mel_spec_transform: [
!ref <mel_spec_transform1>
]
# processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
@ -112,11 +140,13 @@ tokenize: !name:cosyvoice.dataset.processor.tokenize
allowed_special: !ref <allowed_special>
filter: !name:cosyvoice.dataset.processor.filter
max_length: 40960
min_length: 0
min_length: 100
token_max_length: 200
token_min_length: 1
resample: !name:cosyvoice.dataset.processor.resample
resample_rate: !ref <sample_rate>
truncate: !name:cosyvoice.dataset.processor.truncate
truncate_length: 24480 # must be a multiplier of hop_size
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
n_fft: 1920
num_mels: 80
@ -128,6 +158,9 @@ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
feat_extractor: !ref <feat_extractor>
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
sample_rate: !ref <sample_rate>
hop_size: 480
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
normalize: True
shuffle: !name:cosyvoice.dataset.processor.shuffle
@ -138,3 +171,63 @@ batch: !name:cosyvoice.dataset.processor.batch
batch_type: 'dynamic'
max_frames_in_batch: 2000
padding: !name:cosyvoice.dataset.processor.padding
use_spk_embedding: False # change to True during sft
# dataset processor pipeline
data_pipeline: [
!ref <parquet_opener>,
!ref <tokenize>,
!ref <filter>,
!ref <resample>,
!ref <compute_fbank>,
!ref <parse_embedding>,
!ref <shuffle>,
!ref <sort>,
!ref <batch>,
!ref <padding>,
]
data_pipeline_gan: [
!ref <parquet_opener>,
!ref <tokenize>,
!ref <filter>,
!ref <resample>,
!ref <truncate>,
!ref <compute_fbank>,
!ref <compute_f0>,
!ref <parse_embedding>,
!ref <shuffle>,
!ref <sort>,
!ref <batch>,
!ref <padding>,
]
# llm flow train conf
train_conf:
optim: adam
optim_conf:
lr: 1e-5 # change to 1e-5 during sft
scheduler: constantlr # change to constantlr during sft
scheduler_conf:
warmup_steps: 2500
max_epoch: 200
grad_clip: 5
accum_grad: 2
log_interval: 100
save_per_step: -1
# gan train conf
train_conf_gan:
optim: adam
optim_conf:
lr: 0.0002 # use small lr for gan training
scheduler: constantlr
optim_d: adam
optim_conf_d:
lr: 0.0002 # use small lr for gan training
scheduler_d: constantlr
max_epoch: 200
grad_clip: 5
accum_grad: 1 # in gan training, accum_grad must be 1
log_interval: 100
save_per_step: -1

BIN
flow.cache.pt (Stored with Git LFS) Normal file

Binary file not shown.

BIN
flow.decoder.estimator.fp16.a10.plan (Stored with Git LFS)

Binary file not shown.

BIN
flow.decoder.estimator.fp16.l20.plan (Stored with Git LFS)

Binary file not shown.

BIN
flow.decoder.estimator.fp16.v100.plan (Stored with Git LFS)

Binary file not shown.

BIN
flow.decoder.estimator.fp32.onnx (Stored with Git LFS)

Binary file not shown.

BIN
flow.encoder.fp16.zip (Stored with Git LFS)

Binary file not shown.

BIN
flow.encoder.fp32.zip (Stored with Git LFS)

Binary file not shown.