This commit is contained in:
lyuxiang.lx 2024-12-12 19:06:17 +08:00
parent f0fdade79d
commit 7989775dd0
11 changed files with 172 additions and 1 deletions

9
.gitattributes vendored
View File

@ -44,4 +44,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
flow.decoder.estimator.fp16.Volta.plan filter=lfs diff=lfs merge=lfs -text
campplus.onnx filter=lfs diff=lfs merge=lfs -text
flow.decoder.estimator.fp32.onnx filter=lfs diff=lfs merge=lfs -text
speech_tokenizer_v2.onnx filter=lfs diff=lfs merge=lfs -text
flow.pt filter=lfs diff=lfs merge=lfs -text
hift.pt filter=lfs diff=lfs merge=lfs -text
llm.pt filter=lfs diff=lfs merge=lfs -text

BIN
asset/dingding.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 94 KiB

BIN
campplus.onnx (Stored with Git LFS) Normal file

Binary file not shown.

140
cosyvoice.yaml Normal file
View File

@ -0,0 +1,140 @@
# set random seed, so that you may reproduce your result.
__set_seed1: !apply:random.seed [1986]
__set_seed2: !apply:numpy.random.seed [1986]
__set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
# fixed params
sample_rate: 24000
llm_input_size: 896
llm_output_size: 896
spk_embed_dim: 192
qwen_pretrain_path: /mnt/lyuxiang.lx/CosyVoice_github/pretrained_models/CosyVoice2-0.5B/Qwen2-0.5B-CosyVoice-BlankEN
# model params
# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
# for system/third_party class/function, we do not require this.
llm: !new:cosyvoice.llm.llm.Qwen2LM
llm_input_size: !ref <llm_input_size>
llm_output_size: !ref <llm_output_size>
speech_token_size: 6561
length_normalized_loss: True
lsm_weight: 0
llm: !new:cosyvoice.llm.llm.Qwen2Encoder
pretrain_path: !ref <qwen_pretrain_path>
sampling: !name:cosyvoice.utils.common.ras_sampling
top_p: 0.8
top_k: 25
win_size: 10
tau_r: 0.1
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
input_size: 512
output_size: 80
spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel'
vocab_size: 6561
input_frame_rate: 25
only_mask_loss: True
token_mel_ratio: 2
pre_lookahead_len: 3
encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
output_size: 512
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
normalize_before: True
input_layer: 'linear'
pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn'
input_size: 512
use_cnn_module: False
macaron_style: False
decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
in_channels: 240
n_spks: 1
spk_emb_dim: 80
cfm_params: !new:omegaconf.DictConfig
content:
sigma_min: 1e-06
solver: 'euler'
t_scheduler: 'cosine'
training_cfg_rate: 0.2
inference_cfg_rate: 0.7
reg_loss_type: 'l1'
estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
in_channels: 320
out_channels: 80
causal: True
channels: [256]
dropout: 0.0
attention_head_dim: 64
n_blocks: 4
num_mid_blocks: 12
num_heads: 8
act_fn: 'gelu'
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
in_channels: 80
base_channels: 512
nb_harmonics: 8
sampling_rate: !ref <sample_rate>
nsf_alpha: 0.1
nsf_sigma: 0.003
nsf_voiced_threshold: 10
upsample_rates: [8, 5, 3]
upsample_kernel_sizes: [16, 11, 7]
istft_params:
n_fft: 16
hop_len: 4
resblock_kernel_sizes: [3, 7, 11]
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
source_resblock_kernel_sizes: [7, 7, 11]
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
lrelu_slope: 0.1
audio_limit: 0.99
f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
num_class: 1
in_channels: 80
cond_channels: 512
# processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
token_path: !ref <qwen_pretrain_path>
skip_special_tokens: True
allowed_special: 'all'
tokenize: !name:cosyvoice.dataset.processor.tokenize
get_tokenizer: !ref <get_tokenizer>
allowed_special: !ref <allowed_special>
filter: !name:cosyvoice.dataset.processor.filter
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample: !name:cosyvoice.dataset.processor.resample
resample_rate: !ref <sample_rate>
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
n_fft: 1920
num_mels: 80
sampling_rate: !ref <sample_rate>
hop_size: 480
win_size: 1920
fmin: 0
fmax: 8000
center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
feat_extractor: !ref <feat_extractor>
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
normalize: True
shuffle: !name:cosyvoice.dataset.processor.shuffle
shuffle_size: 1000
sort: !name:cosyvoice.dataset.processor.sort
sort_size: 500 # sort_size should be less than shuffle_size
batch: !name:cosyvoice.dataset.processor.batch
batch_type: 'dynamic'
max_frames_in_batch: 2000
padding: !name:cosyvoice.dataset.processor.padding

BIN
flow.decoder.estimator.fp16.Volta.plan (Stored with Git LFS) Normal file

Binary file not shown.

BIN
flow.decoder.estimator.fp32.onnx (Stored with Git LFS) Normal file

Binary file not shown.

BIN
flow.encoder.fp32.zip (Stored with Git LFS) Normal file

Binary file not shown.

BIN
flow.pt (Stored with Git LFS) Normal file

Binary file not shown.

BIN
hift.pt (Stored with Git LFS) Normal file

Binary file not shown.

BIN
llm.pt (Stored with Git LFS) Normal file

Binary file not shown.

BIN
speech_tokenizer_v2.onnx (Stored with Git LFS) Normal file

Binary file not shown.