mirror of
https://www.modelscope.cn/iic/CosyVoice2-0.5B.git
synced 2025-04-10 11:50:22 +08:00
update
This commit is contained in:
parent
f0fdade79d
commit
7989775dd0
9
.gitattributes
vendored
9
.gitattributes
vendored
@ -44,4 +44,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
flow.decoder.estimator.fp16.Volta.plan filter=lfs diff=lfs merge=lfs -text
|
||||
campplus.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
flow.decoder.estimator.fp32.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
speech_tokenizer_v2.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
flow.pt filter=lfs diff=lfs merge=lfs -text
|
||||
hift.pt filter=lfs diff=lfs merge=lfs -text
|
||||
llm.pt filter=lfs diff=lfs merge=lfs -text
|
||||
|
BIN
asset/dingding.png
Normal file
BIN
asset/dingding.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 94 KiB |
BIN
campplus.onnx
(Stored with Git LFS)
Normal file
BIN
campplus.onnx
(Stored with Git LFS)
Normal file
Binary file not shown.
140
cosyvoice.yaml
Normal file
140
cosyvoice.yaml
Normal file
@ -0,0 +1,140 @@
|
||||
# set random seed, so that you may reproduce your result.
|
||||
__set_seed1: !apply:random.seed [1986]
|
||||
__set_seed2: !apply:numpy.random.seed [1986]
|
||||
__set_seed3: !apply:torch.manual_seed [1986]
|
||||
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
|
||||
|
||||
# fixed params
|
||||
sample_rate: 24000
|
||||
llm_input_size: 896
|
||||
llm_output_size: 896
|
||||
spk_embed_dim: 192
|
||||
qwen_pretrain_path: /mnt/lyuxiang.lx/CosyVoice_github/pretrained_models/CosyVoice2-0.5B/Qwen2-0.5B-CosyVoice-BlankEN
|
||||
|
||||
# model params
|
||||
# for every class/function included in this repo, we use !<name> or !<new> for initialization, so that users can find all the corresponding classes/functions from this single yaml.
|
||||
# for system/third_party class/function, we do not require this.
|
||||
llm: !new:cosyvoice.llm.llm.Qwen2LM
|
||||
llm_input_size: !ref <llm_input_size>
|
||||
llm_output_size: !ref <llm_output_size>
|
||||
speech_token_size: 6561
|
||||
length_normalized_loss: True
|
||||
lsm_weight: 0
|
||||
llm: !new:cosyvoice.llm.llm.Qwen2Encoder
|
||||
pretrain_path: !ref <qwen_pretrain_path>
|
||||
sampling: !name:cosyvoice.utils.common.ras_sampling
|
||||
top_p: 0.8
|
||||
top_k: 25
|
||||
win_size: 10
|
||||
tau_r: 0.1
|
||||
|
||||
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
|
||||
input_size: 512
|
||||
output_size: 80
|
||||
spk_embed_dim: !ref <spk_embed_dim>
|
||||
output_type: 'mel'
|
||||
vocab_size: 6561
|
||||
input_frame_rate: 25
|
||||
only_mask_loss: True
|
||||
token_mel_ratio: 2
|
||||
pre_lookahead_len: 3
|
||||
encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
|
||||
output_size: 512
|
||||
attention_heads: 8
|
||||
linear_units: 2048
|
||||
num_blocks: 6
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
attention_dropout_rate: 0.1
|
||||
normalize_before: True
|
||||
input_layer: 'linear'
|
||||
pos_enc_layer_type: 'rel_pos_espnet'
|
||||
selfattention_layer_type: 'rel_selfattn'
|
||||
input_size: 512
|
||||
use_cnn_module: False
|
||||
macaron_style: False
|
||||
decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
|
||||
in_channels: 240
|
||||
n_spks: 1
|
||||
spk_emb_dim: 80
|
||||
cfm_params: !new:omegaconf.DictConfig
|
||||
content:
|
||||
sigma_min: 1e-06
|
||||
solver: 'euler'
|
||||
t_scheduler: 'cosine'
|
||||
training_cfg_rate: 0.2
|
||||
inference_cfg_rate: 0.7
|
||||
reg_loss_type: 'l1'
|
||||
estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
|
||||
in_channels: 320
|
||||
out_channels: 80
|
||||
causal: True
|
||||
channels: [256]
|
||||
dropout: 0.0
|
||||
attention_head_dim: 64
|
||||
n_blocks: 4
|
||||
num_mid_blocks: 12
|
||||
num_heads: 8
|
||||
act_fn: 'gelu'
|
||||
|
||||
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
|
||||
in_channels: 80
|
||||
base_channels: 512
|
||||
nb_harmonics: 8
|
||||
sampling_rate: !ref <sample_rate>
|
||||
nsf_alpha: 0.1
|
||||
nsf_sigma: 0.003
|
||||
nsf_voiced_threshold: 10
|
||||
upsample_rates: [8, 5, 3]
|
||||
upsample_kernel_sizes: [16, 11, 7]
|
||||
istft_params:
|
||||
n_fft: 16
|
||||
hop_len: 4
|
||||
resblock_kernel_sizes: [3, 7, 11]
|
||||
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
||||
source_resblock_kernel_sizes: [7, 7, 11]
|
||||
source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
||||
lrelu_slope: 0.1
|
||||
audio_limit: 0.99
|
||||
f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
|
||||
num_class: 1
|
||||
in_channels: 80
|
||||
cond_channels: 512
|
||||
|
||||
# processor functions
|
||||
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
||||
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
|
||||
token_path: !ref <qwen_pretrain_path>
|
||||
skip_special_tokens: True
|
||||
allowed_special: 'all'
|
||||
tokenize: !name:cosyvoice.dataset.processor.tokenize
|
||||
get_tokenizer: !ref <get_tokenizer>
|
||||
allowed_special: !ref <allowed_special>
|
||||
filter: !name:cosyvoice.dataset.processor.filter
|
||||
max_length: 40960
|
||||
min_length: 0
|
||||
token_max_length: 200
|
||||
token_min_length: 1
|
||||
resample: !name:cosyvoice.dataset.processor.resample
|
||||
resample_rate: !ref <sample_rate>
|
||||
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
|
||||
n_fft: 1920
|
||||
num_mels: 80
|
||||
sampling_rate: !ref <sample_rate>
|
||||
hop_size: 480
|
||||
win_size: 1920
|
||||
fmin: 0
|
||||
fmax: 8000
|
||||
center: False
|
||||
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
|
||||
feat_extractor: !ref <feat_extractor>
|
||||
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
|
||||
normalize: True
|
||||
shuffle: !name:cosyvoice.dataset.processor.shuffle
|
||||
shuffle_size: 1000
|
||||
sort: !name:cosyvoice.dataset.processor.sort
|
||||
sort_size: 500 # sort_size should be less than shuffle_size
|
||||
batch: !name:cosyvoice.dataset.processor.batch
|
||||
batch_type: 'dynamic'
|
||||
max_frames_in_batch: 2000
|
||||
padding: !name:cosyvoice.dataset.processor.padding
|
BIN
flow.decoder.estimator.fp16.Volta.plan
(Stored with Git LFS)
Normal file
BIN
flow.decoder.estimator.fp16.Volta.plan
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
flow.decoder.estimator.fp32.onnx
(Stored with Git LFS)
Normal file
BIN
flow.decoder.estimator.fp32.onnx
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
flow.encoder.fp32.zip
(Stored with Git LFS)
Normal file
BIN
flow.encoder.fp32.zip
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
speech_tokenizer_v2.onnx
(Stored with Git LFS)
Normal file
BIN
speech_tokenizer_v2.onnx
(Stored with Git LFS)
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user