# Audio tokenizer configuration.
#
# NOTE(review): this file was recovered from a flattened copy in which every
# line carried a trailing "| |" artifact and all indentation had been lost.
# The nesting below is reconstructed from the value-less section keys
# (mel_params, encoder, decoder, ...); confirm against the code that loads it.
audio_tokenizer:
  # Parameters grouped under "mel_params".
  # NOTE(review): presumably consumed by a mel-spectrogram front end, with
  # lengths expressed in samples — confirm against the loader.
  mel_params:
    sample_rate: 16000
    n_fft: 1024
    win_length: 640
    hop_length: 320
    mel_fmin: 10
    mel_fmax: null  # explicit null; how the consumer interprets it is not visible here
    num_mels: 128

  encoder:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    sample_ratios: [1,1]

  decoder:
    # NOTE(review): the key here is singular "input_channel", unlike the
    # plural "input_channels" used by encoder/prenet/postnet. Left unchanged
    # because the consuming code looks keys up by name.
    input_channel: 1024
    channels: 1536
    # The product of these rates (8 * 5 * 4 * 2 = 320) equals
    # mel_params.hop_length above. NOTE(review): presumably intentional —
    # confirm before changing either value.
    rates: [8, 5, 4, 2]
    kernel_sizes: [16,11,8,4]

  quantizer:
    input_dim: 1024
    codebook_size: 8192
    codebook_dim: 8
    commitment: 0.25
    codebook_loss_weight: 2.0
    # NOTE(review): key spelling ("normlize") is kept exactly as found;
    # renaming it would also require changing whatever reads this key.
    use_l2_normlize: True
    threshold_ema_dead_code: 0.2

  speaker_encoder:
    input_dim: 128  # same value as mel_params.num_mels
    out_dim: 1024
    latent_dim: 128
    token_num: 32
    fsq_levels: [4, 4, 4, 4, 4, 4]
    fsq_num_quantizers: 1

  prenet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    condition_dim: 1024
    sample_ratios: [1,1]
    use_tanh_at_final: False

  postnet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 6
    out_channels: 1024
    use_tanh_at_final: False