Vi-SparkTTS-0.5B / config.json
ancv's picture
Update config.json
b6c0b73 verified
{
"model_type": "spark-tts",
"architectures": [
"SparkTTSModel"
],
"auto_map": {
"AutoConfig": "configuration_spark_tts.SparkTTSConfig",
"AutoModel": "modeling_spark_tts.SparkTTSModel",
"AutoProcessor": "processing_spark_tts.SparkTTSProcessor"
},
"processor_class": "processing_spark_tts.SparkTTSProcessor",
"llm_model_name_or_path": "./LLM",
"bicodec_model_name_or_path": "./BiCodec",
"wav2vec2_model_name_or_path": "./wav2vec2-large-xlsr-53",
"sample_rate": 16000,
"highpass_cutoff_freq": 40,
"latent_hop_length": 320,
"ref_segment_duration": 6.0,
"volume_normalize": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.50.3",
"_commit_hash": null,
"bicodec_config": {
"mel_params": {
"sample_rate": 16000,
"n_fft": 1024,
"win_length": 640,
"hop_length": 320,
"mel_fmin": 10,
"mel_fmax": null,
"num_mels": 128
},
"encoder_config": {
"input_channels": 1024,
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12,
"out_channels": 1024,
"sample_ratios": [1, 1]
},
"decoder_config": {
"input_channel": 1024,
"channels": 1536,
"rates": [8, 5, 4, 2],
"kernel_sizes": [16, 11, 8, 4]
},
"quantizer_config": {
"input_dim": 1024,
"codebook_size": 8192,
"codebook_dim": 8,
"commitment": 0.25,
"codebook_loss_weight": 2.0,
"decay": 0.99,
"threshold_ema_dead_code": 0.2
},
"speaker_encoder_config": {
"input_dim": 128,
"out_dim": 1024,
"latent_dim": 128,
"token_num": 32,
"fsq_levels": [4, 4, 4, 4, 4, 4],
"fsq_num_quantizers": 1
},
"prenet_config": {
"input_channels": 1024,
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12,
"out_channels": 1024,
"condition_dim": 1024,
"sample_ratios": [1, 1],
"use_tanh_at_final": false
},
"postnet_config": {
"input_channels": 1024,
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 6,
"out_channels": 1024,
"use_tanh_at_final": false
}
}
}