{
  "model_type": "spark-tts",
  "architectures": [
    "SparkTTSModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_spark_tts.SparkTTSConfig",
    "AutoModel": "modeling_spark_tts.SparkTTSModel",
    "AutoProcessor": "processing_spark_tts.SparkTTSProcessor"
  },
  "processor_class": "processing_spark_tts.SparkTTSProcessor",
  "llm_model_name_or_path": "./LLM",
  "bicodec_model_name_or_path": "./BiCodec",
  "wav2vec2_model_name_or_path": "./wav2vec2-large-xlsr-53",
  "sample_rate": 16000,
  "highpass_cutoff_freq": 40,
  "latent_hop_length": 320,
  "ref_segment_duration": 6.0,
  "volume_normalize": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.3",
  "_commit_hash": null,
  "bicodec_config": {
    "mel_params": {
      "sample_rate": 16000,
      "n_fft": 1024,
      "win_length": 640,
      "hop_length": 320,
      "mel_fmin": 10,
      "mel_fmax": null,
      "num_mels": 128
    },
    "encoder_config": {
      "input_channels": 1024,
      "vocos_dim": 384,
      "vocos_intermediate_dim": 2048,
      "vocos_num_layers": 12,
      "out_channels": 1024,
      "sample_ratios": [1, 1]
    },
    "decoder_config": {
      "input_channel": 1024,
      "channels": 1536,
      "rates": [8, 5, 4, 2],
      "kernel_sizes": [16, 11, 8, 4]
    },
    "quantizer_config": {
      "input_dim": 1024,
      "codebook_size": 8192,
      "codebook_dim": 8,
      "commitment": 0.25,
      "codebook_loss_weight": 2.0,
      "decay": 0.99,
      "threshold_ema_dead_code": 0.2
    },
    "speaker_encoder_config": {
      "input_dim": 128,
      "out_dim": 1024,
      "latent_dim": 128,
      "token_num": 32,
      "fsq_levels": [4, 4, 4, 4, 4, 4],
      "fsq_num_quantizers": 1
    },
    "prenet_config": {
      "input_channels": 1024,
      "vocos_dim": 384,
      "vocos_intermediate_dim": 2048,
      "vocos_num_layers": 12,
      "out_channels": 1024,
      "condition_dim": 1024,
      "sample_ratios": [1, 1],
      "use_tanh_at_final": false
    },
    "postnet_config": {
      "input_channels": 1024,
      "vocos_dim": 384,
      "vocos_intermediate_dim": 2048,
      "vocos_num_layers": 6,
      "out_channels": 1024,
      "use_tanh_at_final": false
    }
  }
}