|
|
|
|
|
|
|
|
|
|
|
import os |
|
import zipfile |
|
import gradio as gr |
|
import spaces |
|
from huggingface_hub import snapshot_download |
|
|
|
from knn_tts.synthesizer import Synthesizer |
|
from knn_tts.utils import get_vocoder_checkpoint_path |
|
|
|
|
|
# Make sure the target speaker feature files are present locally,
# unpacking the bundled archive on first run.
if not os.path.exists("target_feats"):
    if not os.path.exists("target_feats.zip"):
        raise FileNotFoundError("target_feats.zip not found.")
    with zipfile.ZipFile("target_feats.zip", "r") as archive:
        archive.extractall(".")
|
|
|
# Sample rate (Hz) of the synthesized audio handed back to Gradio (see `run`).
SAMPLE_RATE = 16000

# Local directory into which model checkpoints are downloaded.
CHECKPOINTS_DIR = "./checkpoints"

# Fetch the kNN-TTS checkpoints from the Hugging Face Hub and locate the
# vocoder checkpoint inside the downloaded tree.
tts_checkpoints_dir = snapshot_download(repo_id="idiap/kNN-TTS", local_dir=CHECKPOINTS_DIR)
vocoder_checkpoint_path = get_vocoder_checkpoint_path(CHECKPOINTS_DIR)

# Build the synthesizer (model_name="glowtts") from the downloaded
# checkpoints; used by `run` below.
tts_checkpoint_name = "best_model_646135.pth"
synthesizer = Synthesizer(tts_checkpoints_dir, tts_checkpoint_name, vocoder_checkpoint_path, model_name="glowtts")
|
|
|
# Directories of pre-extracted WavLM features for each selectable target
# voice, keyed by the label shown in the UI dropdown.
_SPEAKER_FEATURE_DIRS = (
    ("Libri 7127", "target_feats/LibriSpeech-test-clean/7127/wavlm"),
    ("Libri 7729", "target_feats/LibriSpeech-test-clean/7729/wavlm"),
    ("Libri 6829", "target_feats/LibriSpeech-test-clean/6829/wavlm"),
    ("Libri 8555", "target_feats/LibriSpeech-test-clean/8555/wavlm"),
    ("Thorsten Neutral", "target_feats/Thorsten/neutral/wavlm/"),
    ("Thorsten Whisper", "target_feats/Thorsten/whisper/wavlm/"),
    ("ESD 0018 Neutral", "target_feats/ESD/0018/neutral/wavlm/"),
    ("ESD 0018 Surprised", "target_feats/ESD/0018/surprised/wavlm/"),
)

target_speakers = {label: {"feats_path": path} for label, path in _SPEAKER_FEATURE_DIRS}
|
|
|
@spaces.GPU
def run(text_input, target_speaker, lambda_rate, topk, weighted_average):
    """Synthesize speech for *text_input* in the selected target voice.

    Args:
        text_input: Text to synthesize.
        target_speaker: Key into the module-level ``target_speakers`` dict.
        lambda_rate: Interpolation rate between source and target features.
        topk: Number of nearest neighbors used for kNN retrieval.
        weighted_average: Whether neighbors are averaged by similarity.

    Returns:
        A ``(sample_rate, samples)`` tuple consumable by ``gr.Audio``.
    """
    feats_dir = target_speakers[target_speaker]["feats_path"]
    audio = synthesizer(
        text_input,
        feats_dir,
        interpolation_rate=lambda_rate,
        knnvc_topk=topk,
        weighted_average=weighted_average,
        max_target_num_files=500,
    )
    return SAMPLE_RATE, audio.squeeze().cpu().numpy()
|
|
|
|
|
def get_title(text, size=1):
    """Return HTML markup rendering *text* as a centered heading.

    Args:
        text: Heading text to display.
        size: HTML heading level (1-6); defaults to 1.
    """
    heading = f"<h{size}> {text} </h{size}>"
    return f"""
<center>

{heading}

</center>
"""
|
|
|
def create_gradio_interface():
    """Assemble the Gradio Blocks UI for the kNN-TTS demo.

    The app has three tabs: "Generate Speech" (text input, target-voice
    dropdown, morphing/retrieval controls, audio output, cached examples),
    "Model Details", and "About".

    Returns:
        The assembled ``gr.Blocks`` application.
    """
    with gr.Blocks(
        theme=gr.themes.Default(
            text_size="lg",
        ),
        title="kNN-TTS"
    ) as iface:

        # Page title banner.
        gr.HTML(get_title("kNN-TTS: kNN Retrieval for Simple and Effective Zero-Shot Multi-speaker Text-to-Speech", size=1))

        with gr.Tabs():
            with gr.TabItem("Generate Speech"):
                with gr.Row():

                    # Left column: synthesis inputs and controls.
                    with gr.Column():
                        gr.Markdown("## Input")
                        text_box = gr.Textbox(
                            lines=3,
                            placeholder="Enter the text to convert to speech...",
                            label="Text",
                            elem_id="text-input"
                        )

                        target_speaker_dropdown = gr.Dropdown(
                            choices=list(target_speakers.keys()),
                            value="Libri 7127",
                            label="Target Voice",
                            elem_id="target-voice"
                        )

                        # λ controls interpolation between source and target
                        # voice features (fed to `run` as lambda_rate).
                        rate_slider = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            value=1.0,
                            step=0.01,
                            label="Voice Morphing (λ)",
                            info="Higher values give more weight to target voice characteristics"
                        )

                        # kNN retrieval parameters, hidden by default.
                        with gr.Accordion("Advanced Settings", open=False):
                            k_slider = gr.Slider(
                                minimum=1,
                                maximum=50,
                                value=4,
                                step=1,
                                label="Top-k Retrieval",
                                info="k closest neighbors to retrieve"
                            )
                            weighted_toggle = gr.Checkbox(
                                label="Use Weighted Averaging",
                                value=False,
                                info="Weight neighbors by similarity distance"
                            )

                        submit_button = gr.Button("Generate Audio", variant="primary", size="lg")


                    # Right column: generated audio playback and reset button.
                    with gr.Column():
                        gr.Markdown("## Generated Audio")
                        with gr.Group():
                            audio_output = gr.Audio(
                                type="numpy",
                                label="Output Speech",
                                elem_id="audio-output"
                            )
                        with gr.Row():
                            clear_btn = gr.ClearButton([text_box, target_speaker_dropdown, rate_slider, audio_output], variant="secondary", size="lg")


                # Pre-cached example prompts exercising several target voices.
                with gr.Row():
                    gr.Examples(
                        examples=[
                            ["I think foosball is a combination of football and shish kebabs.", "Thorsten Whisper", 1.0, 8, True],
                            ["I think foosball is a combination of football and shish kebabs.", "Thorsten Neutral", 1.0, 4, False],
                            ["If you're traveling in the north country fair.", "Libri 7127", 1.0, 4, False],
                            ["Like a vision she dances across the porch as the radio plays.", "Libri 7729", 1.0, 8, True],
                            ["There weren't another other way to be.", "Libri 6829", 1.0, 4, False],
                        ],
                        inputs=[text_box, target_speaker_dropdown, rate_slider, k_slider, weighted_toggle],
                        outputs=audio_output,
                        fn=run,
                        cache_examples=True
                    )


            with gr.TabItem("Model Details"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("""
                        ## kNN-TTS Technical Details

                        kNN-TTS uses self-supervised learning (SSL) features and kNN retrieval to achieve robust zero-shot multi-speaker TTS.

                        ### Key Components

                        1. **Feature Extraction**: We extract discrete representations from target speaker speech using a pre-trained SSL encoder. We use the 6th layer of WavLM Large.
                        2. **Text-to-SSL**: We train a lightweight TTS model to predict the same representations from Text. For simplicity, we train on a single speaker dataset.
                        3. **Retrieval Mechanism**: We use kNN to find for each unit in the generated features its closest matches in the target voice unit database
                        4. **Voice Morphing**: By linearly interpolating the source and selected target speaker features, we can morph the two voices. The interpolation parameter λ controls the balance between source and target characteristics
                        5. **Vocoder**: We use a pre-trained vocoder to convert the converted features to waveform.

                        ### Performance

                        Our simple and efficient model achieves comparable results to sota models while being trained on 100 to 1000× less transcribed data.
                        This framework is therefore particularly well-suited for low-resource domains.

                        For more details, please refer to our paper (https://arxiv.org/abs/2408.10771).
                        """)
                    with gr.Column():
                        gr.Image("assets/diagram.png", label="Model Architecture", scale=0.3, show_label=False, show_download_button=False, show_fullscreen_button=False)

            with gr.TabItem("About"):
                gr.Markdown("""
                ## About the Project

                This demo showcases kNN-TTS, a lightweight zero-shot text-to-speech synthesis model.

                ### Authors

                - Karl El Hajal
                - Ajinkya Kulkarni
                - Enno Hermann
                - Mathew Magimai.-Doss

                ### Citation

                If you use kNN-TTS in your research, please cite our paper:

                ```
                @misc{hajal2025knntts,
                      title={kNN Retrieval for Simple and Effective Zero-Shot Multi-speaker Text-to-Speech},
                      author={Karl El Hajal and Ajinkya Kulkarni and Enno Hermann and Mathew Magimai.-Doss},
                      year={2025},
                      eprint={2408.10771},
                      archivePrefix={arXiv},
                      primaryClass={eess.AS},
                      url={https://arxiv.org/abs/2408.10771},
                }
                ```

                ### Acknowledgments

                The target voices featured in this demo were sourced from the following datasets:

                - [Thorsten Dataset](https://www.thorsten-voice.de/)
                - [LibriSpeech Dataset](https://www.openslr.org/12)
                - [Emotional Speech Dataset (ESD)](https://hltsingapore.github.io/ESD/)

                ### License

                This project is licensed under the MIT License.
                """)

        # Wire the generate button to the synthesis function.
        submit_button.click(
            fn=run,
            inputs=[text_box, target_speaker_dropdown, rate_slider, k_slider, weighted_toggle],
            outputs=[audio_output]
        )

    return iface
|
|
|
# Build the app at import time so hosting platforms (e.g. Hugging Face
# Spaces) can discover the module-level `demo` object.
demo = create_gradio_interface()

if __name__ == "__main__":
    # Only start the server when executed as a script, not on import —
    # launching was previously an unconditional import side effect.
    demo.launch(share=True, debug=False)