# SPDX-FileCopyrightText: 2024 Idiap Research Institute
# SPDX-FileContributor: Karl El Hajal
#
# SPDX-License-Identifier: MIT
import os
import zipfile

import gradio as gr
import spaces
from huggingface_hub import snapshot_download

from knn_tts.synthesizer import Synthesizer
from knn_tts.utils import get_vocoder_checkpoint_path
# If the target_feats directory does not exist yet, unpack it from target_feats.zip.
if not os.path.exists("target_feats"):
    if os.path.exists("target_feats.zip"):
        with zipfile.ZipFile("target_feats.zip", "r") as zip_ref:
            zip_ref.extractall(".")
    else:
        raise FileNotFoundError("target_feats.zip not found.")
SAMPLE_RATE = 16000
CHECKPOINTS_DIR = "./checkpoints"

# Download the pre-trained kNN-TTS checkpoints from the Hugging Face Hub and load the synthesizer.
tts_checkpoints_dir = snapshot_download(repo_id="idiap/kNN-TTS", local_dir=CHECKPOINTS_DIR)
vocoder_checkpoint_path = get_vocoder_checkpoint_path(CHECKPOINTS_DIR)
tts_checkpoint_name = "best_model_646135.pth"
synthesizer = Synthesizer(tts_checkpoints_dir, tts_checkpoint_name, vocoder_checkpoint_path, model_name="glowtts")
# Each target voice maps to a directory of pre-extracted WavLM features.
target_speakers = {
    "Libri 7127": {
        "feats_path": "target_feats/LibriSpeech-test-clean/7127/wavlm",
    },
    "Libri 7729": {
        "feats_path": "target_feats/LibriSpeech-test-clean/7729/wavlm",
    },
    "Libri 6829": {
        "feats_path": "target_feats/LibriSpeech-test-clean/6829/wavlm",
    },
    "Libri 8555": {
        "feats_path": "target_feats/LibriSpeech-test-clean/8555/wavlm",
    },
    "Thorsten Neutral": {
        "feats_path": "target_feats/Thorsten/neutral/wavlm/",
    },
    "Thorsten Whisper": {
        "feats_path": "target_feats/Thorsten/whisper/wavlm/",
    },
    "ESD 0018 Neutral": {
        "feats_path": "target_feats/ESD/0018/neutral/wavlm/",
    },
    "ESD 0018 Surprised": {
        "feats_path": "target_feats/ESD/0018/surprised/wavlm/",
    },
}
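
# Hypothetical example: to register another target voice, one would extract WavLM layer-6
# features for its utterances (see the Model Details tab) and add an entry pointing at
# that directory; the name and path below are placeholders, not part of this demo:
# target_speakers["My Voice"] = {"feats_path": "target_feats/MyVoice/wavlm"}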

@spaces.GPU
def run(text_input, target_speaker, lambda_rate, topk, weighted_average):
    """Synthesize the input text in the selected target voice; returns (sample_rate, waveform)."""
    feats_path = target_speakers[target_speaker]["feats_path"]
    wav = synthesizer(text_input, feats_path, interpolation_rate=lambda_rate, knnvc_topk=topk, weighted_average=weighted_average, max_target_num_files=500)
    return SAMPLE_RATE, wav.squeeze().cpu().numpy()
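
# A minimal sketch of calling the synthesizer directly, outside the Gradio handler
# (same call signature as in run() above; the text and voice are arbitrary examples):
# wav = synthesizer("Hello there.", target_speakers["Libri 7127"]["feats_path"],
#                   interpolation_rate=1.0, knnvc_topk=4, weighted_average=False, max_target_num_files=500)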

def get_title(text, size=1):
    """Wrap text in a centered HTML heading of the given size."""
    return f"""
    <center>
    <h{size}> {text} </h{size}>
    </center>
    """

def create_gradio_interface():
    """Build and return the Gradio demo interface."""
    with gr.Blocks(theme=gr.themes.Default(text_size="lg"), title="kNN-TTS") as iface:
        gr.HTML(get_title("kNN-TTS: kNN Retrieval for Simple and Effective Zero-Shot Multi-speaker Text-to-Speech", size=1))
        with gr.Tabs():
            with gr.TabItem("Generate Speech"):
                with gr.Row():
                    # Left column - inputs
                    with gr.Column():
                        gr.Markdown("## Input")
                        text_box = gr.Textbox(
                            lines=3,
                            placeholder="Enter the text to convert to speech...",
                            label="Text",
                            elem_id="text-input",
                        )
                        target_speaker_dropdown = gr.Dropdown(
                            choices=list(target_speakers.keys()),
                            value="Libri 7127",
                            label="Target Voice",
                            elem_id="target-voice",
                        )
                        rate_slider = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            value=1.0,
                            step=0.01,
                            label="Voice Morphing (λ)",
                            info="Higher values give more weight to target voice characteristics",
                        )
                        with gr.Accordion("Advanced Settings", open=False):
                            k_slider = gr.Slider(
                                minimum=1,
                                maximum=50,
                                value=4,
                                step=1,
                                label="Top-k Retrieval",
                                info="Number of closest neighbors to retrieve",
                            )
                            weighted_toggle = gr.Checkbox(
                                label="Use Weighted Averaging",
                                value=False,
                                info="Weight neighbors by similarity distance",
                            )
                        submit_button = gr.Button("Generate Audio", variant="primary", size="lg")
                    # Right column - outputs
                    with gr.Column():
                        gr.Markdown("## Generated Audio")
                        with gr.Group():
                            audio_output = gr.Audio(
                                type="numpy",
                                label="Output Speech",
                                elem_id="audio-output",
                            )
                        with gr.Row():
                            clear_btn = gr.ClearButton([text_box, target_speaker_dropdown, rate_slider, audio_output], variant="secondary", size="lg")
                # Example section
                with gr.Row():
                    gr.Examples(
                        examples=[
                            ["I think foosball is a combination of football and shish kebabs.", "Thorsten Whisper", 1.0, 8, True],
                            ["I think foosball is a combination of football and shish kebabs.", "Thorsten Neutral", 1.0, 4, False],
                            ["If you're traveling in the north country fair.", "Libri 7127", 1.0, 4, False],
                            ["Like a vision she dances across the porch as the radio plays.", "Libri 7729", 1.0, 8, True],
                            ["There weren't another way to be.", "Libri 6829", 1.0, 4, False],
                        ],
                        inputs=[text_box, target_speaker_dropdown, rate_slider, k_slider, weighted_toggle],
                        outputs=audio_output,
                        fn=run,
                        cache_examples=True,
                    )
            # Additional tabs
            with gr.TabItem("Model Details"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("""
                        ## kNN-TTS Technical Details

                        kNN-TTS uses self-supervised learning (SSL) features and kNN retrieval to achieve robust zero-shot multi-speaker TTS.

                        ### Key Components

                        1. **Feature Extraction**: We extract discrete representations from target speaker speech using a pre-trained SSL encoder. We use the 6th layer of WavLM Large.
                        2. **Text-to-SSL**: We train a lightweight TTS model to predict the same representations from text. For simplicity, we train it on a single-speaker dataset.
                        3. **Retrieval Mechanism**: For each unit in the generated features, we use kNN to retrieve its closest matches from the target voice's unit database (see the sketch below).
                        4. **Voice Morphing**: By linearly interpolating between the source features and the retrieved target speaker features, we can morph the two voices. The interpolation parameter λ controls the balance between source and target characteristics.
                        5. **Vocoder**: We use a pre-trained vocoder to convert the resulting features into a waveform.
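
                        As a rough illustration, steps 3 and 4 amount to a nearest-neighbor lookup followed by linear interpolation. Below is a minimal NumPy sketch, not the actual `knn_tts` implementation; the array names, shapes, and the use of Euclidean distance are simplifying assumptions.

                        ```python
                        import numpy as np

                        def knn_morph(source_feats, target_feats, k=4, lam=1.0):
                            # source_feats: (T, D) SSL frames predicted by the Text-to-SSL model
                            # target_feats: (N, D) SSL frames pre-extracted from the target speaker
                            dists = np.linalg.norm(source_feats[:, None, :] - target_feats[None, :, :], axis=-1)
                            idx = np.argsort(dists, axis=1)[:, :k]    # k closest target frames per source frame
                            matched = target_feats[idx].mean(axis=1)  # plain (unweighted) average of the neighbors
                            return source_feats + lam * (matched - source_feats)  # lam=0 keeps the source, lam=1 is fully matched
                        ```

                        The demo's *Use Weighted Averaging* option corresponds to replacing the plain mean above with an average weighted by similarity distance.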

                        ### Performance

                        Our simple and efficient model achieves results comparable to state-of-the-art models while being trained on 100 to 1000× less transcribed data, which makes this framework particularly well-suited for low-resource domains.

                        For more details, please refer to [our paper](https://arxiv.org/abs/2408.10771).
                        """)
                    with gr.Column():
                        gr.Image("assets/diagram.png", label="Model Architecture", scale=0.3, show_label=False, show_download_button=False, show_fullscreen_button=False)
            with gr.TabItem("About"):
                gr.Markdown("""
                ## About the Project

                This demo showcases kNN-TTS, a lightweight zero-shot text-to-speech synthesis model.

                ### Authors

                - Karl El Hajal
                - Ajinkya Kulkarni
                - Enno Hermann
                - Mathew Magimai.-Doss

                ### Citation

                If you use kNN-TTS in your research, please cite our paper:

                ```
                @misc{hajal2025knntts,
                    title={kNN Retrieval for Simple and Effective Zero-Shot Multi-speaker Text-to-Speech},
                    author={Karl El Hajal and Ajinkya Kulkarni and Enno Hermann and Mathew Magimai.-Doss},
                    year={2025},
                    eprint={2408.10771},
                    archivePrefix={arXiv},
                    primaryClass={eess.AS},
                    url={https://arxiv.org/abs/2408.10771},
                }
                ```

                ### Acknowledgments

                The target voices featured in this demo were sourced from the following datasets:

                - [Thorsten Dataset](https://www.thorsten-voice.de/)
                - [LibriSpeech Dataset](https://www.openslr.org/12)
                - [Emotional Speech Dataset (ESD)](https://hltsingapore.github.io/ESD/)

                ### License

                This project is licensed under the MIT License.
                """)

        # Event handlers
        submit_button.click(
            fn=run,
            inputs=[text_box, target_speaker_dropdown, rate_slider, k_slider, weighted_toggle],
            outputs=[audio_output],
        )

    return iface
demo = create_gradio_interface()
demo.launch(share=True, debug=False)