# SPDX-FileCopyrightText: 2024 Idiap Research Institute
# SPDX-FileContributor: Karl El Hajal
#
# SPDX-License-Identifier: MIT

import os
import zipfile

import gradio as gr
import spaces
from huggingface_hub import snapshot_download

from knn_tts.synthesizer import Synthesizer
from knn_tts.utils import get_vocoder_checkpoint_path

# Check if target_feats directory exists, if not, unzip target_feats.zip
if not os.path.exists("target_feats"):
    if os.path.exists("target_feats.zip"):
        with zipfile.ZipFile("target_feats.zip", "r") as zip_ref:
            zip_ref.extractall(".")
    else:
        raise FileNotFoundError("target_feats.zip not found.")

SAMPLE_RATE = 16000
CHECKPOINTS_DIR = "./checkpoints"

tts_checkpoints_dir = snapshot_download(repo_id="idiap/kNN-TTS", local_dir=CHECKPOINTS_DIR)
vocoder_checkpoint_path = get_vocoder_checkpoint_path(CHECKPOINTS_DIR)

tts_checkpoint_name = "best_model_646135.pth"
synthesizer = Synthesizer(tts_checkpoints_dir, tts_checkpoint_name, vocoder_checkpoint_path, model_name="glowtts")

target_speakers = {
    "Libri 7127": {
        "feats_path": "target_feats/LibriSpeech-test-clean/7127/wavlm",
    },
    "Libri 7729": {
        "feats_path": "target_feats/LibriSpeech-test-clean/7729/wavlm",
    },
    "Libri 6829": {
        "feats_path": "target_feats/LibriSpeech-test-clean/6829/wavlm",
    },
    "Libri 8555": {
        "feats_path": "target_feats/LibriSpeech-test-clean/8555/wavlm",
    },
    "Thorsten Neutral": {
        "feats_path": "target_feats/Thorsten/neutral/wavlm/",
    },
    "Thorsten Whisper": {
        "feats_path": "target_feats/Thorsten/whisper/wavlm/",
    },
    "ESD 0018 Neutral": {
        "feats_path": "target_feats/ESD/0018/neutral/wavlm/",
    },
    "ESD 0018 Surprised": {
        "feats_path": "target_feats/ESD/0018/surprised/wavlm/",
    },
}


@spaces.GPU
def run(text_input, target_speaker, lambda_rate, topk, weighted_average):
    feats_path = target_speakers[target_speaker]["feats_path"]
    wav = synthesizer(text_input, feats_path, interpolation_rate=lambda_rate, knnvc_topk=topk, weighted_average=weighted_average, max_target_num_files=500)
    wav = (SAMPLE_RATE, wav.squeeze().cpu().numpy())
    return wav


def get_title(text, size=1):
    return f"""
<h{size} style="text-align: center;">{text}</h{size}>
""" def create_gradio_interface(): with gr.Blocks( theme=gr.themes.Default( text_size="lg", ), title="kNN-TTS" ) as iface: gr.HTML(get_title("kNN-TTS: kNN Retrieval for Simple and Effective Zero-Shot Multi-speaker Text-to-Speech", size=1)) with gr.Tabs(): with gr.TabItem("Generate Speech"): with gr.Row(): # Left column - inputs with gr.Column(): gr.Markdown("## Input") text_box = gr.Textbox( lines=3, placeholder="Enter the text to convert to speech...", label="Text", elem_id="text-input" ) target_speaker_dropdown = gr.Dropdown( choices=list(target_speakers.keys()), value="Libri 7127", label="Target Voice", elem_id="target-voice" ) rate_slider = gr.Slider( minimum=0.0, maximum=2.0, value=1.0, step=0.01, label="Voice Morphing (λ)", info="Higher values give more weight to target voice characteristics" ) with gr.Accordion("Advanced Settings", open=False): k_slider = gr.Slider( minimum=1, maximum=50, value=4, step=1, label="Top-k Retrieval", info="k closest neighbors to retrieve" ) weighted_toggle = gr.Checkbox( label="Use Weighted Averaging", value=False, info="Weight neighbors by similarity distance" ) submit_button = gr.Button("Generate Audio", variant="primary", size="lg") # Right column - outputs with gr.Column(): gr.Markdown("## Generated Audio") with gr.Group(): audio_output = gr.Audio( type="numpy", label="Output Speech", elem_id="audio-output" ) with gr.Row(): clear_btn = gr.ClearButton([text_box, target_speaker_dropdown, rate_slider, audio_output], variant="secondary", size="lg") # Example section with gr.Row(): gr.Examples( examples=[ ["I think foosball is a combination of football and shish kebabs.", "Thorsten Whisper", 1.0, 8, True], ["I think foosball is a combination of football and shish kebabs.", "Thorsten Neutral", 1.0, 4, False], ["If you're traveling in the north country fair.", "Libri 7127", 1.0, 4, False], ["Like a vision she dances across the porch as the radio plays.", "Libri 7729", 1.0, 8, True], ["There weren't another other way to be.", "Libri 6829", 1.0, 4, False], ], inputs=[text_box, target_speaker_dropdown, rate_slider, k_slider, weighted_toggle], outputs=audio_output, fn=run, cache_examples=True ) # Additional tabs with gr.TabItem("Model Details"): with gr.Row(): with gr.Column(): gr.Markdown(""" ## kNN-TTS Technical Details kNN-TTS uses self-supervised learning (SSL) features and kNN retrieval to achieve robust zero-shot multi-speaker TTS. ### Key Components 1. **Feature Extraction**: We extract discrete representations from target speaker speech using a pre-trained SSL encoder. We use the 6th layer of WavLM Large. 2. **Text-to-SSL**: We train a lightweight TTS model to predict the same representations from Text. For simplicity, we train on a single speaker dataset. 3. **Retrieval Mechanism**: We use kNN to find for each unit in the generated features its closest matches in the target voice unit database 4. **Voice Morphing**: By linearly interpolating the source and selected target speaker features, we can morph the two voices. The interpolation parameter λ controls the balance between source and target characteristics 5. **Vocoder**: We use a pre-trained vocoder to convert the converted features to waveform. ### Performance Our simple and efficient model achieves comparable results to sota models while being trained on 100 to 1000× less transcribed data. This framework is therefore particularly well-suited for low-resource domains. For more details, please refer to our paper (https://arxiv.org/abs/2408.10771). 
""") with gr.Column(): gr.Image("assets/diagram.png", label="Model Architecture", scale=0.3, show_label=False, show_download_button=False, show_fullscreen_button=False) with gr.TabItem("About"): gr.Markdown(""" ## About the Project This demo showcases kNN-TTS, a lightweight zero-shot text-to-speech synthesis model. ### Authors - Karl El Hajal - Ajinkya Kulkarni - Enno Hermann - Mathew Magimai.-Doss ### Citation If you use kNN-TTS in your research, please cite our paper: ``` @inproceedings{hajal-etal-2025-knn, title = "k{NN} Retrieval for Simple and Effective Zero-Shot Multi-speaker Text-to-Speech", author = "Hajal, Karl El and Kulkarni, Ajinkya and Hermann, Enno and Magimai Doss, Mathew", editor = "Chiruzzo, Luis and Ritter, Alan and Wang, Lu", booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)", month = apr, year = "2025", address = "Albuquerque, New Mexico", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2025.naacl-short.65/", pages = "778--786", ISBN = "979-8-89176-190-2" } ``` ### Acknowledgments The target voices featured in this demo were sourced from the following datasets: - [Thorsten Dataset](https://www.thorsten-voice.de/) - [LibriSpeech Dataset](https://www.openslr.org/12) - [Emotional Speech Dataset (ESD)](https://hltsingapore.github.io/ESD/) ### License This project is licensed under the MIT License. """) # Event handlers submit_button.click( fn=run, inputs=[text_box, target_speaker_dropdown, rate_slider, k_slider, weighted_toggle], outputs=[audio_output] ) return iface demo = create_gradio_interface() demo.launch(share=True, debug=False)