# xVASynth-TTS / app.py
import os
import sys
import time
import requests
import json
from subprocess import Popen, PIPE
import threading
from huggingface_hub import hf_hub_download
import gradio as gr
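
# Gradio front-end for xVASynth: it launches the bundled xVA server
# (resources/app/server.py) as a subprocess and sends /loadModel and
# /synthesize requests to it over HTTP.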
hf_model_name = "Pendrokar/xvapitch_nvidia"
# voice models are loaded from the Hugging Face hub cache snapshot of the repo above
hf_cache_models_path = '/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/61b10e60b22bc21c1e072f72f1108b9c2b21e94c/'
# models_path = './resources/app/models/ccby/'
models_path = hf_cache_models_path
voice_models = [
("Male #6671", "ccby_nvidia_hifi_6671_M"),
("Male #6670", "ccby_nvidia_hifi_6670_M"),
("Male #9017", "ccby_nvidia_hifi_9017_M"),
("Male #6097", "ccby_nvidia_hifi_6097_M"),
("Female #92", "ccby_nvidia_hifi_92_F"),
("Female #11697", "ccby_nvidia_hifi_11697_F"),
("Female #12787", "ccby_nvidia_hifi_12787_F"),
("Female #11614", "ccby_nv_hifi_11614_F"),
("Female #8051", "ccby_nvidia_hifi_8051_F"),
("Female #9136", "ccby_nvidia_hifi_9136_F"),
]
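
# currently loaded voice model and its base speaker embedding (updated when a model is loaded)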
current_voice_model = None
base_speaker_emb = ''
# languages ordered by similarity to English, since xVASynth uses ARPAbet rather than IPA
languages = [
("๐Ÿ‡ฌ๐Ÿ‡ง EN", "en"),
("๐Ÿ‡ฉ๐Ÿ‡ช DE", "de"),
("๐Ÿ‡ช๐Ÿ‡ธ ES", "es"),
("๐Ÿ‡ฎ๐Ÿ‡น IT", "it"),
("๐Ÿ‡ณ๐Ÿ‡ฑ NL", "nl"),
("๐Ÿ‡ต๐Ÿ‡น PT", "pt"),
("๐Ÿ‡ต๐Ÿ‡ฑ PL", "pl"),
("๐Ÿ‡ท๐Ÿ‡ด RO", "ro"),
("๐Ÿ‡ธ๐Ÿ‡ช SV", "sv"),
("๐Ÿ‡ฉ๐Ÿ‡ฐ DA", "da"),
("๐Ÿ‡ซ๐Ÿ‡ฎ FI", "fi"),
("๐Ÿ‡ญ๐Ÿ‡บ HU", "hu"),
("๐Ÿ‡ฌ๐Ÿ‡ท EL", "el"),
("๐Ÿ‡ซ๐Ÿ‡ท FR", "fr"),
("๐Ÿ‡ท๐Ÿ‡บ RU", "ru"),
("๐Ÿ‡บ๐Ÿ‡ฆ UK", "uk"),
("๐Ÿ‡น๐Ÿ‡ท TR", "tr"),
("๐Ÿ‡ธ๐Ÿ‡ฆ AR", "ar"),
("๐Ÿ‡ฎ๐Ÿ‡ณ HI", "hi"),
("๐Ÿ‡ฏ๐Ÿ‡ต JP", "jp"),
("๐Ÿ‡ฐ๐Ÿ‡ท KO", "ko"),
("๐Ÿ‡จ๐Ÿ‡ณ ZH", "zh"),
("๐Ÿ‡ป๐Ÿ‡ณ VI", "vi"),
("๐Ÿ‡ป๐Ÿ‡ฆ LA", "la"),
("HA", "ha"),
("SW", "sw"),
("๐Ÿ‡ณ๐Ÿ‡ฌ YO", "yo"),
("WO", "wo"),
]
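
# default input text for each language, used by set_default_text() below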
# Translated from English by Google DeepMind's Gemini Pro
default_text = {
"ar": "ู‡ุฐุง ู‡ูˆ ุตูˆุชูŠ.",
"da": "Sรฅdan lyder min stemme.",
"de": "So klingt meine Stimme.",
"el": "ฮˆฯ„ฯƒฮน ฮฑฮบฮฟฯฮณฮตฯ„ฮฑฮน ฮท ฯ†ฯ‰ฮฝฮฎ ฮผฮฟฯ….",
"en": "This is what my voice sounds like.",
"es": "Asรญ suena mi voz.",
"fi": "Nรคin รครคneni kuulostaa.",
"fr": "Voici ร  quoi ressemble ma voix.",
"ha": "Wannan ne muryata ke.",
"hi": "เคฏเคน เคฎเฅ‡เคฐเฅ€ เค†เคตเคพเคœเคผ เค•เฅˆเคธเฅ€ เคฒเค—เคคเฅ€ เคนเฅˆเฅค",
"hu": "รgy hangzik a hangom.",
"it": "Cosรฌ suona la mia voce.",
"jp": "ใ“ใ‚ŒใŒ็งใฎๅฃฐใงใ™ใ€‚",
"ko": "์—ฌ๊ธฐ ์ œ ๋ชฉ์†Œ๋ฆฌ๊ฐ€ ์–ด๋–ค์ง€ ๋“ค์–ด๋ณด์„ธ์š”.",
"la": "Haec est vox mea sonans.",
"nl": "Dit is hoe mijn stem klinkt.",
"pl": "Tak brzmi mรณj gล‚os.",
"pt": "ร‰ assim que minha voz soa.",
"ro": "Aศ™a sunฤƒ vocea mea.",
"ru": "ะ’ะพั‚ ะบะฐะบ ะทะฒัƒั‡ะธั‚ ะผะพะน ะณะพะปะพั.",
"sv": "Sรฅhรคr lรฅter min rรถst.",
"sw": "Sauti yangu inasikika hivi.",
"tr": "Benim sesimin sesi bรถyle.",
"uk": "ะžััŒ ัะบ ะทะฒัƒั‡ะธั‚ัŒ ะผั–ะน ะณะพะปะพั.",
"vi": "ฤรขy lร  giแปng nรณi cแปงa tรดi.",
"wo": "Ndox li neen xewnaal ma.",
"yo": "รŒyรญ ni ohรนn mi ล„lรก.",
"zh": "่ฟ™ๆ˜ฏๆˆ‘็š„ๅฃฐ้Ÿณใ€‚",
}
def run_xvaserver():
    # start the xVA server process without waiting for a response
    print('Running xVAServer subprocess...\n')
    xvaserver = Popen(
        ['python', f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/server.py'],
        stdout=PIPE,
        stderr=PIPE,
        cwd=f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/'
    )

    # wait a moment to give the server time to start up
    time.sleep(10)

    # check whether the server process is still running
    if xvaserver.poll() is not None:
        print("xVAServer failed to start.")
        sys.exit(1)

    # contact the local xVASynth server
    print('Attempting to connect to xVASynth...')
    try:
        response = requests.get('http://0.0.0.0:8008')
        response.raise_for_status()  # raise an exception on HTTP error status codes
    except requests.exceptions.RequestException as err:
        print('Failed to connect!', err)
        return

    print('xVAServer running on port 8008')

    # load the default voice model
    load_model("ccby_nvidia_hifi_6671_M")

    # wait for the server process to exit
    xvaserver.wait()
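
# Ask the running xVA server to load a voice model and return its base speaker
# embedding, read from the model's companion .json metadata file.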
def load_model(voice_model_name):
    global current_voice_model

    model_path = models_path + voice_model_name
    model_type = 'xVAPitch'
    language = 'en'

    data = {
        'outputs': None,
        'version': '3.0',
        'model': model_path,
        'modelType': model_type,
        'base_lang': language,
        'pluginsContext': '{}',
    }

    embs = base_speaker_emb
    try:
        response = requests.post('http://0.0.0.0:8008/loadModel', json=data, timeout=60)
        response.raise_for_status()  # raise an exception on HTTP error status codes
        current_voice_model = voice_model_name

        # read the base speaker embedding from the voice model's metadata JSON
        with open(model_path + '.json', 'r', encoding='utf-8') as f:
            voice_model_json = json.load(f)
        embs = voice_model_json['games'][0]['base_speaker_emb']
    except requests.exceptions.RequestException as err:
        print('Failed to load voice model!', err)

    return embs
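
# Gradio handler: synthesize `input_text` with the selected voice, language and
# emotion settings, returning the path to the generated WAV and the raw server response.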
def predict(
    input_text,
    voice,
    lang,
    pacing,
    pitch,
    energy,
    anger,
    happy,
    sad,
    surprise,
    use_deepmoji
):
    global base_speaker_emb

    # limit input to the first 1000 characters
    input_text = input_text[:1000]

    # load the voice model if it is not the currently loaded one
    if current_voice_model != voice:
        base_speaker_emb = load_model(voice)

    model_type = 'xVAPitch'
    pace = pacing if pacing else 1.0
    save_path = '/tmp/xvapitch_audio_sample.wav'
    language = lang
    use_sr = 0
    use_cleanup = 0
    # note: pitch and energy are currently unused (their sliders are hidden)

    pluginsContext = {}
    pluginsContext["mantella_settings"] = {
        "emAngry": (anger if anger > 0 else 0),
        "emHappy": (happy if happy > 0 else 0),
        "emSad": (sad if sad > 0 else 0),
        "emSurprise": (surprise if surprise > 0 else 0),
        "run_model": use_deepmoji
    }

    data = {
        'pluginsContext': json.dumps(pluginsContext),
        'modelType': model_type,
        # pad with whitespace as a workaround to avoid cutoffs
        'sequence': input_text.center(len(input_text) + 2, ' '),
        'pace': pace,
        'outfile': save_path,
        'vocoder': 'n/a',
        'base_lang': language,
        'base_emb': base_speaker_emb,
        'useSR': use_sr,
        'useCleanup': use_cleanup,
    }

    try:
        response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
        response.raise_for_status()  # raise an exception on HTTP error status codes
        # response_data = json.loads(response.text)
    except requests.exceptions.RequestException as err:
        print('Failed to synthesize!')
        print('server.log contents:')
        with open('resources/app/server.log', 'r') as f:
            print(f.read())
        return [None, str(err)]

    print('server.log contents:')
    with open('resources/app/server.log', 'r') as f:
        print(f.read())

    return [save_path, response.text]
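
# Gradio UI components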
input_textbox = gr.Textbox(
label="Input Text",
value="This is what my voice sounds like.",
info="Also accepts ARPAbet symbols placed within {} brackets.",
lines=1,
max_lines=5,
autofocus=True
)
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increases when set beyond 0.2")
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness when either is above 0.3")
voice_radio = gr.Radio(
    voice_models,
    value="ccby_nvidia_hifi_6671_M",
    label="Voice",
    info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
)
def set_default_text(lang):
    # return an updated Textbox so this can be wired up as a language_radio.change handler
    # (see the commented-out call below)
    return gr.Textbox(
        label="Input Text",
        value=default_text[lang],
        lines=1,
        max_lines=5,
        autofocus=True
    )
language_radio = gr.Radio(
    languages,
    value="en",
    label="Language",
    info="Non-English output will sound more monotone and carry an English accent. Tested mostly by a native Briton."
)
# language_radio.change(set_default_text)
deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values")
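
# assemble the Gradio interface: the inputs above are passed to predict() in this order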
gradio_app = gr.Interface(
    predict,
    [
        input_textbox,
        voice_radio,
        language_radio,
        pacing_slider,
        pitch_slider,
        energy_slider,
        anger_slider,
        happy_slider,
        sad_slider,
        surprise_slider,
        deepmoji_checkbox
    ],
    outputs=[
        gr.Audio(label="22kHz audio output", type="filepath"),
        gr.Textbox(label="xVASynth Server Response")
    ],
    title="xVASynth (WIP)",
    clear_btn=gr.Button(visible=False)
    # examples=[
    #     ["Once, I headed in much deeper. But I doubt I'll ever do that again.", 1],
    #     ["You love hurting me, huh?", 1.5],
    #     ["Ah, I see. Well, I'm afraid I can't help with that.", 1],
    #     ["Embrace your demise!", 1],
    #     ["Never come back!", 1]
    # ],
    # cache_examples=None
)
if __name__ == "__main__":
    # run the xVA server in a separate background thread
    web_server_thread = threading.Thread(target=run_xvaserver)
    print('Starting xVAServer thread')
    web_server_thread.start()

    print('running Gradio interface')
    gradio_app.launch()

    # wait for the server thread to finish (not reached during normal execution)
    web_server_thread.join()