import os
import sys
import time
import requests
import json
from subprocess import Popen, PIPE
import threading
from huggingface_hub import hf_hub_download
import gradio as gr
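
# Gradio front-end for xVASynth (xVAPitch) text-to-speech. It spawns the bundled
# xVASynth server (resources/app/server.py) in a background thread and talks to it
# over HTTP on port 8008 to load CC-BY NVIDIA HIFI voice models and synthesize audio.
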
hf_model_name = "Pendrokar/xvapitch_nvidia"
hf_cache_models_path = '/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/61b10e60b22bc21c1e072f72f1108b9c2b21e94c/'
# models_path = './resources/app/models/ccby/'
models_path = '/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/61b10e60b22bc21c1e072f72f1108b9c2b21e94c/'

voice_models = [
    ("Male #6671", "ccby_nvidia_hifi_6671_M"),
    ("Male #6670", "ccby_nvidia_hifi_6670_M"),
    ("Male #9017", "ccby_nvidia_hifi_9017_M"),
    ("Male #6097", "ccby_nvidia_hifi_6097_M"),
    ("Female #92", "ccby_nvidia_hifi_92_F"),
    ("Female #11697", "ccby_nvidia_hifi_11697_F"),
    ("Female #12787", "ccby_nvidia_hifi_12787_F"),
    ("Female #11614", "ccby_nv_hifi_11614_F"),
    ("Female #8051", "ccby_nvidia_hifi_8051_F"),
    ("Female #9136", "ccby_nvidia_hifi_9136_F"),
]

current_voice_model = None
base_speaker_emb = ''

# Languages ordered by similarity to English, since xVASynth uses ARPAbet rather than IPA.
languages = [
    ("🇬🇧 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇹 IT", "it"),
    ("🇳🇱 NL", "nl"),
    ("🇵🇹 PT", "pt"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UK", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇮🇳 HI", "hi"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇨🇳 ZH", "zh"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("HA", "ha"),
    ("SW", "sw"),
    ("🇳🇬 YO", "yo"),
    ("WO", "wo"),
]

# Translated from English by DeepMind's Gemini Pro
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Sauti yangu inasikika hivi.",
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Èyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}
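
# Launch the bundled xVASynth server (resources/app/server.py) as a subprocess,
# wait for it to come up on port 8008, load the default voice model, and then
# block until the server process exits.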
def run_xvaserver():
    # start the process without waiting for a response
    print('Running xVAServer subprocess...\n')
    xvaserver = Popen(
        ['python', f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/server.py'],
        stdout=PIPE,
        stderr=PIPE,
        cwd=f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/'
    )

    # Wait for a moment to ensure the server starts up
    time.sleep(10)

    # Check if the server is running
    if xvaserver.poll() is not None:
        print("Web server failed to start.")
        sys.exit(0)

    # contact local xVASynth server
    print('Attempting to connect to xVASynth...')
    try:
        response = requests.get('http://0.0.0.0:8008')
        response.raise_for_status()  # If the response contains an HTTP error status code, raise an exception
    except requests.exceptions.RequestException as err:
        print('Failed to connect!')
        return

    print('xVAServer running on port 8008')

    # load default model
    load_model("ccby_nvidia_hifi_6671_M")

    # Wait for the process to exit
    xvaserver.wait()
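
# Ask the local server to load a voice model via its /loadModel endpoint and
# return that voice's base speaker embedding, read from the model's .json
# metadata. On failure the previously known embedding is returned unchanged.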
def load_model(voice_model_name):
    global current_voice_model  # record which model is loaded so predict() can skip reloading

    model_path = models_path + voice_model_name

    model_type = 'xVAPitch'
    language = 'en'

    data = {
        'outputs': None,
        'version': '3.0',
        'model': model_path,
        'modelType': model_type,
        'base_lang': language,
        'pluginsContext': '{}',
    }

    embs = base_speaker_emb
    try:
        response = requests.post('http://0.0.0.0:8008/loadModel', json=data, timeout=60)
        response.raise_for_status()  # If the response contains an HTTP error status code, raise an exception
        current_voice_model = voice_model_name

        with open(model_path + '.json', 'r', encoding='utf-8') as f:
            voice_model_json = json.load(f)
        embs = voice_model_json['games'][0]['base_speaker_emb']
    except requests.exceptions.RequestException as err:
        print('Failed to load voice model!')

    return embs
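
# Gradio handler: synthesize `input_text` with the selected voice and language by
# POSTing to the local server's /synthesize endpoint. Returns the path of the
# rendered WAV file plus the raw server response text.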
def predict(
    input_text,
    voice,
    lang,
    pacing,
    pitch,
    energy,
    anger,
    happy,
    sad,
    surprise,
    use_deepmoji
):
    global base_speaker_emb  # updated whenever a different voice model gets loaded

    # grab only the first 1000 characters
    input_text = input_text[:1000]

    # load the voice model if it is not the currently loaded one
    if current_voice_model != voice:
        base_speaker_emb = load_model(voice)

    model_type = 'xVAPitch'
    pace = pacing if pacing else 1.0
    save_path = '/tmp/xvapitch_audio_sample.wav'
    language = lang
    use_sr = 0
    use_cleanup = 0

    pluginsContext = {}
    pluginsContext["mantella_settings"] = {
        "emAngry": (anger if anger > 0 else 0),
        "emHappy": (happy if happy > 0 else 0),
        "emSad": (sad if sad > 0 else 0),
        "emSurprise": (surprise if surprise > 0 else 0),
        "run_model": use_deepmoji
    }

    data = {
        'pluginsContext': json.dumps(pluginsContext),
        'modelType': model_type,
        # pad with whitespace as a workaround to avoid cutoffs
        'sequence': input_text.center(len(input_text) + 2, ' '),
        'pace': pace,
        'outfile': save_path,
        'vocoder': 'n/a',
        'base_lang': language,
        'base_emb': base_speaker_emb,
        'useSR': use_sr,
        'useCleanup': use_cleanup,
    }

    try:
        response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
        response.raise_for_status()  # If the response contains an HTTP error status code, raise an exception
        # response_data = json.loads(response.text)
    except requests.exceptions.RequestException as err:
        print('Failed to synthesize!')
        print('server.log contents:')
        with open('resources/app/server.log', 'r') as f:
            print(f.read())
        return ['', str(err)]

    print('server.log contents:')
    with open('resources/app/server.log', 'r') as f:
        print(f.read())

    return [save_path, response.text]
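
# Gradio UI components passed to gr.Interface below.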
input_textbox = gr.Textbox(
    label="Input Text",
    value="This is what my voice sounds like.",
    info="Also accepts ARPAbet symbols placed within {} brackets.",
    lines=1,
    max_lines=5,
    autofocus=True
)
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😊 Happiness", info="Tread lightly beyond 0.7")
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness when either is beyond 0.3")
voice_radio = gr.Radio(
    voice_models,
    value="ccby_nvidia_hifi_6671_M",
    label="Voice",
    info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
)

def set_default_text(lang):
    input_textbox = gr.Textbox(
        label="Input Text",
        value=default_text[lang],
        lines=1,
        max_lines=5,
        autofocus=True
    )

language_radio = gr.Radio(
    languages,
    value="en",
    label="Language",
    info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
)
# language_radio.change(set_default_text)

deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values")
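
# Wire the components above into gr.Interface: each input component maps, in order,
# to a parameter of predict(), and the two outputs receive the WAV path and the raw
# server response.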
gradio_app = gr.Interface(
    predict,
    [
        input_textbox,
        voice_radio,
        language_radio,
        pacing_slider,
        pitch_slider,
        energy_slider,
        anger_slider,
        happy_slider,
        sad_slider,
        surprise_slider,
        deepmoji_checkbox
    ],
    outputs=[
        gr.Audio(label="22kHz audio output", type="filepath"),
        gr.Textbox(label="xVASynth Server Response")
    ],
    title="xVASynth (WIP)",
    clear_btn=gr.Button(visible=False)
    # examples=[
    #     ["Once, I headed in much deeper. But I doubt I'll ever do that again.", 1],
    #     ["You love hurting me, huh?", 1.5],
    #     ["Ah, I see. Well, I'm afraid I can't help with that.", 1],
    #     ["Embrace your demise!", 1],
    #     ["Never come back!", 1]
    # ],
    # cache_examples=None
)
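
# Run the xVASynth server in a background thread while the Gradio app runs in the
# main thread; launch() blocks until the interface is shut down.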
if __name__ == "__main__":
    # Run the web server in a separate thread
    web_server_thread = threading.Thread(target=run_xvaserver)
    print('Starting xVAServer thread')
    web_server_thread.start()

    print('running Gradio interface')
    gradio_app.launch()

    # Wait for the web server thread to finish (shouldn't be reached in normal execution)
    web_server_thread.join()