# xVASynth-TTS / gr_client.py
# Pendrokar's picture
# expresso sample fix
# dbc5ddc
import os
import json
import gradio as gr
from gradio_client import Client
# Voices shown up-front in the "Voice" radio, as (label, model id) pairs.
# The model id is the value handed to the backend as `voice`.
voice_models = [
("👩 #ex04", "x_ex04"),
("🧑 #ex01", "x_ex01"),
("👱‍♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
("👨‍🦳 #6671", "ccby_nvidia_hifi_6671_M"),
]
# Extra voices, listed only after the "+N" ('more') radio entry has been picked.
voice_models_more = [
("👸 #ex02", "x_ex02"),
("👨‍🦱 #ex03", "x_ex03"),
("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
("👨‍🦲 #9017", "ccby_nvidia_hifi_9017_M"),
("🧑 #6097", "ccby_nvidia_hifi_6097_M"),
("👩‍🦱 #12787", "ccby_nvidia_hifi_12787_F"),
("👵 #11614", "ccby_nv_hifi_11614_F"),
("👩‍🦰 #8051", "ccby_nvidia_hifi_8051_F"),
("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
("👩‍🦲 #9136", "ccby_nvidia_hifi_9136_F"),
("♟ Lojban", "x_selpahi"), # v2 model for Lojban; predates the multilingual capabilities of xVASynth v3
]
# Languages shown up-front in the "Language" radio, as (label, language code) pairs.
# Ordered by similarity to English, because xVASynth uses ARPAbet rather than IPA.
languages = [
("🇺🇸 EN", "en"),
("🇩🇪 DE", "de"),
("🇪🇸 ES", "es"),
("🇮🇳 HI", "hi"),
("🇨🇳 ZH", "zh"),
]
# Extra languages, listed only after the "+N" ('more') radio entry has been picked.
languages_more = [
("🇳🇱 NL", "nl"),
("🇧🇷 PT", "pt"),
("🇮🇹 IT", "it"),
("🇵🇱 PL", "pl"),
("🇷🇴 RO", "ro"),
("🇸🇪 SV", "sv"),
("🇩🇰 DA", "da"),
("🇫🇮 FI", "fi"),
("🇭🇺 HU", "hu"),
("🇬🇷 EL", "el"),
("🇫🇷 FR", "fr"),
("🇷🇺 RU", "ru"),
("🇺🇦 UA", "uk"),
("🇹🇷 TR", "tr"),
("🇸🇦 AR", "ar"),
("🇯🇵 JP", "jp"),
("🇰🇷 KO", "ko"),
("🇻🇳 VI", "vi"),
("🇻🇦 LA", "la"),
("🇳🇬 YO", "yo"),
("Swahili", "sw"),
("Hausa", "ha"),
("Wolof", "wo"),
]
# Language entry that is put first in the radio when the Lojban voice is selected.
lojban_lang = [
# There is no ISO 639-1 code for Lojban; 'jb' is the id used here (per the original author it is valid)
('♟ Lojban', 'jb')
]
# Default input sentence per language code; looked up when the language radio changes.
# Translated from English by DeepMind's Gemini Pro
default_text = {
"ar": "هذا هو صوتي.",
"da": "Sådan lyder min stemme.",
"de": "So klingt meine Stimme.",
"el": "Έτσι ακούγεται η φωνή μου.",
"en": "This is what my voice sounds like.",
"es": "Así suena mi voz.",
"fi": "Näin ääneni kuulostaa.",
"fr": "Voici à quoi ressemble ma voix.",
"ha": "Wannan ne muryata ke.",
"hi": "यह मेरी आवाज़ कैसी लगती है।",
"hu": "Így hangzik a hangom.",
"it": "Così suona la mia voce.",
"jb": ".i ca'e gusni",
"jp": "これが私の声です。",
"ko": "여기 제 목소리가 어떤지 들어보세요.",
"la": "Haec est vox mea sonans.",
"nl": "Dit is hoe mijn stem klinkt.",
"pl": "Tak brzmi mój głos.",
"pt": "É assim que minha voz soa.",
"ro": "Așa sună vocea mea.",
"ru": "Вот как звучит мой голос.",
"sv": "Såhär låter min röst.",
"sw": "Baba, yetu, yetu, uliye. Mbinguni, yetu, yetu. Amiiinaa!!", #civ4
"tr": "Benim sesimin sesi böyle.",
"uk": "Ось як звучить мій голос.",
"vi": "Đây là giọng nói của tôi.",
"wo": "Ndox li neen xewnaal ma.",
"yo": "Ìyí ni ohùn mi ńlá.",
"zh": "这是我的声音。",
}
# Component defaults
# Each *_init dict holds the keyword arguments that are splatted into the
# matching Gradio component constructor when the interface is built.

# Input textbox; 'autofocus' is switched off by create_interface once the
# first textbox has been created.
input_textbox_init = {
'label': "Input Text",
'value': "This is what my voice sounds like.",
'info': "Also accepts ARPAbet symbols placed within {} brackets.",
'lines': 1,
'max_lines': 5,
'autofocus': True,
}
# Pacing slider, labelled "Duration" in the UI
pacing_slider_init = {
'value': 1.0,
'minimum': 0.5,
'maximum': 2.0,
'step': 0.1,
'label': "Duration",
}
# Pitch slider; created hidden ('visible': False) but still sent to the backend
pitch_slider_init = {
'minimum': 0,
'maximum': 1.0,
'value': 0.5,
'step': 0.05,
'label': "Pitch",
'visible': False,
}
# Energy slider; created hidden ('visible': False) but still sent to the backend
energy_slider_init = {
'minimum': 0.1,
'maximum': 1.0,
'value': 1.0,
'step': 0.05,
'label': "Energy",
'visible': False,
}
# Emotion sliders: all share the 0..1 range, a 0.05 step and a default of 0.
# The 'info' strings are the usage hints shown next to each slider.
anger_slider_init = {
'minimum': 0,
'maximum': 1.0,
'value': 0,
'step': 0.05,
'label': "😠 Anger",
'info': "Tread lightly beyond 0.9",
}
happy_slider_init = {
'minimum': 0,
'maximum': 1.0,
'value': 0,
'step': 0.05,
'label': "😃 Happiness",
'info': "Tread lightly beyond 0.7",
}
sad_slider_init = {
'minimum': 0,
'maximum': 1.0,
'value': 0,
'step': 0.05,
'label': "😭 Sadness",
'info': "Duration increased when beyond 0.2",
}
surprise_slider_init = {
'minimum': 0,
'maximum': 1.0,
'value': 0,
'step': 0.05,
'label': "😮 Surprise",
'info': "Oversaturates Happiness when beyond 0.3",
}
# Voice radio: the up-front voices plus a trailing "+N" pseudo-choice whose
# value 'more' is the cue to expand the list with voice_models_more.
voice_radio_init = {
'choices': [*voice_models, (f'+{len(voice_models_more)}', 'more')],
'value': "ccby_nvidia_hifi_6671_M",
'label': "Voice",
'info': "Fine-tuned voice model"
}
# "Use DeepMoji" checkbox, ticked by default
deepmoji_checkbox_init = {
'label': "Use DeepMoji",
'info': "Auto adjust emotional values for English",
'value': True,
'interactive': True
}
def more_lang_options(lang):
    """Expand the language radio when its "+N" ('more') entry is picked.

    Any other selection is handed back untouched. For 'more', a gr.Radio
    built from `language_radio_init` is returned whose choices are
    `languages` followed by `languages_more`.
    """
    if lang == 'more':
        expanded = dict(language_radio_init, choices=[*languages, *languages_more])
        return gr.Radio(**expanded)
    return lang
def set_default_text(lang, deepmoji_checked):
    """Replace the input text with the default sentence of the chosen language.

    Args:
        lang: Value of the language radio; either a language code or 'more'
            when the "+N" expander entry was clicked.
        deepmoji_checked: Current state of the "Use DeepMoji" checkbox.

    Returns:
        A ``(gr.Textbox, bool)`` pair: the textbox update and the checkbox
        value. The checkbox value is passed through for 'more' and 'en', and
        forced to False for every other language.
    """
    textbox_init = {**input_textbox_init}

    if lang == 'more':
        # The expander itself is not a language; show the English sentence.
        textbox_init['value'] = default_text['en']
        return gr.Textbox(**textbox_init), deepmoji_checked

    # Fall back to English instead of raising KeyError on an unknown or
    # missing language value.
    textbox_init['value'] = default_text.get(lang, default_text['en'])

    # DeepMoji only works on English text
    if lang != 'en':
        deepmoji_checked = False
    # FIXME: also disabling the checkbox (info/value/interactive) for
    # non-English languages conflicts with the toggle_deepmoji event listener.

    return gr.Textbox(**textbox_init), deepmoji_checked
# examples component
# English example sentences offered in the examples dropdown; picking one
# copies it into the input textbox.
en_examples = [
"This is what my voice sounds like.",
"If there is anything else you need, feel free to ask.",
"Amazing! Could you do that again?",
"Why, I would be more than happy to help you!",
"That was unexpected.",
"How dare you! . You have no right.",
"Ahh, well, you see. There is more to it.",
"I can't believe she is gone.",
"Stay out of my way!!!",
# ARPAbet example
"{V AA1 R D AH0 N F EH2 L}, is the largest island in {M AO1 R OW0 W IH2 N D}.",
]
# Keyword arguments for the examples gr.Dropdown; 'visible' is overridden
# whenever the language changes so the dropdown only shows for English.
en_examples_dropdown_init = {
'choices': en_examples,
'value': en_examples[0],
'label': "Example dropdown",
'show_label': False,
'info': "English Examples",
'visible': True
}
def set_example_as_input(example_text):
    """Hand the picked example straight back so it becomes the textbox value."""
    chosen = example_text
    return chosen
def toggle_example_dropdown(lang):
    """Return the examples dropdown, visible only while English is selected."""
    show_examples = lang == 'en'
    return gr.Dropdown(**{**en_examples_dropdown_init, 'visible': show_examples})
def more_voice_options(voice):
    """Expand the voice radio when its "+N" ('more') entry is picked.

    Any other selection is handed back untouched. For 'more', a gr.Radio
    built from `voice_radio_init` is returned whose choices are
    `voice_models` followed by `voice_models_more`.
    """
    if voice == 'more':
        expanded = dict(voice_radio_init, choices=[*voice_models, *voice_models_more])
        return gr.Radio(**expanded)
    return voice
def reset_em_sliders(deepmoji_enabled, anger, happy, sad, surprise):
    """Zero the four emotion sliders while DeepMoji is enabled.

    Returns a 4-tuple in (anger, happy, sad, surprise) order: all zeros when
    `deepmoji_enabled` is truthy, otherwise the values that were passed in.
    """
    if not deepmoji_enabled:
        return (anger, happy, sad, surprise)
    return (0, 0, 0, 0)
def toggle_deepmoji(checked, anger, happy, sad, surprise):
    """React to the DeepMoji checkbox: zero the emotion sliders when ticked.

    Returns a 4-tuple in (anger, happy, sad, surprise) order: all zeros when
    `checked` is truthy, otherwise the slider values unchanged.
    """
    current = (anger, happy, sad, surprise)
    return (0, 0, 0, 0) if checked else current
# languages component
# Keyword arguments for the language gr.Radio: the up-front languages plus a
# trailing "+N" pseudo-choice whose value 'more' is the cue to expand the
# list with languages_more. English is preselected.
language_radio_init = {
'choices': [*languages, *[(f'+{len(languages_more)}', 'more')]],
'value': "en",
'label': "Language",
'info': "Will be more monotone and have an English accent."
}
def set_lojban_language(voice, lang):
    """Switch the language radio to Lojban when the Lojban voice is chosen.

    For any voice other than 'x_selpahi' the current `lang` is returned
    unchanged. For 'x_selpahi' a gr.Radio is returned that lists the Lojban
    entry first, followed by every other language, with Lojban selected.
    """
    if voice == 'x_selpahi':
        lojban_first = dict(
            language_radio_init,
            choices=[*lojban_lang, *languages, *languages_more],
            value=lojban_lang[0][1],
        )
        return gr.Radio(**lojban_first)
    return lang
# HTML badge strip rendered through gr.HTML below the page title: GitHub
# stars, Nexus Mods endorsements, Discord, and a "Duplicate Space" link.
_DESCRIPTION = '''
<div>
<a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
<a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.9k-blue?logo=nexusmods'/></a>
<a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
<span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
</div>
'''
class BlocksDemo:
    """Gradio Blocks front-end for xVASynth.

    Synthesis requests are forwarded through the module-level ``client``
    (a ``gradio_client.Client``); this class builds the UI, wires up the
    event listeners and post-processes the backend response.

    Args:
        models_path: Path prefix of the sample ``.wav`` files of the regular voices.
        lojban_models_path: Path prefix of the Lojban voice sample.
        robotic_models_path: Path prefix of the robotic voice sample; a
            non-empty value adds the robot voice to the voice radio.
        expresso_models_path: Path prefix of the ``x_ex0*`` voice samples.
    """

    def __init__(self, models_path, lojban_models_path, robotic_models_path, expresso_models_path):
        self.models_path = models_path
        self.lojban_models_path = lojban_models_path
        self.robotic_models_path = robotic_models_path
        self.expresso_models_path = expresso_models_path

        if robotic_models_path != '':
            # Insert the robotic voice as the third option in BOTH lists, so
            # that it keeps its position once the "more" entry rebuilds the
            # radio from voice_models. The membership check keeps a second
            # instantiation from adding it twice to these module-level lists.
            robot_choice = ("🤖 Robot", "cnc_cabal")
            for choices in (voice_models, voice_radio_init['choices']):
                if robot_choice not in choices:
                    choices.insert(2, robot_choice)

        self.block = self.create_interface()

    def create_interface(self):
        """Build the Blocks layout, register all event listeners and return it."""
        with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
            gr.Markdown("# xVASynth TTS")
            gr.HTML(label="description", value=_DESCRIPTION)

            with gr.Row():  # Main row for inputs and language selection
                with gr.Column():  # Input column
                    input_textbox = gr.Textbox(**input_textbox_init)
                    language_radio = gr.Radio(**language_radio_init)

                    # remove autofocus, so textboxes built later from the
                    # same defaults do not request focus again
                    input_textbox_init['autofocus'] = False

                    with gr.Row():
                        with gr.Column():
                            en_examples_dropdown = gr.Dropdown(**en_examples_dropdown_init)
                        with gr.Column():
                            pacing_slider = gr.Slider(**pacing_slider_init)

                with gr.Column():  # Control column
                    voice_radio = gr.Radio(**voice_radio_init)
                    pitch_slider = gr.Slider(**pitch_slider_init)
                    energy_slider = gr.Slider(**energy_slider_init)

            with gr.Row():  # Emotion sliders, two per column
                with gr.Column():
                    anger_slider = gr.Slider(**anger_slider_init)
                    sad_slider = gr.Slider(**sad_slider_init)
                with gr.Column():
                    happy_slider = gr.Slider(**happy_slider_init)
                    surprise_slider = gr.Slider(**surprise_slider_init)

            deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)

            # Event handling using click
            btn = gr.Button("Generate", variant="primary")

            output_wav = gr.Audio(
                label="22kHz audio output",
                type="filepath",
                editable=False,
                autoplay=True
            )
            output_arpabet = gr.HTML(label="ARPAbet")
            # Hidden sink for the raw xVAServer response
            server_response = gr.Textbox(visible=False)

            # The same four sliders, in the same order, are both inputs and
            # outputs of several listeners below.
            emotion_sliders = [anger_slider, happy_slider, sad_slider, surprise_slider]
            deepmoji_and_emotions = [deepmoji_checkbox, *emotion_sliders]

            btn.click(
                fn=self.predict,
                inputs=[
                    input_textbox,
                    voice_radio,
                    language_radio,
                    pacing_slider,
                    pitch_slider,
                    energy_slider,
                    *emotion_sliders,
                    deepmoji_checkbox
                ],
                outputs=[
                    output_wav,
                    output_arpabet,
                    *emotion_sliders,
                    server_response
                ]
            )

            # more languages option
            language_radio.change(
                more_lang_options,
                inputs=language_radio,
                outputs=language_radio,
                trigger_mode='once',
                show_progress='hidden',
            )

            # more voices option
            voice_radio.change(
                more_voice_options,
                inputs=voice_radio,
                outputs=voice_radio,
                trigger_mode='once',
                show_progress='hidden',
                queue=False,
            )

            # set default text
            language_radio.change(
                set_default_text,
                inputs=[language_radio, deepmoji_checkbox],
                outputs=[input_textbox, deepmoji_checkbox],
                show_progress='hidden',
                queue=False,
            )

            # toggle en examples
            language_radio.change(
                toggle_example_dropdown,
                inputs=language_radio,
                outputs=en_examples_dropdown,
                show_progress='hidden',
                queue=False,
            )

            en_examples_dropdown.change(
                set_example_as_input,
                inputs=[en_examples_dropdown],
                outputs=[input_textbox],
                show_progress='hidden',
                queue=False,
            )

            deepmoji_checkbox.change(
                toggle_deepmoji,
                inputs=deepmoji_and_emotions,
                outputs=emotion_sliders,
                show_progress='hidden',
                queue=False,
            )

            input_textbox.change(
                reset_em_sliders,
                inputs=deepmoji_and_emotions,
                outputs=emotion_sliders,
                show_progress='hidden',
                queue=False,
            )

            voice_radio.change(
                reset_em_sliders,
                inputs=deepmoji_and_emotions,
                outputs=emotion_sliders,
                show_progress='hidden',
                queue=False,
            )

            # Replace output with voice audio sample
            voice_radio.change(
                self.set_default_audio,
                inputs=voice_radio,
                outputs=output_wav,
                queue=True,
                trigger_mode='once',
            )

            # Switched to Lojban voice
            voice_radio.change(
                set_lojban_language,
                inputs=[voice_radio, language_radio],
                outputs=[language_radio],
                trigger_mode='once',
                queue=True,
            )

        return demo

    def predict(
        self,
        input_text,
        voice,
        lang,
        pacing,
        pitch,
        energy,
        anger,
        happy,
        sad,
        surprise,
        deepmoji_checked
    ):
        """Synthesize speech through the remote backend and format the result.

        Args:
            input_text: Text of the 'Input Text' textbox.
            voice: Model id selected in the 'Voice' radio.
            lang: Language code selected in the 'Language' radio.
            pacing: 'Duration' slider value (0.5 to 2.0).
            pitch: 'Pitch' slider value (0 to 1.0).
            energy: 'Energy' slider value (0.1 to 1.0).
            anger, happy, sad, surprise: Emotion slider values (0 to 1.0).
            deepmoji_checked: Whether "Use DeepMoji" is ticked.

        Returns:
            ``[wav_path, arpabet_html, em_angry, em_happy, em_sad,
            em_surprise, response]`` matching the outputs of the Generate
            button.
        """
        # Only the audio path and the raw response are taken from the backend
        # tuple. The other items are unpacked into throw-away names so they do
        # not overwrite the user's slider inputs; the emotion values are
        # derived from the raw response below.
        wav_path, _html, _angry, _happy, _sad, _surprise, response = client.predict(
            input_text,
            voice,
            lang,
            pacing,
            pitch,
            energy,
            anger,
            happy,
            sad,
            surprise,
            deepmoji_checked,
            api_name="/predict"
        )

        arpabet_html, em_angry, em_happy, em_sad, em_surprise = self._interpret_response(
            voice,
            response,
            (anger, happy, sad, surprise),
            deepmoji_checked,
        )

        return [
            wav_path,
            arpabet_html,
            em_angry,
            em_happy,
            em_sad,
            em_surprise,
            response
        ]

    @staticmethod
    def _interpret_response(voice, response, emotions, deepmoji_checked):
        """Turn the raw backend response into (html, angry, happy, sad, surprise).

        `emotions` is the user's (anger, happy, sad, surprise) tuple. It is
        returned as-is unless DeepMoji is enabled, in which case the rounded
        values from the response replace it. The Lojban voice yields no
        ARPAbet table and zeroed emotions.
        """
        if voice == 'x_selpahi':
            return ('', 0, 0, 0, 0)

        # The response uses single-quoted strings; swap the quotes so that
        # it parses as JSON.
        json_data = json.loads(response.replace("'", '"'))
        arpabet_html = BlocksDemo._arpabet_table_html(
            json_data['arpabet'].split('|'),
            json_data['durations'],
        )

        if deepmoji_checked:
            return (
                arpabet_html,
                round(json_data['em_angry'][0], 2),
                round(json_data['em_happy'][0], 2),
                round(json_data['em_sad'][0], 2),
                round(json_data['em_surprise'][0], 2),
            )
        return (arpabet_html, *emotions)

    @staticmethod
    def _arpabet_table_html(symbols, durations):
        """Render symbols as a one-row table with duration-proportional cell widths.

        '<PAD>' symbols (and their durations) are left out of both the table
        and the total used for the percentages.
        """
        cells = [
            (symbol, float(duration))
            for symbol, duration in zip(symbols, durations)
            if symbol != '<PAD>'
        ]
        total_duration = sum(duration for _, duration in cells)

        def width_of(duration):
            # Guard against an all-zero duration list
            return round(duration / total_duration * 100, 2) if total_duration else 0

        row = ''.join(
            f'<td class="arpabet" style="width: {width_of(duration)}%">{symbol}</td> '
            for symbol, duration in cells
        )
        return (
            '<h6>ARPAbet & Durations</h6>'
            '<table style="margin: 0 var(--size-2)"><tbody><tr>'
            + row
            + '</tr></tbody></table>'
        )

    def set_default_audio(self, voice_id):
        """Return the sample audio for a voice, or None when there is none.

        When this module runs as a script the sample is requested from the
        remote backend; otherwise a local ``<models path><voice_id>.wav``
        path is built from the path prefix matching the voice family.
        """
        # No sample for a cleared selection or the "+N" expander entry
        if not voice_id or voice_id == 'more':
            return None

        if __name__ == "__main__":
            return client.predict(
                voice_id,
                api_name="/set_default_audio"
            )

        if voice_id == 'x_selpahi':
            sample_path = self.lojban_models_path
        elif voice_id == 'cnc_cabal':
            sample_path = self.robotic_models_path
        elif voice_id.startswith('x_ex0'):
            sample_path = self.expresso_models_path
        else:
            sample_path = self.models_path

        return sample_path + voice_id + '.wav'
if __name__ == "__main__":
    # Script entry point: connect to the hosted Space, then serve the UI.
    # `client` must stay a module-level name; BlocksDemo's methods read it.
    print('running Gradio interface')
    client = Client("Pendrokar/xVASynth")
    demo = BlocksDemo(
        models_path='',
        lojban_models_path='',
        robotic_models_path='',
        expresso_models_path='',
    )
    demo.block.launch()