# Spaces: Running on Zero
import torch | |
import torchaudio | |
import gradio as gr | |
from zonos.model import Zonos | |
from zonos.conditioning import make_cond_dict, supported_language_codes | |
# Device string used for model loading, tensor creation and autocast in this file.
device = "cuda"

# Single-slot model cache managed by load_model_if_needed(): the name of the
# currently loaded variant and the model instance. Both stay None until the
# first model is loaded.
CURRENT_MODEL_TYPE = None
CURRENT_MODEL = None
def load_model_if_needed(model_choice: str):
    """
    Return the Zonos model for ``model_choice``, loading it only when needed.

    A single model is cached in the module globals. When ``model_choice``
    differs from the cached variant, the cached model is released, the CUDA
    allocator cache is emptied, and the requested checkpoint is loaded onto
    ``device``, converted to bfloat16 and switched to eval mode.

    Args:
        model_choice: ``"Transformer"`` selects the transformer checkpoint;
            any other value selects the hybrid checkpoint.

    Returns:
        The cached model instance matching ``model_choice``.

    Raises:
        Whatever ``Zonos.from_pretrained`` raises. In that case the cache is
        left empty (both globals are ``None``) so a later call can retry.
    """
    global CURRENT_MODEL_TYPE, CURRENT_MODEL

    if CURRENT_MODEL_TYPE == model_choice:
        print(f"{model_choice} model is already loaded.")
        return CURRENT_MODEL

    if CURRENT_MODEL is not None:
        # Rebind to None rather than `del`: deleting a global unbinds the name,
        # so a failed load below would make every later call raise NameError.
        # The type is cleared too so the cache never claims a model it lacks.
        CURRENT_MODEL = None
        CURRENT_MODEL_TYPE = None
        torch.cuda.empty_cache()

    print(f"Loading {model_choice} model...")
    if model_choice == "Transformer":
        repo_id = "Zyphra/Zonos-v0.1-transformer"
    else:
        repo_id = "Zyphra/Zonos-v0.1-hybrid"

    model = Zonos.from_pretrained(repo_id, device=device)
    model.to(device)
    model.bfloat16()
    model.eval()

    # Publish to the cache only once the model is fully prepared.
    CURRENT_MODEL = model
    CURRENT_MODEL_TYPE = model_choice
    print(f"{model_choice} model loaded successfully!")
    return CURRENT_MODEL
def update_ui(model_choice):
    """
    Compute visibility updates for the UI from the selected model's conditioners.

    Loads (or reuses) the model for ``model_choice``, reads the names of its
    prefix conditioners and returns one ``gr.update`` per UI component, in the
    order the event handlers list their outputs. 'language_id' and 'ctc_loss'
    have no entry here, so they are never displayed even if the model has them.
    """
    model = load_model_if_needed(model_choice)
    cond_names = [c.name for c in model.prefix_conditioner.conditioners]
    print("Conditioners in this model:", cond_names)

    # Conditioner that gates each output component, in output order.
    # None marks a component that is always visible.
    gating_keys = (
        "espeak",  # 1  text
        "espeak",  # 2  language
        "speaker",  # 3  speaker_audio
        None,  # 4  prefix_audio
        "speaker",  # 5  skip_speaker
        "emotion",  # 6  skip_emotion
        "emotion",  # 7  emotion1
        "emotion",  # 8  emotion2
        "emotion",  # 9  emotion3
        "emotion",  # 10 emotion4
        "emotion",  # 11 emotion5
        "emotion",  # 12 emotion6
        "emotion",  # 13 emotion7
        "emotion",  # 14 emotion8
        "vqscore_8",  # 15 skip_vqscore_8
        "vqscore_8",  # 16 vq_single_slider
        "fmax",  # 17 fmax_slider
        "fmax",  # 18 skip_fmax
        "pitch_std",  # 19 pitch_std_slider
        "pitch_std",  # 20 skip_pitch_std
        "speaking_rate",  # 21 speaking_rate_slider
        "speaking_rate",  # 22 skip_speaking_rate
        "dnsmos_ovrl",  # 23 dnsmos_slider
        "dnsmos_ovrl",  # 24 skip_dnsmos_ovrl
        "speaker_noised",  # 25 speaker_noised_checkbox
        "speaker_noised",  # 26 skip_speaker_noised
    )
    return tuple(
        gr.update(visible=(key is None or key in cond_names))
        for key in gating_keys
    )
def generate_audio(
    model_choice,
    text,
    language,
    speaker_audio,
    prefix_audio,
    skip_speaker,
    skip_emotion,
    e1,
    e2,
    e3,
    e4,
    e5,
    e6,
    e7,
    e8,
    skip_vqscore_8,
    vq_single,
    fmax,
    skip_fmax,
    pitch_std,
    skip_pitch_std,
    speaking_rate,
    skip_speaking_rate,
    dnsmos_ovrl,
    skip_dnsmos_ovrl,
    speaker_noised,
    skip_speaker_noised,
    cfg_scale,
    min_p,
    seed,
):
    """
    Synthesize audio from the UI parameters and return it for a Gradio audio output.

    The ``skip_*`` flags collect the conditioner keys passed to
    ``make_cond_dict`` as ``unconditional_keys``. ``speaker_audio`` and
    ``prefix_audio`` are optional file paths: the first is turned into a
    speaker embedding (unless ``skip_speaker`` is set), the second is encoded
    into prefix codes. ``e1``..``e8`` form the emotion vector and
    ``vq_single`` is repeated 8 times for the VQ-score conditioner.
    language_id and ctc_loss are never passed, even if the model has them.

    Returns:
        A ``(sampling_rate, samples)`` tuple, where ``samples`` is a NumPy array.
    """
    model = load_model_if_needed(model_choice)

    # Each checked "skip" box adds its conditioner key to the unconditional set.
    skip_table = (
        (skip_speaker, "speaker"),
        (skip_emotion, "emotion"),
        (skip_vqscore_8, "vqscore_8"),
        (skip_fmax, "fmax"),
        (skip_pitch_std, "pitch_std"),
        (skip_speaking_rate, "speaking_rate"),
        (skip_dnsmos_ovrl, "dnsmos_ovrl"),
        (skip_speaker_noised, "speaker_noised"),
    )
    unconditional = [key for flag, key in skip_table if flag]

    # Normalise the raw widget values to plain Python types up front.
    noised_flag = bool(speaker_noised)
    fmax_value = float(fmax)
    pitch_std_value = float(pitch_std)
    speaking_rate_value = float(speaking_rate)
    dnsmos_value = float(dnsmos_ovrl)
    guidance_scale = float(cfg_scale)
    min_p_value = float(min_p)
    seed_value = int(seed)

    # NOTE(review): presumably a token-rate × duration budget — confirm the
    # meaning of 86 and 30 against the model documentation.
    token_budget = 86 * 30

    torch.manual_seed(seed_value)

    speaker_embedding = None
    if not skip_speaker and speaker_audio is not None:
        ref_wav, ref_sr = torchaudio.load(speaker_audio)
        speaker_embedding = model.make_speaker_embedding(ref_wav, ref_sr)
        speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16)

    audio_prefix_codes = None
    if prefix_audio is not None:
        prefix_wav, prefix_sr = torchaudio.load(prefix_audio)
        # Average over dim 0, then resample to the autoencoder's sampling rate.
        prefix_wav = prefix_wav.mean(0, keepdim=True)
        prefix_wav = torchaudio.functional.resample(
            prefix_wav, prefix_sr, model.autoencoder.sampling_rate
        )
        prefix_wav = prefix_wav.to(device, dtype=torch.float32)
        # NOTE(review): autocast is requested with float32 here — confirm this
        # is intended to keep the encode step in full precision.
        with torch.autocast(device, dtype=torch.float32):
            audio_prefix_codes = model.autoencoder.encode(prefix_wav.unsqueeze(0))

    emotion_values = [float(e) for e in (e1, e2, e3, e4, e5, e6, e7, e8)]
    emotion_tensor = torch.tensor([emotion_values], device=device)

    vq_tensor = torch.tensor([float(vq_single)] * 8, device=device).unsqueeze(0)

    cond_dict = make_cond_dict(
        text=text,
        language=language,
        speaker=speaker_embedding,
        emotion=emotion_tensor,
        vqscore_8=vq_tensor,
        fmax=fmax_value,
        pitch_std=pitch_std_value,
        speaking_rate=speaking_rate_value,
        dnsmos_ovrl=dnsmos_value,
        speaker_noised=noised_flag,
        device=device,
        unconditional_keys=unconditional,
    )
    conditioning = model.prepare_conditioning(cond_dict)

    codes = model.generate(
        prefix_conditioning=conditioning,
        audio_prefix_codes=audio_prefix_codes,
        max_new_tokens=token_budget,
        cfg_scale=guidance_scale,
        batch_size=1,
        sampling_params=dict(min_p=min_p_value),
    )

    decoded = model.autoencoder.decode(codes).cpu().detach()
    sampling_rate = model.autoencoder.sampling_rate

    # A 2-D result with several rows is cut down to its first row only.
    if decoded.dim() == 2 and decoded.size(0) > 1:
        decoded = decoded[0:1, :]
    return sampling_rate, decoded.squeeze().numpy()
def build_interface():
    """
    Build the Gradio Blocks UI and wire up its event handlers.

    Creates the model/text/audio inputs, the conditioning and emotion sliders,
    the "skip" checkboxes and the generation controls. ``update_ui`` runs when
    the model dropdown changes and on page load; ``generate_audio`` runs when
    the generate button is clicked.

    Returns:
        The constructed ``gr.Blocks`` app (not launched).
    """
    # NOTE(review): the container nesting below was reconstructed from a source
    # whose indentation had been lost — confirm the intended layout.
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                model_choice = gr.Dropdown(
                    choices=["Hybrid", "Transformer"],
                    value="Transformer",
                    label="Zonos Model Type",
                    info="Select the model variant to use.",
                )
                text = gr.Textbox(
                    label="Text to Synthesize",
                    value="Zonos uses eSpeak for text to phoneme conversion!",
                    lines=4,
                )
                language = gr.Dropdown(
                    choices=supported_language_codes,
                    value="en-us",
                    label="Language Code",
                    info="Select a language code.",
                )
            prefix_audio = gr.Audio(
                value="assets/silence_100ms.wav",
                label="Optional Prefix Audio (continue from this audio)",
                type="filepath",
            )
            with gr.Column():
                speaker_audio = gr.Audio(
                    label="Optional Speaker Audio (for cloning)",
                    type="filepath",
                )
                speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False)

        with gr.Column():
            gr.Markdown("## Conditioning Parameters")
            with gr.Row():
                dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="DNSMOS Overall")
                fmax_slider = gr.Slider(0, 24000, value=22050, step=1, label="Fmax (Hz)")
                vq_single_slider = gr.Slider(0.5, 0.8, value=0.78, step=0.01, label="VQ Score")
                pitch_std_slider = gr.Slider(0.0, 400.0, value=20.0, step=1, label="Pitch Std")
                speaking_rate_slider = gr.Slider(0.0, 40.0, value=15.0, step=1, label="Speaking Rate")

            gr.Markdown("### Emotion Sliders")
            # (label, default) for emotion1..emotion8, laid out four per row.
            emotion_specs = (
                ("Happiness", 0.6),
                ("Sadness", 0.05),
                ("Disgust", 0.05),
                ("Fear", 0.05),
                ("Surprise", 0.05),
                ("Anger", 0.05),
                ("Other", 0.5),
                ("Neutral", 0.6),
            )
            emotion_sliders = []
            for row_start in (0, 4):
                with gr.Row():
                    for emotion_label, emotion_default in emotion_specs[row_start:row_start + 4]:
                        emotion_sliders.append(
                            gr.Slider(0.0, 1.0, value=emotion_default, step=0.05, label=emotion_label)
                        )

            gr.Markdown("### Unconditional Toggles")
            with gr.Row():
                skip_speaker = gr.Checkbox(label="Skip Speaker", value=False)
                skip_emotion = gr.Checkbox(label="Skip Emotion", value=False)
                skip_vqscore_8 = gr.Checkbox(label="Skip VQ Score", value=True)
                skip_fmax = gr.Checkbox(label="Skip Fmax", value=False)
                skip_pitch_std = gr.Checkbox(label="Skip Pitch Std", value=False)
                skip_speaking_rate = gr.Checkbox(label="Skip Speaking Rate", value=False)
                skip_dnsmos_ovrl = gr.Checkbox(label="Skip DNSMOS", value=True)
                skip_speaker_noised = gr.Checkbox(label="Skip Noised Speaker", value=False)

        with gr.Column():
            gr.Markdown("## Generation Parameters")
            with gr.Row():
                cfg_scale_slider = gr.Slider(1.0, 5.0, value=2.0, step=0.1, label="CFG Scale")
                min_p_slider = gr.Slider(0.0, 1.0, value=0.1, step=0.01, label="Min P")
                seed_number = gr.Number(label="Seed", value=420, precision=0)
            generate_button = gr.Button("Generate Audio")
            output_audio = gr.Audio(label="Generated Audio", type="numpy")

        # The 26 components that update_ui shows or hides, in the exact order of
        # its return tuple. generate_audio takes the same components, in the
        # same order, between the model selector and the generation parameters.
        conditioning_components = [
            text,  # 1
            language,  # 2
            speaker_audio,  # 3
            prefix_audio,  # 4
            skip_speaker,  # 5
            skip_emotion,  # 6
            *emotion_sliders,  # 7-14
            skip_vqscore_8,  # 15
            vq_single_slider,  # 16
            fmax_slider,  # 17
            skip_fmax,  # 18
            pitch_std_slider,  # 19
            skip_pitch_std,  # 20
            speaking_rate_slider,  # 21
            skip_speaking_rate,  # 22
            dnsmos_slider,  # 23
            skip_dnsmos_ovrl,  # 24
            speaker_noised_checkbox,  # 25
            skip_speaker_noised,  # 26
        ]

        model_choice.change(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[*conditioning_components],
        )

        # On page load, trigger the same UI refresh
        demo.load(
            fn=update_ui,
            inputs=[model_choice],
            outputs=[*conditioning_components],
        )

        # Generate audio on button click
        generate_button.click(
            fn=generate_audio,
            inputs=[
                model_choice,
                *conditioning_components,
                cfg_scale_slider,
                min_p_slider,
                seed_number,
            ],
            outputs=[output_audio],
        )

    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all network
    # interfaces at port 7860; share=True also requests a Gradio share link.
    demo = build_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)