import torch import torchaudio import gradio as gr from zonos.model import Zonos from zonos.conditioning import make_cond_dict, supported_language_codes device = "cuda" CURRENT_MODEL_TYPE = None CURRENT_MODEL = None def load_model_if_needed(model_choice: str): global CURRENT_MODEL_TYPE, CURRENT_MODEL if CURRENT_MODEL_TYPE != model_choice: if CURRENT_MODEL is not None: del CURRENT_MODEL torch.cuda.empty_cache() print(f"Loading {model_choice} model...") if model_choice == "Transformer": CURRENT_MODEL = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device) else: CURRENT_MODEL = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device=device) CURRENT_MODEL.to(device) CURRENT_MODEL.bfloat16() CURRENT_MODEL.eval() CURRENT_MODEL_TYPE = model_choice print(f"{model_choice} model loaded successfully!") else: print(f"{model_choice} model is already loaded.") return CURRENT_MODEL def update_ui(model_choice): """ Dynamically show/hide UI elements based on the model's conditioners. We do NOT display 'language_id' or 'ctc_loss' even if they exist in the model. """ model = load_model_if_needed(model_choice) cond_names = [c.name for c in model.prefix_conditioner.conditioners] print("Conditioners in this model:", cond_names) text_update = gr.update(visible=("espeak" in cond_names)) language_update = gr.update(visible=("espeak" in cond_names)) speaker_audio_update = gr.update(visible=("speaker" in cond_names)) prefix_audio_update = gr.update(visible=True) skip_speaker_update = gr.update(visible=("speaker" in cond_names)) skip_emotion_update = gr.update(visible=("emotion" in cond_names)) emotion1_update = gr.update(visible=("emotion" in cond_names)) emotion2_update = gr.update(visible=("emotion" in cond_names)) emotion3_update = gr.update(visible=("emotion" in cond_names)) emotion4_update = gr.update(visible=("emotion" in cond_names)) emotion5_update = gr.update(visible=("emotion" in cond_names)) emotion6_update = gr.update(visible=("emotion" in cond_names)) emotion7_update = gr.update(visible=("emotion" in cond_names)) emotion8_update = gr.update(visible=("emotion" in cond_names)) skip_vqscore_8_update = gr.update(visible=("vqscore_8" in cond_names)) vq_single_slider_update = gr.update(visible=("vqscore_8" in cond_names)) fmax_slider_update = gr.update(visible=("fmax" in cond_names)) skip_fmax_update = gr.update(visible=("fmax" in cond_names)) pitch_std_slider_update = gr.update(visible=("pitch_std" in cond_names)) skip_pitch_std_update = gr.update(visible=("pitch_std" in cond_names)) speaking_rate_slider_update = gr.update(visible=("speaking_rate" in cond_names)) skip_speaking_rate_update = gr.update(visible=("speaking_rate" in cond_names)) dnsmos_slider_update = gr.update(visible=("dnsmos_ovrl" in cond_names)) skip_dnsmos_ovrl_update = gr.update(visible=("dnsmos_ovrl" in cond_names)) speaker_noised_checkbox_update = gr.update(visible=("speaker_noised" in cond_names)) skip_speaker_noised_update = gr.update(visible=("speaker_noised" in cond_names)) return ( text_update, # 1 language_update, # 2 speaker_audio_update, # 3 prefix_audio_update, # 4 skip_speaker_update, # 5 skip_emotion_update, # 6 emotion1_update, # 7 emotion2_update, # 8 emotion3_update, # 9 emotion4_update, # 10 emotion5_update, # 11 emotion6_update, # 12 emotion7_update, # 13 emotion8_update, # 14 skip_vqscore_8_update, # 15 vq_single_slider_update, # 16 fmax_slider_update, # 17 skip_fmax_update, # 18 pitch_std_slider_update, # 19 skip_pitch_std_update, # 20 speaking_rate_slider_update, # 21 skip_speaking_rate_update, # 22 dnsmos_slider_update, # 23 skip_dnsmos_ovrl_update, # 24 speaker_noised_checkbox_update, # 25 skip_speaker_noised_update, # 26 ) def generate_audio( model_choice, text, language, speaker_audio, prefix_audio, skip_speaker, skip_emotion, e1, e2, e3, e4, e5, e6, e7, e8, skip_vqscore_8, vq_single, fmax, skip_fmax, pitch_std, skip_pitch_std, speaking_rate, skip_speaking_rate, dnsmos_ovrl, skip_dnsmos_ovrl, speaker_noised, skip_speaker_noised, cfg_scale, min_p, seed, ): """ Generates audio based on the provided UI parameters. We do NOT use language_id or ctc_loss even if the model has them. """ selected_model = load_model_if_needed(model_choice) uncond_keys = [] if skip_speaker: uncond_keys.append("speaker") if skip_emotion: uncond_keys.append("emotion") if skip_vqscore_8: uncond_keys.append("vqscore_8") if skip_fmax: uncond_keys.append("fmax") if skip_pitch_std: uncond_keys.append("pitch_std") if skip_speaking_rate: uncond_keys.append("speaking_rate") if skip_dnsmos_ovrl: uncond_keys.append("dnsmos_ovrl") if skip_speaker_noised: uncond_keys.append("speaker_noised") speaker_noised_bool = bool(speaker_noised) fmax = float(fmax) pitch_std = float(pitch_std) speaking_rate = float(speaking_rate) dnsmos_ovrl = float(dnsmos_ovrl) cfg_scale = float(cfg_scale) min_p = float(min_p) seed = int(seed) max_new_tokens = 86 * 30 torch.manual_seed(seed) speaker_embedding = None if speaker_audio is not None and not skip_speaker: wav, sr = torchaudio.load(speaker_audio) speaker_embedding = selected_model.make_speaker_embedding(wav, sr) speaker_embedding = speaker_embedding.to(device, dtype=torch.bfloat16) audio_prefix_codes = None if prefix_audio is not None: wav_prefix, sr_prefix = torchaudio.load(prefix_audio) wav_prefix = wav_prefix.mean(0, keepdim=True) wav_prefix = torchaudio.functional.resample(wav_prefix, sr_prefix, selected_model.autoencoder.sampling_rate) wav_prefix = wav_prefix.to(device, dtype=torch.float32) with torch.autocast(device, dtype=torch.float32): audio_prefix_codes = selected_model.autoencoder.encode(wav_prefix.unsqueeze(0)) emotion_tensor = torch.tensor( [[float(e1), float(e2), float(e3), float(e4), float(e5), float(e6), float(e7), float(e8)]], device=device ) vq_val = float(vq_single) vq_tensor = torch.tensor([vq_val] * 8, device=device).unsqueeze(0) cond_dict = make_cond_dict( text=text, language=language, speaker=speaker_embedding, emotion=emotion_tensor, vqscore_8=vq_tensor, fmax=fmax, pitch_std=pitch_std, speaking_rate=speaking_rate, dnsmos_ovrl=dnsmos_ovrl, speaker_noised=speaker_noised_bool, device=device, unconditional_keys=uncond_keys, ) conditioning = selected_model.prepare_conditioning(cond_dict) codes = selected_model.generate( prefix_conditioning=conditioning, audio_prefix_codes=audio_prefix_codes, max_new_tokens=max_new_tokens, cfg_scale=cfg_scale, batch_size=1, sampling_params=dict(min_p=min_p), ) wav_out = selected_model.autoencoder.decode(codes).cpu().detach() sr_out = selected_model.autoencoder.sampling_rate if wav_out.dim() == 2 and wav_out.size(0) > 1: wav_out = wav_out[0:1, :] return sr_out, wav_out.squeeze().numpy() def build_interface(): with gr.Blocks() as demo: with gr.Row(): with gr.Column(): model_choice = gr.Dropdown( choices=["Hybrid", "Transformer"], value="Transformer", label="Zonos Model Type", info="Select the model variant to use.", ) text = gr.Textbox( label="Text to Synthesize", value="Zonos uses eSpeak for text to phoneme conversion!", lines=4 ) language = gr.Dropdown( choices=supported_language_codes, value="en-us", label="Language Code", info="Select a language code.", ) prefix_audio = gr.Audio( value="assets/silence_100ms.wav", label="Optional Prefix Audio (continue from this audio)", type="filepath", ) with gr.Column(): speaker_audio = gr.Audio( label="Optional Speaker Audio (for cloning)", type="filepath", ) speaker_noised_checkbox = gr.Checkbox(label="Denoise Speaker?", value=False) with gr.Column(): gr.Markdown("## Conditioning Parameters") with gr.Row(): dnsmos_slider = gr.Slider(1.0, 5.0, value=4.0, step=0.1, label="DNSMOS Overall") fmax_slider = gr.Slider(0, 24000, value=22050, step=1, label="Fmax (Hz)") vq_single_slider = gr.Slider(0.5, 0.8, 0.78, 0.01, label="VQ Score") pitch_std_slider = gr.Slider(0.0, 400.0, value=20.0, step=1, label="Pitch Std") speaking_rate_slider = gr.Slider(0.0, 40.0, value=15.0, step=1, label="Speaking Rate") gr.Markdown("### Emotion Sliders") with gr.Row(): emotion1 = gr.Slider(0.0, 1.0, 0.6, 0.05, label="Happiness") emotion2 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Sadness") emotion3 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Disgust") emotion4 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Fear") with gr.Row(): emotion5 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Surprise") emotion6 = gr.Slider(0.0, 1.0, 0.05, 0.05, label="Anger") emotion7 = gr.Slider(0.0, 1.0, 0.5, 0.05, label="Other") emotion8 = gr.Slider(0.0, 1.0, 0.6, 0.05, label="Neutral") gr.Markdown("### Unconditional Toggles") with gr.Row(): skip_speaker = gr.Checkbox(label="Skip Speaker", value=False) skip_emotion = gr.Checkbox(label="Skip Emotion", value=False) skip_vqscore_8 = gr.Checkbox(label="Skip VQ Score", value=True) skip_fmax = gr.Checkbox(label="Skip Fmax", value=False) skip_pitch_std = gr.Checkbox(label="Skip Pitch Std", value=False) skip_speaking_rate = gr.Checkbox(label="Skip Speaking Rate", value=False) skip_dnsmos_ovrl = gr.Checkbox(label="Skip DNSMOS", value=True) skip_speaker_noised = gr.Checkbox(label="Skip Noised Speaker", value=False) with gr.Column(): gr.Markdown("## Generation Parameters") with gr.Row(): cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale") min_p_slider = gr.Slider(0.0, 1.0, 0.1, 0.01, label="Min P") seed_number = gr.Number(label="Seed", value=420, precision=0) generate_button = gr.Button("Generate Audio") output_audio = gr.Audio(label="Generated Audio", type="numpy") model_choice.change( fn=update_ui, inputs=[model_choice], outputs=[ text, # 1 language, # 2 speaker_audio, # 3 prefix_audio, # 4 skip_speaker, # 5 skip_emotion, # 6 emotion1, # 7 emotion2, # 8 emotion3, # 9 emotion4, # 10 emotion5, # 11 emotion6, # 12 emotion7, # 13 emotion8, # 14 skip_vqscore_8, # 15 vq_single_slider, # 16 fmax_slider, # 17 skip_fmax, # 18 pitch_std_slider, # 19 skip_pitch_std, # 20 speaking_rate_slider, # 21 skip_speaking_rate, # 22 dnsmos_slider, # 23 skip_dnsmos_ovrl, # 24 speaker_noised_checkbox, # 25 skip_speaker_noised, # 26 ], ) # On page load, trigger the same UI refresh demo.load( fn=update_ui, inputs=[model_choice], outputs=[ text, language, speaker_audio, prefix_audio, skip_speaker, skip_emotion, emotion1, emotion2, emotion3, emotion4, emotion5, emotion6, emotion7, emotion8, skip_vqscore_8, vq_single_slider, fmax_slider, skip_fmax, pitch_std_slider, skip_pitch_std, speaking_rate_slider, skip_speaking_rate, dnsmos_slider, skip_dnsmos_ovrl, speaker_noised_checkbox, skip_speaker_noised, ], ) # Generate audio on button click generate_button.click( fn=generate_audio, inputs=[ model_choice, text, language, speaker_audio, prefix_audio, skip_speaker, skip_emotion, emotion1, emotion2, emotion3, emotion4, emotion5, emotion6, emotion7, emotion8, skip_vqscore_8, vq_single_slider, fmax_slider, skip_fmax, pitch_std_slider, skip_pitch_std, speaking_rate_slider, skip_speaking_rate, dnsmos_slider, skip_dnsmos_ovrl, speaker_noised_checkbox, skip_speaker_noised, cfg_scale_slider, min_p_slider, seed_number, ], outputs=[output_audio], ) return demo if __name__ == "__main__": demo = build_interface() demo.launch(server_name="0.0.0.0", server_port=7860, share=True)