import gradio as gr
import torch
from nemo.collections.tts.models import FastPitchModel, HifiGanModel
from transformers import pipeline

# Alternative single-speaker pipeline (Mixer-TTS-X + its fine-tuned HiFi-GAN), kept for reference:
# from nemo.collections.tts.models import MixerTTSModel
# spec_generator_2 = MixerTTSModel.from_pretrained("tts_en_lj_mixerttsx")
# model1 = HifiGanModel.from_pretrained(model_name="tts_en_lj_hifigan_ft_mixerttsx")


def greet(name):
    # Leftover from the Gradio starter template; not wired into the interface below.
    return "Hello " + name + "!!"


def run(spec_generator, voc_model, pipe):
    def generate_tts(text: str, speaker: int = 0):
        sr = 44100  # output rate of the HiFi-TTS HiFi-GAN vocoder checkpoint
        speaker = int(speaker)  # the Gradio slider may deliver a float
        with torch.no_grad():
            parsed = spec_generator.parse(text)
            spectrogram = spec_generator.generate_spectrogram(tokens=parsed, speaker=speaker)
            audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)
        # With outputs=gr.Audio(type="numpy"), return a (sample_rate, waveform)
        # tuple rather than calling gr.Audio.update().
        return sr, audio.squeeze(0).cpu().numpy()

    demo = gr.Interface(
        fn=generate_tts,
        inputs=[
            gr.Textbox(value="This is a test.", label="Text to Synthesize"),
            gr.Slider(0, 10, step=1, label="Speaker"),
        ],
        outputs=gr.Audio(label="Output", type="numpy"),
        allow_flagging="never",  # allow_flagging expects "never"/"auto"/"manual", not a bool
    )
    demo.launch(server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    # Multi-speaker FastPitch spectrogram generator with its matching HiFi-GAN vocoder.
    spec_generator = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
    spec_generator.eval()
    voc_model = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
    voc_model.eval()
    # Bark pipeline is loaded here but not used by the Gradio interface (see note below).
    pipe = pipeline("text-to-speech", model="suno/bark-small")
    run(spec_generator, voc_model, pipe)
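
# Note: the Bark pipeline loaded in __main__ above is never invoked by the
# interface. A minimal sketch of calling it, assuming the standard transformers
# text-to-speech pipeline output (a dict with "audio" and "sampling_rate" keys):
#
#     out = pipe("This is a test.")
#     sr, audio = out["sampling_rate"], out["audio"].squeeze()
#
# The resulting (sr, audio) pair matches the (sample_rate, waveform) tuple that
# gr.Audio(type="numpy") expects, so it could be returned from generate_tts
# in place of the NeMo output.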