# toolbox-tts / app.py
# Author: kennethli319 — commit 7c1cb1d ("update app")
import gradio as gr
import torch
import torchaudio
import tempfile
import numpy as np
from nemo.collections.tts.models import FastPitchModel
from nemo.collections.tts.models import HifiGanModel
from nemo.collections.tts.models import MixerTTSModel
from transformers import pipeline
# spec_generator_2 = MixerTTSModel.from_pretrained("tts_en_lj_mixerttsx")
# model1 = HifiGanModel.from_pretrained(model_name="tts_en_lj_hifigan_ft_mixerttsx")
def greet(name):
    """Return the standard hello greeting for *name*."""
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
def run(spec_generator, voc_model, pipe):
    """Build and launch the Gradio TTS demo backed by the given NeMo models.

    Args:
        spec_generator: NeMo FastPitch model (text -> mel-spectrogram).
        voc_model: NeMo HiFi-GAN vocoder (spectrogram -> waveform).
        pipe: transformers text-to-speech pipeline; currently unused but
            kept in the signature for compatibility with the caller.

    Blocks serving the app on 0.0.0.0:7860 until interrupted.
    """

    def generate_tts(text: str, speaker: int = 0):
        # Sample rate of the multispeaker FastPitch checkpoint's training
        # data — presumably 44.1 kHz; TODO confirm against the model card.
        sr = 44100
        parsed = spec_generator.parse(text)
        spectrogram = spec_generator.generate_spectrogram(tokens=parsed, speaker=speaker)
        audio = voc_model.convert_spectrogram_to_audio(spec=spectrogram)
        # The output component is gr.Audio(type="numpy"), which expects a
        # plain (sample_rate, ndarray) tuple. The previous
        # gr.Audio.update(sr, ...) call was incorrect (positional misuse)
        # and .update() was removed entirely in Gradio 4.x.
        return sr, audio.squeeze(0).cpu().numpy()

    demo = gr.Interface(
        fn=generate_tts,
        inputs=[
            gr.Textbox(value="This is a test.", label="Text to Synthesize"),
            gr.Slider(0, 10, step=1, label="Speaker"),
        ],
        outputs=gr.Audio(label="Output", type="numpy"),
        # allow_flagging takes the strings "never"/"auto"/"manual";
        # a bare False is not a valid value.
        allow_flagging="never",
    )
    # Bind to all interfaces on the port HF Spaces expects.
    demo.launch(server_name="0.0.0.0", server_port=7860)
if __name__ == "__main__":
    # Fetch the multispeaker acoustic model (text -> spectrogram) and the
    # HiFi-GAN vocoder fine-tuned for it, switch both to inference mode,
    # then start the demo UI.
    acoustic_model = FastPitchModel.from_pretrained("tts_en_fastpitch_multispeaker")
    acoustic_model.eval()
    vocoder = HifiGanModel.from_pretrained(model_name="tts_en_hifitts_hifigan_ft_fastpitch")
    vocoder.eval()
    # Bark pipeline is constructed but not used by run(); kept to preserve
    # the original behavior (it is downloaded/instantiated at startup).
    bark_pipe = pipeline("text-to-speech", model="suno/bark-small")
    run(acoustic_model, vocoder, bark_pipe)