import gradio as gr
from hyper_parameters import tacotron_params as hparams
from training import load_model
from audio_processing import griffin_lim
from nn_layers import TacotronSTFT
from text import text_to_sequence
from hifigan.env import AttrDict
from examples_taco2 import *
from hifigan.models import Generator

import torch
import numpy as np
import json
import os

from matplotlib import pyplot as plt

# Adjust vertical spacing between subplots
plt.subplots_adjust(hspace=0.15)  # You can adjust the value as needed

# Adjust the white space (margins) around the plot
plt.tight_layout(pad=0.5)  # You can adjust the pad value as needed

torch.manual_seed(1234)

MAX_WAV_VALUE = 32768.0


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


def plot_spec_align_sep(mel, align):
    plt.figure(figsize=(4, 3))

    fig_mel = plt.figure()
    ax_mel = fig_mel.add_subplot(111)
    fig_mel.tight_layout()
    ax_mel.imshow(mel)
    # fig_mel.set_title('Mel-Scale Spectrogram', fontsize=12)

    fig_align = plt.figure()
    ax_align = fig_align.add_subplot(111)  # fig_align
    fig_align.tight_layout()
    ax_align.imshow(align)
    # fig_align.set_title('Alignment', fontsize=12)

    return fig_mel, fig_align


# load trained tacotron2 + GST model:
model = load_model(hparams)
checkpoint_path = "models/checkpoint_78000.model"
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
# model.to('cuda')
_ = model.eval()

# load pre-trained HiFi-GAN model for mel2audio:
hifigan_checkpoint_path = "models/generator_v1"
config_file = os.path.join(os.path.split(hifigan_checkpoint_path)[0], 'config.json')
with open(config_file) as f:
    data = f.read()

json_config = json.loads(data)
h = AttrDict(json_config)
device = torch.device("cpu")

generator = Generator(h).to(device)
state_dict_g = load_checkpoint(hifigan_checkpoint_path, device)
generator.load_state_dict(state_dict_g['generator'])
generator.eval()
generator.remove_weight_norm()


def synthesize(text, gst_1, gst_2, gst_3, voc):
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)

    # gst_head_scores = np.array([0.5, 0.15, 0.35])
    gst_head_scores = np.array([gst_1, gst_2, gst_3])
    gst_scores = torch.from_numpy(gst_head_scores).float()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)

    if voc == 0:
        # mel2wav inference:
        with torch.no_grad():
            y_g_hat = generator(mel_outputs_postnet)
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            audio_numpy = audio.cpu().numpy().astype('int16')
            # audio = vocoder_model.inference(mel_outputs_postnet)
            # audio_numpy = audio.data.cpu().detach().numpy()
    else:
        # Griffin Lim vocoder synthesis:
        griffin_iters = 60
        taco_stft = TacotronSTFT(hparams['filter_length'], hparams['hop_length'], hparams['win_length'],
                                 sampling_rate=hparams['sampling_rate'])

        mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
        mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
        spec_from_mel_scaling = 60
        spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling

        audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]), taco_stft.stft_fn, griffin_iters)
        audio = audio.squeeze()
        audio_numpy = audio.cpu().numpy()
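    # At this point `audio_numpy` holds the synthesized waveform samples passed to gr.make_waveform
    # below at 22050 Hz: int16 samples when HiFi-GAN is selected, float samples when Griffin-Lim
    # (which iteratively estimates the phase missing from the mel spectrogram) is used.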
    # prepare plot for the output:
    mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
    mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
    alignments = alignments.squeeze().T.detach().numpy()

    # normalize numpy arrays between [-1, 1]
    min_val = np.min(mel_outputs_postnet)
    max_val = np.max(mel_outputs_postnet)
    scaled_mel = (mel_outputs_postnet - min_val) / (max_val - min_val)
    normalized_mel = 2 * scaled_mel - 1

    min_val = np.min(alignments)
    max_val = np.max(alignments)
    scaled_align = (alignments - min_val) / (max_val - min_val)
    normalized_align = 2 * scaled_align - 1

    aw = gr.make_waveform((22050, audio_numpy), bg_image='background_images/wallpaper_test_1_crop_3.jpg',
                          bars_color=('#f3df4b', '#63edb7'), bar_count=100, bar_width=0.7, animate=True)

    return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align


with gr.Blocks() as demo:
    gr.Markdown("# English Neural Text-to-Speech\n"
                "## Speech Synthesis with Partial Style Control")
    # gr.Markdown("## Unsupervised Style Tokens using Single-Head Attention Parallel Encoder "
    #             "with Tacotron2")
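    # UI layout: the left column holds the input text box, the GST weight sliders, the vocoder
    # selector and the synthesize button; the right column shows the predicted spectrogram,
    # the attention alignment and the rendered waveform video.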
") with gr.Row(): with gr.Column(scale=1): # , value="Speech synthesis has evolved dramatically since the development of neural architectures capable of generating high quality samples." inp = gr.Textbox(label="Input Text") clear_btn = gr.ClearButton(value='Clear Text', size='sm', components=[inp]) # gr.Markdown("A continuació, calibrem els pesos dels *style tokens*:") with gr.Row(): with gr.Column(scale=2): with gr.Tab("Global Style Tokens"): gst_1 = gr.Slider(0.2, 0.45, label="GST 1", value=0.4) gst_2 = gr.Slider(0.2, 0.45, label="GST 2", value=0.26) gst_3 = gr.Slider(0.2, 0.45, label="GST 3", value=0.33) with gr.Column(scale=0): with gr.Tab("Vocoder"): vocoder = gr.Radio([("HiFi-GAN", 0), ("Griffin-Lim", 1)], container=False, value=0, min_width=300) # label="Vocoder") greet_btn = gr.Button("Synthesize!", scale=1) with gr.Column(): with gr.Tab("Spectrogram"): spec_plot = gr.Image(container=False) with gr.Tab("Alignment"): align_plot = gr.Image(container=False) wave_video = gr.Video(label="Waveform", height=150, width=800, container=False) def display_video(): return wave_video greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder], outputs=[wave_video, spec_plot, align_plot], api_name="synthesize") with gr.Row(): with gr.Column(): gr.Examples(examples=infer_from_text_examples, inputs=[inp, gst_1, gst_2, gst_3, vocoder], outputs=[wave_video, spec_plot, align_plot], fn=synthesize, cache_examples=False, ) gr.Markdown(""" ### Details and Indications This is a Text-to-Speech (TTS) system that consists of two modules: 1) a replicated Tacotron2 model, which generates the spectrogram of the speech corresponding to the input text. And 2) a pre-trained HiFiGAN vocoder that maps spectrograms to a digital waveforms. Global Style Tokens (GST) have been implemented to catch style information from the female speaker with which the model has been trained (see the links below for more information). Please, feel free to play with the GST scores and observe how the synthetic voice spells the input text. Keep in mind that GSTs have been trained in an unsupervised way, so there is no specific control of style attributes. Moreover, try to balance the GST scores by making them add up to a value close to 1. Below or higher than 1 may cause low energy, mispronunciations or distortion. You can choose between the HiFiGAN trained vocoder and the iterative algorithm Griffin-Lim, which does not need to be trained but produces a "robotic" effect. ### More Information Spectrogram generator has been adapted and trained from the [NVIDIA's](https://github.com/NVIDIA/tacotron2) Tacotron2 replica published in Tacotron2
The neural vocoder is a pre-trained model replicated from HiFiGAN
Unsupervised style control has been implemented based on Global Style Tokens
""") demo.launch()