# Converting PyTorch models to TensorFlow and TFLite with Mozilla TTS

This tutorial demonstrates how Mozilla TTS converts trained PyTorch
models to TensorFlow and TFLite.


# Installation

### Download PyTorch Models and configs

In [None]:
# download the TTS model checkpoint (data/tts_model.pth.tar) and its config (data/config.json)
!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar
!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json

In [None]:
# download the vocoder checkpoint, its config, and the scale stats file
# (data/scale_stats.npy is later set as the audio processor's stats_path)
!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar
!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json
!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy

# Model Conversion PyTorch -> TF -> TFLite

## Converting PyTorch to Tensorflow


In [None]:
# convert the TTS model from the PyTorch checkpoint to TensorFlow
# input: data/tts_model.pth.tar + data/config.json -> output: data/tts_model_tf.pkl
!python ../TTS/bin/convert_tacotron2_torch_to_tf.py --config_path data/config.json --torch_model_path data/tts_model.pth.tar --output_path data/tts_model_tf.pkl

In [None]:
# convert the vocoder model from the PyTorch checkpoint to TensorFlow
# input: data/vocoder_model.pth.tar + data/config_vocoder.json -> output: data/vocoder_model_tf.pkl
!python ../TTS/bin/convert_melgan_torch_to_tf.py --config_path data/config_vocoder.json --torch_model_path data/vocoder_model.pth.tar --output_path data/vocoder_model_tf.pkl

## Converting Tensorflow to TFLite

In [None]:
# convert the TensorFlow TTS model produced above to TFLite
# input: data/tts_model_tf.pkl + data/config.json -> output: data/tts_model.tflite
!python ../TTS/bin/convert_tacotron2_tflite.py --config_path data/config.json --tf_model data/tts_model_tf.pkl --output_path data/tts_model.tflite

In [None]:
# convert the TensorFlow vocoder model produced above to TFLite
# input: data/vocoder_model_tf.pkl + data/config_vocoder.json -> output: data/vocoder_model.tflite
!python ../TTS/bin/convert_melgan_tflite.py --config_path data/config_vocoder.json --tf_model data/vocoder_model_tf.pkl --output_path data/vocoder_model.tflite

# Run Inference with TFLite 

In [None]:
def run_vocoder(mel_spec):
    """Run the TFLite vocoder on one mel spectrogram and return its output.

    Relies on the module-level ``vocoder_model`` (created with
    ``load_tflite_model``), which must exist before this is called.

    Args:
        mel_spec: Array indexed as ``mel_spec[None, :, :]``, i.e. a new
            leading axis is added before it is fed to the model.

    Returns:
        The tensor read from the vocoder's first output.
    """
    batched_mel = mel_spec[None, :, :]

    # The model's input shape depends on the spectrogram length, so the
    # first input is resized and tensors re-allocated on every call.
    input_index = vocoder_model.get_input_details()[0]['index']
    vocoder_model.resize_tensor_input(input_index, batched_mel.shape)
    vocoder_model.allocate_tensors()

    vocoder_model.set_tensor(input_index, batched_mel)
    vocoder_model.invoke()

    output_index = vocoder_model.get_output_details()[0]['index']
    return vocoder_model.get_tensor(output_index)


def tts(model, text, CONFIG, p):
    """Synthesize ``text`` with the TFLite TTS model and vocoder, and play it.

    Runs ``synthesis`` with the ``'tflite'`` backend, feeds the transposed
    post-net mel spectrogram to ``run_vocoder``, prints timing statistics and
    displays an IPython audio widget for the resulting waveform.

    Args:
        model: TTS model passed through to ``synthesis``.
        text: Text to synthesize.
        CONFIG: TTS config; ``enable_eos_bos_chars`` and
            ``audio['sample_rate']`` are read from it.
        p: Audio processor passed to ``synthesis`` and used for the
            real-time-factor computation (``sample_rate`` is read from it).
            When ``None``, the module-level ``ap`` is used instead.

    Returns:
        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)``.
    """
    # Previously ``p`` was ignored and the global ``ap`` was always used;
    # honour the argument, keeping the global as a fallback for old callers.
    processor = ap if p is None else p

    start = time.time()
    # The waveform returned by synthesis is discarded: the audio comes from
    # the separate TFLite vocoder below.
    _, alignment, _, mel_postnet_spec, stop_tokens, _ = synthesis(
        model, text, CONFIG, use_cuda, processor, speaker_id,
        style_wav=None,
        truncated=False,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
        backend='tflite')
    waveform = run_vocoder(mel_postnet_spec.T)
    waveform = waveform[0, 0]

    # Sample the clock once so all reported statistics describe the same
    # interval and exclude the time spent printing.
    elapsed = time.time() - start
    rtf = elapsed / (len(waveform) / processor.sample_rate)
    tps = elapsed / len(waveform)

    print(waveform.shape)
    print(" > Run-time: {}".format(elapsed))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))
    return alignment, mel_postnet_spec, stop_tokens, waveform

### Load TFLite Models

In [None]:
import os
import torch
import time
import IPython

from TTS.tts.tf.utils.tflite import load_tflite_model
from TTS.tts.tf.utils.io import load_checkpoint
from TTS.utils.io import load_config
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis

In [None]:
# runtime settings
# use_cuda is read as a global by tts() and forwarded to synthesis()
use_cuda = False

In [None]:
# model paths
# the .tflite files are the outputs of the TFLite conversion cells above;
# the .json files are the configs fetched in the download cells
TTS_MODEL = "data/tts_model.tflite"
TTS_CONFIG = "data/config.json"
VOCODER_MODEL = "data/vocoder_model.tflite"
VOCODER_CONFIG = "data/config_vocoder.json"

In [None]:
# load configs
# NOTE(review): both names are rebound from path strings to loaded config
# objects, so re-running this cell without first re-running the "model paths"
# cell hands a config object (not a path) to load_config
TTS_CONFIG = load_config(TTS_CONFIG)
VOCODER_CONFIG = load_config(VOCODER_CONFIG)

In [None]:
# load the audio processor
# point the audio config at the downloaded scale stats file before the
# processor is built from the config's audio section
TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'
ap = AudioProcessor(**TTS_CONFIG.audio) 

In [None]:
# LOAD TTS MODEL
# multi-speaker settings: speaker_id is read as a global by tts() and
# forwarded unchanged to synthesis()
speaker_id = None
# NOTE(review): `speakers` is not referenced in the visible cells
speakers = []

# load the TFLite models: `model` is passed to tts(), while `vocoder_model`
# is used as a global by run_vocoder()
model = load_tflite_model(TTS_MODEL)
vocoder_model = load_tflite_model(VOCODER_MODEL)

## Run Sample Sentence

In [None]:
# synthesize a sample sentence; tts() prints timing statistics, displays an
# audio player, and returns (alignment, mel_postnet_spec, stop_tokens, waveform)
sentence = "Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go."
align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)