{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": { "name": "Tutorial_Converting_PyTorch_to_TF_to_TFlite.ipynb", "provenance": [], "collapsed_sections": [] },
  "kernelspec": { "name": "python3", "display_name": "Python 3" }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": { "id": "6LWsNd3_M3MP", "colab_type": "text" },
   "source": [
    "# Converting PyTorch models to TensorFlow and TFLite by MozillaTTS"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "FAqrSIWgLyP0", "colab_type": "text" },
   "source": [
    "This tutorial demonstrates how to use Mozilla TTS to convert\n",
    "trained PyTorch models to TensorFlow and TFLite.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "MBJjGYnoEo4v", "colab_type": "text" },
   "source": [
    "# Installation\n",
    "\n",
    "This notebook assumes that the Mozilla TTS package is importable (`from TTS... import ...`)\n",
    "and that it is run from a directory located next to `TTS/bin`, because the conversion\n",
    "scripts are invoked as `../TTS/bin/...`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "Ku-dA4DKoeXk", "colab_type": "text" },
   "source": [
    "### Download PyTorch models and configs"
   ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "jGIgnWhGsxU1", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 162 }, "outputId": "b461952f-8507-4dd2-af06-4e6b8692765d", "tags": [] },
   "source": [
    "# make sure the download target directory exists (no-op when it already does)\n",
    "!mkdir -p data\n",
    "# TTS model checkpoint and its config\n",
    "!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O data/tts_model.pth.tar\n",
    "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O data/config.json"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "4dnpE0-kvTsu", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "outputId": "f67c3138-bda0-4b3e-ffcc-647f9feec23e", "tags": [] },
   "source": [
    "# vocoder model checkpoint, its config and the scale stats file\n",
    "!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O data/vocoder_model.pth.tar\n",
    "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O data/config_vocoder.json\n",
    "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O data/scale_stats.npy"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "3IGvvCRMEwqn", "colab_type": "text" },
   "source": [
    "# Model Conversion PyTorch -> TF -> TFLite"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "tLhz8SAf8Pgp", "colab_type": "text" },
   "source": [
    "## Converting PyTorch to TensorFlow\n"
   ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "Xsrvr_WQ8Ib5", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "dae96616-e5f7-41b6-cdb9-5026cfcd3214", "tags": [] },
   "source": [
    "# convert the TTS model to TensorFlow: reads data/tts_model.pth.tar, writes data/tts_model_tf.pkl\n",
    "!python ../TTS/bin/convert_tacotron2_torch_to_tf.py --config_path data/config.json --torch_model_path data/tts_model.pth.tar --output_path data/tts_model_tf.pkl"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "VJ4NA5If9ljv", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "1520dca8-1db8-4e07-bc0c-b1d5941c775e", "tags": [] },
   "source": [
    "# convert the vocoder model to TensorFlow: reads data/vocoder_model.pth.tar, writes data/vocoder_model_tf.pkl\n",
    "!python ../TTS/bin/convert_melgan_torch_to_tf.py --config_path data/config_vocoder.json --torch_model_path data/vocoder_model.pth.tar --output_path data/vocoder_model_tf.pkl"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "7d5vTkBZ-BYQ", "colab_type": "text" },
   "source": [
    "## Converting TensorFlow to TFLite"
   ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "33hTfpuU99cg", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 927 }, "outputId": "8a0e5be1-23a2-4128-ee37-8232adcb8ff0", "tags": [] },
   "source": [
    "# convert the TTS model to TFLite: reads data/tts_model_tf.pkl, writes data/tts_model.tflite\n",
    "!python ../TTS/bin/convert_tacotron2_tflite.py --config_path data/config.json --tf_model data/tts_model_tf.pkl --output_path data/tts_model.tflite"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "e00Hm75Y-wZ2", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 364 }, "outputId": "42381b05-3c9d-44f0-dac7-d81efd95eadf", "tags": [] },
   "source": [
    "# convert the vocoder model to TFLite: reads data/vocoder_model_tf.pkl, writes data/vocoder_model.tflite\n",
    "!python ../TTS/bin/convert_melgan_tflite.py --config_path data/config_vocoder.json --tf_model data/vocoder_model_tf.pkl --output_path data/vocoder_model.tflite"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "Zlgi8fPdpRF0", "colab_type": "text" },
   "source": [
    "# Run Inference with TFLite"
   ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "f-Yc42nQZG5A", "colab_type": "code", "colab": {} },
   "source": [
    "def run_vocoder(mel_spec):\n",
    "    \"\"\"Run the TFLite vocoder on one mel spectrogram and return its first output tensor.\n",
    "\n",
    "    Uses the notebook-level `vocoder_model`, which must be loaded before this is called.\n",
    "    \"\"\"\n",
    "    # add a leading axis to the spectrogram before feeding it to the model\n",
    "    vocoder_inputs = mel_spec[None, :, :]\n",
    "    input_detail = vocoder_model.get_input_details()[0]\n",
    "    # resize the model input to this spectrogram's shape, then re-allocate the tensors\n",
    "    vocoder_model.resize_tensor_input(input_detail['index'], vocoder_inputs.shape)\n",
    "    vocoder_model.allocate_tensors()\n",
    "    vocoder_model.set_tensor(input_detail['index'], vocoder_inputs)\n",
    "    # run the model\n",
    "    vocoder_model.invoke()\n",
    "    # collect outputs\n",
    "    output_detail = vocoder_model.get_output_details()[0]\n",
    "    return vocoder_model.get_tensor(output_detail['index'])\n",
    "\n",
    "\n",
    "def tts(model, text, CONFIG, p):\n",
    "    \"\"\"Synthesize `text` with the TFLite TTS model and vocoder and display an audio player.\n",
    "\n",
    "    Args:\n",
    "        model: TTS model passed to `synthesis` with backend='tflite'.\n",
    "        text: sentence to synthesize.\n",
    "        CONFIG: TTS config; `enable_eos_bos_chars` and `audio['sample_rate']` are read from it.\n",
    "        p: audio processor passed to `synthesis`; its `sample_rate` is used for the real-time factor.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (alignment, mel_postnet_spec, stop_tokens, waveform).\n",
    "\n",
    "    Also reads the notebook-level `use_cuda` and `speaker_id` and calls `run_vocoder`.\n",
    "    \"\"\"\n",
    "    t_1 = time.time()\n",
    "    _, alignment, _, mel_postnet_spec, stop_tokens, _ = synthesis(\n",
    "        model, text, CONFIG, use_cuda, p, speaker_id, style_wav=None,\n",
    "        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n",
    "        backend='tflite')\n",
    "    waveform = run_vocoder(mel_postnet_spec.T)\n",
    "    waveform = waveform[0, 0]\n",
    "    # measure the elapsed time once so every reported number refers to the same interval\n",
    "    run_time = time.time() - t_1\n",
    "    rtf = run_time / (len(waveform) / p.sample_rate)\n",
    "    tps = run_time / len(waveform)\n",
    "    print(waveform.shape)\n",
    "    print(\" > Run-time: {}\".format(run_time))\n",
    "    print(\" > Real-time factor: {}\".format(rtf))\n",
    "    print(\" > Time per step: {}\".format(tps))\n",
    "    IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate']))\n",
    "    return alignment, mel_postnet_spec, stop_tokens, waveform"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "ZksegYQepkFg", "colab_type": "text" },
   "source": [
    "### Load TFLite Models"
   ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "oVa0kOamprgj", "colab_type": "code", "colab": {} },
   "source": [
    "import os\n",
    "import torch\n",
    "import time\n",
    "import IPython\n",
    "\n",
    "from TTS.tts.tf.utils.tflite import load_tflite_model\n",
    "from TTS.tts.tf.utils.io import load_checkpoint\n",
    "from TTS.utils.io import load_config\n",
    "from TTS.tts.utils.text.symbols import symbols, phonemes\n",
    "from TTS.utils.audio import AudioProcessor\n",
    "from TTS.tts.utils.synthesis import synthesis"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "EY-sHVO8IFSH", "colab_type": "code", "colab": {} },
   "source": [
    "# runtime settings\n",
    "use_cuda = False"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "_1aIUp2FpxOQ", "colab_type": "code", "colab": {} },
   "source": [
    "# model paths\n",
    "TTS_MODEL = \"data/tts_model.tflite\"\n",
    "TTS_CONFIG_PATH = \"data/config.json\"\n",
    "VOCODER_MODEL = \"data/vocoder_model.tflite\"\n",
    "VOCODER_CONFIG_PATH = \"data/config_vocoder.json\""
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "CpgmdBVQplbv", "colab_type": "code", "colab": {} },
   "source": [
    "# load configs (the paths live under separate *_PATH names so this cell can be re-run)\n",
    "TTS_CONFIG = load_config(TTS_CONFIG_PATH)\n",
    "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "zmrQxiozIUVE", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 471 }, "outputId": "21cda136-de87-4d55-fd46-7d5306103d90", "tags": [] },
   "source": [
    "# load the audio processor, pointing it at the downloaded scale stats file\n",
    "TTS_CONFIG.audio['stats_path'] = 'data/scale_stats.npy'\n",
    "ap = AudioProcessor(**TTS_CONFIG.audio)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": { "id": "8fLoI4ipqMeS", "colab_type": "code", "colab": {} },
   "source": [
    "# LOAD TTS MODEL\n",
    "# multi speaker: no speaker id is set, so speaker_id stays None\n",
    "speaker_id = None\n",
    "speakers = []\n",
    "\n",
    "# load the TFLite TTS and vocoder models from the paths defined above\n",
    "model = load_tflite_model(TTS_MODEL)\n",
    "vocoder_model = load_tflite_model(VOCODER_MODEL)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": { "id": "Ws_YkPKsLgo-", "colab_type": "text" },
   "source": [
    "## Run Sample Sentence"
   ]
  },
  {
   "cell_type": "code",
   "metadata": { "id": "FuWxZ9Ey5Puj", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 134 }, "outputId": "535c2df1-c27c-458b-e14b-41a977635aa1", "tags": [] },
   "source": [
    "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n",
    "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)"
   ],
   "execution_count": null,
   "outputs": []
  }
 ]
}