{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "%matplotlib inline\n", "from tts.utils.audio import AudioProcessor\n", "from tts.tts.utils.visual import plot_spectrogram\n", "from tts.utils.io import load_config\n", "import glob " ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "config_path = \"/home/erogol/Projects/TTS/tts/tts/config_thorsten_de.json\"\n", "data_path = \"/home/erogol/Data/thorsten-german/\"\n", "file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n", "CONFIG = load_config(config_path)" ] }, { "cell_type": "markdown", "metadata": { "Collapsed": "false" }, "source": [ "### Setup Audio Processor\n", "Play with the AP parameters until you find a good fit with the synthesis speech below. " ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "Collapsed": "false" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " > Setting up Audio Processor...\n", " | > sample_rate:22050\n", " | > num_mels:80\n", " | > min_level_db:-100\n", " | > frame_shift_ms:None\n", " | > frame_length_ms:None\n", " | > ref_level_db:20\n", " | > fft_size:1024\n", " | > power:1.5\n", " | > preemphasis:0.0\n", " | > griffin_lim_iters:60\n", " | > signal_norm:True\n", " | > symmetric_norm:True\n", " | > mel_fmin:0\n", " | > mel_fmax:8000.0\n", " | > spec_gain:1.0\n", " | > stft_pad_mode:reflect\n", " | > max_norm:4.0\n", " | > clip_norm:True\n", " | > do_trim_silence:True\n", " | > trim_db:60\n", " | > do_sound_norm:True\n", " | > stats_path:None\n", " | > hop_length:256\n", " | > win_length:1024\n" ] } ], "source": [ "# audio={\n", "# 'audio_processor': 'audio',\n", "# 'num_mels': 80, # In general, you don'tneed to change it \n", "# 'fft_size': 1024, # In general, you don'tneed to change it \n", "# 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n", "# 'hop_length': 256, # In general, you don'tneed to change it \n", "# 'win_length': 1024, # In general, you don'tneed to change it \n", "# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n", "# 'min_level_db': -100,\n", "# 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n", "# 'power': 1.5, # Change this value and listen the synthesized voice. 1.2 - 1.5 are some resonable values.\n", "# 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n", "# 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n", "# 'symmetric_norm': False, # Same as above\n", "# 'max_norm': 1, # Same as above\n", "# 'clip_norm': True, # Same as above\n", "# 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", "# 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", "# 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", "\n", "AP = AudioProcessor(**CONFIG.audio);" ] }, { "cell_type": "markdown", "metadata": { "Collapsed": "false" }, "source": [ "### Check audio loading " ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "Collapsed": "false" }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wav = AP.load_wav(file_paths[10])\n", "ipd.Audio(data=wav, rate=AP.sample_rate) " ] }, { "cell_type": "markdown", "metadata": { "Collapsed": "false" }, "source": [ "### Generate Mel-Spectrogram and Re-synthesis with GL" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "AP.power = 1.0" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "Collapsed": "false" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Max: 2.4340844\n", "Min: 2.0181823\n", "Mean: 2.2137265\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mel = AP.melspectrogram(wav)\n", "print(\"Max:\", mel.max())\n", "print(\"Min:\", mel.min())\n", "print(\"Mean:\", mel.mean())\n", "plot_spectrogram(mel.T, AP);\n", "\n", "wav_gen = AP.inv_melspectrogram(mel)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" ] }, { "cell_type": "markdown", "metadata": { "Collapsed": "false" }, "source": [ "### Generate Linear-Spectrogram and Re-synthesis with GL" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "Collapsed": "false" }, "outputs": [ { "ename": "RuntimeError", "evalue": " [!] Mean-Var stats does not match the given feature dimensions.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwav\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Max:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Min:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Mean:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mplot_spectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mspectrogram\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_amp_to_db\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnormalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmelspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mnormalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mRuntimeError\u001b[0m: [!] Mean-Var stats does not match the given feature dimensions." ] } ], "source": [ "spec = AP.spectrogram(wav)\n", "print(\"Max:\", spec.max())\n", "print(\"Min:\", spec.min())\n", "print(\"Mean:\", spec.mean())\n", "plot_spectrogram(spec.T, AP);\n", "\n", "wav_gen = AP.inv_spectrogram(spec)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" ] }, { "cell_type": "markdown", "metadata": { "Collapsed": "false" }, "source": [ "### Compare values for a certain parameter\n", "\n", "Optimize your parameters by comparing different values per parameter at a time." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "audio={\n", " 'audio_processor': 'audio',\n", " 'num_mels': 80, # In general, you don'tneed to change it \n", " 'num_freq': 1025, # In general, you don'tneed to change it \n", " 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n", " 'frame_length_ms': 50, # In general, you don'tneed to change it \n", " 'frame_shift_ms': 12.5, # In general, you don'tneed to change it \n", " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n", " 'min_level_db': -100,\n", " 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n", " 'power': 1.5, # Change this value and listen the synthesized voice. 1.2 - 1.5 are some resonable values.\n", " 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n", " 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n", " 'symmetric_norm': False, # Same as above\n", " 'max_norm': 1, # Same as above\n", " 'clip_norm': True, # Same as above\n", " 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", " 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", " 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", "\n", "AP = AudioProcessor(**audio);" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "from librosa import display\n", "from matplotlib import pylab as plt\n", "import IPython\n", "plt.rcParams['figure.figsize'] = (20.0, 16.0)\n", "\n", "def compare_values(attribute, values, file):\n", " \"\"\"\n", " attributes (str): the names of the attribute you like to test.\n", " values (list): list of values to compare.\n", " file (str): file name to perform the tests.\n", " \"\"\"\n", " wavs = []\n", " for idx, val in enumerate(values):\n", " set_val_cmd = \"AP.{}={}\".format(attribute, val)\n", " exec(set_val_cmd)\n", " wav = AP.load_wav(file)\n", " spec = AP.spectrogram(wav)\n", " spec_norm = AP.denormalize(spec.T)\n", " plt.subplot(len(values), 2, 2*idx + 1)\n", " plt.imshow(spec_norm.T, aspect=\"auto\", origin=\"lower\")\n", " # plt.colorbar()\n", " plt.tight_layout()\n", " wav_gen = AP.inv_spectrogram(spec)\n", " wavs.append(wav_gen)\n", " plt.subplot(len(values), 2, 2*idx + 2)\n", " display.waveplot(wav, alpha=0.5)\n", " display.waveplot(wav_gen, alpha=0.25)\n", " plt.title(\"{}={}\".format(attribute, val))\n", " plt.tight_layout()\n", " \n", " wav = AP.load_wav(file)\n", " print(\" > Ground-truth\")\n", " IPython.display.display(IPython.display.Audio(wav, rate=AP.sample_rate))\n", " \n", " for idx, wav_gen in enumerate(wavs):\n", " val = values[idx]\n", " print(\" > {} = {}\".format(attribute, val))\n", " IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99], file_paths[10])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "Collapsed": "false" }, "outputs": [], "source": [ "compare_values(\"ref_level_db\", [10, 15, 20, 25, 30, 35, 40], file_paths[10])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }