Spaces:
Running
Running
File size: 8,006 Bytes
45ee559 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
{
"cells": [{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"This notebook is to test attention performance of a TTS model on a list of sentences taken from DeepVoice paper.\n",
"### Features of this notebook\n",
"- You can see visually how your model performs on each sentence and try to dicern common problems.\n",
"- At the end, final attention score would be printed showing the ultimate performace of your model. You can use this value to perform model selection.\n",
"- You can change the list of sentences byt providing a different sentence file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false",
"scrolled": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os, sys\n",
"import torch \n",
"import time\n",
"import numpy as np\n",
"from matplotlib import pylab as plt\n",
"\n",
"%pylab inline\n",
"plt.rcParams[\"figure.figsize\"] = (16,5)\n",
"\n",
"import librosa\n",
"import librosa.display\n",
"\n",
"from TTS.tts.layers import *\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.io import load_config\n",
"from TTS.tts.utils.text import text_to_sequence\n",
"from TTS.tts.utils.synthesis import synthesis\n",
"from TTS.tts.utils.visual import plot_alignment\n",
"from TTS.tts.utils.measures import alignment_diagonal_score\n",
"\n",
"import IPython\n",
"from IPython.display import Audio\n",
"\n",
"os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
"\n",
"def tts(model, text, CONFIG, use_cuda, ap):\n",
" t_1 = time.time()\n",
" # run the model\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, True)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" # plotting\n",
" attn_score = alignment_diagonal_score(torch.FloatTensor(alignment).unsqueeze(0))\n",
" print(f\" > {text}\")\n",
" IPython.display.display(IPython.display.Audio(waveform, rate=ap.sample_rate))\n",
" fig = plot_alignment(alignment, fig_size=(8, 5))\n",
" IPython.display.display(fig)\n",
" #saving results\n",
" os.makedirs(OUT_FOLDER, exist_ok=True)\n",
" file_name = text[:200].replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n",
" out_path = os.path.join(OUT_FOLDER, file_name)\n",
" ap.save_wav(waveform, out_path)\n",
" return attn_score\n",
"\n",
"# Set constants\n",
"ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n",
"MODEL_PATH = ROOT_PATH + '/best_model.pth'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = './hard_sentences/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",
"SENTENCES_PATH = 'sentences.txt'\n",
"use_cuda = True\n",
"\n",
"# Set some config fields manually for testing\n",
"# CONFIG.windowing = False\n",
"# CONFIG.prenet_dropout = False\n",
"# CONFIG.separate_stopnet = True\n",
"CONFIG.use_forward_attn = False\n",
"# CONFIG.forward_attn_mask = True\n",
"# CONFIG.stopnet = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# LOAD TTS MODEL\n",
"from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n",
"\n",
"# multi speaker \n",
"if CONFIG.use_speaker_embedding:\n",
" speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n",
" speakers_idx_to_id = {v: k for k, v in speakers.items()}\n",
"else:\n",
" speakers = []\n",
" speaker_id = None\n",
"\n",
"# if the vocabulary was passed, replace the default\n",
"if 'characters' in CONFIG.keys():\n",
" symbols, phonemes = make_symbols(**CONFIG.characters)\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, len(speakers), CONFIG)\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(**CONFIG.audio) \n",
"\n",
"\n",
"# load model state\n",
"if use_cuda:\n",
" cp = torch.load(MODEL_PATH)\n",
"else:\n",
" cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n",
"\n",
"# load the model\n",
"model.load_state_dict(cp['model'])\n",
"if use_cuda:\n",
" model.cuda()\n",
"model.eval()\n",
"print(cp['step'])\n",
"print(cp['r'])\n",
"\n",
"# set model stepsize\n",
"if 'r' in cp:\n",
" model.decoder.set_r(cp['r'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"model.decoder.max_decoder_steps=3000\n",
"attn_scores = []\n",
"with open(SENTENCES_PATH, 'r') as f:\n",
" for text in f:\n",
" attn_score = tts(model, text, CONFIG, use_cuda, ap)\n",
" attn_scores.append(attn_score)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"np.mean(attn_scores)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
} |