Serhiy Stetskovych committed on
Commit
37f9a5d
1 Parent(s): 39cc8c4

App with new vocoder

Browse files
Files changed (3) hide show
  1. app.py +33 -28
  2. prompt.wav +0 -0
  3. prompt22050.wav +0 -0
app.py CHANGED
@@ -28,15 +28,17 @@ from vocos import Vocos
28
 
29
  PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=649.ckpt'
30
  #PFLOW_MODEL_PATH = 'checkpoint_m_epoch=054.ckpt'
31
- VOCODER_MODEL_PATH = 'checkpoints/pytorch_model.bin'
 
 
32
  HIFIGAN_MODEL_PATH = 'checkpoints/g_00120000'
33
 
34
 
35
  transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
36
- wav, sr = torchaudio.load('prompt.wav')
37
 
38
  prompt = mel_spectrogram(
39
- wav,
40
  1024,
41
  80,
42
  22050,
@@ -85,7 +87,7 @@ def load_vocos(checkpoint_path, config_path, device):
85
 
86
 
87
  def to_waveform(mel, vocoder, denoiser=None):
88
- return vocoder.decode(mel).cpu().squeeze()
89
 
90
  # audio = vocoder(mel).clamp(-1, 1)
91
  # if denoiser is not None:
@@ -113,9 +115,10 @@ model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
113
  _ = model.eval()
114
 
115
 
116
- #hifigan = load_hifigan(HIFIGAN_MODEL_PATH, device)
117
- vocos = load_vocos(VOCODER_MODEL_PATH, 'config.yaml', device)
118
- #vocos_44100 = load_vocos('checkpoints/vocos_checkpoint_epoch=4_step=93440_val_loss=5.2596_44100_10.ckpt', 'vocos.yaml', device)
 
119
  denoiser = None#Denoiser(vocoder, mode="zeros")
120
 
121
 
@@ -134,23 +137,25 @@ def synthesise(text, speed):
134
  length_scale=1/speed,
135
  prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
136
 
137
- guidance_scale=1.5
138
 
139
  )
140
- waveform_vocos = vocos.decode(output["mel"]).cpu().squeeze()
141
- #waveform_vocos_44100 = vocos_44100.decode(output["mel"]).cpu().squeeze()
142
- #waveform_hifigan = hifigan(output["mel"]).clamp(-1, 1).cpu().squeeze()
143
- #transform = torchaudio.transforms.Vol(gain=-18, gain_type="db")
144
 
145
 
146
- return text_processed['x_phones'][1::2], (22050, waveform_vocos.numpy())
147
 
148
 
149
  description = f'''
150
  # Експериментальна апка для генерації аудіо з тексту.
151
 
152
  pflow checkpoint {PFLOW_MODEL_PATH}
153
- vocoder: Vocos - {VOCODER_MODEL_PATH}
 
 
154
  '''
155
 
156
 
@@ -164,28 +169,28 @@ if __name__ == "__main__":
164
  ],
165
  outputs=[
166
  gr.Text(label='Фонемізований текст:', lines=5),
167
- # gr.Audio(
168
- # label="Vocos 44100 аудіо:",
169
- # autoplay=False,
170
- # streaming=False,
171
- # type="numpy",
172
- # ),
173
  gr.Audio(
174
- label="Vocos аудіо:",
 
 
 
 
 
 
175
  autoplay=False,
176
  streaming=False,
177
  type="numpy",
178
  ),
179
- # gr.Audio(
180
- # label="HIFIGAN аудіо:",
181
- # autoplay=False,
182
- # streaming=False,
183
- # type="numpy",
184
- # )
185
 
186
  ],
187
  allow_flagging ='manual',
188
- flagging_options=[("Якщо дуже погоне аудіо, тисни цю кнопку.", "negative")],
189
  cache_examples=True,
190
  title='',
191
  # description=description,
 
28
 
29
  PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=649.ckpt'
30
  #PFLOW_MODEL_PATH = 'checkpoint_m_epoch=054.ckpt'
31
+ VOCODER22_MODEL_PATH = 'BSC-LT/vocos-mel-22khz'
32
+ VOCODER44_MODEL_PATH = 'patriotyk/vocos-mel-hifigan-compat-44100khz'
33
+
34
  HIFIGAN_MODEL_PATH = 'checkpoints/g_00120000'
35
 
36
 
37
  transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
38
+ wav, sr = torchaudio.load('prompt22050.wav')
39
 
40
  prompt = mel_spectrogram(
41
+ transform(wav),
42
  1024,
43
  80,
44
  22050,
 
87
 
88
 
89
  def to_waveform(mel, vocoder, denoiser=None):
90
+ return vocoder.decode(mel).clamp(-1, 1).cpu().squeeze()
91
 
92
  # audio = vocoder(mel).clamp(-1, 1)
93
  # if denoiser is not None:
 
115
  _ = model.eval()
116
 
117
 
118
+ hifigan = load_hifigan(HIFIGAN_MODEL_PATH, device)
119
+ vocos_22050 = Vocos.from_pretrained(VOCODER22_MODEL_PATH)
120
+ #vocos_44100 = load_vocos('checkpoints/vocos_checkpoint_epoch=209_step=3924480_val_loss=3.7036_44100_11.ckpt', 'vocos.yaml', device)
121
+ vocos_44100 = Vocos.from_pretrained(VOCODER44_MODEL_PATH)
122
  denoiser = None#Denoiser(vocoder, mode="zeros")
123
 
124
 
 
137
  length_scale=1/speed,
138
  prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
139
 
140
+ guidance_scale=2.0
141
 
142
  )
143
+ waveform_vocos = vocos_22050.decode(output["mel"]).cpu().squeeze()
144
+ waveform_vocos_44100 = vocos_44100.decode(output["mel"]).cpu().squeeze()
145
+ waveform_hifigan = hifigan(output["mel"]).clamp(-1, 1).cpu().squeeze()
146
+ transform = torchaudio.transforms.Vol(gain=-18, gain_type="db")
147
 
148
 
149
+ return text_processed['x_phones'][1::2], (44100, waveform_vocos_44100.numpy()), (22050, waveform_vocos.numpy()), (22050, transform(waveform_hifigan).numpy())
150
 
151
 
152
  description = f'''
153
  # Експериментальна апка для генерації аудіо з тексту.
154
 
155
  pflow checkpoint {PFLOW_MODEL_PATH}
156
+ Vocos 44100 аудіо - {VOCODER44_MODEL_PATH}
157
+ Vocos 22050 аудіо - {VOCODER22_MODEL_PATH}
158
+ HIFIGAN 22050 аудіо - {HIFIGAN_MODEL_PATH}
159
  '''
160
 
161
 
 
169
  ],
170
  outputs=[
171
  gr.Text(label='Фонемізований текст:', lines=5),
 
 
 
 
 
 
172
  gr.Audio(
173
+ label="Vocos 44100 аудіо:",
174
+ autoplay=False,
175
+ streaming=False,
176
+ type="numpy",
177
+ ),
178
+ gr.Audio(
179
+ label="Vocos 22050 аудіо:",
180
  autoplay=False,
181
  streaming=False,
182
  type="numpy",
183
  ),
184
+ gr.Audio(
185
+ label="HIFIGAN 22050 аудіо:",
186
+ autoplay=False,
187
+ streaming=False,
188
+ type="numpy",
189
+ )
190
 
191
  ],
192
  allow_flagging ='manual',
193
+ #flagging_options=[("Якщо дуже погоне аудіо, тисни цю кнопку.", "negative")],
194
  cache_examples=True,
195
  title='',
196
  # description=description,
prompt.wav DELETED
Binary file (112 kB)
 
prompt22050.wav ADDED
Binary file (655 kB). View file