L0SG committed on
Commit 3455431
1 Parent(s): ec3f86c

switch to gr.Audio for output display
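
For context: the previous output path rendered the waveform as a video via gr.make_waveform, while this commit returns a plain (sample_rate, ndarray) tuple, which gr.Audio accepts directly. A minimal sketch of that output convention, separate from app.py (the function and demo names are illustrative):

import numpy as np
import gradio as gr

def generate(_):
    sr = 24000  # sampling rate, e.g. h.sampling_rate in app.py
    t = np.linspace(0, 1, sr, endpoint=False)
    # int16 PCM, the same dtype inference_model now returns
    wav = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype("int16")
    return (sr, wav)  # gr.Audio renders a (sample_rate, ndarray) tuple

demo = gr.Interface(fn=generate, inputs=gr.Textbox(), outputs=gr.Audio())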

Files changed (1)
  1. app.py +52 -41
app.py CHANGED
@@ -10,7 +10,7 @@ from meldataset import mel_spectrogram, MAX_WAV_VALUE
 from models import BigVGAN as Generator
 import librosa
 import numpy as np
-from utils import plot_spectrogram, plot_spectrogram_clipped
+from utils import plot_spectrogram
 import PIL
 
 if torch.cuda.is_available():
@@ -43,16 +43,21 @@ def inference_gradio(input, model_choice): # input is audio waveform in [T, channel]
     if len(audio.shape) == 2: # stereo
         audio = librosa.to_mono(audio) # convert to mono if stereo
     audio = librosa.util.normalize(audio) * 0.95
-    output, spec_gen = inference_model(audio, h, model) # output is generated audio in ndarray
+    output, spec_gen = inference_model(
+        audio, h, model
+    ) # output is generated audio in ndarray, int16
 
-    spec_plot_gen = plot_spectrogram(spec_gen.numpy())
+    spec_plot_gen = plot_spectrogram(spec_gen)
 
-    output_video = gr.make_waveform((h.sampling_rate, output))
-    output_image_gen = PIL.Image.frombytes('RGB',
-                                           spec_plot_gen.canvas.get_width_height(),
-                                           spec_plot_gen.canvas.tostring_rgb())
+
+    output_audio = (h.sampling_rate, output) # tuple for gr.Audio output
+    output_image = PIL.Image.frombytes(
+        "RGB",
+        spec_plot_gen.canvas.get_width_height(),
+        spec_plot_gen.canvas.tostring_rgb(),
+    )
 
-    return output_video, output_image_gen
+    return output_audio, output_image
 
 
 @spaces.GPU(duration=120)
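
The PIL conversion kept above assumes plot_spectrogram returns a rendered matplotlib figure, whose canvas buffer PIL.Image.frombytes wraps without any temporary file. A standalone sketch of the same pattern, with a random array standing in for the mel spectrogram:

import numpy as np
import matplotlib
matplotlib.use("Agg")  # off-screen rendering, as on a server
import matplotlib.pyplot as plt
import PIL.Image

fig, ax = plt.subplots()
ax.imshow(np.random.rand(100, 80), aspect="auto", origin="lower")
fig.canvas.draw()  # tostring_rgb() requires a drawn canvas
img = PIL.Image.frombytes("RGB", fig.canvas.get_width_height(), fig.canvas.tostring_rgb())

Note that tostring_rgb() only works on a drawn canvas, so plot_spectrogram presumably draws the figure before returning it.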
@@ -61,8 +66,8 @@ def inference_model(audio_input, h, model):
     model.to(device)
 
     def get_mel(x):
-        return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
-
+        return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
+
     with torch.inference_mode():
         wav = torch.FloatTensor(audio_input)
         # compute mel spectrogram from the ground truth audio
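
get_mel wraps meldataset.mel_spectrogram, the HiFi-GAN-style mel used for training, to recompute a spectrogram from the generated audio for display. For illustration only, a roughly comparable computation with librosa (the parameter values are assumed, in the style of the 24 kHz / 100-band config, and librosa's mel is not numerically identical to meldataset's):

import numpy as np
import librosa

sr, n_fft, num_mels, hop_size, win_size, fmin, fmax = 24000, 1024, 100, 256, 1024, 0, 12000

audio = np.zeros(sr, dtype=np.float32)  # stand-in for the generated waveform
mel = librosa.feature.melspectrogram(
    y=audio, sr=sr, n_fft=n_fft, hop_length=hop_size, win_length=win_size,
    n_mels=num_mels, fmin=fmin, fmax=fmax, power=1.0,
)
log_mel = np.log(np.clip(mel, 1e-5, None))  # log compression as in HiFi-GAN-style mels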
@@ -72,15 +77,16 @@ def inference_model(audio_input, h, model):
 
         audio_gen = y_g_hat.squeeze().cpu()
         spec_gen = get_mel(audio_gen.unsqueeze(0))
-        audio_gen = audio_gen * MAX_WAV_VALUE
-        audio_gen = audio_gen.numpy().astype('int16')
+        audio_gen = audio_gen.numpy() # [T], float [-1, 1]
+        audio_gen = (audio_gen * MAX_WAV_VALUE).astype("int16") # [T], int16
+        spec_gen = spec_gen.squeeze().numpy() # [C, T_frame]
 
     # unload to cpu
-    model.to('cpu')
+    model.to("cpu")
     # delete gpu tensor
     del spec_gt, y_g_hat
-
-    return audio_gen, spec_gen[0]
+
+    return audio_gen, spec_gen
 
 
 css = """
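
The reordered lines above make the output dtype explicit: the generator emits float audio in [-1, 1], and scaling by MAX_WAV_VALUE before the cast yields 16-bit PCM. A small check of the convention (MAX_WAV_VALUE = 32768.0 is an assumption here, matching HiFi-GAN-style meldataset modules):

import numpy as np

MAX_WAV_VALUE = 32768.0  # assumed; app.py imports the real value from meldataset

audio_float = np.array([0.0, 0.5, -0.5], dtype=np.float32)  # model output range [-1, 1]
audio_int16 = (audio_float * MAX_WAV_VALUE).astype("int16")  # -> [0, 16384, -16384]

A sample at exactly 1.0 would scale to 32768 and overflow int16, which is one reason pipelines like this keep some headroom (cf. the librosa.util.normalize(audio) * 0.95 on the input).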
@@ -222,9 +228,9 @@ css = """
 
 ######################## script for loading the models ########################
 
-model_path = "nvidia/BigVGAN"
+MODEL_PATH = "nvidia/BigVGAN"
 
-list_model_name = [
+LIST_MODEL_NAME = [
     "bigvgan_24khz_100band",
     "bigvgan_base_24khz_100band",
     "bigvgan_22khz_80band",
@@ -236,7 +242,7 @@ list_model_name = [
     "bigvgan_v2_44khz_128band_512x"
 ]
 
-model_files = {
+DICT_MODEL_NAME_FILE_PAIRS = {
     "bigvgan_24khz_100band": "g_05000000",
     "bigvgan_base_24khz_100band": "g_05000000",
     "bigvgan_22khz_80band": "g_05000000",
@@ -251,9 +257,9 @@ model_files = {
 list_model = []
 list_config = []
 
-for model_name in list_model_name:
-    model_file = hf_hub_download(model_path, f"{model_name}/{model_files[model_name]}", use_auth_token=os.environ['TOKEN'])
-    config_file = hf_hub_download(model_path, f"{model_name}/config.json", use_auth_token=os.environ['TOKEN'])
+for model_name in LIST_MODEL_NAME:
+    model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
+    config_file = hf_hub_download(MODEL_PATH, f"{model_name}/config.json", use_auth_token=os.environ['TOKEN'])
 
     with open(config_file) as f:
         data = f.read()
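
The loop above fetches each checkpoint and config with huggingface_hub's hf_hub_download, which downloads a single file from a repo and caches it locally; TOKEN is the Space secret used to authenticate. A minimal sketch for one model, mirroring the loop:

import os
from huggingface_hub import hf_hub_download

model_file = hf_hub_download(
    "nvidia/BigVGAN", "bigvgan_24khz_100band/g_05000000",
    use_auth_token=os.environ["TOKEN"],  # same token handling as app.py
)
config_file = hf_hub_download(
    "nvidia/BigVGAN", "bigvgan_24khz_100band/config.json",
    use_auth_token=os.environ["TOKEN"],
)

Repeat calls are cheap: once cached, hf_hub_download returns the local path without re-downloading.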
@@ -314,24 +320,29 @@ with iface:
         )
 
         with gr.Group():
-            model_choice = gr.Radio(label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
-                                    value="bigvgan_v2_24khz_100band_256x",
-                                    choices=[m for m in list_model_name],
-                                    type="index",
-                                    interactive=True)
-            audio_input = gr.Audio(label="Input Audio",
-                                   elem_id="input-audio",
-                                   interactive=True)
+            model_choice = gr.Radio(
+                label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
+                value="bigvgan_v2_24khz_100band_256x",
+                choices=[m for m in LIST_MODEL_NAME],
+                type="index",
+                interactive=True,
+            )
+
+            audio_input = gr.Audio(
+                label="Input Audio", elem_id="input-audio", interactive=True
+            )
+
             button = gr.Button("Submit")
-            output_video = gr.Video(label="Output Audio",
-                                    elem_id="output-video")
-            output_image_gen = gr.Image(label="Output Mel Spectrogram",
-                                        elem_id="output-image-gen")
-            button.click(inference_gradio,
-                         inputs=[audio_input, model_choice],
-                         outputs=[output_video, output_image_gen],
-                         concurrency_limit=10
-                         )
+
+            output_audio = gr.Audio(label="Output Audio", elem_id="output-audio")
+            output_image = gr.Image(label="Output Mel Spectrogram", elem_id="output-image-gen")
+
+            button.click(
+                inference_gradio,
+                inputs=[audio_input, model_choice],
+                outputs=[output_audio, output_image],
+                concurrency_limit=10,
+            )
 
     gr.Examples(
         [
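
One detail in the rebuilt gr.Radio above: type="index" makes the click handler receive the selected position rather than the label string, which is how inference_gradio can index into list_model and list_config. A minimal illustration (the handler and labels are illustrative, not from app.py):

import gradio as gr

choices = ["model_a", "model_b", "model_c"]

def show(idx):
    # with type="index", idx is the int position within choices
    return f"selected #{idx}: {choices[idx]}"

with gr.Blocks() as demo:
    radio = gr.Radio(choices=choices, type="index", value="model_a", label="Model")
    out = gr.Textbox(label="Selection")
    radio.change(show, inputs=radio, outputs=out)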
@@ -347,7 +358,7 @@ with iface:
         ],
         fn=inference_gradio,
         inputs=[audio_input, model_choice],
-        outputs=[output_video, output_image_gen]
+        outputs=[output_audio, output_image]
     )
 
     gr.HTML(
@@ -355,12 +366,12 @@
         <table border="1" cellspacing="0" cellpadding="5">
         <thead>
             <tr>
-                <th>Folder Name</th>
+                <th>Model Name</th>
                 <th>Sampling Rate</th>
                 <th>Mel band</th>
                 <th>fmax</th>
                 <th>Upsampling Ratio</th>
-                <th>Params.</th>
+                <th>Parameters</th>
                 <th>Dataset</th>
                 <th>Fine-Tuned</th>
             </tr>
 