L0SG committed on
Commit eac4c42
1 Parent(s): 3455431
Files changed (1)
  1. app.py +33 -25
app.py CHANGED
@@ -35,8 +35,8 @@ def inference_gradio(input, model_choice): # input is audio waveform in [T, cha
     audio = np.transpose(audio) # transpose to [channel, T] for librosa
     audio = audio / MAX_WAV_VALUE # convert int16 to float range used by BigVGAN
 
-    h = list_config[model_choice]
-    model = list_model[model_choice]
+    h = dict_config[model_choice]
+    model = dict_model[model_choice]
 
     if sr != h.sampling_rate: # convert audio to model's sampling rate
         audio = librosa.resample(audio, orig_sr=sr, target_sr=h.sampling_rate)
@@ -254,8 +254,8 @@ DICT_MODEL_NAME_FILE_PAIRS = {
     "bigvgan_v2_44khz_128band_512x": "g_03000000"
 }
 
-list_model = []
-list_config = []
+dict_model = {}
+dict_config = {}
 
 for model_name in LIST_MODEL_NAME:
     model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
@@ -275,8 +275,8 @@ for model_name in LIST_MODEL_NAME:
     generator.eval()
     generator.remove_weight_norm()
 
-    list_model.append(generator)
-    list_config.append(h)
+    dict_model[model_name] = generator
+    dict_config[model_name] = h
 
 ######################## script for gradio UI ########################
 
@@ -285,29 +285,29 @@ iface = gr.Blocks(css=css)
 with iface:
     gr.HTML(
         """
-        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-        <div
-            style="
-            display: inline-flex;
-            align-items: center;
-            gap: 0.8rem;
-            font-size: 1.75rem;
-            "
-        >
-            <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
-            BigVGAN: A Universal Neural Vocoder with Large-Scale Training
-            </h1>
-        </div>
-        <p style="margin-bottom: 10px; font-size: 94%">
-            <a href="https://arxiv.org/abs/2206.04658">[Paper]</a> <a href="https://github.com/NVIDIA/BigVGAN">[Code]</a> <a href="https://bigvgan-demo.github.io/">[Demo]</a> <a href="https://research.nvidia.com/labs/adlr/projects/bigvgan/">[Project page]</a>
-        </p>
+        <div style="text-align: center; max-width: 900px; margin: 0 auto;">
+        <div
+            style="
+            display: inline-flex;
+            align-items: center;
+            gap: 0.8rem;
+            font-size: 1.75rem;
+            "
+        >
+            <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
+            BigVGAN: A Universal Neural Vocoder with Large-Scale Training
+            </h1>
         </div>
+        <p style="margin-bottom: 10px; font-size: 125%">
+            <a href="https://arxiv.org/abs/2206.04658">[Paper]</a> <a href="https://github.com/NVIDIA/BigVGAN">[Code]</a> <a href="https://bigvgan-demo.github.io/">[Demo]</a> <a href="https://research.nvidia.com/labs/adlr/projects/bigvgan/">[Project page]</a>
+        </p>
+        </div>
         """
     )
     gr.HTML(
         """
         <div>
-        <h2>News</h2>
+        <h3>News</h3>
         <p>[Jul 2024] We release BigVGAN-v2 along with pretrained checkpoints. Below are the highlights:</p>
         <ul>
             <li>Custom CUDA kernel for inference: we provide a fused upsampling + activation kernel written in CUDA for accelerated inference speed. Our test shows 1.5 - 3x faster speed on a single A100 GPU.</li>
@@ -318,13 +318,21 @@ with iface:
         </div>
         """
     )
+    gr.HTML(
+        """
+        <div>
+        <h3>Model Overview</h3>
+        BigVGAN is a neural vocoder model that generates audio waveforms using mel spectrogram as inputs.
+        <center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800" style="margin-top: 20px;"></center>
+        </div>
+        """
+    )
 
     with gr.Group():
-        model_choice = gr.Radio(
+        model_choice = gr.Dropdown(
            label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
            value="bigvgan_v2_24khz_100band_256x",
            choices=[m for m in LIST_MODEL_NAME],
-           type="index",
            interactive=True,
        )
 
 
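Below is a minimal, standalone sketch of the selection pattern the updated code uses: a gr.Dropdown returns the chosen model name as a string (its default type="value"), which then keys directly into dict_model / dict_config, so no type="index" mapping is needed. The model names, placeholder objects, and the describe handler here are illustrative only, not the Space's actual loading or inference code.

import gradio as gr

# Illustrative subset of model names (stand-ins for LIST_MODEL_NAME in app.py)
MODEL_NAMES = ["bigvgan_v2_24khz_100band_256x", "bigvgan_v2_44khz_128band_512x"]

# Placeholders standing in for the loaded generators and their configs
dict_model = {name: f"<generator:{name}>" for name in MODEL_NAMES}
dict_config = {name: {"sampling_rate": 24000 if "24khz" in name else 44100} for name in MODEL_NAMES}

def describe(model_choice):
    # model_choice arrives as the selected name string, so it can be used as a dict key
    h = dict_config[model_choice]
    model = dict_model[model_choice]
    return f"{model} expects audio at {h['sampling_rate']} Hz"

with gr.Blocks() as demo:
    model_choice = gr.Dropdown(
        label="Select the model",
        value=MODEL_NAMES[0],
        choices=MODEL_NAMES,
        interactive=True,
    )
    out = gr.Textbox(label="Selected model")
    model_choice.change(describe, inputs=model_choice, outputs=out)

if __name__ == "__main__":
    demo.launch()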