update
Browse files
app.py
CHANGED
@@ -35,8 +35,8 @@ def inference_gradio(input, model_choice): # input is audio waveform in [T, channel]
|
|
35 |
audio = np.transpose(audio) # transpose to [channel, T] for librosa
|
36 |
audio = audio / MAX_WAV_VALUE # convert int16 to float range used by BigVGAN
|
37 |
|
38 |
-
h =
|
39 |
-
model =
|
40 |
|
41 |
if sr != h.sampling_rate: # convert audio to model's sampling rate
|
42 |
audio = librosa.resample(audio, orig_sr=sr, target_sr=h.sampling_rate)
|
@@ -254,8 +254,8 @@ DICT_MODEL_NAME_FILE_PAIRS = {
|
|
254 |
"bigvgan_v2_44khz_128band_512x": "g_03000000"
|
255 |
}
|
256 |
|
257 |
-
|
258 |
-
|
259 |
|
260 |
for model_name in LIST_MODEL_NAME:
|
261 |
model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
|
@@ -275,8 +275,8 @@ for model_name in LIST_MODEL_NAME:
|
|
275 |
generator.eval()
|
276 |
generator.remove_weight_norm()
|
277 |
|
278 |
-
|
279 |
-
|
280 |
|
281 |
######################## script for gradio UI ########################
|
282 |
|
@@ -285,29 +285,29 @@ iface = gr.Blocks(css=css)
|
|
285 |
with iface:
|
286 |
gr.HTML(
|
287 |
"""
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
</div>
|
301 |
-
<p style="margin-bottom: 10px; font-size: 94%">
|
302 |
-
<a href="https://arxiv.org/abs/2206.04658">[Paper]</a> <a href="https://github.com/NVIDIA/BigVGAN">[Code]</a> <a href="https://bigvgan-demo.github.io/">[Demo]</a> <a href="https://research.nvidia.com/labs/adlr/projects/bigvgan/">[Project page]</a>
|
303 |
-
</p>
|
304 |
</div>
|
|
|
|
|
|
|
|
|
305 |
"""
|
306 |
)
|
307 |
gr.HTML(
|
308 |
"""
|
309 |
<div>
|
310 |
-
<
|
311 |
<p>[Jul 2024] We release BigVGAN-v2 along with pretrained checkpoints. Below are the highlights:</p>
|
312 |
<ul>
|
313 |
<li>Custom CUDA kernel for inference: we provide a fused upsampling + activation kernel written in CUDA for accelerated inference speed. Our test shows 1.5 - 3x faster speed on a single A100 GPU.</li>
|
@@ -318,13 +318,21 @@ with iface:
|
|
318 |
</div>
|
319 |
"""
|
320 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
|
322 |
with gr.Group():
|
323 |
-
model_choice = gr.
|
324 |
label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
|
325 |
value="bigvgan_v2_24khz_100band_256x",
|
326 |
choices=[m for m in LIST_MODEL_NAME],
|
327 |
-
type="index",
|
328 |
interactive=True,
|
329 |
)
|
330 |
|
|
|
35 |
audio = np.transpose(audio) # transpose to [channel, T] for librosa
|
36 |
audio = audio / MAX_WAV_VALUE # convert int16 to float range used by BigVGAN
|
37 |
|
38 |
+
h = dict_config[model_choice]
|
39 |
+
model = dict_model[model_choice]
|
40 |
|
41 |
if sr != h.sampling_rate: # convert audio to model's sampling rate
|
42 |
audio = librosa.resample(audio, orig_sr=sr, target_sr=h.sampling_rate)
|
|
|
254 |
"bigvgan_v2_44khz_128band_512x": "g_03000000"
|
255 |
}
|
256 |
|
257 |
+
dict_model = {}
|
258 |
+
dict_config = {}
|
259 |
|
260 |
for model_name in LIST_MODEL_NAME:
|
261 |
model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
|
|
|
275 |
generator.eval()
|
276 |
generator.remove_weight_norm()
|
277 |
|
278 |
+
dict_model[model_name] = generator
|
279 |
+
dict_config[model_name] = h
|
280 |
|
281 |
######################## script for gradio UI ########################
|
282 |
|
|
|
285 |
with iface:
|
286 |
gr.HTML(
|
287 |
"""
|
288 |
+
<div style="text-align: center; max-width: 900px; margin: 0 auto;">
|
289 |
+
<div
|
290 |
+
style="
|
291 |
+
display: inline-flex;
|
292 |
+
align-items: center;
|
293 |
+
gap: 0.8rem;
|
294 |
+
font-size: 1.75rem;
|
295 |
+
"
|
296 |
+
>
|
297 |
+
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
|
298 |
+
BigVGAN: A Universal Neural Vocoder with Large-Scale Training
|
299 |
+
</h1>
|
|
|
|
|
|
|
|
|
300 |
</div>
|
301 |
+
<p style="margin-bottom: 10px; font-size: 125%">
|
302 |
+
<a href="https://arxiv.org/abs/2206.04658">[Paper]</a> <a href="https://github.com/NVIDIA/BigVGAN">[Code]</a> <a href="https://bigvgan-demo.github.io/">[Demo]</a> <a href="https://research.nvidia.com/labs/adlr/projects/bigvgan/">[Project page]</a>
|
303 |
+
</p>
|
304 |
+
</div>
|
305 |
"""
|
306 |
)
|
307 |
gr.HTML(
|
308 |
"""
|
309 |
<div>
|
310 |
+
<h3>News</h3>
|
311 |
<p>[Jul 2024] We release BigVGAN-v2 along with pretrained checkpoints. Below are the highlights:</p>
|
312 |
<ul>
|
313 |
<li>Custom CUDA kernel for inference: we provide a fused upsampling + activation kernel written in CUDA for accelerated inference speed. Our test shows 1.5 - 3x faster speed on a single A100 GPU.</li>
|
|
|
318 |
</div>
|
319 |
"""
|
320 |
)
|
321 |
+
gr.HTML(
|
322 |
+
"""
|
323 |
+
<div>
|
324 |
+
<h3>Model Overview</h3>
|
325 |
+
BigVGAN is a neural vocoder model that generates audio waveforms using mel spectrogram as inputs.
|
326 |
+
<center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800" style="margin-top: 20px;"></center>
|
327 |
+
</div>
|
328 |
+
"""
|
329 |
+
)
|
330 |
|
331 |
with gr.Group():
|
332 |
+
model_choice = gr.Dropdown(
|
333 |
label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
|
334 |
value="bigvgan_v2_24khz_100band_256x",
|
335 |
choices=[m for m in LIST_MODEL_NAME],
|
|
|
336 |
interactive=True,
|
337 |
)
|
338 |
|