switch to gr.Audio for output display
app.py
CHANGED
@@ -10,7 +10,7 @@ from meldataset import mel_spectrogram, MAX_WAV_VALUE
 from models import BigVGAN as Generator
 import librosa
 import numpy as np
-from utils import plot_spectrogram
+from utils import plot_spectrogram
 import PIL

 if torch.cuda.is_available():
@@ -43,16 +43,21 @@ def inference_gradio(input, model_choice): # input is audio waveform in [T, cha
     if len(audio.shape) == 2: # stereo
         audio = librosa.to_mono(audio) # convert to mono if stereo
     audio = librosa.util.normalize(audio) * 0.95
-    output, spec_gen = inference_model(audio, h, model) # output is generated audio in ndarray

-    spec_plot_gen = plot_spectrogram(spec_gen)
+    output, spec_gen = inference_model(
+        audio, h, model
+    ) # output is generated audio in ndarray, int16

-    ...
-    output_image_gen = PIL.Image.frombytes('RGB',
-                                           spec_plot_gen.canvas.get_width_height(),
-                                           spec_plot_gen.canvas.tostring_rgb())
+    spec_plot_gen = plot_spectrogram(spec_gen)

-    ...
+    output_audio = (h.sampling_rate, output) # tuple for gr.Audio output
+    output_image = PIL.Image.frombytes(
+        "RGB",
+        spec_plot_gen.canvas.get_width_height(),
+        spec_plot_gen.canvas.tostring_rgb(),
+    )
+
+    return output_audio, output_image


 @spaces.GPU(duration=120)
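
Why the tuple: gr.Audio accepts a (sampling_rate, numpy_array) pair and renders it as playable audio, so no intermediate file has to be written. Below is a minimal self-contained sketch of that contract; the tone() helper is hypothetical and stands in for the vocoder, and MAX_WAV_VALUE = 32768.0 mirrors the constant imported from meldataset:

import numpy as np
import gradio as gr

MAX_WAV_VALUE = 32768.0  # int16 full scale, as in meldataset

def tone(frequency=440.0):
    # Hypothetical stand-in for inference_model: 1 s sine at 24 kHz.
    sr = 24000
    t = np.arange(sr) / sr
    wav = 0.5 * np.sin(2 * np.pi * frequency * t)  # float in [-1, 1]
    wav_int16 = (wav * MAX_WAV_VALUE).astype("int16")
    return (sr, wav_int16)  # the tuple gr.Audio understands

demo = gr.Interface(fn=tone, inputs=gr.Number(value=440.0), outputs=gr.Audio())

if __name__ == "__main__":
    demo.launch()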
@@ -61,8 +66,8 @@ def inference_model(audio_input, h, model):
     model.to(device)

     def get_mel(x):
-        ...
-        ...
+        return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
+
     with torch.inference_mode():
         wav = torch.FloatTensor(audio_input)
         # compute mel spectrogram from the ground truth audio
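
The inlined get_mel simply forwards the STFT and mel settings from the model config h to meldataset.mel_spectrogram. For intuition, here is a rough torchaudio equivalent (a sketch under stated assumptions: meldataset uses its own padding and librosa-style mel filters, so values will not match it exactly):

import torch
import torchaudio

def log_mel(wav: torch.Tensor, h) -> torch.Tensor:
    # wav: [B, T] float waveform in [-1, 1]; returns [B, num_mels, frames]
    mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=h.sampling_rate,
        n_fft=h.n_fft,
        win_length=h.win_size,
        hop_length=h.hop_size,
        f_min=h.fmin,
        f_max=h.fmax,
        n_mels=h.num_mels,
    )(wav)
    return torch.log(torch.clamp(mel, min=1e-5))  # dynamic-range compression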
@@ -72,15 +77,16 @@ def inference_model(audio_input, h, model):

     audio_gen = y_g_hat.squeeze().cpu()
     spec_gen = get_mel(audio_gen.unsqueeze(0))
-    audio_gen = audio_gen ...
-    audio_gen = audio_gen ...
+    audio_gen = audio_gen.numpy() # [T], float [-1, 1]
+    audio_gen = (audio_gen * MAX_WAV_VALUE).astype("int16") # [T], int16
+    spec_gen = spec_gen.squeeze().numpy() # [C, T_frame]

     # unload to cpu
-    model.to('cpu')
+    model.to("cpu")
     # delete gpu tensor
     del spec_gt, y_g_hat
-
-    return audio_gen, spec_gen
+
+    return audio_gen, spec_gen


 css = """
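
Two things happen in this hunk: the waveform is rescaled from float [-1, 1] to int16, which gr.Audio plays back without further conversion, and the model is moved back to CPU so the shared GPU is free between requests. A small sketch of that unload pattern; the empty_cache() call is an optional extra that is not in the diff:

import torch

def release_gpu(model: torch.nn.Module) -> None:
    model.to("cpu")  # move weights off the accelerator
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # hand cached allocator blocks back to the driver

Deleting the leftover GPU tensors (del spec_gt, y_g_hat above) matters as well, since empty_cache() can only release memory that no live tensor still references.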
@@ -222,9 +228,9 @@ css = """

 ######################## script for loading the models ########################

-...
+MODEL_PATH = "nvidia/BigVGAN"

-list_model_name = [
+LIST_MODEL_NAME = [
     "bigvgan_24khz_100band",
     "bigvgan_base_24khz_100band",
     "bigvgan_22khz_80band",
@@ -236,7 +242,7 @@ list_model_name = [
     "bigvgan_v2_44khz_128band_512x"
 ]

-model_files = {
+DICT_MODEL_NAME_FILE_PAIRS = {
     "bigvgan_24khz_100band": "g_05000000",
     "bigvgan_base_24khz_100band": "g_05000000",
     "bigvgan_22khz_80band": "g_05000000",
@@ -251,9 +257,9 @@ model_files = {
 list_model = []
 list_config = []

-for model_name in list_model_name:
-    model_file = hf_hub_download(...)
-    config_file = hf_hub_download(...)
+for model_name in LIST_MODEL_NAME:
+    model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
+    config_file = hf_hub_download(MODEL_PATH, f"{model_name}/config.json", use_auth_token=os.environ['TOKEN'])

     with open(config_file) as f:
         data = f.read()
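
Each loop iteration resolves one checkpoint plus its config through the Hub cache. Unrolled for a single model it looks like this (a sketch; TOKEN must be set in the Space secrets, and hf_hub_download returns a local cache path instead of re-downloading on every run):

import os
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    "nvidia/BigVGAN",
    "bigvgan_24khz_100band/g_05000000",
    use_auth_token=os.environ["TOKEN"],
)
config_path = hf_hub_download(
    "nvidia/BigVGAN",
    "bigvgan_24khz_100band/config.json",
    use_auth_token=os.environ["TOKEN"],
)
print(ckpt_path, config_path)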
@@ -314,24 +320,29 @@ with iface:
     )

     with gr.Group():
-        model_choice = gr.Radio(
-            ...
+        model_choice = gr.Radio(
+            label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
+            value="bigvgan_v2_24khz_100band_256x",
+            choices=[m for m in LIST_MODEL_NAME],
+            type="index",
+            interactive=True,
+        )
+
+        audio_input = gr.Audio(
+            label="Input Audio", elem_id="input-audio", interactive=True
+        )
+
         button = gr.Button("Submit")
-        ...
-        button.click(
-            ...
+
+        output_audio = gr.Audio(label="Output Audio", elem_id="output-audio")
+        output_image = gr.Image(label="Output Mel Spectrogram", elem_id="output-image-gen")
+
+        button.click(
+            inference_gradio,
+            inputs=[audio_input, model_choice],
+            outputs=[output_audio, output_image],
+            concurrency_limit=10,
+        )

     gr.Examples(
         [
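
The wiring above is standard Gradio Blocks: declare the components inside the layout, then bind the handler with button.click, where concurrency_limit caps how many requests may run the GPU handler at once. A runnable skeleton of the same shape, with a stub in place of inference_gradio:

import gradio as gr

def stub(audio, model_idx):
    # Placeholder handler: echo the input audio, return no spectrogram image.
    return audio, None

with gr.Blocks() as demo:
    with gr.Group():
        model_choice = gr.Radio(choices=["model_a", "model_b"], value="model_a", type="index")
        audio_input = gr.Audio(label="Input Audio")
        button = gr.Button("Submit")
        output_audio = gr.Audio(label="Output Audio")
        output_image = gr.Image(label="Output Mel Spectrogram")

    button.click(
        stub,
        inputs=[audio_input, model_choice],
        outputs=[output_audio, output_image],
        concurrency_limit=10,
    )

if __name__ == "__main__":
    demo.launch()

With type="index", the Radio passes the selected position rather than the label string, which is how the app indexes into list_model and list_config.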
@@ -347,7 +358,7 @@ with iface:
         ],
         fn=inference_gradio,
         inputs=[audio_input, model_choice],
-        outputs=[...]
+        outputs=[output_audio, output_image]
     )

     gr.HTML(
@@ -355,12 +366,12 @@ with iface:
         <table border="1" cellspacing="0" cellpadding="5">
             <thead>
                 <tr>
-                    <th>...</th>
+                    <th>Model Name</th>
                     <th>Sampling Rate</th>
                     <th>Mel band</th>
                     <th>fmax</th>
                     <th>Upsampling Ratio</th>
-                    <th>...</th>
+                    <th>Parameters</th>
                     <th>Dataset</th>
                     <th>Fine-Tuned</th>
                 </tr>