Spaces:
Runtime error
Runtime error
haoheliu
committed on
Commit
·
39711bd
1
Parent(s):
4eab478
try out UI design
Browse files
- app.py +53 -46
- audioldm/latent_diffusion/ddim.py +3 -0
- audioldm/ldm.py +5 -7
- audioldm/pipeline.py +6 -3
app.py
CHANGED
@@ -1,55 +1,62 @@
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
-
|
4 |
|
5 |
-
|
6 |
-
# waveform = text_to_audio(text, n_gen=1) # [bs, 1, samples]
|
7 |
-
# waveform = [(16000, wave[0]) for wave in waveform]
|
8 |
-
waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
|
9 |
-
return waveform
|
10 |
|
11 |
-
|
12 |
-
#
|
|
|
|
|
|
|
|
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
block = gr.Blocks()
|
16 |
|
17 |
-
with block:
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
|
55 |
-
block.launch(debug=True)
|
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
+
from audioldm import text_to_audio, seed_everything, build_model
|
4 |
|
5 |
+
audioldm = build_model()
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
def text2audio(text, duration, guidance_scale):
|
8 |
+
# print(text, length, guidance_scale)
|
9 |
+
waveform = text_to_audio(audioldm, text, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=1) # [bs, 1, samples]
|
10 |
+
waveform = [(16000, wave[0]) for wave in waveform]
|
11 |
+
# waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
|
12 |
+
return waveform
|
13 |
|
14 |
+
iface = gr.Interface(fn=text2audio, inputs=[
|
15 |
+
gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
|
16 |
+
gr.Slider(2, 15, value=5, step=0.1),
|
17 |
+
gr.Slider(0, 5, value=2.5, step=0.5),
|
18 |
+
], outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
|
19 |
+
)
|
20 |
+
iface.launch(share=True)
|
21 |
|
22 |
+
# block = gr.Blocks()
|
23 |
|
24 |
+
# with block:
|
25 |
+
# gr.HTML(
|
26 |
+
# """
|
27 |
+
# <div style="text-align: center; max-width: 700px; margin: 0 auto;">
|
28 |
+
# <div
|
29 |
+
# style="
|
30 |
+
# display: inline-flex;
|
31 |
+
# align-items: center;
|
32 |
+
# gap: 0.8rem;
|
33 |
+
# font-size: 1.75rem;
|
34 |
+
# "
|
35 |
+
# >
|
36 |
+
# <h1 style="font-weight: 900; margin-bottom: 7px;">
|
37 |
+
# Text-to-Audio Generation with AudioLDM
|
38 |
+
# </h1>
|
39 |
+
# </div>
|
40 |
+
# <p style="margin-bottom: 10px; font-size: 94%">
|
41 |
+
# <a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project page]</a>
|
42 |
+
# </p>
|
43 |
+
# </div>
|
44 |
+
# """
|
45 |
+
# )
|
46 |
+
# with gr.Group():
|
47 |
+
# with gr.Box():
|
48 |
+
# textbox = gr.Textbox(value="A man is speaking in a huge room")
|
49 |
+
# length = gr.Slider(1.0, 30.0, value=5.0, step=0.5, label="Audio length in seconds")
|
50 |
+
# # model = gr.Dropdown(choices=["harmonai/maestro-150k"], value="harmonai/maestro-150k",type="value", label="Model")
|
51 |
+
# out = [gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
|
52 |
+
# btn = gr.Button("Submit").style(full_width=True)
|
53 |
|
54 |
+
# btn.click(text2audio, inputs=[textbox, length], outputs=out)
|
55 |
+
# gr.HTML('''
|
56 |
+
# <div class="footer" style="text-align: center; max-width: 700px; margin: 0 auto;">
|
57 |
+
# <p>Model by <a href="https://haoheliu.github.io/" style="text-decoration: underline;" target="_blank">Haohe Liu</a>
|
58 |
+
# </p>
|
59 |
+
# </div>
|
60 |
+
# ''')
|
61 |
|
62 |
+
# block.launch(debug=True)
|
audioldm/latent_diffusion/ddim.py
CHANGED
@@ -10,6 +10,7 @@ from audioldm.latent_diffusion.util import (
|
|
10 |
noise_like,
|
11 |
extract_into_tensor,
|
12 |
)
|
|
|
13 |
|
14 |
class DDIMSampler(object):
|
15 |
def __init__(self, model, schedule="linear", **kwargs):
|
@@ -200,6 +201,7 @@ class DDIMSampler(object):
|
|
200 |
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
201 |
# print(f"Running DDIM Sampling with {total_steps} timesteps")
|
202 |
|
|
|
203 |
iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps)
|
204 |
|
205 |
for i, step in enumerate(iterator):
|
@@ -281,6 +283,7 @@ class DDIMSampler(object):
|
|
281 |
total_steps = timesteps.shape[0]
|
282 |
# print(f"Running DDIM Sampling with {total_steps} timesteps")
|
283 |
|
|
|
284 |
iterator = tqdm(time_range, desc="Decoding image", total=total_steps)
|
285 |
x_dec = x_latent
|
286 |
|
|
|
10 |
noise_like,
|
11 |
extract_into_tensor,
|
12 |
)
|
13 |
+
import gradio as gr
|
14 |
|
15 |
class DDIMSampler(object):
|
16 |
def __init__(self, model, schedule="linear", **kwargs):
|
|
|
201 |
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
202 |
# print(f"Running DDIM Sampling with {total_steps} timesteps")
|
203 |
|
204 |
+
# iterator = gr.Progress().tqdm(time_range, desc="DDIM Sampler", total=total_steps)
|
205 |
iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps)
|
206 |
|
207 |
for i, step in enumerate(iterator):
|
|
|
283 |
total_steps = timesteps.shape[0]
|
284 |
# print(f"Running DDIM Sampling with {total_steps} timesteps")
|
285 |
|
286 |
+
# iterator = gr.Progress().tqdm(time_range, desc="Decoding image", total=total_steps)
|
287 |
iterator = tqdm(time_range, desc="Decoding image", total=total_steps)
|
288 |
x_dec = x_latent
|
289 |
|
audioldm/ldm.py
CHANGED
@@ -636,7 +636,7 @@ class LatentDiffusion(DDPM):
|
|
636 |
ddim_steps=200,
|
637 |
ddim_eta=1.0,
|
638 |
x_T=None,
|
639 |
-
|
640 |
unconditional_guidance_scale=1.0,
|
641 |
unconditional_conditioning=None,
|
642 |
name="waveform",
|
@@ -644,7 +644,7 @@ class LatentDiffusion(DDPM):
|
|
644 |
save=False,
|
645 |
**kwargs,
|
646 |
):
|
647 |
-
# Generate
|
648 |
# Batch: audio, text, fnames
|
649 |
assert x_T is None
|
650 |
try:
|
@@ -672,17 +672,15 @@ class LatentDiffusion(DDPM):
|
|
672 |
text = super().get_input(batch, "text")
|
673 |
|
674 |
# Generate multiple samples
|
675 |
-
batch_size = z.shape[0] *
|
676 |
-
c = torch.cat([c] *
|
677 |
-
text = text *
|
678 |
|
679 |
if unconditional_guidance_scale != 1.0:
|
680 |
unconditional_conditioning = (
|
681 |
self.cond_stage_model.get_unconditional_condition(batch_size)
|
682 |
)
|
683 |
|
684 |
-
fnames = list(super().get_input(batch, "fname"))
|
685 |
-
|
686 |
samples, _ = self.sample_log(
|
687 |
cond=c,
|
688 |
batch_size=batch_size,
|
|
|
636 |
ddim_steps=200,
|
637 |
ddim_eta=1.0,
|
638 |
x_T=None,
|
639 |
+
n_candidate_gen_per_text=1,
|
640 |
unconditional_guidance_scale=1.0,
|
641 |
unconditional_conditioning=None,
|
642 |
name="waveform",
|
|
|
644 |
save=False,
|
645 |
**kwargs,
|
646 |
):
|
647 |
+
# Generate n_candidate_gen_per_text times and select the best
|
648 |
# Batch: audio, text, fnames
|
649 |
assert x_T is None
|
650 |
try:
|
|
|
672 |
text = super().get_input(batch, "text")
|
673 |
|
674 |
# Generate multiple samples
|
675 |
+
batch_size = z.shape[0] * n_candidate_gen_per_text
|
676 |
+
c = torch.cat([c] * n_candidate_gen_per_text, dim=0)
|
677 |
+
text = text * n_candidate_gen_per_text
|
678 |
|
679 |
if unconditional_guidance_scale != 1.0:
|
680 |
unconditional_conditioning = (
|
681 |
self.cond_stage_model.get_unconditional_condition(batch_size)
|
682 |
)
|
683 |
|
|
|
|
|
684 |
samples, _ = self.sample_log(
|
685 |
cond=c,
|
686 |
batch_size=batch_size,
|
audioldm/pipeline.py
CHANGED
@@ -29,7 +29,7 @@ def make_batch_for_text_to_audio(text, batchsize=2):
|
|
29 |
)
|
30 |
return batch
|
31 |
|
32 |
-
def
|
33 |
if(torch.cuda.is_available()):
|
34 |
device = torch.device("cuda:0")
|
35 |
else:
|
@@ -57,13 +57,16 @@ def text_to_audio(text, batchsize=2, guidance_scale=2.5, n_gen=1, config=None):
|
|
57 |
latent_diffusion = latent_diffusion.to(device)
|
58 |
|
59 |
latent_diffusion.cond_stage_model.embed_mode = "text"
|
|
|
60 |
|
61 |
-
batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
|
62 |
|
|
|
|
|
63 |
with torch.no_grad():
|
64 |
waveform = latent_diffusion.generate_sample(
|
65 |
[batch],
|
66 |
unconditional_guidance_scale=guidance_scale,
|
67 |
-
|
|
|
68 |
)
|
69 |
return waveform
|
|
|
29 |
)
|
30 |
return batch
|
31 |
|
32 |
+
def build_model(config=None):
|
33 |
if(torch.cuda.is_available()):
|
34 |
device = torch.device("cuda:0")
|
35 |
else:
|
|
|
57 |
latent_diffusion = latent_diffusion.to(device)
|
58 |
|
59 |
latent_diffusion.cond_stage_model.embed_mode = "text"
|
60 |
+
return latent_diffusion
|
61 |
|
|
|
62 |
|
63 |
+
def text_to_audio(latent_diffusion, text, duration=10, batchsize=2, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
|
64 |
+
batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
|
65 |
with torch.no_grad():
|
66 |
waveform = latent_diffusion.generate_sample(
|
67 |
[batch],
|
68 |
unconditional_guidance_scale=guidance_scale,
|
69 |
+
n_candidate_gen_per_text=n_candidate_gen_per_text,
|
70 |
+
duration=duration
|
71 |
)
|
72 |
return waveform
|