Spaces:
Running
on
L4
Running
on
L4
haoheliu
committed on
Commit
·
4e9d8a1
1
Parent(s):
c55c219
two output to one output
Browse files
- app.py +8 -4
- audioldm/clap/open_clip/model.py +2 -0
- audioldm/ldm.py +13 -12
- audioldm/pipeline.py +4 -4
- requirements.txt +1 -0
app.py
CHANGED
@@ -30,7 +30,10 @@ def text2audio(text, duration, guidance_scale, random_seed, n_candidates):
|
|
30 |
waveform = text_to_audio(audioldm, text, random_seed, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=int(n_candidates)) # [bs, 1, samples]
|
31 |
waveform = [(16000, wave[0]) for wave in waveform]
|
32 |
# waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
|
33 |
-
|
|
|
|
|
|
|
34 |
|
35 |
# iface = gr.Interface(fn=text2audio, inputs=[
|
36 |
# gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
|
@@ -71,13 +74,14 @@ with iface:
|
|
71 |
############# Input
|
72 |
textbox = gr.Textbox(value="A hammer is hitting a wooden surface", max_lines=1)
|
73 |
|
74 |
-
with gr.Accordion("Click to
|
75 |
seed = gr.Number(value=42, label="Change this value (any integer number) will lead to a different generation result.")
|
76 |
duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
|
77 |
guidance_scale = gr.Slider(0, 5, value=2.5, step=0.5, label="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)")
|
78 |
n_candidates = gr.Slider(1, 5, value=3, step=1, label="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation")
|
79 |
############# Output
|
80 |
-
outputs=[gr.Audio(label="Output", type="numpy")
|
|
|
81 |
|
82 |
btn = gr.Button("Submit").style(full_width=True)
|
83 |
btn.click(text2audio, inputs=[textbox, duration, guidance_scale, seed, n_candidates], outputs=outputs)
|
@@ -89,6 +93,6 @@ with iface:
|
|
89 |
</div>
|
90 |
''')
|
91 |
|
92 |
-
iface.queue(concurrency_count=2)
|
93 |
iface.launch(debug=True)
|
94 |
# iface.launch(debug=True, share=True)
|
|
|
30 |
waveform = text_to_audio(audioldm, text, random_seed, duration=duration, guidance_scale=guidance_scale, n_candidate_gen_per_text=int(n_candidates)) # [bs, 1, samples]
|
31 |
waveform = [(16000, wave[0]) for wave in waveform]
|
32 |
# waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
|
33 |
+
if(len(waveform) == 1):
|
34 |
+
return waveform[0]
|
35 |
+
else:
|
36 |
+
return waveform
|
37 |
|
38 |
# iface = gr.Interface(fn=text2audio, inputs=[
|
39 |
# gr.Textbox(value="A man is speaking in a huge room", max_lines=1),
|
|
|
74 |
############# Input
|
75 |
textbox = gr.Textbox(value="A hammer is hitting a wooden surface", max_lines=1)
|
76 |
|
77 |
+
with gr.Accordion("Click to modify detailed configurations", open=False):
|
78 |
seed = gr.Number(value=42, label="Change this value (any integer number) will lead to a different generation result.")
|
79 |
duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
|
80 |
guidance_scale = gr.Slider(0, 5, value=2.5, step=0.5, label="Guidance scale (Large => better quality and relavancy to text; Small => better diversity)")
|
81 |
n_candidates = gr.Slider(1, 5, value=3, step=1, label="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation")
|
82 |
############# Output
|
83 |
+
outputs=[gr.Audio(label="Output", type="numpy")]
|
84 |
+
# outputs=[gr.Audio(label="Output", type="numpy"), gr.Audio(label="Output", type="numpy")]
|
85 |
|
86 |
btn = gr.Button("Submit").style(full_width=True)
|
87 |
btn.click(text2audio, inputs=[textbox, duration, guidance_scale, seed, n_candidates], outputs=outputs)
|
|
|
93 |
</div>
|
94 |
''')
|
95 |
|
96 |
+
iface.queue(concurrency_count = 2)
|
97 |
iface.launch(debug=True)
|
98 |
# iface.launch(debug=True, share=True)
|
audioldm/clap/open_clip/model.py
CHANGED
@@ -745,6 +745,8 @@ class CLAP(nn.Module):
|
|
745 |
device = next(self.parameters()).device
|
746 |
for k in data:
|
747 |
data[k] = data[k].to(device)
|
|
|
|
|
748 |
text_embeds = self.encode_text(data, device=device)
|
749 |
text_embeds = F.normalize(text_embeds, dim=-1)
|
750 |
|
|
|
745 |
device = next(self.parameters()).device
|
746 |
for k in data:
|
747 |
data[k] = data[k].to(device)
|
748 |
+
if(len(data[k].size()) < 2):
|
749 |
+
data[k] = data[k].unsqueeze(0)
|
750 |
text_embeds = self.encode_text(data, device=device)
|
751 |
text_embeds = F.normalize(text_embeds, dim=-1)
|
752 |
|
audioldm/ldm.py
CHANGED
@@ -697,18 +697,19 @@ class LatentDiffusion(DDPM):
|
|
697 |
|
698 |
waveform = self.mel_spectrogram_to_waveform(mel)
|
699 |
|
700 |
-
|
701 |
-
|
702 |
-
|
|
|
703 |
|
704 |
-
|
705 |
-
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
return waveform
|
|
|
697 |
|
698 |
waveform = self.mel_spectrogram_to_waveform(mel)
|
699 |
|
700 |
+
if(waveform.shape[0] > 1):
|
701 |
+
similarity = self.cond_stage_model.cos_similarity(
|
702 |
+
torch.FloatTensor(waveform).squeeze(1), text
|
703 |
+
)
|
704 |
|
705 |
+
best_index = []
|
706 |
+
for i in range(z.shape[0]):
|
707 |
+
candidates = similarity[i :: z.shape[0]]
|
708 |
+
max_index = torch.argmax(candidates).item()
|
709 |
+
best_index.append(i + max_index * z.shape[0])
|
710 |
|
711 |
+
waveform = waveform[best_index]
|
712 |
+
# print("Similarity between generated audio and text", similarity)
|
713 |
+
# print("Choose the following indexes:", best_index)
|
714 |
+
|
715 |
return waveform
|
audioldm/pipeline.py
CHANGED
@@ -12,10 +12,10 @@ from audioldm.utils import default_audioldm_config
|
|
12 |
|
13 |
import time
|
14 |
|
15 |
-
def make_batch_for_text_to_audio(text, batchsize=
|
16 |
text = [text] * batchsize
|
17 |
-
if batchsize <
|
18 |
-
print("Warning: Batchsize must be at least
|
19 |
fbank = torch.zeros((batchsize, 1024, 64)) # Not used, here to keep the code format
|
20 |
stft = torch.zeros((batchsize, 1024, 512)) # Not used
|
21 |
waveform = torch.zeros((batchsize, 160000)) # Not used
|
@@ -63,7 +63,7 @@ def build_model(config=None):
|
|
63 |
def duration_to_latent_t_size(duration):
|
64 |
return int(duration * 25.6)
|
65 |
|
66 |
-
def text_to_audio(latent_diffusion, text, seed=42, duration=10, batchsize=
|
67 |
seed_everything(int(seed))
|
68 |
batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
|
69 |
|
|
|
12 |
|
13 |
import time
|
14 |
|
15 |
+
def make_batch_for_text_to_audio(text, batchsize=1):
|
16 |
text = [text] * batchsize
|
17 |
+
if batchsize < 1:
|
18 |
+
print("Warning: Batchsize must be at least 1. Batchsize is set to .")
|
19 |
fbank = torch.zeros((batchsize, 1024, 64)) # Not used, here to keep the code format
|
20 |
stft = torch.zeros((batchsize, 1024, 512)) # Not used
|
21 |
waveform = torch.zeros((batchsize, 160000)) # Not used
|
|
|
63 |
def duration_to_latent_t_size(duration):
|
64 |
return int(duration * 25.6)
|
65 |
|
66 |
+
def text_to_audio(latent_diffusion, text, seed=42, duration=10, batchsize=1, guidance_scale=2.5, n_candidate_gen_per_text=3, config=None):
|
67 |
seed_everything(int(seed))
|
68 |
batch = make_batch_for_text_to_audio(text, batchsize=batchsize)
|
69 |
|
requirements.txt
CHANGED
@@ -11,6 +11,7 @@ numpy<=1.23.5
|
|
11 |
soundfile
|
12 |
librosa
|
13 |
pandas
|
|
|
14 |
torchlibrosa
|
15 |
transformers
|
16 |
ftfy
|
|
|
11 |
soundfile
|
12 |
librosa
|
13 |
pandas
|
14 |
+
# transformers
|
15 |
torchlibrosa
|
16 |
transformers
|
17 |
ftfy
|