soujanyaporia committed: "Update app.py"

app.py CHANGED
@@ -8,8 +8,167 @@ from huggingface_hub import snapshot_download
 from models import AudioDiffusion, DDPMScheduler
 from audioldm.audio.stft import TacotronSTFT
 from audioldm.variational_autoencoder import AutoencoderKL
+from pydub import AudioSegment
 from gradio import Markdown

+import torch
+#from diffusers.models.autoencoder_kl import AutoencoderKL
+from diffusers.models.unet_2d_condition import UNet2DConditionModel
+from diffusers import DiffusionPipeline, AudioPipelineOutput
+from transformers import CLIPTextModel, T5EncoderModel, AutoModel, T5Tokenizer, T5TokenizerFast
+from typing import Union
+from diffusers.utils.torch_utils import randn_tensor
+from tqdm import tqdm
+
+
+class Tango2Pipeline(DiffusionPipeline):
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: T5EncoderModel,
+        tokenizer: Union[T5Tokenizer, T5TokenizerFast],
+        unet: UNet2DConditionModel,
+        scheduler: DDPMScheduler
+    ):
+
+        super().__init__()
+
+        self.register_modules(vae=vae,
+                              text_encoder=text_encoder,
+                              tokenizer=tokenizer,
+                              unet=unet,
+                              scheduler=scheduler
+                              )
+
+    def _encode_prompt(self, prompt):
+        device = self.text_encoder.device
+
+        batch = self.tokenizer(
+            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
+        )
+        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
+
+        encoder_hidden_states = self.text_encoder(
+            input_ids=input_ids, attention_mask=attention_mask
+        )[0]
+
+        boolean_encoder_mask = (attention_mask == 1).to(device)
+
+        return encoder_hidden_states, boolean_encoder_mask
+
+    def _encode_text_classifier_free(self, prompt, num_samples_per_prompt):
+        device = self.text_encoder.device
+        batch = self.tokenizer(
+            prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
+        )
+        input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
+
+        with torch.no_grad():
+            prompt_embeds = self.text_encoder(
+                input_ids=input_ids, attention_mask=attention_mask
+            )[0]
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
+        attention_mask = attention_mask.repeat_interleave(num_samples_per_prompt, 0)
+
+        # get unconditional embeddings for classifier-free guidance
+        uncond_tokens = [""] * len(prompt)
+
+        max_length = prompt_embeds.shape[1]
+        uncond_batch = self.tokenizer(
+            uncond_tokens, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt",
+        )
+        uncond_input_ids = uncond_batch.input_ids.to(device)
+        uncond_attention_mask = uncond_batch.attention_mask.to(device)
+
+        with torch.no_grad():
+            negative_prompt_embeds = self.text_encoder(
+                input_ids=uncond_input_ids, attention_mask=uncond_attention_mask
+            )[0]
+
+        negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
+        uncond_attention_mask = uncond_attention_mask.repeat_interleave(num_samples_per_prompt, 0)
+
+        # For classifier-free guidance we need two forward passes.
+        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes.
+        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+        prompt_mask = torch.cat([uncond_attention_mask, attention_mask])
+        boolean_prompt_mask = (prompt_mask == 1).to(device)
+
+        return prompt_embeds, boolean_prompt_mask
+
+    def prepare_latents(self, batch_size, inference_scheduler, num_channels_latents, dtype, device):
+        shape = (batch_size, num_channels_latents, 256, 16)
+        latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * inference_scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def inference(self, prompt, inference_scheduler, num_steps=20, guidance_scale=3, num_samples_per_prompt=1,
+                  disable_progress=True):
+        device = self.text_encoder.device
+        classifier_free_guidance = guidance_scale > 1.0
+        batch_size = len(prompt) * num_samples_per_prompt
+
+        if classifier_free_guidance:
+            prompt_embeds, boolean_prompt_mask = self._encode_text_classifier_free(prompt, num_samples_per_prompt)
+        else:
+            prompt_embeds, boolean_prompt_mask = self._encode_prompt(prompt)
+            prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
+            boolean_prompt_mask = boolean_prompt_mask.repeat_interleave(num_samples_per_prompt, 0)
+
+        inference_scheduler.set_timesteps(num_steps, device=device)
+        timesteps = inference_scheduler.timesteps
+
+        num_channels_latents = self.unet.config.in_channels
+        latents = self.prepare_latents(batch_size, inference_scheduler, num_channels_latents, prompt_embeds.dtype, device)
+
+        num_warmup_steps = len(timesteps) - num_steps * inference_scheduler.order
+        progress_bar = tqdm(range(num_steps), disable=disable_progress)
+
+        for i, t in enumerate(timesteps):
+            # expand the latents if we are doing classifier-free guidance
+            latent_model_input = torch.cat([latents] * 2) if classifier_free_guidance else latents
+            latent_model_input = inference_scheduler.scale_model_input(latent_model_input, t)
+
+            noise_pred = self.unet(
+                latent_model_input, t, encoder_hidden_states=prompt_embeds,
+                encoder_attention_mask=boolean_prompt_mask
+            ).sample
+
+            # perform guidance
+            if classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = inference_scheduler.step(noise_pred, t, latents).prev_sample
+
+            # update the progress bar
+            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % inference_scheduler.order == 0):
+                progress_bar.update(1)
+
+        return latents
+
+    @torch.no_grad()
+    def __call__(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
+        """ Generate audio for a single prompt string. """
+        with torch.no_grad():
+            latents = self.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
+            mel = self.vae.decode_first_stage(latents)
+            wave = self.vae.decode_to_waveform(mel)
+
+        return AudioPipelineOutput(audios=wave)
+

 # Automatic device detection
 if torch.cuda.is_available():
@@ -20,7 +179,7 @@ else:
     device_selection = "cpu"

 class Tango:
-    def __init__(self, name="declare-lab/
+    def __init__(self, name="declare-lab/tango2", device=device_selection):

         path = snapshot_download(repo_id=name)

@@ -53,13 +212,13 @@ class Tango:
         for i in range(0, len(lst), n):
             yield lst[i:i + n]

-    def generate(self, prompt, steps=100, guidance=3, samples=
+    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
         """ Generate audio for a single prompt string. """
         with torch.no_grad():
             latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
             mel = self.vae.decode_first_stage(latents)
             wave = self.vae.decode_to_waveform(mel)
-        return wave
+        return wave[0]

     def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
         """ Generate audio for a list of prompt strings. """
@@ -82,22 +241,29 @@ tango = Tango(device="cpu")
 tango.vae.to(device_type)
 tango.stft.to(device_type)
 tango.model.to(device_type)
-
-@spaces.GPU(duration=120)
-def gradio_generate(prompt, steps, guidance):
-    output_wave = tango.generate(prompt, steps, guidance)
-    # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
-    return

+pipe = Tango2Pipeline(vae=tango.vae,
+                      text_encoder=tango.model.text_encoder,
+                      tokenizer=tango.model.tokenizer,
+                      unet=tango.model.unet,
+                      scheduler=tango.scheduler
+                      )
+
+
+@spaces.GPU(duration=60)
+def gradio_generate(prompt, output_format, steps, guidance):
+    output_wave = pipe(prompt, steps, guidance)  # calling through the pipeline automatically uses flash attention on torch >= 2.0
+    #output_wave = tango.generate(prompt, steps, guidance)
+    # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
+    output_wave = output_wave.audios[0]
+    output_filename = "temp.wav"
+    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
+
+    if (output_format == "mp3"):
+        AudioSegment.from_wav("temp.wav").export("temp.mp3", format="mp3")
+        output_filename = "temp.mp3"
+
+    return output_filename

 # description_text = """
 # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
@@ -119,29 +285,24 @@ def gradio_generate(prompt, steps, guidance):
 # <p/>
 # """
 description_text = """
-<p><a href="https://huggingface.co/spaces/declare-lab/
-Generate audio using
-<br/>
-<br/>
-As TANGO consists of an instruction-tuned LLM, it is able to process complex sound descriptions allowing us to provide more detailed instructions to improve the generation quality.
-For example, ``A boat is moving on the sea'' vs ``The sound of the water lapping against the hull of the boat or splashing as you move through the waves''. The latter is obtained by prompting ChatGPT to explain the sound generated when a boat moves on the sea.
-Using this ChatGPT-generated description of the sound, TANGO provides superior results.
+<p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
+Generate audio using Tango2 by providing a text prompt. Tango2 was built from Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>
+<br/><br/> This is the demo for Tango2 for text-to-audio generation: <a href="https://arxiv.org/abs/2404.09956">Read our paper.</a>
 <p/>
 """
 # Gradio input and output components
 input_text = gr.Textbox(lines=2, label="Prompt")
-output_audio_3 = gr.Audio(label="Generated Audio #3/3", type="filepath")
+output_format = gr.Radio(label="Output format", info="The file you can download", choices=["mp3", "wav"], value="wav")
+output_audio = gr.Audio(label="Generated Audio", type="filepath")
 denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
 guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)

 # Gradio interface
 gr_interface = gr.Interface(
     fn=gradio_generate,
-    inputs=[input_text, denoising_steps, guidance_scale],
-    outputs=[
-    title="Tango: Text-to-Audio
+    inputs=[input_text, output_format, denoising_steps, guidance_scale],
+    outputs=[output_audio],
+    title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
     description=description_text,
     allow_flagging=False,
     examples=[
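
A minimal sketch of how the Tango2Pipeline introduced above can be driven outside Gradio, assuming the Tango helper class and the declare-lab/tango2 checkpoint load exactly as in app.py; the prompt and output file names are illustrative:

import wavio
from pydub import AudioSegment

# Load checkpoints the same way app.py does, then wrap them in the new pipeline.
tango = Tango(name="declare-lab/tango2", device="cpu")
pipe = Tango2Pipeline(vae=tango.vae,
                      text_encoder=tango.model.text_encoder,
                      tokenizer=tango.model.tokenizer,
                      unet=tango.model.unet,
                      scheduler=tango.scheduler)

# Defaults mirror the Gradio sliders: 100 denoising steps, guidance scale 3.
output = pipe("A boat moving on the sea", steps=100, guidance=3)
wave = output.audios[0]  # AudioPipelineOutput holds the generated waveforms

wavio.write("sample.wav", wave, rate=16000, sampwidth=2)
# Optional mp3 export, as gradio_generate does when output_format == "mp3".
AudioSegment.from_wav("sample.wav").export("sample.mp3", format="mp3")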