soujanyaporia committed on
Commit 39533a5 · verified · 1 Parent(s): fe39952

Update app.py

Files changed (1)
  1. app.py +189 -28
app.py CHANGED
@@ -8,8 +8,167 @@ from huggingface_hub import snapshot_download
8
  from models import AudioDiffusion, DDPMScheduler
9
  from audioldm.audio.stft import TacotronSTFT
10
  from audioldm.variational_autoencoder import AutoencoderKL
11
  from gradio import Markdown
12
 
13
 
14
  # Automatic device detection
15
  if torch.cuda.is_available():
@@ -20,7 +179,7 @@ else:
20
  device_selection = "cpu"
21
 
22
  class Tango:
23
- def __init__(self, name="declare-lab/tango-full-ft-audiocaps", device=device_selection):
24
 
25
  path = snapshot_download(repo_id=name)
26
 
@@ -53,13 +212,13 @@ class Tango:
53
  for i in range(0, len(lst), n):
54
  yield lst[i:i + n]
55
 
56
- def generate(self, prompt, steps=100, guidance=3, samples=3, disable_progress=True):
57
  """ Generate audio for a single prompt string. """
58
  with torch.no_grad():
59
  latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
60
  mel = self.vae.decode_first_stage(latents)
61
  wave = self.vae.decode_to_waveform(mel)
62
- return wave
63
 
64
  def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
65
  """ Generate audio for a list of prompt strings. """
@@ -82,22 +241,29 @@ tango = Tango(device="cpu")
82
  tango.vae.to(device_type)
83
  tango.stft.to(device_type)
84
  tango.model.to(device_type)
85
-
86
- @spaces.GPU(duration=120)
87
- def gradio_generate(prompt, steps, guidance):
88
- output_wave = tango.generate(prompt, steps, guidance)
89
- # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
90
 
91
- output_filename_1 = "tmp1_.wav"
92
- wavio.write(output_filename_1, output_wave[0], rate=16000, sampwidth=2)
93
 
94
- output_filename_2 = "tmp2_.wav"
95
- wavio.write(output_filename_2, output_wave[1], rate=16000, sampwidth=2)
96
 
97
- output_filename_3 = "tmp3_.wav"
98
- wavio.write(output_filename_3, output_wave[2], rate=16000, sampwidth=2)
99
 
100
- return [output_filename_1, output_filename_2, output_filename_3]
101
 
102
  # description_text = """
103
  # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
@@ -119,29 +285,24 @@ def gradio_generate(prompt, steps, guidance):
119
  # <p/>
120
  # """
121
  description_text = """
122
- <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
123
- Generate audio using TANGO by providing a text prompt.
124
- <br/>
125
- <br/>
126
- As TANGO consists of an instruction-tuned LLM, it is able to process complex sound descriptions allowing us to provide more detailed instructions to improve the generation quality.
127
- For example, ``A boat is moving on the sea'' vs ``The sound of the water lapping against the hull of the boat or splashing as you move through the waves''. The latter is obtained by prompting ChatGPT to explain the sound generated when a boat moves on the sea.
128
- Using this ChatGPT-generated description of the sound, TANGO provides superior results.
129
  <p/>
130
  """
131
  # Gradio input and output components
132
  input_text = gr.Textbox(lines=2, label="Prompt")
133
- output_audio_1 = gr.Audio(label="Generated Audio #1/3", type="filepath")
134
- output_audio_2 = gr.Audio(label="Generated Audio #2/3", type="filepath")
135
- output_audio_3 = gr.Audio(label="Generated Audio #3/3", type="filepath")
136
  denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
137
  guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
138
 
139
  # Gradio interface
140
  gr_interface = gr.Interface(
141
  fn=gradio_generate,
142
- inputs=[input_text, denoising_steps, guidance_scale],
143
- outputs=[output_audio_1, output_audio_2, output_audio_3],
144
- title="Tango: Text-to-Audio Generation using Instruction-tuned LLM and Latent Diffusion Model",
145
  description=description_text,
146
  allow_flagging=False,
147
  examples=[
 
8
  from models import AudioDiffusion, DDPMScheduler
9
  from audioldm.audio.stft import TacotronSTFT
10
  from audioldm.variational_autoencoder import AutoencoderKL
11
+ from pydub import AudioSegment
12
  from gradio import Markdown
13
 
14
+ import torch
15
+ #from diffusers.models.autoencoder_kl import AutoencoderKL
16
+ from diffusers.models.unet_2d_condition import UNet2DConditionModel
17
+ from diffusers import DiffusionPipeline, AudioPipelineOutput
18
+ from transformers import CLIPTextModel, T5EncoderModel, AutoModel, T5Tokenizer, T5TokenizerFast
19
+ from typing import Union
20
+ from diffusers.utils.torch_utils import randn_tensor
21
+ from tqdm import tqdm
22
+
23
+
24
+
25
+
26
+
27
+ class Tango2Pipeline(DiffusionPipeline):
28
+
29
+
30
+ def __init__(
31
+ self,
32
+ vae: AutoencoderKL,
33
+ text_encoder: T5EncoderModel,
34
+ tokenizer: Union[T5Tokenizer, T5TokenizerFast],
35
+ unet: UNet2DConditionModel,
36
+ scheduler: DDPMScheduler
37
+ ):
38
+
39
+ super().__init__()
40
+
41
+ self.register_modules(vae=vae,
42
+ text_encoder=text_encoder,
43
+ tokenizer=tokenizer,
44
+ unet=unet,
45
+ scheduler=scheduler
46
+ )
47
+
48
+
49
+ def _encode_prompt(self, prompt):
50
+ device = self.text_encoder.device
51
+
52
+ batch = self.tokenizer(
53
+ prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
54
+ )
55
+ input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
56
+
57
+
58
+ encoder_hidden_states = self.text_encoder(
59
+ input_ids=input_ids, attention_mask=attention_mask
60
+ )[0]
61
+
62
+ boolean_encoder_mask = (attention_mask == 1).to(device)
63
+
64
+ return encoder_hidden_states, boolean_encoder_mask
65
+
66
+ def _encode_text_classifier_free(self, prompt, num_samples_per_prompt):
67
+ device = self.text_encoder.device
68
+ batch = self.tokenizer(
69
+ prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
70
+ )
71
+ input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
72
+
73
+ with torch.no_grad():
74
+ prompt_embeds = self.text_encoder(
75
+ input_ids=input_ids, attention_mask=attention_mask
76
+ )[0]
77
+
78
+ prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
79
+ attention_mask = attention_mask.repeat_interleave(num_samples_per_prompt, 0)
80
+
81
+ # get unconditional embeddings for classifier free guidance
82
+ uncond_tokens = [""] * len(prompt)
83
+
84
+ max_length = prompt_embeds.shape[1]
85
+ uncond_batch = self.tokenizer(
86
+ uncond_tokens, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt",
87
+ )
88
+ uncond_input_ids = uncond_batch.input_ids.to(device)
89
+ uncond_attention_mask = uncond_batch.attention_mask.to(device)
90
+
91
+ with torch.no_grad():
92
+ negative_prompt_embeds = self.text_encoder(
93
+ input_ids=uncond_input_ids, attention_mask=uncond_attention_mask
94
+ )[0]
95
+
96
+ negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
97
+ uncond_attention_mask = uncond_attention_mask.repeat_interleave(num_samples_per_prompt, 0)
98
+
99
+ # For classifier free guidance, we need to do two forward passes.
100
+ # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
101
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
102
+ prompt_mask = torch.cat([uncond_attention_mask, attention_mask])
103
+ boolean_prompt_mask = (prompt_mask == 1).to(device)
104
+
105
+ return prompt_embeds, boolean_prompt_mask
106
+
107
+ def prepare_latents(self, batch_size, inference_scheduler, num_channels_latents, dtype, device):
108
+ shape = (batch_size, num_channels_latents, 256, 16)
109
+ latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
110
+ # scale the initial noise by the standard deviation required by the scheduler
111
+ latents = latents * inference_scheduler.init_noise_sigma
112
+ return latents
113
+
114
+ @torch.no_grad()
115
+ def inference(self, prompt, inference_scheduler, num_steps=20, guidance_scale=3, num_samples_per_prompt=1,
116
+ disable_progress=True):
117
+ device = self.text_encoder.device
118
+ classifier_free_guidance = guidance_scale > 1.0
119
+ batch_size = len(prompt) * num_samples_per_prompt
120
+
121
+ if classifier_free_guidance:
122
+ prompt_embeds, boolean_prompt_mask = self._encode_text_classifier_free(prompt, num_samples_per_prompt)
123
+ else:
124
+ prompt_embeds, boolean_prompt_mask = self._encode_prompt(prompt)
125
+ prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
126
+ boolean_prompt_mask = boolean_prompt_mask.repeat_interleave(num_samples_per_prompt, 0)
127
+
128
+ inference_scheduler.set_timesteps(num_steps, device=device)
129
+ timesteps = inference_scheduler.timesteps
130
+
131
+ num_channels_latents = self.unet.config.in_channels
132
+ latents = self.prepare_latents(batch_size, inference_scheduler, num_channels_latents, prompt_embeds.dtype, device)
133
+
134
+ num_warmup_steps = len(timesteps) - num_steps * inference_scheduler.order
135
+ progress_bar = tqdm(range(num_steps), disable=disable_progress)
136
+
137
+ for i, t in enumerate(timesteps):
138
+ # expand the latents if we are doing classifier free guidance
139
+ latent_model_input = torch.cat([latents] * 2) if classifier_free_guidance else latents
140
+ latent_model_input = inference_scheduler.scale_model_input(latent_model_input, t)
141
+
142
+ noise_pred = self.unet(
143
+ latent_model_input, t, encoder_hidden_states=prompt_embeds,
144
+ encoder_attention_mask=boolean_prompt_mask
145
+ ).sample
146
+
147
+ # perform guidance
148
+ if classifier_free_guidance:
149
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
150
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
151
+
152
+ # compute the previous noisy sample x_t -> x_t-1
153
+ latents = inference_scheduler.step(noise_pred, t, latents).prev_sample
154
+
155
+ # call the callback, if provided
156
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % inference_scheduler.order == 0):
157
+ progress_bar.update(1)
158
+
159
+ return latents
160
+
161
+ @torch.no_grad()
162
+ def __call__(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
163
+ """ Generate audio for a single prompt string. """
164
+ with torch.no_grad():
165
+ latents = self.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
166
+ mel = self.vae.decode_first_stage(latents)
167
+ wave = self.vae.decode_to_waveform(mel)
168
+
169
+
170
+ return AudioPipelineOutput(audios=wave)
171
+
172
 
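The Tango2Pipeline defined above can also be driven directly, outside the Gradio handler further down. A minimal sketch, assuming the Tango wrapper defined later in this file and the wavio import used elsewhere in app.py; the prompt and output filename are illustrative:

    # Hypothetical direct use of Tango2Pipeline; mirrors the wiring done later in app.py.
    tango = Tango(name="declare-lab/tango2", device="cuda")
    pipe = Tango2Pipeline(
        vae=tango.vae,
        text_encoder=tango.model.text_encoder,
        tokenizer=tango.model.tokenizer,
        unet=tango.model.unet,
        scheduler=tango.scheduler,
    )
    out = pipe("A dog barking in the distance", steps=100, guidance=3)  # returns AudioPipelineOutput
    wavio.write("sample.wav", out.audios[0], rate=16000, sampwidth=2)   # 16 kHz waveform, as in gradio_generate below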
173
  # Automatic device detection
174
  if torch.cuda.is_available():
 
179
  device_selection = "cpu"
180
 
181
  class Tango:
182
+ def __init__(self, name="declare-lab/tango2", device=device_selection):
183
 
184
  path = snapshot_download(repo_id=name)
185
 
 
212
  for i in range(0, len(lst), n):
213
  yield lst[i:i + n]
214
 
215
+ def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
216
  """ Generate audio for a single prompt string. """
217
  with torch.no_grad():
218
  latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
219
  mel = self.vae.decode_first_stage(latents)
220
  wave = self.vae.decode_to_waveform(mel)
221
+ return wave[0]
222
 
223
  def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
224
  """ Generate audio for a list of prompt strings. """
 
241
  tango.vae.to(device_type)
242
  tango.stft.to(device_type)
243
  tango.model.to(device_type)
244
 
245
+ pipe = Tango2Pipeline(vae=tango.vae,
246
+ text_encoder=tango.model.text_encoder,
247
+ tokenizer=tango.model.tokenizer,
248
+ unet=tango.model.unet,
249
+ scheduler=tango.scheduler
250
+ )
251
 
252
+
253
+ @spaces.GPU(duration=60)
254
+ def gradio_generate(prompt, output_format, steps, guidance):
255
+ output_wave = pipe(prompt, steps, guidance) ## The pipeline automatically uses flash attention with torch >= 2.0
256
+ #output_wave = tango.generate(prompt, steps, guidance)
257
+ # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
258
+ output_wave = output_wave.audios[0]
259
+ output_filename = "temp.wav"
260
+ wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
261
 
262
+ if (output_format == "mp3"):
263
+ AudioSegment.from_wav("temp.wav").export("temp.mp3", format = "mp3")
264
+ output_filename = "temp.mp3"
265
 
266
+ return output_filename
267
 
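For a quick sanity check outside the Gradio UI, the handler above can be called directly as a plain function; a hypothetical invocation (the argument order matches the inputs wired up in the interface below, and the returned path is one of the hard-coded temp files):

    # Hypothetical smoke test of gradio_generate; writes temp.wav, converts to temp.mp3, returns its path.
    path = gradio_generate("Water lapping against the hull of a boat", "mp3", 100, 3)
    print(path)  # "temp.mp3"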
268
  # description_text = """
269
  # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
 
285
  # <p/>
286
  # """
287
  description_text = """
288
+ <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
289
+ Generate audio using Tango2 by providing a text prompt. Tango2 builds on Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>.
290
+ <br/><br/> This is the Tango2 demo for text-to-audio generation: <a href="https://arxiv.org/abs/2404.09956">read our paper</a>.
291
  <p/>
292
  """
293
  # Gradio input and output components
294
  input_text = gr.Textbox(lines=2, label="Prompt")
295
+ output_format = gr.Radio(label="Output format", info="The file format you can download", choices=["mp3", "wav"], value="wav")
296
+ output_audio = gr.Audio(label="Generated Audio", type="filepath")
297
  denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
298
  guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
299
 
300
  # Gradio interface
301
  gr_interface = gr.Interface(
302
  fn=gradio_generate,
303
+ inputs=[input_text, output_format, denoising_steps, guidance_scale],
304
+ outputs=[output_audio],
305
+ title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
306
  description=description_text,
307
  allow_flagging=False,
308
  examples=[