soujanyaporia committed on
Commit 39533a5 · verified · 1 Parent(s): fe39952

Update app.py

Files changed (1)
  1. app.py +189 -28
app.py CHANGED
@@ -8,8 +8,167 @@ from huggingface_hub import snapshot_download
8
  from models import AudioDiffusion, DDPMScheduler
9
  from audioldm.audio.stft import TacotronSTFT
10
  from audioldm.variational_autoencoder import AutoencoderKL
11
  from gradio import Markdown
12
 
13
 
14
  # Automatic device detection
15
  if torch.cuda.is_available():
@@ -20,7 +179,7 @@ else:
20
  device_selection = "cpu"
21
 
22
  class Tango:
23
- def __init__(self, name="declare-lab/tango-full-ft-audiocaps", device=device_selection):
24
 
25
  path = snapshot_download(repo_id=name)
26
 
@@ -53,13 +212,13 @@ class Tango:
53
  for i in range(0, len(lst), n):
54
  yield lst[i:i + n]
55
 
56
- def generate(self, prompt, steps=100, guidance=3, samples=3, disable_progress=True):
57
  """ Generate audio for a single prompt string. """
58
  with torch.no_grad():
59
  latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
60
  mel = self.vae.decode_first_stage(latents)
61
  wave = self.vae.decode_to_waveform(mel)
62
- return wave
63
 
64
  def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
65
  """ Generate audio for a list of prompt strings. """
@@ -82,22 +241,29 @@ tango = Tango(device="cpu")
82
  tango.vae.to(device_type)
83
  tango.stft.to(device_type)
84
  tango.model.to(device_type)
85
-
86
- @spaces.GPU(duration=120)
87
- def gradio_generate(prompt, steps, guidance):
88
- output_wave = tango.generate(prompt, steps, guidance)
89
- # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
90
 
91
- output_filename_1 = "tmp1_.wav"
92
- wavio.write(output_filename_1, output_wave[0], rate=16000, sampwidth=2)
93
 
94
- output_filename_2 = "tmp2_.wav"
95
- wavio.write(output_filename_2, output_wave[1], rate=16000, sampwidth=2)
96
 
97
- output_filename_3 = "tmp3_.wav"
98
- wavio.write(output_filename_3, output_wave[2], rate=16000, sampwidth=2)
99
 
100
- return [output_filename_1, output_filename_2, output_filename_3]
101
 
102
  # description_text = """
103
  # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
@@ -119,29 +285,24 @@ def gradio_generate(prompt, steps, guidance):
119
  # <p/>
120
  # """
121
  description_text = """
122
- <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
123
- Generate audio using TANGO by providing a text prompt.
124
- <br/>
125
- <br/>
126
- As TANGO consists of an instruction-tuned LLM, it is able to process complex sound descriptions allowing us to provide more detailed instructions to improve the generation quality.
127
- For example, ``A boat is moving on the sea'' vs ``The sound of the water lapping against the hull of the boat or splashing as you move through the waves''. The latter is obtained by prompting ChatGPT to explain the sound generated when a boat moves on the sea.
128
- Using this ChatGPT-generated description of the sound, TANGO provides superior results.
129
  <p/>
130
  """
131
  # Gradio input and output components
132
  input_text = gr.Textbox(lines=2, label="Prompt")
133
- output_audio_1 = gr.Audio(label="Generated Audio #1/3", type="filepath")
134
- output_audio_2 = gr.Audio(label="Generated Audio #2/3", type="filepath")
135
- output_audio_3 = gr.Audio(label="Generated Audio #3/3", type="filepath")
136
  denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
137
  guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
138
 
139
  # Gradio interface
140
  gr_interface = gr.Interface(
141
  fn=gradio_generate,
142
- inputs=[input_text, denoising_steps, guidance_scale],
143
- outputs=[output_audio_1, output_audio_2, output_audio_3],
144
- title="Tango: Text-to-Audio Generation using Instruction-tuned LLM and Latent Diffusion Model",
145
  description=description_text,
146
  allow_flagging=False,
147
  examples=[
 
8
  from models import AudioDiffusion, DDPMScheduler
9
  from audioldm.audio.stft import TacotronSTFT
10
  from audioldm.variational_autoencoder import AutoencoderKL
11
+ from pydub import AudioSegment
12
  from gradio import Markdown
13
 
14
+ import torch
15
+ #from diffusers.models.autoencoder_kl import AutoencoderKL
16
+ from diffusers.models.unet_2d_condition import UNet2DConditionModel
17
+ from diffusers import DiffusionPipeline, AudioPipelineOutput
18
+ from transformers import CLIPTextModel, T5EncoderModel, AutoModel, T5Tokenizer, T5TokenizerFast
19
+ from typing import Union
20
+ from diffusers.utils.torch_utils import randn_tensor
21
+ from tqdm import tqdm
22
+
23
+
24
+
25
+
26
+
27
+ class Tango2Pipeline(DiffusionPipeline):
28
+
29
+
30
+ def __init__(
31
+ self,
32
+ vae: AutoencoderKL,
33
+ text_encoder: T5EncoderModel,
34
+ tokenizer: Union[T5Tokenizer, T5TokenizerFast],
35
+ unet: UNet2DConditionModel,
36
+ scheduler: DDPMScheduler
37
+ ):
38
+
39
+ super().__init__()
40
+
41
+ self.register_modules(vae=vae,
42
+ text_encoder=text_encoder,
43
+ tokenizer=tokenizer,
44
+ unet=unet,
45
+ scheduler=scheduler
46
+ )
47
+
48
+
49
+ def _encode_prompt(self, prompt):
50
+ device = self.text_encoder.device
51
+
52
+ batch = self.tokenizer(
53
+ prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
54
+ )
55
+ input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
56
+
57
+
58
+ encoder_hidden_states = self.text_encoder(
59
+ input_ids=input_ids, attention_mask=attention_mask
60
+ )[0]
61
+
62
+ boolean_encoder_mask = (attention_mask == 1).to(device)
63
+
64
+ return encoder_hidden_states, boolean_encoder_mask
65
+
66
+ def _encode_text_classifier_free(self, prompt, num_samples_per_prompt):
67
+ device = self.text_encoder.device
68
+ batch = self.tokenizer(
69
+ prompt, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
70
+ )
71
+ input_ids, attention_mask = batch.input_ids.to(device), batch.attention_mask.to(device)
72
+
73
+ with torch.no_grad():
74
+ prompt_embeds = self.text_encoder(
75
+ input_ids=input_ids, attention_mask=attention_mask
76
+ )[0]
77
+
78
+ prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
79
+ attention_mask = attention_mask.repeat_interleave(num_samples_per_prompt, 0)
80
+
81
+ # get unconditional embeddings for classifier free guidance
82
+ uncond_tokens = [""] * len(prompt)
83
+
84
+ max_length = prompt_embeds.shape[1]
85
+ uncond_batch = self.tokenizer(
86
+ uncond_tokens, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt",
87
+ )
88
+ uncond_input_ids = uncond_batch.input_ids.to(device)
89
+ uncond_attention_mask = uncond_batch.attention_mask.to(device)
90
+
91
+ with torch.no_grad():
92
+ negative_prompt_embeds = self.text_encoder(
93
+ input_ids=uncond_input_ids, attention_mask=uncond_attention_mask
94
+ )[0]
95
+
96
+ negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
97
+ uncond_attention_mask = uncond_attention_mask.repeat_interleave(num_samples_per_prompt, 0)
98
+
99
+ # For classifier free guidance, we need to do two forward passes.
100
+ # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
101
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
102
+ prompt_mask = torch.cat([uncond_attention_mask, attention_mask])
103
+ boolean_prompt_mask = (prompt_mask == 1).to(device)
104
+
105
+ return prompt_embeds, boolean_prompt_mask
106
+
107
+ def prepare_latents(self, batch_size, inference_scheduler, num_channels_latents, dtype, device):
108
+ shape = (batch_size, num_channels_latents, 256, 16)
109
+ latents = randn_tensor(shape, generator=None, device=device, dtype=dtype)
110
+ # scale the initial noise by the standard deviation required by the scheduler
111
+ latents = latents * inference_scheduler.init_noise_sigma
112
+ return latents
113
+
114
+ @torch.no_grad()
115
+ def inference(self, prompt, inference_scheduler, num_steps=20, guidance_scale=3, num_samples_per_prompt=1,
116
+ disable_progress=True):
117
+ device = self.text_encoder.device
118
+ classifier_free_guidance = guidance_scale > 1.0
119
+ batch_size = len(prompt) * num_samples_per_prompt
120
+
121
+ if classifier_free_guidance:
122
+ prompt_embeds, boolean_prompt_mask = self._encode_text_classifier_free(prompt, num_samples_per_prompt)
123
+ else:
124
+ prompt_embeds, boolean_prompt_mask = self._encode_prompt(prompt)
125
+ prompt_embeds = prompt_embeds.repeat_interleave(num_samples_per_prompt, 0)
126
+ boolean_prompt_mask = boolean_prompt_mask.repeat_interleave(num_samples_per_prompt, 0)
127
+
128
+ inference_scheduler.set_timesteps(num_steps, device=device)
129
+ timesteps = inference_scheduler.timesteps
130
+
131
+ num_channels_latents = self.unet.config.in_channels
132
+ latents = self.prepare_latents(batch_size, inference_scheduler, num_channels_latents, prompt_embeds.dtype, device)
133
+
134
+ num_warmup_steps = len(timesteps) - num_steps * inference_scheduler.order
135
+ progress_bar = tqdm(range(num_steps), disable=disable_progress)
136
+
137
+ for i, t in enumerate(timesteps):
138
+ # expand the latents if we are doing classifier free guidance
139
+ latent_model_input = torch.cat([latents] * 2) if classifier_free_guidance else latents
140
+ latent_model_input = inference_scheduler.scale_model_input(latent_model_input, t)
141
+
142
+ noise_pred = self.unet(
143
+ latent_model_input, t, encoder_hidden_states=prompt_embeds,
144
+ encoder_attention_mask=boolean_prompt_mask
145
+ ).sample
146
+
147
+ # perform guidance
148
+ if classifier_free_guidance:
149
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
150
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
151
+
152
+ # compute the previous noisy sample x_t -> x_t-1
153
+ latents = inference_scheduler.step(noise_pred, t, latents).prev_sample
154
+
155
+ # call the callback, if provided
156
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % inference_scheduler.order == 0):
157
+ progress_bar.update(1)
158
+
159
+ return latents
160
+
161
+ @torch.no_grad()
162
+ def __call__(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
163
+ """ Generate audio for a single prompt string. """
164
+ with torch.no_grad():
165
+ latents = self.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
166
+ mel = self.vae.decode_first_stage(latents)
167
+ wave = self.vae.decode_to_waveform(mel)
168
+
169
+
170
+ return AudioPipelineOutput(audios=wave)
171
+
172
 
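The Tango2Pipeline defined above can also be driven directly, outside the Gradio handler further down. A minimal sketch, assuming the Tango wrapper defined later in this file and the wavio import used elsewhere in app.py; the prompt and output filename are illustrative:

    # Hypothetical direct use of Tango2Pipeline; mirrors the wiring done later in app.py.
    tango = Tango(name="declare-lab/tango2", device="cuda")
    pipe = Tango2Pipeline(
        vae=tango.vae,
        text_encoder=tango.model.text_encoder,
        tokenizer=tango.model.tokenizer,
        unet=tango.model.unet,
        scheduler=tango.scheduler,
    )
    out = pipe("A dog barking in the distance", steps=100, guidance=3)  # returns AudioPipelineOutput
    wavio.write("sample.wav", out.audios[0], rate=16000, sampwidth=2)   # 16 kHz waveform, as in gradio_generate below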
173
  # Automatic device detection
174
  if torch.cuda.is_available():
 
179
  device_selection = "cpu"
180
 
181
  class Tango:
182
+ def __init__(self, name="declare-lab/tango2", device=device_selection):
183
 
184
  path = snapshot_download(repo_id=name)
185
 
 
212
  for i in range(0, len(lst), n):
213
  yield lst[i:i + n]
214
 
215
+ def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
216
  """ Generate audio for a single prompt string. """
217
  with torch.no_grad():
218
  latents = self.model.inference([prompt], self.scheduler, steps, guidance, samples, disable_progress=disable_progress)
219
  mel = self.vae.decode_first_stage(latents)
220
  wave = self.vae.decode_to_waveform(mel)
221
+ return wave[0]
222
 
223
  def generate_for_batch(self, prompts, steps=200, guidance=3, samples=1, batch_size=8, disable_progress=True):
224
  """ Generate audio for a list of prompt strings. """
 
241
  tango.vae.to(device_type)
242
  tango.stft.to(device_type)
243
  tango.model.to(device_type)
244
 
245
+ pipe = Tango2Pipeline(vae=tango.vae,
246
+ text_encoder=tango.model.text_encoder,
247
+ tokenizer=tango.model.tokenizer,
248
+ unet=tango.model.unet,
249
+ scheduler=tango.scheduler
250
+ )
251
 
252
+
253
+ @spaces.GPU(duration=60)
254
+ def gradio_generate(prompt, output_format, steps, guidance):
255
+ output_wave = pipe(prompt, steps, guidance) ## The pipeline automatically uses flash attention with torch >= 2.0
256
+ #output_wave = tango.generate(prompt, steps, guidance)
257
+ # output_filename = f"{prompt.replace(' ', '_')}_{steps}_{guidance}"[:250] + ".wav"
258
+ output_wave = output_wave.audios[0]
259
+ output_filename = "temp.wav"
260
+ wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
261
 
262
+ if (output_format == "mp3"):
263
+ AudioSegment.from_wav("temp.wav").export("temp.mp3", format = "mp3")
264
+ output_filename = "temp.mp3"
265
 
266
+ return output_filename
267
 
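For a quick sanity check outside the Gradio UI, the handler above can be called directly as a plain function; a hypothetical invocation (the argument order matches the inputs wired up in the interface below, and the returned path is one of the hard-coded temp files):

    # Hypothetical smoke test of gradio_generate; writes temp.wav, converts to temp.mp3, returns its path.
    path = gradio_generate("Water lapping against the hull of a boat", "mp3", 100, 3)
    print(path)  # "temp.mp3"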
268
  # description_text = """
269
  # <p><a href="https://huggingface.co/spaces/declare-lab/tango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
 
285
  # <p/>
286
  # """
287
  description_text = """
288
+ <p><a href="https://huggingface.co/spaces/declare-lab/tango2/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
289
+ Generate audio using Tango2 by providing a text prompt. Tango2 builds on Tango and was trained on <a href="https://huggingface.co/datasets/declare-lab/audio-alpaca">Audio-alpaca</a>.
290
+ <br/><br/> This is the Tango2 demo for text-to-audio generation: <a href="https://arxiv.org/abs/2404.09956">read our paper</a>.
291
  <p/>
292
  """
293
  # Gradio input and output components
294
  input_text = gr.Textbox(lines=2, label="Prompt")
295
+ output_format = gr.Radio(label="Output format", info="The file format you can download", choices=["mp3", "wav"], value="wav")
296
+ output_audio = gr.Audio(label="Generated Audio", type="filepath")
297
  denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
298
  guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)
299
 
300
  # Gradio interface
301
  gr_interface = gr.Interface(
302
  fn=gradio_generate,
303
+ inputs=[input_text, output_format, denoising_steps, guidance_scale],
304
+ outputs=[output_audio],
305
+ title="Tango 2: Aligning Diffusion-based Text-to-Audio Generations through Direct Preference Optimization",
306
  description=description_text,
307
  allow_flagging=False,
308
  examples=[