# Hugging Face Space application (page status when captured: Running)
import os
import random
from glob import glob

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
from diffusers import StableAudioPipeline
from huggingface_hub import hf_hub_download
from translatepy import Translator
# Single module-level translator instance; `main` uses it to translate the
# incoming prompt to English before generation.
translator = Translator()
# Constants

# Hugging Face model id loaded by the pipeline.
model = "stabilityai/stable-audio-open-1.0"

# Largest seed value offered to the user (max signed 32-bit integer).
MAX_SEED = np.iinfo(np.int32).max

# Page styling: constrain the app width and hide the Gradio footer.
CSS = """
.gradio-container {
    max-width: 690px !important;
}
footer {
    visibility: hidden;
}
"""

# Runs on page load and forces the dark theme. The query string is edited via
# URL.searchParams rather than by appending text, so existing query
# parameters and hash fragments are preserved and no global variable leaks.
JS = """function () {
    const url = new URL(window.location.href);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.replace(url.href);
    }
}"""

# Markdown/HTML blurb rendered at the top of the interface.
DESCRIPTION = """
<center>
Stable Audio Open 1.0 generates variable-length (up to 47s) stereo audio at 44.1kHz from text prompts. \
It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, \
a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder.
</center>
"""
# Load the pipeline once at import time and move it to the GPU.
# `pipe` is always bound: it stays None when no CUDA device is available, so
# later code can test for it instead of hitting a NameError.
pipe = None
if torch.cuda.is_available():
    pipe = StableAudioPipeline.from_pretrained(
        model,
        torch_dtype=torch.float16,
    )
    pipe = pipe.to("cuda")
# Function
def main(
        prompt,
        negative="low quality",
        second: float = 10.0,
        seed: int = -1):
    """Generate an audio clip from a text prompt and save it as a WAV file.

    Args:
        prompt: Text description of the sound. It is translated to English
            before being passed to the pipeline.
        negative: Negative prompt passed to the pipeline.
        second: Passed to the pipeline as ``audio_end_in_s``.
        seed: Seed for the random generator. ``-1`` or ``None`` selects a
            random seed in ``[0, MAX_SEED]``.

    Returns:
        A ``(audio_path, seed)`` tuple: the path of the written WAV file and
        the seed that was actually used.

    Raises:
        gr.Error: If no pipeline is loaded (``pipe`` missing or ``None``).
    """
    # `pipe` is only created when CUDA was available at import time; report
    # that clearly instead of failing with a NameError.
    pipeline = globals().get("pipe")
    if pipeline is None:
        raise gr.Error("Stable Audio pipeline is not loaded (no CUDA device available).")

    if seed is None or seed == -1:
        seed = random.randint(0, MAX_SEED)
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)

    prompt = str(translator.translate(prompt, 'English'))
    print(f'prompt:{prompt}')

    # NOTE(review): three waveforms are generated but only the first one is
    # saved below; kept as in the original.
    audio = pipeline(
        prompt,
        negative_prompt=negative,
        audio_end_in_s=second,
        num_inference_steps=200,
        num_waveforms_per_prompt=3,
        generator=generator,
    ).audios

    os.makedirs("outputs", exist_ok=True)
    # Count existing .wav files (the format written here) so each run gets a
    # fresh file name instead of overwriting 000000.wav.
    base_count = len(glob(os.path.join("outputs", "*.wav")))
    audio_path = os.path.join("outputs", f"{base_count:06d}.wav")
    sf.write(audio_path, audio[0].T.float().cpu().numpy(), pipeline.vae.sampling_rate)
    return audio_path, seed
# Gradio Interface
with gr.Blocks(theme='soft', css=CSS, js=JS, title="Stable Audio Open") as iface:
    with gr.Accordion(""):
        gr.Markdown(DESCRIPTION)
    # Audio player fed with the file path returned by `main`
    output = gr.Audio(label="Podcast", type="filepath", interactive=False, autoplay=True, elem_classes="audio")
    prompt = gr.Textbox(label="Prompt", placeholder="1000 BPM percussive sound of water drops")
    negative = gr.Textbox(label="Negative prompt", placeholder="Low quality")
    with gr.Row():
        # No trailing commas: a trailing comma would bind a one-element tuple
        # instead of the component, breaking `inputs=`/`outputs=` below.
        # Upper bound follows the "up to 47s" limit stated in DESCRIPTION.
        second = gr.Slider(5.0, 47.0, value=10.0, label="Second", step=0.1)
        # Minimum is -1 so the "random seed" sentinel checked by `main` is selectable.
        seed = gr.Slider(-1, MAX_SEED, value=0, label="Seed", step=1, info="-1 picks a random seed")
    with gr.Row():
        # NOTE(review): button emoji restored from mis-encoded text; confirm the intended icons.
        submit_btn = gr.Button("🚀 Send")  # Create a submit button
        clear_btn = gr.ClearButton([prompt, seed, output], value="🗑️ Clear")  # Create a clear button
    # Set up the event listeners: `main` returns (audio_path, seed), so the
    # seed slider is updated with the seed that was actually used.
    submit_btn.click(main, inputs=[prompt, negative, second, seed], outputs=[output, seed])

iface.queue().launch(show_api=False)  # Launch the Gradio interface