import os
import random
from glob import glob

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
from diffusers import StableAudioPipeline
from huggingface_hub import hf_hub_download
from translatepy import Translator
# Shared translatepy client; generate_image calls translator.translate(prompt, 'English')
# on the user prompt before handing it to the pipeline.
translator = Translator()

# Constants

# Hugging Face model id passed to StableAudioPipeline.from_pretrained below.
model = "stabilityai/stable-audio-open-1.0"

# NOTE: the seed upper bound below is disabled (commented out), so MAX_SEED is not defined.
# MAX_SEED = np.iinfo(np.int32).max

# Custom stylesheet given to gr.Blocks(css=...): limits the container width to
# 690px and hides the page footer.
CSS = """
.gradio-container {
max-width: 690px !important;
}
footer {
visibility: hidden;
}
"""

# JavaScript given to gr.Blocks(js=...): if the current URL does not already end
# with '?__theme=dark', it replaces the location with the URL plus that suffix.
JS = """function () {
gradioURL = window.location.href
if (!gradioURL.endsWith('?__theme=dark')) {
window.location.replace(gradioURL + '?__theme=dark');
}
}"""

# Markdown text rendered via gr.Markdown in the UI. The trailing backslashes are
# line continuations inside the string literal, so the first two sentences are
# joined to the following line without a newline.
DESCRIPTION = """
Stable Audio Open 1.0 generates variable-length (up to 47s) stereo audio at 44.1kHz from text prompts. \
It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, \
a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder.
"""
# Load the pipeline once at import time so every request reuses the same weights.
# Previously `pipe` was only bound when CUDA was available, so a CPU-only host
# started fine and then failed with NameError on the first request. Now the
# pipeline is always created: float16 on CUDA (unchanged), float32 on CPU
# because half precision is poorly supported there.
_device = "cuda" if torch.cuda.is_available() else "cpu"
_dtype = torch.float16 if _device == "cuda" else torch.float32
pipe = StableAudioPipeline.from_pretrained(
    model,
    low_cpu_mem_usage=True,
    torch_dtype=_dtype,
).to(_device)
# Function
@spaces.GPU(duration=120)
def generate_image(
    prompt,
    negative="low quality",
    second: float = 10.0):
    """Generate an audio clip from a text prompt and save it as a WAV file.

    Despite the historical name, this produces audio, not an image; the name
    is kept so existing callers keep working.

    Args:
        prompt: Text describing the desired sound. It is translated to English
            with the module-level ``translator`` before being used.
        negative: Negative prompt forwarded to the pipeline as
            ``negative_prompt``.
        second: Forwarded to the pipeline as ``audio_end_in_s``.

    Returns:
        Path of the WAV file written under ``outputs/``.
    """
    # NOTE: no generator/seed is passed to the pipeline, so repeated calls with
    # the same prompt are not expected to be reproducible.
    prompt = str(translator.translate(prompt, 'English'))
    print(f'prompt:{prompt}')

    audio = pipe(
        prompt,
        negative_prompt=negative,
        audio_end_in_s=second,
    ).audios

    os.makedirs("outputs", exist_ok=True)
    # Count the files we actually write (*.wav). The old "*.mp4" pattern never
    # matched, so the counter stayed at 0 and 000000.wav was overwritten.
    base_count = len(glob(os.path.join("outputs", "*.wav")))
    audio_path = os.path.join("outputs", f"{base_count:06d}.wav")

    # audio[0] is transposed before writing, as in the original code (soundfile
    # presumably expects frames first — confirm); cast to float32 on the CPU
    # before converting to NumPy.
    # `sampling_rate` was misspelled as `samping_rate`, raising AttributeError.
    sf.write(audio_path, audio[0].T.float().cpu().numpy(), pipe.vae.sampling_rate)
    return audio_path
# Gradio Interface
with gr.Blocks(theme='soft', css=CSS, js=JS, title="Stable Audio Open") as iface:
    with gr.Accordion(""):
        gr.Markdown(DESCRIPTION)
    with gr.Row():
        # Read-only audio player that receives the generated file path.
        output = gr.Audio(label="Podcast", type="filepath", interactive=False, autoplay=True, elem_classes="audio")
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", placeholder="1000 BPM percussive sound of water drops")
    with gr.Row():
        negative = gr.Textbox(label="Negative prompt", placeholder="Low quality")
        # No trailing comma here: it previously turned `second` into a 1-tuple,
        # which is not a valid event input. The maximum is capped at 47s, the
        # model's stated limit; longer requests are rejected by the pipeline.
        second = gr.Slider(5.0, 47.0, value=10.0, label="Second", step=0.1)
    with gr.Row():
        submit_btn = gr.Button("🚀 Send")  # Triggers generation
        clear_btn = gr.ClearButton(output, value="🗑️ Clear")  # Resets the audio output

    # Set up the event listeners. The handler was previously the undefined
    # name `main`; the generation function in this file is `generate_image`.
    submit_btn.click(generate_image, inputs=[prompt, negative, second], outputs=output)

#gr.close_all()

iface.queue().launch(show_api=False)  # Launch the Gradio interface