# vocos-bark / app.py: Bark text-to-speech demo with a Vocos vocoder (Gradio Space).
import os

import gradio as gr
import torch
from transformers import AutoProcessor, set_seed
from vocos import Vocos

from vocos_bark import BarkModel

# Gradio stores temporary (e.g. audio) files in this directory.
os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"

# Fix the random seed so Bark's sampling is reproducible across runs.
set_seed(0)

def _grab_best_device(use_gpu=True):
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return device

device = _grab_best_device()
HUB_PATH = "suno/bark"
processor = AutoProcessor.from_pretrained(HUB_PATH)
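# The processor bundles Bark's named voice presets; filter to the "speaker" entries
# that can be offered in the dropdown below.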
speaker_embeddings = sorted([key for key in processor.speaker_embeddings.keys() if "speaker" in key])
SAMPLE_RATE = 24_000
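# Vocos is a vocoder that reconstructs 24 kHz audio directly from EnCodec codes,
# i.e. from the same fine tokens that Bark generates.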
vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2").to(device)
# Load the Bark model; on GPU, also convert it to BetterTransformer
# (via the optimum package) for faster attention kernels.
if device == "cpu":
    bark = BarkModel.from_pretrained(HUB_PATH)
else:
    bark = BarkModel.from_pretrained(HUB_PATH).to(device)
    bark = bark.to_bettertransformer()
# Inference: generate Bark's fine tokens once, then decode them with both
# the built-in EnCodec decoder and the Vocos vocoder.
def generate_audio(text, voice_preset=None, lag=0):
    # Fall back to no preset if an unknown (or empty) preset is passed in.
    if voice_preset not in speaker_embeddings:
        voice_preset = None

    sentences = [
        text,
    ]
    inputs = processor(sentences, voice_preset=voice_preset).to(device)

    # Generate the fine acoustic tokens (EnCodec codes) for the input text.
    fine_output = bark.generate(
        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
    )
    print("Fine tokens generated")

    with torch.no_grad():
        # Decode the same tokens twice so the UI can compare the two waveforms.
        encodec_waveform = bark.codec_decode(fine_output)

        features = vocos.codes_to_features(fine_output.transpose(0, 1))
        vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))

    return (SAMPLE_RATE, encodec_waveform.cpu().squeeze().numpy()), (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy())
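
# Quick sanity check outside the UI (the preset name below is a typical Bark preset;
# any entry of `speaker_embeddings` works), e.g.:
#   (_, enc_wav), (_, vocos_wav) = generate_audio("Hello!", voice_preset="v2/en_speaker_6")
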
# Gradio Blocks demo
with gr.Blocks() as demo_blocks:
    gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
    gr.HTML("""<h3 style="text-align:center;">📢Vocos-enhanced TTS 🦾! </h3>""")
    with gr.Group():
        with gr.Row():
            inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
            dd = gr.Dropdown(
                speaker_embeddings,
                value=None,
                label="Available voice presets",
                info="Defaults to no speaker embeddings!",
            )
        with gr.Row():
            btn = gr.Button("Bark with Vocos TTS")
        with gr.Row():
            out_audio_encodec = gr.Audio(type="numpy", autoplay=False, label="original output", show_label=True)
            out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="vocos enhanced output", show_label=True)
    btn.click(generate_audio, [inp_text, dd], [out_audio_encodec, out_audio_vocos])

demo_blocks.queue().launch(debug=True)