# Hugging Face Space (page status when captured: Sleeping)
import gradio as gr | |
import torch | |
from transformers import AutoProcessor, BarkModel | |
import scipy | |
from pytube import YouTube | |
import ffmpeg | |
# Inference is pinned to the CPU. The commented-out GPU variant is kept for reference:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
#   model.enable_cpu_offload()
device = "cpu"

# The processor and model are loaded once at import time and shared by every request.
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")
model = model.to(device)

# Choices offered by the "Speaker Voice" dropdown: the strings "1" through "10".
num_list = [str(number) for number in range(1, 11)]
# Choices offered by the "Speaker Language" dropdown.
lang_list = ["en", "de"]
def run_bark(text, n, lang):
    """Generate speech for ``text`` and write it to ``bark_out.wav``.

    Args:
        text: Prompt handed to the module-level ``processor``.
        n: Speaker number; it is converted with ``int`` and reduced by one
            to build the voice preset name.
        lang: Language code used as the prefix of the voice preset name.

    Returns:
        The string ``"bark_out.wav"``, the file the waveform was written to.
    """
    # The UI numbers speakers from 1, the preset names are one lower.
    preset_index = int(n) - 1
    voice_preset = f"v2/{lang}_speaker_{preset_index}"

    model_inputs = processor(
        text=text,
        voice_preset=voice_preset,
        return_tensors="pt",
    )
    generated = model.generate(**model_inputs, do_sample=True)

    # Convert the generated tensor to a NumPy array and drop singleton
    # dimensions before writing it out.
    waveform = generated.cpu().numpy().squeeze()
    out_path = "bark_out.wav"
    scipy.io.wavfile.write(
        out_path,
        rate=model.generation_config.sample_rate,
        data=waveform,
    )
    return out_path
def load_video_yt(vid):
    """Download a YouTube video and an audio-only stream to fixed local files.

    Args:
        vid: URL of the YouTube video.

    Returns:
        A ``(video_path, audio_path)`` tuple holding the values returned by
        the two ``download`` calls. The video is saved under the name
        ``tmp.mp4`` and the audio under the name ``tmp_aud.mp3``.

    Raises:
        ValueError: If ``vid`` is empty or blank, or if the video offers no
            progressive MP4 stream or no audio-only stream.
    """
    url = str(vid).strip() if vid is not None else ""
    if not url:
        # Fail early with a readable message instead of an opaque parsing
        # error from the downloader.
        raise ValueError("A YouTube URL is required.")

    yt = YouTube(url)

    # Highest-resolution progressive MP4 stream.
    video_stream = (
        yt.streams.filter(progressive=True, file_extension="mp4")
        .order_by("resolution")
        .desc()
        .first()
    )
    if video_stream is None:
        raise ValueError(f"No progressive MP4 stream is available for {url!r}.")

    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise ValueError(f"No audio-only stream is available for {url!r}.")

    # Both streams are resolved before anything is downloaded, so a missing
    # stream does not leave a half-finished pair of files behind.
    video_path = video_stream.download(filename="tmp.mp4")
    # NOTE(review): the file is named .mp3 whatever container the selected
    # audio stream actually uses; the name is kept for compatibility - confirm.
    audio_path = audio_stream.download(filename="tmp_aud.mp3")
    return video_path, audio_path
def trim_clip(clip, start_pos=1, duration=10, save_location="trimmed_clip.wav"):
    """Cut a window of ``duration`` seconds out of ``clip``, starting at ``start_pos``.

    Args:
        clip: Either a path to a media file, or a ``(sample_rate, samples)``
            pair where ``samples`` can be sliced along its first axis.
        start_pos: Offset of the window start, in seconds.
        duration: Length of the window, in seconds.
        save_location: Output file written when ``clip`` is a path.

    Returns:
        ``save_location`` when ``clip`` is a path, otherwise a
        ``(sample_rate, trimmed_samples)`` pair.

    Raises:
        ValueError: If ``clip`` is ``None``, ``start_pos`` is negative, or
            ``duration`` is not positive.
    """
    if clip is None:
        raise ValueError("No audio clip was provided to trim.")
    if start_pos < 0 or duration <= 0:
        raise ValueError("start_pos must be >= 0 and duration must be > 0.")

    if isinstance(clip, (tuple, list)):
        # In-memory audio: trim by slicing, no external process required.
        sample_rate, samples = clip
        first = int(start_pos * sample_rate)
        last = first + int(duration * sample_rate)
        return sample_rate, samples[first:last]

    audio_input = ffmpeg.input(clip)
    audio_output = ffmpeg.output(audio_input, save_location, ss=start_pos, t=duration)
    # Overwrite the previous result so trimming again does not fail on an
    # already existing output file.
    audio_output.run(overwrite_output=True)
    # Return the written file rather than the ffmpeg stream object, so the
    # caller gets something it can load.
    return save_location
# Gradio UI: a text prompt, a "Default" tab that picks a built-in Bark voice,
# and an "Upload" tab for supplying or fetching reference audio.
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been flattened - confirm against the intended layout.
with gr.Blocks() as app:
    with gr.Column():
        in_text = gr.Textbox()
        with gr.Tab("Default"):
            with gr.Row():
                speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list, value="1")
                speaker_lang = gr.Dropdown(label="Speaker Language", choices=lang_list, value="en")
            go_btn = gr.Button()
        with gr.Tab("Upload"):
            with gr.Row():
                in_aud_mic = gr.Audio(source='microphone')
                # Delivered to handlers as a file path: trim_clip passes the
                # value to ffmpeg.input, and load_video_yt fills this
                # component with a downloaded file.
                in_aud_file = gr.Audio(source='upload', type="filepath")
            with gr.Row():
                in_aud_yt = gr.Textbox(label="YouTube URL")
                load_yt_btn = gr.Button("Load URL")
            with gr.Row():
                trim_clip_btn = gr.Button("Trim Clip")
                trim_aud = gr.Audio(source='upload')
            yt_vid = gr.Video()
            # Not wired to any handler yet.
            alt_go_btn = gr.Button()
    with gr.Column():
        out_audio = gr.Audio()

    # Event wiring.
    go_btn.click(run_bark, [in_text, speaker_num, speaker_lang], out_audio)
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid, in_aud_file])
    trim_clip_btn.click(trim_clip, in_aud_file, trim_aud)
    # TODO: connect alt_go_btn once generation from uploaded audio exists.

app.launch()