Spaces:
Running
Running
File size: 2,969 Bytes
cfd58c5 2a5abe3 88cd06b cfd58c5 d10d42c 0f61089 cfd58c5 88cd06b cfd58c5 88cd06b b026f7e af8075f 88cd06b b026f7e f4e63b7 99cf07f a2597a8 f3aa612 cfd58c5 249facd f3aa612 cfd58c5 c370655 f3aa612 cfd58c5 d10d42c 0f61089 28ee316 0f61089 d10d42c cfd58c5 107d5cd 3f1c26e d10d42c fad8edf 3f1c26e d10d42c fad8edf 0f61089 591c163 0f61089 fad8edf b026f7e cfd58c5 107d5cd b026f7e fad8edf 28ee316 fad8edf cfd58c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import tempfile

import ffmpeg
import gradio as gr
import scipy
import torch
from pytube import YouTube
from scipy.io import wavfile
from transformers import AutoProcessor, BarkModel
# Choices offered by the speaker dropdowns: voice numbers "1".."10" (as strings)
# and the language codes used to build the voice preset name.
num_list = [str(number) for number in range(1, 11)]
lang_list = ["en", "de"]

# Inference is pinned to the CPU. A GPU / half-precision variant was used
# before and is kept for reference:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
#   model.enable_cpu_offload()
device = "cpu"

# Load the processor and the model from the same "suno/bark-small" checkpoint,
# then place the model on the selected device.
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")
model = model.to(device)
def run_bark(text, n, lang):
    """Generate speech for ``text`` with a Bark voice preset and save it as WAV.

    Args:
        text: Prompt to synthesize. ``None``, an empty value or a
            whitespace-only string produces no audio.
        n: 1-based speaker number (string or int); it is converted to the
            0-based index used in the preset name.
        lang: Language code inserted into the preset name (e.g. ``"en"``).

    Returns:
        Path of the WAV file written for this call, or ``None`` when there is
        nothing to synthesize.
    """
    # Nothing to say: skip the (slow) generation instead of feeding the model
    # an empty prompt.
    if not text or (isinstance(text, str) and not text.strip()):
        return None

    voice_preset = f"v2/{lang}_speaker_{int(n) - 1}"
    inputs = processor(
        text=text,
        voice_preset=voice_preset,
        return_tensors="pt",
    )
    speech_values = model.generate(**inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate

    # Each call gets its own output file; a single fixed name would let
    # overlapping requests overwrite each other's audio. The file is left on
    # disk (delete=False) so the caller can read it after this returns.
    with tempfile.NamedTemporaryFile(
        prefix="bark_out_", suffix=".wav", delete=False
    ) as tmp:
        out_path = tmp.name

    wavfile.write(
        out_path,
        rate=sampling_rate,
        data=speech_values.cpu().numpy().squeeze(),
    )
    return out_path
def load_video_yt(vid):
    """Download a YouTube video and one audio-only stream to fixed local files.

    Args:
        vid: URL of the YouTube video, passed straight to ``YouTube``.

    Returns:
        A ``(video, audio)`` pair holding the values returned by the two
        ``download`` calls (presumably the saved file paths for ``tmp.mp4``
        and ``tmp_aud.mp3`` - confirm against pytube).
    """
    tube = YouTube(vid)

    # Progressive MP4 streams, highest resolution first; take the top one.
    progressive_mp4 = tube.streams.filter(progressive=True, file_extension='mp4')
    best_video = progressive_mp4.order_by('resolution').desc().first()
    video_file = best_video.download(filename="tmp.mp4")

    # First audio-only stream in the listing, saved under a fixed name.
    audio_only = tube.streams.filter(only_audio=True)
    audio_file = audio_only[0].download(filename="tmp_aud.mp3")

    return video_file, audio_file
def trim_clip(clip, start_pos=1, duration=10):
    """Cut an excerpt out of a media file with ffmpeg and return its path.

    Args:
        clip: Path of the source audio/video file, as accepted by
            ``ffmpeg.input``. ``None`` or ``""`` (nothing loaded yet) is
            treated as "no clip".
        start_pos: Value passed to ffmpeg as ``ss`` (start offset, seconds).
        duration: Value passed to ffmpeg as ``t`` (excerpt length, seconds).

    Returns:
        Path of the trimmed WAV file, or ``None`` when no clip was supplied.
    """
    if not clip:
        return None

    # Reserve a unique output path for this call; the file is kept on disk
    # (delete=False) so the caller can read it after this returns.
    with tempfile.NamedTemporaryFile(
        prefix="trim_", suffix=".wav", delete=False
    ) as tmp:
        save_location = tmp.name

    audio_input = ffmpeg.input(clip)
    audio_output = ffmpeg.output(audio_input, save_location, ss=start_pos, t=duration)
    # The placeholder file already exists, so ffmpeg must be allowed to
    # overwrite it instead of prompting.
    audio_output.run(overwrite_output=True)

    # Return the file path, not the ffmpeg node, so the result can be consumed
    # as an audio file.
    return save_location
# Gradio UI: a text prompt with two tabs of controls ("Default" voice presets
# and "Upload" for user/YouTube audio), followed by the generated-audio output.
# NOTE(review): the nesting below is reconstructed from a flattened source -
# confirm the intended layout.
with gr.Blocks() as app:
    with gr.Column():
        in_text = gr.Textbox()
        with gr.Tab("Default"):
            with gr.Row():
                speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list, value="1")
                speaker_lang = gr.Dropdown(label="Speaker Language", choices=lang_list, value="en")
            go_btn = gr.Button()
        with gr.Tab("Upload"):
            with gr.Row():
                in_aud_mic = gr.Audio(source='microphone')
                # Delivered to handlers as a file path: the trim handler feeds
                # this value to ffmpeg, which needs a file name rather than
                # decoded samples.
                in_aud_file = gr.Audio(source='upload', type='filepath')
            with gr.Row():
                in_aud_yt = gr.Textbox(label="YouTube URL")
                load_yt_btn = gr.Button("Load URL")
            with gr.Row():
                trim_clip_btn = gr.Button("Trim Clip")
                trim_aud = gr.Audio(source='upload')
            yt_vid = gr.Video()
            alt_go_btn = gr.Button()
    # Previously tried as a numeric input: speaker_num = gr.Number(value=0)
    with gr.Column():
        out_audio = gr.Audio()

    # Event wiring.
    go_btn.click(run_bark, [in_text, speaker_num, speaker_lang], out_audio)
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid, in_aud_file])
    trim_clip_btn.click(trim_clip, in_aud_file, trim_aud)
    # TODO: alt_go_btn has no handler yet (alt_go_btn.click() was left disabled).

app.launch()