"""Gradio demo for Bark text-to-speech.

The "Default" tab synthesises speech from text with one of the built-in
Bark voice presets.  The "Upload" tab collects reference audio (microphone,
file upload, or a YouTube URL) and can trim it to a short clip.
"""

import gradio as gr
import scipy
import scipy.io.wavfile  # explicit: a bare `import scipy` may not load this submodule
import torch
from pydub import AudioSegment
from pytube import YouTube
from transformers import AutoProcessor, BarkModel

#import ffmpeg

# GPU alternative, kept for reference:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
# model.enable_cpu_offload()
device = "cpu"

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)

# Choices offered by the two dropdowns; speaker numbers are 1-based in the UI.
num_list = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
lang_list = ["en", "de"]


def run_bark(text, n, lang):
    """Synthesise *text* with a Bark voice preset and write it to a WAV file.

    Args:
        text: Prompt to speak.
        n: 1-based speaker number (string or int); converted to the 0-based
            index used in the preset name.
        lang: Language code used in the preset name (e.g. ``"en"``).

    Returns:
        The path ``"bark_out.wav"``.  The same file is overwritten on every
        call.
    """
    voice_preset = f"v2/{lang}_speaker_{int(n) - 1}"
    inputs = processor(
        text=text,
        voice_preset=voice_preset,
        return_tensors="pt",
    )
    speech_values = model.generate(**inputs, do_sample=True)

    sampling_rate = model.generation_config.sample_rate
    scipy.io.wavfile.write(
        "bark_out.wav",
        rate=sampling_rate,
        data=speech_values.cpu().numpy().squeeze(),
    )
    return "bark_out.wav"


def load_video_yt(vid):
    """Download a YouTube video and an audio-only stream of it.

    Args:
        vid: YouTube URL.

    Returns:
        ``(video_path, audio_path)`` as returned by pytube's ``download``;
        the requested filenames are ``tmp.mp4`` and ``tmp_aud.mp3``.

    Raises:
        ValueError: If the video has no progressive mp4 stream or no
            audio-only stream.
    """
    yt = YouTube(vid)

    video_stream = (
        yt.streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if video_stream is None:
        raise ValueError("No progressive mp4 stream available for this video")

    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise ValueError("No audio-only stream available for this video")

    video_path = video_stream.download(filename="tmp.mp4")
    # NOTE: the file gets an .mp3 name, but the stream is saved as delivered
    # and is not transcoded, so its real container may differ from the name.
    audio_path = audio_stream.download(filename="tmp_aud.mp3")
    return video_path, audio_path


def trim_clip(clip, start_ms=0, end_ms=15 * 1000):
    """Cut a window out of an audio file and save it as ``Mid.mp3``.

    Args:
        clip: Path to the source audio file.
        start_ms: Start of the window in milliseconds (pydub slices in ms).
        end_ms: End of the window in milliseconds.

    Returns:
        The path ``"Mid.mp3"``.

    Raises:
        ValueError: If *clip* is empty or ``None`` (nothing loaded yet).
    """
    if not clip:
        raise ValueError("No audio clip provided to trim")

    # Let ffmpeg detect the container instead of forcing "mp3": uploads may be
    # wav, and downloaded audio may not be real mp3 despite its file name.
    song = AudioSegment.from_file(f'{clip}')
    trimmed = song[start_ms:end_ms]

    trimmed.export("Mid.mp3", format="mp3")
    print("New Audio file is created and saved")
    return "Mid.mp3"


# NOTE(review): the nesting below was reconstructed from the component order;
# confirm it matches the intended layout.
with gr.Blocks() as app:
    with gr.Column():
        in_text = gr.Textbox()
        with gr.Tab("Default"):
            with gr.Row():
                speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list, value="1")
                speaker_lang = gr.Dropdown(label="Speaker Language", choices=lang_list, value="en")
            go_btn = gr.Button()
        with gr.Tab("Upload"):
            with gr.Row():
                in_aud_mic = gr.Audio(source='microphone')
                # type="filepath" so trim_clip receives a path it can open,
                # not the default (sample_rate, ndarray) tuple.
                in_aud_file = gr.Audio(source='upload', type='filepath')
            with gr.Row():
                in_aud_yt = gr.Textbox(label="YouTube URL")
                load_yt_btn = gr.Button("Load URL")
            with gr.Row():
                trim_clip_btn = gr.Button("Trim Clip")
                trim_aud = gr.Audio(source='upload')
            yt_vid = gr.Video(type='filepath')
            trim_vid = gr.Video()
            alt_go_btn = gr.Button()
            #speaker_num = gr.Number(value=0)
    with gr.Column():
        out_audio = gr.Audio()

    go_btn.click(run_bark, [in_text, speaker_num, speaker_lang], out_audio)
    load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid, in_aud_file])
    trim_clip_btn.click(trim_clip, in_aud_file, trim_aud)
    #alt_go_btn.click()

app.launch()