Spaces:
Running
Running
File size: 4,528 Bytes
cfd58c5 2a5abe3 69b1728 88cd06b cfd58c5 d10d42c 1567aff dce80ca 82f1825 cfd58c5 88cd06b cfd58c5 88cd06b b026f7e af8075f 88cd06b b026f7e f4e63b7 99cf07f a2597a8 f3aa612 cfd58c5 249facd f3aa612 cfd58c5 c370655 f3aa612 cfd58c5 dce80ca 6113dfa dce80ca d10d42c 1cb3525 a8e72fd 1cb3525 0f61089 99954b1 1cb3525 a9f1358 1567aff 1cb3525 1567aff dce80ca 99954b1 1567aff 99954b1 1567aff 82f1825 1567aff d10d42c cfd58c5 107d5cd 3f1c26e d10d42c fad8edf 3f1c26e 071b368 99954b1 fad8edf dce80ca b026f7e cfd58c5 107d5cd b026f7e 402d823 99954b1 6113dfa cfd58c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import gradio as gr
import torch
from pathlib import Path
from transformers import AutoProcessor, BarkModel
import scipy
from pytube import YouTube
from pydub import AudioSegment
from TTS.api import TTS
#import ffmpeg
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
# model.enable_cpu_offload()
device = "cpu"
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)
num_list = ["1","2","3","4","5","6","7","8","9","10"]
lang_list = ["en","de"]
def run_bark(text, n, lang):
#history_prompt = []
semantic_prompt=f"v2/{lang}_speaker_{int(n)-1}"
#text=["Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe."],
inputs = processor(text=text,
voice_preset = semantic_prompt,
return_tensors="pt",
)
speech_values = model.generate(**inputs, do_sample=True)
sampling_rate = model.generation_config.sample_rate
#sampling_rate = model.config.sample_rate
#sampling_rate = 24000
scipy.io.wavfile.write("bark_out.wav", rate=sampling_rate, data=speech_values.cpu().numpy().squeeze())
return ("bark_out.wav")
def custom_bark(inp):
speaker_wav=Path("Mid.mp3")
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
tts.tts_to_file(inp, speaker_wav=speaker_wav, language="en", file_path="output.wav")
return ("output.wav")
def load_video_yt(vid):
yt = YouTube(vid)
vid = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first().download(filename="tmp.mp4")
vid_aud = yt.streams.filter(only_audio=True)[0].download(filename="tmp_aud.mp4")
print (yt.length)
return vid, vid_aud, "tmp_aud.mp4"
def trim_clip(clip, start_t, end_t):
clip = Path("tmp_aud.mp4")
#clip = "tmp_aud.mp3"
# Open an mp3 file
song = AudioSegment.from_file("tmp_aud.mp4",
format="mp4")
# start and end time
#start_min = 0
#start_sec = 10
#end_min = 0
#end_sec = 55
start_min = int(start_t.split(":",1)[0])
start_sec = int(start_t.split(":",1)[1])
end_min = int(end_t.split(":",1)[0])
end_sec = int(end_t.split(":",1)[1])
# pydub does things in milliseconds, so convert time
start = ((start_min*60)+start_sec)*1000
end = ((end_min*60)+end_sec)*1000
#start = 0
#end = 15*1000
# song clip of 10 seconds from starting
first_10_seconds = song[start: end]
# save file
first_10_seconds.export("Mid.mp3", format="mp3")
print("New Audio file is created and saved")
return "Mid.mp3"
with gr.Blocks() as app:
with gr.Column():
in_text = gr.Textbox()
with gr.Tab("Default"):
with gr.Row():
speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list,value="1")
speaker_lang = gr.Dropdown(label="Speaker Language", choices=lang_list,value="en")
go_btn = gr.Button()
with gr.Tab("Upload"):
with gr.Row():
with gr.Column():
in_aud_mic = gr.Audio(source='microphone')
in_aud_file = gr.Audio(source='upload', interactive = True)
aud_file = gr.File()
with gr.Column():
in_aud_yt = gr.Textbox(label="YouTube URL")
load_yt_btn = gr.Button("Load URL")
with gr.Column():
with gr.Row():
start_time = gr.Textbox(label = "Start", value = "0:00", placeholder = "0:23")
end_time = gr.Textbox(label = "End", value = "0:01", placeholder = "1:12")
trim_clip_btn = gr.Button("Trim Clip")
trim_aud = gr.Audio(source='upload', interactive = False)
alt_go_btn = gr.Button()
yt_vid = gr.Video(type = 'filepath')
#speaker_num = gr.Number(value=0)
with gr.Column():
out_audio = gr.Audio()
go_btn.click(run_bark,[in_text, speaker_num, speaker_lang],out_audio)
load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid,in_aud_file,aud_file])
trim_clip_btn.click(trim_clip,[aud_file, start_time, end_time],trim_aud)
alt_go_btn.click(custom_bark, in_text, out_audio)
app.launch() |