"""Gradio voice-conversion demo.

Pipeline: separate an uploaded track into vocal and background through a
remote separation Space, convert the vocal with an RVC model, then overlay
the converted vocal back onto the background.
"""

import tempfile
from pathlib import Path

# NOTE(review): `spaces` is kept as the first third-party import, as in the
# original file; presumably it must be imported before any CUDA-initialising
# library on ZeroGPU -- confirm before reordering.
import spaces
import gradio as gr
from gradio_client import Client, handle_file
from zerorvc import RVC
import soundfile as sf
from pydub import AudioSegment
from joblib import Memory

# On-disk cache for separation results; `memory` keeps its original
# module-level name and still refers to the Memory instance.
memory = Memory(location="cache", verbose=0)


@memory.cache(ignore=["client"])
def split(client, audio):
    """Separate *audio* through the remote ``/separate`` endpoint.

    Args:
        client: ``gradio_client.Client`` used for the request. It is excluded
            from the cache key, so results are cached per ``audio`` value only.
        audio: Path of the audio file to separate.

    Returns:
        The first two elements of the endpoint's result; the caller treats
        them as ``(vocal, background)`` file paths.
    """
    # NOTE(review): the cached values are whatever the client returned
    # (presumably local file paths); if those files are cleaned up, a cache
    # hit would hand back stale paths -- verify.
    result = client.predict(
        param_0=handle_file(audio),
        param_1="BS-RoFormer",
        api_name="/separate",
    )
    return result[0], result[1]


@spaces.GPU
def convert(model, vocal, pitch_modification):
    """Convert *vocal* with the RVC model identified by *model*.

    Args:
        model: Identifier passed to ``RVC.from_pretrained``.
        vocal: Vocal audio passed to ``rvc.convert``.
        pitch_modification: Pitch shift forwarded to ``rvc.convert``.

    Returns:
        A ``(samples, sample_rate)`` tuple, where the sample rate is ``rvc.sr``.
    """
    rvc = RVC.from_pretrained(model)
    samples = rvc.convert(vocal, pitch_modification=pitch_modification)
    return samples, rvc.sr


def process_audio(client, model, audio, pitch_modification):
    """Run the full separate -> convert -> remix pipeline for one request.

    Args:
        client: Per-session ``gradio_client.Client`` for the separation Space.
        model: RVC model identifier.
        audio: Path of the uploaded audio file.
        pitch_modification: Pitch shift applied during conversion.

    Returns:
        A ``(combined_path, vocal_path, background_path)`` tuple of file paths.

    Raises:
        gr.Error: If no audio was provided or the session client is missing.
    """
    if not audio:
        raise gr.Error("Please upload an audio file first.")
    if client is None:
        raise gr.Error("Separation client is not ready; please reload the page.")

    vocal, bgm = split(client, audio)
    samples, sr = convert(model, vocal, pitch_modification)

    # Each request writes into its own temp directory. Fixed file names in the
    # working directory would let concurrent requests overwrite each other's
    # output before it is served.
    out_dir = Path(tempfile.mkdtemp(prefix="voice-conversion-"))
    vocal_path = str(out_dir / "vocal.wav")
    combined_path = str(out_dir / "combined.mp3")

    sf.write(vocal_path, samples, sr)

    converted = AudioSegment.from_wav(vocal_path)
    background = AudioSegment.from_mp3(bgm)
    combined = background.overlay(converted)
    combined.export(combined_path, format="mp3")

    return combined_path, vocal_path, bgm


# NOTE(review): `process_audio` takes four parameters but this interface only
# declares two inputs, and it is never launched in this file. It is kept so
# the module-level name `iface` stays available -- confirm whether it can go.
iface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Slider(minimum=-36, maximum=36, value=0, step=1),
    ],
    outputs=[
        gr.Audio(label="Combined"),
        gr.Audio(label="Vocal"),
        gr.Audio(label="Background"),
    ],
    title="Voice Conversion",
    description="Upload an audio file and process it.",
)


def set_client_for_session(request: gr.Request):
    """Create the separation client for the current browser session.

    The ``x-ip-token`` request header is forwarded as ``X-IP-Token`` when it
    is present. When it is absent, the client is created without extra
    headers instead of failing with ``KeyError``.

    Args:
        request: The Gradio request for the page load.

    Returns:
        A ``gradio_client.Client`` for ``JacobLinCool/vocal-separation``.
    """
    try:
        x_ip_token = request.headers["x-ip-token"]
    except KeyError:
        x_ip_token = None

    headers = {"X-IP-Token": x_ip_token} if x_ip_token else None
    return Client("JacobLinCool/vocal-separation", headers=headers)


with gr.Blocks() as demo:
    # Per-session state holding the Client created on page load.
    client = gr.State()
    model = gr.Text("someone/model")
    audio = gr.Audio(type="filepath")
    pitch = gr.Slider(minimum=-36, maximum=36, value=0, step=1)
    btn = gr.Button("Run", variant="primary")

    btn.click(
        process_audio,
        [client, model, audio, pitch],
        [
            gr.Audio(label="Combined"),
            gr.Audio(label="Vocal"),
            gr.Audio(label="Background"),
        ],
    )

    demo.load(set_client_for_session, None, client)

demo.launch()