from svoice.separate import *
import scipy.io.wavfile as wav
import gradio as gr
import os
import shutil
import torch
import soundfile as sf
from transformers import pipeline
from glob import glob
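# load_model() and separate_demo() come from svoice.separate via the wildcard
# import above; load_model() presumably restores the pretrained separation checkpoint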
load_model()
device = "cuda" if torch.cuda.is_available() else "cpu"
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
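# scratch directories: 'input' holds the mixture to separate,
# 'separated' receives the per-speaker stems written by separate_demo()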
os.makedirs('input', exist_ok=True)
os.makedirs('separated', exist_ok=True)
print("Loading ASR model...")
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=0 if device == "cuda" else -1)
print("ASR model loaded!")
def transcribe_audio(audiopath):
    # soundfile returns the waveform and its native sample rate
    audio_input, sr = sf.read(audiopath)
    # the ASR pipeline accepts raw audio as {"raw": array, "sampling_rate": sr};
    # generate_kwargs bounds the decoded sequence length
    result = pipe({"raw": audio_input, "sampling_rate": sr},
                  generate_kwargs={"max_new_tokens": 500})
    return result["text"]
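# separate whatever source was provided (uploaded file, microphone recording,
# or a bundled example), then transcribe each separated stem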
def separator(audio, rec_audio, example):
    outputs = {}
    # clear leftovers from the previous run
    for f in glob('input/*'):
        os.remove(f)
    for f in glob('separated/*'):
        os.remove(f)
    # gr.Audio with type="numpy" yields a (sample_rate, data) tuple
    if audio:
        wav.write('input/original.wav', audio[0], audio[1])
    elif rec_audio:
        wav.write('input/original.wav', rec_audio[0], rec_audio[1])
    else:
        shutil.copy(example, 'input/original.wav')
    # run svoice separation on everything in ./input; stems land in ./separated
    separate_demo(mix_dir="./input")
    separated_files = glob(os.path.join('separated', "*.wav"))
    separated_files = sorted([f for f in separated_files if "original.wav" not in f])
    outputs["transcripts"] = []
    for i, f in enumerate(separated_files):
        print(f"Transcribing separated audio {i+1} ...")
        outputs["transcripts"].append(transcribe_audio(f))
        print("Text:", outputs["transcripts"][-1])
    # the UI exposes seven audio and seven text slots; pad with None if the
    # model emitted fewer stems so the output count still matches
    audios = separated_files + [None] * (7 - len(separated_files))
    texts = outputs["transcripts"] + [None] * (7 - len(outputs["transcripts"]))
    return audios + texts
# currently unused helper; the Radio component's value is a single file path, not a list
def set_example_audio(example: str) -> dict:
    return gr.Audio.update(value=example)
demo = gr.Blocks()
with demo:
    gr.Markdown('''
    <center>
    <h1>Multiple Voice Separation with Transcription DEMO</h1>
    <div style="display:flex;align-items:center;justify-content:center;"><iframe src="https://streamable.com/e/0x8osl?autoplay=1&nocontrols=1" frameborder="0" allow="autoplay"></iframe></div>
    <p>
    This is a demo of a multiple-voice separation algorithm. The model is trained on the LibriMix7 dataset and separates up to seven voices from a single audio file.
    *This is an intermediate checkpoint shared for experimentation purposes. It does not perform well at a 16 kHz sample rate; see <b><a href="https://github.com/muhammad-ahmed-ghani/svoice_demo">svoice_demo</a></b> to train it at 8 kHz instead.
    </p>
    </center>
    ''')
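    # input sources: an uploaded file or a microphone recording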
with gr.Row():
input_audio = gr.Audio(label="Input audio", type="numpy")
rec_audio = gr.Audio(label="Record Using Microphone", type="numpy", source="microphone")
    # seven possible speakers, two per row, each with an audio player and a transcript box
    outputs_audio, outputs_text = [], []
    for start in range(0, 7, 2):
        with gr.Row():
            for i in range(start, min(start + 2, 7)):
                outputs_audio.append(gr.Audio(label=f'Speaker {i + 1}', interactive=False))
                outputs_text.append(gr.Text(label=f'Speaker {i + 1}', interactive=False))
button = gr.Button("Separate")
examples = [
"samples/mixture1.wav",
"samples/mixture2.wav",
"samples/mixture3.wav"
]
    example_selector = gr.Radio(examples, label="Example Audio")
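    # separator() returns seven audio paths followed by seven transcripts,
    # matching the order of outputs_audio + outputs_text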
button.click(separator, inputs=[input_audio, rec_audio, example_selector], outputs=outputs_audio + outputs_text)
demo.launch()