Spaces:
Running
on
Zero
Running
on
Zero
File size: 9,348 Bytes
02c7bdf bdaf47a 02c7bdf 7b02833 02c7bdf a1655f3 ed2aa07 02c7bdf 09a7355 2b1da53 09a7355 2b1da53 09a7355 1311e01 341eb54 ed2aa07 341eb54 ed2aa07 341eb54 ca9e869 ed2aa07 4c3cc25 6e4d760 75a5cbb 341eb54 6e4d760 3192961 09a7355 6e4d760 b78b7d0 e805751 7c1bd00 b78b7d0 e805751 b78b7d0 9f61737 6e4d760 8310825 43e8301 b78b7d0 7c1bd00 b78b7d0 99710ec dba113a 49effbd 0f8dddd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
import os
import random
@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    """Run speech enhancement on one audio file and return the result path.

    Args:
        input_wav: Path of the audio file to enhance.
        sr: Sampling-rate choice. ``"16000 Hz"`` selects the
            ``FRCRN_SE_16K`` model and a 16000 Hz output; any other value
            selects ``MossFormer2_SE_48K`` and a 48000 Hz output.

    Returns:
        Path of a newly created WAV file containing the enhanced audio.

    Raises:
        gr.Error: If no input file was provided.
    """
    import tempfile

    if input_wav is None:
        raise gr.Error("Please provide an input audio file.")

    if sr == "16000 Hz":
        model_name, fs = 'FRCRN_SE_16K', 16000
    else:
        model_name, fs = 'MossFormer2_SE_48K', 48000
    myClearVoice = ClearVoice(task='speech_enhancement', model_names=[model_name])

    result = myClearVoice(input_path=input_wav, online_write=False)
    # The result is either the audio itself or a dict wrapping it; in the
    # dict case the first entry is used.
    if isinstance(result, dict):
        output_wav = result[next(iter(result))]
    else:
        output_wav = result

    # Write to a unique file per call: a fixed file name would be shared by
    # every request and could be overwritten before it has been served.
    with tempfile.NamedTemporaryFile(prefix='enhanced_', suffix='.wav', delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, output_wav[0, :], fs)
    return output_path
@spaces.GPU
def fn_clearvoice_ss(input_wav):
    """Separate a speech mixture into two streams and return their paths.

    Args:
        input_wav: Path of the mixed-speech audio file.

    Returns:
        A tuple ``(path_s1, path_s2)`` of newly created WAV files, each
        written at 16000 Hz.

    Raises:
        gr.Error: If no input file was provided.
    """
    import tempfile

    if input_wav is None:
        raise gr.Error("Please provide an input audio file.")

    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    result = myClearVoice(input_path=input_wav, online_write=False)
    # The result is either the list of separated streams or a dict wrapping
    # it; in the dict case the first entry is used.
    if isinstance(result, dict):
        output_wav_list = result[next(iter(result))]
    else:
        output_wav_list = result

    # Each stream goes to its own unique file so that overlapping requests
    # cannot overwrite each other's results.
    output_paths = []
    for index in (0, 1):
        stream = output_wav_list[index]
        with tempfile.NamedTemporaryFile(
            prefix=f'separated_s{index + 1}_', suffix='.wav', delete=False
        ) as tmp:
            stream_path = tmp.name
        sf.write(stream_path, stream[0, :], 16000)
        output_paths.append(stream_path)
    return output_paths[0], output_paths[1]
def find_mp4_files(directory):
    """Recursively collect ``est*.mp4`` files below *directory*.

    Args:
        directory: Root directory to search, including all subdirectories.

    Returns:
        A sorted list of paths (each joined onto the directory it was found
        in) for every file whose name starts with ``est`` and ends with
        ``.mp4``. The list is empty when nothing matches or the directory
        does not exist.
    """
    mp4_files = []
    for root, _dirs, names in os.walk(directory):
        mp4_files.extend(
            os.path.join(root, name)
            for name in names
            if name.startswith('est') and name.endswith('.mp4')
        )
    # os.walk yields entries in filesystem order; sort for a stable result.
    mp4_files.sort()
    return mp4_files
@spaces.GPU()
def fn_clearvoice_tse(input_video):
    """Run audio-visual target speaker extraction on one video.

    The model writes its results under ``path_to_output_videos_tse``; the
    function then returns every ``est*.mp4`` file found in the subdirectory
    named after the input video (file name up to its first dot).

    Args:
        input_video: Path of the input video file.

    Returns:
        List of paths to the extracted output videos.
    """
    extractor = ClearVoice(
        task='target_speaker_extraction',
        model_names=['AV_MossFormer2_TSE_16K'],
    )
    print(f'input_video: {input_video}')
    extractor(
        input_path=input_video,
        online_write=True,
        output_path='path_to_output_videos_tse',
    )
    video_stem = os.path.basename(input_video).split(".")[0]
    results_dir = f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{video_stem}/'
    return find_mp4_files(results_dir)
@spaces.GPU
def fn_clearvoice_sr(input_wav, apply_se):
    """Run speech super-resolution on one audio file, optionally denoising first.

    Args:
        input_wav: Path of the audio file to process.
        apply_se: When true, the input is first passed through the
            ``MossFormer2_SE_48K`` speech-enhancement model and the enhanced
            file is used as the super-resolution input.

    Returns:
        Path of a newly created WAV file, written at 48000 Hz.

    Raises:
        gr.Error: If no input file was provided.
    """
    import tempfile

    if input_wav is None:
        raise gr.Error("Please provide an input audio file.")

    myClearVoice = ClearVoice(task='speech_super_resolution', model_names=['MossFormer2_SR_48K'])
    fs = 48000

    if apply_se:
        wavname = input_wav.split('/')[-1]
        # NOTE(review): the random suffix is only inserted when the name
        # contains '.wav', and how ClearVoice interprets `output_path` is not
        # visible here -- confirm before changing this naming scheme.
        new_wavname = wavname.replace('.wav', str(random.randint(0, 1000)) + '.wav')
        myClearVoice_se = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        myClearVoice_se(input_path=input_wav, online_write=True, output_path=new_wavname)
        input_wav = new_wavname

    result = myClearVoice(input_path=input_wav, online_write=False)
    # The result is either the audio itself or a dict wrapping it; in the
    # dict case the first entry is used.
    if isinstance(result, dict):
        output_wav = result[next(iter(result))]
    else:
        output_wav = result

    # Write to a unique file per call: a fixed file name would be shared by
    # every request and could be overwritten before it has been served.
    with tempfile.NamedTemporaryFile(prefix='enhanced_high_res_', suffix='.wav', delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, output_wav[0, :], fs)
    return output_path
# Top-level container that hosts the tabbed task interfaces.
demo = gr.Blocks()

# Speech-enhancement interface: one audio input plus an output sampling-rate
# choice, producing one enhanced audio file.
se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16000 Hz", "48000 Hz"],
            value="16000 Hz",
            multiselect=False,
            info="Choose a sampling rate for your output.",
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # The anchor is closed with </a> so that only "ClearVoice" is the link.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Enhancement",
    description=(
        "ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts clear speech from background noise for enhanced speech quality. It supports both 16 kHz and 48 kHz audio outputs. "
        "To try it, simply upload your audio, or click one of the examples. "
    ),
    article=(
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"
    ),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", "16000 Hz"],
        ["examples/english_speech_48kHz.wav", "48000 Hz"],
    ],
    cache_examples=True,
)
# Speech-separation interface: one mixed audio input, two separated audio
# outputs.
ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # The anchor is closed with </a> so that only "ClearVoice" is the link.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Separation",
    description=(
        "ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is powered by AI and separates individual speech from mixed audio. It supports 16 kHz and two output streams. "
        "To try it, simply upload your audio, or click one of the examples. "
    ),
    article=(
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> </p>"
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"
    ),
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)
# Audio-visual speaker-extraction interface: one video input, a gallery of
# output videos.
tse_demo = gr.Interface(
    fn=fn_clearvoice_tse,
    inputs=[
        gr.Video(label="Input Video"),
    ],
    outputs=[
        gr.Gallery(label="Output Video List"),
    ],
    # The anchor is closed with </a> so that only "ClearVoice" is the link.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Audio-Visual Speaker Extraction",
    description=(
        "ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts each speaker's voice from a multi-speaker video using facial recognition. "
        "To try it, simply upload your video, or click one of the examples. "
    ),
    examples=[
        ['examples/001.mp4'],
        ['examples/002.mp4'],
    ],
    cache_examples=True,
)
# Speech super-resolution interface: one audio input plus a checkbox that
# toggles the speech-enhancement pre-pass, producing one audio output.
sr_demo = gr.Interface(
    fn=fn_clearvoice_sr,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Checkbox(label="Apply Speech Enhancement", value=True),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # The anchor is closed with </a> so that only "ClearVoice" is the link.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Super Resolution",
    description=(
        "ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and transforms low-resolution audio (effective sampling rate ≥ 16 kHz) into crystal-clear, high-resolution audio at 48 kHz. It supports most audio types. "
        "To try it, simply upload your audio, or click one of the examples. "
    ),
    article=(
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"
    ),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", True],
        ["examples/LJSpeech-001-0001-22k.wav", True],
        ["examples/LibriTTS_986_129388_24k.wav", True],
        ["examples/english_speech_48kHz.wav", True],
    ],
    cache_examples=True,
)
# Assemble the four task interfaces as tabs inside the top-level container,
# then start the app.
with demo:
    gr.TabbedInterface(
        [se_demo, ss_demo, sr_demo, tse_demo],
        [
            "Task 1: Speech Enhancement",
            "Task 2: Speech Separation",
            "Task 3: Speech Super Resolution",
            "Task 4: Audio-Visual Speaker Extraction",
        ],
    )

demo.launch()