almost working
- app.py  +14 -13
- vad_utils.py  +15 -2

app.py  CHANGED
@@ -1,18 +1,19 @@
 import gradio as gr
 import numpy as np
-from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps
+from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
+import torch

-
-
-
-    probs = get_speech_probs(wav, model, sampling_rate=16_000)
+def process_audio(audio_input):
+    wav = read_audio(audio_input, sampling_rate=16_000)
+    probs = get_speech_probs(wav, sampling_rate=16_000)
     return make_visualization(probs, 512 / 16_000)

 def process_parameters(probs, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
     return probs2speech_timestamps(probs, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms)

-def main():
-
+def main():
+
+

     with gr.Blocks() as demo:
         with gr.Row():
@@ -20,15 +21,15 @@ def main():
             button1 = gr.Button("Process Audio")
             figure = gr.Image()

-        button1.click(process_audio, inputs=[audio_input
+        button1.click(process_audio, inputs=[audio_input], outputs=figure)

         with gr.Row():
             probs = gr.State(None)
-            threshold = gr.Number(label="Threshold",
-            min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)",
-            min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)",
-            window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536],
-            speech_pad_ms = gr.Number(label="Speech Pad (ms)",
+            threshold = gr.Number(label="Threshold", value=0.5, minimum=0.0, maximum=1.0)
+            min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)", value=250)
+            min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=100)
+            window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536)
+            speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
             button2 = gr.Button("Process Parameters")
             output_text = gr.Textbox()

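The wiring above only covers button1, and process_audio never writes into the probs state, so button2 and output_text are still left dangling (hence "almost working"). A minimal sketch of the missing wiring, assuming process_audio is extended to also return the probabilities; the extra return value and the button2.click call below are illustrative, not part of this commit:

def process_audio(audio_input):
    wav = read_audio(audio_input, sampling_rate=16_000)
    probs = get_speech_probs(wav, sampling_rate=16_000)
    # return the plot for gr.Image and the raw probabilities for gr.State
    return make_visualization(probs, 512 / 16_000), probs

button1.click(process_audio, inputs=[audio_input], outputs=[figure, probs])
button2.click(process_parameters,
              inputs=[probs, threshold, min_speech_duration_ms,
                      min_silence_duration_ms, window_size_samples, speech_pad_ms],
              outputs=output_text)

With this shape, gr.State holds the probabilities between the two callbacks, so re-running process_parameters with new threshold values does not re-run the VAD model.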
vad_utils.py  CHANGED
@@ -6,7 +6,7 @@ import torch.nn.functional as F
 import warnings

 def get_speech_probs(audio: torch.Tensor,
-                     model,
+                     # model,
                      threshold: float = 0.5,
                      sampling_rate: int = 16000,
                      window_size_samples: int = 512,
@@ -163,4 +163,17 @@ def make_visualization(probs, step):
                    xlabel='seconds',
                    ylabel='speech probability',
                    colormap='tab20')
-
+
+torch.set_num_threads(1)
+
+
+USE_ONNX = True  # set to False to use the plain PyTorch model instead of ONNX
+
+
+model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                              model='silero_vad',
+                              force_reload=True,
+                              onnx=USE_ONNX)
+(_,
+ _, read_audio,
+ *_) = utils
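For context, the tuple returned by torch.hub.load for silero-vad bundles five helpers, of which this commit keeps only read_audio; with the model parameter commented out, get_speech_probs now appears to rely on the module-level model loaded at the bottom of vad_utils.py. A rough usage sketch based on the snakers4/silero-vad examples; the file name and the full unpacking are illustrative, not from the commit:

import torch

model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              onnx=True)
# per the silero-vad examples the utils tuple unpacks as:
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils

# read_audio loads a file and resamples it to a mono float tensor
wav = read_audio('example.wav', sampling_rate=16_000)
speech = get_speech_timestamps(wav, model, sampling_rate=16_000)
print(speech)  # list of {'start': ..., 'end': ...} sample offsets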