Siddhant committed
Commit 9264210 (1 parent: ec5cf2f)

Upload 2 files

Files changed (2)
  1. VAD/vad_handler.py +64 -0
  2. VAD/vad_iterator.py +100 -0
VAD/vad_handler.py ADDED
@@ -0,0 +1,64 @@
+ from VAD.vad_iterator import VADIterator
+ from baseHandler import BaseHandler
+ import numpy as np
+ import torch
+ from rich.console import Console
+
+ from utils.utils import int2float
+
+ import logging
+
+ logging.basicConfig(
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ )
+ logger = logging.getLogger(__name__)
+
+ console = Console()
+
+
+ class VADHandler(BaseHandler):
+     """
+     Handles voice activity detection. When voice activity is detected, audio is
+     accumulated until the end of speech is detected, then passed on to the next
+     handler in the pipeline.
+     """
+
+     def setup(
+         self,
+         should_listen,
+         thresh=0.3,
+         sample_rate=16000,
+         min_silence_ms=1000,
+         min_speech_ms=500,
+         max_speech_ms=float("inf"),
+         speech_pad_ms=30,
+     ):
+         self.should_listen = should_listen
+         self.sample_rate = sample_rate
+         self.min_silence_ms = min_silence_ms
+         self.min_speech_ms = min_speech_ms
+         self.max_speech_ms = max_speech_ms
+         self.model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
+         self.iterator = VADIterator(
+             self.model,
+             threshold=thresh,
+             sampling_rate=sample_rate,
+             min_silence_duration_ms=min_silence_ms,
+             speech_pad_ms=speech_pad_ms,
+         )
+
+     def process(self, audio_chunk):
+         # Incoming chunks are raw int16 PCM bytes; convert to float32 in [-1, 1]
+         # before handing them to the silero model.
+         audio_int16 = np.frombuffer(audio_chunk, dtype=np.int16)
+         audio_float32 = int2float(audio_int16)
+         vad_output = self.iterator(torch.from_numpy(audio_float32))
+         if vad_output is not None and len(vad_output) != 0:
+             logger.debug("VAD: end of speech detected")
+             array = torch.cat(vad_output).cpu().numpy()
+             duration_ms = len(array) / self.sample_rate * 1000
+             if duration_ms < self.min_speech_ms or duration_ms > self.max_speech_ms:
+                 logger.debug(
+                     f"audio input of duration: {duration_ms / 1000}s, skipping"
+                 )
+             else:
+                 # Pause upstream listening while the rest of the pipeline responds.
+                 self.should_listen.clear()
+                 logger.debug("Stop listening")
+                 yield array
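
Note: `int2float` is imported from `utils.utils`, which is not part of this commit. A minimal sketch of the int16-to-float32 conversion it presumably performs, modeled on the helper of the same name in the silero-vad examples (the project's actual implementation may differ):

import numpy as np

def int2float(sound: np.ndarray) -> np.ndarray:
    # Assumed behavior: int16 PCM samples -> float32 in [-1.0, 1.0].
    abs_max = np.abs(sound).max()
    sound = sound.astype(np.float32)
    if abs_max > 0:
        sound *= 1 / 32768  # scale by the int16 full-scale value
    return sound.squeeze()

Dividing by the int16 full-scale value keeps samples in [-1.0, 1.0], which is the range the silero model expects.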
VAD/vad_iterator.py ADDED
@@ -0,0 +1,100 @@
+ import torch
+
+
+ class VADIterator:
+     def __init__(
+         self,
+         model,
+         threshold: float = 0.5,
+         sampling_rate: int = 16000,
+         min_silence_duration_ms: int = 100,
+         speech_pad_ms: int = 30,
+     ):
+         """
+         Mainly taken from https://github.com/snakers4/silero-vad
+         Class for stream imitation
+
+         Parameters
+         ----------
+         model: preloaded .jit/.onnx silero VAD model
+
+         threshold: float (default - 0.5)
+             Speech threshold. Silero VAD outputs speech probabilities for each
+             audio chunk; probabilities ABOVE this value are considered SPEECH.
+             It is better to tune this parameter for each dataset separately,
+             but a "lazy" 0.5 is pretty good for most datasets.
+
+         sampling_rate: int (default - 16000)
+             Currently silero VAD models support 8000 and 16000 sample rates.
+
+         min_silence_duration_ms: int (default - 100 milliseconds)
+             At the end of each speech chunk, wait for min_silence_duration_ms
+             before separating it.
+
+         speech_pad_ms: int (default - 30 milliseconds)
+             Final speech chunks are padded by speech_pad_ms on each side.
+         """
+
+         self.model = model
+         self.threshold = threshold
+         self.sampling_rate = sampling_rate
+         self.is_speaking = False
+         self.buffer = []
+
+         if sampling_rate not in [8000, 16000]:
+             raise ValueError(
+                 "VADIterator does not support sampling rates other than [8000, 16000]"
+             )
+
+         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+         # Kept from upstream; padding is not applied in this buffered variant.
+         self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+         self.reset_states()
+
+     def reset_states(self):
+         self.model.reset_states()
+         self.triggered = False
+         self.temp_end = 0
+         self.current_sample = 0
+
+     @torch.no_grad()
+     def __call__(self, x):
+         """
+         x: torch.Tensor
+             audio chunk (see examples in the silero-vad repo)
+
+         Returns the list of buffered speech chunks once the end of an
+         utterance is detected, or None while still listening.
+         """
+
+         if not torch.is_tensor(x):
+             try:
+                 x = torch.Tensor(x)
+             except Exception:
+                 raise TypeError("Audio cannot be cast to tensor. Cast it manually")
+
+         window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+         self.current_sample += window_size_samples
+
+         speech_prob = self.model(x, self.sampling_rate).item()
+
+         if (speech_prob >= self.threshold) and self.temp_end:
+             self.temp_end = 0
+
+         if (speech_prob >= self.threshold) and not self.triggered:
+             self.triggered = True
+             return None
+
+         # Hysteresis: only consider silence once the probability drops well
+         # below the speech threshold.
+         if (speech_prob < self.threshold - 0.15) and self.triggered:
+             if not self.temp_end:
+                 self.temp_end = self.current_sample
+             if self.current_sample - self.temp_end < self.min_silence_samples:
+                 return None
+             else:
+                 # end of speech
+                 self.temp_end = 0
+                 self.triggered = False
+                 spoken_utterance = self.buffer
+                 self.buffer = []
+                 return spoken_utterance
+
+         if self.triggered:
+             self.buffer.append(x)
+
+         return None
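
For reference, a minimal sketch of how VADIterator can be driven directly, assuming the same torch.hub silero model that VADHandler loads (recent silero-vad releases expect fixed 512-sample windows at 16 kHz; older JIT models also accept other sizes):

import torch

from VAD.vad_iterator import VADIterator

model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
vad_iterator = VADIterator(model, threshold=0.3, sampling_rate=16000)

# Stand-in signal: 4 seconds of near-silence; replace with real microphone chunks.
audio = 0.01 * torch.randn(4 * 16000)
for start in range(0, len(audio) - 511, 512):
    utterance = vad_iterator(audio[start : start + 512])
    if utterance is not None:
        # A list of buffered speech chunks comes back at end of speech.
        n_samples = sum(len(chunk) for chunk in utterance)
        print(f"end of speech after {n_samples} buffered samples")
vad_iterator.reset_states()  # reset model state between independent streams

Because `__call__` returns None while speech is ongoing, callers only need to act when a list comes back, which is exactly how VADHandler.process consumes it.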