Siddhant committed
Commit 9264210 (1 parent: ec5cf2f)

Upload 2 files

Files changed (2)
  1. VAD/vad_handler.py +64 -0
  2. VAD/vad_iterator.py +100 -0
VAD/vad_handler.py ADDED
@@ -0,0 +1,64 @@
+ from VAD.vad_iterator import VADIterator
+ from baseHandler import BaseHandler
+ import numpy as np
+ import torch
+ from rich.console import Console
+
+ from utils.utils import int2float
+
+ import logging
+
+ logging.basicConfig(
+     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ )
+ logger = logging.getLogger(__name__)
+
+ console = Console()
+
+
+ class VADHandler(BaseHandler):
+     """
+     Handles voice activity detection. When voice activity is detected, audio is
+     accumulated until the end of speech is detected, then passed on to the next
+     handler in the pipeline.
+     """
+
+     def setup(
+         self,
+         should_listen,
+         thresh=0.3,
+         sample_rate=16000,
+         min_silence_ms=1000,
+         min_speech_ms=500,
+         max_speech_ms=float("inf"),
+         speech_pad_ms=30,
+     ):
+         self.should_listen = should_listen
+         self.sample_rate = sample_rate
+         self.min_silence_ms = min_silence_ms
+         self.min_speech_ms = min_speech_ms
+         self.max_speech_ms = max_speech_ms
+         self.model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
+         self.iterator = VADIterator(
+             self.model,
+             threshold=thresh,
+             sampling_rate=sample_rate,
+             min_silence_duration_ms=min_silence_ms,
+             speech_pad_ms=speech_pad_ms,
+         )
+
+     def process(self, audio_chunk):
+         # Incoming chunks are raw int16 PCM bytes; convert to float32 in [-1, 1]
+         # before handing them to the silero model.
+         audio_int16 = np.frombuffer(audio_chunk, dtype=np.int16)
+         audio_float32 = int2float(audio_int16)
+         vad_output = self.iterator(torch.from_numpy(audio_float32))
+         if vad_output is not None and len(vad_output) != 0:
+             logger.debug("VAD: end of speech detected")
+             array = torch.cat(vad_output).cpu().numpy()
+             duration_ms = len(array) / self.sample_rate * 1000
+             if duration_ms < self.min_speech_ms or duration_ms > self.max_speech_ms:
+                 logger.debug(
+                     f"audio input of duration: {duration_ms / 1000}s, skipping"
+                 )
+             else:
+                 # Pause upstream listening while the rest of the pipeline responds.
+                 self.should_listen.clear()
+                 logger.debug("Stop listening")
+                 yield array
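
Note: `int2float` is imported from `utils.utils`, which is not part of this commit. A minimal sketch of the int16-to-float32 conversion it presumably performs, modeled on the helper of the same name in the silero-vad examples (the project's actual implementation may differ):

import numpy as np

def int2float(sound: np.ndarray) -> np.ndarray:
    # Assumed behavior: int16 PCM samples -> float32 in [-1.0, 1.0].
    abs_max = np.abs(sound).max()
    sound = sound.astype(np.float32)
    if abs_max > 0:
        sound *= 1 / 32768  # scale by the int16 full-scale value
    return sound.squeeze()

Dividing by the int16 full-scale value keeps samples in [-1.0, 1.0], which is the range the silero model expects.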
VAD/vad_iterator.py ADDED
@@ -0,0 +1,100 @@
+ import torch
+
+
+ class VADIterator:
+     def __init__(
+         self,
+         model,
+         threshold: float = 0.5,
+         sampling_rate: int = 16000,
+         min_silence_duration_ms: int = 100,
+         speech_pad_ms: int = 30,
+     ):
+         """
+         Mainly taken from https://github.com/snakers4/silero-vad
+         Class for stream imitation
+
+         Parameters
+         ----------
+         model: preloaded .jit/.onnx silero VAD model
+
+         threshold: float (default - 0.5)
+             Speech threshold. Silero VAD outputs speech probabilities for each
+             audio chunk; probabilities ABOVE this value are considered SPEECH.
+             It is better to tune this parameter for each dataset separately,
+             but a "lazy" 0.5 is pretty good for most datasets.
+
+         sampling_rate: int (default - 16000)
+             Currently silero VAD models support 8000 and 16000 sample rates.
+
+         min_silence_duration_ms: int (default - 100 milliseconds)
+             At the end of each speech chunk, wait for min_silence_duration_ms
+             before separating it.
+
+         speech_pad_ms: int (default - 30 milliseconds)
+             Final speech chunks are padded by speech_pad_ms on each side.
+         """
+
+         self.model = model
+         self.threshold = threshold
+         self.sampling_rate = sampling_rate
+         self.is_speaking = False
+         self.buffer = []
+
+         if sampling_rate not in [8000, 16000]:
+             raise ValueError(
+                 "VADIterator does not support sampling rates other than [8000, 16000]"
+             )
+
+         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+         # Kept from upstream; padding is not applied in this buffered variant.
+         self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+         self.reset_states()
+
+     def reset_states(self):
+         self.model.reset_states()
+         self.triggered = False
+         self.temp_end = 0
+         self.current_sample = 0
+
+     @torch.no_grad()
+     def __call__(self, x):
+         """
+         x: torch.Tensor
+             audio chunk (see examples in the silero-vad repo)
+
+         Returns the list of buffered speech chunks once the end of an
+         utterance is detected, or None while still listening.
+         """
+
+         if not torch.is_tensor(x):
+             try:
+                 x = torch.Tensor(x)
+             except Exception:
+                 raise TypeError("Audio cannot be cast to tensor. Cast it manually")
+
+         window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+         self.current_sample += window_size_samples
+
+         speech_prob = self.model(x, self.sampling_rate).item()
+
+         if (speech_prob >= self.threshold) and self.temp_end:
+             self.temp_end = 0
+
+         if (speech_prob >= self.threshold) and not self.triggered:
+             self.triggered = True
+             return None
+
+         # Hysteresis: only consider silence once the probability drops well
+         # below the speech threshold.
+         if (speech_prob < self.threshold - 0.15) and self.triggered:
+             if not self.temp_end:
+                 self.temp_end = self.current_sample
+             if self.current_sample - self.temp_end < self.min_silence_samples:
+                 return None
+             else:
+                 # end of speech
+                 self.temp_end = 0
+                 self.triggered = False
+                 spoken_utterance = self.buffer
+                 self.buffer = []
+                 return spoken_utterance
+
+         if self.triggered:
+             self.buffer.append(x)
+
+         return None
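
For reference, a minimal sketch of how VADIterator can be driven directly, assuming the same torch.hub silero model that VADHandler loads (recent silero-vad releases expect fixed 512-sample windows at 16 kHz; older JIT models also accept other sizes):

import torch

from VAD.vad_iterator import VADIterator

model, _ = torch.hub.load("snakers4/silero-vad", "silero_vad")
vad_iterator = VADIterator(model, threshold=0.3, sampling_rate=16000)

# Stand-in signal: 4 seconds of near-silence; replace with real microphone chunks.
audio = 0.01 * torch.randn(4 * 16000)
for start in range(0, len(audio) - 511, 512):
    utterance = vad_iterator(audio[start : start + 512])
    if utterance is not None:
        # A list of buffered speech chunks comes back at end of speech.
        n_samples = sum(len(chunk) for chunk in utterance)
        print(f"end of speech after {n_samples} buffered samples")
vad_iterator.reset_states()  # reset model state between independent streams

Because `__call__` returns None while speech is ongoing, callers only need to act when a list comes back, which is exactly how VADHandler.process consumes it.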