# SalesAI / speech_to_text.py
# Push-to-talk style speech transcription using Vosk + PyAudio.
import json
import os
import sys

import pyaudio
from dotenv import load_dotenv
from vosk import Model, KaldiRecognizer
# Load environment variables from .env file so vosk_model_path can be read.
load_dotenv()

# Get the Vosk model path from the environment variable; it must point at a
# downloaded Vosk model directory.
vosk_model_path = os.getenv("vosk_model_path")
if not vosk_model_path:
    print("Error: vosk_model_path is not set in the .env file.")
    # exit with a non-zero status so callers/scripts can detect the failure
    # (bare exit() returns status 0, which reads as success)
    sys.exit(1)
# Initialize the Vosk model; fail fast with a clear message if the model
# directory is missing or unreadable.
try:
    model = Model(vosk_model_path)
    print("Vosk model loaded successfully.")
except Exception as e:
    print(f"Failed to load Vosk model: {e}")
    # non-zero status: nothing below can run without the model
    sys.exit(1)
# Recognizer fed by a mono 16 kHz microphone stream.
SAMPLE_RATE = 16000
FRAMES_PER_BUFFER = 4000

recognizer = KaldiRecognizer(model, SAMPLE_RATE)
audio = pyaudio.PyAudio()

# Open the default input device as 16-bit mono PCM at the recognizer's rate.
stream = audio.open(
    format=pyaudio.paInt16,
    channels=1,
    rate=SAMPLE_RATE,
    input=True,
    frames_per_buffer=FRAMES_PER_BUFFER,
)
stream.start_stream()

print("Say 'start listening' to begin transcription and 'stop listening' to stop.")

# State management: transcriptions are only printed while this flag is set.
is_listening = False
# Main capture loop. The spoken commands "start listening" / "stop listening"
# toggle whether transcriptions are printed; audio is always being decoded so
# the commands themselves can be recognized.
try:
    while True:
        data = stream.read(4000, exception_on_overflow=False)
        if recognizer.AcceptWaveform(data):
            # A full utterance was finalized by the recognizer.
            result = recognizer.Result()
            text = json.loads(result)["text"]

            # Check for commands to start or stop listening.
            if "start listening" in text.lower():
                is_listening = True
                print("Listening started. Speak into the microphone.")
                continue
            elif "stop listening" in text.lower():
                is_listening = False
                print("Listening stopped. Say 'start listening' to resume.")
                continue

            # Transcribe if actively listening.
            if is_listening:
                print(f"Transcription: {text}")
        else:
            # Still mid-utterance: show the partial hypothesis so far.
            chunk_result = recognizer.PartialResult()
            chunk_text = json.loads(chunk_result)["partial"]
            # Display partial transcription only if actively listening;
            # flush so the carriage-return overwrite is visible on
            # line-buffered stdout.
            if is_listening and chunk_text:
                print(f"chunk: {chunk_text}", end="\r", flush=True)
except KeyboardInterrupt:
    print("\nExiting...")
finally:
    # Always release the audio resources — previously this cleanup ran only
    # after KeyboardInterrupt, leaking the stream on any other exception.
    stream.stop_stream()
    stream.close()
    audio.terminate()