Spaces:

Satyam-Singh
/

Speech-to-speech

Build error

App Files Files Community

Speech-to-speech / app.py

Satyam-Singh

Create app.py

a0e2f2a verified about 14 hours ago

raw

history blame contribute delete

4.93 kB

	import os
	from gtts import gTTS
	from io import BytesIO
	from pydub import AudioSegment
	from pydub.playback import play
	import speech_recognition as sr
	from IPython.display import Audio
	import google.generativeai as genai
	from dotenv import load_dotenv
	import streamlit as st

	# Directory where generated voice files are stored. The path is relative, so
	# it is resolved against the process's current working directory. It is
	# created at import time; exist_ok=True makes this a no-op when the
	# directory is already present.
	path = "../data/voice/"
	os.makedirs(path, exist_ok=True)

	# 1. Save and play voice created by Google Text-to-Speech (gTTS)
	def text_to_audio(text, filename):
	    """Synthesize ``text`` with gTTS and save it in the voice directory.

	    Args:
	        text: Text to convert to speech.
	        filename: File name, joined onto the module-level ``path``.

	    Returns:
	        The path of the audio file that was written.
	    """
	    speech = gTTS(text)
	    destination = os.path.join(path, filename)
	    speech.save(destination)
	    return destination

	def play_audio(file_path):
	    """Load the audio file at ``file_path`` with pydub and play it."""
	    play(AudioSegment.from_file(file_path))

	# 2. Use microphone to record voice
	def record_audio(duration=4):
	    """Capture one spoken phrase from the default microphone.

	    Args:
	        duration: Number of seconds to wait for speech to begin and, once
	            it has begun, the maximum length in seconds of the captured
	            phrase.

	    Returns:
	        The audio data object returned by ``Recognizer.listen``.

	    Raises:
	        sr.WaitTimeoutError: If no speech starts within ``duration``
	            seconds.
	    """
	    recognizer = sr.Recognizer()
	    with sr.Microphone() as source:
	        print("Adjusting noise...")
	        recognizer.adjust_for_ambient_noise(source, duration=1)
	        print(f"Recording for {duration} seconds...")
	        # ``timeout`` only bounds the wait for speech to *start*. Without
	        # ``phrase_time_limit`` the capture keeps going until a pause is
	        # detected, so the recording could run far longer than announced.
	        recorded_audio = recognizer.listen(
	            source,
	            timeout=duration,
	            phrase_time_limit=duration,
	        )
	    print("Done recording.")
	    return recorded_audio

	# 3. Convert the recorded voice to text through speech-to-text (STT)
	def audio_to_text(audio):
	    """Transcribe recorded audio with Google Speech Recognition (en-US).

	    Args:
	        audio: Recorded audio as accepted by ``Recognizer.recognize_google``.

	    Returns:
	        The recognized text, or a fixed explanatory sentence when the audio
	        could not be understood or the service request failed.
	    """
	    stt = sr.Recognizer()
	    try:
	        print("Recognizing the text...")
	        transcript = stt.recognize_google(audio, language="en-US")
	    except sr.UnknownValueError:
	        return "Google Speech Recognition could not understand the audio."
	    except sr.RequestError:
	        return "Could not request results from Google Speech Recognition service."
	    print("Decoded Text: {}".format(transcript))
	    return transcript

	# 4. Convert the text to voice through text-to-speech (TTS)
	def text_to_speech(text):
	    """Speak ``text`` aloud without writing a file to disk.

	    The gTTS output is written into an in-memory buffer, decoded as MP3 by
	    pydub, and played.

	    Args:
	        text: Text to synthesize and play.
	    """
	    speech = gTTS(text)
	    mp3_stream = BytesIO()
	    speech.write_to_fp(mp3_stream)
	    # Rewind so the decoder reads from the start of the buffer.
	    mp3_stream.seek(0)
	    play(AudioSegment.from_file(mp3_stream, format="mp3"))

	# 5. Make a voice-to-voice stream
	def voice_to_voice():
	    """Record from the microphone, transcribe it, and speak the text back."""
	    text_to_speech(audio_to_text(record_audio()))

	# 6. Integrate an LLM to respond to voice input with voice output
	# Module-level Gemini setup, executed at import time. load_dotenv() runs
	# before the key is read so that a .env file can supply GOOGLE_API_KEY.
	load_dotenv()
	# os.getenv returns None when the variable is unset; that value is passed
	# to genai.configure unchanged.
	GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
	genai.configure(api_key=GOOGLE_API_KEY)
	# Shared model handle used by respond_by_gemini().
	gemini_pro = genai.GenerativeModel(model_name="models/gemini-pro")

	def respond_by_gemini(input_text, role_text, instructions_text):
	    """Ask the module-level Gemini model for a reply and return it as text.

	    Args:
	        input_text: The user's text, sent with an ``INPUT_TEXT: `` prefix.
	        role_text: Role description, sent with a ``ROLE: `` prefix.
	        instructions_text: Instructions appended to the prompt as-is.

	    Returns:
	        The streamed response chunks concatenated into a single string.
	    """
	    prompt_parts = [
	        "ROLE: " + role_text,
	        "INPUT_TEXT: " + input_text,
	        instructions_text,
	    ]
	    stream = gemini_pro.generate_content(prompt_parts, stream=True)
	    # The response is streamed; stitch the chunk texts together in order.
	    return "".join(piece.text for piece in stream)

	def llm_voice_response(topic='The future of artificial intelligence'):
	    """Record a spoken prompt, get a Gemini reply, and speak it aloud.

	    If speech recognition fails, the recognizer's explanatory sentence is
	    spoken instead of querying the model. If no speech is detected before
	    the recording timeout, a short notice is spoken and the function
	    returns.

	    Args:
	        topic: Conversation topic inserted into the role and instruction
	            prompts sent to the model.
	    """
	    role = 'You are an intelligent assistant to chat on the topic: `{}`.'
	    role_text = role.format(topic)
	    instructions = 'Respond to the INPUT_TEXT briefly in chat style. Respond based on your knowledge about `{}` in brief chat style.'
	    instructions_text = instructions.format(topic)

	    try:
	        recorded_audio = record_audio()
	    except sr.WaitTimeoutError:
	        # record_audio() listens with a timeout, which raises when the user
	        # does not start speaking in time; report it instead of crashing.
	        text_to_speech("No speech was detected. Please try again.")
	        return

	    text = audio_to_text(recorded_audio)

	    # audio_to_text() reports failures as these fixed sentences; they are
	    # spoken back as-is rather than being sent to the model.
	    recognition_errors = (
	        "Google Speech Recognition could not understand the audio.",
	        "Could not request results from Google Speech Recognition service.",
	    )
	    response_text = text
	    if text not in recognition_errors:
	        response_text = respond_by_gemini(text, role_text, instructions_text)
	    text_to_speech(response_text)

	# 7. Build a Web interface for the LLM-supported voice assistant
	def main():
	    """Render the Streamlit page for the LLM-supported voice assistant.

	    Configures the page, injects the custom CSS, shows the title and
	    instructions, and runs one record/respond cycle when the button is
	    pressed.
	    """
	    st.set_page_config(page_title="LLM-Supported Voice Assistant", layout="wide")

	    # Custom CSS, injected as raw HTML.
	    page_style = """
	<link href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" rel="stylesheet">
	<style>
	.main {background-color: #f5f5f5;}
	.container {max-width: 800px; margin: auto; padding-top: 50px;}
	.title {font-family: 'Arial', sans-serif; color: #333333; margin-bottom: 30px;}
	.btn {background-color: #4CAF50; color: white; border: none; padding: 10px 20px; cursor: pointer; font-size: 16px;}
	.btn:hover {background-color: #45a049;}
	</style>
	"""
	    st.markdown(page_style, unsafe_allow_html=True)

	    title_html = "<div class='container'><h1 class='title'>LLM-Supported Voice Assistant</h1></div>"
	    st.markdown(title_html, unsafe_allow_html=True)

	    st.write("This is a voice assistant with LLM support. Speak to the microphone, and the assistant will respond.")

	    record_clicked = st.button("Record and Get Response", key="record_btn")
	    if record_clicked:
	        st.write("Listening...")
	        llm_voice_response()
	        st.write("Done.")

	    help_html = "<div class='container'><h5>Press the button and speak to the microphone. The assistant will generate a response based on the input and speak it out loud.</h5></div>"
	    st.markdown(help_html, unsafe_allow_html=True)

	# Script entry point: build the page only when this file is run directly.
	if __name__ == "__main__":
	    main()