Spaces:

gauravgulati619
/

MediVox

Sleeping

App Files Files Community

MediVox / patientvoice.py

gauravgulati619

feat: update to Gemini, add optional inputs, and apply new theme

ef46851 about 2 months ago

raw

history blame contribute delete

2.61 kB

	import logging
	import speech_recognition as sr
	from pydub import AudioSegment
	from io import BytesIO
	import os
	import google.generativeai as genai
	from dotenv import load_dotenv
	import base64

	# Configure the root logger once at import time: INFO and above, one
	# "timestamp - level - message" line per record.
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)

	def record_audio(file_path, timeout=20, phrase_time_limit=None):
	    """
	    Record one phrase from the microphone and save it as an MP3 file.

	    Failures are logged instead of raised, so this function always returns
	    ``None``; the return value does not tell the caller whether a file was
	    written.

	    Args:
	        file_path (str): Path to save the recorded audio file.
	        timeout (int): Maximum time to wait for a phrase to start (in seconds).
	        phrase_time_limit (int | None): Maximum time for the phrase to be
	            recorded (in seconds). The default ``None`` is passed to
	            ``Recognizer.listen`` unchanged.
	    """
	    recognizer = sr.Recognizer()

	    try:
	        with sr.Microphone() as source:
	            logging.info("Adjusting for ambient noise...")
	            recognizer.adjust_for_ambient_noise(source, duration=1)
	            logging.info("Start speaking now...")

	            # Record the audio
	            audio_data = recognizer.listen(
	                source,
	                timeout=timeout,
	                phrase_time_limit=phrase_time_limit,
	            )
	        logging.info("Recording complete.")

	        # The capture is fully in memory at this point, so the MP3 encoding
	        # runs after the microphone context has been exited.
	        wav_data = audio_data.get_wav_data()
	        audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
	        audio_segment.export(file_path, format="mp3", bitrate="128k")

	        logging.info(f"Audio saved to {file_path}")

	    except sr.WaitTimeoutError:
	        # No phrase started within `timeout`: an expected outcome, reported
	        # separately from real failures and without a traceback.
	        logging.warning(
	            "No speech detected within %s seconds; nothing was recorded.",
	            timeout,
	        )
	    except Exception as e:
	        # Best-effort contract: log (now with traceback) and return None.
	        logging.exception("An error occurred: %s", e)

	# Run load_dotenv() before the key is looked up in the environment.
	load_dotenv()
	GOOGLE_AI_STUDIO_API_KEY = os.getenv("GOOGLE_AI_STUDIO_API_KEY")
	stt_model = "whisper-large-v3"  # Keep for compatibility

	def transcribe_with_groq(stt_model, audio_filepath, GOOGLE_AI_STUDIO_API_KEY=None):
	    """
	    Transcribe an audio file with Gemini and return the transcription text.

	    Despite the name, the request goes to the "gemini-2.0-flash" model
	    through ``google.generativeai``.

	    Args:
	        stt_model: Not used by this function.
	        audio_filepath: Path of the audio file to read. Its bytes are sent
	            base64-encoded with MIME type "audio/mp3", whatever the file's
	            extension is.
	        GOOGLE_AI_STUDIO_API_KEY: API key to configure the client with. When
	            falsy, the environment variable of the same name is used instead.

	    Returns:
	        The ``text`` attribute of the generation response.
	    """
	    resolved_key = GOOGLE_AI_STUDIO_API_KEY or os.getenv("GOOGLE_AI_STUDIO_API_KEY")
	    genai.configure(api_key=resolved_key)

	    # Setup Gemini model
	    gemini = genai.GenerativeModel("gemini-2.0-flash")

	    # Load the audio and base64-encode it for the inline payload
	    with open(audio_filepath, "rb") as handle:
	        encoded_audio = base64.b64encode(handle.read()).decode("utf-8")

	    # One user turn: the instruction followed by the audio itself
	    instruction_part = {
	        "text": "Please transcribe this audio accurately. Output only the transcription with no additional text."
	    }
	    audio_part = {
	        "inline_data": {"mime_type": "audio/mp3", "data": encoded_audio}
	    }
	    user_turn = {"role": "user", "parts": [instruction_part, audio_part]}

	    # Generate transcription
	    return gemini.generate_content([user_turn]).text