import base64
import logging
import os
from io import BytesIO

import google.generativeai as genai
import speech_recognition as sr
from dotenv import load_dotenv
from pydub import AudioSegment

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def record_audio(file_path: str, timeout: int = 20, phrase_time_limit=None) -> None:
    """
    Record audio from the microphone and save it as an MP3 file.

    Any exception raised while recording or exporting is caught and logged
    at ERROR level; nothing is re-raised, so the function always returns
    ``None`` and callers cannot tell from the return value whether the file
    was written.

    Args:
        file_path (str): Path to save the recorded audio file.
        timeout (int): Maximum time to wait for a phrase to start (in seconds).
        phrase_time_limit (int): Maximum time for the phrase to be recorded
            (in seconds). ``None`` is passed straight through to
            ``Recognizer.listen``.
    """
    recognizer = sr.Recognizer()

    try:
        with sr.Microphone() as source:
            logging.info("Adjusting for ambient noise...")
            recognizer.adjust_for_ambient_noise(source, duration=1)
            logging.info("Start speaking now...")

            # Record the audio
            audio_data = recognizer.listen(
                source,
                timeout=timeout,
                phrase_time_limit=phrase_time_limit,
            )
            logging.info("Recording complete.")

            # Convert the recorded audio to an MP3 file. The capture is
            # handed to pydub as in-memory WAV bytes, so no temporary file
            # is needed.
            wav_data = audio_data.get_wav_data()
            audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
            audio_segment.export(file_path, format="mp3", bitrate="128k")

            logging.info("Audio saved to %s", file_path)

    except Exception as e:
        # Deliberate best-effort: report the failure instead of crashing
        # the caller.
        logging.error("An error occurred: %s", e)


load_dotenv()

GOOGLE_AI_STUDIO_API_KEY = os.environ.get("GOOGLE_AI_STUDIO_API_KEY")
stt_model = "whisper-large-v3"  # Keep for compatibility


def transcribe_with_groq(stt_model: str, audio_filepath: str, GOOGLE_AI_STUDIO_API_KEY=None) -> str:
    """
    Transcribe an audio file with the Gemini model and return the text.

    Despite its name, this function does not call Groq: it configures
    ``google.generativeai`` and sends the audio to the hard-coded
    ``"gemini-2.0-flash"`` model.

    Args:
        stt_model (str): Unused by this function; retained so existing call
            sites keep working.
        audio_filepath (str): Path of the audio file to transcribe. The
            bytes are always sent with MIME type ``audio/mp3``.
        GOOGLE_AI_STUDIO_API_KEY (str | None): API key to use. When falsy,
            the ``GOOGLE_AI_STUDIO_API_KEY`` environment variable is read
            instead.

    Returns:
        str: The ``text`` attribute of the model response.

    Raises:
        OSError: If ``audio_filepath`` cannot be opened or read.
        Errors raised by the ``google.generativeai`` calls are not caught
        here and propagate to the caller.
    """
    api_key = GOOGLE_AI_STUDIO_API_KEY or os.environ.get("GOOGLE_AI_STUDIO_API_KEY")
    genai.configure(api_key=api_key)

    # Setup Gemini model
    model = genai.GenerativeModel("gemini-2.0-flash")

    # Read audio file
    with open(audio_filepath, "rb") as audio_file:
        audio_data = audio_file.read()

    # Create content for generation: one user turn holding the instruction
    # text followed by the audio, base64-encoded as UTF-8 text.
    contents = [
        {
            "role": "user",
            "parts": [
                {"text": "Please transcribe this audio accurately. Output only the transcription with no additional text."},
                {
                    "inline_data": {
                        "mime_type": "audio/mp3",
                        "data": base64.b64encode(audio_data).decode("utf-8"),
                    }
                },
            ],
        }
    ]

    # Generate transcription
    response = model.generate_content(contents)

    return response.text