Spaces:
Sleeping
Sleeping
import logging | |
import speech_recognition as sr | |
from pydub import AudioSegment | |
from io import BytesIO | |
import os | |
import google.generativeai as genai | |
from dotenv import load_dotenv | |
import base64 | |
# Configure the root logger once at import time: INFO and above, with a
# timestamp and level name in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def record_audio(file_path, timeout=20, phrase_time_limit=None):
    """
    Record one phrase from the default microphone and save it as an MP3 file.

    This is a best-effort helper: any exception raised while opening the
    microphone, listening, or encoding is logged (with traceback) and
    swallowed, so the caller never sees an exception from this function.

    Args:
        file_path (str): Path to save the recorded audio file.
        timeout (int): Maximum time to wait for a phrase to start (in seconds).
        phrase_time_limit (int | None): Maximum time for the phrase to be
            recorded (in seconds). Passed through unchanged to
            ``Recognizer.listen``.

    Returns:
        None
    """
    recognizer = sr.Recognizer()
    try:
        with sr.Microphone() as source:
            logging.info("Adjusting for ambient noise...")
            recognizer.adjust_for_ambient_noise(source, duration=1)
            logging.info("Start speaking now...")

            # Record the audio
            audio_data = recognizer.listen(
                source,
                timeout=timeout,
                phrase_time_limit=phrase_time_limit,
            )
        logging.info("Recording complete.")

        # The captured audio no longer needs the device, so the WAV -> MP3
        # conversion happens after the microphone context has been closed.
        wav_data = audio_data.get_wav_data()
        audio_segment = AudioSegment.from_wav(BytesIO(wav_data))
        audio_segment.export(file_path, format="mp3", bitrate="128k")
        logging.info(f"Audio saved to {file_path}")
    except Exception as e:
        # Deliberately broad: recording is best-effort. logging.exception
        # keeps the traceback that a plain logging.error would drop.
        logging.exception("An error occurred: %s", e)
# Load variables from a .env file (when one is found) into the environment
# before the API key is read below.
load_dotenv()

# API key handed to genai.configure(); None when the variable is not set.
GOOGLE_AI_STUDIO_API_KEY = os.getenv("GOOGLE_AI_STUDIO_API_KEY")

# Keep for compatibility
stt_model = "whisper-large-v3"
def transcribe_with_groq(stt_model, audio_filepath, GOOGLE_AI_STUDIO_API_KEY=None):
    """
    Transcribe an audio file using the "gemini-2.0-flash" generative model.

    Despite the function name, the request goes through the
    ``google.generativeai`` client: the file's bytes are base64-encoded and
    sent inline together with a transcription instruction.

    Args:
        stt_model: Not used by this function.
        audio_filepath: Path of the audio file to read. Its content is sent
            with the MIME type "audio/mp3".
        GOOGLE_AI_STUDIO_API_KEY: API key for the client. When falsy, the
            GOOGLE_AI_STUDIO_API_KEY environment variable is used instead.

    Returns:
        The ``text`` attribute of the response from ``generate_content``.
    """
    genai.configure(
        api_key=GOOGLE_AI_STUDIO_API_KEY or os.environ.get("GOOGLE_AI_STUDIO_API_KEY")
    )

    # Setup Gemini model
    gemini = genai.GenerativeModel("gemini-2.0-flash")

    # Read the audio file and encode it for inline transport
    with open(audio_filepath, "rb") as fh:
        encoded_audio = base64.b64encode(fh.read()).decode("utf-8")

    # Assemble the single user turn: instruction first, then the audio payload
    instruction_part = {
        "text": "Please transcribe this audio accurately. Output only the transcription with no additional text."
    }
    audio_part = {
        "inline_data": {"mime_type": "audio/mp3", "data": encoded_audio}
    }
    request = [{"role": "user", "parts": [instruction_part, audio_part]}]

    # Generate transcription
    return gemini.generate_content(request).text