import io

import numpy as np
import streamlit as st
import torch
from pydub import AudioSegment
from transformers import pipeline


# Cache the ASR pipeline so the Whisper model is loaded only once,
# not on every Streamlit rerun; use the GPU when one is available.
@st.cache_resource
def load_pipeline():
    device = 0 if torch.cuda.is_available() else -1
    return pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3",
        device=device,
    )


pipe = load_pipeline()


def audio_to_numpy(audio_file):
    # Load the uploaded bytes into an AudioSegment object
    audio = AudioSegment.from_file(io.BytesIO(audio_file.read()))
    # Whisper expects mono audio sampled at 16 kHz
    audio = audio.set_channels(1).set_frame_rate(16000)
    # Convert the raw samples to a numpy array
    samples = np.array(audio.get_array_of_samples())
    # Normalize to float32 in [-1.0, 1.0]; the maximum magnitude of a
    # signed sample with `sample_width` bytes is 2**(8 * sample_width - 1)
    samples = samples.astype(np.float32) / float(1 << (8 * audio.sample_width - 1))
    return samples


def transcribe_audio(audio_file):
    # Convert the uploaded audio to a normalized numpy waveform
    audio_numpy = audio_to_numpy(audio_file)
    # Pass the sampling rate explicitly so the pipeline does not
    # have to assume one
    result = pipe({"raw": audio_numpy, "sampling_rate": 16000})
    return result["text"]


# Streamlit UI
st.title("Speech-to-Text Transcription App")
st.write("Upload an audio file to transcribe its content into text.")

uploaded_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "flac"])

if uploaded_file is not None:
    try:
        with st.spinner("Transcribing..."):
            text = transcribe_audio(uploaded_file)
        st.subheader("Transcription Result:")
        st.write(text)
    except Exception as e:
        st.error(f"An error occurred: {e}")
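
# To run this app (a typical setup, assuming the file is saved as app.py):
#   pip install streamlit transformers torch numpy pydub
#   streamlit run app.py
# Note: pydub decodes mp3/flac via ffmpeg, which must be installed on the
# system separately. whisper-large-v3 is a large checkpoint; on a CPU-only
# machine a smaller model such as openai/whisper-small may be a better fit.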