import functools

import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC

# Sampling rate (Hz) the audio is converted to before it is fed to the model.
# The original code hard-coded 16000 for the stream frame size, i.e. it
# assumed 16 kHz input.
TARGET_SAMPLE_RATE = 16000

# Length, in seconds, of each audio chunk transcribed in one forward pass.
CHUNK_SECONDS = 20

# Checkpoint used for each language offered in the UI.
MODEL_NAMES = {
    "English": "facebook/wav2vec2-large-960h-lv60-self",
    "Russian": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    "French": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
}


@functools.lru_cache(maxsize=1)
def _load_model(model_name):
    """Return the ``(tokenizer, model)`` pair for *model_name*.

    Only the most recently used checkpoint is kept, so repeated requests for
    the same language skip the reload without holding several large models
    in memory at once.
    """
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    return tokenizer, model


def asr_transcript(audio_file, language):
    """Transcribe an uploaded audio file with a Wav2Vec2 CTC model.

    Args:
        audio_file: File-like object whose ``name`` attribute is the path of
            the audio file on disk (what the Gradio audio input below passes
            in with ``type="file"``).
        language: One of the keys of ``MODEL_NAMES``
            ("English", "Russian" or "French").

    Returns:
        The lower-cased transcript. Each chunk's text is followed by a
        single space; the result is an empty string when the stream yields
        no audio.

    Raises:
        ValueError: If no audio file was supplied or *language* is not
            supported.
    """
    if audio_file is None:
        raise ValueError("No audio file was provided.")
    if language not in MODEL_NAMES:
        # Previously an unknown or unselected language left ``model_name``
        # unbound and crashed with UnboundLocalError.
        supported = ", ".join(MODEL_NAMES)
        raise ValueError(
            f"Unsupported language: {language!r}. Choose one of: {supported}."
        )

    tokenizer, model = _load_model(MODEL_NAMES[language])

    # librosa.stream yields blocks at the file's native sampling rate, so the
    # frame size has to be derived from that rate for a block to really span
    # CHUNK_SECONDS seconds.
    source_rate = int(librosa.get_samplerate(audio_file.name))
    stream = librosa.stream(
        audio_file.name,
        block_length=CHUNK_SECONDS,
        frame_length=source_rate,
        hop_length=source_rate,
    )

    pieces = []
    for speech in stream:
        # NOTE(review): librosa.stream down-mixes to mono by default, so this
        # guard is presumably never taken; kept unchanged from the original.
        if len(speech.shape) > 1:
            speech = speech[:, 0] + speech[:, 1]

        if source_rate != TARGET_SAMPLE_RATE:
            speech = librosa.resample(
                speech, orig_sr=source_rate, target_sr=TARGET_SAMPLE_RATE
            )

        input_values = tokenizer(speech, return_tensors="pt").input_values
        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = tokenizer.batch_decode(predicted_ids)[0]
        pieces.append(transcription.lower() + " ")

    return "".join(pieces)


gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Automatic speech recognition with Wav2Vec2",
    description="Upload an audio clip in Russian, English, or French and let AI do the hard work of transcribing",
    inputs=[
        gr.inputs.Audio(label="Upload Audio File", type="file"),
        gr.inputs.Radio(
            label="Pick a language", choices=["English", "Russian", "French"]
        ),
    ],
    outputs=gr.outputs.Textbox(label="Auto-Transcript"),
)

gradio_ui.launch()