Spaces:
Runtime error
Runtime error
File size: 1,844 Bytes
fa39e8b e80ec97 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
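# NOTE: the Wav2Vec2 tokenizer and model are not loaded here; they are loaded
# inside asr_transcript, based on the language selected for each request.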
# define speech-to-text function
# Hugging Face checkpoint used for each language offered in the UI.
_MODEL_NAMES = {
    "English": "facebook/wav2vec2-large-960h-lv60-self",
    "Russian": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    "French": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
}

# Sampling rate (Hz) the audio is converted to before it is fed to the model.
_TARGET_SAMPLE_RATE = 16000

# Length, in seconds, of each audio chunk transcribed in one forward pass.
_CHUNK_SECONDS = 20


def asr_transcript(audio_file, language):
    """Transcribe an audio file with a language-specific Wav2Vec2 model.

    The file is streamed in chunks of ``_CHUNK_SECONDS`` seconds so that long
    recordings are never passed through the model in a single forward pass.

    Args:
        audio_file: Uploaded file object exposing the on-disk path as
            ``.name``; a plain path is accepted as well.
        language: One of the keys of ``_MODEL_NAMES`` ("English", "Russian"
            or "French").

    Returns:
        The lower-cased transcript, with the per-chunk transcriptions joined
        by single spaces. Empty string if the stream yields no audio.

    Raises:
        ValueError: If no audio file is given or the language is unsupported.
    """
    if audio_file is None:
        raise ValueError("No audio file was provided.")
    try:
        model_name = _MODEL_NAMES[language]
    except KeyError:
        supported = ", ".join(_MODEL_NAMES)
        raise ValueError(
            f"Unsupported language {language!r}; choose one of: {supported}."
        ) from None

    # Use the object's .name when present, otherwise treat it as a path.
    audio_path = getattr(audio_file, "name", audio_file)

    tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)

    # librosa.stream reads at the file's native sampling rate, so frames are
    # sized from that rate: one frame == one second, whatever the file's rate.
    native_rate = int(librosa.get_samplerate(audio_path))
    stream = librosa.stream(
        audio_path,
        block_length=_CHUNK_SECONDS,
        frame_length=native_rate,
        hop_length=native_rate,
        mono=True,  # down-mix multi-channel audio to a 1-D signal
    )

    pieces = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for speech in stream:
            if native_rate != _TARGET_SAMPLE_RATE:
                speech = librosa.resample(
                    speech,
                    orig_sr=native_rate,
                    target_sr=_TARGET_SAMPLE_RATE,
                )
            input_values = tokenizer(speech, return_tensors="pt").input_values
            logits = model(input_values).logits
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = tokenizer.batch_decode(predicted_ids)[0]
            pieces.append(transcription.lower())
    return " ".join(pieces)
# Build the web UI: an audio upload plus a language selector go in, the
# transcript text comes out. The audio component is configured with
# type="file"; asr_transcript reads the upload's path from its .name.
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Speech-to-Text with HuggingFace+Wav2Vec2",
    description="Upload an audio clip in Russian, English, or French and let AI do the hard work of transcribing",
    inputs=[
        gr.inputs.Audio(label="Upload Audio File", type="file"),
        gr.inputs.Radio(
            label="Pick an STT Model - (language)",
            choices=["English", "Russian", "French"],
        ),
    ],
    outputs=gr.outputs.Textbox(label="Auto-Transcript"),
)

# Start serving the interface.
gradio_ui.launch()