# Hugging Face Spaces page header ("Spaces: Sleeping") — scrape residue, not code.
import gradio as gr
import numpy as np
from librosa import resample
from transformers import pipeline
def transcribe(input_audio, model_id): | |
pipe = pipeline( | |
"automatic-speech-recognition", | |
model=model_id, | |
device="cpu" | |
) | |
sr, speech = input_audio | |
# Convert to mono if stereo | |
if speech.ndim > 1: | |
speech = speech.mean(axis=1) | |
# Convert to float32 if needed | |
if speech.dtype != "float32": | |
speech = speech.astype(np.float32) | |
# Resample if sampling rate is not 16kHz | |
if sr!=16000: | |
speech = resample(speech, orig_sr=sr, target_sr=16000) | |
output = pipe(speech, chunk_length_s=30, stride_length_s=5)['text'] | |
return output | |
# Finnish wav2vec2 ASR checkpoints, ordered smallest (fastest) to largest
# (most accurate); all share the same naming scheme.
model_ids_list = [
    f"GetmanY1/wav2vec2-{size}-fi-150k-finetuned"
    for size in ("base", "large", "xlarge")
]
# Web UI: audio in (file upload or microphone) + model picker, text out.
_model_dropdown = gr.Dropdown(
    label="Model",
    value="GetmanY1/wav2vec2-large-fi-150k-finetuned",
    choices=model_ids_list,
)
gradio_app = gr.Interface(
    fn=transcribe,
    inputs=[gr.Audio(sources=["upload", "microphone"]), _model_dropdown],
    outputs="text",
    title="Finnish Automatic Speech Recognition",
    description=(
        "Choose a model from the list. Select the Base model for the "
        "fastest inference and the XLarge one for the most accurate results."
    ),
)
if __name__ == "__main__":
    # Launch the Gradio server when run as a script (Spaces entry point).
    gradio_app.launch()