"""Røst ASR demo.""" | |
import os | |
import warnings | |
import gradio as gr | |
import numpy as np | |
import samplerate | |
import torch | |
from punctfix import PunctFixer | |
from transformers import pipeline | |
from dotenv import load_dotenv | |
load_dotenv() | |
warnings.filterwarnings("ignore", category=FutureWarning) | |
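# Inline SVG markup for the small upload icon that is interpolated into the
# description text below.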
icon = """ | |
<svg xmlns="http://www.w3.org/2000/svg" width="14px" viewBox="0 0 24 24" fill="none" | |
stroke="currentColor" stroke-width="2" stroke-linecap="round" | |
stroke-linejoin="round" style="display: inline;"> | |
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/> | |
<polyline points="17 8 12 3 7 8"/> | |
<line x1="12" y1="3" x2="12" y2="15"/> | |
</svg> | |
""" | |
TITLE = "Røst ASR Demo" | |
DESCRIPTION = f""" | |
This is a demo of the Danish speech recognition model | |
[Røst](https://huggingface.co/alexandrainst/roest-315m). Press "Record" to record your | |
own voice. When you're done you can press "Stop" to stop recording and "Submit" to | |
send the audio to the model for transcription. You can also upload an audio file by | |
pressing the {icon} button. | |
""" | |
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="alexandrainst/roest-315m",
    device=device,
    # Use the HUGGINGFACE_HUB_TOKEN environment variable if it is set; otherwise
    # fall back to the locally cached Hugging Face login (token=True).
    token=os.getenv("HUGGINGFACE_HUB_TOKEN", True),
)
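# The raw model output comes without punctuation, so a Danish punctuation
# restoration model is applied as a post-processing step.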
transcription_fixer = PunctFixer(language="da", device=device)
def transcribe_audio(sampling_rate_and_audio: tuple[int, np.ndarray]) -> str:
    """Transcribe the audio.

    Args:
        sampling_rate_and_audio:
            A tuple with the sampling rate and the audio.

    Returns:
        The transcription.
    """
    sampling_rate, audio = sampling_rate_and_audio

    # Mix multi-channel (e.g. stereo) recordings down to mono.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)

    # Resample to the 16 kHz sampling rate that the model expects.
    audio = samplerate.resample(audio, 16_000 / sampling_rate, "sinc_best")

    transcription = transcriber(inputs=audio)
    if not isinstance(transcription, dict):
        return ""

    # Restore punctuation in the raw transcription before returning it.
    cleaned_transcription = transcription_fixer.punctuate(text=transcription["text"])
    return cleaned_transcription
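# A hypothetical way to call transcribe_audio directly, outside the Gradio UI,
# assuming a local recording loaded with the soundfile package (not a dependency
# of this demo):
#
#     import soundfile as sf
#     audio, sampling_rate = sf.read("recording.wav", dtype="float32")
#     print(transcribe_audio((sampling_rate, audio)))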
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources=["microphone", "upload"]),
    outputs="textbox",
    title=TITLE,
    description=DESCRIPTION,
    allow_flagging="never",
)
demo.launch()