File size: 2,768 Bytes

a48dac6
3915d32
a48dac6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ca4d96
a48dac6
 
 
 
7538ca1
a48dac6
 
 
 
 
 
9ca4d96
d40a14a
a48dac6
 
d40a14a
 
a48dac6
 
 
 
 
 
 
 
 
 
 
 
d40a14a
792ee14
 
 
 
 
 
 
 
 
 
a67cf5b
792ee14
 
 
a67cf5b
792ee14
 
a48dac6
792ee14
 
3915d32
792ee14

import torch
import gradio as gr
import pytube as pt
from transformers import pipeline
from huggingface_hub import model_info
import time
import unicodedata

# Hub checkpoint used for speech recognition, and the language code of the demo.
MODEL_NAME = "SakshiRathi77/wav2vec2-large-xlsr-300m-hi-kagglex"
lang = "hi"

# Use CUDA device 0 when a GPU is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Shared ASR pipeline, built once when the module is loaded.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
)

def transcribe(microphone, file_upload):
    """Transcribe a microphone recording or an uploaded file with the shared pipeline.

    Args:
        microphone: Audio from the microphone input, or None when nothing was recorded.
        file_upload: Audio from the upload input, or None when nothing was uploaded.

    Returns:
        The transcribed text. When both inputs are given, the microphone audio
        is used and the text is prefixed with a warning. When neither is
        given, an error message is returned instead.
    """
    has_mic = microphone is not None
    has_upload = file_upload is not None

    # Nothing to transcribe: report it without touching the pipeline.
    if not (has_mic or has_upload):
        return "ERROR: You have to either use the microphone or upload an audio file"

    prefix = ""
    if has_mic and has_upload:
        prefix = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the upload.
    source = microphone if has_mic else file_upload
    result = pipe(source)
    return prefix + result["text"]


def rt_transcribe(audio, state=""):
    """Transcribe one streamed audio chunk and append it to the running transcript.

    Args:
        audio: Audio chunk handed to the ASR pipeline, or None when no audio
            was supplied for this call.
        state: Transcript accumulated by earlier calls. None is treated as an
            empty transcript.

    Returns:
        A ``(transcript, transcript)`` tuple: the same updated string twice,
        once for display and once to be carried into the next call.
    """
    if state is None:
        state = ""

    # No audio for this call: keep the transcript as-is rather than passing
    # None to the pipeline, which would raise.
    if audio is None:
        return state, state

    # NOTE(review): fixed 2 s pause before each inference — presumably a
    # throttle for streaming calls; confirm before changing.
    time.sleep(2)
    text = pipe(audio)["text"]
    # NFC-normalise so composed characters are stored in canonical form.
    state += unicodedata.normalize("NFC", text) + " "
    return state, state

# Top-level Blocks container; the tabbed interfaces are rendered inside it
# further down, and it is the object that gets launched.
demo = gr.Blocks()
# Example audio paths handed to the file-transcription interface, one row per clip.
examples = [
    ["examples/example1.mp3"],
    ["examples/example2.mp3"],
    ["examples/example3.mp3"],
]

# HTML description shown by both interfaces.
# The logo is fetched through the Hub's ``/resolve/`` endpoint, which serves
# the raw file; ``/blob/`` is the HTML file-viewer page and cannot be used as
# an <img> source.
description = """
<p>
<center>
Welcome to the HindiSpeechPro, a cutting-edge interface powered by a fine-tuned version of facebook/wav2vec2-xls-r-300m on the common_voice dataset. Easily convert your spoken words to accurate text with just a few clicks.
</center>
</p>
<img src="https://huggingface.co/spaces/SakshiRathi77/SakshiRathi77-Wav2Vec2-hi-kagglex/resolve/main/Images/Hindi-Speech-Voice-Recognition-Tool.jpg" alt="logo"/>

"""

# Custom CSS: sets a background image on every <div>.
css = 'div {background-image: url("https://huggingface.co/spaces/SakshiRathi77/SakshiRathi77-Wav2Vec2-hi-kagglex/resolve/main/Images/image-bg.jpg")}'

# One-shot transcription interface: takes a microphone recording and/or an
# uploaded file (both delivered as file paths) and shows the text returned by
# transcribe().
# Uses gr.Audio directly, as the realtime interface does, instead of the
# deprecated gr.inputs.Audio wrapper.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs="text",
    theme="huggingface",
    title="HindiSpeechPro: WAV2VEC-Powered ASR Interface",
    description=description,
    allow_flagging="never",
    examples=examples,
    css=css,
)

# Realtime interface: streams microphone audio into the rt_transcribe function
# and carries the running transcript between calls through the "state"
# input/output pair. live=True re-runs the function as input changes.
# NOTE: this assignment rebinds the module-level name rt_transcribe from the
# function to the Interface. The fn= argument is evaluated before the
# assignment, so the function is what gets wrapped.
rt_transcribe = gr.Interface(
    fn=rt_transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True), 
        "state"
    ],
    outputs=[ "textbox",
        "state"],
    theme="huggingface",
    title="HindiSpeechPro: WAV2VEC-Powered ASR Interface",
    description= description ,
    allow_flagging="never",
    live=True,
)


# Render both interfaces as tabs inside the Blocks container; the second list
# gives the tab titles in the same order.
with demo:
    gr.TabbedInterface([mf_transcribe, rt_transcribe], ["Transcribe Audio", "Transcribe Realtime Voice"])

# Start the app; share=True requests a public share link from Gradio.
demo.launch(share=True)