testnemoasr / app.py
John6666's picture
Upload 5 files
db5bcc3 verified
import os
# Use the real ``spaces`` package only when the SPACES_ZERO_GPU environment
# variable is set; otherwise install a local no-op replacement so the rest of
# the file can keep writing ``@spaces.GPU``.
if os.environ.get("SPACES_ZERO_GPU") is not None:
    import spaces
else:
    import functools

    class spaces:
        """No-op stand-in for the ``spaces`` module when it is not imported."""

        @staticmethod
        def GPU(func=None, **_options):
            """Return ``func`` wrapped in a transparent pass-through.

            Works both as a bare decorator (``@spaces.GPU``) and in call form
            (``@spaces.GPU(duration=60)``); any keyword options are accepted
            and ignored.  The wrapper keeps the wrapped function's name,
            docstring and other metadata.
            """

            def decorate(target):
                @functools.wraps(target)
                def wrapper(*args, **kwargs):
                    return target(*args, **kwargs)

                return wrapper

            if callable(func):
                # Bare form: ``@spaces.GPU`` hands us the function directly.
                return decorate(func)
            # Call form: ``@spaces.GPU(...)`` expects a decorator back.
            return decorate
import gradio as gr
import subprocess
#subprocess.run("git clone https://github.com/AI4Bharat/NeMo.git && cd NeMo && git checkout nemo-v2 && bash reinstall.sh", shell=True)
import torch
import nemo.collections.asr as nemo_asr
from pathlib import Path
# Load the pretrained ASR checkpoint once, at import time, so every request
# reuses the same model object.
model = nemo_asr.models.ASRModel.from_pretrained(
    "ai4bharat/indicconformer_stt_ml_hybrid_rnnt_large"
)

# Run on CUDA when torch reports it as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Put the model in inference mode, then move it to the selected device.
model.freeze()
model = model.to(device)
@spaces.GPU
def infer(srcfile: str):
    """Transcribe one audio file with both decoders of the global ``model``.

    The input is first converted with ffmpeg to a single-channel, 16 kHz WAV
    file inside a private temporary directory, then transcribed twice: once
    with ``model.cur_decoder`` set to ``"ctc"`` and once set to ``"rnnt"``.

    Args:
        srcfile: Path of the audio file to transcribe.  A falsy value
            (``None`` or ``""``, e.g. when no audio was provided) yields two
            empty strings without running ffmpeg or the model.

    Returns:
        A ``(ctc_text, rnnt_text)`` pair: the first element of each
        ``model.transcribe`` result.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    import tempfile

    if not srcfile:
        return "", ""

    # A per-call temporary directory keeps concurrent requests from clobbering
    # each other's converted file and guarantees cleanup even when ffmpeg or
    # the model raises.
    with tempfile.TemporaryDirectory() as workdir:
        tmpfile = str(Path(workdir) / "sample_audio_infer_ready.wav")

        # Argument list instead of a shell string: the path is user-supplied,
        # so it must never be interpreted by a shell (spaces, quotes, ';' ...).
        subprocess.run(
            ["ffmpeg", "-y", "-i", srcfile, "-ac", "1", "-ar", "16000", tmpfile],
            check=True,
        )

        model.cur_decoder = "ctc"
        ctc_text = model.transcribe([tmpfile], batch_size=1, logprobs=False, language_id='ml')[0]
        print(ctc_text)

        model.cur_decoder = "rnnt"
        rnnt_text = model.transcribe([tmpfile], batch_size=1, language_id='ml')[0]
        print(rnnt_text)

    return ctc_text, rnnt_text
# Page layout: an audio input, a Run button, and two side-by-side text boxes
# that receive the pair of transcripts returned by ``infer``.
with gr.Blocks() as demo:
    input_audio = gr.Audio(
        label="Input",
        type="filepath",
        sources=["upload", "microphone"],
        format="wav",
    )
    run_button = gr.Button("Run", variant="primary")

    with gr.Row():
        # One copyable output box per decoder, in the order ``infer`` returns them.
        ctc_text, rnnt_text = [
            gr.Textbox(label=title, value="", show_copy_button=True)
            for title in ("CTC", "RNNT")
        ]

    run_button.click(
        fn=infer,
        inputs=[input_audio],
        outputs=[ctc_text, rnnt_text],
    )

# Start the Gradio server for the page defined above.
demo.launch()