ahmedJaafari committed
Commit 9bb0768 · 1 Parent(s): 61a05a4

Upload app.py

Files changed (1)
  1. app.py +59 -0
app.py ADDED
@@ -0,0 +1,59 @@
+ import gradio as gr
+ import streamlit as st  # used only to read the HF access token from st.secrets
+ import numpy as np
+ from transformers.file_utils import cached_path, hf_bucket_url
+ import os
+ from transformers import Wav2Vec2ProcessorWithLM, AutoModelForCTC
+ from datasets import load_dataset
+ import torch
+ import kenlm  # required by the LM-boosted decoder
+ import torchaudio
+
+ cache_dir = './cache/'
+ processor = Wav2Vec2ProcessorWithLM.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=st.secrets["AnnarabicToken"])
+ model = AutoModelForCTC.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=st.secrets["AnnarabicToken"])
+
+ # read an audio file into a 16 kHz mono array, truncated to max_seconds
+ def speech_file_to_array_fn(path, max_seconds=10):
+     batch = {"file": path}
+     speech_array, sampling_rate = torchaudio.load(batch["file"])
+     if sampling_rate != 16000:
+         transform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
+                                                    new_freq=16000)
+         speech_array = transform(speech_array)
+     speech_array = speech_array[0]
+     if max_seconds > 0:
+         speech_array = speech_array[:max_seconds * 16000]
+     batch["speech"] = speech_array.numpy()
+     batch["sampling_rate"] = 16000
+     return batch
+
+ # transcribe an uploaded audio file
+ def inference(audio):
+     # read in the sound file
+     ds = speech_file_to_array_fn(audio.name)
+     # featurize the waveform for the model
+     input_values = processor(
+         ds["speech"],
+         sampling_rate=ds["sampling_rate"],
+         return_tensors="pt"
+     ).input_values
+     # forward pass
+     with torch.no_grad():
+         logits = model(input_values).logits
+
+     # pred_ids = torch.argmax(logits, dim=-1)
+     # LM-boosted CTC decoding: pad the logits, then trim the trailing characters the padding introduces
+     h = logits.numpy()[0, :, :]
+     v = np.pad(h, [0, 2], mode='constant')
+
+     output = processor.decode(v).text
+
+     return output[:-4]
+
+ inputs = gr.inputs.Audio(label="Input Audio", type="file")
+ outputs = gr.outputs.Textbox(label="Output Text")
+ title = "Annarabic Speech Recognition System"
+ description = "Gradio demo for Annarabic ASR. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below."
+ examples = [['Aya.mp3'], ['Loubna.mp3']]
+ gr.Interface(inference, inputs, outputs, title=title, description=description, examples=examples).launch()
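
A minimal sketch of smoke-testing the same pipeline outside the Gradio UI (assumptions: "sample.mp3" is a placeholder path, and when the app runs outside Hugging Face Spaces, st.secrets requires an AnnarabicToken entry in a local .streamlit/secrets.toml):

    # reuse the already-loaded processor/model to transcribe one local file
    batch = speech_file_to_array_fn("sample.mp3")  # placeholder path
    input_values = processor(batch["speech"], sampling_rate=batch["sampling_rate"], return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    # same pad-then-trim decoding hack as in inference()
    print(processor.decode(np.pad(logits.numpy()[0], [0, 2], mode="constant")).text[:-4])

Note that Wav2Vec2ProcessorWithLM also needs the pyctcdecode package installed alongside kenlm for its language-model decoding to work.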