import gradio as gr
import streamlit as st
import numpy as np
from transformers import Wav2Vec2ProcessorWithLM, AutoModelForCTC
import torch
# kenlm must be installed for the LM beam-search decoder used below;
# importing it here fails fast if it is missing.
import kenlm
import torchaudio

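# Load the LM-boosted processor and the CTC acoustic model from the
# "ahmedJaafari/Annarabic3.2" repo; the access token is read from Streamlit
# secrets, which suggests the model repo is private.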
cache_dir = './cache/'
processor = Wav2Vec2ProcessorWithLM.from_pretrained(
    "ahmedJaafari/Annarabic3.2", cache_dir=cache_dir,
    use_auth_token=st.secrets["AnnarabicToken"])
model = AutoModelForCTC.from_pretrained(
    "ahmedJaafari/Annarabic3.2", cache_dir=cache_dir,
    use_auth_token=st.secrets["AnnarabicToken"])

# Read a sound file into a mono 16 kHz numpy array, truncated to max_seconds.
def speech_file_to_array_fn(path, max_seconds=10):
    batch = {"file": path}
    speech_array, sampling_rate = torchaudio.load(batch["file"])
    # Resample to the 16 kHz rate the model expects.
    if sampling_rate != 16000:
        transform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
                                                   new_freq=16000)
        speech_array = transform(speech_array)
    # Keep the first channel only.
    speech_array = speech_array[0]
    # Truncate to at most max_seconds of audio.
    if max_seconds > 0:
        speech_array = speech_array[:max_seconds * 16000]
    batch["speech"] = speech_array.numpy()
    batch["sampling_rate"] = 16000
    return batch
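# Example (hypothetical path):
#   batch = speech_file_to_array_fn("example.wav")
#   batch["speech"]         -> 1-D float32 numpy array at 16 kHz
#   batch["sampling_rate"]  -> 16000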

# Transcribe a recorded/uploaded audio file.
def inference(audio):
    # Read in the sound file (Gradio passes a tempfile object; .name is its path).
    ds = speech_file_to_array_fn(audio.name)
    # Extract input features for the model.
    input_values = processor(
        ds["speech"],
        sampling_rate=ds["sampling_rate"],
        return_tensors="pt"
    ).input_values
    # Run the acoustic model.
    with torch.no_grad():
        logits = model(input_values).logits

    # (Greedy alternative: pred_ids = torch.argmax(logits, dim=-1))
    # Pad the logits with two trailing zeros on each axis, presumably to align
    # the vocabulary dimension with the LM decoder's alphabet.
    h = logits.numpy()[0, :, :]
    v = np.pad(h, [0, 2], mode='constant')

    # Beam-search decode with the KenLM language model.
    output = processor.decode(v).text

    # Drop the last four characters, likely artifacts of the zero padding.
    return output[:-4]

inputs = gr.inputs.Audio(label="Record Audio", source="microphone", type='file')
outputs = gr.outputs.Textbox(label="Output Text")
title = "Annarabic Speech Recognition System"
description = "Gradio demo for Annarabic ASR. To use it, simply record or upload your audio, or click one of the examples to load them."
examples = [['Aya.mp3'], ['Loubna.mp3']]
gr.Interface(inference, inputs, outputs, title=title, description=description, examples=examples).launch()
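
# Minimal local smoke test of the inference path (hypothetical; with
# type='file', Gradio supplies a tempfile-like object exposing a .name path):
#
#   class _FakeFile:
#       name = "Aya.mp3"  # one of the bundled examples
#   print(inference(_FakeFile()))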