import os

import gradio as gr
import torch
import torchaudio
import kenlm  # imported so a missing KenLM backend for the LM decoder fails fast
from transformers import Wav2Vec2ProcessorWithLM, AutoModelForCTC

cache_dir = './cache/'

# Load the processor (feature extractor + tokenizer + KenLM decoder) and the CTC model.
# The private "ahmedJaafari/Annarabic3.2" repo is accessed with a token read from the
# AnnarabicToken environment variable.
processor = Wav2Vec2ProcessorWithLM.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=os.getenv("AnnarabicToken"))
model = AutoModelForCTC.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=os.getenv("AnnarabicToken"))
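
# When running locally, the token must be set in the environment before launch
# (placeholder value shown, not a real token):
#   export AnnarabicToken=hf_xxxxxxxxxxxxxxxx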

# Read an audio file, resample it to 16 kHz if needed, keep the first channel,
# and truncate it to at most max_seconds.
def speech_file_to_array_fn(path, max_seconds=120):
    batch = {"file": path}
    speech_array, sampling_rate = torchaudio.load(batch["file"])
    if sampling_rate != 16000:
        transform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
                                                   new_freq=16000)
        speech_array = transform(speech_array)
    speech_array = speech_array[0]  # keep the first channel only
    if max_seconds > 0:
        speech_array = speech_array[: max_seconds * 16000]
    batch["speech"] = speech_array.numpy()
    batch["sampling_rate"] = 16000
    return batch
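
# Illustrative usage (the file name is hypothetical, not part of the Space):
#   batch = speech_file_to_array_fn("sample.wav")
#   batch["speech"]         -> 1-D float32 numpy array, mono, 16 kHz
#   batch["sampling_rate"]  -> 16000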

# Transcribe a single audio file: featurize, run the CTC model, decode with the LM.
def inference(audio):
    # read and preprocess the sound file
    ds = speech_file_to_array_fn(audio)
    # extract input features
    input_values = processor(
        ds["speech"],
        sampling_rate=ds["sampling_rate"],
        return_tensors="pt"
    ).input_values
    # run the model and decode the CTC output with the language model
    with torch.no_grad():
        logits = model(input_values).logits

    output = processor.decode(logits.numpy()[0]).text
    print(output)

    return output
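
# The handler can also be exercised without the UI, e.g. with one of the bundled
# example files:
#   print(inference("Omar.wav"))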

inputs = gr.Audio(label="Input Audio", type="filepath")
outputs = gr.Textbox(label="Output Text")
title = "Annarabic Speech Recognition System"
description = 'Demo for <b>Annarabic ASR</b>. To use it, simply upload your audio or click one of the examples to load it. Only the first 120 seconds of the audio are transcribed, and no GPU runtime is used. For more information, contact Ahmed Jaafari via email: <a href="mailto:[email protected]">[email protected]</a> or phone: <a href="tel:+212658537105">+212658537105</a>.'
examples = [['Aya.mp3'], ['Loubna.mp3'], ['Omar.wav'], ['Yassir.wav']]
article = "* The ASR model was never trained on the given examples."
gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
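
# A plausible requirements.txt for this Space (unverified; the actual Space may pin
# different versions):
#   gradio
#   transformers
#   torch
#   torchaudio
#   pyctcdecode
#   https://github.com/kpu/kenlm/archive/master.zip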