import os

import gradio as gr
import kenlm  # imported only to fail fast if the KenLM backend is missing
import numpy as np
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

cache_dir = './cache/'
# Read the Hugging Face access token from the environment instead of hard-coding it
auth_token = os.environ.get("HF_TOKEN")
# LM-backed processor: feature extractor + tokenizer + KenLM beam-search decoder
processor = Wav2Vec2ProcessorWithLM.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=auth_token)
# Plain processor (no LM), used by the greedy fallback sketched below
processor2 = Wav2Vec2Processor.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=auth_token)
model = AutoModelForCTC.from_pretrained("ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=auth_token)

# Read an audio file, resample it to 16 kHz if needed, and truncate it to max_seconds
def speech_file_to_array_fn(path, max_seconds=10):
    batch = {"file": path}
    speech_array, sampling_rate = torchaudio.load(batch["file"])
    if sampling_rate != 16000:
        transform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
                                                   new_freq=16000)
        speech_array = transform(speech_array)
    speech_array = speech_array[0]  # keep the first channel only
    if max_seconds > 0:
        speech_array = speech_array[:max_seconds * 16000]
    batch["speech"] = speech_array.numpy()
    batch["sampling_rate"] = 16000
    return batch
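
# Quick sanity check (hypothetical file name, not shipped with the demo):
# ds = speech_file_to_array_fn("sample.wav")
# assert ds["sampling_rate"] == 16000 and len(ds["speech"]) <= 10 * 16000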

# Transcribe one uploaded clip with the LM-backed beam-search decoder
def inference(audio):
    # read in the sound file (resampled and truncated as above)
    ds = speech_file_to_array_fn(audio.name)
    # extract input features
    input_values = processor(
        ds["speech"],
        sampling_rate=ds["sampling_rate"],
        return_tensors="pt"
    ).input_values
    # forward pass (no gradients needed at inference time)
    with torch.no_grad():
        logits = model(input_values).logits

    # decode the CTC output with the LM; pad the logits with two trailing zero
    # frames and two trailing zero vocab columns so they match what the
    # decoder's alphabet expects
    h = logits.numpy()[0, :, :]
    v = np.pad(h, [0, 2], mode='constant')

    output = processor.decode(v).text

    # strip the trailing characters introduced by the padding
    return output[:-4]
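
# A minimal greedy (argmax) CTC fallback, sketched here using the plain
# processor2 loaded above; it is not wired into the UI and skips the KenLM
# beam search, so its transcripts will generally be less accurate.
def inference_greedy(audio):
    ds = speech_file_to_array_fn(audio.name)
    input_values = processor2(
        ds["speech"],
        sampling_rate=ds["sampling_rate"],
        return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    # pick the most likely token per frame, then collapse repeats and blanks
    pred_ids = torch.argmax(logits, dim=-1)
    return processor2.batch_decode(pred_ids)[0]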

inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")
title = "Annarabic Speech Recognition System"
description = "Gradio demo for Annarabic ASR. To use it, simply upload your audio. Read more at the link below. Currently supports 16 kHz .wav files."
article = "<p><a href='https://huggingface.co/ahmedJaafari' target='_blank'>Pretrained model</a></p>"
#examples=[['t1_0001-00010.wav'], ['t1_utt000000042.wav'], ['t2_0000006682.wav']]
gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch()