import os

import gradio as gr
import kenlm  # required at runtime by the LM-boosted decoder (via pyctcdecode)
import numpy as np
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

cache_dir = "./cache/"

# NOTE: the original hardcoded a Hugging Face access token here. Read it from
# an environment variable instead so the secret is not committed with the
# code ("HF_AUTH_TOKEN" is a placeholder variable name).
auth_token = os.environ.get("HF_AUTH_TOKEN")

# Load the LM-boosted processor, a plain CTC processor, and the acoustic model.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(
    "ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=auth_token
)
processor2 = Wav2Vec2Processor.from_pretrained(
    "ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=auth_token
)
model = AutoModelForCTC.from_pretrained(
    "ahmedJaafari/Annarabic3.2", cache_dir=cache_dir, use_auth_token=auth_token
)


# Read a sound file, resample to 16 kHz if needed, and truncate it to at most
# `max_seconds` seconds.
def speech_file_to_array_fn(path, max_seconds=10):
    batch = {"file": path}
    speech_array, sampling_rate = torchaudio.load(batch["file"])
    if sampling_rate != 16000:
        transform = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        speech_array = transform(speech_array)
    speech_array = speech_array[0]
    if max_seconds > 0:
        speech_array = speech_array[: max_seconds * 16000]
    batch["speech"] = speech_array.numpy()
    batch["sampling_rate"] = 16000
    return batch


def inference(audio):
    # Read in the uploaded sound file.
    ds = speech_file_to_array_fn(audio.name)

    # Run the acoustic model.
    input_values = processor(
        ds["speech"], sampling_rate=ds["sampling_rate"], return_tensors="pt"
    ).input_values

    # Decode the CTC output with the language model.
    with torch.no_grad():
        logits = model(input_values).logits
    # pred_ids = torch.argmax(logits, dim=-1)
    h = logits.numpy()[0, :, :]
    # Pad the logits with zeros ([0, 2] appends two zero frames and two zero
    # vocabulary entries) so they line up with the LM decoder's alphabet.
    v = np.pad(h, [0, 2], mode="constant")
    output = processor.decode(v).text
    # Drop the last four characters (trailing artifact of the LM decoder).
    return output[:-4]


inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")
title = "Annarabic Speech Recognition System"
description = (
    "Gradio demo for Annarabic ASR. To use it, simply upload your audio, or "
    "click one of the examples to load them. Read more at the links below. "
    "Currently supports 16 kHz .wav files."
)
article = "Pretrained model"
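
# A minimal sketch of a greedy (argmax) CTC decoding path, assuming the
# language model is unavailable. It uses the plain `processor2` loaded above
# instead of the LM-boosted decoder, following the argmax line commented out
# in `inference`. The name `inference_greedy` is illustrative, not part of
# the original app.
def inference_greedy(audio):
    ds = speech_file_to_array_fn(audio.name)
    input_values = processor2(
        ds["speech"], sampling_rate=ds["sampling_rate"], return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    # Pick the most likely token at each frame; batch_decode collapses
    # repeated tokens and CTC blanks into the final transcription.
    pred_ids = torch.argmax(logits, dim=-1)
    return processor2.batch_decode(pred_ids)[0]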

" #examples=[['t1_0001-00010.wav'], ['t1_utt000000042.wav'], ['t2_0000006682.wav']] gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch()