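"""Gradio demo: Automatic Speech Recognition with a CTC-trained Keras model.

Audio from the microphone or an uploaded wav file is resampled to 22050 Hz,
converted to a normalized spectrogram, run through the keras-io/ctc_asr model
from the Hugging Face Hub, and greedily CTC-decoded into English text.
"""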
import gradio as gr
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio
from huggingface_hub import from_pretrained_keras

# Load the pretrained CTC ASR model (trained on LJSpeech) from the Hugging Face Hub.
model = from_pretrained_keras("keras-io/ctc_asr", compile=False)

# The model's vocabulary: lowercase letters plus apostrophe, question mark,
# exclamation mark, and space.
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Forward and inverse lookups between characters and integer token ids.
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

# STFT parameters (in samples), matching the Keras CTC ASR training setup.
frame_length = 256
frame_step = 160
fft_length = 384
# LJSpeech recordings are sampled at 22050 Hz; all inputs are resampled to this rate.
SAMPLE_RATE = 22050
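# For reference: with these settings, tf.signal.stft turns one second of audio
# (22050 samples) into 1 + (22050 - 256) // 160 = 137 frames, each with
# fft_length // 2 + 1 = 193 frequency bins.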
def decode_batch_predictions(pred):
    """Greedily CTC-decode a batch of model outputs into transcripts."""
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Keep only the best (greedy) decoding path for each item in the batch.
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Map token ids back to characters and join them into strings.
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text
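# Note: greedy decoding keeps only the single most likely path; the Keras CTC
# ASR example suggests beam search (greedy=False) for more complex tasks.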
def load_and_resample_wav(filename):
    """Load a wav file from disk and resample it to SAMPLE_RATE."""
    file_content = tf.io.read_file(filename)
    # Decode to mono float32 samples in [-1, 1].
    audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1)
    audio_wav = tf.squeeze(audio_wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    # Resample to the rate the model was trained on.
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )
    return audio_wav
def mic_to_tensor(recorded_audio_file):
    """Convert a Gradio microphone recording (sample_rate, samples) into a mono waveform at SAMPLE_RATE."""
    sample_rate, audio = recorded_audio_file
    audio_wav = tf.constant(audio, dtype=tf.float32)
    # Down-mix stereo recordings to mono.
    if tf.rank(audio_wav) > 1:
        audio_wav = tf.reduce_mean(audio_wav, axis=1)
    audio_wav = tfio.audio.resample(
        audio_wav, rate_in=sample_rate, rate_out=SAMPLE_RATE
    )
    # Peak-normalize the raw integer PCM samples to [-1, 1].
    audio_wav = tf.divide(audio_wav, tf.reduce_max(tf.abs(audio_wav)))
    return audio_wav
def tensor_to_predictions(audio_tensor):
    """Turn a waveform tensor into a list of decoded transcripts."""
    audio_tensor = tf.cast(audio_tensor, tf.float32)
    # Compute the short-time Fourier transform with the training-time framing.
    spectrogram = tf.signal.stft(
        audio_tensor,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    # Keep the magnitude, compressed with a square root.
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # Normalize each frame to zero mean and unit variance.
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    # Add a batch dimension and run the model.
    spectrogram = tf.expand_dims(spectrogram, axis=0)
    batch_predictions = model.predict(spectrogram)
    return decode_batch_predictions(batch_predictions)
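# Example usage outside the UI (hypothetical file path, for illustration only):
#
#   waveform = load_and_resample_wav("sample.wav")
#   print(tensor_to_predictions(waveform)[0])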
def clear_inputs_and_outputs():
    """Reset the microphone input, the file input, and the transcript output."""
    return [None, None, None]
def predict(recorded_audio_file, uploaded_audio_file):
    """Transcribe whichever input is provided, preferring the microphone recording."""
    if recorded_audio_file is not None:
        audio_tensor = mic_to_tensor(recorded_audio_file)
    elif uploaded_audio_file is not None:
        audio_tensor = load_and_resample_wav(uploaded_audio_file)
    else:
        return "Please record or upload an audio clip first."
    return tensor_to_predictions(audio_tensor)[0]
# Build and launch the Gradio demo.
if __name__ == "__main__":
    demo = gr.Blocks()

    with demo:
        gr.Markdown(
            """
            <center><h1>Automatic Speech Recognition using CTC</h1></center> \
            This space demonstrates Automatic Speech Recognition with a Keras model trained on the LJSpeech dataset.<br> \
            Record your voice or upload a wav file, and the model will transcribe the English speech.<br><br>
            """
        )
        with gr.Row():
            with gr.Column():
                mic_input = gr.Audio(source="microphone", label="Record your own voice")
                upl_input = gr.Audio(
                    source="upload", type="filepath", label="Upload a wav file"
                )
                with gr.Row():
                    clr_btn = gr.Button(value="Clear", variant="secondary")
                    prd_btn = gr.Button(value="Predict")

            with gr.Column():
                lbl_output = gr.Label(label="Text")

        with gr.Row():
            gr.Markdown(
                """
                <h4>Credits</h4>
                Author: <a href="https://twitter.com/anuragcomm">Anurag Singh</a>.<br>
                Based on the Keras example <a href="https://keras.io/examples/audio/ctc_asr">Automatic Speech Recognition using CTC</a> by <a href="https://rbouadjenek.github.io/">Mohamed Reda Bouadjenek</a> and <a href="https://www.linkedin.com/in/parkerhuynh/">Ngoc Dung Huynh</a>.<br>
                Check out the model <a href="https://huggingface.co/keras-io/ctc_asr">here</a>.
                """
            )

        # Wire the buttons to the callbacks defined above.
        clr_btn.click(
            fn=clear_inputs_and_outputs,
            inputs=[],
            outputs=[mic_input, upl_input, lbl_output],
        )
        prd_btn.click(
            fn=predict,
            inputs=[mic_input, upl_input],
            outputs=[lbl_output],
        )

    demo.launch(debug=True)