import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
import pickle as pkl
from huggingface_hub import hf_hub_download, from_pretrained_keras
# Mel spectrogram parameters
n_fft = 512  # FFT window length
hop_length = 160  # number of samples between successive frames
n_mels = 80  # number of Mel bands
fmin = 0.0  # minimum frequency
fmax = 8000.0  # maximum frequency
sampling_rate = 16000
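# At 16 kHz, hop_length=160 corresponds to one frame every 10 ms, and
# fmax=8000.0 is the Nyquist frequency for this sampling rate.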

def extract_mel_spectrogram(audio) -> np.ndarray:
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sampling_rate, hop_length=hop_length, n_fft=n_fft,
        n_mels=n_mels, fmin=fmin, fmax=fmax, power=2.0,
    )
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    # spectrogram = np.expand_dims(spectrogram, axis=-1)  # add channel dimension for the model
    return spectrogram
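# Rough shape check (illustrative, not part of the original app): with
# librosa's default center=True padding, a 1 s clip at 16 kHz yields
# 1 + 16000 // 160 = 101 frames, so the output is an (80, 101) array:
# extract_mel_spectrogram(np.zeros(16000, dtype=np.float32)).shape  # (80, 101)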

def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.math.reduce_sum(
        tf.cast(tf.not_equal(tf.reduce_max(y_pred, axis=2), 0), dtype="int64"),
        axis=1, keepdims=True,
    )
    label_length = tf.math.reduce_sum(
        tf.cast(tf.not_equal(y_true, -1), dtype="int64"), axis=1, keepdims=True,
    )
    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
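# Shape sketch for ctc_batch_cost (an illustration, not from the original app):
# y_true: (batch, max_label_len) integer labels, padded with -1
# y_pred: (batch, time_steps, vocab_size) softmax outputs
# The call returns a (batch, 1) tensor of per-sample CTC losses. At inference,
# CTCLoss is only needed so Keras can deserialize a model saved with this
# custom loss (see the commented-out load_model path below).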

# Download model from the Hugging Face Hub
# model_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="saved_model.pb")
# with tf.keras.utils.custom_object_scope({'CTCLoss': CTCLoss}):
#     model = tf.keras.models.load_model(model_path)
model = from_pretrained_keras("kobrasoft/kobraspeech-rnn-cs")
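# from_pretrained_keras fetches the repo snapshot from the Hub and loads it
# with tf.keras.models.load_model under the hood.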

# Load the id-to-character vocabulary. Note: the file is read with pickle
# even though it is named num_to_char.json.
num_to_char_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="num_to_char.json")
with open(num_to_char_path, "rb") as f:
    num_to_char = tf.keras.layers.StringLookup(vocabulary=pkl.load(f), oov_token="", invert=True)
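# invert=True turns the StringLookup into an id -> character mapping, the
# inverse of the character -> id lookup presumably used during training.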

def label_to_string(label):
    return tf.strings.reduce_join(num_to_char(label)).numpy().decode()

def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search; for more complex tasks, beam search may work better
    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Map each decoded label sequence back to text
    output_text = []
    for result in results:
        output_text.append(label_to_string(result))
    return output_text
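# Hypothetical round trip (shapes illustrative, not from the original app):
# pred = model.predict(batch)        # (batch, time_steps, vocab_size)
# decode_batch_predictions(pred)     # -> one decoded string per input clip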

def transcribe(audio_path):
    # Load audio at the model's expected sampling rate
    audio, _ = librosa.load(audio_path, sr=sampling_rate)
    # Extract features
    features = extract_mel_spectrogram(audio)
    # The model expects a batch dimension
    features = np.expand_dims(features, axis=0)
    # Predict
    prediction = model.predict(features)
    # Decode the prediction into text
    transcription = decode_batch_predictions(prediction)
    return transcription[0]
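# Quick local check (the file name is hypothetical):
# print(transcribe("sample.wav"))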

demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(),
)
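# type="filepath" makes Gradio pass transcribe() a path to a temporary audio
# file rather than a (sample_rate, data) tuple.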

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

if __name__ == "__main__":
    demo.launch(debug=True)