import gradio as gr
import tensorflow as tf
import librosa
import numpy as np
import pickle as pkl
from huggingface_hub import hf_hub_download, from_pretrained_keras
# Mel spectrogram parameters
n_fft = 512  # FFT window length
hop_length = 160  # number of samples between successive frames
n_mels = 80  # number of Mel bands
fmin = 0.0  # minimum frequency
fmax = 8000.0  # maximum frequency
sampling_rate = 16000
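# At 16 kHz, hop_length=160 corresponds to one frame every 10 ms, and
# fmax=8000.0 is the Nyquist frequency for this sampling rate.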

def extract_mel_spectrogram(audio) -> np.ndarray:
    spectrogram = librosa.feature.melspectrogram(
        y=audio, sr=sampling_rate, hop_length=hop_length, n_fft=n_fft,
        n_mels=n_mels, fmin=fmin, fmax=fmax, power=2.0,
    )
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    # spectrogram = np.expand_dims(spectrogram, axis=-1)  # add channel dimension for the model
    return spectrogram
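# Rough shape check (illustrative, not part of the original app): with
# librosa's default center=True padding, a 1 s clip at 16 kHz yields
# 1 + 16000 // 160 = 101 frames, so the output is an (80, 101) array:
# extract_mel_spectrogram(np.zeros(16000, dtype=np.float32)).shape  # (80, 101)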

def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.math.reduce_sum(
        tf.cast(tf.not_equal(tf.reduce_max(y_pred, axis=2), 0), dtype="int64"),
        axis=1, keepdims=True,
    )
    label_length = tf.math.reduce_sum(
        tf.cast(tf.not_equal(y_true, -1), dtype="int64"), axis=1, keepdims=True,
    )
    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
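# Shape sketch for ctc_batch_cost (an illustration, not from the original app):
# y_true: (batch, max_label_len) integer labels, padded with -1
# y_pred: (batch, time_steps, vocab_size) softmax outputs
# The call returns a (batch, 1) tensor of per-sample CTC losses. At inference,
# CTCLoss is only needed so Keras can deserialize a model saved with this
# custom loss (see the commented-out load_model path below).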

# Download model from the Hugging Face Hub
# model_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="saved_model.pb")
# with tf.keras.utils.custom_object_scope({'CTCLoss': CTCLoss}):
#     model = tf.keras.models.load_model(model_path)
model = from_pretrained_keras("kobrasoft/kobraspeech-rnn-cs")
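# from_pretrained_keras fetches the repo snapshot from the Hub and loads it
# with tf.keras.models.load_model under the hood.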

# Load the id-to-character vocabulary. Note: the file is read with pickle
# even though it is named num_to_char.json.
num_to_char_path = hf_hub_download(repo_id="kobrasoft/kobraspeech-rnn-cs", filename="num_to_char.json")
with open(num_to_char_path, "rb") as f:
    num_to_char = tf.keras.layers.StringLookup(vocabulary=pkl.load(f), oov_token="", invert=True)
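# invert=True turns the StringLookup into an id -> character mapping, the
# inverse of the character -> id lookup presumably used during training.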

def label_to_string(label):
    return tf.strings.reduce_join(num_to_char(label)).numpy().decode()

def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search; for more complex tasks, beam search may work better
    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Map each decoded label sequence back to text
    output_text = []
    for result in results:
        output_text.append(label_to_string(result))
    return output_text
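# Hypothetical round trip (shapes illustrative, not from the original app):
# pred = model.predict(batch)        # (batch, time_steps, vocab_size)
# decode_batch_predictions(pred)     # -> one decoded string per input clip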

def transcribe(audio_path):
    # Load audio at the model's expected sampling rate
    audio, _ = librosa.load(audio_path, sr=sampling_rate)
    # Extract features
    features = extract_mel_spectrogram(audio)
    # The model expects a batch dimension
    features = np.expand_dims(features, axis=0)
    # Predict
    prediction = model.predict(features)
    # Decode the prediction into text
    transcription = decode_batch_predictions(prediction)
    return transcription[0]
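# Quick local check (the file name is hypothetical):
# print(transcribe("sample.wav"))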

demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(),
)
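# type="filepath" makes Gradio pass transcribe() a path to a temporary audio
# file rather than a (sample_rate, data) tuple.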

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

if __name__ == "__main__":
    demo.launch(debug=True)