Spaces:

santiviquez
/

noisy_human

Running

App Files Files Community

noisy_human / app.py

santiviquez

Update app.py

eca2ad1 over 1 year ago

raw

history blame contribute delete

3.09 kB

	import streamlit as st
	from huggingface_hub import hf_hub_url, cached_download
	import torch
	import torchaudio.transforms as transforms
	from miniaudio import SampleFormat, decode
	from librosa.util import fix_length
	import numpy as np
	from audio_recorder_streamlit import audio_recorder


	# Page header: title, a blank spacer, and a one-line description of the model.
	st.markdown("## Noisy Human")
	st.markdown("")
	st.markdown(
	    "Non-speech human sounds classification. This model can identify with up to 78% accuracy the following 10 classes"
	)

	# The ten class labels are listed as two side-by-side bullet lists, five per
	# column. The bullet text lives in triple-quoted strings, so its lines are
	# deliberately not indented with the surrounding code.
	col1, col2 = st.columns(2)
	with st.container():
	    with col1:
	        st.markdown(
	            """
	* Clapping 👏
	* Footsteps 🦶
	* Brushing Teeth 🪥
	* Drinking Sipping 🧃
	* Laughing 😂
	"""
	        )

	    with col2:
	        st.markdown(
	            """

	* Breathing 🌬️
	* Crying Baby 😭
	* Coughing 🤧
	* Snoring 😴
	* Sneezing 🤧
	"""
	        )

	# from audio_recorder_streamlit import audio_recorder

	from cnn import CNN

	# Repository identifier passed to hf_hub_url() when locating the model weights.
	REPO_ID = "santiviquez/noisy_human_cnn"
	# Name of the weights file inside REPO_ID that download_model() loads.
	FILENAME = "CNN_MelSpec_Deltas_fold_4_.pth"
	# Sample rate used both when decoding audio and when building the mel
	# spectrogram (presumably Hz); clips are also padded/trimmed to 5 * RATE samples.
	RATE = 22050


	@st.cache(allow_output_mutation=True)
	def download_model():
	    """Download the weights file named by REPO_ID/FILENAME and load it on the CPU.

	    Returns:
	        Whatever ``torch.load`` produces for that file; the caller feeds it
	        to ``load_state_dict``.
	    """
	    weights_url = hf_hub_url(REPO_ID, FILENAME)
	    local_path = cached_download(weights_url)
	    # NOTE(review): torch.load unpickles the downloaded file, so the
	    # repository it comes from has to be trusted.
	    cpu_device = torch.device("cpu")
	    return torch.load(local_path, map_location=cpu_device)


	# Build the network and switch it to inference mode.
	model_weights = download_model()
	model = CNN(input_channels=2)
	model.load_state_dict(model_weights)
	model.eval()

	# Two alternative inputs. Each widget result gets its own variable so the
	# recorder no longer overwrites (and silently discards) an uploaded file.
	uploaded_file = st.file_uploader(
	    "Choose an audio (.wav) file", accept_multiple_files=False
	)
	st.caption("OR")
	recorded_bytes = audio_recorder()

	# An uploaded file takes precedence; otherwise fall back to the recording.
	# Both end up as raw encoded bytes (or None / empty when nothing was given).
	if uploaded_file is not None:
	    audio_bytes = uploaded_file.getvalue()
	else:
	    audio_bytes = recorded_bytes

	if audio_bytes:
	    # Play the clip back. The uploader asks for .wav and the recorder is
	    # expected to return WAV data, so the MIME hint is audio/wav.
	    st.audio(audio_bytes, format="audio/wav")

	    # Decode to mono at RATE, then pad/trim to exactly five seconds so the
	    # spectrogram always has the fixed size expected by the view() below.
	    decoded_audio = decode(
	        audio_bytes,
	        nchannels=1,
	        sample_rate=RATE,
	        output_format=SampleFormat.SIGNED32,
	    )
	    waveform = np.array(decoded_audio.samples)
	    waveform = fix_length(waveform, size=5 * RATE)
	    waveform = torch.FloatTensor(waveform)

	    # Features: mel spectrogram plus its deltas, arranged as a batch of one
	    # with 2 channels of 60 mel bands x 216 frames.
	    x_mel = transforms.MelSpectrogram(sample_rate=RATE, n_fft=1024, n_mels=60)(waveform)
	    x_deltas = transforms.ComputeDeltas()(x_mel)
	    x = torch.cat((x_mel, x_deltas)).view(1, 2, 60, 216)

	    # Inference only: no gradients are needed. The arg-max of the raw scores
	    # is the same index as the arg-max of their log-softmax.
	    with torch.no_grad():
	        y_pred = model(x)
	    y_pred_tags = torch.argmax(y_pred, dim=1)

	    # Maps the predicted class index to its display label.
	    # NOTE(review): the index order presumably mirrors the training labels - confirm.
	    category_map = {
	        0: "Clapping 👏",
	        1: "Footsteps 🦶",
	        2: "Brushing Teeth 🪥",
	        3: "Drinking Sipping 🧃",
	        4: "Laughing 😂",
	        5: "Breathing 🌬️",
	        6: "Crying Baby 😭",
	        7: "Coughing 🤧",
	        8: "Snoring 😴",
	        9: "Sneezing 🤧",
	    }

	    st.write("Predicted class:", category_map[y_pred_tags.item()])

	# Three empty text elements act as vertical spacing above the credits.
	for _ in range(3):
	    st.text("")

	# Credits line. The backslash before the pipe is written as "\\|" so the
	# runtime text still contains a literal backslash-pipe without relying on an
	# invalid Python escape sequence. The string's continuation lines are
	# deliberately not indented with the surrounding code.
	st.markdown(
	    """`Created by` [Santiago Viquez](https://twitter.com/santiviquez)
	and [Ivan Padezhki](https://github.com/ivanpadezhki)
	\\| `Code:` [GitHub](https://github.com/santiviquez/noisy-human-recognition)"""
	)