Spaces:
Sleeping
Sleeping
import streamlit as st | |
from huggingface_hub import hf_hub_url, cached_download | |
import torch | |
import torchaudio.transforms as transforms | |
from miniaudio import SampleFormat, decode | |
from librosa.util import fix_length | |
import numpy as np | |
from audio_recorder_streamlit import audio_recorder | |
# Page header: title, an empty spacer line and a one-sentence description.
st.markdown("## Noisy Human")
st.markdown("")
st.markdown(
    "Non-speech human sounds classification. This model can identify with "
    "up to 78% accuracy the following 10 classes"
)

# The ten class labels are shown as two bullet lists, five per column.
# NOTE(review): the symbols after each label look like mis-encoded emoji
# (UTF-8 bytes displayed in a single-byte Greek code page). They are kept
# exactly as found; restore the real emoji from the original file.
col1, col2 = st.columns(2)
with st.container():
    for column, labels in (
        (
            col1,
            (
                "Clapping π",
                "Footsteps π¦Ά",
                "Brushing Teeth πͺ₯",
                "Drinking Sipping π§",
                "Laughing π",
            ),
        ),
        (
            col2,
            (
                "Breathing π¬οΈ",
                "Crying Baby π",
                "Coughing π€§",
                "Snoring π΄",
                "Sneezing π€§",
            ),
        ),
    ):
        with column:
            # One markdown bullet per label.
            st.markdown("\n".join(f"* {label}" for label in labels))
# from audio_recorder_streamlit import audio_recorder | |
from cnn import CNN | |
# Hugging Face Hub repository and file name of the trained checkpoint.
REPO_ID = "santiviquez/noisy_human_cnn"
FILENAME = "CNN_MelSpec_Deltas_fold_4_.pth"
# Sample rate passed to the audio decoder and to the mel-spectrogram
# transform; also used to size the fixed 5 * RATE sample window.
RATE = 22050


def download_model():
    """Download the checkpoint from the Hugging Face Hub and load it on CPU.

    Returns:
        Whatever object ``torch.load`` finds in the checkpoint file; the
        script hands it to ``model.load_state_dict``.
    """
    # NOTE(review): torch.load unpickles the downloaded file, so this trusts
    # the contents of REPO_ID. Consider a weights-only load if the installed
    # torch version supports it.
    # NOTE(review): cached_download is deprecated in recent huggingface_hub
    # releases; confirm the pinned version still ships it.
    checkpoint_url = hf_hub_url(REPO_ID, FILENAME)
    local_path = cached_download(checkpoint_url)
    return torch.load(local_path, map_location=torch.device("cpu"))
# Build the two-input-channel CNN, restore the downloaded parameters and
# put the network in evaluation mode before any prediction is made.
model = CNN(input_channels=2)
model_weights = download_model()
model.load_state_dict(model_weights)
model.eval()
# Two ways to provide audio: upload a file, or record with the microphone.
uploaded_file = st.file_uploader(
    "Choose an audio (.wav) file", accept_multiple_files=False
)
st.caption("OR")
recorded_bytes = audio_recorder()

# Previously both widgets were assigned to the same variable, so the upload
# was always overwritten by the recorder result and never classified.
# A recording still takes precedence (the behaviour users had before);
# the uploaded file is now used when there is no recording.
if recorded_bytes:
    audio_bytes = recorded_bytes
    audio_format = "audio/wav"
elif uploaded_file is not None:
    audio_bytes = uploaded_file.getvalue()
    # Use the MIME type reported for the upload, defaulting to WAV as the
    # uploader label asks for.
    audio_format = uploaded_file.type or "audio/wav"
else:
    audio_bytes = None
    audio_format = None

if audio_bytes:
    # Let the user play back exactly what is about to be classified.
    st.audio(audio_bytes, format=audio_format)

    # Decode to mono at RATE.
    # NOTE(review): SIGNED32 yields raw integer amplitudes that are fed to the
    # spectrogram unscaled; confirm this matches the training preprocessing.
    decoded_audio = decode(
        audio_bytes,
        nchannels=1,
        sample_rate=RATE,
        output_format=SampleFormat.SIGNED32,
    )

    # Pad or truncate to exactly 5 * RATE samples so the spectrogram always
    # has the same number of frames.
    waveform = np.array(decoded_audio.samples)
    waveform = fix_length(waveform, size=5 * RATE)
    waveform = torch.as_tensor(waveform, dtype=torch.float32)

    # Features: mel spectrogram plus its deltas, stacked as two channels.
    x_mel = transforms.MelSpectrogram(sample_rate=RATE, n_fft=1024, n_mels=60)(
        waveform
    )
    x_deltas = transforms.ComputeDeltas()(x_mel)
    # Same layout the original produced with .view(1, 2, 60, 216), without
    # hard-coding the dimensions: (batch, channel, mel bins, frames).
    x = torch.stack((x_mel, x_deltas)).unsqueeze(0)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        y_pred = model(x)
    # The arg-max over the raw scores equals the arg-max over log-softmax.
    y_pred_tags = torch.argmax(y_pred, dim=1)

    # Class index -> label shown to the user.
    # NOTE(review): the symbols after each label look like mis-encoded emoji;
    # kept exactly as found, restore them from the original file.
    category_map = {
        0: "Clapping π",
        1: "Footsteps π¦Ά",
        2: "Brushing Teeth πͺ₯",
        3: "Drinking Sipping π§",
        4: "Laughing π",
        5: "Breathing π¬οΈ",
        6: "Crying Baby π",
        7: "Coughing π€§",
        8: "Snoring π΄",
        9: "Sneezing π€§",
    }
    st.write("**Predicted class:**", category_map[y_pred_tags.item()])
# Three empty text elements act as vertical spacing above the credits.
# NOTE(review): assumed to run unconditionally (outside the audio branch);
# the original indentation was not recoverable.
for _ in range(3):
    st.text("")

# Credits and link to the source repository.
st.markdown(
    "`Created by` [Santiago Viquez](https://twitter.com/santiviquez)\n"
    "and [Ivan Padezhki](https://github.com/ivanpadezhki)\n"
    "| `Code:` [GitHub](https://github.com/santiviquez/noisy-human-recognition)"
)