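# Streamlit demo: classifies non-speech human sounds into 10 categories
# using a pretrained CNN hosted on the Hugging Face Hub.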
import streamlit as st
# NOTE: hf_hub_url/cached_download come from older huggingface_hub releases;
# newer versions expose hf_hub_download instead.
from huggingface_hub import hf_hub_url, cached_download
import torch
import torchaudio.transforms as transforms
from miniaudio import SampleFormat, decode
from librosa.util import fix_length
import numpy as np
from audio_recorder_streamlit import audio_recorder
from cnn import CNN
# Streamlit app title
st.markdown("## Noisy Human")
st.markdown("")
st.markdown(
    "Non-speech human sound classification. This model can identify the following 10 classes with up to 78% accuracy:"
)
col1, col2 = st.columns(2)
with st.container():
    with col1:
        st.markdown(
            """
            * Clapping 👏
            * Footsteps 🦶
            * Brushing Teeth 🪥
            * Drinking Sipping 🧃
            * Laughing 😂
            """
        )
    with col2:
        st.markdown(
            """
            * Breathing 🌬️
            * Crying Baby 😭
            * Coughing 🤧
            * Snoring 😴
            * Sneezing 🤧
            """
        )
REPO_ID = "santiviquez/noisy_human_cnn"
FILENAME = "CNN_MelSpec_Deltas_fold_4_.pth"
RATE = 22050  # sample rate (Hz) used for decoding and feature extraction
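
# Download the trained weights from the Hugging Face Hub; st.cache stores the
# result so the checkpoint is fetched only once.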
@st.cache(allow_output_mutation=True)
def download_model():
    model_weights = torch.load(
        cached_download(hf_hub_url(REPO_ID, FILENAME)), map_location=torch.device("cpu")
    )
    return model_weights
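
# Build the CNN (two input channels: mel spectrogram + its deltas) and load
# the pretrained weights for CPU inference.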
model_weights = download_model()
model = CNN(input_channels=2)
model.load_state_dict(model_weights)
model.eval()
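
# Audio input: either an uploaded .wav file or a microphone recording.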
uploaded_file = st.file_uploader(
    "Choose an audio (.wav) file", accept_multiple_files=False
)
st.caption("OR")
recorded_bytes = audio_recorder()
# The recorder returns None until a recording is made; prefer it when present,
# otherwise fall back to the uploaded file.
audio_bytes = recorded_bytes
if not audio_bytes and uploaded_file is not None:
    audio_bytes = uploaded_file.read()
if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")
    # Decode to mono at the target sample rate.
    decoded_audio = decode(
        audio_bytes, nchannels=1, sample_rate=RATE, output_format=SampleFormat.SIGNED32
    )
    waveform = np.array(decoded_audio.samples)
    # Pad or trim to exactly 5 seconds so the model always sees a fixed-size input.
    waveform = fix_length(waveform, size=5 * RATE)
    waveform = torch.FloatTensor(waveform)
    # Stack the mel spectrogram and its deltas into a (1, 2, 60, 216) input tensor.
    x_mel = transforms.MelSpectrogram(sample_rate=RATE, n_fft=1024, n_mels=60)(waveform)
    x_deltas = transforms.ComputeDeltas()(x_mel)
    x = torch.cat((x_mel, x_deltas)).view(1, 2, 60, 216)
    # Inference: the class with the highest (log-)softmax score wins.
    with torch.no_grad():
        y_pred = model(x)
    y_pred_softmax = torch.log_softmax(y_pred, dim=1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim=1)
    category_map = {
        0: "Clapping 👏",
        1: "Footsteps 🦶",
        2: "Brushing Teeth 🪥",
        3: "Drinking Sipping 🧃",
        4: "Laughing 😂",
        5: "Breathing 🌬️",
        6: "Crying Baby 😭",
        7: "Coughing 🤧",
        8: "Snoring 😴",
        9: "Sneezing 🤧",
    }
    st.write("**Predicted class:**", category_map[y_pred_tags.item()])
st.text("")
st.text("")
st.text("")
st.markdown(
    """`Created by` [Santiago Viquez](https://twitter.com/santiviquez)
    and [Ivan Padezhki](https://github.com/ivanpadezhki)
    | `Code:` [GitHub](https://github.com/santiviquez/noisy-human-recognition)"""
)