Spaces:
Sleeping
Sleeping
File size: 3,094 Bytes
29457c0 0929195 29457c0 eca2ad1 29457c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import streamlit as st
from huggingface_hub import hf_hub_url, cached_download
import torch
import torchaudio.transforms as transforms
from miniaudio import SampleFormat, decode
from librosa.util import fix_length
import numpy as np
from audio_recorder_streamlit import audio_recorder
# --- Page header and class overview -----------------------------------------
# Streamlit app title
st.markdown("## Noisy Human")
st.markdown("")
st.markdown(
    # FIX: typos in user-facing text ("Non-speach" -> "Non-speech",
    # "78/%" -> "78%").
    "Non-speech human sounds classification. This model can identify with up to 78% accuracy the following 10 classes"
)

# Two-column layout listing the 10 supported sound classes.
# NOTE(review): the emoji below are mojibake from a prior encoding mix-up;
# reproduced as-is because the original characters cannot be recovered here.
col1, col2 = st.columns(2)
with st.container():
    with col1:
        st.markdown(
            """
* Clapping π
* Footsteps π¦Ά
* Brushing Teeth πͺ₯
* Drinking Sipping π§
* Laughing π
"""
        )
    with col2:
        st.markdown(
            """
* Breathing π¬οΈ
* Crying Baby π
* Coughing π€§
* Snoring π΄
* Sneezing π€§
"""
        )
# from audio_recorder_streamlit import audio_recorder
# Project-local CNN architecture definition (must match the saved weights).
from cnn import CNN

# Hugging Face Hub repo and file holding the pretrained weights.
REPO_ID = "santiviquez/noisy_human_cnn"
FILENAME = "CNN_MelSpec_Deltas_fold_4_.pth"
# Sample rate (Hz) used when decoding audio — presumably the rate the model
# was trained at; confirm against the training pipeline.
RATE = 22050
# NOTE(review): `st.cache` and `huggingface_hub.cached_download`/`hf_hub_url`
# are deprecated in newer releases (`st.cache_resource` and `hf_hub_download`
# are the replacements) — migrate once the installed versions are confirmed.
@st.cache(allow_output_mutation=True)
def download_model():
    """Download the pretrained weights from the Hub and load them on CPU.

    Returns the state dict to be fed into ``model.load_state_dict``.
    Cached by Streamlit so the download happens only once per session.
    """
    model_weights = torch.load(
        cached_download(hf_hub_url(REPO_ID, FILENAME)), map_location=torch.device("cpu")
    )
    return model_weights
# Build the network, restore the pretrained weights, and switch to
# inference mode (disables dropout/batch-norm updates).
# input_channels=2: mel spectrogram + its deltas are stacked as 2 channels.
model_weights = download_model()
model = CNN(input_channels=2)
model.load_state_dict(model_weights)
model.eval()
# --- Audio input: uploaded .wav file OR microphone recording -----------------
uploaded_file = st.file_uploader(
    "Choose an audio (.wav) file", accept_multiple_files=False
)
st.caption("OR")
recorded_bytes = audio_recorder()

# FIX: the original assigned `audio_recorder()` over the uploader's result,
# so uploaded files were always discarded. Prefer the upload when present,
# falling back to the recording. Also FIX: `st.file_uploader` returns an
# UploadedFile object, not bytes — `decode()` needs the raw bytes, so read it.
audio_bytes = uploaded_file.read() if uploaded_file is not None else recorded_bytes

if audio_bytes:
    st.audio(audio_bytes, format="audio/ogg")

    # Decode to mono signed-32-bit PCM at the model's sample rate.
    decoded_audio = decode(
        audio_bytes, nchannels=1, sample_rate=RATE, output_format=SampleFormat.SIGNED32
    )
    waveform = np.array(decoded_audio.samples)
    # Pad or trim to exactly 5 seconds so the spectrogram has a fixed width.
    waveform = fix_length(waveform, size=5 * RATE)
    waveform = torch.FloatTensor(waveform)

    # Features: 60-bin mel spectrogram plus its deltas, stacked as 2 channels
    # into shape (batch=1, channels=2, mels=60, frames=216).
    x_mel = transforms.MelSpectrogram(sample_rate=RATE, n_fft=1024, n_mels=60)(waveform)
    x_deltas = transforms.ComputeDeltas()(x_mel)
    x = torch.cat((x_mel, x_deltas)).view(1, 2, 60, 216)

    # Predict: class index with the highest log-softmax score.
    y_pred = model(x)
    y_pred_softmax = torch.log_softmax(y_pred, dim=1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim=1)

    # Class index -> human-readable label (labels kept byte-identical to the
    # original, including the mojibake'd emoji, which cannot be recovered here).
    category_map = {
        0: "Clapping π",
        1: "Footsteps π¦Ά",
        2: "Brushing Teeth πͺ₯",
        3: "Drinking Sipping π§",
        4: "Laughing π",
        5: "Breathing π¬οΈ",
        6: "Crying Baby π",
        7: "Coughing π€§",
        8: "Snoring π΄",
        9: "Sneezing π€§",
    }
    st.write("**Predicted class:**", category_map[y_pred_tags.item()])

# --- Footer ------------------------------------------------------------------
st.text("")
st.text("")
st.text("")
st.markdown(
    # FIX: typo in user-facing text ("Create by" -> "Created by").
    """`Created by` [Santiago Viquez](https://twitter.com/santiviquez)
and [Ivan Padezhki](https://github.com/ivanpadezhki)
| `Code:` [GitHub](https://github.com/santiviquez/noisy-human-recognition)"""
)
|