Spaces: Runtime error
Init commit
- app.py +197 -0
- model8723.json +1 -0
- model8723_weights.h5 +3 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,197 @@
import os
from json_tricks import load

import numpy as np

import librosa
from pydub import AudioSegment, effects
import noisereduce as nr

import tensorflow as tf
import keras
from keras.models import model_from_json
from keras.models import load_model

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
saved_model_path = r'./model8723.json'
saved_weights_path = r'./model8723_weights.h5'

# Read the model architecture from the JSON file
with open(saved_model_path, 'r') as json_file:
    json_savedModel = json_file.read()

# Load the model architecture, then the weights
model = tf.keras.models.model_from_json(json_savedModel)
model.load_weights(saved_weights_path)

# Compile the model with the same parameters as the original model.
model.compile(loss='categorical_crossentropy',
              optimizer='RMSProp',
              metrics=['categorical_accuracy'])

print(model.summary())
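The restored graph can be sanity-checked against the shapes declared in model8723.json before any audio flows through it; a minimal sketch, assuming nothing beyond those declared shapes:

# Feed one all-zero window shaped like the declared input (batch, 448, 15);
# the softmax head should return a (1, 8) probability row summing to ~1.
dummy = np.zeros((1, 448, 15), dtype=np.float32)
probs = model.predict(dummy)
print(probs.shape)         # (1, 8)
print(float(probs.sum()))  # ~1.0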
def convert(y, sr):
    # Convert float samples in [-1, 1] to 16-bit signed integers
    y = np.array(y * (1 << 15), dtype=np.int16)
    audio_segment = AudioSegment(
        y.tobytes(),
        frame_rate=sr,
        sample_width=y.dtype.itemsize,
        channels=1
    )
    return audio_segment
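For reference, convert() expects float samples scaled to [-1, 1], which is what librosa.load returns by default; a usage sketch (the file name is hypothetical):

y, sr = librosa.load('sample.wav', sr=None, mono=True)  # float32 in [-1, 1]
segment = convert(y, sr)
print(segment.frame_rate, segment.sample_width, segment.channels)  # sr, 2, 1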
def preprocess(y, sr):
    '''
    Preprocess raw audio samples before executing a prediction.
    Arguments:
    - y - The raw audio samples as a 1-D float array in [-1, 1].
    - sr - The sample rate of the audio.

    Return:
    'X_3D' variable, containing a shape of: (batch, timesteps, feature) for a single file (batch = 1).
    '''
    total_length = 204288
    frame_length = 2048  # length of the frame over which to compute the speech features
    hop_length = 512     # number of samples to advance between frames

    # Wrap the samples in a pydub AudioSegment
    rawsound = convert(y, sr)
    # Normalize with 5 dBFS of headroom
    normalizedsound = effects.normalize(rawsound, headroom=5.0)
    # Transform the audio back into an np.array of samples
    normal_x = np.array(normalizedsound.get_array_of_samples(), dtype='float32')

    final_x = nr.reduce_noise(normal_x, sr=sr)  # updated 03/03/22

    # Feature extraction
    f1 = librosa.feature.rms(y=final_x, frame_length=frame_length, hop_length=hop_length, center=True, pad_mode='reflect').T  # Energy - Root Mean Square
    f2 = librosa.feature.zero_crossing_rate(final_x, frame_length=frame_length, hop_length=hop_length, center=True).T  # ZCR
    f3 = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13, hop_length=hop_length).T  # MFCC

    X = np.concatenate((f1, f2, f3), axis=1)
    # Truncate or zero-pad so the timestep axis is exactly 448 rows
    X = X[:448]
    padding_rows = 448 - len(X)
    if padding_rows > 0:
        X = np.vstack((X, np.zeros((padding_rows, 15))))

    X_3D = np.expand_dims(X, axis=0)

    return X_3D
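The 15 feature columns are 1 RMS value + 1 zero-crossing rate + 13 MFCCs per frame, and the padding fixes the timestep axis at 448. A quick shape check on synthetic input (the duration and rate are illustrative, not from the original):

noise = (np.random.randn(24414 * 3) * 0.1).astype(np.float32)  # ~3 s of noise at 24414 Hz
x = preprocess(noise, 24414)
print(x.shape)  # (1, 448, 15): batch, timesteps, features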
emotions = {
    0: 'neutral',
    1: 'calm',
    2: 'happy',
    3: 'sad',
    4: 'angry',
    5: 'fearful',
    6: 'disgust',
    7: 'surprised'
}
emo_list = list(emotions.values())
def is_silent(data):
    # Returns 'True' if the peak sample is below the 'silent' threshold
    return max(data) < 100

# Recording-related imports and constants; apparently unused in the gradio
# flow below, where gradio supplies the audio chunks directly.
import pyaudio
import wave
from array import array
import struct
import time

# Initialize variables
RATE = 24414
CHUNK = 512
RECORD_SECONDS = 7.1

CHANNELS = 1
WAVE_OUTPUT_FILE = "./output.wav"
def EmotionRecogniser(stream, new_chunk):
    # Intended to predict only once the stream reaches RECORD_SECONDS of audio;
    # the length guard below is commented out, so every chunk triggers a prediction.
    sr, y = new_chunk

    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # guard against an all-zero (silent) chunk
        y /= peak

    # SESSION START
    print("** session started")
    total_predictions = []  # A list for all predictions in the session.
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # if len(stream) < int(RATE * RECORD_SECONDS):
    #     return stream, 'neutral'

    x = preprocess(y=stream, sr=sr)  # preprocess the accumulated stream
    print('x shape:', x.shape)
    # Model's prediction => an array of 8 emotion probabilities.
    predictions = model.predict(x, use_multiprocessing=True)
    pred_np = np.squeeze(predictions, axis=0).astype(float)  # drop the batch dimension
    total_predictions.append(pred_np)

    # Dict of emotions with their respective probabilities
    emotions_prob = dict(zip(emo_list, pred_np))
    max_emo = np.argmax(predictions)
    print('max emotion:', emotions.get(max_emo, -1))

    stream = stream[len(y):]  # Reset the stream for the next session.

    return stream, emotions_prob
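When streaming, gradio hands the function each chunk as a (sample_rate, int16 array) tuple and feeds the first return value back in as stream on the next call. A local simulation with synthetic chunks (values are illustrative):

sr = 24414
chunk = (sr, np.random.randint(-2**15, 2**15 - 1, size=sr, dtype=np.int16))  # ~1 s
state = None
state, probs = EmotionRecogniser(state, chunk)
state, probs = EmotionRecogniser(state, chunk)
print(max(probs, key=probs.get))  # label with the highest probability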
# Present the emotion distribution for the whole session.
# total_predictions_np = np.mean(np.array(total_predictions).tolist(), axis=0)
# fig = plt.figure(figsize=(10, 5))
# plt.bar(emo_list, total_predictions_np, color='indigo')
# plt.ylabel("Mean probability (%)")
# plt.title("Session Summary")
# plt.show()

# print(f"Emotions analyzed for: {(toc - tic):0.4f} seconds")
# return str(emotions.get(np.argmax(total_predictions_np), -1))

##################################################
import gradio as gr
from transformers import pipeline
import numpy as np

# transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# def transcribe(stream, new_chunk):
#     sr, y = new_chunk
#     y = y.astype(np.float32)
#     y /= np.max(np.abs(y))
#
#     if stream is not None:
#         stream = np.concatenate([stream, y])
#     else:
#         stream = y
#     return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]


demo = gr.Interface(
    EmotionRecogniser,
    ["state", gr.Audio(sources=["microphone"], streaming=True, every=1.0)],
    ["state", "label"],
    live=True,
)

demo.launch()
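The two "state" slots connect EmotionRecogniser's stream argument to its first return value across calls, gradio's usual pattern for accumulating streamed audio, and the "label" output renders the returned probability dict as a ranked list. Conceptually, each streamed update amounts to:

# new_state, label_value = EmotionRecogniser(previous_state, (sr, chunk))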
model8723.json
ADDED
@@ -0,0 +1 @@
{"class_name": "Sequential", "config": {"name": "sequential", "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 448, 15], "dtype": "float32", "sparse": false, "ragged": false, "name": "lstm_input"}, "registered_name": null}, {"module": "keras.layers", "class_name": "LSTM", "config": {"name": "lstm", "trainable": true, "dtype": "float32", "batch_input_shape": [null, 448, 15], "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 64, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "recurrent_initializer": {"module": "keras.initializers", "class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "registered_name": null, "build_config": {"input_shape": [null, 448, 15]}}, {"module": "keras.layers", "class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "dtype": "float32", "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 64, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "recurrent_initializer": {"module": "keras.initializers", "class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "registered_name": null, "build_config": {"input_shape": [null, 448, 64]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", "units": 8, "activation": "softmax", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 64]}}]}, "keras_version": "2.15.0", "backend": "tensorflow"}
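For readability, the JSON above serializes a two-layer LSTM stack over (448 timesteps x 15 features) with an 8-way softmax head; an equivalent Keras 2.15 sketch:

from keras.models import Sequential
from keras.layers import LSTM, Dense

model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(448, 15)),  # "lstm"
    LSTM(64),                                                # "lstm_1"
    Dense(8, activation='softmax'),                          # "dense": 8 emotion classes
])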
model8723_weights.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c5acd498600e161e247956e57b46d051444c65fba7a9799e393e36386b2bb7b6
size 235056
requirements.txt
ADDED
@@ -0,0 +1,7 @@
pydub
noisereduce
pyaudio
json-tricks
tensorflow
keras
librosa
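Note: app.py also imports matplotlib, transformers, and gradio, none of which are listed here. gradio is preinstalled on Spaces, but transformers typically is not, so the "from transformers import pipeline" line will fail at startup unless it is added; pyaudio additionally needs the system portaudio library (on Spaces, a packages.txt entry such as portaudio19-dev), since pip builds it from source. A fuller list might read:

pydub
noisereduce
pyaudio
json-tricks
tensorflow
keras
librosa
matplotlib
transformers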