KhadgaA committed
Commit 5d41544 · 1 parent: ca632af

Init commit

Files changed (4)
  1. app.py +197 -0
  2. model8723.json +1 -0
  3. model8723_weights.h5 +3 -0
  4. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,197 @@
+
+ import os
+ from json_tricks import load
+
+ import numpy as np
+
+ import librosa
+ from pydub import AudioSegment, effects
+ import noisereduce as nr
+
+ import tensorflow as tf
+ import keras
+ from keras.models import model_from_json
+ from keras.models import load_model
+
+ import matplotlib.pyplot as plt
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ saved_model_path = r'./model8723.json'
+ saved_weights_path = r'./model8723_weights.h5'
+
+ # Read the model architecture from the JSON file
+ with open(saved_model_path, 'r') as json_file:
+     json_savedModel = json_file.read()
+
+ # Load the model architecture and weights
+ model = tf.keras.models.model_from_json(json_savedModel)
+ model.load_weights(saved_weights_path)
+
+ # Compile the model with similar parameters to the original model.
+ model.compile(loss='categorical_crossentropy',
+               optimizer='RMSProp',
+               metrics=['categorical_accuracy'])
+
+ model.summary()
+
+ def convert(y, sr):
+     # Convert float samples in [-1, 1] to int16
+     y = np.array(y * (1 << 15), dtype=np.int16)
+     audio_segment = AudioSegment(
+         y.tobytes(),
+         frame_rate=sr,
+         sample_width=y.dtype.itemsize,
+         channels=1
+     )
+     return audio_segment
+
+ def preprocess(y, sr):
+     '''
+     Preprocess raw audio samples before executing a prediction.
+     Arguments:
+     - y - The raw audio samples.
+     - sr - The sample rate of the audio.
+     Return:
+     'X_3D' variable, with a shape of (batch, timesteps, feature) for a single file (batch = 1).
+     '''
+     total_length = 204288   # Expected total number of samples.
+     frame_length = 2048     # Length of the frame over which to compute the speech features.
+     hop_length = 512        # Number of samples to advance for each frame.
+
+     # Convert the raw samples to a pydub AudioSegment
+     rawsound = convert(y, sr)
+     # Normalize with 5 dBFS of headroom
+     normalizedsound = effects.normalize(rawsound, headroom=5.0)
+     # Transform the audio into an np.array of samples
+     normal_x = np.array(normalizedsound.get_array_of_samples(), dtype='float32')
+
+     final_x = nr.reduce_noise(normal_x, sr=sr)  # updated 03/03/22
+
+     # Feature extraction
+     f1 = librosa.feature.rms(y=final_x, frame_length=frame_length, hop_length=hop_length, center=True, pad_mode='reflect').T  # Energy - Root Mean Square
+     f2 = librosa.feature.zero_crossing_rate(final_x, frame_length=frame_length, hop_length=hop_length, center=True).T  # Zero Crossing Rate
+     f3 = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13, hop_length=hop_length).T  # MFCCs
+
+     X = np.concatenate((f1, f2, f3), axis=1)
+     # Zero-pad the feature matrix to 448 frames
+     padding_rows = 448 - len(X)
+     X = np.vstack((X, np.zeros((padding_rows, 15))))
+
+     X_3D = np.expand_dims(X, axis=0)
+
+     return X_3D
+
+ emotions = {
+     0: 'neutral',
+     1: 'calm',
+     2: 'happy',
+     3: 'sad',
+     4: 'angry',
+     5: 'fearful',
+     6: 'disgust',
+     7: 'surprised'
+ }
+ emo_list = list(emotions.values())
+
+ def is_silent(data):
+     # Returns True if the peak amplitude is below the 'silent' threshold
+     return max(data) < 100
+ import pyaudio
+ import wave
+ from array import array
+ import struct
+ import time
+
+ # Initialize recording parameters
+ RATE = 24414
+ CHUNK = 512
+ RECORD_SECONDS = 7.1
+
+ CHANNELS = 1
+ WAVE_OUTPUT_FILE = "./output.wav"
+
+
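+ # Note: int(RATE * RECORD_SECONDS) = int(24414 * 7.1) = 173339 samples, the
+ # 7.1 s window length that the commented-out check in EmotionRecogniser
+ # below compares the accumulated stream against.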
+ def EmotionRecogniser(stream, new_chunk):
+     # Accumulate incoming audio in `stream` and update the emotion prediction for each new chunk.
+     sr, y = new_chunk
+
+     y = y.astype(np.float32)
+     y /= np.max(np.abs(y))
+
+     # SESSION START
+     print("** session started")
+     total_predictions = []  # A list for all predictions in the session.
+     if stream is not None:
+         stream = np.concatenate([stream, y])
+     else:
+         stream = y
+
+     # if len(stream) < int(RATE * RECORD_SECONDS):
+     #     return stream, 'neutral'
+
+     x = preprocess(y=stream, sr=sr)  # Preprocess the accumulated stream.
+     print('x shape:', x.shape)
+     # Model prediction => an array of 8 emotion probabilities.
+     predictions = model.predict(x, use_multiprocessing=True)
+     pred_list = list(predictions)
+     pred_np = np.squeeze(np.array(pred_list).tolist(), axis=0)  # Get rid of 'array' & 'dtype' statements.
+     total_predictions.append(pred_np)
+
+     # Dict of emotions with their respective probabilities
+     emotions_prob = dict(zip(emo_list, pred_np))
+     max_emo = np.argmax(predictions)
+     print('max emotion:', emotions.get(max_emo, -1))
+
+     stream = stream[len(y):]  # Reset the stream for the next session.
+
+     return stream, emotions_prob
+
+ # Present the emotion distribution for the whole session.
+ # total_predictions_np = np.mean(np.array(total_predictions).tolist(), axis=0)
+ # fig = plt.figure(figsize=(10, 5))
+ # plt.bar(emo_list, total_predictions_np, color='indigo')
+ # plt.ylabel("Mean probability (%)")
+ # plt.title("Session Summary")
+ # plt.show()
+
+ # print(f"Emotions analyzed for: {(toc - tic):0.4f} seconds")
+ # return str(emotions.get(np.argmax(total_predictions_np), -1))
+
+ ##################################################
+
+ import gradio as gr
+ from transformers import pipeline
+ import numpy as np
+
+ # transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
+
+ # def transcribe(stream, new_chunk):
+ #     sr, y = new_chunk
+ #     y = y.astype(np.float32)
+ #     y /= np.max(np.abs(y))
+
+ #     if stream is not None:
+ #         stream = np.concatenate([stream, y])
+ #     else:
+ #         stream = y
+ #     return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]
+
+
+ demo = gr.Interface(
+     EmotionRecogniser,
+     ["state", gr.Audio(sources=["microphone"], streaming=True, every=1.0)],
+     ["state", 'label'],
+     live=True,
+ )
+
+ demo.launch()
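
For reference, the tensor handed to the model must match the (448, 15) input the architecture expects: one RMS column, one ZCR column, and 13 MFCC columns per frame, zero-padded to 448 rows. A minimal shape check (a sketch, not part of the commit, assuming librosa's default centering, under which n samples yield 1 + n // hop_length frames):

    import numpy as np
    import librosa

    sr = 24414                        # RATE used in app.py
    y = np.zeros(204288, np.float32)  # total_length used in preprocess()

    f1 = librosa.feature.rms(y=y, frame_length=2048, hop_length=512).T               # (400, 1)
    f2 = librosa.feature.zero_crossing_rate(y, frame_length=2048, hop_length=512).T  # (400, 1)
    f3 = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=512).T               # (400, 13)

    X = np.concatenate((f1, f2, f3), axis=1)          # (400, 15)
    X = np.vstack((X, np.zeros((448 - len(X), 15))))  # zero-pad to 448 frames
    print(np.expand_dims(X, axis=0).shape)            # (1, 448, 15)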
model8723.json ADDED
@@ -0,0 +1 @@
+ {"class_name": "Sequential", "config": {"name": "sequential", "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 448, 15], "dtype": "float32", "sparse": false, "ragged": false, "name": "lstm_input"}, "registered_name": null}, {"module": "keras.layers", "class_name": "LSTM", "config": {"name": "lstm", "trainable": true, "dtype": "float32", "batch_input_shape": [null, 448, 15], "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 64, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "recurrent_initializer": {"module": "keras.initializers", "class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "registered_name": null, "build_config": {"input_shape": [null, 448, 15]}}, {"module": "keras.layers", "class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "dtype": "float32", "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 64, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "recurrent_initializer": {"module": "keras.initializers", "class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "registered_name": null, "build_config": {"input_shape": [null, 448, 64]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", "units": 8, "activation": "softmax", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 64]}}]}, "keras_version": "2.15.0", "backend": "tensorflow"}
model8723_weights.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5acd498600e161e247956e57b46d051444c65fba7a9799e393e36386b2bb7b6
+ size 235056
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ pydub
+ noisereduce
+ pyaudio
+ json-tricks
+ tensorflow
+ keras
+ librosa
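
Note that app.py also imports gradio, transformers, and matplotlib, which are not listed here (in a Gradio Space, gradio itself is supplied by the Space SDK). A possible extended list, assuming those imports are kept:

    pydub
    noisereduce
    pyaudio
    json-tricks
    tensorflow
    keras
    librosa
    transformers
    matplotlib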