khalida1wwin committed on
Commit
33192bb
·
1 Parent(s): 81763a1

Add new files

Browse files
Files changed (1) hide show
  1. app.py +240 -7
app.py CHANGED
@@ -1,9 +1,242 @@
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
 
3
- gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech",
4
- description="TTS using FastSpeech2",
5
- outputs = 'audio',
6
- title="Text to Speech (TTS)",
7
- examples=[["The quick brown fox jumps over the lazy dog."]],
8
- article = "Author: <a href=\"https://huggingface.co/rowel\">Rowel Atienza</a>",
9
- ).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import tensorflow as tf
3
+ from tensorflow.keras.models import model_from_json
4
+ import soundfile as sf
5
+ import numpy as np
6
+ import os
7
+ import scipy
8
+ from scipy.io import wavfile
9
  import gradio as gr
10
 
11
def audio_to_audio_frame_stack(sound_data, frame_length, hop_length_frame):
    """Split an audio signal into fixed-size frames stacked in one array.

    Frames start at offsets 0, hop_length_frame, 2 * hop_length_frame, ...
    Only complete frames are kept, so trailing samples that do not fill a
    whole frame are dropped.

    Args:
        sound_data: Array of samples; it is sliced along its first axis.
        frame_length: Number of samples in each frame.
        hop_length_frame: Offset, in samples, between two frame starts.

    Returns:
        The frames stacked with ``np.vstack``. For a 1-D signal this is an
        array of shape (nb_frame, frame_length).

    Raises:
        ValueError: If ``frame_length`` or ``hop_length_frame`` is not
            positive, or if the signal is shorter than one frame.
    """
    if frame_length <= 0 or hop_length_frame <= 0:
        raise ValueError(
            "frame_length and hop_length_frame must be positive, got "
            f"frame_length={frame_length}, hop_length_frame={hop_length_frame}")

    sequence_sample_length = sound_data.shape[0]
    if sequence_sample_length < frame_length:
        # Without this guard np.vstack fails on an empty list with an
        # unrelated-looking "need at least one array to concatenate".
        raise ValueError(
            f"audio has {sequence_sample_length} samples, which is shorter "
            f"than one frame (frame_length={frame_length})")

    # Last offset at which a complete frame still fits.
    last_start = sequence_sample_length - frame_length
    frames = [sound_data[start:start + frame_length]
              for start in range(0, last_start + 1, hop_length_frame)]

    return np.vstack(frames)
22
+
23
+
24
def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate, frame_length, hop_length_frame, min_duration):
    """Load audio files from a directory and stack all their frames.

    Each file is loaded with ``librosa.load`` at ``sample_rate``. Files whose
    duration is at least ``min_duration`` are cut into frames with
    ``audio_to_audio_frame_stack``; shorter files are reported on stdout and
    skipped.

    Args:
        audio_dir: Directory containing the audio files.
        list_audio_files: File names, relative to ``audio_dir``.
        sample_rate: Sampling rate passed to ``librosa.load``.
        frame_length: Number of samples in each frame.
        hop_length_frame: Offset, in samples, between two frame starts.
        min_duration: Minimum duration (as measured by
            ``librosa.get_duration``) for a file to be kept.

    Returns:
        The frames of all kept files stacked with ``np.vstack``.

    Raises:
        ValueError: If no file was kept, so there is nothing to stack.
    """
    list_sound_array = []

    for file_name in list_audio_files:
        file_path = os.path.join(audio_dir, file_name)
        # open the audio file
        y, sr = librosa.load(file_path, sr=sample_rate)
        total_duration = librosa.get_duration(y=y, sr=sr)

        if total_duration >= min_duration:
            list_sound_array.append(audio_to_audio_frame_stack(
                y, frame_length, hop_length_frame))
        else:
            print(
                f"The following file {file_path} is below the min duration")

    if not list_sound_array:
        # np.vstack([]) would fail with a message that hides the real cause.
        raise ValueError(
            f"no audio file in {audio_dir!r} reached min_duration={min_duration}; "
            "nothing to stack")

    return np.vstack(list_sound_array)
43
+
44
+
45
def blend_noise_randomly(voice, noise, nb_samples, frame_length):
    """Build random voice/noise mixtures from stacks of frames.

    For each of the ``nb_samples`` output rows, one voice frame and one noise
    frame are picked at random, and the noise frame is scaled by a level
    drawn uniformly from [0.2, 0.8).

    Returns a tuple ``(prod_voice, prod_noise, prod_noisy_voice)`` of arrays
    of shape (nb_samples, frame_length): the picked voice frames, the scaled
    noise frames, and their sum.
    """
    out_shape = (nb_samples, frame_length)
    prod_voice = np.zeros(out_shape)
    prod_noise = np.zeros(out_shape)

    nb_voice_frames = voice.shape[0]
    nb_noise_frames = noise.shape[0]

    for row in range(nb_samples):
        # Draw order (voice index, noise index, level) is kept so that a
        # seeded generator produces the same mixtures.
        voice_row = np.random.randint(0, nb_voice_frames)
        noise_row = np.random.randint(0, nb_noise_frames)
        gain = np.random.uniform(0.2, 0.8)
        prod_voice[row, :] = voice[voice_row, :]
        prod_noise[row, :] = gain * noise[noise_row, :]

    # Row-wise sum of the two stacks, done once for all rows.
    prod_noisy_voice = prod_voice + prod_noise

    return prod_voice, prod_noise, prod_noisy_voice
63
+
64
+
65
def audio_to_magnitude_db_and_phase(n_fft, hop_length_fft, audio):
    """Compute the STFT of an audio signal as (magnitude in dB, phase).

    The magnitude is converted to dB relative to its own maximum
    (``ref=np.max``); the phase is the second value returned by
    ``librosa.magphase``.
    """
    spectrum = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length_fft)
    magnitude, phase = librosa.magphase(spectrum)

    return librosa.amplitude_to_db(magnitude, ref=np.max), phase
76
+
77
+
78
def numpy_audio_to_matrix_spectrogram(numpy_audio, dim_square_spec, n_fft, hop_length_fft):
    """Convert a stack of audio frames into stacks of spectrograms.

    ``numpy_audio`` holds one frame per row. Each frame is passed to
    ``audio_to_magnitude_db_and_phase``. The results are stored in two arrays
    of shape (nb_frame, dim_square_spec, dim_square_spec): a real one for the
    dB magnitudes and a complex one for the phases.
    """
    nb_audio = numpy_audio.shape[0]
    spec_shape = (nb_audio, dim_square_spec, dim_square_spec)

    m_mag_db = np.zeros(spec_shape)
    m_phase = np.zeros(spec_shape, dtype=complex)

    for index in range(nb_audio):
        magnitude_db, phase = audio_to_magnitude_db_and_phase(
            n_fft, hop_length_fft, numpy_audio[index])
        m_mag_db[index, :, :] = magnitude_db
        m_phase[index, :, :] = phase

    return m_mag_db, m_phase
93
+
94
+
95
def magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, stftaudio_magnitude_db, stftaudio_phase):
    """Rebuild a time-domain frame from a dB magnitude and a phase.

    The dB magnitude is converted back to amplitude (``ref=1.0``), multiplied
    by the phase to form the STFT, and inverted with ``istft`` to a signal of
    ``frame_length`` samples.
    """
    magnitude = librosa.db_to_amplitude(stftaudio_magnitude_db, ref=1.0)

    # Recombine magnitude and phase into the STFT expected by istft.
    recombined_stft = magnitude * stftaudio_phase

    return librosa.core.istft(recombined_stft, hop_length=hop_length_fft, length=frame_length)
105
+
106
def matrix_spectrogram_to_numpy_audio(m_mag_db, m_phase, frame_length, hop_length_fft):
    """Convert stacks of spectrograms back into a stack of audio frames.

    Each (dB magnitude, phase) pair along the first axis is inverted with
    ``magnitude_db_and_phase_to_audio``; the resulting frames are stacked
    with ``np.vstack``, one frame per row.
    """
    nb_spec = m_mag_db.shape[0]

    frames = [
        magnitude_db_and_phase_to_audio(
            frame_length, hop_length_fft, m_mag_db[index], m_phase[index])
        for index in range(nb_spec)
    ]

    return np.vstack(frames)
119
+
120
def scaled_in(matrix_spec):
    """Global scaling applied to noisy voice spectrograms.

    Fixed affine map (x + 46) / 50, which sends -96 to -1 and 4 to 1.
    """
    shifted = matrix_spec + 46
    return shifted / 50
124
+
125
def scaled_ou(matrix_spec):
    """Global scaling applied to noise model spectrograms.

    Fixed affine map (x - 6) / 82, which sends -76 to -1 and 88 to 1.
    """
    shifted = matrix_spec - 6
    return shifted / 82
129
+
130
def inv_scaled_in(matrix_spec):
    """Inverse of the global scaling applied to noisy voice spectrograms.

    Fixed affine map x * 50 - 46, which sends -1 to -96 and 1 to 4.
    """
    stretched = matrix_spec * 50
    return stretched - 46
134
+
135
def inv_scaled_ou(matrix_spec):
    """Inverse of the global scaling applied to noise model spectrograms.

    Fixed affine map x * 82 + 6, which sends -1 to -76 and 1 to 88.
    """
    stretched = matrix_spec * 82
    return stretched + 6
139
+
140
+
141
def prediction(weights_path, name_model, audio_dir_prediction, dir_save_prediction, audio_input_prediction,
               audio_output_prediction, sample_rate, min_duration, frame_length, hop_length_frame, n_fft, hop_length_fft):
    """Denoise audio files with a pretrained network and save the result.

    The model architecture is read from ``<name_model>.json`` and its weights
    from ``<name_model>.h5``, both inside ``weights_path``. The input files
    are cut into frames and converted to dB spectrograms. The network output
    is treated as a noise estimate and subtracted from the noisy
    spectrograms, and the result is converted back to audio using the
    original phase.

    Args:
        weights_path: Directory holding the model ``.json`` and ``.h5`` files.
        name_model: Base name (no extension) of the model files.
        audio_dir_prediction: Directory containing the noisy input files.
        dir_save_prediction: Directory in which the output file is written.
        audio_input_prediction: List of input file names to denoise.
        audio_output_prediction: File name of the denoised output.
        sample_rate: Sampling rate used to load the inputs and to write the
            output.
        min_duration: Minimum duration for an input file to be processed.
        frame_length: Number of samples in each frame.
        hop_length_frame: Offset, in samples, between two frame starts.
        n_fft: FFT size used for the spectrograms.
        hop_length_fft: STFT hop length used for the spectrograms.

    Returns:
        None. The denoised audio is written to disk with ``soundfile``.
    """
    # load json and create model; `with` closes the file even if read() fails
    with open(os.path.join(weights_path, name_model + '.json'), 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(os.path.join(weights_path, name_model + '.h5'))
    print("Loaded model from disk")

    # Extracting noise and voice from folder and convert to numpy
    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction, sample_rate,
                                 frame_length, hop_length_frame, min_duration)

    # Dimensions of squared spectrogram
    dim_square_spec = int(n_fft / 2) + 1
    print(dim_square_spec)

    # Create Amplitude and phase of the sounds
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    # global scaling to have distribution -1/1
    X_in = scaled_in(m_amp_db_audio)
    # Reshape for prediction: add a trailing channel axis of size 1
    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)
    # Prediction using loaded network
    X_pred = loaded_model.predict(X_in)
    # Rescale back the noise model
    inv_sca_X_pred = inv_scaled_ou(X_pred)
    # Remove noise model from noisy speech
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]
    # Reconstruct audio from denoised spectrogram and phase
    print(X_denoise.shape)
    print(m_pha_audio.shape)
    print(frame_length)
    print(hop_length_fft)
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(
        X_denoise, m_pha_audio, frame_length, hop_length_fft)
    # Number of frames
    nb_samples = audio_denoise_recons.shape[0]
    # Save all frames in one file.
    # NOTE(review): the x10 factor looks like a fixed output gain - confirm.
    denoise_long = audio_denoise_recons.reshape(1, nb_samples * frame_length) * 10

    # os.path.join keeps the path valid whether or not the directory was
    # given with a trailing separator.
    output_path = os.path.join(dir_save_prediction, audio_output_prediction)
    print(output_path)
    sf.write(output_path, denoise_long[0, :], sample_rate)
191
+
192
def denoise_audio(audioName, base_dir="/content/drive/MyDrive/projects/resume projects/denoising2/prod/"):
    """Denoise one recording with the ``model_unet`` network.

    Args:
        audioName: Either the base name (no ``.wav`` extension) of a file
            located in ``base_dir``, or a ``(sample_rate, samples)`` pair as
            delivered by a Gradio ``'audio'`` input. A pair is first written
            to ``<base_dir>/upload.wav`` so it can go through the same
            file-based pipeline.
        base_dir: Directory holding the model files and the input file; the
            output file is written there too.

    Returns:
        Full path of the denoised file, ``<base_dir>/test<name>.wav``.

    Raises:
        ValueError: If ``audioName`` is ``None`` (no audio supplied).
    """
    if audioName is None:
        raise ValueError("no audio was provided")

    # Guarantee a trailing separator so that plain concatenation and
    # os.path.join resolve to the same file.
    work_dir = os.path.join(base_dir, "")

    if isinstance(audioName, str):
        testNo = audioName
        source_rate, data = wavfile.read(work_dir + testNo + ".wav")
    else:
        # Gradio 'audio' inputs arrive as (sample_rate, samples).
        # NOTE(review): the fixed "upload" name is shared by all requests;
        # use a unique name if requests can overlap.
        source_rate, data = audioName
        testNo = "upload"
        wavfile.write(work_dir + testNo + ".wav", source_rate, np.asarray(data))

    t = len(data) / source_rate  # duration in seconds, as a float
    print("t:", t)

    audio_output_prediction = "test" + testNo + ".wav"

    prediction(
        weights_path=work_dir,
        name_model="model_unet",
        audio_dir_prediction=work_dir,
        dir_save_prediction=work_dir,
        audio_input_prediction=[testNo + ".wav"],
        audio_output_prediction=audio_output_prediction,
        sample_rate=8000,
        # The file's own duration is used as the threshold, so the file is
        # never rejected as too short by the duration filter.
        min_duration=t,
        frame_length=8064,
        hop_length_frame=8064,
        n_fft=255,
        hop_length_fft=63,
    )

    # Return the location the file was written to, not just its name, so
    # the caller can open it regardless of the current working directory.
    output_path = work_dir + audio_output_prediction
    print(output_path)
    return output_path
220
+
221
+
222
# Sample recordings offered as one-click examples in the UI; the paths are
# resolved against the current working directory.
examples = [
    [os.path.abspath("3.wav")],
    [os.path.abspath("2.wav")],
]


# Web UI: one audio input, one audio output, both handled by denoise_audio.
iface = gr.Interface(
    fn=denoise_audio,
    inputs='audio',
    outputs='audio',
    verbose=True,
    title='audio to denoised Audio Application',
    description='A simple application to denoise audio speech using UNet deep learning model. Upload your own audio, or click one of the examples to load them.',
    article='''<div>
        <p style="text-align: center"> All you need to do is to upload the audio file and hit submit, then wait for processing. After that click on Play/Pause to listen to the audio. The audio is saved in a wav format.</p>
    </div>''',
    examples=examples,
)

iface.launch()