MLSpeech committed on
Commit
ec376c7
·
verified ·
1 Parent(s): cc24f04

Initial app commit

Browse files
Files changed (2) hide show
  1. app.py +142 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+ import torchaudio
5
+ from librosa.sequence import dtw as lib_dtw
6
+ from scipy.stats import zscore
7
+ import gradio as gr
8
+ from transformers import HubertModel
9
+
10
# Step, in feature frames, between non-empty time labels; create_df() passes
# this to time_txt() when it fills the 'time_txt' column.
time_frame = 1
# Sample rate that every loaded recording is resampled to before feature
# extraction; also used to convert frame indices into sample offsets.
expected_sr = 16000
12
+
13
_HUBERT_MODEL_NAME = "facebook/hubert-base-ls960"
# Loaded models keyed by device name, so the checkpoint is read once per
# process instead of once per button click.
_hubert_cache = {}


def _load_mono_waveform(wav_path):
    """Load ``wav_path`` as a (1, samples) tensor sampled at ``expected_sr``."""
    wav, sr = torchaudio.load(wav_path)
    if sr != expected_sr:
        print(f"Sampling rate of {wav_path} is not {expected_sr} -> Resampling the file")
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=expected_sr)
        wav = resampler(wav)
    # torchaudio yields (channels, samples). Averaging the channels keeps a
    # leading dimension of 1, so a stereo file is not mistaken for a batch of
    # two separate utterances by the model.
    return wav.mean(dim=0, keepdim=True)


def _get_hubert_model(device):
    """Return the HuBERT model placed on ``device``, loading it on first use."""
    key = str(device)
    if key not in _hubert_cache:
        model = HubertModel.from_pretrained(_HUBERT_MODEL_NAME)
        model.to(device)
        model.eval()
        _hubert_cache[key] = model
    return _hubert_cache[key]


def calculateDistances(snd1, snd2):
    """Align two recordings with DTW over their HuBERT features.

    Both files are loaded, resampled to ``expected_sr`` if needed and mixed
    down to mono. Hidden states of one HuBERT layer are extracted for each
    recording, the two feature sequences are z-scored together, and the
    DTW warping path between them is returned.

    Args:
        snd1: Path of the first audio file.
        snd2: Path of the second audio file.

    Returns:
        The list of per-step alignment dicts produced by ``librosa_dtw``.

    Raises:
        ValueError: If either input is ``None`` (no audio was provided).
    """
    if snd1 is None or snd2 is None:
        raise ValueError("Two audio inputs are required to compute distances")

    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(f'Running on {device_name}')
    model = _get_hubert_model(device)

    # Index into the tuple of hidden states returned by the model
    # (presumably the last transformer layer of the base checkpoint).
    layer = 12

    # Features are kept per recording rather than concatenated and split again
    # by file name, so two inputs that share a name are still told apart.
    speaker_features = []
    with torch.no_grad():  # inference only, no autograd graph needed
        for wav_path in (snd1, snd2):
            wav = _load_mono_waveform(wav_path).to(device)
            hidden_states = model(
                wav, return_dict=True, output_hidden_states=True
            ).hidden_states
            # [0] drops the batch dimension only; squeeze() would also drop
            # the time dimension of a one-frame recording.
            speaker_features.append(hidden_states[layer][0].cpu().numpy())

    features_speaker1, features_speaker2 = mut_normalize_sequences(
        speaker_features[0], speaker_features[1], True
    )
    return librosa_dtw(features_speaker1, features_speaker2)
66
+
67
def mut_normalize_sequences(sq1, sq2, normalize: bool):
    """Z-score two sequences jointly, one feature dimension at a time.

    The mean and (population) standard deviation of each dimension are
    computed over the frames of both sequences together, so the two outputs
    share one scale. A dimension that is constant across all frames has zero
    deviation; it is mapped to zeros instead of NaN so that later distance
    computations stay finite.

    Args:
        sq1: numpy array of shape (t1, d).
        sq2: numpy array of shape (t2, d).
        normalize: if True, normalize the sequences together; if False, the
            inputs are returned unchanged.

    Returns:
        Tuple ``(sq1, sq2)`` with shapes (t1, d) and (t2, d). The inputs are
        never modified in place.
    """
    if not normalize:
        return sq1, sq2

    len_sq1 = sq1.shape[0]
    # np.concatenate allocates a fresh array, so no explicit copies are needed.
    arr = np.concatenate((sq1, sq2), axis=0)
    if not np.issubdtype(arr.dtype, np.floating):
        # Integer input would otherwise truncate the z-scores.
        arr = arr.astype(np.float64)
    if arr.shape[0] == 0:
        # Nothing to normalize; avoid mean/std of an empty axis.
        return arr[:len_sq1, :], arr[len_sq1:, :]

    mean = arr.mean(axis=0, keepdims=True)
    std = arr.std(axis=0, keepdims=True)
    # Constant dimensions: dividing by 1 leaves the centred zeros untouched.
    std[std == 0] = 1
    arr = (arr - mean) / std

    return arr[:len_sq1, :], arr[len_sq1:, :]
85
+
86
+
87
def librosa_dtw(sq1, sq2):
    """Run DTW between two feature sequences and describe the warping path.

    Args:
        sq1: numpy array of shape (t1, d) for the first speaker.
        sq2: numpy array of shape (t2, d) for the second speaker.

    Returns:
        A list with one dict per warping-path step, in the reverse of the
        order returned by ``lib_dtw``. Each dict holds, for both speakers,
        the frame index, the start/end sample offsets and the start/end
        times of that frame, plus ``"cost"``: the cost-matrix value at that
        step divided by ``len(sq1) + len(sq2)``. All values are native
        Python ``int``/``float`` so the list serializes as plain JSON.
    """
    # lib_dtw expects features along the first axis, hence the transposes.
    D, wp = lib_dtw(sq1.transpose(), sq2.transpose(), backtrack=True)

    # Loop invariants. 0.02 is the duration of one feature frame in seconds
    # (presumably HuBERT's 20 ms frame stride - confirm if the model changes).
    samples_per_chunk = 0.02 * expected_sr
    total_frames = len(sq1) + len(sq2)

    out_wp = []
    for i, j in wp[::-1]:
        # Path entries are numpy integers; convert for JSON-friendly output.
        i, j = int(i), int(j)
        out_wp.append({
            "spk1_index": i,
            "spk1_frame_start": i * samples_per_chunk,
            "spk1_frame_end": i * samples_per_chunk + samples_per_chunk,
            "spk1_time_start": i * 0.02,
            "spk1_time_end": i * 0.02 + 0.02,
            "spk2_index": j,
            "spk2_frame_start": j * samples_per_chunk,
            "spk2_frame_end": j * samples_per_chunk + samples_per_chunk,
            "spk2_time_start": j * 0.02,
            "spk2_time_end": j * 0.02 + 0.02,
            "cost": float(D[i, j]) / total_frames,
        })

    return out_wp
100
+
101
+
102
def time_txt(time, time_frame=5, frame_duration=0.02):
    """Return a time label for a frame index, or "" for unlabeled frames.

    Args:
        time: Frame index.
        time_frame: Only indices that are multiples of this value get a
            label; all others yield an empty string.
        frame_duration: Seconds covered by one frame. Defaults to 0.02,
            the value that was previously hard-coded.

    Returns:
        The frame's start time in seconds, rounded to two decimals and
        formatted as a string, or "" when ``time`` is not a multiple of
        ``time_frame``.
    """
    if time % time_frame != 0:
        return ""
    return f"{round(time * frame_duration, 2)}"
106
+
107
def create_df(feats, speaker_len, names):
    """Build a per-frame DataFrame from stacked speaker features.

    Args:
        feats: 2-D array of shape (total_frames, d) holding the frames of all
            speakers stacked in order.
        speaker_len: Number of frames contributed by each speaker, in the
            same order as the rows of ``feats``.
        names: Speaker name for each entry of ``speaker_len``.

    Returns:
        Tuple ``(data_subset, df_subset, cols)`` where ``cols`` is the list of
        feature column names (``"val 0"`` ...), ``df_subset`` is the DataFrame
        with the feature columns plus ``idx``, ``speaker_id``, ``speaker_len``,
        ``com_sum`` (row offset of the speaker's first frame), ``speaker``,
        ``time`` (frame index within the speaker) and ``time_txt``, and
        ``data_subset`` is the feature columns as a numpy array.

    Raises:
        ValueError: If ``speaker_len`` does not add up to the number of rows
            in ``feats`` or if there are fewer names than speakers.
    """
    n_speakers = len(speaker_len)
    # Explicit checks instead of asserts, which are stripped under ``-O``.
    if len(names) < n_speakers:
        raise ValueError(
            f"Got {len(names)} names for {n_speakers} speakers"
        )
    if sum(speaker_len) != feats.shape[0]:
        raise ValueError(
            f"speaker_len sums to {sum(speaker_len)} but feats has {feats.shape[0]} rows"
        )

    cols = [f"val {i}" for i in range(feats.shape[1])]
    df = pd.DataFrame(feats, columns=cols)
    df['idx'] = df.index

    lengths = np.asarray(speaker_len, dtype=np.int64)
    # Row offset at which each speaker's block of frames starts.
    offsets = np.cumsum(lengths) - lengths
    # One speaker id per row: id k repeated speaker_len[k] times.
    speaker_ids = np.repeat(np.arange(n_speakers, dtype=np.int64), lengths)

    df['speaker_id'] = speaker_ids
    df['speaker_len'] = lengths[speaker_ids]
    df['com_sum'] = offsets[speaker_ids]
    df['speaker'] = [names[i] for i in speaker_ids]
    df['time'] = df['idx'] - df['com_sum']
    df['time_txt'] = df['time'].apply(lambda t: time_txt(t, time_frame))

    df_subset = df.copy()
    data_subset = df_subset[cols].values
    return data_subset, df_subset, cols
126
+
127
+
128
# Gradio front end: two audio inputs (recorded or uploaded), a button that
# triggers the alignment, and a JSON panel showing the returned path.
_AUDIO_SOURCES = ("microphone", "upload")

with gr.Blocks() as demo:
    sound1 = gr.Audio(sources=list(_AUDIO_SOURCES), type="filepath")
    sound2 = gr.Audio(sources=list(_AUDIO_SOURCES), type="filepath")
    runbtn = gr.Button("Run")
    json = gr.JSON()

    runbtn.click(
        fn=calculateDistances,
        inputs=[sound1, sound2],
        outputs=json,
    )


# The launch call must sit behind the main guard. See
# https://github.com/huggingface/transformers/pull/34966#issuecomment-2538598145
if __name__ == "__main__":
    demo.launch(ssr_mode=False)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ numpy==1.26.4
2
+ pandas
3
+ torch
4
+ torchaudio
5
+ librosa
6
+ scipy
7
+ gradio
8
+ transformers