Initial app commit

- app.py +142 -0
- requirements.txt +8 -0

app.py
ADDED
@@ -0,0 +1,142 @@
import numpy as np
import pandas as pd
import torch
import torchaudio
from librosa.sequence import dtw as lib_dtw
from scipy.stats import zscore
import gradio as gr
from transformers import HubertModel

time_frame = 1       # label every frame in time_txt()
expected_sr = 16000  # HuBERT expects 16 kHz input

def calculateDistances(snd1, snd2):
    # Load wav files and resample if needed
    wav_paths = [
        #"audio/KEI_EF08_EN038.wav",
        #"audio/KEI_KF04_EN038.wav"
        snd1, snd2
    ]

    wavs = []
    for wav_path in wav_paths:
        wav, sr = torchaudio.load(wav_path)
        if sr != expected_sr:
            print(f"Sampling rate of {wav_path} is not {expected_sr} -> Resampling the file")
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=expected_sr)
            wav = resampler(wav)
        # Downmix to mono, keeping the (1, num_samples) shape the model treats as a batch of one
        wav = wav.mean(dim=0, keepdim=True)
        wavs.append(wav)

    # Generate features
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    print(f'Running on {device_name}')

    model = HubertModel.from_pretrained("facebook/hubert-base-ls960").to(device)

    features = None
    speaker_len = []
    layer = 12  # last transformer layer of the base model
    names = [f.rsplit(".", 1)[0] for f in wav_paths]

    for wav in wavs:
        wav_features = model(wav.to(device), return_dict=True, output_hidden_states=True).hidden_states[
            layer].squeeze().detach().cpu().numpy()
        features = wav_features if features is None else np.concatenate([features, wav_features], axis=0)
        speaker_len.append(wav_features.shape[0])

    # Create and fill a dataframe with the details - full dimensionality
    data_subset, df_subset, hubert_feature_columns = create_df(features, speaker_len, names)

    speaker1 = names[0]
    speaker2 = names[1]
    #print(speaker1)
    #print(speaker2)

    # Using full-dimensionality hubert_feature_columns
    features_speaker1 = df_subset[df_subset['speaker'] == speaker1][hubert_feature_columns].to_numpy()
    features_speaker2 = df_subset[df_subset['speaker'] == speaker2][hubert_feature_columns].to_numpy()
    features_speaker1, features_speaker2 = mut_normalize_sequences(features_speaker1, features_speaker2, True)

    distances = librosa_dtw(features_speaker1, features_speaker2)
    return distances

def mut_normalize_sequences(sq1, sq2, normalize: bool):
    """
    Normalize the sequences together by z-scoring each dimension.
    sq1: numpy array of shape (t1, d)
    sq2: numpy array of shape (t2, d)
    normalize: if True, normalize the sequences together
    """
    if normalize:
        sq1 = np.copy(sq1)
        sq2 = np.copy(sq2)
        len_sq1 = sq1.shape[0]

        arr = np.concatenate((sq1, sq2), axis=0)
        for dim in range(sq1.shape[1]):
            arr[:, dim] = zscore(arr[:, dim])
        sq1 = arr[:len_sq1, :]
        sq2 = arr[len_sq1:, :]
    return sq1, sq2


def librosa_dtw(sq1, sq2):
    # librosa's dtw expects (d, t) feature matrices, hence the transposes
    D, wp = lib_dtw(sq1.transpose(), sq2.transpose(), backtrack=True)
    #out_wp_cols = ["EF08_EN038 index", "frame_range", "frame_range_ms", "KF04_EN038 index", 'frame_range', "frame_range_ms", "cost"]
    out_wp = []
    samples_per_chunk = 0.02 * expected_sr  # one HuBERT frame = 20 ms = 320 samples at 16 kHz
    for i, j in wp[::-1]:  # wp is returned end-to-start, so walk it in reverse
        match_cost = D[i, j] / (len(sq1) + len(sq2))
        #print(match_cost)
        out_wp.append({
            "spk1_index": i,
            "spk1_frame_start": i * samples_per_chunk,
            "spk1_frame_end": i * samples_per_chunk + samples_per_chunk,
            "spk1_time_start": i * 0.02,
            "spk1_time_end": i * 0.02 + 0.02,
            "spk2_index": j,
            "spk2_frame_start": j * samples_per_chunk,
            "spk2_frame_end": j * samples_per_chunk + samples_per_chunk,
            "spk2_time_start": j * 0.02,
            "spk2_time_end": j * 0.02 + 0.02,
            "cost": match_cost,
        })

    return out_wp
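# Worked example of the frame arithmetic above (illustrative values only):
# frame index 50 maps to samples 16000-16320 at 16 kHz, i.e. the 20 ms
# window from 1.00 s to 1.02 s.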


def time_txt(time, time_frame=5):
    # Emit a time label (in seconds) every `time_frame` frames, else an empty string
    if time % time_frame == 0:
        return f"{round(time * 0.02, 2)}"
    return ""

def create_df(feats, speaker_len, names):
    cols = [f"val {i}" for i in range(feats.shape[1])]
    df = pd.DataFrame(feats, columns=cols)
    df['idx'] = df.index
    time_index = {i: speaker_len[i] for i in range(len(speaker_len))}
    com_time_index = {i: sum(speaker_len[:i]) for i in range(len(speaker_len))}
    df_speaker_count = pd.Series(time_index)
    # Repeat each speaker id once per frame so it aligns row-for-row with df
    df_speaker_count = df_speaker_count.reindex(df_speaker_count.index.repeat(df_speaker_count.to_numpy())).rename_axis('speaker_id').reset_index()
    df['speaker_id'] = df_speaker_count['speaker_id']
    df['speaker_len'] = df['speaker_id'].apply(lambda row: speaker_len[row])
    df['com_sum'] = df['speaker_id'].apply(lambda i: com_time_index[i])
    df['speaker'] = df['speaker_id'].apply(lambda i: names[i])
    df['time'] = df['idx'] - df['com_sum']
    df['time_txt'] = df[['time', 'speaker_len']].apply(lambda row: time_txt(row['time'], time_frame), axis=1)
    assert len(df.loc[df['speaker'] == -1]) == 0
    assert len(df_speaker_count) == len(df)
    df_subset = df.copy()
    data_subset = df_subset[cols].values
    return data_subset, df_subset, cols


# Main Gradio interface
with gr.Blocks() as demo:
    sound1 = gr.Audio(sources=["microphone", "upload"], type="filepath")
    sound2 = gr.Audio(sources=["microphone", "upload"], type="filepath")
    runbtn = gr.Button("Run")
    json_out = gr.JSON()  # JSON viewer for the DTW alignment output

    runbtn.click(fn=calculateDistances, inputs=[sound1, sound2], outputs=json_out)


# Function calls need to be wrapped in this guard. See https://github.com/huggingface/transformers/pull/34966#issuecomment-2538598145
if __name__ == '__main__':
    demo.launch(ssr_mode=False)
#calculateDistances()
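
A minimal sketch of exercising calculateDistances outside the Gradio UI (the file names below are placeholders, not audio shipped with this commit):

    from app import calculateDistances

    # Any two speech recordings; files are resampled to 16 kHz if needed
    alignment = calculateDistances("speaker_a.wav", "speaker_b.wav")
    print(len(alignment), "aligned frame pairs")
    print(alignment[0])  # frame/time ranges for both speakers plus the normalized DTW cost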
requirements.txt
ADDED
@@ -0,0 +1,8 @@
numpy==1.26.4
pandas
torch
torchaudio
librosa
scipy
gradio
transformers
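
Assuming a standard pip workflow (only numpy is pinned above), the Space can be run locally with:

    pip install -r requirements.txt
    python app.py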