Spaces: Running on Zero
fix flake
Browse files
tools/extract_embedding.py
CHANGED
@@ -26,9 +26,9 @@ def single_job(utt):
|
|
26 |
if sample_rate != 16000:
|
27 |
audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
|
28 |
feat = kaldi.fbank(audio,
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
feat = feat - feat.mean(dim=0, keepdim=True)
|
33 |
embedding = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
|
34 |
return utt, embedding
|
|
|
26 |
if sample_rate != 16000:
|
27 |
audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
|
28 |
feat = kaldi.fbank(audio,
|
29 |
+
num_mel_bins=80,
|
30 |
+
dither=0,
|
31 |
+
sample_frequency=16000)
|
32 |
feat = feat - feat.mean(dim=0, keepdim=True)
|
33 |
embedding = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
|
34 |
return utt, embedding
|
tools/extract_speech_token.py
CHANGED
@@ -33,7 +33,7 @@ def single_job(utt):
|
|
33 |
else:
|
34 |
feat = whisper.log_mel_spectrogram(audio, n_mels=128)
|
35 |
speech_token = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
|
36 |
-
|
37 |
return utt, speech_token
|
38 |
|
39 |
|
|
|
33 |
else:
|
34 |
feat = whisper.log_mel_spectrogram(audio, n_mels=128)
|
35 |
speech_token = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
|
36 |
+
ort_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
|
37 |
return utt, speech_token
|
38 |
|
39 |
|