CosyVoice committed on
Commit
f6b5c42
1 Parent(s): ff8e635
tools/extract_embedding.py CHANGED
@@ -26,9 +26,9 @@ def single_job(utt):
26
  if sample_rate != 16000:
27
  audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
28
  feat = kaldi.fbank(audio,
29
- num_mel_bins=80,
30
- dither=0,
31
- sample_frequency=16000)
32
  feat = feat - feat.mean(dim=0, keepdim=True)
33
  embedding = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
34
  return utt, embedding
 
26
  if sample_rate != 16000:
27
  audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(audio)
28
  feat = kaldi.fbank(audio,
29
+ num_mel_bins=80,
30
+ dither=0,
31
+ sample_frequency=16000)
32
  feat = feat - feat.mean(dim=0, keepdim=True)
33
  embedding = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.unsqueeze(dim=0).cpu().numpy()})[0].flatten().tolist()
34
  return utt, embedding
tools/extract_speech_token.py CHANGED
@@ -33,7 +33,7 @@ def single_job(utt):
33
  else:
34
  feat = whisper.log_mel_spectrogram(audio, n_mels=128)
35
  speech_token = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
36
- ort_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
37
  return utt, speech_token
38
 
39
 
 
33
  else:
34
  feat = whisper.log_mel_spectrogram(audio, n_mels=128)
35
  speech_token = ort_session.run(None, {ort_session.get_inputs()[0].name: feat.detach().cpu().numpy(),
36
+ ort_session.get_inputs()[1].name: np.array([feat.shape[2]], dtype=np.int32)})[0].flatten().tolist()
37
  return utt, speech_token
38
 
39