NicolasDenier committed on
Commit
30e388f
·
1 Parent(s): b33ecb6

Upload 2 files

Browse files
xvector-generator.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import numpy
4
+ import argparse
5
+ import torchaudio
6
+ from speechbrain.pretrained import EncoderClassifier
7
+ import torch
8
+ from tqdm import tqdm
9
+ import torch.nn.functional as F
10
+
11
# Supported pretrained speaker encoders, keyed by model identifier.
# Each value is the embedding size that f2embed() checks the extracted
# vector against.
spk_model = {
    "speechbrain/spkrec-xvect-voxceleb": 512,
    "speechbrain/spkrec-ecapa-voxceleb": 192,
}
15
+
16
def f2embed(wav_file, classifier, size_embed):
    """Extract a normalised speaker embedding from a single audio file.

    Args:
        wav_file: Path of the audio file; it is read with ``torchaudio.load``.
        classifier: Object providing ``encode_batch`` (this script passes a
            SpeechBrain ``EncoderClassifier``).
        size_embed: Expected length of the first axis of the result.

    Returns:
        The embedding as a NumPy array whose first axis has length
        ``size_embed``.

    Raises:
        AssertionError: If the file is not sampled at 16000 Hz, or if the
            first axis of the embedding does not have length ``size_embed``.
    """
    waveform, sample_rate = torchaudio.load(wav_file)
    assert sample_rate == 16000, sample_rate

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        encoded = classifier.encode_batch(waveform)
        # Scale to unit norm along the last (dim=2) axis.
        unit_norm = F.normalize(encoded, dim=2)

    # Drop singleton axes, move to host memory, and flip the axis order
    # (a no-op for a 1-D vector).
    vector = unit_norm.squeeze().cpu().numpy().transpose()

    first_axis = vector.shape[0]
    assert first_axis == size_embed, first_axis
    return vector
26
+
27
def _utterance_id(wav_path):
    """Build an utterance id from the last three components of ``wav_path``.

    The components are joined with ``-`` and the extension of the file name
    is removed, e.g. ``root/spk/wav/a0001.wav`` -> ``spk-wav-a0001``.
    """
    # Normalise the platform separator so the id is identical on every OS
    # (this is a no-op where os.sep is already "/").
    parts = wav_path.replace(os.sep, "/").split("/")[-3:]
    # Strip only the file extension; a ".wav" occurring inside a directory
    # name must not be touched.
    parts[-1] = os.path.splitext(parts[-1])[0]
    return "-".join(parts)


def process(args):
    """Extract one speaker embedding per ``*.wav`` file and save it as ``.npy``.

    Args:
        args: Namespace providing
            ``splits`` (comma-separated sub-directory names),
            ``arctic_root`` (directory containing the splits),
            ``output_root`` (directory receiving the ``.npy`` files) and
            ``speaker_embed`` (a key of ``spk_model``, also used as the
            pretrained model source).

    Side effects:
        Prints progress information, creates ``args.output_root`` (including
        missing parents) when it does not exist, and writes one
        ``<utterance id>.npy`` file per input utterance.
    """
    wavlst = []
    for split in args.splits.split(","):
        wav_dir = os.path.join(args.arctic_root, split)
        print("wav_dir", wav_dir)
        # glob order depends on the filesystem; sort for reproducible runs.
        wavlst_split = sorted(glob.glob(os.path.join(wav_dir, "*.wav")))
        print("wavlst_split", wavlst_split)
        print(f"{split} {len(wavlst_split)} utterances.")
        wavlst.extend(wavlst_split)

    spkemb_root = args.output_root
    if not os.path.exists(spkemb_root):
        print(f"Create speaker embedding directory: {spkemb_root}")
        # makedirs also creates missing parents; exist_ok tolerates another
        # process creating the directory between the check and this call.
        os.makedirs(spkemb_root, exist_ok=True)

    # Resolve the expected size first so an unsupported model name fails
    # before the (slow) model load.
    size_embed = spk_model[args.speaker_embed]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    classifier = EncoderClassifier.from_hparams(
        source=args.speaker_embed,
        run_opts={"device": device},
        savedir=os.path.join('/tmp', args.speaker_embed),
    )

    for utt_i in tqdm(wavlst, desc="Extract"):
        # TODO rename speaker embedding
        utt_id = _utterance_id(utt_i)
        utt_emb = f2embed(utt_i, classifier, size_embed)
        numpy.save(os.path.join(spkemb_root, f"{utt_id}.npy"), utt_emb)
49
+
50
def main():
    """Parse the command-line options and run the embedding extraction.

    Options:
        --arctic-root / -i: root directory containing the speaker splits.
        --output-root / -o: directory that receives the ``.npy`` files.
        --speaker-embed / -s: pretrained model name; must be a key of
            ``spk_model``.
        --splits: comma-separated sub-directory names under the root.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--arctic-root", "-i", required=True, type=str,
        help="CMU ARCTIC root directory.",
    )
    parser.add_argument(
        "--output-root", "-o", required=True, type=str,
        help="Output directory.",
    )
    parser.add_argument(
        "--speaker-embed", "-s", type=str, required=True,
        # Derived from spk_model so the accepted names and the size table
        # cannot drift apart.
        choices=list(spk_model),
        help="Pretrained model for extracting speaker embedding.",
    )
    parser.add_argument(
        "--splits", type=str,
        help="Comma-separated list of speaker splits (sub-directories of the root).",
        default="cmu_us_bdl_arctic,cmu_us_clb_arctic,cmu_us_rms_arctic,cmu_us_slt_arctic",
    )
    args = parser.parse_args()
    print(
        f"Loading utterances from {args.arctic_root}/{args.splits}, "
        f"Save speaker embedding 'npy' to {args.output_root}, "
        f"Using speaker model {args.speaker_embed} with {spk_model[args.speaker_embed]} size."
    )
    process(args)
63
+
64
# Script entry point. Example invocation:
#
#   python utils/xvector-generator.py \
#       -i /root/data/cmu_arctic/CMUARCTIC \
#       -o /root/data/cmu_arctic/CMUARCTIC/spkrec-xvect \
#       -s speechbrain/spkrec-xvect-voxceleb
if __name__ == "__main__":
    main()
xvectors/french_recording-bernard-candide_segment_090.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a1ce32ce48973c1bd8f91f2a2a03fb7a088d582436b7c105081bb31034a8dd
3
+ size 4224