NicolasDenier
committed on
Commit
·
30e388f
1
Parent(s):
b33ecb6
Upload 2 files
Browse files
xvector-generator.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import glob
|
3 |
+
import numpy
|
4 |
+
import argparse
|
5 |
+
import torchaudio
|
6 |
+
from speechbrain.pretrained import EncoderClassifier
|
7 |
+
import torch
|
8 |
+
from tqdm import tqdm
|
9 |
+
import torch.nn.functional as F
|
10 |
+
|
11 |
+
# Maps a speechbrain speaker-encoder model name to the dimensionality of
# the embedding it produces (used to sanity-check extracted vectors).
spk_model = {
    "speechbrain/spkrec-xvect-voxceleb": 512,
    "speechbrain/spkrec-ecapa-voxceleb": 192,
}
|
15 |
+
|
16 |
+
def f2embed(wav_file, classifier, size_embed):
    """Extract a speaker embedding from a single 16 kHz wav file.

    Args:
        wav_file: Path to a wav file; must be sampled at 16 kHz.
        classifier: Loaded speechbrain ``EncoderClassifier``.
        size_embed: Expected embedding dimensionality (see ``spk_model``).

    Returns:
        1-D numpy array of length ``size_embed`` holding the
        L2-normalized speaker embedding.

    Raises:
        ValueError: If the audio is not sampled at 16 kHz, or the
            extracted embedding does not have the expected size.
    """
    signal, fs = torchaudio.load(wav_file)
    # Explicit raise instead of `assert`: asserts are stripped when
    # Python runs with -O, silently skipping input validation.
    if fs != 16000:
        raise ValueError(f"Expected 16 kHz audio, got {fs} Hz: {wav_file}")
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        # Normalize along the feature axis so embeddings are unit-length.
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    # No-op for the 1-D single-utterance case; kept for parity with the
    # original code path.
    embeddings = numpy.transpose(embeddings)
    if embeddings.shape[0] != size_embed:
        raise ValueError(
            f"Embedding size mismatch for {wav_file}: "
            f"expected {size_embed}, got {embeddings.shape[0]}"
        )
    return embeddings
|
26 |
+
|
27 |
+
def process(args):
    """Extract a speaker embedding for every wav file under the given splits.

    For each ``<arctic_root>/<split>/*.wav`` computes a speaker embedding
    with the configured speechbrain model and saves it to
    ``<output_root>/<utt_id>.npy``, where ``utt_id`` is built from the last
    three path components of the wav file.
    """
    # Gather every wav file across all requested splits.
    wavlst = []
    for split in args.splits.split(","):
        wav_dir = os.path.join(args.arctic_root, split)
        wavlst_split = glob.glob(os.path.join(wav_dir, "*.wav"))
        # Summary only; dumping the full file list (as the original debug
        # prints did) floods the console on large corpora.
        print(f"{split} {len(wavlst_split)} utterances.")
        wavlst.extend(wavlst_split)

    spkemb_root = args.output_root
    if not os.path.exists(spkemb_root):
        print(f"Create speaker embedding directory: {spkemb_root}")
        # makedirs creates missing parent directories too; os.mkdir would
        # fail when output_root is nested under a non-existent path.
        os.makedirs(spkemb_root, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    classifier = EncoderClassifier.from_hparams(
        source=args.speaker_embed,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", args.speaker_embed),
    )
    size_embed = spk_model[args.speaker_embed]
    for utt_i in tqdm(wavlst, total=len(wavlst), desc="Extract"):
        # TODO rename speaker embedding
        # Build the utterance id from the last three path components,
        # e.g. "root-split-utt". os.path.normpath + os.sep keeps this
        # portable (the original split on "/" only, breaking on Windows).
        parts = os.path.normpath(utt_i).split(os.sep)[-3:]
        utt_id = "-".join(parts).replace(".wav", "")
        utt_emb = f2embed(utt_i, classifier, size_embed)
        numpy.save(os.path.join(spkemb_root, f"{utt_id}.npy"), utt_emb)
|
49 |
+
|
50 |
+
def main():
    """Parse command-line arguments and run speaker-embedding extraction."""
    parser = argparse.ArgumentParser(
        description="Extract speaker embeddings (x-vectors) from CMU ARCTIC wav files."
    )
    # Help text fixed: this script walks CMU ARCTIC splits, not LibriTTS.
    parser.add_argument("--arctic-root", "-i", required=True, type=str,
                        help="CMU ARCTIC root directory.")
    parser.add_argument("--output-root", "-o", required=True, type=str,
                        help="Output directory.")
    parser.add_argument("--speaker-embed", "-s", type=str, required=True,
                        choices=["speechbrain/spkrec-xvect-voxceleb",
                                 "speechbrain/spkrec-ecapa-voxceleb"],
                        help="Pretrained model for extracting speaker embedding.")
    parser.add_argument("--splits", type=str,
                        help="Splits of the four speakers, separated by commas.",
                        default="cmu_us_bdl_arctic,cmu_us_clb_arctic,cmu_us_rms_arctic,cmu_us_slt_arctic")
    args = parser.parse_args()
    print(f"Loading utterances from {args.arctic_root}/{args.splits}, "
          + f"Save speaker embedding 'npy' to {args.output_root}, "
          + f"Using speaker model {args.speaker_embed} with {spk_model[args.speaker_embed]} size.")
    process(args)
|
63 |
+
|
64 |
+
if __name__ == "__main__":
    # Usage example (the original kept this as a no-op docstring):
    #   python utils/xvector-generator.py \
    #       -i /root/data/cmu_arctic/CMUARCTIC \
    #       -o /root/data/cmu_arctic/CMUARCTIC/spkrec-xvect \
    #       -s speechbrain/spkrec-xvect-voxceleb
    main()
|
xvectors/french_recording-bernard-candide_segment_090.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d3a1ce32ce48973c1bd8f91f2a2a03fb7a088d582436b7c105081bb31034a8dd
|
3 |
+
size 4224
|