File size: 5,860 Bytes
0a4a54d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# Generated 2022-12-15 from:
# /home/pcp22wc/exps/speaker-recognition/hparams/train_etdnn.yaml
# yamllint disable
# ################################
# Model: Speaker identification with an E-TDNN x-vector
# Authors: Hwidong Na & Mirco Ravanelli
# ################################

# Basic parameters
# `seed` is the single source of truth for this stanza: the torch RNG seed and
# the three paths below are derived from it, so changing or overriding `seed`
# keeps them in step. With the default 914 they resolve to the same values as
# the previous literals (results/etdnn_augment/914, .../save, .../train_log.txt).
seed: 914
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: !ref results/etdnn_augment/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data files
# data_folder holds two dataset roots in one comma-separated string
# (e.g. /path/to/Voxceleb); quoted so the embedded ", " is unambiguous.
# NOTE(review): presumably split on "," by the data-preparation code - confirm.
data_folder: '/fastdata/pcp22wc/audio/VoxCeleb2/dev, /fastdata/pcp22wc/audio/VoxCeleb1/test'
# Annotation CSVs for the training and validation splits.
train_annotation: 'save/train.csv'
valid_annotation: 'save/dev.csv'

# Folder to extract data augmentation files
# Change rir_folder if needed; the add_rev / add_noise / add_rev_noise
# augmenters in this file point at the same directory.
rir_folder: '/fastdata/pcp22wc/audio'
musan_folder: '/fastdata/pcp22wc/audio/musan'
# CSV files handed as `noise_csv` to the add_*_musan augmenters in this file.
music_csv: 'save/music.csv'
noise_csv: 'save/noise.csv'
speech_csv: 'save/speech.csv'

# Use the following links for the official voxceleb splits:
# VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
# VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
# VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt
# VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
# Therefore you cannot use any files in VoxCeleb1 for training if you are using these lists for testing.
# The list selected below is the VoxCeleb1 (cleaned) one.
verification_file: https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt

# NOTE(review): presumably tells the training script to skip dataset
# preparation - confirm against the script that reads this file.
skip_prep: true
# Save a checkpoint every N minutes.
ckpt_interval_minutes: 15

# Training parameters
# number_of_epochs is also the value used for epoch_counter.limit and
# lr_annealing.epoch_count further down in this file.
number_of_epochs: 40
# dataloader_options.batch_size below is configured with the same value.
batch_size: 512
# Start/end learning rates; opt_class and lr_annealing further down are
# configured with these same values.
lr: 0.001
lr_final: 0.0001
# NOTE(review): no other entry in this file uses this value - confirm it is
# still consumed by the training script.
step_size: 65000
sample_rate: 16000
sentence_len: 3.0 # seconds
shuffle: true
random_chunk: true

# Feature parameters
# compute_features below is configured with these same n_mels/deltas values,
# and embedding_model.in_channels matches n_mels.
n_mels: 80
deltas: false

# Number of speakers; classifier.out_neurons below is set to the same value.
# 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2
out_n_neurons: 5994

# Data-loader settings.
# batch_size and shuffle reference the training parameters instead of
# repeating the literals, so a single change or override updates both places.
dataloader_options:
  batch_size: !ref <batch_size>
  shuffle: !ref <shuffle>
  num_workers: 8

# Functions
# Feature extractor (anchor id007, reused under `modules`).
# n_mels/deltas reference the feature parameters so the two cannot drift apart.
compute_features: &id007 !new:speechbrain.lobes.features.Fbank
  # Disabled entries, kept for reference:
  # augment_wavedrop: !ref <augment_wavedrop>
  # augment_speed: !ref <augment_speed>
  n_mels: !ref <n_mels>
  deltas: !ref <deltas>

# Speaker-embedding network (anchor id008, reused under `modules` and in the
# checkpointer's recoverables).
# - in_channels references n_mels so the input width tracks the feature size.
# - tdnn_blocks is 10 and each of the three per-block lists has 10 entries.
# - lin_neurons (512) is the same value classifier.input_size is set to.
embedding_model: &id008 !new:speechbrain.lobes.models.Xvector.Xvector
  in_channels: !ref <n_mels>
  activation: !name:torch.nn.LeakyReLU
  tdnn_blocks: 10
  tdnn_channels: [512, 512, 512, 512, 512, 512, 512, 512, 512, 1500]
  tdnn_kernel_sizes: [5, 1, 3, 1, 3, 1, 5, 1, 1, 1]
  tdnn_dilations: [2, 1, 1, 1, 1, 1, 2, 1, 1, 1]
  lin_neurons: 512

# Speaker classifier head (anchor id009, reused under `modules` and in the
# checkpointer's recoverables).
# input_size equals embedding_model.lin_neurons (512) - presumably it must;
# keep the two in sync. out_neurons references the speaker count so it cannot
# disagree with out_n_neurons.
classifier: &id009 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  input_size: 512
  out_neurons: !ref <out_n_neurons>

# Epoch counter (anchor id011, saved by the checkpointer as `counter`).
# The limit references number_of_epochs rather than repeating the literal 40.
epoch_counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: !ref <number_of_epochs>


# Time-domain augmenters.
# Neither object is referenced by augment_pipeline or modules in this file;
# their only mentions are the commented-out lines under compute_features.
# sample_rate references the training parameter instead of repeating 16000.
# NOTE(review): `speeds` presumably lists speed factors in percent - confirm.
augment_wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: !ref <sample_rate>
  speeds: [100]

augment_speed: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: !ref <sample_rate>
  speeds: [95, 100, 105]

# Reverberation-only corruption (reverb_prob 1.0, noise_prob 0.0).
# Anchor id001: first entry of augment_pipeline, also listed under `modules`.
# openrir_folder references rir_folder so the path is defined in one place.
add_rev: &id001 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: !ref <rir_folder>
  openrir_max_noise_len: 3.0    # seconds
  reverb_prob: 1.0
  noise_prob: 0.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

# Noise-only corruption (reverb_prob 0.0, noise_prob 1.0).
# Anchor id002: second entry of augment_pipeline, also listed under `modules`.
# openrir_folder references rir_folder so the path is defined in one place.
add_noise: &id002 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: !ref <rir_folder>
  openrir_max_noise_len: 3.0    # seconds
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

# Reverberation plus noise (reverb_prob 1.0, noise_prob 1.0).
# Anchor id003: third entry of augment_pipeline, also listed under `modules`.
# openrir_folder references rir_folder so the path is defined in one place.
add_rev_noise: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: !ref <rir_folder>
  openrir_max_noise_len: 3.0    # seconds
  reverb_prob: 1.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

# Additive noise drawn from the list in the top-level `noise_csv`
# (anchor id004; babble and reverberation disabled, noise_prob 1.0).
# The CSV path is referenced rather than repeated as a literal.
add_noise_musan: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_csv: !ref <noise_csv>
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15

# Additive noise drawn from the list in the top-level `music_csv`
# (anchor id005; babble and reverberation disabled, noise_prob 1.0).
# The CSV path is referenced rather than repeated as a literal.
add_music_musan: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_csv: !ref <music_csv>
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15

# Additive noise drawn from the list in the top-level `speech_csv`
# (anchor id006; babble and reverberation disabled, noise_prob 1.0).
# The CSV path is referenced rather than repeated as a literal.
add_speech_musan: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_csv: !ref <speech_csv>
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15

# Definition of the augmentation pipeline.
# If concat_augment = False, the augmentation techniques are applied
# in sequence. If concat_augment = True, all the augmented signals
# are concatenated in a single big batch.
# Pipeline order (anchors id001..id006): add_rev, add_noise, add_rev_noise,
# add_noise_musan, add_music_musan, add_speech_musan.

augment_pipeline: [*id001, *id002, *id003, *id004, *id005, *id006]
concat_augment: true

# Input normalizer (anchor id010, reused under `modules` and saved by the
# checkpointer as `normalizer`); configured per sentence, without std scaling.
mean_var_norm: &id010 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
  std_norm: false

# Named modules. Every entry is an alias to an object defined earlier in this
# file: the feature extractor, the six EnvCorrupt augmenters, the embedding
# model, the classifier and the input normalizer.
# augment_wavedrop and augment_speed are not listed here.
modules:
  compute_features: *id007
  add_rev: *id001
  add_noise: *id002
  add_rev_noise: *id003
  add_noise_musan: *id004
  add_music_musan: *id005
  add_speech_musan: *id006
  embedding_model: *id008
  classifier: *id009
  mean_var_norm: *id010
# Loss: an AdditiveAngularMargin loss (margin 0.2, scale 30) wrapped in
# LogSoftmaxWrapper.
compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
    margin: 0.2
    scale: 30

# compute_error: !name:speechbrain.nnet.losses.classification_error

# Optimizer factory (`!name:` tag, so not instantiated at load time).
# lr references the training parameter so that changing or overriding `lr`
# actually reaches the optimizer instead of being shadowed by a literal.
opt_class: !name:torch.optim.Adam
  lr: !ref <lr>
  weight_decay: 0.000002

# Learning-rate schedule configured from lr to lr_final over number_of_epochs.
# All three values reference the training parameters rather than repeating
# 0.001 / 0.0001 / 40, so the schedule stays consistent with them.
lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
  initial_value: !ref <lr>
  final_value: !ref <lr_final>
  epoch_count: !ref <number_of_epochs>

# Logging + checkpoints
# The log path references `train_log` (same value as the previous literal)
# so the file location is defined in one place.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>

# Error-metric factory (`!name:` tags, so nothing is instantiated at load
# time): MetricStats using classification_error with reduction set to "batch".
error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
    reduction: batch

# Checkpointer. The directory references `save_folder` (same value as the
# previous literal) so checkpoints follow the experiment folder.
# Recoverables are aliases to objects defined earlier in this file:
# embedding_model (id008), classifier (id009), mean_var_norm as `normalizer`
# (id010) and epoch_counter as `counter` (id011).
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>
  recoverables:
    embedding_model: *id008
    classifier: *id009
    normalizer: *id010
    counter: *id011