# File size: 5,860 Bytes

# 0a4a54d

# Generated 2022-12-15 from:
# /home/pcp22wc/exps/speaker-recognition/hparams/train_etdnn.yaml
# yamllint disable
# ################################
# Model: Speaker identification with an E-TDNN x-vector embedding model
# Authors: Hwidong Na & Mirco Ravanelli
# ################################

# Basic parameters
# The value 914 is repeated in the torch.manual_seed call and in every result
# path below, so change them together.
seed: 914
# !apply tag: calls torch.manual_seed with the argument list [914].
__set_seed: !apply:torch.manual_seed [914]
# Parent directory of save_folder and train_log below.
output_folder: results/etdnn_augment/914
# Same path as checkpointer.checkpoints_dir.
save_folder: results/etdnn_augment/914/save
# Same path as train_logger.save_file.
train_log: results/etdnn_augment/914/train_log.txt

# Data files
# A single plain string naming two directories separated by ", "
# (VoxCeleb2 dev and VoxCeleb1 test).
# NOTE(review): presumably the data-preparation code splits this on commas --
# confirm against the consumer.
data_folder: /fastdata/pcp22wc/audio/VoxCeleb2/dev, /fastdata/pcp22wc/audio/VoxCeleb1/test  # e.g. /path/to/Voxceleb
# NOTE(review): both annotation paths are relative ("save/...") and are not
# under save_folder (results/etdnn_augment/914/save) -- confirm this is intended.
train_annotation: save/train.csv
valid_annotation: save/dev.csv

# Folder to extract data augmentation files
# Same directory as the openrir_folder of add_rev / add_noise / add_rev_noise.
rir_folder: /fastdata/pcp22wc/audio # Change it if needed
# NOTE(review): not referenced by any other entry visible in this file.
musan_folder: /fastdata/pcp22wc/audio/musan
# The three CSVs below are repeated as the noise_csv of add_music_musan,
# add_noise_musan and add_speech_musan respectively.
music_csv: save/music.csv
noise_csv: save/noise.csv
speech_csv: save/speech.csv

# Use the following links for the official voxceleb splits:
# VoxCeleb1 (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt
# VoxCeleb1-H (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_hard2.txt
# VoxCeleb1-E (cleaned): https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/list_test_all2.txt
# VoxCeleb1-E and VoxCeleb1-H lists are drawn from the VoxCeleb1 training set.
# Therefore you cannot use any files in VoxCeleb1 for training if you are using these lists for testing.
# Trial list in use: the URL matches the "VoxCeleb1 (cleaned)" link listed above.
verification_file: https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt

# NOTE(review): presumably skips the data-preparation step when true -- confirm
# against the training script.
skip_prep: true
ckpt_interval_minutes: 15 # save checkpoint every N min

# Training parameters
# Same value as epoch_counter.limit and lr_annealing.epoch_count.
number_of_epochs: 40
# Same value as dataloader_options.batch_size.
batch_size: 512
# lr matches opt_class.lr and lr_annealing.initial_value;
# lr_final matches lr_annealing.final_value.
lr: 0.001
lr_final: 0.0001
# NOTE(review): not referenced by any other entry visible in this file.
step_size: 65000
# Same value as sample_rate in augment_wavedrop / augment_speed.
sample_rate: 16000
# Same value as openrir_max_noise_len in the EnvCorrupt entries.
sentence_len: 3.0 # seconds
# Same value as dataloader_options.shuffle.
shuffle: true
random_chunk: true

# Feature parameters
# Repeated in compute_features (n_mels, deltas); n_mels also equals
# embedding_model.in_channels.
n_mels: 80
deltas: false

# Number of speakers
# Same value as classifier.out_neurons.
out_n_neurons: 5994  # 1211 for vox1, 5994 for vox2, 7205 for vox1+vox2

# Data-loading options. batch_size and shuffle repeat the top-level
# batch_size / shuffle training parameters.
dataloader_options:
  num_workers: 8
  shuffle: true
  batch_size: 512

# Functions
# Fbank feature extractor (anchor id007), aliased as modules.compute_features.
compute_features: &id007 !new:speechbrain.lobes.features.Fbank
  # Disabled entries, kept for reference:
  # augment_wavedrop: !ref <augment_wavedrop>
  # augment_speed: !ref <augment_speed>
  # Same values as the top-level n_mels / deltas.
  n_mels: 80
  deltas: false

# Embedding network (anchor id008), aliased as modules.embedding_model and
# checkpointer.recoverables.embedding_model.
embedding_model: &id008 !new:speechbrain.lobes.models.Xvector.Xvector
  # tdnn_blocks is 10 and each of the three lists below has 10 entries
  # (presumably one per block -- keep the lengths in sync).
  tdnn_blocks: 10
  tdnn_channels: [512, 512, 512, 512, 512, 512, 512, 512, 512, 1500]
  tdnn_kernel_sizes: [5, 1, 3, 1, 3, 1, 5, 1, 1, 1]
  tdnn_dilations: [2, 1, 1, 1, 1, 1, 2, 1, 1, 1]
  # Same value as n_mels / compute_features.n_mels.
  in_channels: 80
  # Same value as classifier.input_size.
  lin_neurons: 512
  # !name tag (not !new) on torch.nn.LeakyReLU.
  activation: !name:torch.nn.LeakyReLU

# Classifier (anchor id009), aliased as modules.classifier and
# checkpointer.recoverables.classifier.
classifier: &id009 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  out_neurons: 5994  # same value as out_n_neurons
  input_size: 512  # same value as embedding_model.lin_neurons

# Epoch counter (anchor id011), aliased as checkpointer.recoverables.counter.
epoch_counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
  # Same value as number_of_epochs.
  limit: 40


# TimeDomainSpecAugment instance configured with the single speed value 100.
# NOTE(review): not listed in augment_pipeline or modules in this file; its
# only other mention is commented out under compute_features.
augment_wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  speeds: [100]
  sample_rate: 16000

# TimeDomainSpecAugment instance configured with the speed values 95, 100, 105.
# NOTE(review): not listed in augment_pipeline or modules in this file; its
# only other mention is commented out under compute_features.
augment_speed: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  speeds: [95, 100, 105]
  sample_rate: 16000

# EnvCorrupt with reverberation only: reverb_prob 1.0, noise_prob 0.0
# (anchor id001, first entry of augment_pipeline).
add_rev: &id001 !new:speechbrain.lobes.augment.EnvCorrupt
  reverb_prob: 1.0
  noise_prob: 0.0
  rir_scale_factor: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  # Same directory as the top-level rir_folder.
  openrir_folder: /fastdata/pcp22wc/audio
  openrir_max_noise_len: 3.0  # seconds

# EnvCorrupt with noise only: reverb_prob 0.0, noise_prob 1.0
# (anchor id002, second entry of augment_pipeline).
add_noise: &id002 !new:speechbrain.lobes.augment.EnvCorrupt
  reverb_prob: 0.0
  noise_prob: 1.0
  rir_scale_factor: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  # Same directory as the top-level rir_folder.
  openrir_folder: /fastdata/pcp22wc/audio
  openrir_max_noise_len: 3.0  # seconds

# EnvCorrupt with both reverberation and noise: reverb_prob 1.0, noise_prob 1.0
# (anchor id003, third entry of augment_pipeline).
add_rev_noise: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
  reverb_prob: 1.0
  noise_prob: 1.0
  rir_scale_factor: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  # Same directory as the top-level rir_folder.
  openrir_folder: /fastdata/pcp22wc/audio
  openrir_max_noise_len: 3.0  # seconds

# EnvCorrupt using the noise CSV (same path as the top-level noise_csv):
# noise_prob 1.0, reverb and babble 0.0 (anchor id004).
add_noise_musan: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_prob: 1.0
  reverb_prob: 0.0
  babble_prob: 0.0
  noise_snr_low: 0
  noise_snr_high: 15
  noise_csv: save/noise.csv

# EnvCorrupt using the music CSV (same path as the top-level music_csv):
# noise_prob 1.0, reverb and babble 0.0 (anchor id005).
add_music_musan: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_prob: 1.0
  reverb_prob: 0.0
  babble_prob: 0.0
  noise_snr_low: 0
  noise_snr_high: 15
  noise_csv: save/music.csv

# EnvCorrupt using the speech CSV (same path as the top-level speech_csv):
# noise_prob 1.0, reverb and babble 0.0 (anchor id006).
add_speech_musan: &id006 !new:speechbrain.lobes.augment.EnvCorrupt
  noise_prob: 1.0
  reverb_prob: 0.0
  babble_prob: 0.0
  noise_snr_low: 0
  noise_snr_high: 15
  noise_csv: save/speech.csv

# Definition of the augmentation pipeline.
# If concat_augment = False, the augmentation techniques are applied
# in sequence. If concat_augment = True, all the augmented signals
# are concatenated in a single big batch.

# Ordered list of augmenters; each entry is an alias to an EnvCorrupt object
# anchored earlier in this file.
augment_pipeline:
  - *id001  # add_rev
  - *id002  # add_noise
  - *id003  # add_rev_noise
  - *id004  # add_noise_musan
  - *id005  # add_music_musan
  - *id006  # add_speech_musan
concat_augment: true

# Input normalization (anchor id010), aliased as modules.mean_var_norm and
# checkpointer.recoverables.normalizer.
mean_var_norm: &id010 !new:speechbrain.processing.features.InputNormalization
  std_norm: false
  norm_type: sentence

# Name -> object mapping. Every value is an alias to an object anchored
# earlier in this file, so no new instances are created here.
# NOTE(review): augment_wavedrop and augment_speed are not included.
modules:
  compute_features: *id007
  # The six EnvCorrupt augmenters, in the same order as augment_pipeline.
  add_rev: *id001
  add_noise: *id002
  add_rev_noise: *id003
  add_noise_musan: *id004
  add_music_musan: *id005
  add_speech_musan: *id006
  embedding_model: *id008
  classifier: *id009
  mean_var_norm: *id010
# Cost function: an AdditiveAngularMargin loss wrapped in LogSoftmaxWrapper.
compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
    scale: 30
    margin: 0.2

# compute_error: !name:speechbrain.nnet.losses.classification_error

# torch.optim.Adam referenced through !name (not !new) with two preset keyword
# arguments; lr repeats the top-level lr.
opt_class: !name:torch.optim.Adam
  weight_decay: 0.000002
  lr: 0.001

# LinearScheduler configured with the top-level lr (initial_value),
# lr_final (final_value) and number_of_epochs (epoch_count).
lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
  epoch_count: 40
  initial_value: 0.001
  final_value: 0.0001

# Logging + checkpoints
# File-based training logger; save_file is the same path as train_log.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/etdnn_augment/914/train_log.txt

# MetricStats referenced through !name (not !new); its metric is
# classification_error, itself a !name reference with reduction preset to "batch".
error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
    reduction: batch

# Checkpointer writing to the same directory as save_folder.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/etdnn_augment/914/save
  # Each recoverable is an alias to an object anchored earlier in this file:
  # id008 = embedding_model, id009 = classifier, id010 = mean_var_norm,
  # id011 = epoch_counter.
  recoverables:
    embedding_model: *id008
    classifier: *id009
    normalizer: *id010
    counter: *id011