# NOTE(review): removed extraction artifacts that were not valid YAML content:
# a "File size: 4,423 Bytes" banner, a commit hash "e63fe3d |", and a pasted
# line-number gutter (1..163).
# Generated 2023-01-07 from:
# /home/salah/kenlm_train/to_copy/wavlm_partly_frozen.yaml
# yamllint disable
# ################################
# Model: wav2vec2 + DNN + CTC
# Augmentation: SpecAugment
# Authors: Sung-Lin Yeh 2021
# ################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
# Apply the seed at parse time (same literal as `seed` above — this file is
# generated, so !ref indirections were resolved to literals).
__set_seed: !apply:torch.manual_seed [1986]
# Experiment artifacts: the WER report, checkpoints and the training log all
# live under output_folder.
output_folder: partly_frozen_splitted_wavlm
wer_file: partly_frozen_splitted_wavlm/wer.txt
save_folder: partly_frozen_splitted_wavlm/save
train_log: partly_frozen_splitted_wavlm/train_log.txt
# URL for the biggest Fairseq English wav2vec2 model.
# NOTE(review): the comment above looks stale — no URL key follows it; the
# Fairseq variant lives in the commented-out block further down this file.
# Data files
data_folder: /gpfsscratch/rech/nou/uzn19yk/Libri/LibriSpeech/ # e.g. /path/to/LibriSpeech
# noise/rir dataset will automatically be downloaded
data_folder_rirs: /gpfsscratch/rech/nou/uzn19yk/Libri/LibriSpeech/
# LibriSpeech subsets used for training / validation / testing.
train_splits: [train-clean-100]
dev_splits: [dev-clean]
test_splits: [test-clean, test-other]
skip_prep: false
ckpt_interval_minutes: 25 # save checkpoint every N min
# CSV manifests. NOTE(review): train_csv and valid_csv point at the SAME file —
# confirm this is intentional and not a leftover from local smoke-testing.
csv_folder: /gpfsstore/rech/nou/uzn19yk/iwslt/splitted_clean_tunisian_csvs/
train_csv: test_salah_local.csv
valid_csv: test_salah_local.csv
test_csv:
- test_salah_local.csv
# Training parameters
number_of_epochs: 12
# LR for the Adadelta optimizer over the encoder + CTC head (see
# model_opt_class below, which uses this same value).
lr: 1
# Much smaller LR for fine-tuning the wav2vec2 front-end (see wav2vec_opt_class).
lr_wav2vec: 0.0001
# presumably sorts utterances by duration before batching — confirm in the
# training script
sorting: ascending
auto_mix_prec: false
sample_rate: 16000
# Skip utterances longer than this (presumably seconds — confirm in data prep).
avoid_if_longer_than: 10
# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 3 per GPU to fit 32GB of VRAM
batch_size: 1
test_batch_size: 1
# Dataloader options (batch sizes mirror batch_size / test_batch_size above —
# apparently resolved from !ref when this file was generated).
train_dataloader_opts:
  batch_size: 1
valid_dataloader_opts:
  batch_size: 1
test_dataloader_opts:
  batch_size: 1
# Model parameters
# Anchor id001: activation class, re-used inside `enc` below.
activation: &id001 !name:torch.nn.LeakyReLU
# NOTE(review): dnn_layers / dnn_neurons / freeze_wav2vec here appear to be
# duplicated into `enc` / `wav2vec2` below (refs resolved at generation time);
# they are not aliased anywhere in this resolved file.
dnn_layers: 2
dnn_neurons: 1024
freeze_wav2vec: false
# Outputs
output_neurons: 41 # BPE size, index(blank/eos/bos) = 0
# Decoding parameters
blank_index: 0
bos_index: 1
eos_index: 2
#
# Functions and classes
#
# Anchor id008: epoch counter; `limit` matches number_of_epochs above.
epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 12
# Time-domain augmentation. `speeds` are presumably speed-perturbation factors
# in percent of the original rate — confirm against the SpeechBrain docs.
augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [95, 100, 105]
# Anchor id003: DNN encoder on top of the wav2vec2 features. The last
# input_shape dim (1024) matches dnn_neurons and ctc_lin's input_size below.
enc: &id003 !new:speechbrain.lobes.models.VanillaNN.VanillaNN
  input_shape: [null, null, 1024]
  activation: *id001
  dnn_blocks: 2
  dnn_neurons: 1024
# Anchor id002: HuggingFace wav2vec2-style front-end loaded from a local
# "wavlm-large/" directory. "Partly frozen": the feature extractor is frozen
# (freeze_feature_extractor: true) while the rest is trainable (freeze: false).
wav2vec2: &id002 !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
  source: wavlm-large/
  output_norm: true
  freeze: false
  freeze_feature_extractor: true
  save_path: partly_frozen_splitted_wavlm/save/wav2vec2_hubert_checkpoint
#####
# Uncomment this block if you prefer to use a Fairseq pretrained model instead
# of a HuggingFace one. Here, we provide an URL that is obtained from the
# Fairseq github for the multilingual XLSR.
#
#wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt
#wav2vec2: !new:speechbrain.lobes.models.fairseq_wav2vec.FairseqWav2Vec2
# pretrained_path: !ref <wav2vec2_url>
# output_norm: True
# freeze: False
# save_path: !ref <save_folder>/wav2vec2_checkpoint/model.pt
# Anchor id004: linear projection from the 1024-dim encoder output to the
# 41 CTC units (matches output_neurons above).
ctc_lin: &id004 !new:speechbrain.nnet.linear.Linear
  input_size: 1024
  n_neurons: 41
# Log-softmax over the CTC logits.
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true
# CTC loss; blank index matches blank_index above.
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: 0
# Modules handed to the training script; all are aliases of objects above.
modules:
  wav2vec2: *id002
  enc: *id003
  ctc_lin: *id004
# Anchor id005: the parameters optimized by model_opt_class — encoder (id003)
# plus CTC head (id004). wav2vec2 has its own optimizer (wav2vec_opt_class).
model: &id005 !new:torch.nn.ModuleList
- [*id003, *id004]
# Optimizer class (instantiated by the training script) for the encoder + CTC
# head, i.e. the `model` ModuleList above. `lr` matches the top-level `lr`.
model_opt_class: !name:torch.optim.Adadelta
  lr: 1
  rho: 0.95
  # Canonical float form. The original `1.e-8` (no fraction digits) relies on
  # lenient scalar resolution and is read back as a *string* by stricter
  # YAML 1.2 / JSON-schema loaders; `1.0e-8` is a float everywhere.
  eps: 1.0e-8
# Separate optimizer class for fine-tuning the wav2vec2 front-end; `lr`
# matches the top-level `lr_wav2vec`.
wav2vec_opt_class: !name:torch.optim.Adam
  lr: 0.0001
# Anchor id006: NewBob LR scheduler for the model optimizer
# (initial_value matches `lr` above).
lr_annealing_model: &id006 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 1
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  # presumably the number of epochs to tolerate without improvement before
  # annealing — confirm against the NewBobScheduler docs
  patient: 0
# Anchor id007: NewBob LR scheduler for the wav2vec2 optimizer
# (initial_value matches `lr_wav2vec` above).
lr_annealing_wav2vec: &id007 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 0.0001
  improvement_threshold: 0.0025
  annealing_factor: 0.9
  patient: 0
# Checkpointer: saves/restores everything needed to resume training.
# Every recoverable is an alias of an object defined above.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: partly_frozen_splitted_wavlm/save
  recoverables:
    wav2vec2: *id002
    model: *id005
    scheduler_model: *id006
    scheduler_wav2vec: *id007
    counter: *id008
# Plain-text training log (same path as train_log above).
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: partly_frozen_splitted_wavlm/train_log.txt
# Word-level error-rate metric.
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
# Character-level variant: split_tokens presumably splits each token into
# characters so the same class yields CER — confirm against metric_stats docs.
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true