File size: 4,423 Bytes
e63fe3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Generated 2023-01-07 from:
# /home/salah/kenlm_train/to_copy/wavlm_partly_frozen.yaml
# yamllint disable
# ################################
# Model: wav2vec2-style encoder (WavLM, see `wav2vec2.source`) + DNN + CTC
# Augmentation: SpecAugment
# Authors: Sung-Lin Yeh 2021
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made.
# The value 1986 is written twice (here and in the torch.manual_seed argument
# list); change both together.
seed: 1986
__set_seed: !apply:torch.manual_seed [1986]
# Experiment output locations. wer_file, save_folder and train_log are literal
# paths that start with the output_folder value; this generated file contains
# no references, so renaming output_folder does not move them. The same prefix
# is repeated further down (wav2vec2.save_path, checkpointer.checkpoints_dir,
# train_logger.save_file).
output_folder: partly_frozen_splitted_wavlm
wer_file: partly_frozen_splitted_wavlm/wer.txt
save_folder: partly_frozen_splitted_wavlm/save
train_log: partly_frozen_splitted_wavlm/train_log.txt

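# URL for the biggest Fairseq English wav2vec2 model.
# NOTE: no URL key follows this comment in this generated file; the encoder
# that is actually loaded is configured under `wav2vec2.source` below.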

# Data files
# Corpus root, e.g. /path/to/LibriSpeech
data_folder: /gpfsscratch/rech/nou/uzn19yk/Libri/LibriSpeech/
# The noise/RIR dataset will be downloaded automatically.
data_folder_rirs: /gpfsscratch/rech/nou/uzn19yk/Libri/LibriSpeech/
# Split names, one list per stage.
train_splits:
- train-clean-100
dev_splits:
- dev-clean
test_splits:
- test-clean
- test-other
skip_prep: false
# Save a checkpoint every N minutes.
ckpt_interval_minutes: 25
# CSV manifests.
# NOTE(review): train, valid and test all name the same file -- confirm this
# is intended and not a leftover from a local smoke test.
csv_folder: /gpfsstore/rech/nou/uzn19yk/iwslt/splitted_clean_tunisian_csvs/
train_csv: test_salah_local.csv
valid_csv: test_salah_local.csv
test_csv:
- test_salah_local.csv

# Training parameters
# This file is a generated dump with values written out literally, so several
# numbers below are repeated elsewhere and must be edited in both places:
#   number_of_epochs -> epoch_counter.limit
#   lr               -> model_opt_class.lr, lr_annealing_model.initial_value
#   lr_wav2vec       -> wav2vec_opt_class.lr, lr_annealing_wav2vec.initial_value
#   sample_rate      -> augmentation.sample_rate
number_of_epochs: 12
lr: 1
lr_wav2vec: 0.0001
sorting: ascending
auto_mix_prec: false
sample_rate: 16000

# Length cut-off for utterances (presumably in seconds -- TODO confirm against
# the training script that reads it).
avoid_if_longer_than: 10
# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 3 per GPU to fit 32GB of VRAM
# NOTE(review): the line above says 3, but batch_size is set to 1 here (and in
# every *_dataloader_opts mapping below) -- confirm which is intended.
batch_size: 1
test_batch_size: 1

# Dataloader options
# One single-entry mapping per split, written inline; each batch_size repeats
# the literal value of batch_size / test_batch_size above.
train_dataloader_opts: {batch_size: 1}

valid_dataloader_opts: {batch_size: 1}

test_dataloader_opts: {batch_size: 1}

# Model parameters
# The &id001 anchor is aliased by enc.activation below; keep the anchor name
# in sync with that alias.
activation: &id001 !name:torch.nn.LeakyReLU
# Same literals as enc.dnn_blocks (2) and enc.dnn_neurons (1024).
dnn_layers: 2
dnn_neurons: 1024
# Same literal as wav2vec2.freeze (false).
freeze_wav2vec: false

# Outputs
# Same literal as ctc_lin.n_neurons (41).
output_neurons: 41  # BPE size; index 0 is the blank (see blank_index below)

# Decoding parameters
# Only blank is index 0; bos and eos have their own indices (1 and 2).
# blank_index is repeated literally in ctc_cost.blank_index.
blank_index: 0
bos_index: 1
eos_index: 2

#
# Functions and classes
#
epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter
  # Same literal as number_of_epochs above; &id008 is aliased by
  # checkpointer.recoverables.counter.
  limit: 12

augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  # Speed factors handed to the augmenter (presumably percentages of the
  # original speed -- TODO confirm against TimeDomainSpecAugment).
  speeds:
  - 95
  - 100
  - 105
  # Same literal as the top-level sample_rate.
  sample_rate: 16000

enc: &id003 !new:speechbrain.lobes.models.VanillaNN.VanillaNN
  # Depth and width repeat the top-level dnn_layers / dnn_neurons literals.
  dnn_blocks: 2
  dnn_neurons: 1024
  # Alias of the top-level `activation` entry (&id001).
  activation: *id001
  # Last dimension is 1024, the same literal as ctc_lin.input_size.
  input_shape: [null, null, 1024]

wav2vec2: &id002 !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
  # Where the pretrained encoder comes from and where it is stored.
  # NOTE(review): `wavlm-large/` looks like a relative local directory --
  # confirm it resolves from the working directory used at launch.
  source: wavlm-large/
  save_path: partly_frozen_splitted_wavlm/save/wav2vec2_hubert_checkpoint
  # freeze is false while freeze_feature_extractor is true, matching the
  # "partly frozen" naming of this experiment.
  freeze_feature_extractor: true
  freeze: false
  output_norm: true

#####
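# Uncomment this block if you prefer to use a Fairseq pretrained model instead
# of a HuggingFace one. Here, we provide a URL obtained from the Fairseq
# GitHub repository for the multilingual XLSR.
# NOTE(review): the URL below is named wav2vec_vox_960h_pl.pt, which does not
# look like an XLSR checkpoint -- confirm it is the intended model.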
#
#wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt
#wav2vec2: !new:speechbrain.lobes.models.fairseq_wav2vec.FairseqWav2Vec2
#    pretrained_path: !ref <wav2vec2_url>
#    output_norm: True
#    freeze: False
#    save_path: !ref <save_folder>/wav2vec2_checkpoint/model.pt

ctc_lin: &id004 !new:speechbrain.nnet.linear.Linear
  # n_neurons repeats output_neurons (41); input_size repeats the encoder
  # width dnn_neurons (1024).
  n_neurons: 41
  input_size: 1024

# Softmax activation object created with apply_log enabled (presumably so it
# outputs log-probabilities for the CTC loss -- confirm against
# speechbrain.nnet.activations.Softmax).
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true

# CTC loss function entry; blank_index repeats the top-level blank_index (0).
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: 0

# Named collection of the three network objects, each given as an alias to its
# definition above (&id002 wav2vec2, &id003 enc, &id004 ctc_lin).
modules:
  wav2vec2: *id002
  enc: *id003
  ctc_lin: *id004
# ModuleList holding enc and ctc_lin only; wav2vec2 is not part of it. This is
# the object registered as `model` in checkpointer.recoverables, while
# wav2vec2 is registered there separately. The single sequence item is itself
# a list of the two aliases.
model: &id005 !new:torch.nn.ModuleList
- [*id003, *id004]
# Adadelta optimizer settings (presumably applied to the `model` parameters
# by the training script -- confirm there).
model_opt_class: !name:torch.optim.Adadelta
  rho: 0.95
  eps: 1.e-8
  # Same literal as the top-level lr and lr_annealing_model.initial_value.
  lr: 1

# Adam optimizer settings (presumably applied to the wav2vec2 encoder by the
# training script -- confirm there). lr repeats the top-level lr_wav2vec and
# lr_annealing_wav2vec.initial_value.
wav2vec_opt_class: !name:torch.optim.Adam
  lr: 0.0001

# Learning-rate scheduler paired with model_opt_class; &id006 is aliased by
# checkpointer.recoverables.scheduler_model.
lr_annealing_model: &id006 !new:speechbrain.nnet.schedulers.NewBobScheduler
  # Starts from the same literal as the top-level lr.
  initial_value: 1
  annealing_factor: 0.8
  improvement_threshold: 0.0025
  patient: 0

# Learning-rate scheduler paired with wav2vec_opt_class; &id007 is aliased by
# checkpointer.recoverables.scheduler_wav2vec.
lr_annealing_wav2vec: &id007 !new:speechbrain.nnet.schedulers.NewBobScheduler
  # Starts from the same literal as the top-level lr_wav2vec.
  initial_value: 0.0001
  annealing_factor: 0.9
  improvement_threshold: 0.0025
  patient: 0


# Checkpointer writing into the same directory as save_folder. `recoverables`
# maps a checkpoint entry name to an object defined above, each given by alias:
#   wav2vec2 -> &id002, model -> &id005 (enc + ctc_lin),
#   scheduler_model -> &id006, scheduler_wav2vec -> &id007,
#   counter -> &id008 (epoch_counter).
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: partly_frozen_splitted_wavlm/save
  recoverables:
    wav2vec2: *id002
    model: *id005
    scheduler_model: *id006
    scheduler_wav2vec: *id007
    counter: *id008
# File-based training logger; save_file is the same literal path as the
# top-level train_log.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: partly_frozen_splitted_wavlm/train_log.txt

# Two entries built on the same ErrorRateStats class. error_rate_computer
# passes no arguments; cer_computer differs only by split_tokens: true
# (presumably giving a character-level rate -- confirm against ErrorRateStats).
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true