# Generated 2023-01-07 from:
# /home/salah/kenlm_train/to_copy/wavlm_partly_frozen.yaml
# yamllint disable
# ################################
# Model: wav2vec2 + DNN + CTC
# Augmentation: SpecAugment
# Authors: Sung-Lin Yeh 2021
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [1986]
output_folder: partly_frozen_splitted_wavlm
wer_file: partly_frozen_splitted_wavlm/wer.txt
save_folder: partly_frozen_splitted_wavlm/save
train_log: partly_frozen_splitted_wavlm/train_log.txt

# Data files
data_folder: /gpfsscratch/rech/nou/uzn19yk/Libri/LibriSpeech/ # e.g., /path/to/LibriSpeech
# noise/rir dataset will automatically be downloaded
data_folder_rirs: /gpfsscratch/rech/nou/uzn19yk/Libri/LibriSpeech/
train_splits: [train-clean-100]
dev_splits: [dev-clean]
test_splits: [test-clean, test-other]
skip_prep: false
ckpt_interval_minutes: 25 # save checkpoint every N min
csv_folder: /gpfsstore/rech/nou/uzn19yk/iwslt/splitted_clean_tunisian_csvs/
train_csv: test_salah_local.csv
valid_csv: test_salah_local.csv
test_csv:
- test_salah_local.csv

# Training parameters
number_of_epochs: 12
lr: 1
lr_wav2vec: 0.0001
sorting: ascending
auto_mix_prec: false
sample_rate: 16000
avoid_if_longer_than: 10

# With data_parallel, batch_size is split across N jobs.
# With DDP, batch_size is multiplied by N jobs.
# Must be at most 3 per GPU to fit in 32GB of VRAM.
batch_size: 1
test_batch_size: 1

# Dataloader options
train_dataloader_opts:
  batch_size: 1

valid_dataloader_opts:
  batch_size: 1

test_dataloader_opts:
  batch_size: 1

# Model parameters
activation: &id001 !name:torch.nn.LeakyReLU
dnn_layers: 2
dnn_neurons: 1024
freeze_wav2vec: false

# Outputs
output_neurons: 41 # BPE size, index(blank/eos/bos) = 0

# Decoding parameters
blank_index: 0
bos_index: 1
eos_index: 2

#
# Functions and classes
#
epoch_counter: &id008 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 12

augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [95, 100, 105]

enc: &id003 !new:speechbrain.lobes.models.VanillaNN.VanillaNN
  input_shape: [null, null, 1024]
  activation: *id001
  dnn_blocks: 2
  dnn_neurons: 1024

wav2vec2: &id002 !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
  source: wavlm-large/
  output_norm: true
  freeze: false
  freeze_feature_extractor: true
  save_path: partly_frozen_splitted_wavlm/save/wav2vec2_hubert_checkpoint
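# Note: `source:` above points at a local copy of the WavLM checkpoint. As a
# sketch (the hub id below is an assumption about the intended model, not part
# of the original run), `source:` can instead name the HuggingFace hub id so
# the checkpoint is downloaded automatically:
#
# wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
#   source: microsoft/wavlm-large  # hub id (assumed); requires network access
#   output_norm: true
#   freeze: false
#   freeze_feature_extractor: true
#   save_path: partly_frozen_splitted_wavlm/save/wav2vec2_hubert_checkpoint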
#####
# Uncomment this block if you prefer to use a Fairseq pretrained model instead
# of a HuggingFace one. Here, we provide a URL obtained from the Fairseq
# GitHub.
#
# wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt
# wav2vec2: !new:speechbrain.lobes.models.fairseq_wav2vec.FairseqWav2Vec2
#   pretrained_path: !ref <wav2vec2_url>
#   output_norm: True
#   freeze: False
#   save_path: !ref <save_folder>/wav2vec2_checkpoint/model.pt
#####

ctc_lin: &id004 !new:speechbrain.nnet.linear.Linear
  input_size: 1024
  n_neurons: 41

log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: 0

modules:
  wav2vec2: *id002
  enc: *id003
  ctc_lin: *id004

model: &id005 !new:torch.nn.ModuleList
- [*id003, *id004]

model_opt_class: !name:torch.optim.Adadelta
  lr: 1
  rho: 0.95
  eps: 1.e-8

wav2vec_opt_class: !name:torch.optim.Adam
  lr: 0.0001

lr_annealing_model: &id006 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 1
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 0

lr_annealing_wav2vec: &id007 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 0.0001
  improvement_threshold: 0.0025
  annealing_factor: 0.9
  patient: 0

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: partly_frozen_splitted_wavlm/save
  recoverables:
    wav2vec2: *id002
    model: *id005
    scheduler_model: *id006
    scheduler_wav2vec: *id007
    counter: *id008

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: partly_frozen_splitted_wavlm/train_log.txt

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true
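#####
# Reading note: this file is a generated dump of the source YAML named in the
# header, so HyperPyYAML references have been resolved to literal values and
# anchors renamed to &idNNN. In the hand-written source file the repeated
# paths are typically expressed with !ref instead; a minimal sketch of that
# pre-generation form (an illustration, not part of this dump):
#
# output_folder: partly_frozen_splitted_wavlm
# wer_file: !ref <output_folder>/wer.txt
# save_folder: !ref <output_folder>/save
# train_log: !ref <output_folder>/train_log.txt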