# ################################
# Model: wav2vec2 + DNN + CTC
# Augmentation: SpecAugment
# Authors: Sung-Lin Yeh 2021
# ################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]
output_folder: partly_frozen_splitted_wavlm/1986/
wer_file: !ref <output_folder>/wer.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# URL for the biggest Fairseq English wav2vec2 model.

# Data files
data_folder: /gpfsscratch/rech/nou/uzn19yk/Libri/LibriSpeech/ # e.g., /path/to/LibriSpeech
# noise/rir dataset will automatically be downloaded
data_folder_rirs: !ref <data_folder>
train_splits: ["train-clean-100"]
dev_splits: ["dev-clean"]
test_splits: ["test-clean", "test-other"]
skip_prep: False
ckpt_interval_minutes: 25 # save checkpoint every N min
csv_folder: /gpfsstore/rech/nou/uzn19yk/iwslt/splitted_clean_tunisian_csvs/
train_csv: test_salah_local.csv
valid_csv: test_salah_local.csv
test_csv:
   - test_salah_local.csv

# Training parameters
number_of_epochs: 12
lr: 1
lr_wav2vec: 0.0001
sorting: ascending
auto_mix_prec: False
sample_rate: 16000
avoid_if_longer_than: 10

# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 3 per GPU to fit 32GB of VRAM
batch_size: 1
test_batch_size: 1

# Dataloader options
train_dataloader_opts:
   batch_size: !ref <batch_size>

valid_dataloader_opts:
   batch_size: !ref <batch_size>

test_dataloader_opts:
   batch_size: !ref <test_batch_size>

# Model parameters
activation: !name:torch.nn.LeakyReLU
dnn_layers: 2
dnn_neurons: 1024
freeze_wav2vec: False

# Outputs
output_neurons: 41  # BPE size, index(blank/eos/bos) = 0

# Decoding parameters
blank_index: 0
bos_index: 1
eos_index: 2

#
# Functions and classes
#
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
   limit: !ref <number_of_epochs>

augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
   sample_rate: !ref <sample_rate>
   speeds: [95, 100, 105]

enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
   input_shape: [null, null, 1024]
   activation: !ref <activation>
   dnn_blocks: !ref <dnn_layers>
   dnn_neurons: !ref <dnn_neurons>

wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
   source: wavlm-large/
   output_norm: True
   freeze: False
   freeze_feature_extractor: True
   save_path: !ref <save_folder>/wav2vec2_hubert_checkpoint

#####
# Uncomment this block if you prefer to use a Fairseq pretrained model instead
# of a HuggingFace one. Here, we provide a URL that is obtained from the
# Fairseq GitHub for the multilingual XLSR.
#
#wav2vec2_url: https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt
#wav2vec2: !new:speechbrain.lobes.models.fairseq_wav2vec.FairseqWav2Vec2
#    pretrained_path: !ref <wav2vec2_url>
#    output_norm: True
#    freeze: False
#    save_path: !ref <save_folder>/wav2vec2_checkpoint/model.pt

ctc_lin: !new:speechbrain.nnet.linear.Linear
   input_size: !ref <dnn_neurons>
   n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
   apply_log: True

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
   blank_index: !ref <blank_index>

modules:
   wav2vec2: !ref <wav2vec2>
   enc: !ref <enc>
   ctc_lin: !ref <ctc_lin>

model: !new:torch.nn.ModuleList
   - [!ref <enc>, !ref <ctc_lin>]

model_opt_class: !name:torch.optim.Adadelta
   lr: !ref <lr>
   rho: 0.95
   eps: 1.e-8

wav2vec_opt_class: !name:torch.optim.Adam
   lr: !ref <lr_wav2vec>

lr_annealing_model: !new:speechbrain.nnet.schedulers.NewBobScheduler
   initial_value: !ref <lr>
   improvement_threshold: 0.0025
   annealing_factor: 0.8
   patient: 0

lr_annealing_wav2vec: !new:speechbrain.nnet.schedulers.NewBobScheduler
   initial_value: !ref <lr_wav2vec>
   improvement_threshold: 0.0025
   annealing_factor: 0.9
   patient: 0

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
   checkpoints_dir: !ref <save_folder>
   recoverables:
      wav2vec2: !ref <wav2vec2>
      model: !ref <model>
      scheduler_model: !ref <lr_annealing_model>
      scheduler_wav2vec: !ref <lr_annealing_wav2vec>
      counter: !ref <epoch_counter>

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
   save_file: !ref <train_log>

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
   split_tokens: True
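
# ------------------------------------------------------------------
# Usage note (a minimal sketch, not part of the recipe itself): this is a
# HyperPyYAML hyperparameter file, normally consumed by the recipe's training
# script rather than run on its own. The commented Python below shows how such
# a script would typically load it; the entry-point name "train.py" and the
# override example are assumptions, while speechbrain.parse_arguments and
# hyperpyyaml.load_hyperpyyaml are the standard calls used by SpeechBrain
# recipes.
#
#   import sys
#   import speechbrain as sb
#   from hyperpyyaml import load_hyperpyyaml
#
#   # e.g. python train.py hparams.yaml --batch_size=2  (hypothetical invocation)
#   hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
#   with open(hparams_file) as fin:
#       hparams = load_hyperpyyaml(fin, overrides)
#
#   # After loading, each top-level key above is an instantiated object or value,
#   # e.g. hparams["wav2vec2"], hparams["model"], hparams["checkpointer"].
# ------------------------------------------------------------------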