# Generated 2022-11-21 from:
# /home/cem/Dropbox/speechbrain-1/recipes/ESC50/classification/hparams/cnn14.yaml
# yamllint disable
# #################################
# Basic training parameters for sound classification using the ESC50 dataset.
# This recipe uses a CNN14 embedding model followed by the ECAPA-TDNN
# classifier head.
#
# Author:
#  * Cem Subakan
#  (based on the SpeechBrain UrbanSound8k recipe)
# #################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 11
__set_seed: !!python/object/apply:torch.manual_seed [11]

# Set up folders for reading from and writing to
# Dataset must already exist at `audio_data_folder`
data_folder: /data2/ESC-50-master  # e.g., /localscratch/UrbanSound8K
open_rir_folder: /RIRS  # Change if needed
audio_data_folder: /data2/ESC-50-master/audio

# TODO the following folder will contain the resampled audio
# files (mono channel and config SR) to train on
#reasmpled_audio_data_folder: !ref /audio_mono16kHz
#
experiment_name: cnn14
output_folder: ./results/cnn14/11
save_folder: ./results/cnn14/11/save
train_log: ./results/cnn14/11/train_log.txt

test_only: false

# Tensorboard logs
use_tensorboard: false
tensorboard_logs_folder: ./results/cnn14/11/tb_logs/

# Path where data manifest files will be stored
train_annotation: /data2/ESC-50-master/manifest/train.json
valid_annotation: /data2/ESC-50-master/manifest/valid.json
test_annotation: /data2/ESC-50-master/manifest/test.json

# To standardize results, UrbanSound8k has pre-separated samples into
# 10 folds for multi-fold validation
# Here folds 1-3 are used for training, fold 4 for validation, fold 5 for test.
train_fold_nums: [1, 2, 3]
valid_fold_nums: [4]
test_fold_nums: [5]
skip_manifest_creation: false

ckpt_interval_minutes: 15  # save checkpoint every N min

# Training parameters
number_of_epochs: 200
batch_size: 32
lr: 0.0002
base_lr: 0.00000001
max_lr: 0.0002
step_size: 65000
sample_rate: 44100

device: cpu

# Feature parameters
n_mels: 80
left_frames: 0
right_frames: 0
deltas: false
amp_to_db: true
normalize: true
use_melspectra: true

# Number of classes
out_n_neurons: 50

# Note that it's actually important to shuffle the data here
# (or at the very least, not sort the data by duration)
# Also note that this does not violate the UrbanSound8k "no-shuffle" policy
# because this does not mix samples from folds in train to valid/test, only
# within train or valid, or test
shuffle: true
dataloader_options:
  batch_size: 32
  shuffle: true
  num_workers: 0

# Functions
# NOTE(review): win_length/hop_length here (20 / 10) differ from the values
# given to compute_stft below (23.2199 / 11.6099) - confirm this is intended.
compute_features: &id003 !new:speechbrain.lobes.features.Fbank
  n_mels: 80
  left_frames: 0
  right_frames: 0
  deltas: false
  sample_rate: 44100
  n_fft: 1024
  win_length: 20
  hop_length: 10

use_pretrain: false

# Embedding network; emb_dim matches the classifier's input_size below.
embedding_model: &id009 !new:speechbrain.lobes.models.Cnn14.Cnn14
  mel_bins: 80
  emb_dim: 2048

# Classification head; out_neurons equals out_n_neurons (number of classes).
classifier: &id010 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
  input_size: 2048
  out_neurons: 50
  lin_blocks: 1

# If you do not want to use the pretrained separator you can simply delete pretrained_separator field.
# NOTE(review): no pretrained_separator field is defined in this file; the
# comment above looks stale - confirm before relying on it.
# limit equals number_of_epochs.
epoch_counter: &id012 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 200

# Definition of the augmentation pipeline.
# If concat_augment = False, the augmentation techniques are applied
# in sequence. If concat_augment = True, all the augmented signals
# are concatenated in a single big batch.
augment_pipeline: []
concat_augment: true

mean_var_norm: &id011 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
  std_norm: false

# pre-processing
n_fft: 1024
spec_mag_power: 0.5
hop_length: 11.6099
win_length: 23.2199

compute_stft: &id001 !new:speechbrain.processing.features.STFT
  n_fft: 1024
  hop_length: 11.6099
  win_length: 23.2199
  sample_rate: 44100

compute_fbank: &id002 !new:speechbrain.processing.features.Filterbank
  n_mels: 80
  n_fft: 1024
  sample_rate: 44100

# Objects exposed under `modules`; each entry aliases an object defined above.
modules:
  compute_stft: *id001
  compute_fbank: *id002
  compute_features: *id003
  embedding_model: *id009
  classifier: *id010
  mean_var_norm: *id011

compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
    margin: 0.2
    scale: 30

# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
  lr: 0.0002
  weight_decay: 0.000002

lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
  base_lr: 0.00000001
  max_lr: 0.0002
  step_size: 65000

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: ./results/cnn14/11/train_log.txt

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
  metric: !name:speechbrain.nnet.losses.classification_error
    reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: ./results/cnn14/11/save
  recoverables:
    embedding_model: *id009
    classifier: *id010
    normalizer: *id011
    counter: *id012

label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder

# Location the pretrained files listed under `pretrainer.paths` are read from.
pretrained_path: speechbrain/cnn14-esc50

# Each entry in `loadables` references an object defined above; the entry with
# the same key in `paths` is the file loaded into it.
# NOTE(review): the `<...>` targets of these !ref tags were missing from the
# file and have been restored from the matching key names - confirm.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    label_encoder: !ref <label_encoder>
  paths:
    embedding_model: !ref <pretrained_path>/embedding_model_esc50ft.ckpt
    classifier: !ref <pretrained_path>/classifier_esc50.ckpt
    label_encoder: !ref <pretrained_path>/label_encoder.txt