Files changed (1)
  1. hyperparams.yaml +12 -118
hyperparams.yaml CHANGED
@@ -1,77 +1,12 @@
- # Generated 2023-06-20 from:
- # /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/Enhancement/fine-tuning/hparams/sepformer_16k.yaml
- # yamllint disable
+
  # ################################
  # Model: SepFormer for source separation
  # https://arxiv.org/abs/2010.13154
- #
- # Author: Sangeet Sagar 2022
  # Dataset : RescueSpeech
  # ################################
 
- # Basic parameters
- # Seed needs to be set at top of yaml, before objects with parameters are made
- seed: 8201
- __set_seed: !apply:torch.manual_seed [8201]
- experiment_name: sepformer-enhancement
- output_folder: results/sepformer-enhancement/8201
- train_log: results/sepformer-enhancement/8201/train_log.txt
- save_folder: results/sepformer-enhancement/8201/save
-
- # Dataset prep parameters
- data_folder: dataset/audio_sythesis/Task_enhancement/ # !PLACEHOLDER
- csv_dir: csv_files
- train_csv: csv_files/train.csv
- valid_csv: csv_files/dev.csv
- test_csv: csv_files/test.csv
- skip_prep: false
  sample_rate: 16000
- task: enhance
-
- dereverberate: false
- shuffle_train_data: true
-
- # Pretrained models
- pretrained_model_path:
-     /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/pre-trained/sepformer_dns_16k # !PLACEHOLDER # sepformer_dns_16k model
-
- # Basic parameters
- use_tensorboard: false
- tensorboard_logs: results/sepformer-enhancement/8201/logs/
-
- # Experiment params
- auto_mix_prec: true # Set it to True for mixed precision
- test_only: false
  num_spks: 1
- noprogressbar: false
- save_audio: true # Save estimated sources on disk
- downsample: false
- n_audio_to_save: 500
-
- # Training parameters
- N_epochs: 150
- batch_size: 1
- batch_size_test: 1
- lr: 0.00015
- clip_grad_norm: 5
- loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
- # if True, the training sequences are cut to a specified length
- limit_training_signal_len: false
- # this is the length of sequences if we choose to limit
- # the signal length of training sequences
- training_signal_len: 32000
- ckpt_interval_minutes: 60
-
- # Parameters for data augmentation
- use_wavedrop: false
- use_speedperturb: true
- use_rand_shift: false
- min_shift: -8000
- max_shift: 8000
-
- # loss thresholding -- this thresholds the training loss
- threshold_byloss: true
- threshold: -30
 
  # Encoder parameters
  N_encoder_out: 256
@@ -79,25 +14,12 @@ out_channels: 256
  kernel_size: 16
  kernel_stride: 8
 
- # Dataloader options
- dataloader_opts:
-     batch_size: 1
-     num_workers: 3
-
- dataloader_opts_valid:
-     batch_size: 1
-     num_workers: 3
-
- dataloader_opts_test:
-     batch_size: 1
-     num_workers: 3
-
  # Specifying the network
- Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder
+ Encoder: !new:speechbrain.lobes.models.dual_path.Encoder
      kernel_size: 16
      out_channels: 256
 
- SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+ SBtfintra: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
      num_layers: 8
      d_model: 256
      nhead: 8
@@ -106,7 +28,7 @@ SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
      use_positional_encoding: true
      norm_before: true
 
- SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+ SBtfinter: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
      num_layers: 8
      d_model: 256
      nhead: 8
@@ -115,58 +37,30 @@ SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
      use_positional_encoding: true
      norm_before: true
 
- MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
-
+ MaskNet: !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
      num_spks: 1
      in_channels: 256
      out_channels: 256
      num_layers: 2
      K: 250
-     intra_model: *id001
-     inter_model: *id002
+     intra_model: !ref <SBtfintra>
+     inter_model: !ref <SBtfinter>
      norm: ln
      linear_layer_after_inter_intra: false
      skip_around_intra: true
 
- Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder
+ Decoder: !new:speechbrain.lobes.models.dual_path.Decoder
      in_channels: 256
      out_channels: 1
      kernel_size: 16
      stride: 8
      bias: false
 
- optimizer: !name:torch.optim.Adam
-     lr: 0.00015
-     weight_decay: 0
-
- loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
-
- lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
-
-     factor: 0.5
-     patience: 2
-     dont_halve_until_epoch: 85
-
- epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter
-     limit: 150
-
  modules:
-     encoder: *id003
-     decoder: *id004
-     masknet: *id005
- save_all_checkpoints: false
- checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
-     checkpoints_dir: results/sepformer-enhancement/8201/save
-     recoverables:
-         encoder: *id003
-         decoder: *id004
-         masknet: *id005
-         counter: *id006
-         lr_scheduler: *id007
- train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
-     save_file: results/sepformer-enhancement/8201/train_log.txt
-
- ## Uncomment if you wish to fine-tune a pre-trained model.
+     encoder: !ref <Encoder>
+     decoder: !ref <Decoder>
+     masknet: !ref <MaskNet>
+
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
      loadables:
          encoder: !ref <Encoder>
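
The diff strips all training-only hyperparameters (seed, paths, optimizer, scheduler, checkpointer, dataloaders) and replaces the dumped YAML anchors/aliases (`&id003`/`*id003`) with HyperPyYAML `!ref <...>` references, leaving an inference-only model definition. A minimal sketch of how such a file is typically consumed (not part of this PR; the file name is an assumption, and the `pretrainer:` block shown in the diff is truncated, so the real file may declare its checkpoint paths differently):

```python
# Sketch: load the slimmed-down hyperparams and pull pretrained weights.
from hyperpyyaml import load_hyperpyyaml

with open("hyperparams.yaml") as f:  # assumed file name
    hparams = load_hyperpyyaml(f)

# `!new:` already instantiated the modules and `!ref <...>` resolved the
# cross-references, so these are live torch.nn.Module objects:
encoder = hparams["Encoder"]
masknet = hparams["MaskNet"]
decoder = hparams["Decoder"]

# Fetch and load the weights declared under `pretrainer:` (assumes the full
# YAML also lists paths/sources for each loadable, per the truncated hunk).
pretrainer = hparams["pretrainer"]
pretrainer.collect_files()   # gathers the checkpoint file per loadable
pretrainer.load_collected()  # loads them into the modules above
```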
 
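For orientation, a sketch of how the three modules kept in this file compose at inference time, following the usual SpeechBrain SepFormer recipe (encode, mask, decode). It assumes `hparams` was loaded as above; the shapes in comments reflect the standard `dual_path` module interfaces:

```python
# Sketch: enhancement forward pass with num_spks = 1.
import torch

mix = torch.randn(1, 16000)  # assumed input: 1 s of 16 kHz noisy speech [B, T]

encoder = hparams["Encoder"]
masknet = hparams["MaskNet"]
decoder = hparams["Decoder"]

with torch.no_grad():
    mix_w = encoder(mix)                    # [B, N_encoder_out, L] latent frames
    est_mask = masknet(mix_w)               # [num_spks, B, N_encoder_out, L]
    sep_h = mix_w.unsqueeze(0) * est_mask   # apply the estimated mask
    est_clean = decoder(sep_h[0])           # [B, T'] enhanced waveform

# The encoder/decoder convolutions can change the signal length slightly,
# so recipes pad or trim est_clean back to mix.size(1) before scoring.
```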