shikhar7ssu committed on
Commit
c28a0ac
1 Parent(s): db0d98c

Update model

Browse files
README.md ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - automatic-speech-recognition
6
+ language: en
7
+ datasets:
8
+ - esc50
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 ASR model
13
+
14
+ ### `shikhar7ssu/BEATs-ESC-FinetunedFold1`
15
+
16
+ This model was trained by Shikhar Bharadwaj using the esc50 recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout ca9ca1ef8bc86753238ca7a0de05d87b8f57abb3
26
+ pip install -e .
27
+ cd egs2/esc50/asr1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model shikhar7ssu/BEATs-ESC-FinetunedFold1
29
+ ```
30
+
31
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
32
+ # RESULTS
33
+ ## Environments
34
+ - date: `Sat Dec 14 19:04:56 EST 2024`
35
+ - python version: `3.9.20 (main, Oct 3 2024, 07:27:41) [GCC 11.2.0]`
36
+ - espnet version: `espnet 202412`
37
+ - pytorch version: `pytorch 2.4.0`
38
+ - Git hash: `cb80e61a15d6a13dc342ae5a413d2b870dd869c6`
39
+ - Commit date: `Fri Dec 13 11:57:16 2024 -0500`
40
+
41
+ ## /compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/inference_ctc_weight0.0_maxlenratio-1_asr_model_valid.acc.best
42
+ ### WER
43
+
44
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
45
+ |---|---|---|---|---|---|---|---|---|
46
+ |org/val1|400|400|94.3|5.8|0.0|0.0|5.8|5.8|
47
+
48
+ ### CER
49
+
50
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
51
+ |---|---|---|---|---|---|---|---|---|
52
+ |org/val1|400|5520|99.4|0.5|0.1|0.1|0.7|5.8|
53
+
54
+ ### TER
55
+
56
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
57
+ |---|---|---|---|---|---|---|---|---|
58
+
59
+ ## ASR config
60
+
61
+ <details><summary>expand</summary>
62
+
63
+ ```
64
+ config: conf/beats_classification.yaml
65
+ print_config: false
66
+ log_level: INFO
67
+ drop_last_iter: false
68
+ dry_run: false
69
+ iterator_type: sequence
70
+ valid_iterator_type: null
71
+ output_dir: /compute/babel-13-33/sbharad2/expdir/asr_fast.fold1
72
+ ngpu: 1
73
+ seed: 0
74
+ num_workers: 2
75
+ num_att_plot: 0
76
+ dist_backend: nccl
77
+ dist_init_method: env://
78
+ dist_world_size: null
79
+ dist_rank: null
80
+ local_rank: 0
81
+ dist_master_addr: null
82
+ dist_master_port: null
83
+ dist_launcher: null
84
+ multiprocessing_distributed: false
85
+ unused_parameters: true
86
+ sharded_ddp: false
87
+ use_deepspeed: false
88
+ deepspeed_config: null
89
+ cudnn_enabled: true
90
+ cudnn_benchmark: false
91
+ cudnn_deterministic: true
92
+ use_tf32: false
93
+ collect_stats: false
94
+ write_collected_feats: false
95
+ max_epoch: 1000
96
+ patience: null
97
+ val_scheduler_criterion:
98
+ - valid
99
+ - loss
100
+ early_stopping_criterion:
101
+ - valid
102
+ - loss
103
+ - min
104
+ best_model_criterion:
105
+ - - valid
106
+ - acc
107
+ - max
108
+ keep_nbest_models: 1
109
+ nbest_averaging_interval: 0
110
+ grad_clip: 1
111
+ grad_clip_type: 2.0
112
+ grad_noise: false
113
+ accum_grad: 1
114
+ no_forward_run: false
115
+ resume: true
116
+ train_dtype: float32
117
+ use_amp: false
118
+ log_interval: null
119
+ use_matplotlib: true
120
+ use_tensorboard: true
121
+ create_graph_in_tensorboard: false
122
+ use_wandb: true
123
+ wandb_project: BEATs-ESC
124
+ wandb_id: null
125
+ wandb_entity: shikhar
126
+ wandb_name: fast.fold1
127
+ wandb_model_log_interval: 0
128
+ detect_anomaly: false
129
+ use_adapter: false
130
+ adapter: lora
131
+ save_strategy: all
132
+ adapter_conf: {}
133
+ pretrain_path: null
134
+ init_param: []
135
+ ignore_init_mismatch: false
136
+ freeze_param: []
137
+ num_iters_per_epoch: null
138
+ batch_size: 128
139
+ valid_batch_size: null
140
+ batch_bins: 1000000
141
+ valid_batch_bins: null
142
+ category_sample_size: 10
143
+ train_shape_file:
144
+ - /compute/babel-13-33/sbharad2/expdir/asr_stats_raw_1_word/train/speech_shape
145
+ - /compute/babel-13-33/sbharad2/expdir/asr_stats_raw_1_word/train/text_shape.word
146
+ valid_shape_file:
147
+ - /compute/babel-13-33/sbharad2/expdir/asr_stats_raw_1_word/valid/speech_shape
148
+ - /compute/babel-13-33/sbharad2/expdir/asr_stats_raw_1_word/valid/text_shape.word
149
+ batch_type: folded
150
+ valid_batch_type: null
151
+ fold_length:
152
+ - 100000
153
+ - 150
154
+ sort_in_batch: descending
155
+ shuffle_within_batch: false
156
+ sort_batch: descending
157
+ multiple_iterator: false
158
+ chunk_length: 500
159
+ chunk_shift_ratio: 0.5
160
+ num_cache_chunks: 1024
161
+ chunk_excluded_key_prefixes: []
162
+ chunk_default_fs: null
163
+ chunk_max_abs_length: null
164
+ chunk_discard_short_samples: true
165
+ train_data_path_and_name_and_type:
166
+ - - /compute/babel-13-33/sbharad2/dumpdir/raw/train1/wav.scp
167
+ - speech
168
+ - sound
169
+ - - /compute/babel-13-33/sbharad2/dumpdir/raw/train1/text
170
+ - text
171
+ - text
172
+ valid_data_path_and_name_and_type:
173
+ - - /compute/babel-13-33/sbharad2/dumpdir/raw/val1/wav.scp
174
+ - speech
175
+ - sound
176
+ - - /compute/babel-13-33/sbharad2/dumpdir/raw/val1/text
177
+ - text
178
+ - text
179
+ multi_task_dataset: false
180
+ allow_variable_data_keys: false
181
+ max_cache_size: 0.0
182
+ max_cache_fd: 32
183
+ allow_multi_rates: false
184
+ valid_max_cache_size: null
185
+ exclude_weight_decay: false
186
+ exclude_weight_decay_conf: {}
187
+ optim: adamw
188
+ optim_conf:
189
+ lr: 0.0001
190
+ weight_decay: 0.01
191
+ betas:
192
+ - 0.9
193
+ - 0.98
194
+ scheduler: cosineannealingwarmuprestarts
195
+ scheduler_conf:
196
+ first_cycle_steps: 6000
197
+ warmup_steps: 300
198
+ max_lr: 0.0001
199
+ min_lr: 5.0e-06
200
+ token_list:
201
+ - <blank>
202
+ - <unk>
203
+ - audio_class:0
204
+ - audio_class:14
205
+ - audio_class:36
206
+ - audio_class:19
207
+ - audio_class:30
208
+ - audio_class:34
209
+ - audio_class:9
210
+ - audio_class:22
211
+ - audio_class:48
212
+ - audio_class:41
213
+ - audio_class:47
214
+ - audio_class:31
215
+ - audio_class:17
216
+ - audio_class:45
217
+ - audio_class:8
218
+ - audio_class:15
219
+ - audio_class:46
220
+ - audio_class:37
221
+ - audio_class:32
222
+ - audio_class:16
223
+ - audio_class:25
224
+ - audio_class:4
225
+ - audio_class:3
226
+ - audio_class:27
227
+ - audio_class:43
228
+ - audio_class:12
229
+ - audio_class:40
230
+ - audio_class:29
231
+ - audio_class:10
232
+ - audio_class:7
233
+ - audio_class:26
234
+ - audio_class:6
235
+ - audio_class:44
236
+ - audio_class:23
237
+ - audio_class:20
238
+ - audio_class:49
239
+ - audio_class:24
240
+ - audio_class:39
241
+ - audio_class:28
242
+ - audio_class:18
243
+ - audio_class:2
244
+ - audio_class:35
245
+ - audio_class:38
246
+ - audio_class:21
247
+ - audio_class:1
248
+ - audio_class:11
249
+ - audio_class:42
250
+ - audio_class:5
251
+ - audio_class:33
252
+ - audio_class:13
253
+ - <sos/eos>
254
+ init: xavier_normal
255
+ input_size: 1
256
+ ctc_conf:
257
+ dropout_rate: 0.0
258
+ ctc_type: builtin
259
+ reduce: true
260
+ ignore_nan_grad: null
261
+ zero_infinity: true
262
+ brctc_risk_strategy: exp
263
+ brctc_group_strategy: end
264
+ brctc_risk_factor: 0.0
265
+ joint_net_conf: null
266
+ use_preprocessor: true
267
+ use_lang_prompt: false
268
+ use_nlp_prompt: false
269
+ token_type: word
270
+ bpemodel: null
271
+ non_linguistic_symbols: null
272
+ cleaner: null
273
+ g2p: null
274
+ speech_volume_normalize: null
275
+ rir_scp: null
276
+ rir_apply_prob: 1.0
277
+ noise_scp: null
278
+ noise_apply_prob: 1.0
279
+ noise_db_range: '13_15'
280
+ short_noise_thres: 0.5
281
+ aux_ctc_tasks: []
282
+ frontend: null
283
+ frontend_conf: {}
284
+ specaug: null
285
+ specaug_conf: {}
286
+ normalize: null
287
+ normalize_conf: {}
288
+ model: espnet
289
+ model_conf:
290
+ ctc_weight: 0.0
291
+ lsm_weight: 0.1
292
+ length_normalized_loss: true
293
+ preencoder: null
294
+ preencoder_conf: {}
295
+ encoder: beats
296
+ encoder_conf:
297
+ beats_ckpt_path: /compute/babel-13-33/sbharad2/models/BEATs/BEATs_iter3.pt
298
+ fbank_mean: 11.72215
299
+ fbank_std: 10.60431
300
+ beats_config:
301
+ layer_wise_gradient_decay_ratio: 0.2
302
+ encoder_layerdrop: 0.1
303
+ dropout: 0.0
304
+ specaug_config:
305
+ apply_time_warp: true
306
+ apply_freq_mask: false
307
+ freq_mask_width_range:
308
+ - 0
309
+ - 32
310
+ num_freq_mask: 1
311
+ apply_time_mask: true
312
+ time_mask_width_ratio_range:
313
+ - 0
314
+ - 0.06
315
+ num_time_mask: 1
316
+ roll_augment: true
317
+ roll_interval: 16000
318
+ use_weighted_representation: false
319
+ postencoder: null
320
+ postencoder_conf: {}
321
+ decoder: linear_decoder
322
+ decoder_conf:
323
+ pooling: mean
324
+ dropout: 0.1
325
+ preprocessor: default
326
+ preprocessor_conf: {}
327
+ required:
328
+ - output_dir
329
+ - token_list
330
+ version: '202412'
331
+ distributed: false
332
+ ```
333
+
334
+ </details>
335
+
336
+
337
+
338
+ ### Citing ESPnet
339
+
340
+ ```bibtex
341
+ @inproceedings{watanabe2018espnet,
342
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
343
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
344
+ year={2018},
345
+ booktitle={Proceedings of Interspeech},
346
+ pages={2207--2211},
347
+ doi={10.21437/Interspeech.2018-1456},
348
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
349
+ }
350
+
351
+
352
+
353
+
354
+
355
+
356
+ ```
357
+
358
+ or arXiv:
359
+
360
+ ```bibtex
361
+ @misc{watanabe2018espnet,
362
+ title={ESPnet: End-to-End Speech Processing Toolkit},
363
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
364
+ year={2018},
365
+ eprint={1804.00015},
366
+ archivePrefix={arXiv},
367
+ primaryClass={cs.CL}
368
+ }
369
+ ```
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/945epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0f96d6f1268557c666922cded00ac6784eea480a747299aaaccd147119952ef
3
+ size 361495813
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/RESULTS.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by scripts/utils/show_asr_result.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Sat Dec 14 19:04:56 EST 2024`
5
+ - python version: `3.9.20 (main, Oct 3 2024, 07:27:41) [GCC 11.2.0]`
6
+ - espnet version: `espnet 202412`
7
+ - pytorch version: `pytorch 2.4.0`
8
+ - Git hash: `cb80e61a15d6a13dc342ae5a413d2b870dd869c6`
9
+ - Commit date: `Fri Dec 13 11:57:16 2024 -0500`
10
+
11
+ ## /compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/inference_ctc_weight0.0_maxlenratio-1_asr_model_valid.acc.best
12
+ ### WER
13
+
14
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
15
+ |---|---|---|---|---|---|---|---|---|
16
+ |org/val1|400|400|94.3|5.8|0.0|0.0|5.8|5.8|
17
+
18
+ ### CER
19
+
20
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
21
+ |---|---|---|---|---|---|---|---|---|
22
+ |org/val1|400|5520|99.4|0.5|0.1|0.1|0.7|5.8|
23
+
24
+ ### TER
25
+
26
+ |dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
27
+ |---|---|---|---|---|---|---|---|---|
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/config.yaml ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/beats_classification.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: /compute/babel-13-33/sbharad2/expdir/asr_fast.fold1
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 2
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ cudnn_enabled: true
27
+ cudnn_benchmark: false
28
+ cudnn_deterministic: true
29
+ use_tf32: false
30
+ collect_stats: false
31
+ write_collected_feats: false
32
+ max_epoch: 1000
33
+ patience: null
34
+ val_scheduler_criterion:
35
+ - valid
36
+ - loss
37
+ early_stopping_criterion:
38
+ - valid
39
+ - loss
40
+ - min
41
+ best_model_criterion:
42
+ - - valid
43
+ - acc
44
+ - max
45
+ keep_nbest_models: 1
46
+ nbest_averaging_interval: 0
47
+ grad_clip: 1
48
+ grad_clip_type: 2.0
49
+ grad_noise: false
50
+ accum_grad: 1
51
+ no_forward_run: false
52
+ resume: true
53
+ train_dtype: float32
54
+ use_amp: false
55
+ log_interval: null
56
+ use_matplotlib: true
57
+ use_tensorboard: true
58
+ create_graph_in_tensorboard: false
59
+ use_wandb: true
60
+ wandb_project: BEATs-ESC
61
+ wandb_id: null
62
+ wandb_entity: shikhar
63
+ wandb_name: fast.fold1
64
+ wandb_model_log_interval: 0
65
+ detect_anomaly: false
66
+ use_adapter: false
67
+ adapter: lora
68
+ save_strategy: all
69
+ adapter_conf: {}
70
+ pretrain_path: null
71
+ init_param: []
72
+ ignore_init_mismatch: false
73
+ freeze_param: []
74
+ num_iters_per_epoch: null
75
+ batch_size: 128
76
+ valid_batch_size: null
77
+ batch_bins: 1000000
78
+ valid_batch_bins: null
79
+ category_sample_size: 10
80
+ train_shape_file:
81
+ - /compute/babel-13-33/sbharad2/expdir/asr_stats_raw_1_word/train/speech_shape
82
+ - /compute/babel-13-33/sbharad2/expdir/asr_stats_raw_1_word/train/text_shape.word
83
+ valid_shape_file:
84
+ - /compute/babel-13-33/sbharad2/expdir/asr_stats_raw_1_word/valid/speech_shape
85
+ - /compute/babel-13-33/sbharad2/expdir/asr_stats_raw_1_word/valid/text_shape.word
86
+ batch_type: folded
87
+ valid_batch_type: null
88
+ fold_length:
89
+ - 100000
90
+ - 150
91
+ sort_in_batch: descending
92
+ shuffle_within_batch: false
93
+ sort_batch: descending
94
+ multiple_iterator: false
95
+ chunk_length: 500
96
+ chunk_shift_ratio: 0.5
97
+ num_cache_chunks: 1024
98
+ chunk_excluded_key_prefixes: []
99
+ chunk_default_fs: null
100
+ chunk_max_abs_length: null
101
+ chunk_discard_short_samples: true
102
+ train_data_path_and_name_and_type:
103
+ - - /compute/babel-13-33/sbharad2/dumpdir/raw/train1/wav.scp
104
+ - speech
105
+ - sound
106
+ - - /compute/babel-13-33/sbharad2/dumpdir/raw/train1/text
107
+ - text
108
+ - text
109
+ valid_data_path_and_name_and_type:
110
+ - - /compute/babel-13-33/sbharad2/dumpdir/raw/val1/wav.scp
111
+ - speech
112
+ - sound
113
+ - - /compute/babel-13-33/sbharad2/dumpdir/raw/val1/text
114
+ - text
115
+ - text
116
+ multi_task_dataset: false
117
+ allow_variable_data_keys: false
118
+ max_cache_size: 0.0
119
+ max_cache_fd: 32
120
+ allow_multi_rates: false
121
+ valid_max_cache_size: null
122
+ exclude_weight_decay: false
123
+ exclude_weight_decay_conf: {}
124
+ optim: adamw
125
+ optim_conf:
126
+ lr: 0.0001
127
+ weight_decay: 0.01
128
+ betas:
129
+ - 0.9
130
+ - 0.98
131
+ scheduler: cosineannealingwarmuprestarts
132
+ scheduler_conf:
133
+ first_cycle_steps: 6000
134
+ warmup_steps: 300
135
+ max_lr: 0.0001
136
+ min_lr: 5.0e-06
137
+ token_list:
138
+ - <blank>
139
+ - <unk>
140
+ - audio_class:0
141
+ - audio_class:14
142
+ - audio_class:36
143
+ - audio_class:19
144
+ - audio_class:30
145
+ - audio_class:34
146
+ - audio_class:9
147
+ - audio_class:22
148
+ - audio_class:48
149
+ - audio_class:41
150
+ - audio_class:47
151
+ - audio_class:31
152
+ - audio_class:17
153
+ - audio_class:45
154
+ - audio_class:8
155
+ - audio_class:15
156
+ - audio_class:46
157
+ - audio_class:37
158
+ - audio_class:32
159
+ - audio_class:16
160
+ - audio_class:25
161
+ - audio_class:4
162
+ - audio_class:3
163
+ - audio_class:27
164
+ - audio_class:43
165
+ - audio_class:12
166
+ - audio_class:40
167
+ - audio_class:29
168
+ - audio_class:10
169
+ - audio_class:7
170
+ - audio_class:26
171
+ - audio_class:6
172
+ - audio_class:44
173
+ - audio_class:23
174
+ - audio_class:20
175
+ - audio_class:49
176
+ - audio_class:24
177
+ - audio_class:39
178
+ - audio_class:28
179
+ - audio_class:18
180
+ - audio_class:2
181
+ - audio_class:35
182
+ - audio_class:38
183
+ - audio_class:21
184
+ - audio_class:1
185
+ - audio_class:11
186
+ - audio_class:42
187
+ - audio_class:5
188
+ - audio_class:33
189
+ - audio_class:13
190
+ - <sos/eos>
191
+ init: xavier_normal
192
+ input_size: 1
193
+ ctc_conf:
194
+ dropout_rate: 0.0
195
+ ctc_type: builtin
196
+ reduce: true
197
+ ignore_nan_grad: null
198
+ zero_infinity: true
199
+ brctc_risk_strategy: exp
200
+ brctc_group_strategy: end
201
+ brctc_risk_factor: 0.0
202
+ joint_net_conf: null
203
+ use_preprocessor: true
204
+ use_lang_prompt: false
205
+ use_nlp_prompt: false
206
+ token_type: word
207
+ bpemodel: null
208
+ non_linguistic_symbols: null
209
+ cleaner: null
210
+ g2p: null
211
+ speech_volume_normalize: null
212
+ rir_scp: null
213
+ rir_apply_prob: 1.0
214
+ noise_scp: null
215
+ noise_apply_prob: 1.0
216
+ noise_db_range: '13_15'
217
+ short_noise_thres: 0.5
218
+ aux_ctc_tasks: []
219
+ frontend: null
220
+ frontend_conf: {}
221
+ specaug: null
222
+ specaug_conf: {}
223
+ normalize: null
224
+ normalize_conf: {}
225
+ model: espnet
226
+ model_conf:
227
+ ctc_weight: 0.0
228
+ lsm_weight: 0.1
229
+ length_normalized_loss: true
230
+ preencoder: null
231
+ preencoder_conf: {}
232
+ encoder: beats
233
+ encoder_conf:
234
+ beats_ckpt_path: /compute/babel-13-33/sbharad2/models/BEATs/BEATs_iter3.pt
235
+ fbank_mean: 11.72215
236
+ fbank_std: 10.60431
237
+ beats_config:
238
+ layer_wise_gradient_decay_ratio: 0.2
239
+ encoder_layerdrop: 0.1
240
+ dropout: 0.0
241
+ specaug_config:
242
+ apply_time_warp: true
243
+ apply_freq_mask: false
244
+ freq_mask_width_range:
245
+ - 0
246
+ - 32
247
+ num_freq_mask: 1
248
+ apply_time_mask: true
249
+ time_mask_width_ratio_range:
250
+ - 0
251
+ - 0.06
252
+ num_time_mask: 1
253
+ roll_augment: true
254
+ roll_interval: 16000
255
+ use_weighted_representation: false
256
+ postencoder: null
257
+ postencoder_conf: {}
258
+ decoder: linear_decoder
259
+ decoder_conf:
260
+ pooling: mean
261
+ dropout: 0.1
262
+ preprocessor: default
263
+ preprocessor_conf: {}
264
+ required:
265
+ - output_dir
266
+ - token_list
267
+ version: '202412'
268
+ distributed: false
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/acc.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/backward_time.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/clip.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/forward_time.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/gpu_max_cached_mem_GB.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/grad_norm.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/iter_time.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/loss.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/loss_scale.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/optim0_lr0.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/optim_step_time.png ADDED
compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202412'
2
+ files:
3
+ asr_model_file: /compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/945epoch.pth
4
+ python: "3.9.20 (main, Oct 3 2024, 07:27:41) \n[GCC 11.2.0]"
5
+ timestamp: 1734231568.41626
6
+ torch: 2.4.0
7
+ yaml_files:
8
+ asr_train_config: /compute/babel-13-33/sbharad2/expdir/asr_fast.fold1/config.yaml