# @package _group_

common:
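  # Mixed-precision (fp16) training; JSON-formatted logs every 200 updates,
  # TensorBoard events under tblog/, and a fixed seed for reproducibility.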
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog

checkpoint:
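  # save_dir is mandatory (??? must be supplied at launch). A checkpoint is
  # written every 4 epochs (keeping only the last 4) and every 50k updates
  # (keep_interval_updates: -1 keeps all of these).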
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 50000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true

distributed_training:
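  # 32 processes = 4 nodes x 8 GPUs, legacy DDP over NCCL.
  # find_unused_parameters tolerates parameters that receive no gradient on a
  # given batch (e.g. when only one modality branch runs).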
  ddp_backend: legacy_ddp
  distributed_backend: 'nccl'
  distributed_port: -1
  distributed_world_size: 32
  nprocs_per_node: 8
  find_unused_parameters: true

task:
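  # Joint speech/text pretraining. data, label_dir and labels are mandatory
  # and must be supplied at launch. Audio is randomly cropped to
  # 32,000-250,000 samples (2 to ~15.6 s at 16 kHz).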
  _name: joint_sc2t_pretraining
  data: ???
  label_dir: ???
  labels: ???
  label_rate: ${model.label_rate}
  store_labels: true
  sample_rate: 16000
  max_sample_size: 250000
  min_sample_size: 32000
  pad_audio: false
  random_crop: true
  normalize: false # must be consistent with the feature extractor
  add_decoder_target: false
  text_cfg:
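    # Text stream: one sentence per sample (sample_break_mode: eos), capped at
    # 1024 tokens; longer samples are shortened by random cropping.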
    seed: ${common.seed}
    text_data: ???
    data_config: config.yaml
    sample_break_mode: eos
    tokens_per_sample: 1024
    shorten_method: "random_crop"
    text_maxtokens_ratio: 1.0

dataset:
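  # Batches are built by token count: up to 1.4M audio samples (~87.5 s of
  # 16 kHz audio) per GPU. Validation follows the checkpoint schedule.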
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1

criterion:
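  # The masked-prediction loss is applied to masked frames only
  # (pred_nomask_weight: 0). loss_weights scales any extra losses returned by
  # the model; text_ctc_weight weights the auxiliary CTC loss on the text
  # branch (see model.add_text_ctc).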
  _name: speechlm_criterion
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  text_ctc_weight: 0.1
  text_mum_weight: 0.0

optimization:
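  # 400k updates in total, peak LR 5e-4, gradient-norm clipping at 10.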
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0

optimizer:
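  # Adam with (0.9, 0.98) betas, as commonly used for wav2vec 2.0 / HuBERT
  # style pretraining.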
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
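  # Linear warmup over the first 32k updates, then polynomial (by default
  # linear) decay towards zero by optimization.max_update.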
  _name: polynomial_decay
  warmup_updates: 32000

model:
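  # The convolutional extractor below has a total stride of 320, reducing
  # 16 kHz audio to 50 Hz features; label_rate (mandatory, set at launch)
  # must match the frame rate of the supplied labels.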
  _name: speechlm
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  activation_fn: "gelu"
  encoder_layers: 6
  encoder_attention_heads: 8
  encoder_layerdrop: 0.1
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_unit_encoder: true
  add_text_ctc: true
  mask_u2t: true
  compute_mum: false
  mix_with_unit: true
  text_transformer:
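    # Transformer used for the unit/text encoder: 6 post-LN layers
    # (normalize_before: false), 768-dim with 3072-dim FFN, 8 heads, and
    # learned positional embeddings.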
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    max_source_positions: 3000
    no_scale_embedding: true
    layernorm_embedding: true
    no_token_positional_embeddings: false
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}

hydra:
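  # Run/sweep output dirs (???) must be given at launch. Sweep subdirectories
  # are named <config_name>__<overrides>, where overrides are the CLI
  # key-value pairs joined by '-' and separated by '__' (run, task.data and
  # task.label_dir excluded).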
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}