# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  seed: 1337
  tensorboard_logdir: tblog
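
# Checkpointing: write a checkpoint every 4 epochs (keeping the last 4) and
# every 50000 updates. keep_interval_updates: -1 disables pruning of
# update-based checkpoints; the 50000 pattern would additionally protect
# checkpoints at multiples of 50k updates if pruning were enabled.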
checkpoint:
  save_dir: ???
  save_interval: 4
  keep_last_epochs: 4
  save_interval_updates: 50000
  keep_interval_updates: -1
  keep_interval_updates_pattern: 50000
  # no_epoch_checkpoints: true
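
# Distributed setup: 32 GPUs at 8 per node, i.e. 4 nodes, over NCCL with
# fairseq's legacy_ddp backend. find_unused_parameters is on because, in a
# joint speech/text setup, a given step may not touch every parameter.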
distributed_training:
  ddp_backend: legacy_ddp
  distributed_backend: 'nccl'
  distributed_port: -1
  distributed_world_size: 32
  nprocs_per_node: 8
  find_unused_parameters: true
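
# Joint speech/text pretraining task. Audio crops run from 2 s to 15.625 s
# (32000-250000 samples at 16 kHz): long utterances are randomly cropped
# (random_crop: true) instead of the batch being padded up (pad_audio: false).
# normalize must match the extractor_mode set under model (default here, i.e.
# unnormalized waveform input, as in the wav2vec 2.0 / HuBERT Base recipes).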
task:
  _name: joint_sc2t_pretraining
  data: ???
  label_dir: ???
  labels: ???
  label_rate: ${model.label_rate}
  store_labels: true
  sample_rate: 16000
  max_sample_size: 250000
  min_sample_size: 32000
  pad_audio: false
  random_crop: true
  normalize: false # must be consistent with extractor
  add_decoder_target: false
  text_cfg:
    seed: ${common.seed}
    text_data: ???
    data_config: config.yaml
    sample_break_mode: eos
    tokens_per_sample: 1024
    shorten_method: "random_crop"
    text_maxtokens_ratio: 1.0
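
# Batching: for this audio task max_tokens counts raw waveform samples, so
# 1400000 tokens is roughly 87.5 s of 16 kHz audio per GPU per batch.
# Validation is tied to the checkpoint schedule above.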
dataset:
  num_workers: 6
  max_tokens: 1400000
  skip_invalid_size_inputs_valid_test: true
  validate_interval: ${checkpoint.save_interval}
  validate_interval_updates: ${checkpoint.save_interval_updates}
  required_batch_size_multiple: 1
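
# Pretraining loss: HuBERT-style masked unit prediction on speech (weight 1.0
# on masked frames, none on unmasked frames), an extra model-reported loss
# scaled by 10 via loss_weights (in HuBERT-style criteria this is typically
# the feature-extractor penalty), and a CTC loss on the text branch at 0.1.
# text_mum_weight: 0.0 leaves masked-unit modeling on text disabled.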
criterion:
  _name: speechlm_criterion
  pred_masked_weight: 1.0
  pred_nomask_weight: 0.0
  loss_weights: [10,]
  text_ctc_weight: 0.1
  text_mum_weight: 0.0
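
# Optimization: 400k updates with Adam (betas 0.9/0.98, eps 1e-6, weight
# decay 0.01) and gradient clipping at 10. The learning rate warms up over
# the first 32k updates to a peak of 5e-4, then decays polynomially (linearly
# at the default power) toward zero at max_update.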
optimization:
  max_update: 400000
  lr: [0.0005]
  clip_norm: 10.0

optimizer:
  _name: adam
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06
  weight_decay: 0.01

lr_scheduler:
  _name: polynomial_decay
  warmup_updates: 32000
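
# SpeechLM Base. The convolutional front-end matches wav2vec 2.0 Base: seven
# conv layers with an overall stride of 320, so 16 kHz audio yields 50 frames
# per second; model.label_rate (supplied at launch) should match the frame
# rate of the pretraining labels (50 for k-means units at this stride).
# The speech Transformer has 6 layers; add_unit_encoder stacks the shared
# unit/text encoder (configured below) on top of it.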
model:
  _name: speechlm
  label_rate: ???
  skip_masked: false
  skip_nomask: false
  mask_prob: 0.80
  extractor_mode: default
  conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
  final_dim: 256
  activation_fn: "gelu"
  encoder_layers: 6
  encoder_attention_heads: 8
  encoder_layerdrop: 0.1
  dropout_input: 0.1
  dropout_features: 0.1
  dropout: 0.1
  attention_dropout: 0.1
  feature_grad_mult: 0.1
  untie_final_proj: true
  activation_dropout: 0.0
  use_rel_pos_enc: true
  add_unit_encoder: true
  add_text_ctc: true
  mask_u2t: true
  compute_mum: false
  mix_with_unit: true
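
  # Shared unit/text encoder: a second 6-layer Transformer used both above
  # the speech encoder and by the text/unit branch. Dropout and activation
  # settings are tied to the speech encoder via ${model.*} interpolations.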
  text_transformer:
    activation_fn: ${model.activation_fn}
    dropout: ${model.dropout}
    attention_dropout: ${model.attention_dropout}
    activation_dropout: ${model.activation_dropout}
    max_source_positions: 3000
    no_scale_embedding: true
    layernorm_embedding: true
    no_token_positional_embeddings: false
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 6
      attention_heads: 8
      normalize_before: false
      learned_pos: true
      layerdrop: ${model.encoder_layerdrop}
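
# Hydra bookkeeping: run/sweep output directories are supplied at launch;
# sweep subdirectories are named from the config name plus the non-excluded
# command-line overrides (key=value pairs joined with '-' and '__').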
hydra:
  job:
    config:
      override_dirname:
        kv_sep: '-'
        item_sep: '__'
        exclude_keys:
          - run
          - task.data
          - task.label_dir
  run:
    dir: ???
  sweep:
    dir: ???
    subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
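
# A minimal launch sketch (illustrative, not taken from this repo's scripts):
# it assumes the stock fairseq-hydra-train entry point with the SpeechLM code
# loaded as a fairseq user dir; every path below and the label name "km" are
# placeholders standing in for the ??? fields above.
#
#   fairseq-hydra-train \
#     --config-dir SpeechLM/speechlm/config/pretrain \
#     --config-name speechlm_base_librispeech \
#     common.user_dir=SpeechLM/speechlm \
#     task.data=/path/to/audio_manifests \
#     task.label_dir=/path/to/kmeans_labels \
#     task.labels='["km"]' \
#     task.text_cfg.text_data=/path/to/text_data \
#     model.label_rate=50 \
#     checkpoint.save_dir=/path/to/checkpoints \
#     hydra.run.dir=/path/to/outputs \
#     hydra.sweep.dir=/path/to/outputs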