## IO
save_data: enfr/data_spm
overwrite: true
seed: 1234
report_every: 100
valid_metrics: ["BLEU"]
tensorboard: true
tensorboard_log_dir: tensorboard
### Vocab
src_vocab: enfr/joint.eole.vocab
tgt_vocab: enfr/joint.eole.vocab
src_vocab_size: 50000
tgt_vocab_size: 50000
vocab_size_multiple: 8
share_vocab: true
n_sample: 0
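# The joint .eole.vocab files above are assumed to be built ahead of time with
# eole's vocab-building entry point; a sketch (exact flag spelling may vary by
# eole version, and the config filename here is hypothetical):
#   eole build_vocab -config enfr.yaml -n_sample -1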
data:
  corpus_1:
    path_src: hf://quickmt/quickmt-train.fr-en/en
    path_tgt: hf://quickmt/quickmt-train.fr-en/fr
    path_sco: hf://quickmt/quickmt-train.fr-en/sco
  valid:
    path_src: enfr/dev.en
    path_tgt: enfr/dev.fr
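# The hf:// URIs point at the quickmt/quickmt-train.fr-en dataset on the
# Hugging Face Hub (en and fr text columns plus a sco score column); eole is
# assumed to stream these columns rather than require a local copy.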
transforms: [sentencepiece, filtertoolong]
transforms_configs:
  sentencepiece:
    src_subword_model: "enfr/joint.spm.model"
    tgt_subword_model: "enfr/joint.spm.model"
  filtertoolong:
    src_seq_length: 256
    tgt_seq_length: 256
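# The shared subword model is assumed to be trained beforehand with the
# SentencePiece CLI; a minimal sketch (the concatenated en+fr training file
# enfr/train.joint.txt is a hypothetical name):
#   spm_train --input=enfr/train.joint.txt --model_prefix=enfr/joint.spm \
#             --vocab_size=50000 --model_type=unigram
# filtertoolong then drops any pair longer than 256 subword tokens on either side.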
training:
  # Run configuration
  model_path: enfr/model
  train_from: enfr/model
  keep_checkpoint: 4
  save_checkpoint_steps: 2000
  train_steps: 100000
  valid_steps: 2000

  # Train on a single GPU
  world_size: 1
  gpu_ranks: [0]

  # Batching
  batch_type: "tokens"
  batch_size: 16384
  valid_batch_size: 16384
  batch_size_multiple: 8
  accum_count: [8]
  accum_steps: [0]
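  # With token batching and gradient accumulation, each optimizer step sees
  # roughly batch_size * accum_count = 16384 * 8 = 131072 tokens.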
| # Optimizer & Compute | |
| compute_dtype: "bf16" | |
| optim: "pagedadamw8bit" | |
| #optim: "adamw" | |
| learning_rate: 2.0 | |
| warmup_steps: 10000 | |
| decay_method: "noam" | |
| adam_beta2: 0.998 | |
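  # Under the usual noam schedule (cf. "Attention Is All You Need"; a sketch,
  # assuming eole follows the standard formula):
  #   lr(step) = learning_rate * hidden_size^-0.5
  #              * min(step^-0.5, step * warmup_steps^-1.5)
  # so learning_rate acts as a scale factor, not an absolute rate; the peak
  # rate at step 10000 is about 2.0 / (32 * 100) = 6.25e-4.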
  # Data loading
  bucket_size: 128000
  num_workers: 4
  prefetch_factor: 100
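  # bucket_size controls how many examples are pooled and sorted by length
  # before token batches are cut, which reduces padding waste (an assumption
  # based on the usual eole/OpenNMT dataloader behaviour).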
  # Hyperparams
  dropout_steps: [0]
  dropout: [0.1]
  attention_dropout: [0.1]
  max_grad_norm: 2
  label_smoothing: 0.1
  average_decay: 0.0001
  param_init_method: xavier_uniform
  normalization: "tokens"
model:
  architecture: "transformer"
  layer_norm: standard
  share_embeddings: true
  share_decoder_embeddings: true
  add_ffnbias: true
  mlp_activation_fn: gelu
  add_estimator: false
  add_qkvbias: false
  norm_eps: 1e-6
  hidden_size: 1024
  encoder:
    layers: 8
  decoder:
    layers: 2
  heads: 8
  transformer_ff: 4096
  embeddings:
    word_vec_size: 1024
    position_encoding_type: "SinusoidalInterleaved"
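# The 8-layer encoder / 2-layer decoder split follows the deep-encoder,
# shallow-decoder recipe: most capacity sits in the encoder, which runs in
# parallel over the source, while the autoregressive decoder stays cheap for
# faster inference. A rough size estimate under these settings (my arithmetic,
# not from the source; biases and layer norms excluded):
#   shared embeddings   ~50000 * 1024                   ≈  51M
#   encoder, 8 layers   ~8 * (4*1024^2 + 2*1024*4096)   ≈ 101M
#   decoder, 2 layers   ~2 * (8*1024^2 + 2*1024*4096)   ≈  34M
# i.e. on the order of 190M parameters.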