---
# Training configuration for energyBERT pre-training on English Wikipedia
# (BPE-30K tokenizer, 12-layer / 12-head / 768-dim model, MLflow logging).
# Keys are kept in alphabetical order as emitted by the run serializer.

# Model architecture
activation: softmax

# AdamW optimizer hyperparameters
adam_beta1: 0.9
adam_beta2: 0.99
adam_epsilon: 1.0e-06

alpha: 0.1
attn_implementation: null
beta: 0.125

# Precision / hardware
bf16: true
block_size: 512
checkpoint_dir: mlruns/896390784617014591/892b97fa0aa6499288906c463545ae00/checkpoints
compile: false
config_path: configs/JZ/NRJ_base-wiki-original.yaml

# Data loading
dataloader_num_workers: 8
dataset_path: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/data-bin/wiki_20220301-cleaned-valid001-BPE30K/
ddp_find_unused_parameters: false
disable_tqdm: true

# Evaluation
do_eval: true
dropout: 0.1
embedding_dim: 768
eval_steps: 25000
evaluation_strategy: steps
forward_memories: 3072
fp16: false
gradient_accumulation_steps: 1
ignore_lines: false

# layer-norm epsilon (float); presumably eps, not a gain — TODO confirm against model code
layer_norm: 1.0e-12
learning_rate: 0.0007

# Logging
log_on_each_node: false
logging_steps: 1000
logging_strategy: steps

# LR schedule
lr_scheduler_kwargs: {}
lr_scheduler_type: cosine
max_steps: 500000

# Model identity
model_name: NRJ-V_30000K_bpe-NL12-NH12-EMB768-FFN3072
model_type: energyBERT
n_run: 51
num_heads: 12
num_layers: 12
num_params: 50638896

# Optimization / batching
optimizer: adamw_torch
output_dir: null
per_device_eval_batch_size: 8
per_device_train_batch_size: 64
remove_unused_columns: false
report_to: mlflow

# Checkpointing
save_steps: 25000
save_strategy: steps
seed: 42
share_layers: false

# Dataset files
test_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.test.txt
tie_weights: false
tokenizer_path: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/data-bin/wiki_20220301-cleaned-valid001-BPE30K/tokenizer
tokenizer_type: bpe
total_batch_size: 4096
training_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.train.txt
valid_file: /lustre/fswork/projects/rech/oou/uqh26ve/data/pre_training/en/en_wiki/wiki_20220301-cleaned-valid001/wikipedia.valid.txt
vocabulary_size: 30000

# Warmup / regularization
warmup_ratio: 0.0
warmup_steps: 24000
weight_decay: 0.01