{"train_loss": [], "val_loss": [], "val_pp": [], "val_acc": [], "args": {"config_format": "base", "experiment_name": null, "seed": 0, "data_seed": 1337, "eval_interval": 200, "full_eval_at": [], "eval_batches": 32, "device": "cuda:0", "distributed_backend": "nccl", "log_interval": 50, "results_base_folder": "./exps", "permanent_ckpt_interval": 0, "latest_ckpt_interval": 20000, "resume_from": null, "resume_from_swa": null, "auto_resume": true, "wandb": true, "wandb_project": "llm-baselines", "wandb_run_prefix": "UNTIED-50M-FourEightTrustQuantizer@4:TrustQuantizer@4-c4", "eval_seq_prefix": "none", "log_dynamics": false, "dynamics_logger_cfg": "./src/logger/rotational_logger.yaml", "scheduler": "cos", "cos_inf_steps": 0, "iterations": 19073, "warmup_steps": 1907, "lr": 0.0012, "wsd_final_lr_scale": 0.0, "wsd_fract_decay": 0.1, "decay_type": "linear", "opt": "adamw", "batch_size": 64, "acc_steps": 1, "weight_decay": 0.1, "beta1": 0.9, "beta2": 0.95, "grad_clip": 1.0, "weight_average": false, "wa_interval": 5, "wa_horizon": 500, "wa_dtype": "float32", "wa_use_temp_dir": false, "wa_sweep_horizon": false, "max_num_wa_sweeps": 5, "exponential_moving_average": false, "ema_interval": 10, "ema_decay": 0.95, "ema_after_warmup": false, "datasets_dir": "./datasets/", "dataset": "c4", "tokenizer": "gpt2", "vocab_size": 50304, "data_in_ram": false, "model": "llama", "parallel_block": false, "use_pretrained": "none", "from_dense": false, "init_std": 0.02, "dropout": 0.0, "n_head": 6, "n_layer": 7, "sequence_length": 512, "n_embd": 768, "multiple_of": 256, "rmsnorm_eps": 1e-05, "dtype": "bfloat16", "bias": false, "compile": true, "mlp_dim_exp_factor": 1.0, "w_quant": "FourEightTrustQuantizer", "w_quant_kwargs": {"bits": 4}, "a_quant": "TrustQuantizer", "a_quant_kwargs": {"bits": 4}, "world_size": 8}} |