{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 63, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "learning_rate": 0.0001, "loss": 2.5627, "step": 1 }, { "epoch": 0.1, "learning_rate": 0.0002, "loss": 2.5872, "step": 2 }, { "epoch": 0.14, "learning_rate": 0.00019986740898848306, "loss": 0.2394, "step": 3 }, { "epoch": 0.19, "learning_rate": 0.0001994699875614589, "loss": 0.3004, "step": 4 }, { "epoch": 0.24, "learning_rate": 0.00019880878960910772, "loss": 0.2071, "step": 5 }, { "epoch": 0.29, "learning_rate": 0.0001978855685095358, "loss": 0.2135, "step": 6 }, { "epoch": 0.33, "learning_rate": 0.00019670277247913205, "loss": 0.1922, "step": 7 }, { "epoch": 0.38, "learning_rate": 0.00019526353808033825, "loss": 0.211, "step": 8 }, { "epoch": 0.43, "learning_rate": 0.00019357168190404936, "loss": 0.1895, "step": 9 }, { "epoch": 0.48, "learning_rate": 0.0001916316904487005, "loss": 0.1772, "step": 10 }, { "epoch": 0.52, "learning_rate": 0.00018944870822287956, "loss": 0.1635, "step": 11 }, { "epoch": 0.57, "learning_rate": 0.00018702852410301554, "loss": 0.1521, "step": 12 }, { "epoch": 0.62, "learning_rate": 0.00018437755598231856, "loss": 0.2165, "step": 13 }, { "epoch": 0.67, "learning_rate": 0.00018150283375168114, "loss": 0.1462, "step": 14 }, { "epoch": 0.71, "learning_rate": 0.00017841198065767107, "loss": 0.1535, "step": 15 }, { "epoch": 0.76, "learning_rate": 0.00017511319308705198, "loss": 0.1654, "step": 16 }, { "epoch": 0.81, "learning_rate": 0.00017161521883143934, "loss": 0.1542, "step": 17 }, { "epoch": 0.86, "learning_rate": 0.00016792733388972932, "loss": 0.1482, "step": 18 }, { "epoch": 0.9, "learning_rate": 0.00016405931786981755, "loss": 0.1389, "step": 19 }, { "epoch": 0.95, "learning_rate": 0.00016002142805483685, "loss": 0.1234, "step": 20 }, { "epoch": 1.0, "learning_rate": 0.00015582437220268647, "loss": 0.1184, "step": 21 }, { "epoch": 1.05, "learning_rate": 0.0001514792801509831, "loss": 0.11, "step": 22 }, { "epoch": 1.1, "learning_rate": 0.000146997674302732, "loss": 0.1085, "step": 23 }, { "epoch": 1.14, "learning_rate": 0.0001423914390709861, "loss": 0.108, "step": 24 }, { "epoch": 1.19, "learning_rate": 0.00013767278936351854, "loss": 0.1044, "step": 25 }, { "epoch": 1.24, "learning_rate": 0.0001328542381910835, "loss": 0.0888, "step": 26 }, { "epoch": 1.29, "learning_rate": 0.00012794856348516095, "loss": 0.1162, "step": 27 }, { "epoch": 1.33, "learning_rate": 0.0001229687742131796, "loss": 0.0989, "step": 28 }, { "epoch": 1.38, "learning_rate": 0.00011792807588107357, "loss": 0.0849, "step": 29 }, { "epoch": 1.43, "learning_rate": 0.00011283983551465511, "loss": 0.1005, "step": 30 }, { "epoch": 1.48, "learning_rate": 0.00010771754621266466, "loss": 0.0979, "step": 31 }, { "epoch": 1.52, "learning_rate": 0.00010257479136549889, "loss": 0.0917, "step": 32 }, { "epoch": 1.57, "learning_rate": 9.742520863450115e-05, "loss": 0.0946, "step": 33 }, { "epoch": 1.62, "learning_rate": 9.228245378733537e-05, "loss": 0.0871, "step": 34 }, { "epoch": 1.67, "learning_rate": 8.71601644853449e-05, "loss": 0.0733, "step": 35 }, { "epoch": 1.71, "learning_rate": 8.207192411892646e-05, "loss": 0.0797, "step": 36 }, { "epoch": 1.76, "learning_rate": 7.703122578682046e-05, "loss": 0.0914, "step": 37 }, { "epoch": 1.81, "learning_rate": 7.205143651483906e-05, "loss": 0.0562, "step": 38 }, { "epoch": 1.86, "learning_rate": 6.714576180891654e-05, "loss": 0.0735, "step": 39 }, { "epoch": 1.9, "learning_rate": 6.232721063648148e-05, "loss": 0.048, "step": 40 }, { "epoch": 1.95, "learning_rate": 5.7608560929013946e-05, "loss": 0.0603, "step": 41 }, { "epoch": 2.0, "learning_rate": 5.300232569726804e-05, "loss": 0.0856, "step": 42 }, { "epoch": 2.05, "learning_rate": 4.852071984901696e-05, "loss": 0.0655, "step": 43 }, { "epoch": 2.1, "learning_rate": 4.417562779731355e-05, "loss": 0.0494, "step": 44 }, { "epoch": 2.14, "learning_rate": 3.997857194516319e-05, "loss": 0.0334, "step": 45 }, { "epoch": 2.19, "learning_rate": 3.594068213018249e-05, "loss": 0.0378, "step": 46 }, { "epoch": 2.24, "learning_rate": 3.207266611027069e-05, "loss": 0.0425, "step": 47 }, { "epoch": 2.29, "learning_rate": 2.8384781168560693e-05, "loss": 0.0272, "step": 48 }, { "epoch": 2.33, "learning_rate": 2.4886806912948035e-05, "loss": 0.0511, "step": 49 }, { "epoch": 2.38, "learning_rate": 2.1588019342328968e-05, "loss": 0.0393, "step": 50 }, { "epoch": 2.43, "learning_rate": 1.8497166248318876e-05, "loss": 0.0265, "step": 51 }, { "epoch": 2.48, "learning_rate": 1.562244401768144e-05, "loss": 0.0442, "step": 52 }, { "epoch": 2.52, "learning_rate": 1.2971475896984475e-05, "loss": 0.0196, "step": 53 }, { "epoch": 2.57, "learning_rate": 1.0551291777120464e-05, "loss": 0.0296, "step": 54 }, { "epoch": 2.62, "learning_rate": 8.368309551299536e-06, "loss": 0.0482, "step": 55 }, { "epoch": 2.67, "learning_rate": 6.428318095950647e-06, "loss": 0.0442, "step": 56 }, { "epoch": 2.71, "learning_rate": 4.7364619196617495e-06, "loss": 0.0442, "step": 57 }, { "epoch": 2.76, "learning_rate": 3.2972275208679625e-06, "loss": 0.027, "step": 58 }, { "epoch": 2.81, "learning_rate": 2.1144314904642195e-06, "loss": 0.053, "step": 59 }, { "epoch": 2.86, "learning_rate": 1.1912103908922945e-06, "loss": 0.037, "step": 60 }, { "epoch": 2.9, "learning_rate": 5.300124385410943e-07, "loss": 0.0439, "step": 61 }, { "epoch": 2.95, "learning_rate": 1.3259101151694708e-07, "loss": 0.0234, "step": 62 }, { "epoch": 3.0, "learning_rate": 0.0, "loss": 0.0241, "step": 63 }, { "epoch": 3.0, "step": 63, "total_flos": 3251608289280.0, "train_loss": 0.1782660081923481, "train_runtime": 283.1757, "train_samples_per_second": 27.947, "train_steps_per_second": 0.222 } ], "logging_steps": 1.0, "max_steps": 63, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50000, "total_flos": 3251608289280.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }