{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 204, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 6.950901508331299, "learning_rate": 1.4285714285714286e-06, "loss": 1.8735, "step": 2 }, { "epoch": 0.06, "grad_norm": 8.133766174316406, "learning_rate": 4.2857142857142855e-06, "loss": 2.126, "step": 4 }, { "epoch": 0.09, "grad_norm": 6.692324161529541, "learning_rate": 7.142857142857143e-06, "loss": 1.926, "step": 6 }, { "epoch": 0.12, "grad_norm": 6.0254693031311035, "learning_rate": 9.999999999999999e-06, "loss": 1.9762, "step": 8 }, { "epoch": 0.15, "grad_norm": 5.805467128753662, "learning_rate": 1.2857142857142857e-05, "loss": 1.7727, "step": 10 }, { "epoch": 0.18, "grad_norm": 6.195497989654541, "learning_rate": 1.5714285714285715e-05, "loss": 1.6793, "step": 12 }, { "epoch": 0.21, "grad_norm": 4.377630710601807, "learning_rate": 1.8571428571428572e-05, "loss": 1.629, "step": 14 }, { "epoch": 0.24, "grad_norm": 3.5771281719207764, "learning_rate": 2.1428571428571428e-05, "loss": 1.4296, "step": 16 }, { "epoch": 0.26, "grad_norm": 2.8232216835021973, "learning_rate": 2.4285714285714288e-05, "loss": 1.4971, "step": 18 }, { "epoch": 0.29, "grad_norm": 2.096635341644287, "learning_rate": 2.7142857142857144e-05, "loss": 1.2129, "step": 20 }, { "epoch": 0.32, "grad_norm": 1.4359596967697144, "learning_rate": 3e-05, "loss": 1.0623, "step": 22 }, { "epoch": 0.35, "grad_norm": 1.3408057689666748, "learning_rate": 2.9672131147540984e-05, "loss": 1.1274, "step": 24 }, { "epoch": 0.38, "grad_norm": 1.5449613332748413, "learning_rate": 2.9344262295081968e-05, "loss": 1.0534, "step": 26 }, { "epoch": 0.41, "grad_norm": 1.266711950302124, "learning_rate": 2.901639344262295e-05, "loss": 1.0022, "step": 28 }, { "epoch": 0.44, "grad_norm": 1.2879749536514282, "learning_rate": 2.8688524590163935e-05, "loss": 0.8819, "step": 30 }, { "epoch": 0.47, "grad_norm": 1.1592856645584106, "learning_rate": 2.836065573770492e-05, "loss": 0.9526, "step": 32 }, { "epoch": 0.5, "grad_norm": 1.3098297119140625, "learning_rate": 2.8032786885245902e-05, "loss": 0.8577, "step": 34 }, { "epoch": 0.53, "grad_norm": 1.294083833694458, "learning_rate": 2.7704918032786886e-05, "loss": 0.8352, "step": 36 }, { "epoch": 0.56, "grad_norm": 1.0912269353866577, "learning_rate": 2.737704918032787e-05, "loss": 0.7198, "step": 38 }, { "epoch": 0.59, "grad_norm": 1.0710548162460327, "learning_rate": 2.7049180327868853e-05, "loss": 0.7445, "step": 40 }, { "epoch": 0.62, "grad_norm": 1.0573712587356567, "learning_rate": 2.6721311475409837e-05, "loss": 0.6865, "step": 42 }, { "epoch": 0.65, "grad_norm": 1.0720133781433105, "learning_rate": 2.639344262295082e-05, "loss": 0.6729, "step": 44 }, { "epoch": 0.68, "grad_norm": 1.1398358345031738, "learning_rate": 2.6065573770491804e-05, "loss": 0.6805, "step": 46 }, { "epoch": 0.71, "grad_norm": 1.3724112510681152, "learning_rate": 2.5737704918032787e-05, "loss": 0.6923, "step": 48 }, { "epoch": 0.74, "grad_norm": 1.1828075647354126, "learning_rate": 2.5409836065573774e-05, "loss": 0.6927, "step": 50 }, { "epoch": 0.76, "grad_norm": 0.9459984302520752, "learning_rate": 2.5081967213114754e-05, "loss": 0.6539, "step": 52 }, { "epoch": 0.79, "grad_norm": 1.0317862033843994, "learning_rate": 2.4754098360655738e-05, "loss": 0.6816, "step": 54 }, { "epoch": 0.82, "grad_norm": 0.9241882562637329, "learning_rate": 2.442622950819672e-05, "loss": 0.6342, "step": 56 }, { "epoch": 0.85, "grad_norm": 1.2747896909713745, "learning_rate": 2.4098360655737705e-05, "loss": 0.6752, "step": 58 }, { "epoch": 0.88, "grad_norm": 1.2308931350708008, "learning_rate": 2.377049180327869e-05, "loss": 0.6703, "step": 60 }, { "epoch": 0.91, "grad_norm": 0.9486699104309082, "learning_rate": 2.3442622950819672e-05, "loss": 0.6265, "step": 62 }, { "epoch": 0.94, "grad_norm": 0.9226927757263184, "learning_rate": 2.3114754098360656e-05, "loss": 0.6806, "step": 64 }, { "epoch": 0.97, "grad_norm": 0.9970481991767883, "learning_rate": 2.278688524590164e-05, "loss": 0.6684, "step": 66 }, { "epoch": 1.0, "grad_norm": 0.9154540300369263, "learning_rate": 2.2459016393442626e-05, "loss": 0.6076, "step": 68 }, { "epoch": 1.03, "grad_norm": 0.9942613840103149, "learning_rate": 2.213114754098361e-05, "loss": 0.6423, "step": 70 }, { "epoch": 1.06, "grad_norm": 0.7813049554824829, "learning_rate": 2.180327868852459e-05, "loss": 0.6045, "step": 72 }, { "epoch": 1.09, "grad_norm": 0.8933289051055908, "learning_rate": 2.1475409836065574e-05, "loss": 0.6022, "step": 74 }, { "epoch": 1.12, "grad_norm": 1.1493891477584839, "learning_rate": 2.1147540983606557e-05, "loss": 0.6012, "step": 76 }, { "epoch": 1.15, "grad_norm": 1.102952480316162, "learning_rate": 2.081967213114754e-05, "loss": 0.6232, "step": 78 }, { "epoch": 1.18, "grad_norm": 1.0626366138458252, "learning_rate": 2.0491803278688525e-05, "loss": 0.6064, "step": 80 }, { "epoch": 1.21, "grad_norm": 1.103053331375122, "learning_rate": 2.0163934426229508e-05, "loss": 0.5939, "step": 82 }, { "epoch": 1.24, "grad_norm": 1.2533128261566162, "learning_rate": 1.9836065573770492e-05, "loss": 0.6151, "step": 84 }, { "epoch": 1.26, "grad_norm": 1.1883020401000977, "learning_rate": 1.9508196721311475e-05, "loss": 0.5898, "step": 86 }, { "epoch": 1.29, "grad_norm": 0.941169023513794, "learning_rate": 1.9180327868852462e-05, "loss": 0.5877, "step": 88 }, { "epoch": 1.32, "grad_norm": 1.0497690439224243, "learning_rate": 1.8852459016393442e-05, "loss": 0.603, "step": 90 }, { "epoch": 1.35, "grad_norm": 1.006282091140747, "learning_rate": 1.8524590163934426e-05, "loss": 0.5779, "step": 92 }, { "epoch": 1.38, "grad_norm": 0.8880963325500488, "learning_rate": 1.819672131147541e-05, "loss": 0.5785, "step": 94 }, { "epoch": 1.41, "grad_norm": 0.8170259594917297, "learning_rate": 1.7868852459016393e-05, "loss": 0.5973, "step": 96 }, { "epoch": 1.44, "grad_norm": 0.9137255549430847, "learning_rate": 1.7540983606557377e-05, "loss": 0.5859, "step": 98 }, { "epoch": 1.47, "grad_norm": 0.8934600353240967, "learning_rate": 1.721311475409836e-05, "loss": 0.5712, "step": 100 }, { "epoch": 1.5, "grad_norm": 0.9546217322349548, "learning_rate": 1.6885245901639344e-05, "loss": 0.5977, "step": 102 }, { "epoch": 1.53, "grad_norm": 1.1362155675888062, "learning_rate": 1.6557377049180328e-05, "loss": 0.5957, "step": 104 }, { "epoch": 1.56, "grad_norm": 1.0691664218902588, "learning_rate": 1.6229508196721314e-05, "loss": 0.6303, "step": 106 }, { "epoch": 1.59, "grad_norm": 1.1143447160720825, "learning_rate": 1.5901639344262298e-05, "loss": 0.5946, "step": 108 }, { "epoch": 1.62, "grad_norm": 0.9838765859603882, "learning_rate": 1.5573770491803278e-05, "loss": 0.6253, "step": 110 }, { "epoch": 1.65, "grad_norm": 1.0309878587722778, "learning_rate": 1.5245901639344264e-05, "loss": 0.5695, "step": 112 }, { "epoch": 1.68, "grad_norm": 1.0644599199295044, "learning_rate": 1.4918032786885245e-05, "loss": 0.5974, "step": 114 }, { "epoch": 1.71, "grad_norm": 0.9344809651374817, "learning_rate": 1.4590163934426229e-05, "loss": 0.5772, "step": 116 }, { "epoch": 1.74, "grad_norm": 0.9217023253440857, "learning_rate": 1.4262295081967213e-05, "loss": 0.5659, "step": 118 }, { "epoch": 1.76, "grad_norm": 1.1142168045043945, "learning_rate": 1.3934426229508198e-05, "loss": 0.5811, "step": 120 }, { "epoch": 1.79, "grad_norm": 1.1095991134643555, "learning_rate": 1.3606557377049181e-05, "loss": 0.6029, "step": 122 }, { "epoch": 1.82, "grad_norm": 1.0066099166870117, "learning_rate": 1.3278688524590163e-05, "loss": 0.5568, "step": 124 }, { "epoch": 1.85, "grad_norm": 0.8575509190559387, "learning_rate": 1.2950819672131147e-05, "loss": 0.5703, "step": 126 }, { "epoch": 1.88, "grad_norm": 1.1447674036026, "learning_rate": 1.2622950819672132e-05, "loss": 0.5836, "step": 128 }, { "epoch": 1.91, "grad_norm": 0.9783082604408264, "learning_rate": 1.2295081967213116e-05, "loss": 0.5948, "step": 130 }, { "epoch": 1.94, "grad_norm": 1.0990079641342163, "learning_rate": 1.19672131147541e-05, "loss": 0.568, "step": 132 }, { "epoch": 1.97, "grad_norm": 1.0109045505523682, "learning_rate": 1.1639344262295081e-05, "loss": 0.5628, "step": 134 }, { "epoch": 2.0, "grad_norm": 1.0725024938583374, "learning_rate": 1.1311475409836065e-05, "loss": 0.5838, "step": 136 }, { "epoch": 2.03, "grad_norm": 1.1007121801376343, "learning_rate": 1.098360655737705e-05, "loss": 0.5584, "step": 138 }, { "epoch": 2.06, "grad_norm": 0.9721304774284363, "learning_rate": 1.0655737704918034e-05, "loss": 0.563, "step": 140 }, { "epoch": 2.09, "grad_norm": 1.0708929300308228, "learning_rate": 1.0327868852459017e-05, "loss": 0.5351, "step": 142 }, { "epoch": 2.12, "grad_norm": 1.0382109880447388, "learning_rate": 9.999999999999999e-06, "loss": 0.6197, "step": 144 }, { "epoch": 2.15, "grad_norm": 1.1190615892410278, "learning_rate": 9.672131147540984e-06, "loss": 0.5663, "step": 146 }, { "epoch": 2.18, "grad_norm": 1.0649045705795288, "learning_rate": 9.344262295081968e-06, "loss": 0.5553, "step": 148 }, { "epoch": 2.21, "grad_norm": 1.201451063156128, "learning_rate": 9.016393442622952e-06, "loss": 0.5765, "step": 150 }, { "epoch": 2.24, "grad_norm": 1.0463545322418213, "learning_rate": 8.688524590163935e-06, "loss": 0.5466, "step": 152 }, { "epoch": 2.26, "grad_norm": 1.2062188386917114, "learning_rate": 8.360655737704917e-06, "loss": 0.5732, "step": 154 }, { "epoch": 2.29, "grad_norm": 1.1948459148406982, "learning_rate": 8.032786885245902e-06, "loss": 0.554, "step": 156 }, { "epoch": 2.32, "grad_norm": 1.0267665386199951, "learning_rate": 7.704918032786886e-06, "loss": 0.5366, "step": 158 }, { "epoch": 2.35, "grad_norm": 0.9985692501068115, "learning_rate": 7.377049180327869e-06, "loss": 0.5424, "step": 160 }, { "epoch": 2.38, "grad_norm": 1.119655728340149, "learning_rate": 7.049180327868853e-06, "loss": 0.5553, "step": 162 }, { "epoch": 2.41, "grad_norm": 0.9959714412689209, "learning_rate": 6.721311475409837e-06, "loss": 0.5895, "step": 164 }, { "epoch": 2.44, "grad_norm": 1.0639784336090088, "learning_rate": 6.393442622950819e-06, "loss": 0.5704, "step": 166 }, { "epoch": 2.47, "grad_norm": 1.1006407737731934, "learning_rate": 6.065573770491804e-06, "loss": 0.5416, "step": 168 }, { "epoch": 2.5, "grad_norm": 0.98119056224823, "learning_rate": 5.7377049180327865e-06, "loss": 0.5839, "step": 170 }, { "epoch": 2.53, "grad_norm": 1.1397459506988525, "learning_rate": 5.409836065573771e-06, "loss": 0.5604, "step": 172 }, { "epoch": 2.56, "grad_norm": 1.0882867574691772, "learning_rate": 5.081967213114754e-06, "loss": 0.556, "step": 174 }, { "epoch": 2.59, "grad_norm": 1.1950050592422485, "learning_rate": 4.754098360655738e-06, "loss": 0.5694, "step": 176 }, { "epoch": 2.62, "grad_norm": 1.0115201473236084, "learning_rate": 4.426229508196722e-06, "loss": 0.5464, "step": 178 }, { "epoch": 2.65, "grad_norm": 1.1844886541366577, "learning_rate": 4.098360655737705e-06, "loss": 0.5489, "step": 180 }, { "epoch": 2.68, "grad_norm": 1.192030668258667, "learning_rate": 3.770491803278689e-06, "loss": 0.5359, "step": 182 }, { "epoch": 2.71, "grad_norm": 1.2273727655410767, "learning_rate": 3.4426229508196724e-06, "loss": 0.5721, "step": 184 }, { "epoch": 2.74, "grad_norm": 0.950477123260498, "learning_rate": 3.114754098360656e-06, "loss": 0.5534, "step": 186 }, { "epoch": 2.76, "grad_norm": 0.9607704877853394, "learning_rate": 2.7868852459016396e-06, "loss": 0.5706, "step": 188 }, { "epoch": 2.79, "grad_norm": 1.0138517618179321, "learning_rate": 2.4590163934426227e-06, "loss": 0.5592, "step": 190 }, { "epoch": 2.82, "grad_norm": 1.1338858604431152, "learning_rate": 2.1311475409836063e-06, "loss": 0.5659, "step": 192 }, { "epoch": 2.85, "grad_norm": 1.057421088218689, "learning_rate": 1.80327868852459e-06, "loss": 0.586, "step": 194 }, { "epoch": 2.88, "grad_norm": 1.1511048078536987, "learning_rate": 1.4754098360655737e-06, "loss": 0.5768, "step": 196 }, { "epoch": 2.91, "grad_norm": 0.9774185419082642, "learning_rate": 1.1475409836065575e-06, "loss": 0.564, "step": 198 }, { "epoch": 2.94, "grad_norm": 1.1010777950286865, "learning_rate": 8.196721311475409e-07, "loss": 0.5891, "step": 200 }, { "epoch": 2.97, "grad_norm": 1.024880290031433, "learning_rate": 4.918032786885246e-07, "loss": 0.5536, "step": 202 }, { "epoch": 3.0, "grad_norm": 0.9065414667129517, "learning_rate": 1.639344262295082e-07, "loss": 0.5979, "step": 204 } ], "logging_steps": 2, "max_steps": 204, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 5017254261424128.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }