{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 204,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.03, "grad_norm": 6.950901508331299, "learning_rate": 1.4285714285714286e-06, "loss": 1.8735, "step": 2},
    {"epoch": 0.06, "grad_norm": 8.133766174316406, "learning_rate": 4.2857142857142855e-06, "loss": 2.126, "step": 4},
    {"epoch": 0.09, "grad_norm": 6.692324161529541, "learning_rate": 7.142857142857143e-06, "loss": 1.926, "step": 6},
    {"epoch": 0.12, "grad_norm": 6.0254693031311035, "learning_rate": 9.999999999999999e-06, "loss": 1.9762, "step": 8},
    {"epoch": 0.15, "grad_norm": 5.805467128753662, "learning_rate": 1.2857142857142857e-05, "loss": 1.7727, "step": 10},
    {"epoch": 0.18, "grad_norm": 6.195497989654541, "learning_rate": 1.5714285714285715e-05, "loss": 1.6793, "step": 12},
    {"epoch": 0.21, "grad_norm": 4.377630710601807, "learning_rate": 1.8571428571428572e-05, "loss": 1.629, "step": 14},
    {"epoch": 0.24, "grad_norm": 3.5771281719207764, "learning_rate": 2.1428571428571428e-05, "loss": 1.4296, "step": 16},
    {"epoch": 0.26, "grad_norm": 2.8232216835021973, "learning_rate": 2.4285714285714288e-05, "loss": 1.4971, "step": 18},
    {"epoch": 0.29, "grad_norm": 2.096635341644287, "learning_rate": 2.7142857142857144e-05, "loss": 1.2129, "step": 20},
    {"epoch": 0.32, "grad_norm": 1.4359596967697144, "learning_rate": 3e-05, "loss": 1.0623, "step": 22},
    {"epoch": 0.35, "grad_norm": 1.3408057689666748, "learning_rate": 2.9672131147540984e-05, "loss": 1.1274, "step": 24},
    {"epoch": 0.38, "grad_norm": 1.5449613332748413, "learning_rate": 2.9344262295081968e-05, "loss": 1.0534, "step": 26},
    {"epoch": 0.41, "grad_norm": 1.266711950302124, "learning_rate": 2.901639344262295e-05, "loss": 1.0022, "step": 28},
    {"epoch": 0.44, "grad_norm": 1.2879749536514282, "learning_rate": 2.8688524590163935e-05, "loss": 0.8819, "step": 30},
    {"epoch": 0.47, "grad_norm": 1.1592856645584106, "learning_rate": 2.836065573770492e-05, "loss": 0.9526, "step": 32},
    {"epoch": 0.5, "grad_norm": 1.3098297119140625, "learning_rate": 2.8032786885245902e-05, "loss": 0.8577, "step": 34},
    {"epoch": 0.53, "grad_norm": 1.294083833694458, "learning_rate": 2.7704918032786886e-05, "loss": 0.8352, "step": 36},
    {"epoch": 0.56, "grad_norm": 1.0912269353866577, "learning_rate": 2.737704918032787e-05, "loss": 0.7198, "step": 38},
    {"epoch": 0.59, "grad_norm": 1.0710548162460327, "learning_rate": 2.7049180327868853e-05, "loss": 0.7445, "step": 40},
    {"epoch": 0.62, "grad_norm": 1.0573712587356567, "learning_rate": 2.6721311475409837e-05, "loss": 0.6865, "step": 42},
    {"epoch": 0.65, "grad_norm": 1.0720133781433105, "learning_rate": 2.639344262295082e-05, "loss": 0.6729, "step": 44},
    {"epoch": 0.68, "grad_norm": 1.1398358345031738, "learning_rate": 2.6065573770491804e-05, "loss": 0.6805, "step": 46},
    {"epoch": 0.71, "grad_norm": 1.3724112510681152, "learning_rate": 2.5737704918032787e-05, "loss": 0.6923, "step": 48},
    {"epoch": 0.74, "grad_norm": 1.1828075647354126, "learning_rate": 2.5409836065573774e-05, "loss": 0.6927, "step": 50},
    {"epoch": 0.76, "grad_norm": 0.9459984302520752, "learning_rate": 2.5081967213114754e-05, "loss": 0.6539, "step": 52},
    {"epoch": 0.79, "grad_norm": 1.0317862033843994, "learning_rate": 2.4754098360655738e-05, "loss": 0.6816, "step": 54},
    {"epoch": 0.82, "grad_norm": 0.9241882562637329, "learning_rate": 2.442622950819672e-05, "loss": 0.6342, "step": 56},
    {"epoch": 0.85, "grad_norm": 1.2747896909713745, "learning_rate": 2.4098360655737705e-05, "loss": 0.6752, "step": 58},
    {"epoch": 0.88, "grad_norm": 1.2308931350708008, "learning_rate": 2.377049180327869e-05, "loss": 0.6703, "step": 60},
    {"epoch": 0.91, "grad_norm": 0.9486699104309082, "learning_rate": 2.3442622950819672e-05, "loss": 0.6265, "step": 62},
    {"epoch": 0.94, "grad_norm": 0.9226927757263184, "learning_rate": 2.3114754098360656e-05, "loss": 0.6806, "step": 64},
    {"epoch": 0.97, "grad_norm": 0.9970481991767883, "learning_rate": 2.278688524590164e-05, "loss": 0.6684, "step": 66},
    {"epoch": 1.0, "grad_norm": 0.9154540300369263, "learning_rate": 2.2459016393442626e-05, "loss": 0.6076, "step": 68},
    {"epoch": 1.03, "grad_norm": 0.9942613840103149, "learning_rate": 2.213114754098361e-05, "loss": 0.6423, "step": 70},
    {"epoch": 1.06, "grad_norm": 0.7813049554824829, "learning_rate": 2.180327868852459e-05, "loss": 0.6045, "step": 72},
    {"epoch": 1.09, "grad_norm": 0.8933289051055908, "learning_rate": 2.1475409836065574e-05, "loss": 0.6022, "step": 74},
    {"epoch": 1.12, "grad_norm": 1.1493891477584839, "learning_rate": 2.1147540983606557e-05, "loss": 0.6012, "step": 76},
    {"epoch": 1.15, "grad_norm": 1.102952480316162, "learning_rate": 2.081967213114754e-05, "loss": 0.6232, "step": 78},
    {"epoch": 1.18, "grad_norm": 1.0626366138458252, "learning_rate": 2.0491803278688525e-05, "loss": 0.6064, "step": 80},
    {"epoch": 1.21, "grad_norm": 1.103053331375122, "learning_rate": 2.0163934426229508e-05, "loss": 0.5939, "step": 82},
    {"epoch": 1.24, "grad_norm": 1.2533128261566162, "learning_rate": 1.9836065573770492e-05, "loss": 0.6151, "step": 84},
    {"epoch": 1.26, "grad_norm": 1.1883020401000977, "learning_rate": 1.9508196721311475e-05, "loss": 0.5898, "step": 86},
    {"epoch": 1.29, "grad_norm": 0.941169023513794, "learning_rate": 1.9180327868852462e-05, "loss": 0.5877, "step": 88},
    {"epoch": 1.32, "grad_norm": 1.0497690439224243, "learning_rate": 1.8852459016393442e-05, "loss": 0.603, "step": 90},
    {"epoch": 1.35, "grad_norm": 1.006282091140747, "learning_rate": 1.8524590163934426e-05, "loss": 0.5779, "step": 92},
    {"epoch": 1.38, "grad_norm": 0.8880963325500488, "learning_rate": 1.819672131147541e-05, "loss": 0.5785, "step": 94},
    {"epoch": 1.41, "grad_norm": 0.8170259594917297, "learning_rate": 1.7868852459016393e-05, "loss": 0.5973, "step": 96},
    {"epoch": 1.44, "grad_norm": 0.9137255549430847, "learning_rate": 1.7540983606557377e-05, "loss": 0.5859, "step": 98},
    {"epoch": 1.47, "grad_norm": 0.8934600353240967, "learning_rate": 1.721311475409836e-05, "loss": 0.5712, "step": 100},
    {"epoch": 1.5, "grad_norm": 0.9546217322349548, "learning_rate": 1.6885245901639344e-05, "loss": 0.5977, "step": 102},
    {"epoch": 1.53, "grad_norm": 1.1362155675888062, "learning_rate": 1.6557377049180328e-05, "loss": 0.5957, "step": 104},
    {"epoch": 1.56, "grad_norm": 1.0691664218902588, "learning_rate": 1.6229508196721314e-05, "loss": 0.6303, "step": 106},
    {"epoch": 1.59, "grad_norm": 1.1143447160720825, "learning_rate": 1.5901639344262298e-05, "loss": 0.5946, "step": 108},
    {"epoch": 1.62, "grad_norm": 0.9838765859603882, "learning_rate": 1.5573770491803278e-05, "loss": 0.6253, "step": 110},
    {"epoch": 1.65, "grad_norm": 1.0309878587722778, "learning_rate": 1.5245901639344264e-05, "loss": 0.5695, "step": 112},
    {"epoch": 1.68, "grad_norm": 1.0644599199295044, "learning_rate": 1.4918032786885245e-05, "loss": 0.5974, "step": 114},
    {"epoch": 1.71, "grad_norm": 0.9344809651374817, "learning_rate": 1.4590163934426229e-05, "loss": 0.5772, "step": 116},
    {"epoch": 1.74, "grad_norm": 0.9217023253440857, "learning_rate": 1.4262295081967213e-05, "loss": 0.5659, "step": 118},
    {"epoch": 1.76, "grad_norm": 1.1142168045043945, "learning_rate": 1.3934426229508198e-05, "loss": 0.5811, "step": 120},
    {"epoch": 1.79, "grad_norm": 1.1095991134643555, "learning_rate": 1.3606557377049181e-05, "loss": 0.6029, "step": 122},
    {"epoch": 1.82, "grad_norm": 1.0066099166870117, "learning_rate": 1.3278688524590163e-05, "loss": 0.5568, "step": 124},
    {"epoch": 1.85, "grad_norm": 0.8575509190559387, "learning_rate": 1.2950819672131147e-05, "loss": 0.5703, "step": 126},
    {"epoch": 1.88, "grad_norm": 1.1447674036026, "learning_rate": 1.2622950819672132e-05, "loss": 0.5836, "step": 128},
    {"epoch": 1.91, "grad_norm": 0.9783082604408264, "learning_rate": 1.2295081967213116e-05, "loss": 0.5948, "step": 130},
    {"epoch": 1.94, "grad_norm": 1.0990079641342163, "learning_rate": 1.19672131147541e-05, "loss": 0.568, "step": 132},
    {"epoch": 1.97, "grad_norm": 1.0109045505523682, "learning_rate": 1.1639344262295081e-05, "loss": 0.5628, "step": 134},
    {"epoch": 2.0, "grad_norm": 1.0725024938583374, "learning_rate": 1.1311475409836065e-05, "loss": 0.5838, "step": 136},
    {"epoch": 2.03, "grad_norm": 1.1007121801376343, "learning_rate": 1.098360655737705e-05, "loss": 0.5584, "step": 138},
    {"epoch": 2.06, "grad_norm": 0.9721304774284363, "learning_rate": 1.0655737704918034e-05, "loss": 0.563, "step": 140},
    {"epoch": 2.09, "grad_norm": 1.0708929300308228, "learning_rate": 1.0327868852459017e-05, "loss": 0.5351, "step": 142},
    {"epoch": 2.12, "grad_norm": 1.0382109880447388, "learning_rate": 9.999999999999999e-06, "loss": 0.6197, "step": 144},
    {"epoch": 2.15, "grad_norm": 1.1190615892410278, "learning_rate": 9.672131147540984e-06, "loss": 0.5663, "step": 146},
    {"epoch": 2.18, "grad_norm": 1.0649045705795288, "learning_rate": 9.344262295081968e-06, "loss": 0.5553, "step": 148},
    {"epoch": 2.21, "grad_norm": 1.201451063156128, "learning_rate": 9.016393442622952e-06, "loss": 0.5765, "step": 150},
    {"epoch": 2.24, "grad_norm": 1.0463545322418213, "learning_rate": 8.688524590163935e-06, "loss": 0.5466, "step": 152},
    {"epoch": 2.26, "grad_norm": 1.2062188386917114, "learning_rate": 8.360655737704917e-06, "loss": 0.5732, "step": 154},
    {"epoch": 2.29, "grad_norm": 1.1948459148406982, "learning_rate": 8.032786885245902e-06, "loss": 0.554, "step": 156},
    {"epoch": 2.32, "grad_norm": 1.0267665386199951, "learning_rate": 7.704918032786886e-06, "loss": 0.5366, "step": 158},
    {"epoch": 2.35, "grad_norm": 0.9985692501068115, "learning_rate": 7.377049180327869e-06, "loss": 0.5424, "step": 160},
    {"epoch": 2.38, "grad_norm": 1.119655728340149, "learning_rate": 7.049180327868853e-06, "loss": 0.5553, "step": 162},
    {"epoch": 2.41, "grad_norm": 0.9959714412689209, "learning_rate": 6.721311475409837e-06, "loss": 0.5895, "step": 164},
    {"epoch": 2.44, "grad_norm": 1.0639784336090088, "learning_rate": 6.393442622950819e-06, "loss": 0.5704, "step": 166},
    {"epoch": 2.47, "grad_norm": 1.1006407737731934, "learning_rate": 6.065573770491804e-06, "loss": 0.5416, "step": 168},
    {"epoch": 2.5, "grad_norm": 0.98119056224823, "learning_rate": 5.7377049180327865e-06, "loss": 0.5839, "step": 170},
    {"epoch": 2.53, "grad_norm": 1.1397459506988525, "learning_rate": 5.409836065573771e-06, "loss": 0.5604, "step": 172},
    {"epoch": 2.56, "grad_norm": 1.0882867574691772, "learning_rate": 5.081967213114754e-06, "loss": 0.556, "step": 174},
    {"epoch": 2.59, "grad_norm": 1.1950050592422485, "learning_rate": 4.754098360655738e-06, "loss": 0.5694, "step": 176},
    {"epoch": 2.62, "grad_norm": 1.0115201473236084, "learning_rate": 4.426229508196722e-06, "loss": 0.5464, "step": 178},
    {"epoch": 2.65, "grad_norm": 1.1844886541366577, "learning_rate": 4.098360655737705e-06, "loss": 0.5489, "step": 180},
    {"epoch": 2.68, "grad_norm": 1.192030668258667, "learning_rate": 3.770491803278689e-06, "loss": 0.5359, "step": 182},
    {"epoch": 2.71, "grad_norm": 1.2273727655410767, "learning_rate": 3.4426229508196724e-06, "loss": 0.5721, "step": 184},
    {"epoch": 2.74, "grad_norm": 0.950477123260498, "learning_rate": 3.114754098360656e-06, "loss": 0.5534, "step": 186},
    {"epoch": 2.76, "grad_norm": 0.9607704877853394, "learning_rate": 2.7868852459016396e-06, "loss": 0.5706, "step": 188},
    {"epoch": 2.79, "grad_norm": 1.0138517618179321, "learning_rate": 2.4590163934426227e-06, "loss": 0.5592, "step": 190},
    {"epoch": 2.82, "grad_norm": 1.1338858604431152, "learning_rate": 2.1311475409836063e-06, "loss": 0.5659, "step": 192},
    {"epoch": 2.85, "grad_norm": 1.057421088218689, "learning_rate": 1.80327868852459e-06, "loss": 0.586, "step": 194},
    {"epoch": 2.88, "grad_norm": 1.1511048078536987, "learning_rate": 1.4754098360655737e-06, "loss": 0.5768, "step": 196},
    {"epoch": 2.91, "grad_norm": 0.9774185419082642, "learning_rate": 1.1475409836065575e-06, "loss": 0.564, "step": 198},
    {"epoch": 2.94, "grad_norm": 1.1010777950286865, "learning_rate": 8.196721311475409e-07, "loss": 0.5891, "step": 200},
    {"epoch": 2.97, "grad_norm": 1.024880290031433, "learning_rate": 4.918032786885246e-07, "loss": 0.5536, "step": 202},
    {"epoch": 3.0, "grad_norm": 0.9065414667129517, "learning_rate": 1.639344262295082e-07, "loss": 0.5979, "step": 204}
  ],
  "logging_steps": 2,
  "max_steps": 204,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 5017254261424128.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}