|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.384615384615385, |
|
"eval_steps": 10, |
|
"global_step": 1750, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006153846153846154, |
|
"eval_loss": 1.489196538925171, |
|
"eval_runtime": 1.2747, |
|
"eval_samples_per_second": 89.432, |
|
"eval_steps_per_second": 4.707, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"grad_norm": 17.32219123840332, |
|
"learning_rate": 8.771929824561404e-07, |
|
"loss": 1.5722, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03076923076923077, |
|
"eval_loss": 1.3865292072296143, |
|
"eval_runtime": 1.1749, |
|
"eval_samples_per_second": 97.029, |
|
"eval_steps_per_second": 5.107, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"grad_norm": 21.88077735900879, |
|
"learning_rate": 1.7543859649122807e-06, |
|
"loss": 1.3935, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06153846153846154, |
|
"eval_loss": 1.0996264219284058, |
|
"eval_runtime": 1.1706, |
|
"eval_samples_per_second": 97.386, |
|
"eval_steps_per_second": 5.126, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"grad_norm": 17.395177841186523, |
|
"learning_rate": 2.631578947368421e-06, |
|
"loss": 1.0664, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09230769230769231, |
|
"eval_loss": 0.8006665706634521, |
|
"eval_runtime": 1.1747, |
|
"eval_samples_per_second": 97.044, |
|
"eval_steps_per_second": 5.108, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"grad_norm": 6.869284152984619, |
|
"learning_rate": 3.5087719298245615e-06, |
|
"loss": 0.7994, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12307692307692308, |
|
"eval_loss": 0.708875298500061, |
|
"eval_runtime": 1.1742, |
|
"eval_samples_per_second": 97.087, |
|
"eval_steps_per_second": 5.11, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 6.4737162590026855, |
|
"learning_rate": 4.385964912280702e-06, |
|
"loss": 0.7751, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"eval_loss": 0.6778659224510193, |
|
"eval_runtime": 1.1722, |
|
"eval_samples_per_second": 97.257, |
|
"eval_steps_per_second": 5.119, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"grad_norm": 5.418182373046875, |
|
"learning_rate": 5.263157894736842e-06, |
|
"loss": 0.6203, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18461538461538463, |
|
"eval_loss": 0.6695793271064758, |
|
"eval_runtime": 1.1744, |
|
"eval_samples_per_second": 97.071, |
|
"eval_steps_per_second": 5.109, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"grad_norm": 2.7423934936523438, |
|
"learning_rate": 6.140350877192983e-06, |
|
"loss": 0.767, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2153846153846154, |
|
"eval_loss": 0.6650205850601196, |
|
"eval_runtime": 1.1671, |
|
"eval_samples_per_second": 97.675, |
|
"eval_steps_per_second": 5.141, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"grad_norm": 2.572129726409912, |
|
"learning_rate": 7.017543859649123e-06, |
|
"loss": 0.6336, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24615384615384617, |
|
"eval_loss": 0.6613836884498596, |
|
"eval_runtime": 1.1671, |
|
"eval_samples_per_second": 97.676, |
|
"eval_steps_per_second": 5.141, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"grad_norm": 4.622325420379639, |
|
"learning_rate": 7.894736842105265e-06, |
|
"loss": 0.631, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.27692307692307694, |
|
"eval_loss": 0.6617783904075623, |
|
"eval_runtime": 1.1744, |
|
"eval_samples_per_second": 97.071, |
|
"eval_steps_per_second": 5.109, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 3.24354887008667, |
|
"learning_rate": 8.771929824561405e-06, |
|
"loss": 0.6086, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"eval_loss": 0.662232518196106, |
|
"eval_runtime": 1.172, |
|
"eval_samples_per_second": 97.27, |
|
"eval_steps_per_second": 5.119, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"grad_norm": 3.5868513584136963, |
|
"learning_rate": 9.649122807017545e-06, |
|
"loss": 0.7057, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3384615384615385, |
|
"eval_loss": 0.6619779467582703, |
|
"eval_runtime": 1.165, |
|
"eval_samples_per_second": 97.856, |
|
"eval_steps_per_second": 5.15, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"grad_norm": 3.4894511699676514, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 0.7385, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.36923076923076925, |
|
"eval_loss": 0.6673641800880432, |
|
"eval_runtime": 1.1682, |
|
"eval_samples_per_second": 97.588, |
|
"eval_steps_per_second": 5.136, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 5.117406845092773, |
|
"learning_rate": 1.1403508771929826e-05, |
|
"loss": 0.6533, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.6705042123794556, |
|
"eval_runtime": 1.1726, |
|
"eval_samples_per_second": 97.217, |
|
"eval_steps_per_second": 5.117, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"grad_norm": 3.9341437816619873, |
|
"learning_rate": 1.2280701754385966e-05, |
|
"loss": 0.7066, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4307692307692308, |
|
"eval_loss": 0.6691470146179199, |
|
"eval_runtime": 1.1702, |
|
"eval_samples_per_second": 97.418, |
|
"eval_steps_per_second": 5.127, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 3.4131579399108887, |
|
"learning_rate": 1.3157894736842108e-05, |
|
"loss": 0.6065, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"eval_loss": 0.6724759340286255, |
|
"eval_runtime": 1.1746, |
|
"eval_samples_per_second": 97.05, |
|
"eval_steps_per_second": 5.108, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"grad_norm": 3.820042848587036, |
|
"learning_rate": 1.4035087719298246e-05, |
|
"loss": 0.6474, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.49230769230769234, |
|
"eval_loss": 0.6704264879226685, |
|
"eval_runtime": 1.1772, |
|
"eval_samples_per_second": 96.838, |
|
"eval_steps_per_second": 5.097, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"grad_norm": 2.8097360134124756, |
|
"learning_rate": 1.4912280701754388e-05, |
|
"loss": 0.7022, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5230769230769231, |
|
"eval_loss": 0.6691609025001526, |
|
"eval_runtime": 1.1676, |
|
"eval_samples_per_second": 97.638, |
|
"eval_steps_per_second": 5.139, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"grad_norm": 2.3641347885131836, |
|
"learning_rate": 1.578947368421053e-05, |
|
"loss": 0.6667, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5538461538461539, |
|
"eval_loss": 0.6753159165382385, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.524, |
|
"eval_steps_per_second": 5.133, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"grad_norm": 6.359607219696045, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.6889, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5846153846153846, |
|
"eval_loss": 0.6796454787254333, |
|
"eval_runtime": 1.1714, |
|
"eval_samples_per_second": 97.323, |
|
"eval_steps_per_second": 5.122, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 4.7979960441589355, |
|
"learning_rate": 1.754385964912281e-05, |
|
"loss": 0.6937, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"eval_loss": 0.6748442649841309, |
|
"eval_runtime": 1.1773, |
|
"eval_samples_per_second": 96.831, |
|
"eval_steps_per_second": 5.096, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"grad_norm": 3.5100464820861816, |
|
"learning_rate": 1.8421052631578947e-05, |
|
"loss": 0.6613, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6461538461538462, |
|
"eval_loss": 0.6842787861824036, |
|
"eval_runtime": 1.1662, |
|
"eval_samples_per_second": 97.752, |
|
"eval_steps_per_second": 5.145, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"grad_norm": 3.3538153171539307, |
|
"learning_rate": 1.929824561403509e-05, |
|
"loss": 0.6512, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.676923076923077, |
|
"eval_loss": 0.6873583197593689, |
|
"eval_runtime": 1.1667, |
|
"eval_samples_per_second": 97.712, |
|
"eval_steps_per_second": 5.143, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"grad_norm": 3.7595653533935547, |
|
"learning_rate": 1.9999952892103225e-05, |
|
"loss": 0.7656, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7076923076923077, |
|
"eval_loss": 0.6970181465148926, |
|
"eval_runtime": 1.1663, |
|
"eval_samples_per_second": 97.741, |
|
"eval_steps_per_second": 5.144, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"grad_norm": 2.9787654876708984, |
|
"learning_rate": 1.999830416231782e-05, |
|
"loss": 0.6412, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7384615384615385, |
|
"eval_loss": 0.7058220505714417, |
|
"eval_runtime": 1.1688, |
|
"eval_samples_per_second": 97.537, |
|
"eval_steps_per_second": 5.134, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 2.3455305099487305, |
|
"learning_rate": 1.9994300481505595e-05, |
|
"loss": 0.6148, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"eval_loss": 0.7070643305778503, |
|
"eval_runtime": 1.1728, |
|
"eval_samples_per_second": 97.202, |
|
"eval_steps_per_second": 5.116, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.057767868041992, |
|
"learning_rate": 1.998794279267369e-05, |
|
"loss": 0.6433, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.7127295136451721, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.529, |
|
"eval_steps_per_second": 5.133, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"grad_norm": 2.5686323642730713, |
|
"learning_rate": 1.9979232593280637e-05, |
|
"loss": 0.5972, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8307692307692308, |
|
"eval_loss": 0.7106388211250305, |
|
"eval_runtime": 1.166, |
|
"eval_samples_per_second": 97.77, |
|
"eval_steps_per_second": 5.146, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"grad_norm": 1.2748245000839233, |
|
"learning_rate": 1.9968171934883647e-05, |
|
"loss": 0.6795, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8615384615384616, |
|
"eval_loss": 0.7127251029014587, |
|
"eval_runtime": 1.1701, |
|
"eval_samples_per_second": 97.429, |
|
"eval_steps_per_second": 5.128, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"grad_norm": 3.1174163818359375, |
|
"learning_rate": 1.9954763422655396e-05, |
|
"loss": 0.6589, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8923076923076924, |
|
"eval_loss": 0.7157883048057556, |
|
"eval_runtime": 1.1727, |
|
"eval_samples_per_second": 97.211, |
|
"eval_steps_per_second": 5.116, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 2.3197476863861084, |
|
"learning_rate": 1.9939010214770426e-05, |
|
"loss": 0.5539, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"eval_loss": 0.7173091173171997, |
|
"eval_runtime": 1.1712, |
|
"eval_samples_per_second": 97.332, |
|
"eval_steps_per_second": 5.123, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"grad_norm": 2.7073607444763184, |
|
"learning_rate": 1.9920916021661277e-05, |
|
"loss": 0.7515, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9538461538461539, |
|
"eval_loss": 0.720274806022644, |
|
"eval_runtime": 1.1673, |
|
"eval_samples_per_second": 97.665, |
|
"eval_steps_per_second": 5.14, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"grad_norm": 2.78871488571167, |
|
"learning_rate": 1.9900485105144544e-05, |
|
"loss": 0.6282, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9846153846153847, |
|
"eval_loss": 0.7291054725646973, |
|
"eval_runtime": 1.1713, |
|
"eval_samples_per_second": 97.331, |
|
"eval_steps_per_second": 5.123, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"grad_norm": 2.181795597076416, |
|
"learning_rate": 1.9877722277417085e-05, |
|
"loss": 0.4711, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0153846153846153, |
|
"eval_loss": 0.7346097826957703, |
|
"eval_runtime": 1.1674, |
|
"eval_samples_per_second": 97.652, |
|
"eval_steps_per_second": 5.14, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"grad_norm": 6.42896842956543, |
|
"learning_rate": 1.985263289992256e-05, |
|
"loss": 0.4595, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0461538461538462, |
|
"eval_loss": 0.7707703709602356, |
|
"eval_runtime": 1.1711, |
|
"eval_samples_per_second": 97.348, |
|
"eval_steps_per_second": 5.124, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 6.206396102905273, |
|
"learning_rate": 1.9825222882088647e-05, |
|
"loss": 0.3704, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"eval_loss": 0.7343254685401917, |
|
"eval_runtime": 1.1717, |
|
"eval_samples_per_second": 97.298, |
|
"eval_steps_per_second": 5.121, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"grad_norm": 2.2610416412353516, |
|
"learning_rate": 1.9795498679935144e-05, |
|
"loss": 0.3845, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1076923076923078, |
|
"eval_loss": 0.7747918963432312, |
|
"eval_runtime": 1.1652, |
|
"eval_samples_per_second": 97.838, |
|
"eval_steps_per_second": 5.149, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"grad_norm": 2.490269184112549, |
|
"learning_rate": 1.9763467294553364e-05, |
|
"loss": 0.3461, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1384615384615384, |
|
"eval_loss": 0.7602437138557434, |
|
"eval_runtime": 1.1685, |
|
"eval_samples_per_second": 97.563, |
|
"eval_steps_per_second": 5.135, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"grad_norm": 2.435138702392578, |
|
"learning_rate": 1.9729136270457118e-05, |
|
"loss": 0.3871, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.1692307692307693, |
|
"eval_loss": 0.7585355043411255, |
|
"eval_runtime": 1.1702, |
|
"eval_samples_per_second": 97.419, |
|
"eval_steps_per_second": 5.127, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.649342656135559, |
|
"learning_rate": 1.9692513693805738e-05, |
|
"loss": 0.345, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.7817304730415344, |
|
"eval_runtime": 1.169, |
|
"eval_samples_per_second": 97.52, |
|
"eval_steps_per_second": 5.133, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 2.2906811237335205, |
|
"learning_rate": 1.965360819049948e-05, |
|
"loss": 0.4372, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"eval_loss": 0.7519503235816956, |
|
"eval_runtime": 1.1693, |
|
"eval_samples_per_second": 97.495, |
|
"eval_steps_per_second": 5.131, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"grad_norm": 3.925938606262207, |
|
"learning_rate": 1.9612428924147842e-05, |
|
"loss": 0.4879, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2615384615384615, |
|
"eval_loss": 0.7463933229446411, |
|
"eval_runtime": 1.1712, |
|
"eval_samples_per_second": 97.334, |
|
"eval_steps_per_second": 5.123, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"grad_norm": 1.4445178508758545, |
|
"learning_rate": 1.9568985593911206e-05, |
|
"loss": 0.4265, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.2923076923076924, |
|
"eval_loss": 0.7673818469047546, |
|
"eval_runtime": 1.1696, |
|
"eval_samples_per_second": 97.471, |
|
"eval_steps_per_second": 5.13, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"grad_norm": 2.707318067550659, |
|
"learning_rate": 1.9523288432216333e-05, |
|
"loss": 0.4255, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.323076923076923, |
|
"eval_loss": 0.776730477809906, |
|
"eval_runtime": 1.1686, |
|
"eval_samples_per_second": 97.552, |
|
"eval_steps_per_second": 5.134, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"grad_norm": 2.1543335914611816, |
|
"learning_rate": 1.9475348202346292e-05, |
|
"loss": 0.3992, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.353846153846154, |
|
"eval_loss": 0.7587007284164429, |
|
"eval_runtime": 1.1741, |
|
"eval_samples_per_second": 97.094, |
|
"eval_steps_per_second": 5.11, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 2.1207425594329834, |
|
"learning_rate": 1.942517619590531e-05, |
|
"loss": 0.4197, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"eval_loss": 0.7629592418670654, |
|
"eval_runtime": 1.1714, |
|
"eval_samples_per_second": 97.323, |
|
"eval_steps_per_second": 5.122, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"grad_norm": 3.0015456676483154, |
|
"learning_rate": 1.9372784230159213e-05, |
|
"loss": 0.3963, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.4153846153846155, |
|
"eval_loss": 0.7871745824813843, |
|
"eval_runtime": 1.1671, |
|
"eval_samples_per_second": 97.676, |
|
"eval_steps_per_second": 5.141, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"grad_norm": 2.6770448684692383, |
|
"learning_rate": 1.9318184645252037e-05, |
|
"loss": 0.3689, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.4461538461538461, |
|
"eval_loss": 0.7728400826454163, |
|
"eval_runtime": 1.1721, |
|
"eval_samples_per_second": 97.262, |
|
"eval_steps_per_second": 5.119, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"grad_norm": 2.6416432857513428, |
|
"learning_rate": 1.926139030129951e-05, |
|
"loss": 0.4004, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.476923076923077, |
|
"eval_loss": 0.7785875797271729, |
|
"eval_runtime": 1.1696, |
|
"eval_samples_per_second": 97.466, |
|
"eval_steps_per_second": 5.13, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"grad_norm": 1.8383666276931763, |
|
"learning_rate": 1.9202414575360024e-05, |
|
"loss": 0.4265, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5076923076923077, |
|
"eval_loss": 0.7735582590103149, |
|
"eval_runtime": 1.168, |
|
"eval_samples_per_second": 97.604, |
|
"eval_steps_per_second": 5.137, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 1.905369758605957, |
|
"learning_rate": 1.9141271358283874e-05, |
|
"loss": 0.3675, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"eval_loss": 0.7821589708328247, |
|
"eval_runtime": 1.168, |
|
"eval_samples_per_second": 97.603, |
|
"eval_steps_per_second": 5.137, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.5692307692307692, |
|
"grad_norm": 2.0887465476989746, |
|
"learning_rate": 1.9077975051441487e-05, |
|
"loss": 0.3769, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.5692307692307692, |
|
"eval_loss": 0.8046853542327881, |
|
"eval_runtime": 1.168, |
|
"eval_samples_per_second": 97.604, |
|
"eval_steps_per_second": 5.137, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.3952693939208984, |
|
"learning_rate": 1.9012540563331375e-05, |
|
"loss": 0.3842, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.7695035338401794, |
|
"eval_runtime": 1.1677, |
|
"eval_samples_per_second": 97.627, |
|
"eval_steps_per_second": 5.138, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.6307692307692307, |
|
"grad_norm": 2.4253690242767334, |
|
"learning_rate": 1.8944983306068683e-05, |
|
"loss": 0.372, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.6307692307692307, |
|
"eval_loss": 0.7812924385070801, |
|
"eval_runtime": 1.1688, |
|
"eval_samples_per_second": 97.535, |
|
"eval_steps_per_second": 5.133, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.6615384615384614, |
|
"grad_norm": 1.9009945392608643, |
|
"learning_rate": 1.8875319191755083e-05, |
|
"loss": 0.3825, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6615384615384614, |
|
"eval_loss": 0.7782070636749268, |
|
"eval_runtime": 1.1713, |
|
"eval_samples_per_second": 97.328, |
|
"eval_steps_per_second": 5.123, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 3.183929443359375, |
|
"learning_rate": 1.8803564628730916e-05, |
|
"loss": 0.396, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"eval_loss": 0.7839831709861755, |
|
"eval_runtime": 1.1706, |
|
"eval_samples_per_second": 97.388, |
|
"eval_steps_per_second": 5.126, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.7230769230769232, |
|
"grad_norm": 2.031071901321411, |
|
"learning_rate": 1.8729736517710454e-05, |
|
"loss": 0.3862, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.7230769230769232, |
|
"eval_loss": 0.7782466411590576, |
|
"eval_runtime": 1.1669, |
|
"eval_samples_per_second": 97.695, |
|
"eval_steps_per_second": 5.142, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.7538461538461538, |
|
"grad_norm": 1.2456035614013672, |
|
"learning_rate": 1.865385224780119e-05, |
|
"loss": 0.3909, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.7538461538461538, |
|
"eval_loss": 0.7737441062927246, |
|
"eval_runtime": 1.1644, |
|
"eval_samples_per_second": 97.908, |
|
"eval_steps_per_second": 5.153, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.7846153846153845, |
|
"grad_norm": 1.3416839838027954, |
|
"learning_rate": 1.8575929692408105e-05, |
|
"loss": 0.3585, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.7846153846153845, |
|
"eval_loss": 0.7757642269134521, |
|
"eval_runtime": 1.1707, |
|
"eval_samples_per_second": 97.376, |
|
"eval_steps_per_second": 5.125, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.8153846153846154, |
|
"grad_norm": 2.5810840129852295, |
|
"learning_rate": 1.8495987205023832e-05, |
|
"loss": 0.4018, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8153846153846154, |
|
"eval_loss": 0.7671994566917419, |
|
"eval_runtime": 1.1677, |
|
"eval_samples_per_second": 97.627, |
|
"eval_steps_per_second": 5.138, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 2.186009407043457, |
|
"learning_rate": 1.8414043614905782e-05, |
|
"loss": 0.3936, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"eval_loss": 0.7763370871543884, |
|
"eval_runtime": 1.1743, |
|
"eval_samples_per_second": 97.078, |
|
"eval_steps_per_second": 5.109, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.876923076923077, |
|
"grad_norm": 2.5505406856536865, |
|
"learning_rate": 1.8330118222641192e-05, |
|
"loss": 0.4963, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.876923076923077, |
|
"eval_loss": 0.7642151713371277, |
|
"eval_runtime": 1.1648, |
|
"eval_samples_per_second": 97.867, |
|
"eval_steps_per_second": 5.151, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.9076923076923076, |
|
"grad_norm": 2.2109644412994385, |
|
"learning_rate": 1.824423079560116e-05, |
|
"loss": 0.4821, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.9076923076923076, |
|
"eval_loss": 0.7671634554862976, |
|
"eval_runtime": 1.1645, |
|
"eval_samples_per_second": 97.898, |
|
"eval_steps_per_second": 5.153, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.9384615384615385, |
|
"grad_norm": 2.702033519744873, |
|
"learning_rate": 1.8156401563284724e-05, |
|
"loss": 0.4216, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.9384615384615385, |
|
"eval_loss": 0.7776830792427063, |
|
"eval_runtime": 1.1704, |
|
"eval_samples_per_second": 97.399, |
|
"eval_steps_per_second": 5.126, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.9692307692307693, |
|
"grad_norm": 2.2329928874969482, |
|
"learning_rate": 1.8066651212554126e-05, |
|
"loss": 0.4422, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.9692307692307693, |
|
"eval_loss": 0.7678297162055969, |
|
"eval_runtime": 1.171, |
|
"eval_samples_per_second": 97.351, |
|
"eval_steps_per_second": 5.124, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.3216522932052612, |
|
"learning_rate": 1.797500088276232e-05, |
|
"loss": 0.3888, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7601237297058105, |
|
"eval_runtime": 1.1955, |
|
"eval_samples_per_second": 95.354, |
|
"eval_steps_per_second": 5.019, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.0307692307692307, |
|
"grad_norm": 1.9767566919326782, |
|
"learning_rate": 1.7881472160773912e-05, |
|
"loss": 0.2158, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.0307692307692307, |
|
"eval_loss": 0.8567830920219421, |
|
"eval_runtime": 1.1711, |
|
"eval_samples_per_second": 97.344, |
|
"eval_steps_per_second": 5.123, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.0615384615384613, |
|
"grad_norm": 1.724826693534851, |
|
"learning_rate": 1.7786087075880698e-05, |
|
"loss": 0.1589, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.0615384615384613, |
|
"eval_loss": 0.8708633780479431, |
|
"eval_runtime": 1.1707, |
|
"eval_samples_per_second": 97.379, |
|
"eval_steps_per_second": 5.125, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.0923076923076924, |
|
"grad_norm": 1.9074368476867676, |
|
"learning_rate": 1.7688868094613e-05, |
|
"loss": 0.1974, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.0923076923076924, |
|
"eval_loss": 0.8557892441749573, |
|
"eval_runtime": 1.1707, |
|
"eval_samples_per_second": 97.382, |
|
"eval_steps_per_second": 5.125, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.123076923076923, |
|
"grad_norm": 1.2026904821395874, |
|
"learning_rate": 1.7589838115448005e-05, |
|
"loss": 0.1832, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.123076923076923, |
|
"eval_loss": 0.861406147480011, |
|
"eval_runtime": 1.1692, |
|
"eval_samples_per_second": 97.502, |
|
"eval_steps_per_second": 5.132, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 1.541394829750061, |
|
"learning_rate": 1.748902046341637e-05, |
|
"loss": 0.1835, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"eval_loss": 0.8678261041641235, |
|
"eval_runtime": 1.1824, |
|
"eval_samples_per_second": 96.41, |
|
"eval_steps_per_second": 5.074, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.184615384615385, |
|
"grad_norm": 2.980255126953125, |
|
"learning_rate": 1.7386438884608366e-05, |
|
"loss": 0.1555, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.184615384615385, |
|
"eval_loss": 0.8728282451629639, |
|
"eval_runtime": 1.1698, |
|
"eval_samples_per_second": 97.454, |
|
"eval_steps_per_second": 5.129, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.2153846153846155, |
|
"grad_norm": 1.7097492218017578, |
|
"learning_rate": 1.7282117540580833e-05, |
|
"loss": 0.1789, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.2153846153846155, |
|
"eval_loss": 0.8826200366020203, |
|
"eval_runtime": 1.1687, |
|
"eval_samples_per_second": 97.541, |
|
"eval_steps_per_second": 5.134, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.246153846153846, |
|
"grad_norm": 2.0315046310424805, |
|
"learning_rate": 1.7176081002666295e-05, |
|
"loss": 0.1825, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.246153846153846, |
|
"eval_loss": 0.8737024068832397, |
|
"eval_runtime": 1.1727, |
|
"eval_samples_per_second": 97.211, |
|
"eval_steps_per_second": 5.116, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.276923076923077, |
|
"grad_norm": 1.5558000802993774, |
|
"learning_rate": 1.706835424618555e-05, |
|
"loss": 0.1906, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.276923076923077, |
|
"eval_loss": 0.8740295767784119, |
|
"eval_runtime": 1.1746, |
|
"eval_samples_per_second": 97.054, |
|
"eval_steps_per_second": 5.108, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 2.007802724838257, |
|
"learning_rate": 1.695896264456509e-05, |
|
"loss": 0.2171, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"eval_loss": 0.8841921091079712, |
|
"eval_runtime": 1.1694, |
|
"eval_samples_per_second": 97.487, |
|
"eval_steps_per_second": 5.131, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.3384615384615386, |
|
"grad_norm": 2.0817086696624756, |
|
"learning_rate": 1.6847931963360796e-05, |
|
"loss": 0.1993, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.3384615384615386, |
|
"eval_loss": 0.8875246644020081, |
|
"eval_runtime": 1.1683, |
|
"eval_samples_per_second": 97.576, |
|
"eval_steps_per_second": 5.136, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.3692307692307693, |
|
"grad_norm": 1.752740740776062, |
|
"learning_rate": 1.6735288354189225e-05, |
|
"loss": 0.1928, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.3692307692307693, |
|
"eval_loss": 0.8896489143371582, |
|
"eval_runtime": 1.1654, |
|
"eval_samples_per_second": 97.821, |
|
"eval_steps_per_second": 5.148, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.6042486429214478, |
|
"learning_rate": 1.6621058348568008e-05, |
|
"loss": 0.2059, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"eval_loss": 0.8732131719589233, |
|
"eval_runtime": 1.1715, |
|
"eval_samples_per_second": 97.315, |
|
"eval_steps_per_second": 5.122, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.430769230769231, |
|
"grad_norm": 2.1651504039764404, |
|
"learning_rate": 1.6505268851666717e-05, |
|
"loss": 0.2101, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.430769230769231, |
|
"eval_loss": 0.8754842877388, |
|
"eval_runtime": 1.1681, |
|
"eval_samples_per_second": 97.595, |
|
"eval_steps_per_second": 5.137, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 2.9434351921081543, |
|
"learning_rate": 1.6387947135969796e-05, |
|
"loss": 0.2305, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"eval_loss": 0.9017049074172974, |
|
"eval_runtime": 1.1681, |
|
"eval_samples_per_second": 97.598, |
|
"eval_steps_per_second": 5.137, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.4923076923076923, |
|
"grad_norm": 1.2757177352905273, |
|
"learning_rate": 1.6269120834852892e-05, |
|
"loss": 0.1837, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.4923076923076923, |
|
"eval_loss": 0.8789340853691101, |
|
"eval_runtime": 1.1681, |
|
"eval_samples_per_second": 97.593, |
|
"eval_steps_per_second": 5.136, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.523076923076923, |
|
"grad_norm": 2.2374846935272217, |
|
"learning_rate": 1.6148817936074267e-05, |
|
"loss": 0.1846, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.523076923076923, |
|
"eval_loss": 0.8868066668510437, |
|
"eval_runtime": 1.1703, |
|
"eval_samples_per_second": 97.411, |
|
"eval_steps_per_second": 5.127, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.5538461538461537, |
|
"grad_norm": 2.112977981567383, |
|
"learning_rate": 1.6027066775182664e-05, |
|
"loss": 0.215, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.5538461538461537, |
|
"eval_loss": 0.8842012286186218, |
|
"eval_runtime": 1.168, |
|
"eval_samples_per_second": 97.604, |
|
"eval_steps_per_second": 5.137, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.5846153846153848, |
|
"grad_norm": 1.7254525423049927, |
|
"learning_rate": 1.5903896028843316e-05, |
|
"loss": 0.2021, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.5846153846153848, |
|
"eval_loss": 0.8752718567848206, |
|
"eval_runtime": 1.172, |
|
"eval_samples_per_second": 97.267, |
|
"eval_steps_per_second": 5.119, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 2.7533833980560303, |
|
"learning_rate": 1.5779334708083585e-05, |
|
"loss": 0.2087, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"eval_loss": 0.881074070930481, |
|
"eval_runtime": 1.1742, |
|
"eval_samples_per_second": 97.086, |
|
"eval_steps_per_second": 5.11, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.646153846153846, |
|
"grad_norm": 1.364652395248413, |
|
"learning_rate": 1.565341215145983e-05, |
|
"loss": 0.205, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.646153846153846, |
|
"eval_loss": 0.8765241503715515, |
|
"eval_runtime": 1.1685, |
|
"eval_samples_per_second": 97.561, |
|
"eval_steps_per_second": 5.135, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.676923076923077, |
|
"grad_norm": 2.8074488639831543, |
|
"learning_rate": 1.5526158018147168e-05, |
|
"loss": 0.1872, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.676923076923077, |
|
"eval_loss": 0.8835176229476929, |
|
"eval_runtime": 1.1654, |
|
"eval_samples_per_second": 97.823, |
|
"eval_steps_per_second": 5.149, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.707692307692308, |
|
"grad_norm": 1.8859968185424805, |
|
"learning_rate": 1.5397602280953695e-05, |
|
"loss": 0.197, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.707692307692308, |
|
"eval_loss": 0.8719732761383057, |
|
"eval_runtime": 1.1708, |
|
"eval_samples_per_second": 97.368, |
|
"eval_steps_per_second": 5.125, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.7384615384615385, |
|
"grad_norm": 1.2429983615875244, |
|
"learning_rate": 1.526777521926084e-05, |
|
"loss": 0.1932, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.7384615384615385, |
|
"eval_loss": 0.8760843276977539, |
|
"eval_runtime": 1.1705, |
|
"eval_samples_per_second": 97.392, |
|
"eval_steps_per_second": 5.126, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 1.7974516153335571, |
|
"learning_rate": 1.5136707411891483e-05, |
|
"loss": 0.2008, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"eval_loss": 0.8759164214134216, |
|
"eval_runtime": 1.1739, |
|
"eval_samples_per_second": 97.116, |
|
"eval_steps_per_second": 5.111, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.4502002000808716, |
|
"learning_rate": 1.5004429729907619e-05, |
|
"loss": 0.1998, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_loss": 0.8729867935180664, |
|
"eval_runtime": 1.1659, |
|
"eval_samples_per_second": 97.775, |
|
"eval_steps_per_second": 5.146, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.830769230769231, |
|
"grad_norm": 1.3150732517242432, |
|
"learning_rate": 1.4870973329339112e-05, |
|
"loss": 0.1936, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.830769230769231, |
|
"eval_loss": 0.8798808455467224, |
|
"eval_runtime": 1.1664, |
|
"eval_samples_per_second": 97.737, |
|
"eval_steps_per_second": 5.144, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.8615384615384616, |
|
"grad_norm": 1.8317633867263794, |
|
"learning_rate": 1.4736369643845346e-05, |
|
"loss": 0.1951, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.8615384615384616, |
|
"eval_loss": 0.900560736656189, |
|
"eval_runtime": 1.1713, |
|
"eval_samples_per_second": 97.324, |
|
"eval_steps_per_second": 5.122, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.8923076923076922, |
|
"grad_norm": 2.284203290939331, |
|
"learning_rate": 1.4600650377311523e-05, |
|
"loss": 0.1884, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.8923076923076922, |
|
"eval_loss": 0.8905934691429138, |
|
"eval_runtime": 1.1697, |
|
"eval_samples_per_second": 97.462, |
|
"eval_steps_per_second": 5.13, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 1.9384973049163818, |
|
"learning_rate": 1.446384749638128e-05, |
|
"loss": 0.1881, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"eval_loss": 0.8698312640190125, |
|
"eval_runtime": 1.1727, |
|
"eval_samples_per_second": 97.213, |
|
"eval_steps_per_second": 5.116, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.953846153846154, |
|
"grad_norm": 1.7943904399871826, |
|
"learning_rate": 1.4325993222927414e-05, |
|
"loss": 0.2166, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.953846153846154, |
|
"eval_loss": 0.8718281984329224, |
|
"eval_runtime": 1.1666, |
|
"eval_samples_per_second": 97.724, |
|
"eval_steps_per_second": 5.143, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.9846153846153847, |
|
"grad_norm": 2.0599663257598877, |
|
"learning_rate": 1.4187120026462508e-05, |
|
"loss": 0.2082, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.9846153846153847, |
|
"eval_loss": 0.8722580075263977, |
|
"eval_runtime": 1.1673, |
|
"eval_samples_per_second": 97.658, |
|
"eval_steps_per_second": 5.14, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.0153846153846153, |
|
"grad_norm": 1.2260268926620483, |
|
"learning_rate": 1.4047260616491225e-05, |
|
"loss": 0.1631, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.0153846153846153, |
|
"eval_loss": 0.8925275206565857, |
|
"eval_runtime": 1.1687, |
|
"eval_samples_per_second": 97.541, |
|
"eval_steps_per_second": 5.134, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.046153846153846, |
|
"grad_norm": 1.544405460357666, |
|
"learning_rate": 1.3906447934806074e-05, |
|
"loss": 0.1024, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.046153846153846, |
|
"eval_loss": 0.9651603102684021, |
|
"eval_runtime": 1.1687, |
|
"eval_samples_per_second": 97.54, |
|
"eval_steps_per_second": 5.134, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 1.8806118965148926, |
|
"learning_rate": 1.3764715147728451e-05, |
|
"loss": 0.1111, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"eval_loss": 0.9773316383361816, |
|
"eval_runtime": 1.1719, |
|
"eval_samples_per_second": 97.274, |
|
"eval_steps_per_second": 5.12, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.1076923076923078, |
|
"grad_norm": 1.2032195329666138, |
|
"learning_rate": 1.3622095638296827e-05, |
|
"loss": 0.1011, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.1076923076923078, |
|
"eval_loss": 0.9423761367797852, |
|
"eval_runtime": 1.1686, |
|
"eval_samples_per_second": 97.556, |
|
"eval_steps_per_second": 5.135, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.1384615384615384, |
|
"grad_norm": 1.2994767427444458, |
|
"learning_rate": 1.3478622998403861e-05, |
|
"loss": 0.1078, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.1384615384615384, |
|
"eval_loss": 0.9416558146476746, |
|
"eval_runtime": 1.1698, |
|
"eval_samples_per_second": 97.455, |
|
"eval_steps_per_second": 5.129, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.169230769230769, |
|
"grad_norm": 1.061914324760437, |
|
"learning_rate": 1.3334331020884328e-05, |
|
"loss": 0.0991, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.169230769230769, |
|
"eval_loss": 0.9638768434524536, |
|
"eval_runtime": 1.1664, |
|
"eval_samples_per_second": 97.741, |
|
"eval_steps_per_second": 5.144, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.4495635032653809, |
|
"learning_rate": 1.318925369155574e-05, |
|
"loss": 0.0979, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"eval_loss": 0.9724640846252441, |
|
"eval_runtime": 1.1718, |
|
"eval_samples_per_second": 97.282, |
|
"eval_steps_per_second": 5.12, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.230769230769231, |
|
"grad_norm": 1.5616148710250854, |
|
"learning_rate": 1.3043425181213471e-05, |
|
"loss": 0.1109, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.230769230769231, |
|
"eval_loss": 0.972594141960144, |
|
"eval_runtime": 1.169, |
|
"eval_samples_per_second": 97.515, |
|
"eval_steps_per_second": 5.132, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.2615384615384615, |
|
"grad_norm": 2.1167333126068115, |
|
"learning_rate": 1.2896879837582356e-05, |
|
"loss": 0.1047, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.2615384615384615, |
|
"eval_loss": 0.9514709711074829, |
|
"eval_runtime": 1.1679, |
|
"eval_samples_per_second": 97.612, |
|
"eval_steps_per_second": 5.137, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.292307692307692, |
|
"grad_norm": 1.5117723941802979, |
|
"learning_rate": 1.2749652177226592e-05, |
|
"loss": 0.1075, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.292307692307692, |
|
"eval_loss": 0.9520531296730042, |
|
"eval_runtime": 1.1746, |
|
"eval_samples_per_second": 97.058, |
|
"eval_steps_per_second": 5.108, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.3230769230769233, |
|
"grad_norm": 1.231341004371643, |
|
"learning_rate": 1.2601776877419876e-05, |
|
"loss": 0.1021, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.3230769230769233, |
|
"eval_loss": 0.9573836922645569, |
|
"eval_runtime": 1.1666, |
|
"eval_samples_per_second": 97.723, |
|
"eval_steps_per_second": 5.143, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.353846153846154, |
|
"grad_norm": 1.5957950353622437, |
|
"learning_rate": 1.2453288767977686e-05, |
|
"loss": 0.1069, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.353846153846154, |
|
"eval_loss": 0.9602900147438049, |
|
"eval_runtime": 1.169, |
|
"eval_samples_per_second": 97.519, |
|
"eval_steps_per_second": 5.133, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"grad_norm": 0.7704668641090393, |
|
"learning_rate": 1.2304222823053653e-05, |
|
"loss": 0.0969, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"eval_loss": 0.9564027190208435, |
|
"eval_runtime": 1.1725, |
|
"eval_samples_per_second": 97.232, |
|
"eval_steps_per_second": 5.117, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.4153846153846152, |
|
"grad_norm": 1.638713002204895, |
|
"learning_rate": 1.2154614152901916e-05, |
|
"loss": 0.1136, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.4153846153846152, |
|
"eval_loss": 0.9655954241752625, |
|
"eval_runtime": 1.1671, |
|
"eval_samples_per_second": 97.68, |
|
"eval_steps_per_second": 5.141, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.4461538461538463, |
|
"grad_norm": 1.2840876579284668, |
|
"learning_rate": 1.2004497995607415e-05, |
|
"loss": 0.1097, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.4461538461538463, |
|
"eval_loss": 0.9535898566246033, |
|
"eval_runtime": 1.1708, |
|
"eval_samples_per_second": 97.368, |
|
"eval_steps_per_second": 5.125, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.476923076923077, |
|
"grad_norm": 1.021537184715271, |
|
"learning_rate": 1.1853909708786111e-05, |
|
"loss": 0.1048, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.476923076923077, |
|
"eval_loss": 0.952057421207428, |
|
"eval_runtime": 1.1712, |
|
"eval_samples_per_second": 97.336, |
|
"eval_steps_per_second": 5.123, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.5076923076923077, |
|
"grad_norm": 2.217410087585449, |
|
"learning_rate": 1.1702884761257003e-05, |
|
"loss": 0.118, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.5076923076923077, |
|
"eval_loss": 0.9404497146606445, |
|
"eval_runtime": 1.1731, |
|
"eval_samples_per_second": 97.178, |
|
"eval_steps_per_second": 5.115, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"grad_norm": 1.5386874675750732, |
|
"learning_rate": 1.1551458724688e-05, |
|
"loss": 0.1053, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"eval_loss": 0.9536699056625366, |
|
"eval_runtime": 1.1718, |
|
"eval_samples_per_second": 97.284, |
|
"eval_steps_per_second": 5.12, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.569230769230769, |
|
"grad_norm": 0.8064364790916443, |
|
"learning_rate": 1.1399667265217522e-05, |
|
"loss": 0.1061, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.569230769230769, |
|
"eval_loss": 0.9544848799705505, |
|
"eval_runtime": 1.1669, |
|
"eval_samples_per_second": 97.693, |
|
"eval_steps_per_second": 5.142, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.917335569858551, |
|
"learning_rate": 1.1247546135053904e-05, |
|
"loss": 0.102, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"eval_loss": 0.9585210084915161, |
|
"eval_runtime": 1.17, |
|
"eval_samples_per_second": 97.434, |
|
"eval_steps_per_second": 5.128, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.6307692307692307, |
|
"grad_norm": 1.0991976261138916, |
|
"learning_rate": 1.1095131164054476e-05, |
|
"loss": 0.1072, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.6307692307692307, |
|
"eval_loss": 0.9673210978507996, |
|
"eval_runtime": 1.1692, |
|
"eval_samples_per_second": 97.5, |
|
"eval_steps_per_second": 5.132, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.6615384615384614, |
|
"grad_norm": 1.2590489387512207, |
|
"learning_rate": 1.0942458251286384e-05, |
|
"loss": 0.1007, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.6615384615384614, |
|
"eval_loss": 0.9589065313339233, |
|
"eval_runtime": 1.1693, |
|
"eval_samples_per_second": 97.491, |
|
"eval_steps_per_second": 5.131, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"grad_norm": 0.555060625076294, |
|
"learning_rate": 1.078956335657109e-05, |
|
"loss": 0.1045, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"eval_loss": 0.9449043273925781, |
|
"eval_runtime": 1.1697, |
|
"eval_samples_per_second": 97.457, |
|
"eval_steps_per_second": 5.129, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.723076923076923, |
|
"grad_norm": 1.4285695552825928, |
|
"learning_rate": 1.0636482492014603e-05, |
|
"loss": 0.1042, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.723076923076923, |
|
"eval_loss": 0.9505679607391357, |
|
"eval_runtime": 1.1646, |
|
"eval_samples_per_second": 97.886, |
|
"eval_steps_per_second": 5.152, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.753846153846154, |
|
"grad_norm": 1.0818232297897339, |
|
"learning_rate": 1.0483251713525335e-05, |
|
"loss": 0.1066, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.753846153846154, |
|
"eval_loss": 0.958114743232727, |
|
"eval_runtime": 1.1679, |
|
"eval_samples_per_second": 97.609, |
|
"eval_steps_per_second": 5.137, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.7846153846153845, |
|
"grad_norm": 1.027013897895813, |
|
"learning_rate": 1.0329907112321685e-05, |
|
"loss": 0.1103, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.7846153846153845, |
|
"eval_loss": 0.9584141969680786, |
|
"eval_runtime": 1.1717, |
|
"eval_samples_per_second": 97.292, |
|
"eval_steps_per_second": 5.121, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.815384615384615, |
|
"grad_norm": 1.2633482217788696, |
|
"learning_rate": 1.0176484806431288e-05, |
|
"loss": 0.1135, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.815384615384615, |
|
"eval_loss": 0.9456351399421692, |
|
"eval_runtime": 1.1713, |
|
"eval_samples_per_second": 97.325, |
|
"eval_steps_per_second": 5.122, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 1.5647493600845337, |
|
"learning_rate": 1.002302093218396e-05, |
|
"loss": 0.102, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"eval_loss": 0.9635660648345947, |
|
"eval_runtime": 1.1695, |
|
"eval_samples_per_second": 97.479, |
|
"eval_steps_per_second": 5.13, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.876923076923077, |
|
"grad_norm": 1.1846381425857544, |
|
"learning_rate": 9.869551635700321e-06, |
|
"loss": 0.0996, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.876923076923077, |
|
"eval_loss": 0.9652856588363647, |
|
"eval_runtime": 1.1667, |
|
"eval_samples_per_second": 97.709, |
|
"eval_steps_per_second": 5.143, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.9076923076923076, |
|
"grad_norm": 1.5169893503189087, |
|
"learning_rate": 9.716113064378113e-06, |
|
"loss": 0.0995, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.9076923076923076, |
|
"eval_loss": 0.9649413228034973, |
|
"eval_runtime": 1.1669, |
|
"eval_samples_per_second": 97.691, |
|
"eval_steps_per_second": 5.142, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.9384615384615387, |
|
"grad_norm": 1.013720154762268, |
|
"learning_rate": 9.562741358378239e-06, |
|
"loss": 0.1124, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.9384615384615387, |
|
"eval_loss": 0.9643102288246155, |
|
"eval_runtime": 1.1719, |
|
"eval_samples_per_second": 97.276, |
|
"eval_steps_per_second": 5.12, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.9692307692307693, |
|
"grad_norm": 1.1078283786773682, |
|
"learning_rate": 9.409472642112454e-06, |
|
"loss": 0.096, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 3.9692307692307693, |
|
"eval_loss": 0.9525210857391357, |
|
"eval_runtime": 1.1921, |
|
"eval_samples_per_second": 95.632, |
|
"eval_steps_per_second": 5.033, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.2566571235656738, |
|
"learning_rate": 9.256343015734842e-06, |
|
"loss": 0.1133, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.9479546546936035, |
|
"eval_runtime": 1.1715, |
|
"eval_samples_per_second": 97.308, |
|
"eval_steps_per_second": 5.121, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.030769230769231, |
|
"grad_norm": 1.1318272352218628, |
|
"learning_rate": 9.103388546638929e-06, |
|
"loss": 0.069, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.030769230769231, |
|
"eval_loss": 0.966812789440155, |
|
"eval_runtime": 1.1672, |
|
"eval_samples_per_second": 97.667, |
|
"eval_steps_per_second": 5.14, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.061538461538461, |
|
"grad_norm": 0.9845967292785645, |
|
"learning_rate": 8.950645260962572e-06, |
|
"loss": 0.0716, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.061538461538461, |
|
"eval_loss": 1.0023221969604492, |
|
"eval_runtime": 1.17, |
|
"eval_samples_per_second": 97.437, |
|
"eval_steps_per_second": 5.128, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.092307692307692, |
|
"grad_norm": 0.5891923904418945, |
|
"learning_rate": 8.798149135102528e-06, |
|
"loss": 0.0637, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 4.092307692307692, |
|
"eval_loss": 1.0226399898529053, |
|
"eval_runtime": 1.1693, |
|
"eval_samples_per_second": 97.493, |
|
"eval_steps_per_second": 5.131, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 4.123076923076923, |
|
"grad_norm": 0.9881710410118103, |
|
"learning_rate": 8.645936087240758e-06, |
|
"loss": 0.0695, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.123076923076923, |
|
"eval_loss": 1.0229477882385254, |
|
"eval_runtime": 1.1678, |
|
"eval_samples_per_second": 97.615, |
|
"eval_steps_per_second": 5.138, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.153846153846154, |
|
"grad_norm": 0.47998377680778503, |
|
"learning_rate": 8.494041968884423e-06, |
|
"loss": 0.0628, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 4.153846153846154, |
|
"eval_loss": 1.0193408727645874, |
|
"eval_runtime": 1.1721, |
|
"eval_samples_per_second": 97.261, |
|
"eval_steps_per_second": 5.119, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 4.184615384615385, |
|
"grad_norm": 0.6444534063339233, |
|
"learning_rate": 8.342502556421627e-06, |
|
"loss": 0.0711, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 4.184615384615385, |
|
"eval_loss": 1.0234897136688232, |
|
"eval_runtime": 1.1655, |
|
"eval_samples_per_second": 97.808, |
|
"eval_steps_per_second": 5.148, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 4.2153846153846155, |
|
"grad_norm": 1.1395206451416016, |
|
"learning_rate": 8.19135354269479e-06, |
|
"loss": 0.0662, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 4.2153846153846155, |
|
"eval_loss": 1.027916669845581, |
|
"eval_runtime": 1.1668, |
|
"eval_samples_per_second": 97.702, |
|
"eval_steps_per_second": 5.142, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 4.246153846153846, |
|
"grad_norm": 0.6894501447677612, |
|
"learning_rate": 8.040630528593753e-06, |
|
"loss": 0.0748, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.246153846153846, |
|
"eval_loss": 1.0266273021697998, |
|
"eval_runtime": 1.1674, |
|
"eval_samples_per_second": 97.65, |
|
"eval_steps_per_second": 5.139, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.276923076923077, |
|
"grad_norm": 0.6127138137817383, |
|
"learning_rate": 7.890369014670512e-06, |
|
"loss": 0.0571, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 4.276923076923077, |
|
"eval_loss": 1.0378963947296143, |
|
"eval_runtime": 1.169, |
|
"eval_samples_per_second": 97.523, |
|
"eval_steps_per_second": 5.133, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 1.168372392654419, |
|
"learning_rate": 7.740604392777612e-06, |
|
"loss": 0.067, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"eval_loss": 1.0480735301971436, |
|
"eval_runtime": 1.1695, |
|
"eval_samples_per_second": 97.475, |
|
"eval_steps_per_second": 5.13, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.338461538461538, |
|
"grad_norm": 0.5983268022537231, |
|
"learning_rate": 7.591371937732091e-06, |
|
"loss": 0.0691, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 4.338461538461538, |
|
"eval_loss": 1.0469272136688232, |
|
"eval_runtime": 1.1663, |
|
"eval_samples_per_second": 97.748, |
|
"eval_steps_per_second": 5.145, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 4.36923076923077, |
|
"grad_norm": 0.7790025472640991, |
|
"learning_rate": 7.442706799007056e-06, |
|
"loss": 0.0711, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.36923076923077, |
|
"eval_loss": 1.0372542142868042, |
|
"eval_runtime": 1.1684, |
|
"eval_samples_per_second": 97.571, |
|
"eval_steps_per_second": 5.135, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.0238187313079834, |
|
"learning_rate": 7.294643992452735e-06, |
|
"loss": 0.0737, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"eval_loss": 1.03952157497406, |
|
"eval_runtime": 1.1679, |
|
"eval_samples_per_second": 97.615, |
|
"eval_steps_per_second": 5.138, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 4.430769230769231, |
|
"grad_norm": 1.3187087774276733, |
|
"learning_rate": 7.147218392049026e-06, |
|
"loss": 0.0673, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.430769230769231, |
|
"eval_loss": 1.036614179611206, |
|
"eval_runtime": 1.169, |
|
"eval_samples_per_second": 97.522, |
|
"eval_steps_per_second": 5.133, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.461538461538462, |
|
"grad_norm": 1.0066938400268555, |
|
"learning_rate": 7.000464721691438e-06, |
|
"loss": 0.0671, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 4.461538461538462, |
|
"eval_loss": 1.033462643623352, |
|
"eval_runtime": 1.1706, |
|
"eval_samples_per_second": 97.388, |
|
"eval_steps_per_second": 5.126, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 4.492307692307692, |
|
"grad_norm": 0.8664164543151855, |
|
"learning_rate": 6.854417547012415e-06, |
|
"loss": 0.0637, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.492307692307692, |
|
"eval_loss": 1.0385520458221436, |
|
"eval_runtime": 1.1667, |
|
"eval_samples_per_second": 97.708, |
|
"eval_steps_per_second": 5.143, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.523076923076923, |
|
"grad_norm": 0.41295212507247925, |
|
"learning_rate": 6.7091112672399e-06, |
|
"loss": 0.0626, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 4.523076923076923, |
|
"eval_loss": 1.0354348421096802, |
|
"eval_runtime": 1.1643, |
|
"eval_samples_per_second": 97.913, |
|
"eval_steps_per_second": 5.153, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 4.553846153846154, |
|
"grad_norm": 1.439942479133606, |
|
"learning_rate": 6.564580107095133e-06, |
|
"loss": 0.0675, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.553846153846154, |
|
"eval_loss": 1.0380290746688843, |
|
"eval_runtime": 1.1702, |
|
"eval_samples_per_second": 97.415, |
|
"eval_steps_per_second": 5.127, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.584615384615384, |
|
"grad_norm": 0.6881715059280396, |
|
"learning_rate": 6.4208581087315035e-06, |
|
"loss": 0.0662, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 4.584615384615384, |
|
"eval_loss": 1.029496192932129, |
|
"eval_runtime": 1.1716, |
|
"eval_samples_per_second": 97.305, |
|
"eval_steps_per_second": 5.121, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 0.5975565314292908, |
|
"learning_rate": 6.277979123716455e-06, |
|
"loss": 0.0628, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"eval_loss": 1.0207117795944214, |
|
"eval_runtime": 1.17, |
|
"eval_samples_per_second": 97.434, |
|
"eval_steps_per_second": 5.128, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.6461538461538465, |
|
"grad_norm": 0.7314721345901489, |
|
"learning_rate": 6.13597680505823e-06, |
|
"loss": 0.0748, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 4.6461538461538465, |
|
"eval_loss": 1.021004557609558, |
|
"eval_runtime": 1.1652, |
|
"eval_samples_per_second": 97.839, |
|
"eval_steps_per_second": 5.149, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 4.676923076923077, |
|
"grad_norm": 1.1508655548095703, |
|
"learning_rate": 5.994884599279443e-06, |
|
"loss": 0.0707, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.676923076923077, |
|
"eval_loss": 1.023290753364563, |
|
"eval_runtime": 1.1651, |
|
"eval_samples_per_second": 97.845, |
|
"eval_steps_per_second": 5.15, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.707692307692308, |
|
"grad_norm": 0.9571641087532043, |
|
"learning_rate": 5.854735738539203e-06, |
|
"loss": 0.0672, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 4.707692307692308, |
|
"eval_loss": 1.026742935180664, |
|
"eval_runtime": 1.1684, |
|
"eval_samples_per_second": 97.565, |
|
"eval_steps_per_second": 5.135, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 4.7384615384615385, |
|
"grad_norm": 0.8185418248176575, |
|
"learning_rate": 5.715563232805825e-06, |
|
"loss": 0.0606, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.7384615384615385, |
|
"eval_loss": 1.032530426979065, |
|
"eval_runtime": 1.1692, |
|
"eval_samples_per_second": 97.5, |
|
"eval_steps_per_second": 5.132, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.769230769230769, |
|
"grad_norm": 0.9479151368141174, |
|
"learning_rate": 5.577399862081789e-06, |
|
"loss": 0.0632, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 4.769230769230769, |
|
"eval_loss": 1.039183497428894, |
|
"eval_runtime": 1.1755, |
|
"eval_samples_per_second": 96.981, |
|
"eval_steps_per_second": 5.104, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.7193753123283386, |
|
"learning_rate": 5.4402781686829184e-06, |
|
"loss": 0.0709, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"eval_loss": 1.0410054922103882, |
|
"eval_runtime": 1.1653, |
|
"eval_samples_per_second": 97.831, |
|
"eval_steps_per_second": 5.149, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.8307692307692305, |
|
"grad_norm": 0.9305492639541626, |
|
"learning_rate": 5.304230449573523e-06, |
|
"loss": 0.0791, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 4.8307692307692305, |
|
"eval_loss": 1.0399616956710815, |
|
"eval_runtime": 1.17, |
|
"eval_samples_per_second": 97.434, |
|
"eval_steps_per_second": 5.128, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 4.861538461538462, |
|
"grad_norm": 0.687660276889801, |
|
"learning_rate": 5.169288748759327e-06, |
|
"loss": 0.0711, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.861538461538462, |
|
"eval_loss": 1.0336360931396484, |
|
"eval_runtime": 1.1698, |
|
"eval_samples_per_second": 97.451, |
|
"eval_steps_per_second": 5.129, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.892307692307693, |
|
"grad_norm": 0.8751097321510315, |
|
"learning_rate": 5.0354848497399865e-06, |
|
"loss": 0.0642, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 4.892307692307693, |
|
"eval_loss": 1.025221824645996, |
|
"eval_runtime": 1.1733, |
|
"eval_samples_per_second": 97.161, |
|
"eval_steps_per_second": 5.114, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 0.4916076958179474, |
|
"learning_rate": 4.902850268022959e-06, |
|
"loss": 0.0686, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"eval_loss": 1.0251351594924927, |
|
"eval_runtime": 1.1702, |
|
"eval_samples_per_second": 97.42, |
|
"eval_steps_per_second": 5.127, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.953846153846154, |
|
"grad_norm": 0.9400696754455566, |
|
"learning_rate": 4.771416243700495e-06, |
|
"loss": 0.0733, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 4.953846153846154, |
|
"eval_loss": 1.0282999277114868, |
|
"eval_runtime": 1.1685, |
|
"eval_samples_per_second": 97.563, |
|
"eval_steps_per_second": 5.135, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 4.984615384615385, |
|
"grad_norm": 0.5670651197433472, |
|
"learning_rate": 4.641213734091507e-06, |
|
"loss": 0.0664, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 4.984615384615385, |
|
"eval_loss": 1.029799461364746, |
|
"eval_runtime": 1.1659, |
|
"eval_samples_per_second": 97.776, |
|
"eval_steps_per_second": 5.146, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 5.015384615384615, |
|
"grad_norm": 0.3696945607662201, |
|
"learning_rate": 4.5122734064500365e-06, |
|
"loss": 0.0547, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 5.015384615384615, |
|
"eval_loss": 1.037708044052124, |
|
"eval_runtime": 1.1669, |
|
"eval_samples_per_second": 97.696, |
|
"eval_steps_per_second": 5.142, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 5.046153846153846, |
|
"grad_norm": 0.6425753831863403, |
|
"learning_rate": 4.384625630742031e-06, |
|
"loss": 0.0465, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 5.046153846153846, |
|
"eval_loss": 1.0521013736724854, |
|
"eval_runtime": 1.1713, |
|
"eval_samples_per_second": 97.324, |
|
"eval_steps_per_second": 5.122, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 5.076923076923077, |
|
"grad_norm": 0.4301888346672058, |
|
"learning_rate": 4.258300472492165e-06, |
|
"loss": 0.0472, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 5.076923076923077, |
|
"eval_loss": 1.0649975538253784, |
|
"eval_runtime": 1.172, |
|
"eval_samples_per_second": 97.273, |
|
"eval_steps_per_second": 5.12, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 5.107692307692307, |
|
"grad_norm": 0.37904953956604004, |
|
"learning_rate": 4.1333276857023515e-06, |
|
"loss": 0.0499, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 5.107692307692307, |
|
"eval_loss": 1.072283148765564, |
|
"eval_runtime": 1.1667, |
|
"eval_samples_per_second": 97.708, |
|
"eval_steps_per_second": 5.143, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 5.138461538461539, |
|
"grad_norm": 0.5074180364608765, |
|
"learning_rate": 4.0097367058436156e-06, |
|
"loss": 0.0468, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 5.138461538461539, |
|
"eval_loss": 1.080543041229248, |
|
"eval_runtime": 1.1689, |
|
"eval_samples_per_second": 97.528, |
|
"eval_steps_per_second": 5.133, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 5.1692307692307695, |
|
"grad_norm": 0.5539130568504333, |
|
"learning_rate": 3.887556642923047e-06, |
|
"loss": 0.0481, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.1692307692307695, |
|
"eval_loss": 1.084172248840332, |
|
"eval_runtime": 1.1733, |
|
"eval_samples_per_second": 97.159, |
|
"eval_steps_per_second": 5.114, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 0.3505866825580597, |
|
"learning_rate": 3.7668162746273283e-06, |
|
"loss": 0.046, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"eval_loss": 1.0881775617599487, |
|
"eval_runtime": 1.1713, |
|
"eval_samples_per_second": 97.327, |
|
"eval_steps_per_second": 5.122, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 5.230769230769231, |
|
"grad_norm": 0.438381552696228, |
|
"learning_rate": 3.647544039544615e-06, |
|
"loss": 0.0488, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.230769230769231, |
|
"eval_loss": 1.0943236351013184, |
|
"eval_runtime": 1.1694, |
|
"eval_samples_per_second": 97.489, |
|
"eval_steps_per_second": 5.131, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.2615384615384615, |
|
"grad_norm": 0.6292533278465271, |
|
"learning_rate": 3.5297680304662374e-06, |
|
"loss": 0.0514, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 5.2615384615384615, |
|
"eval_loss": 1.0977230072021484, |
|
"eval_runtime": 1.1685, |
|
"eval_samples_per_second": 97.562, |
|
"eval_steps_per_second": 5.135, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 5.292307692307692, |
|
"grad_norm": 1.0136582851409912, |
|
"learning_rate": 3.4135159877698633e-06, |
|
"loss": 0.0509, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 5.292307692307692, |
|
"eval_loss": 1.0975443124771118, |
|
"eval_runtime": 1.1684, |
|
"eval_samples_per_second": 97.57, |
|
"eval_steps_per_second": 5.135, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 5.323076923076923, |
|
"grad_norm": 0.5158536434173584, |
|
"learning_rate": 3.29881529288567e-06, |
|
"loss": 0.0421, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 5.323076923076923, |
|
"eval_loss": 1.0977866649627686, |
|
"eval_runtime": 1.1671, |
|
"eval_samples_per_second": 97.68, |
|
"eval_steps_per_second": 5.141, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 5.3538461538461535, |
|
"grad_norm": 0.39954596757888794, |
|
"learning_rate": 3.1856929618470635e-06, |
|
"loss": 0.0439, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.3538461538461535, |
|
"eval_loss": 1.1025538444519043, |
|
"eval_runtime": 1.1755, |
|
"eval_samples_per_second": 96.979, |
|
"eval_steps_per_second": 5.104, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.384615384615385, |
|
"grad_norm": 0.723860502243042, |
|
"learning_rate": 3.0741756389274325e-06, |
|
"loss": 0.056, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 5.384615384615385, |
|
"eval_loss": 1.102371096611023, |
|
"eval_runtime": 1.1735, |
|
"eval_samples_per_second": 97.148, |
|
"eval_steps_per_second": 5.113, |
|
"step": 1750 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2275, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 50, |
|
"total_flos": 1.0842145111932928e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|