{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.3846153846153846, "eval_steps": 10, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006153846153846154, "eval_loss": 1.489196538925171, "eval_runtime": 1.2747, "eval_samples_per_second": 89.432, "eval_steps_per_second": 4.707, "step": 2 }, { "epoch": 0.03076923076923077, "grad_norm": 17.32219123840332, "learning_rate": 8.771929824561404e-07, "loss": 1.5722, "step": 10 }, { "epoch": 0.03076923076923077, "eval_loss": 1.3865292072296143, "eval_runtime": 1.1749, "eval_samples_per_second": 97.029, "eval_steps_per_second": 5.107, "step": 10 }, { "epoch": 0.06153846153846154, "grad_norm": 21.88077735900879, "learning_rate": 1.7543859649122807e-06, "loss": 1.3935, "step": 20 }, { "epoch": 0.06153846153846154, "eval_loss": 1.0996264219284058, "eval_runtime": 1.1706, "eval_samples_per_second": 97.386, "eval_steps_per_second": 5.126, "step": 20 }, { "epoch": 0.09230769230769231, "grad_norm": 17.395177841186523, "learning_rate": 2.631578947368421e-06, "loss": 1.0664, "step": 30 }, { "epoch": 0.09230769230769231, "eval_loss": 0.8006665706634521, "eval_runtime": 1.1747, "eval_samples_per_second": 97.044, "eval_steps_per_second": 5.108, "step": 30 }, { "epoch": 0.12307692307692308, "grad_norm": 6.869284152984619, "learning_rate": 3.5087719298245615e-06, "loss": 0.7994, "step": 40 }, { "epoch": 0.12307692307692308, "eval_loss": 0.708875298500061, "eval_runtime": 1.1742, "eval_samples_per_second": 97.087, "eval_steps_per_second": 5.11, "step": 40 }, { "epoch": 0.15384615384615385, "grad_norm": 6.4737162590026855, "learning_rate": 4.385964912280702e-06, "loss": 0.7751, "step": 50 }, { "epoch": 0.15384615384615385, "eval_loss": 0.6778659224510193, "eval_runtime": 1.1722, "eval_samples_per_second": 97.257, "eval_steps_per_second": 5.119, "step": 50 }, { "epoch": 0.18461538461538463, "grad_norm": 5.418182373046875, "learning_rate": 5.263157894736842e-06, "loss": 0.6203, "step": 60 }, { "epoch": 0.18461538461538463, "eval_loss": 0.6695793271064758, "eval_runtime": 1.1744, "eval_samples_per_second": 97.071, "eval_steps_per_second": 5.109, "step": 60 }, { "epoch": 0.2153846153846154, "grad_norm": 2.7423934936523438, "learning_rate": 6.140350877192983e-06, "loss": 0.767, "step": 70 }, { "epoch": 0.2153846153846154, "eval_loss": 0.6650205850601196, "eval_runtime": 1.1671, "eval_samples_per_second": 97.675, "eval_steps_per_second": 5.141, "step": 70 }, { "epoch": 0.24615384615384617, "grad_norm": 2.572129726409912, "learning_rate": 7.017543859649123e-06, "loss": 0.6336, "step": 80 }, { "epoch": 0.24615384615384617, "eval_loss": 0.6613836884498596, "eval_runtime": 1.1671, "eval_samples_per_second": 97.676, "eval_steps_per_second": 5.141, "step": 80 }, { "epoch": 0.27692307692307694, "grad_norm": 4.622325420379639, "learning_rate": 7.894736842105265e-06, "loss": 0.631, "step": 90 }, { "epoch": 0.27692307692307694, "eval_loss": 0.6617783904075623, "eval_runtime": 1.1744, "eval_samples_per_second": 97.071, "eval_steps_per_second": 5.109, "step": 90 }, { "epoch": 0.3076923076923077, "grad_norm": 3.24354887008667, "learning_rate": 8.771929824561405e-06, "loss": 0.6086, "step": 100 }, { "epoch": 0.3076923076923077, "eval_loss": 0.662232518196106, "eval_runtime": 1.172, "eval_samples_per_second": 97.27, "eval_steps_per_second": 5.119, "step": 100 }, { "epoch": 0.3384615384615385, "grad_norm": 3.5868513584136963, "learning_rate": 9.649122807017545e-06, "loss": 0.7057, "step": 110 }, { "epoch": 0.3384615384615385, "eval_loss": 0.6619779467582703, "eval_runtime": 1.165, "eval_samples_per_second": 97.856, "eval_steps_per_second": 5.15, "step": 110 }, { "epoch": 0.36923076923076925, "grad_norm": 3.4894511699676514, "learning_rate": 1.0526315789473684e-05, "loss": 0.7385, "step": 120 }, { "epoch": 0.36923076923076925, "eval_loss": 0.6673641800880432, "eval_runtime": 1.1682, "eval_samples_per_second": 97.588, "eval_steps_per_second": 5.136, "step": 120 }, { "epoch": 0.4, "grad_norm": 5.117406845092773, "learning_rate": 1.1403508771929826e-05, "loss": 0.6533, "step": 130 }, { "epoch": 0.4, "eval_loss": 0.6705042123794556, "eval_runtime": 1.1726, "eval_samples_per_second": 97.217, "eval_steps_per_second": 5.117, "step": 130 }, { "epoch": 0.4307692307692308, "grad_norm": 3.9341437816619873, "learning_rate": 1.2280701754385966e-05, "loss": 0.7066, "step": 140 }, { "epoch": 0.4307692307692308, "eval_loss": 0.6691470146179199, "eval_runtime": 1.1702, "eval_samples_per_second": 97.418, "eval_steps_per_second": 5.127, "step": 140 }, { "epoch": 0.46153846153846156, "grad_norm": 3.4131579399108887, "learning_rate": 1.3157894736842108e-05, "loss": 0.6065, "step": 150 }, { "epoch": 0.46153846153846156, "eval_loss": 0.6724759340286255, "eval_runtime": 1.1746, "eval_samples_per_second": 97.05, "eval_steps_per_second": 5.108, "step": 150 }, { "epoch": 0.49230769230769234, "grad_norm": 3.820042848587036, "learning_rate": 1.4035087719298246e-05, "loss": 0.6474, "step": 160 }, { "epoch": 0.49230769230769234, "eval_loss": 0.6704264879226685, "eval_runtime": 1.1772, "eval_samples_per_second": 96.838, "eval_steps_per_second": 5.097, "step": 160 }, { "epoch": 0.5230769230769231, "grad_norm": 2.8097360134124756, "learning_rate": 1.4912280701754388e-05, "loss": 0.7022, "step": 170 }, { "epoch": 0.5230769230769231, "eval_loss": 0.6691609025001526, "eval_runtime": 1.1676, "eval_samples_per_second": 97.638, "eval_steps_per_second": 5.139, "step": 170 }, { "epoch": 0.5538461538461539, "grad_norm": 2.3641347885131836, "learning_rate": 1.578947368421053e-05, "loss": 0.6667, "step": 180 }, { "epoch": 0.5538461538461539, "eval_loss": 0.6753159165382385, "eval_runtime": 1.1689, "eval_samples_per_second": 97.524, "eval_steps_per_second": 5.133, "step": 180 }, { "epoch": 0.5846153846153846, "grad_norm": 6.359607219696045, "learning_rate": 1.6666666666666667e-05, "loss": 0.6889, "step": 190 }, { "epoch": 0.5846153846153846, "eval_loss": 0.6796454787254333, "eval_runtime": 1.1714, "eval_samples_per_second": 97.323, "eval_steps_per_second": 5.122, "step": 190 }, { "epoch": 0.6153846153846154, "grad_norm": 4.7979960441589355, "learning_rate": 1.754385964912281e-05, "loss": 0.6937, "step": 200 }, { "epoch": 0.6153846153846154, "eval_loss": 0.6748442649841309, "eval_runtime": 1.1773, "eval_samples_per_second": 96.831, "eval_steps_per_second": 5.096, "step": 200 }, { "epoch": 0.6461538461538462, "grad_norm": 3.5100464820861816, "learning_rate": 1.8421052631578947e-05, "loss": 0.6613, "step": 210 }, { "epoch": 0.6461538461538462, "eval_loss": 0.6842787861824036, "eval_runtime": 1.1662, "eval_samples_per_second": 97.752, "eval_steps_per_second": 5.145, "step": 210 }, { "epoch": 0.676923076923077, "grad_norm": 3.3538153171539307, "learning_rate": 1.929824561403509e-05, "loss": 0.6512, "step": 220 }, { "epoch": 0.676923076923077, "eval_loss": 0.6873583197593689, "eval_runtime": 1.1667, "eval_samples_per_second": 97.712, "eval_steps_per_second": 5.143, "step": 220 }, { "epoch": 0.7076923076923077, "grad_norm": 3.7595653533935547, "learning_rate": 1.9999952892103225e-05, "loss": 0.7656, "step": 230 }, { "epoch": 0.7076923076923077, "eval_loss": 0.6970181465148926, "eval_runtime": 1.1663, "eval_samples_per_second": 97.741, "eval_steps_per_second": 5.144, "step": 230 }, { "epoch": 0.7384615384615385, "grad_norm": 2.9787654876708984, "learning_rate": 1.999830416231782e-05, "loss": 0.6412, "step": 240 }, { "epoch": 0.7384615384615385, "eval_loss": 0.7058220505714417, "eval_runtime": 1.1688, "eval_samples_per_second": 97.537, "eval_steps_per_second": 5.134, "step": 240 }, { "epoch": 0.7692307692307693, "grad_norm": 2.3455305099487305, "learning_rate": 1.9994300481505595e-05, "loss": 0.6148, "step": 250 }, { "epoch": 0.7692307692307693, "eval_loss": 0.7070643305778503, "eval_runtime": 1.1728, "eval_samples_per_second": 97.202, "eval_steps_per_second": 5.116, "step": 250 }, { "epoch": 0.8, "grad_norm": 4.057767868041992, "learning_rate": 1.998794279267369e-05, "loss": 0.6433, "step": 260 }, { "epoch": 0.8, "eval_loss": 0.7127295136451721, "eval_runtime": 1.1689, "eval_samples_per_second": 97.529, "eval_steps_per_second": 5.133, "step": 260 }, { "epoch": 0.8307692307692308, "grad_norm": 2.5686323642730713, "learning_rate": 1.9979232593280637e-05, "loss": 0.5972, "step": 270 }, { "epoch": 0.8307692307692308, "eval_loss": 0.7106388211250305, "eval_runtime": 1.166, "eval_samples_per_second": 97.77, "eval_steps_per_second": 5.146, "step": 270 }, { "epoch": 0.8615384615384616, "grad_norm": 1.2748245000839233, "learning_rate": 1.9968171934883647e-05, "loss": 0.6795, "step": 280 }, { "epoch": 0.8615384615384616, "eval_loss": 0.7127251029014587, "eval_runtime": 1.1701, "eval_samples_per_second": 97.429, "eval_steps_per_second": 5.128, "step": 280 }, { "epoch": 0.8923076923076924, "grad_norm": 3.1174163818359375, "learning_rate": 1.9954763422655396e-05, "loss": 0.6589, "step": 290 }, { "epoch": 0.8923076923076924, "eval_loss": 0.7157883048057556, "eval_runtime": 1.1727, "eval_samples_per_second": 97.211, "eval_steps_per_second": 5.116, "step": 290 }, { "epoch": 0.9230769230769231, "grad_norm": 2.3197476863861084, "learning_rate": 1.9939010214770426e-05, "loss": 0.5539, "step": 300 }, { "epoch": 0.9230769230769231, "eval_loss": 0.7173091173171997, "eval_runtime": 1.1712, "eval_samples_per_second": 97.332, "eval_steps_per_second": 5.123, "step": 300 }, { "epoch": 0.9538461538461539, "grad_norm": 2.7073607444763184, "learning_rate": 1.9920916021661277e-05, "loss": 0.7515, "step": 310 }, { "epoch": 0.9538461538461539, "eval_loss": 0.720274806022644, "eval_runtime": 1.1673, "eval_samples_per_second": 97.665, "eval_steps_per_second": 5.14, "step": 310 }, { "epoch": 0.9846153846153847, "grad_norm": 2.78871488571167, "learning_rate": 1.9900485105144544e-05, "loss": 0.6282, "step": 320 }, { "epoch": 0.9846153846153847, "eval_loss": 0.7291054725646973, "eval_runtime": 1.1713, "eval_samples_per_second": 97.331, "eval_steps_per_second": 5.123, "step": 320 }, { "epoch": 1.0153846153846153, "grad_norm": 2.181795597076416, "learning_rate": 1.9877722277417085e-05, "loss": 0.4711, "step": 330 }, { "epoch": 1.0153846153846153, "eval_loss": 0.7346097826957703, "eval_runtime": 1.1674, "eval_samples_per_second": 97.652, "eval_steps_per_second": 5.14, "step": 330 }, { "epoch": 1.0461538461538462, "grad_norm": 6.42896842956543, "learning_rate": 1.985263289992256e-05, "loss": 0.4595, "step": 340 }, { "epoch": 1.0461538461538462, "eval_loss": 0.7707703709602356, "eval_runtime": 1.1711, "eval_samples_per_second": 97.348, "eval_steps_per_second": 5.124, "step": 340 }, { "epoch": 1.0769230769230769, "grad_norm": 6.206396102905273, "learning_rate": 1.9825222882088647e-05, "loss": 0.3704, "step": 350 }, { "epoch": 1.0769230769230769, "eval_loss": 0.7343254685401917, "eval_runtime": 1.1717, "eval_samples_per_second": 97.298, "eval_steps_per_second": 5.121, "step": 350 }, { "epoch": 1.1076923076923078, "grad_norm": 2.2610416412353516, "learning_rate": 1.9795498679935144e-05, "loss": 0.3845, "step": 360 }, { "epoch": 1.1076923076923078, "eval_loss": 0.7747918963432312, "eval_runtime": 1.1652, "eval_samples_per_second": 97.838, "eval_steps_per_second": 5.149, "step": 360 }, { "epoch": 1.1384615384615384, "grad_norm": 2.490269184112549, "learning_rate": 1.9763467294553364e-05, "loss": 0.3461, "step": 370 }, { "epoch": 1.1384615384615384, "eval_loss": 0.7602437138557434, "eval_runtime": 1.1685, "eval_samples_per_second": 97.563, "eval_steps_per_second": 5.135, "step": 370 }, { "epoch": 1.1692307692307693, "grad_norm": 2.435138702392578, "learning_rate": 1.9729136270457118e-05, "loss": 0.3871, "step": 380 }, { "epoch": 1.1692307692307693, "eval_loss": 0.7585355043411255, "eval_runtime": 1.1702, "eval_samples_per_second": 97.419, "eval_steps_per_second": 5.127, "step": 380 }, { "epoch": 1.2, "grad_norm": 1.649342656135559, "learning_rate": 1.9692513693805738e-05, "loss": 0.345, "step": 390 }, { "epoch": 1.2, "eval_loss": 0.7817304730415344, "eval_runtime": 1.169, "eval_samples_per_second": 97.52, "eval_steps_per_second": 5.133, "step": 390 }, { "epoch": 1.2307692307692308, "grad_norm": 2.2906811237335205, "learning_rate": 1.965360819049948e-05, "loss": 0.4372, "step": 400 }, { "epoch": 1.2307692307692308, "eval_loss": 0.7519503235816956, "eval_runtime": 1.1693, "eval_samples_per_second": 97.495, "eval_steps_per_second": 5.131, "step": 400 }, { "epoch": 1.2615384615384615, "grad_norm": 3.925938606262207, "learning_rate": 1.9612428924147842e-05, "loss": 0.4879, "step": 410 }, { "epoch": 1.2615384615384615, "eval_loss": 0.7463933229446411, "eval_runtime": 1.1712, "eval_samples_per_second": 97.334, "eval_steps_per_second": 5.123, "step": 410 }, { "epoch": 1.2923076923076924, "grad_norm": 1.4445178508758545, "learning_rate": 1.9568985593911206e-05, "loss": 0.4265, "step": 420 }, { "epoch": 1.2923076923076924, "eval_loss": 0.7673818469047546, "eval_runtime": 1.1696, "eval_samples_per_second": 97.471, "eval_steps_per_second": 5.13, "step": 420 }, { "epoch": 1.323076923076923, "grad_norm": 2.707318067550659, "learning_rate": 1.9523288432216333e-05, "loss": 0.4255, "step": 430 }, { "epoch": 1.323076923076923, "eval_loss": 0.776730477809906, "eval_runtime": 1.1686, "eval_samples_per_second": 97.552, "eval_steps_per_second": 5.134, "step": 430 }, { "epoch": 1.353846153846154, "grad_norm": 2.1543335914611816, "learning_rate": 1.9475348202346292e-05, "loss": 0.3992, "step": 440 }, { "epoch": 1.353846153846154, "eval_loss": 0.7587007284164429, "eval_runtime": 1.1741, "eval_samples_per_second": 97.094, "eval_steps_per_second": 5.11, "step": 440 }, { "epoch": 1.3846153846153846, "grad_norm": 2.1207425594329834, "learning_rate": 1.942517619590531e-05, "loss": 0.4197, "step": 450 }, { "epoch": 1.3846153846153846, "eval_loss": 0.7629592418670654, "eval_runtime": 1.1714, "eval_samples_per_second": 97.323, "eval_steps_per_second": 5.122, "step": 450 }, { "epoch": 1.4153846153846155, "grad_norm": 3.0015456676483154, "learning_rate": 1.9372784230159213e-05, "loss": 0.3963, "step": 460 }, { "epoch": 1.4153846153846155, "eval_loss": 0.7871745824813843, "eval_runtime": 1.1671, "eval_samples_per_second": 97.676, "eval_steps_per_second": 5.141, "step": 460 }, { "epoch": 1.4461538461538461, "grad_norm": 2.6770448684692383, "learning_rate": 1.9318184645252037e-05, "loss": 0.3689, "step": 470 }, { "epoch": 1.4461538461538461, "eval_loss": 0.7728400826454163, "eval_runtime": 1.1721, "eval_samples_per_second": 97.262, "eval_steps_per_second": 5.119, "step": 470 }, { "epoch": 1.476923076923077, "grad_norm": 2.6416432857513428, "learning_rate": 1.926139030129951e-05, "loss": 0.4004, "step": 480 }, { "epoch": 1.476923076923077, "eval_loss": 0.7785875797271729, "eval_runtime": 1.1696, "eval_samples_per_second": 97.466, "eval_steps_per_second": 5.13, "step": 480 }, { "epoch": 1.5076923076923077, "grad_norm": 1.8383666276931763, "learning_rate": 1.9202414575360024e-05, "loss": 0.4265, "step": 490 }, { "epoch": 1.5076923076923077, "eval_loss": 0.7735582590103149, "eval_runtime": 1.168, "eval_samples_per_second": 97.604, "eval_steps_per_second": 5.137, "step": 490 }, { "epoch": 1.5384615384615383, "grad_norm": 1.905369758605957, "learning_rate": 1.9141271358283874e-05, "loss": 0.3675, "step": 500 }, { "epoch": 1.5384615384615383, "eval_loss": 0.7821589708328247, "eval_runtime": 1.168, "eval_samples_per_second": 97.603, "eval_steps_per_second": 5.137, "step": 500 }, { "epoch": 1.5692307692307692, "grad_norm": 2.0887465476989746, "learning_rate": 1.9077975051441487e-05, "loss": 0.3769, "step": 510 }, { "epoch": 1.5692307692307692, "eval_loss": 0.8046853542327881, "eval_runtime": 1.168, "eval_samples_per_second": 97.604, "eval_steps_per_second": 5.137, "step": 510 }, { "epoch": 1.6, "grad_norm": 1.3952693939208984, "learning_rate": 1.9012540563331375e-05, "loss": 0.3842, "step": 520 }, { "epoch": 1.6, "eval_loss": 0.7695035338401794, "eval_runtime": 1.1677, "eval_samples_per_second": 97.627, "eval_steps_per_second": 5.138, "step": 520 }, { "epoch": 1.6307692307692307, "grad_norm": 2.4253690242767334, "learning_rate": 1.8944983306068683e-05, "loss": 0.372, "step": 530 }, { "epoch": 1.6307692307692307, "eval_loss": 0.7812924385070801, "eval_runtime": 1.1688, "eval_samples_per_second": 97.535, "eval_steps_per_second": 5.133, "step": 530 }, { "epoch": 1.6615384615384614, "grad_norm": 1.9009945392608643, "learning_rate": 1.8875319191755083e-05, "loss": 0.3825, "step": 540 }, { "epoch": 1.6615384615384614, "eval_loss": 0.7782070636749268, "eval_runtime": 1.1713, "eval_samples_per_second": 97.328, "eval_steps_per_second": 5.123, "step": 540 }, { "epoch": 1.6923076923076923, "grad_norm": 3.183929443359375, "learning_rate": 1.8803564628730916e-05, "loss": 0.396, "step": 550 }, { "epoch": 1.6923076923076923, "eval_loss": 0.7839831709861755, "eval_runtime": 1.1706, "eval_samples_per_second": 97.388, "eval_steps_per_second": 5.126, "step": 550 }, { "epoch": 1.7230769230769232, "grad_norm": 2.031071901321411, "learning_rate": 1.8729736517710454e-05, "loss": 0.3862, "step": 560 }, { "epoch": 1.7230769230769232, "eval_loss": 0.7782466411590576, "eval_runtime": 1.1669, "eval_samples_per_second": 97.695, "eval_steps_per_second": 5.142, "step": 560 }, { "epoch": 1.7538461538461538, "grad_norm": 1.2456035614013672, "learning_rate": 1.865385224780119e-05, "loss": 0.3909, "step": 570 }, { "epoch": 1.7538461538461538, "eval_loss": 0.7737441062927246, "eval_runtime": 1.1644, "eval_samples_per_second": 97.908, "eval_steps_per_second": 5.153, "step": 570 }, { "epoch": 1.7846153846153845, "grad_norm": 1.3416839838027954, "learning_rate": 1.8575929692408105e-05, "loss": 0.3585, "step": 580 }, { "epoch": 1.7846153846153845, "eval_loss": 0.7757642269134521, "eval_runtime": 1.1707, "eval_samples_per_second": 97.376, "eval_steps_per_second": 5.125, "step": 580 }, { "epoch": 1.8153846153846154, "grad_norm": 2.5810840129852295, "learning_rate": 1.8495987205023832e-05, "loss": 0.4018, "step": 590 }, { "epoch": 1.8153846153846154, "eval_loss": 0.7671994566917419, "eval_runtime": 1.1677, "eval_samples_per_second": 97.627, "eval_steps_per_second": 5.138, "step": 590 }, { "epoch": 1.8461538461538463, "grad_norm": 2.186009407043457, "learning_rate": 1.8414043614905782e-05, "loss": 0.3936, "step": 600 }, { "epoch": 1.8461538461538463, "eval_loss": 0.7763370871543884, "eval_runtime": 1.1743, "eval_samples_per_second": 97.078, "eval_steps_per_second": 5.109, "step": 600 }, { "epoch": 1.876923076923077, "grad_norm": 2.5505406856536865, "learning_rate": 1.8330118222641192e-05, "loss": 0.4963, "step": 610 }, { "epoch": 1.876923076923077, "eval_loss": 0.7642151713371277, "eval_runtime": 1.1648, "eval_samples_per_second": 97.867, "eval_steps_per_second": 5.151, "step": 610 }, { "epoch": 1.9076923076923076, "grad_norm": 2.2109644412994385, "learning_rate": 1.824423079560116e-05, "loss": 0.4821, "step": 620 }, { "epoch": 1.9076923076923076, "eval_loss": 0.7671634554862976, "eval_runtime": 1.1645, "eval_samples_per_second": 97.898, "eval_steps_per_second": 5.153, "step": 620 }, { "epoch": 1.9384615384615385, "grad_norm": 2.702033519744873, "learning_rate": 1.8156401563284724e-05, "loss": 0.4216, "step": 630 }, { "epoch": 1.9384615384615385, "eval_loss": 0.7776830792427063, "eval_runtime": 1.1704, "eval_samples_per_second": 97.399, "eval_steps_per_second": 5.126, "step": 630 }, { "epoch": 1.9692307692307693, "grad_norm": 2.2329928874969482, "learning_rate": 1.8066651212554126e-05, "loss": 0.4422, "step": 640 }, { "epoch": 1.9692307692307693, "eval_loss": 0.7678297162055969, "eval_runtime": 1.171, "eval_samples_per_second": 97.351, "eval_steps_per_second": 5.124, "step": 640 }, { "epoch": 2.0, "grad_norm": 1.3216522932052612, "learning_rate": 1.797500088276232e-05, "loss": 0.3888, "step": 650 }, { "epoch": 2.0, "eval_loss": 0.7601237297058105, "eval_runtime": 1.1955, "eval_samples_per_second": 95.354, "eval_steps_per_second": 5.019, "step": 650 }, { "epoch": 2.0307692307692307, "grad_norm": 1.9767566919326782, "learning_rate": 1.7881472160773912e-05, "loss": 0.2158, "step": 660 }, { "epoch": 2.0307692307692307, "eval_loss": 0.8567830920219421, "eval_runtime": 1.1711, "eval_samples_per_second": 97.344, "eval_steps_per_second": 5.123, "step": 660 }, { "epoch": 2.0615384615384613, "grad_norm": 1.724826693534851, "learning_rate": 1.7786087075880698e-05, "loss": 0.1589, "step": 670 }, { "epoch": 2.0615384615384613, "eval_loss": 0.8708633780479431, "eval_runtime": 1.1707, "eval_samples_per_second": 97.379, "eval_steps_per_second": 5.125, "step": 670 }, { "epoch": 2.0923076923076924, "grad_norm": 1.9074368476867676, "learning_rate": 1.7688868094613e-05, "loss": 0.1974, "step": 680 }, { "epoch": 2.0923076923076924, "eval_loss": 0.8557892441749573, "eval_runtime": 1.1707, "eval_samples_per_second": 97.382, "eval_steps_per_second": 5.125, "step": 680 }, { "epoch": 2.123076923076923, "grad_norm": 1.2026904821395874, "learning_rate": 1.7589838115448005e-05, "loss": 0.1832, "step": 690 }, { "epoch": 2.123076923076923, "eval_loss": 0.861406147480011, "eval_runtime": 1.1692, "eval_samples_per_second": 97.502, "eval_steps_per_second": 5.132, "step": 690 }, { "epoch": 2.1538461538461537, "grad_norm": 1.541394829750061, "learning_rate": 1.748902046341637e-05, "loss": 0.1835, "step": 700 }, { "epoch": 2.1538461538461537, "eval_loss": 0.8678261041641235, "eval_runtime": 1.1824, "eval_samples_per_second": 96.41, "eval_steps_per_second": 5.074, "step": 700 }, { "epoch": 2.184615384615385, "grad_norm": 2.980255126953125, "learning_rate": 1.7386438884608366e-05, "loss": 0.1555, "step": 710 }, { "epoch": 2.184615384615385, "eval_loss": 0.8728282451629639, "eval_runtime": 1.1698, "eval_samples_per_second": 97.454, "eval_steps_per_second": 5.129, "step": 710 }, { "epoch": 2.2153846153846155, "grad_norm": 1.7097492218017578, "learning_rate": 1.7282117540580833e-05, "loss": 0.1789, "step": 720 }, { "epoch": 2.2153846153846155, "eval_loss": 0.8826200366020203, "eval_runtime": 1.1687, "eval_samples_per_second": 97.541, "eval_steps_per_second": 5.134, "step": 720 }, { "epoch": 2.246153846153846, "grad_norm": 2.0315046310424805, "learning_rate": 1.7176081002666295e-05, "loss": 0.1825, "step": 730 }, { "epoch": 2.246153846153846, "eval_loss": 0.8737024068832397, "eval_runtime": 1.1727, "eval_samples_per_second": 97.211, "eval_steps_per_second": 5.116, "step": 730 }, { "epoch": 2.276923076923077, "grad_norm": 1.5558000802993774, "learning_rate": 1.706835424618555e-05, "loss": 0.1906, "step": 740 }, { "epoch": 2.276923076923077, "eval_loss": 0.8740295767784119, "eval_runtime": 1.1746, "eval_samples_per_second": 97.054, "eval_steps_per_second": 5.108, "step": 740 }, { "epoch": 2.3076923076923075, "grad_norm": 2.007802724838257, "learning_rate": 1.695896264456509e-05, "loss": 0.2171, "step": 750 }, { "epoch": 2.3076923076923075, "eval_loss": 0.8841921091079712, "eval_runtime": 1.1694, "eval_samples_per_second": 97.487, "eval_steps_per_second": 5.131, "step": 750 }, { "epoch": 2.3384615384615386, "grad_norm": 2.0817086696624756, "learning_rate": 1.6847931963360796e-05, "loss": 0.1993, "step": 760 }, { "epoch": 2.3384615384615386, "eval_loss": 0.8875246644020081, "eval_runtime": 1.1683, "eval_samples_per_second": 97.576, "eval_steps_per_second": 5.136, "step": 760 }, { "epoch": 2.3692307692307693, "grad_norm": 1.752740740776062, "learning_rate": 1.6735288354189225e-05, "loss": 0.1928, "step": 770 }, { "epoch": 2.3692307692307693, "eval_loss": 0.8896489143371582, "eval_runtime": 1.1654, "eval_samples_per_second": 97.821, "eval_steps_per_second": 5.148, "step": 770 }, { "epoch": 2.4, "grad_norm": 1.6042486429214478, "learning_rate": 1.6621058348568008e-05, "loss": 0.2059, "step": 780 }, { "epoch": 2.4, "eval_loss": 0.8732131719589233, "eval_runtime": 1.1715, "eval_samples_per_second": 97.315, "eval_steps_per_second": 5.122, "step": 780 }, { "epoch": 2.430769230769231, "grad_norm": 2.1651504039764404, "learning_rate": 1.6505268851666717e-05, "loss": 0.2101, "step": 790 }, { "epoch": 2.430769230769231, "eval_loss": 0.8754842877388, "eval_runtime": 1.1681, "eval_samples_per_second": 97.595, "eval_steps_per_second": 5.137, "step": 790 }, { "epoch": 2.4615384615384617, "grad_norm": 2.9434351921081543, "learning_rate": 1.6387947135969796e-05, "loss": 0.2305, "step": 800 }, { "epoch": 2.4615384615384617, "eval_loss": 0.9017049074172974, "eval_runtime": 1.1681, "eval_samples_per_second": 97.598, "eval_steps_per_second": 5.137, "step": 800 }, { "epoch": 2.4923076923076923, "grad_norm": 1.2757177352905273, "learning_rate": 1.6269120834852892e-05, "loss": 0.1837, "step": 810 }, { "epoch": 2.4923076923076923, "eval_loss": 0.8789340853691101, "eval_runtime": 1.1681, "eval_samples_per_second": 97.593, "eval_steps_per_second": 5.136, "step": 810 }, { "epoch": 2.523076923076923, "grad_norm": 2.2374846935272217, "learning_rate": 1.6148817936074267e-05, "loss": 0.1846, "step": 820 }, { "epoch": 2.523076923076923, "eval_loss": 0.8868066668510437, "eval_runtime": 1.1703, "eval_samples_per_second": 97.411, "eval_steps_per_second": 5.127, "step": 820 }, { "epoch": 2.5538461538461537, "grad_norm": 2.112977981567383, "learning_rate": 1.6027066775182664e-05, "loss": 0.215, "step": 830 }, { "epoch": 2.5538461538461537, "eval_loss": 0.8842012286186218, "eval_runtime": 1.168, "eval_samples_per_second": 97.604, "eval_steps_per_second": 5.137, "step": 830 }, { "epoch": 2.5846153846153848, "grad_norm": 1.7254525423049927, "learning_rate": 1.5903896028843316e-05, "loss": 0.2021, "step": 840 }, { "epoch": 2.5846153846153848, "eval_loss": 0.8752718567848206, "eval_runtime": 1.172, "eval_samples_per_second": 97.267, "eval_steps_per_second": 5.119, "step": 840 }, { "epoch": 2.6153846153846154, "grad_norm": 2.7533833980560303, "learning_rate": 1.5779334708083585e-05, "loss": 0.2087, "step": 850 }, { "epoch": 2.6153846153846154, "eval_loss": 0.881074070930481, "eval_runtime": 1.1742, "eval_samples_per_second": 97.086, "eval_steps_per_second": 5.11, "step": 850 }, { "epoch": 2.646153846153846, "grad_norm": 1.364652395248413, "learning_rate": 1.565341215145983e-05, "loss": 0.205, "step": 860 }, { "epoch": 2.646153846153846, "eval_loss": 0.8765241503715515, "eval_runtime": 1.1685, "eval_samples_per_second": 97.561, "eval_steps_per_second": 5.135, "step": 860 }, { "epoch": 2.676923076923077, "grad_norm": 2.8074488639831543, "learning_rate": 1.5526158018147168e-05, "loss": 0.1872, "step": 870 }, { "epoch": 2.676923076923077, "eval_loss": 0.8835176229476929, "eval_runtime": 1.1654, "eval_samples_per_second": 97.823, "eval_steps_per_second": 5.149, "step": 870 }, { "epoch": 2.707692307692308, "grad_norm": 1.8859968185424805, "learning_rate": 1.5397602280953695e-05, "loss": 0.197, "step": 880 }, { "epoch": 2.707692307692308, "eval_loss": 0.8719732761383057, "eval_runtime": 1.1708, "eval_samples_per_second": 97.368, "eval_steps_per_second": 5.125, "step": 880 }, { "epoch": 2.7384615384615385, "grad_norm": 1.2429983615875244, "learning_rate": 1.526777521926084e-05, "loss": 0.1932, "step": 890 }, { "epoch": 2.7384615384615385, "eval_loss": 0.8760843276977539, "eval_runtime": 1.1705, "eval_samples_per_second": 97.392, "eval_steps_per_second": 5.126, "step": 890 }, { "epoch": 2.769230769230769, "grad_norm": 1.7974516153335571, "learning_rate": 1.5136707411891483e-05, "loss": 0.2008, "step": 900 }, { "epoch": 2.769230769230769, "eval_loss": 0.8759164214134216, "eval_runtime": 1.1739, "eval_samples_per_second": 97.116, "eval_steps_per_second": 5.111, "step": 900 }, { "epoch": 2.8, "grad_norm": 1.4502002000808716, "learning_rate": 1.5004429729907619e-05, "loss": 0.1998, "step": 910 }, { "epoch": 2.8, "eval_loss": 0.8729867935180664, "eval_runtime": 1.1659, "eval_samples_per_second": 97.775, "eval_steps_per_second": 5.146, "step": 910 }, { "epoch": 2.830769230769231, "grad_norm": 1.3150732517242432, "learning_rate": 1.4870973329339112e-05, "loss": 0.1936, "step": 920 }, { "epoch": 2.830769230769231, "eval_loss": 0.8798808455467224, "eval_runtime": 1.1664, "eval_samples_per_second": 97.737, "eval_steps_per_second": 5.144, "step": 920 }, { "epoch": 2.8615384615384616, "grad_norm": 1.8317633867263794, "learning_rate": 1.4736369643845346e-05, "loss": 0.1951, "step": 930 }, { "epoch": 2.8615384615384616, "eval_loss": 0.900560736656189, "eval_runtime": 1.1713, "eval_samples_per_second": 97.324, "eval_steps_per_second": 5.122, "step": 930 }, { "epoch": 2.8923076923076922, "grad_norm": 2.284203290939331, "learning_rate": 1.4600650377311523e-05, "loss": 0.1884, "step": 940 }, { "epoch": 2.8923076923076922, "eval_loss": 0.8905934691429138, "eval_runtime": 1.1697, "eval_samples_per_second": 97.462, "eval_steps_per_second": 5.13, "step": 940 }, { "epoch": 2.9230769230769234, "grad_norm": 1.9384973049163818, "learning_rate": 1.446384749638128e-05, "loss": 0.1881, "step": 950 }, { "epoch": 2.9230769230769234, "eval_loss": 0.8698312640190125, "eval_runtime": 1.1727, "eval_samples_per_second": 97.213, "eval_steps_per_second": 5.116, "step": 950 }, { "epoch": 2.953846153846154, "grad_norm": 1.7943904399871826, "learning_rate": 1.4325993222927414e-05, "loss": 0.2166, "step": 960 }, { "epoch": 2.953846153846154, "eval_loss": 0.8718281984329224, "eval_runtime": 1.1666, "eval_samples_per_second": 97.724, "eval_steps_per_second": 5.143, "step": 960 }, { "epoch": 2.9846153846153847, "grad_norm": 2.0599663257598877, "learning_rate": 1.4187120026462508e-05, "loss": 0.2082, "step": 970 }, { "epoch": 2.9846153846153847, "eval_loss": 0.8722580075263977, "eval_runtime": 1.1673, "eval_samples_per_second": 97.658, "eval_steps_per_second": 5.14, "step": 970 }, { "epoch": 3.0153846153846153, "grad_norm": 1.2260268926620483, "learning_rate": 1.4047260616491225e-05, "loss": 0.1631, "step": 980 }, { "epoch": 3.0153846153846153, "eval_loss": 0.8925275206565857, "eval_runtime": 1.1687, "eval_samples_per_second": 97.541, "eval_steps_per_second": 5.134, "step": 980 }, { "epoch": 3.046153846153846, "grad_norm": 1.544405460357666, "learning_rate": 1.3906447934806074e-05, "loss": 0.1024, "step": 990 }, { "epoch": 3.046153846153846, "eval_loss": 0.9651603102684021, "eval_runtime": 1.1687, "eval_samples_per_second": 97.54, "eval_steps_per_second": 5.134, "step": 990 }, { "epoch": 3.076923076923077, "grad_norm": 1.8806118965148926, "learning_rate": 1.3764715147728451e-05, "loss": 0.1111, "step": 1000 }, { "epoch": 3.076923076923077, "eval_loss": 0.9773316383361816, "eval_runtime": 1.1719, "eval_samples_per_second": 97.274, "eval_steps_per_second": 5.12, "step": 1000 }, { "epoch": 3.1076923076923078, "grad_norm": 1.2032195329666138, "learning_rate": 1.3622095638296827e-05, "loss": 0.1011, "step": 1010 }, { "epoch": 3.1076923076923078, "eval_loss": 0.9423761367797852, "eval_runtime": 1.1686, "eval_samples_per_second": 97.556, "eval_steps_per_second": 5.135, "step": 1010 }, { "epoch": 3.1384615384615384, "grad_norm": 1.2994767427444458, "learning_rate": 1.3478622998403861e-05, "loss": 0.1078, "step": 1020 }, { "epoch": 3.1384615384615384, "eval_loss": 0.9416558146476746, "eval_runtime": 1.1698, "eval_samples_per_second": 97.455, "eval_steps_per_second": 5.129, "step": 1020 }, { "epoch": 3.169230769230769, "grad_norm": 1.061914324760437, "learning_rate": 1.3334331020884328e-05, "loss": 0.0991, "step": 1030 }, { "epoch": 3.169230769230769, "eval_loss": 0.9638768434524536, "eval_runtime": 1.1664, "eval_samples_per_second": 97.741, "eval_steps_per_second": 5.144, "step": 1030 }, { "epoch": 3.2, "grad_norm": 1.4495635032653809, "learning_rate": 1.318925369155574e-05, "loss": 0.0979, "step": 1040 }, { "epoch": 3.2, "eval_loss": 0.9724640846252441, "eval_runtime": 1.1718, "eval_samples_per_second": 97.282, "eval_steps_per_second": 5.12, "step": 1040 }, { "epoch": 3.230769230769231, "grad_norm": 1.5616148710250854, "learning_rate": 1.3043425181213471e-05, "loss": 0.1109, "step": 1050 }, { "epoch": 3.230769230769231, "eval_loss": 0.972594141960144, "eval_runtime": 1.169, "eval_samples_per_second": 97.515, "eval_steps_per_second": 5.132, "step": 1050 }, { "epoch": 3.2615384615384615, "grad_norm": 2.1167333126068115, "learning_rate": 1.2896879837582356e-05, "loss": 0.1047, "step": 1060 }, { "epoch": 3.2615384615384615, "eval_loss": 0.9514709711074829, "eval_runtime": 1.1679, "eval_samples_per_second": 97.612, "eval_steps_per_second": 5.137, "step": 1060 }, { "epoch": 3.292307692307692, "grad_norm": 1.5117723941802979, "learning_rate": 1.2749652177226592e-05, "loss": 0.1075, "step": 1070 }, { "epoch": 3.292307692307692, "eval_loss": 0.9520531296730042, "eval_runtime": 1.1746, "eval_samples_per_second": 97.058, "eval_steps_per_second": 5.108, "step": 1070 }, { "epoch": 3.3230769230769233, "grad_norm": 1.231341004371643, "learning_rate": 1.2601776877419876e-05, "loss": 0.1021, "step": 1080 }, { "epoch": 3.3230769230769233, "eval_loss": 0.9573836922645569, "eval_runtime": 1.1666, "eval_samples_per_second": 97.723, "eval_steps_per_second": 5.143, "step": 1080 }, { "epoch": 3.353846153846154, "grad_norm": 1.5957950353622437, "learning_rate": 1.2453288767977686e-05, "loss": 0.1069, "step": 1090 }, { "epoch": 3.353846153846154, "eval_loss": 0.9602900147438049, "eval_runtime": 1.169, "eval_samples_per_second": 97.519, "eval_steps_per_second": 5.133, "step": 1090 }, { "epoch": 3.3846153846153846, "grad_norm": 0.7704668641090393, "learning_rate": 1.2304222823053653e-05, "loss": 0.0969, "step": 1100 }, { "epoch": 3.3846153846153846, "eval_loss": 0.9564027190208435, "eval_runtime": 1.1725, "eval_samples_per_second": 97.232, "eval_steps_per_second": 5.117, "step": 1100 } ], "logging_steps": 10, "max_steps": 2275, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 50, "total_flos": 6.802748878513766e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }