“Sara
adding model files
8153d6d
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.769230769230769,
"eval_steps": 10,
"global_step": 1550,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006153846153846154,
"eval_loss": 1.489196538925171,
"eval_runtime": 1.2747,
"eval_samples_per_second": 89.432,
"eval_steps_per_second": 4.707,
"step": 2
},
{
"epoch": 0.03076923076923077,
"grad_norm": 17.32219123840332,
"learning_rate": 8.771929824561404e-07,
"loss": 1.5722,
"step": 10
},
{
"epoch": 0.03076923076923077,
"eval_loss": 1.3865292072296143,
"eval_runtime": 1.1749,
"eval_samples_per_second": 97.029,
"eval_steps_per_second": 5.107,
"step": 10
},
{
"epoch": 0.06153846153846154,
"grad_norm": 21.88077735900879,
"learning_rate": 1.7543859649122807e-06,
"loss": 1.3935,
"step": 20
},
{
"epoch": 0.06153846153846154,
"eval_loss": 1.0996264219284058,
"eval_runtime": 1.1706,
"eval_samples_per_second": 97.386,
"eval_steps_per_second": 5.126,
"step": 20
},
{
"epoch": 0.09230769230769231,
"grad_norm": 17.395177841186523,
"learning_rate": 2.631578947368421e-06,
"loss": 1.0664,
"step": 30
},
{
"epoch": 0.09230769230769231,
"eval_loss": 0.8006665706634521,
"eval_runtime": 1.1747,
"eval_samples_per_second": 97.044,
"eval_steps_per_second": 5.108,
"step": 30
},
{
"epoch": 0.12307692307692308,
"grad_norm": 6.869284152984619,
"learning_rate": 3.5087719298245615e-06,
"loss": 0.7994,
"step": 40
},
{
"epoch": 0.12307692307692308,
"eval_loss": 0.708875298500061,
"eval_runtime": 1.1742,
"eval_samples_per_second": 97.087,
"eval_steps_per_second": 5.11,
"step": 40
},
{
"epoch": 0.15384615384615385,
"grad_norm": 6.4737162590026855,
"learning_rate": 4.385964912280702e-06,
"loss": 0.7751,
"step": 50
},
{
"epoch": 0.15384615384615385,
"eval_loss": 0.6778659224510193,
"eval_runtime": 1.1722,
"eval_samples_per_second": 97.257,
"eval_steps_per_second": 5.119,
"step": 50
},
{
"epoch": 0.18461538461538463,
"grad_norm": 5.418182373046875,
"learning_rate": 5.263157894736842e-06,
"loss": 0.6203,
"step": 60
},
{
"epoch": 0.18461538461538463,
"eval_loss": 0.6695793271064758,
"eval_runtime": 1.1744,
"eval_samples_per_second": 97.071,
"eval_steps_per_second": 5.109,
"step": 60
},
{
"epoch": 0.2153846153846154,
"grad_norm": 2.7423934936523438,
"learning_rate": 6.140350877192983e-06,
"loss": 0.767,
"step": 70
},
{
"epoch": 0.2153846153846154,
"eval_loss": 0.6650205850601196,
"eval_runtime": 1.1671,
"eval_samples_per_second": 97.675,
"eval_steps_per_second": 5.141,
"step": 70
},
{
"epoch": 0.24615384615384617,
"grad_norm": 2.572129726409912,
"learning_rate": 7.017543859649123e-06,
"loss": 0.6336,
"step": 80
},
{
"epoch": 0.24615384615384617,
"eval_loss": 0.6613836884498596,
"eval_runtime": 1.1671,
"eval_samples_per_second": 97.676,
"eval_steps_per_second": 5.141,
"step": 80
},
{
"epoch": 0.27692307692307694,
"grad_norm": 4.622325420379639,
"learning_rate": 7.894736842105265e-06,
"loss": 0.631,
"step": 90
},
{
"epoch": 0.27692307692307694,
"eval_loss": 0.6617783904075623,
"eval_runtime": 1.1744,
"eval_samples_per_second": 97.071,
"eval_steps_per_second": 5.109,
"step": 90
},
{
"epoch": 0.3076923076923077,
"grad_norm": 3.24354887008667,
"learning_rate": 8.771929824561405e-06,
"loss": 0.6086,
"step": 100
},
{
"epoch": 0.3076923076923077,
"eval_loss": 0.662232518196106,
"eval_runtime": 1.172,
"eval_samples_per_second": 97.27,
"eval_steps_per_second": 5.119,
"step": 100
},
{
"epoch": 0.3384615384615385,
"grad_norm": 3.5868513584136963,
"learning_rate": 9.649122807017545e-06,
"loss": 0.7057,
"step": 110
},
{
"epoch": 0.3384615384615385,
"eval_loss": 0.6619779467582703,
"eval_runtime": 1.165,
"eval_samples_per_second": 97.856,
"eval_steps_per_second": 5.15,
"step": 110
},
{
"epoch": 0.36923076923076925,
"grad_norm": 3.4894511699676514,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.7385,
"step": 120
},
{
"epoch": 0.36923076923076925,
"eval_loss": 0.6673641800880432,
"eval_runtime": 1.1682,
"eval_samples_per_second": 97.588,
"eval_steps_per_second": 5.136,
"step": 120
},
{
"epoch": 0.4,
"grad_norm": 5.117406845092773,
"learning_rate": 1.1403508771929826e-05,
"loss": 0.6533,
"step": 130
},
{
"epoch": 0.4,
"eval_loss": 0.6705042123794556,
"eval_runtime": 1.1726,
"eval_samples_per_second": 97.217,
"eval_steps_per_second": 5.117,
"step": 130
},
{
"epoch": 0.4307692307692308,
"grad_norm": 3.9341437816619873,
"learning_rate": 1.2280701754385966e-05,
"loss": 0.7066,
"step": 140
},
{
"epoch": 0.4307692307692308,
"eval_loss": 0.6691470146179199,
"eval_runtime": 1.1702,
"eval_samples_per_second": 97.418,
"eval_steps_per_second": 5.127,
"step": 140
},
{
"epoch": 0.46153846153846156,
"grad_norm": 3.4131579399108887,
"learning_rate": 1.3157894736842108e-05,
"loss": 0.6065,
"step": 150
},
{
"epoch": 0.46153846153846156,
"eval_loss": 0.6724759340286255,
"eval_runtime": 1.1746,
"eval_samples_per_second": 97.05,
"eval_steps_per_second": 5.108,
"step": 150
},
{
"epoch": 0.49230769230769234,
"grad_norm": 3.820042848587036,
"learning_rate": 1.4035087719298246e-05,
"loss": 0.6474,
"step": 160
},
{
"epoch": 0.49230769230769234,
"eval_loss": 0.6704264879226685,
"eval_runtime": 1.1772,
"eval_samples_per_second": 96.838,
"eval_steps_per_second": 5.097,
"step": 160
},
{
"epoch": 0.5230769230769231,
"grad_norm": 2.8097360134124756,
"learning_rate": 1.4912280701754388e-05,
"loss": 0.7022,
"step": 170
},
{
"epoch": 0.5230769230769231,
"eval_loss": 0.6691609025001526,
"eval_runtime": 1.1676,
"eval_samples_per_second": 97.638,
"eval_steps_per_second": 5.139,
"step": 170
},
{
"epoch": 0.5538461538461539,
"grad_norm": 2.3641347885131836,
"learning_rate": 1.578947368421053e-05,
"loss": 0.6667,
"step": 180
},
{
"epoch": 0.5538461538461539,
"eval_loss": 0.6753159165382385,
"eval_runtime": 1.1689,
"eval_samples_per_second": 97.524,
"eval_steps_per_second": 5.133,
"step": 180
},
{
"epoch": 0.5846153846153846,
"grad_norm": 6.359607219696045,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.6889,
"step": 190
},
{
"epoch": 0.5846153846153846,
"eval_loss": 0.6796454787254333,
"eval_runtime": 1.1714,
"eval_samples_per_second": 97.323,
"eval_steps_per_second": 5.122,
"step": 190
},
{
"epoch": 0.6153846153846154,
"grad_norm": 4.7979960441589355,
"learning_rate": 1.754385964912281e-05,
"loss": 0.6937,
"step": 200
},
{
"epoch": 0.6153846153846154,
"eval_loss": 0.6748442649841309,
"eval_runtime": 1.1773,
"eval_samples_per_second": 96.831,
"eval_steps_per_second": 5.096,
"step": 200
},
{
"epoch": 0.6461538461538462,
"grad_norm": 3.5100464820861816,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.6613,
"step": 210
},
{
"epoch": 0.6461538461538462,
"eval_loss": 0.6842787861824036,
"eval_runtime": 1.1662,
"eval_samples_per_second": 97.752,
"eval_steps_per_second": 5.145,
"step": 210
},
{
"epoch": 0.676923076923077,
"grad_norm": 3.3538153171539307,
"learning_rate": 1.929824561403509e-05,
"loss": 0.6512,
"step": 220
},
{
"epoch": 0.676923076923077,
"eval_loss": 0.6873583197593689,
"eval_runtime": 1.1667,
"eval_samples_per_second": 97.712,
"eval_steps_per_second": 5.143,
"step": 220
},
{
"epoch": 0.7076923076923077,
"grad_norm": 3.7595653533935547,
"learning_rate": 1.9999952892103225e-05,
"loss": 0.7656,
"step": 230
},
{
"epoch": 0.7076923076923077,
"eval_loss": 0.6970181465148926,
"eval_runtime": 1.1663,
"eval_samples_per_second": 97.741,
"eval_steps_per_second": 5.144,
"step": 230
},
{
"epoch": 0.7384615384615385,
"grad_norm": 2.9787654876708984,
"learning_rate": 1.999830416231782e-05,
"loss": 0.6412,
"step": 240
},
{
"epoch": 0.7384615384615385,
"eval_loss": 0.7058220505714417,
"eval_runtime": 1.1688,
"eval_samples_per_second": 97.537,
"eval_steps_per_second": 5.134,
"step": 240
},
{
"epoch": 0.7692307692307693,
"grad_norm": 2.3455305099487305,
"learning_rate": 1.9994300481505595e-05,
"loss": 0.6148,
"step": 250
},
{
"epoch": 0.7692307692307693,
"eval_loss": 0.7070643305778503,
"eval_runtime": 1.1728,
"eval_samples_per_second": 97.202,
"eval_steps_per_second": 5.116,
"step": 250
},
{
"epoch": 0.8,
"grad_norm": 4.057767868041992,
"learning_rate": 1.998794279267369e-05,
"loss": 0.6433,
"step": 260
},
{
"epoch": 0.8,
"eval_loss": 0.7127295136451721,
"eval_runtime": 1.1689,
"eval_samples_per_second": 97.529,
"eval_steps_per_second": 5.133,
"step": 260
},
{
"epoch": 0.8307692307692308,
"grad_norm": 2.5686323642730713,
"learning_rate": 1.9979232593280637e-05,
"loss": 0.5972,
"step": 270
},
{
"epoch": 0.8307692307692308,
"eval_loss": 0.7106388211250305,
"eval_runtime": 1.166,
"eval_samples_per_second": 97.77,
"eval_steps_per_second": 5.146,
"step": 270
},
{
"epoch": 0.8615384615384616,
"grad_norm": 1.2748245000839233,
"learning_rate": 1.9968171934883647e-05,
"loss": 0.6795,
"step": 280
},
{
"epoch": 0.8615384615384616,
"eval_loss": 0.7127251029014587,
"eval_runtime": 1.1701,
"eval_samples_per_second": 97.429,
"eval_steps_per_second": 5.128,
"step": 280
},
{
"epoch": 0.8923076923076924,
"grad_norm": 3.1174163818359375,
"learning_rate": 1.9954763422655396e-05,
"loss": 0.6589,
"step": 290
},
{
"epoch": 0.8923076923076924,
"eval_loss": 0.7157883048057556,
"eval_runtime": 1.1727,
"eval_samples_per_second": 97.211,
"eval_steps_per_second": 5.116,
"step": 290
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.3197476863861084,
"learning_rate": 1.9939010214770426e-05,
"loss": 0.5539,
"step": 300
},
{
"epoch": 0.9230769230769231,
"eval_loss": 0.7173091173171997,
"eval_runtime": 1.1712,
"eval_samples_per_second": 97.332,
"eval_steps_per_second": 5.123,
"step": 300
},
{
"epoch": 0.9538461538461539,
"grad_norm": 2.7073607444763184,
"learning_rate": 1.9920916021661277e-05,
"loss": 0.7515,
"step": 310
},
{
"epoch": 0.9538461538461539,
"eval_loss": 0.720274806022644,
"eval_runtime": 1.1673,
"eval_samples_per_second": 97.665,
"eval_steps_per_second": 5.14,
"step": 310
},
{
"epoch": 0.9846153846153847,
"grad_norm": 2.78871488571167,
"learning_rate": 1.9900485105144544e-05,
"loss": 0.6282,
"step": 320
},
{
"epoch": 0.9846153846153847,
"eval_loss": 0.7291054725646973,
"eval_runtime": 1.1713,
"eval_samples_per_second": 97.331,
"eval_steps_per_second": 5.123,
"step": 320
},
{
"epoch": 1.0153846153846153,
"grad_norm": 2.181795597076416,
"learning_rate": 1.9877722277417085e-05,
"loss": 0.4711,
"step": 330
},
{
"epoch": 1.0153846153846153,
"eval_loss": 0.7346097826957703,
"eval_runtime": 1.1674,
"eval_samples_per_second": 97.652,
"eval_steps_per_second": 5.14,
"step": 330
},
{
"epoch": 1.0461538461538462,
"grad_norm": 6.42896842956543,
"learning_rate": 1.985263289992256e-05,
"loss": 0.4595,
"step": 340
},
{
"epoch": 1.0461538461538462,
"eval_loss": 0.7707703709602356,
"eval_runtime": 1.1711,
"eval_samples_per_second": 97.348,
"eval_steps_per_second": 5.124,
"step": 340
},
{
"epoch": 1.0769230769230769,
"grad_norm": 6.206396102905273,
"learning_rate": 1.9825222882088647e-05,
"loss": 0.3704,
"step": 350
},
{
"epoch": 1.0769230769230769,
"eval_loss": 0.7343254685401917,
"eval_runtime": 1.1717,
"eval_samples_per_second": 97.298,
"eval_steps_per_second": 5.121,
"step": 350
},
{
"epoch": 1.1076923076923078,
"grad_norm": 2.2610416412353516,
"learning_rate": 1.9795498679935144e-05,
"loss": 0.3845,
"step": 360
},
{
"epoch": 1.1076923076923078,
"eval_loss": 0.7747918963432312,
"eval_runtime": 1.1652,
"eval_samples_per_second": 97.838,
"eval_steps_per_second": 5.149,
"step": 360
},
{
"epoch": 1.1384615384615384,
"grad_norm": 2.490269184112549,
"learning_rate": 1.9763467294553364e-05,
"loss": 0.3461,
"step": 370
},
{
"epoch": 1.1384615384615384,
"eval_loss": 0.7602437138557434,
"eval_runtime": 1.1685,
"eval_samples_per_second": 97.563,
"eval_steps_per_second": 5.135,
"step": 370
},
{
"epoch": 1.1692307692307693,
"grad_norm": 2.435138702392578,
"learning_rate": 1.9729136270457118e-05,
"loss": 0.3871,
"step": 380
},
{
"epoch": 1.1692307692307693,
"eval_loss": 0.7585355043411255,
"eval_runtime": 1.1702,
"eval_samples_per_second": 97.419,
"eval_steps_per_second": 5.127,
"step": 380
},
{
"epoch": 1.2,
"grad_norm": 1.649342656135559,
"learning_rate": 1.9692513693805738e-05,
"loss": 0.345,
"step": 390
},
{
"epoch": 1.2,
"eval_loss": 0.7817304730415344,
"eval_runtime": 1.169,
"eval_samples_per_second": 97.52,
"eval_steps_per_second": 5.133,
"step": 390
},
{
"epoch": 1.2307692307692308,
"grad_norm": 2.2906811237335205,
"learning_rate": 1.965360819049948e-05,
"loss": 0.4372,
"step": 400
},
{
"epoch": 1.2307692307692308,
"eval_loss": 0.7519503235816956,
"eval_runtime": 1.1693,
"eval_samples_per_second": 97.495,
"eval_steps_per_second": 5.131,
"step": 400
},
{
"epoch": 1.2615384615384615,
"grad_norm": 3.925938606262207,
"learning_rate": 1.9612428924147842e-05,
"loss": 0.4879,
"step": 410
},
{
"epoch": 1.2615384615384615,
"eval_loss": 0.7463933229446411,
"eval_runtime": 1.1712,
"eval_samples_per_second": 97.334,
"eval_steps_per_second": 5.123,
"step": 410
},
{
"epoch": 1.2923076923076924,
"grad_norm": 1.4445178508758545,
"learning_rate": 1.9568985593911206e-05,
"loss": 0.4265,
"step": 420
},
{
"epoch": 1.2923076923076924,
"eval_loss": 0.7673818469047546,
"eval_runtime": 1.1696,
"eval_samples_per_second": 97.471,
"eval_steps_per_second": 5.13,
"step": 420
},
{
"epoch": 1.323076923076923,
"grad_norm": 2.707318067550659,
"learning_rate": 1.9523288432216333e-05,
"loss": 0.4255,
"step": 430
},
{
"epoch": 1.323076923076923,
"eval_loss": 0.776730477809906,
"eval_runtime": 1.1686,
"eval_samples_per_second": 97.552,
"eval_steps_per_second": 5.134,
"step": 430
},
{
"epoch": 1.353846153846154,
"grad_norm": 2.1543335914611816,
"learning_rate": 1.9475348202346292e-05,
"loss": 0.3992,
"step": 440
},
{
"epoch": 1.353846153846154,
"eval_loss": 0.7587007284164429,
"eval_runtime": 1.1741,
"eval_samples_per_second": 97.094,
"eval_steps_per_second": 5.11,
"step": 440
},
{
"epoch": 1.3846153846153846,
"grad_norm": 2.1207425594329834,
"learning_rate": 1.942517619590531e-05,
"loss": 0.4197,
"step": 450
},
{
"epoch": 1.3846153846153846,
"eval_loss": 0.7629592418670654,
"eval_runtime": 1.1714,
"eval_samples_per_second": 97.323,
"eval_steps_per_second": 5.122,
"step": 450
},
{
"epoch": 1.4153846153846155,
"grad_norm": 3.0015456676483154,
"learning_rate": 1.9372784230159213e-05,
"loss": 0.3963,
"step": 460
},
{
"epoch": 1.4153846153846155,
"eval_loss": 0.7871745824813843,
"eval_runtime": 1.1671,
"eval_samples_per_second": 97.676,
"eval_steps_per_second": 5.141,
"step": 460
},
{
"epoch": 1.4461538461538461,
"grad_norm": 2.6770448684692383,
"learning_rate": 1.9318184645252037e-05,
"loss": 0.3689,
"step": 470
},
{
"epoch": 1.4461538461538461,
"eval_loss": 0.7728400826454163,
"eval_runtime": 1.1721,
"eval_samples_per_second": 97.262,
"eval_steps_per_second": 5.119,
"step": 470
},
{
"epoch": 1.476923076923077,
"grad_norm": 2.6416432857513428,
"learning_rate": 1.926139030129951e-05,
"loss": 0.4004,
"step": 480
},
{
"epoch": 1.476923076923077,
"eval_loss": 0.7785875797271729,
"eval_runtime": 1.1696,
"eval_samples_per_second": 97.466,
"eval_steps_per_second": 5.13,
"step": 480
},
{
"epoch": 1.5076923076923077,
"grad_norm": 1.8383666276931763,
"learning_rate": 1.9202414575360024e-05,
"loss": 0.4265,
"step": 490
},
{
"epoch": 1.5076923076923077,
"eval_loss": 0.7735582590103149,
"eval_runtime": 1.168,
"eval_samples_per_second": 97.604,
"eval_steps_per_second": 5.137,
"step": 490
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.905369758605957,
"learning_rate": 1.9141271358283874e-05,
"loss": 0.3675,
"step": 500
},
{
"epoch": 1.5384615384615383,
"eval_loss": 0.7821589708328247,
"eval_runtime": 1.168,
"eval_samples_per_second": 97.603,
"eval_steps_per_second": 5.137,
"step": 500
},
{
"epoch": 1.5692307692307692,
"grad_norm": 2.0887465476989746,
"learning_rate": 1.9077975051441487e-05,
"loss": 0.3769,
"step": 510
},
{
"epoch": 1.5692307692307692,
"eval_loss": 0.8046853542327881,
"eval_runtime": 1.168,
"eval_samples_per_second": 97.604,
"eval_steps_per_second": 5.137,
"step": 510
},
{
"epoch": 1.6,
"grad_norm": 1.3952693939208984,
"learning_rate": 1.9012540563331375e-05,
"loss": 0.3842,
"step": 520
},
{
"epoch": 1.6,
"eval_loss": 0.7695035338401794,
"eval_runtime": 1.1677,
"eval_samples_per_second": 97.627,
"eval_steps_per_second": 5.138,
"step": 520
},
{
"epoch": 1.6307692307692307,
"grad_norm": 2.4253690242767334,
"learning_rate": 1.8944983306068683e-05,
"loss": 0.372,
"step": 530
},
{
"epoch": 1.6307692307692307,
"eval_loss": 0.7812924385070801,
"eval_runtime": 1.1688,
"eval_samples_per_second": 97.535,
"eval_steps_per_second": 5.133,
"step": 530
},
{
"epoch": 1.6615384615384614,
"grad_norm": 1.9009945392608643,
"learning_rate": 1.8875319191755083e-05,
"loss": 0.3825,
"step": 540
},
{
"epoch": 1.6615384615384614,
"eval_loss": 0.7782070636749268,
"eval_runtime": 1.1713,
"eval_samples_per_second": 97.328,
"eval_steps_per_second": 5.123,
"step": 540
},
{
"epoch": 1.6923076923076923,
"grad_norm": 3.183929443359375,
"learning_rate": 1.8803564628730916e-05,
"loss": 0.396,
"step": 550
},
{
"epoch": 1.6923076923076923,
"eval_loss": 0.7839831709861755,
"eval_runtime": 1.1706,
"eval_samples_per_second": 97.388,
"eval_steps_per_second": 5.126,
"step": 550
},
{
"epoch": 1.7230769230769232,
"grad_norm": 2.031071901321411,
"learning_rate": 1.8729736517710454e-05,
"loss": 0.3862,
"step": 560
},
{
"epoch": 1.7230769230769232,
"eval_loss": 0.7782466411590576,
"eval_runtime": 1.1669,
"eval_samples_per_second": 97.695,
"eval_steps_per_second": 5.142,
"step": 560
},
{
"epoch": 1.7538461538461538,
"grad_norm": 1.2456035614013672,
"learning_rate": 1.865385224780119e-05,
"loss": 0.3909,
"step": 570
},
{
"epoch": 1.7538461538461538,
"eval_loss": 0.7737441062927246,
"eval_runtime": 1.1644,
"eval_samples_per_second": 97.908,
"eval_steps_per_second": 5.153,
"step": 570
},
{
"epoch": 1.7846153846153845,
"grad_norm": 1.3416839838027954,
"learning_rate": 1.8575929692408105e-05,
"loss": 0.3585,
"step": 580
},
{
"epoch": 1.7846153846153845,
"eval_loss": 0.7757642269134521,
"eval_runtime": 1.1707,
"eval_samples_per_second": 97.376,
"eval_steps_per_second": 5.125,
"step": 580
},
{
"epoch": 1.8153846153846154,
"grad_norm": 2.5810840129852295,
"learning_rate": 1.8495987205023832e-05,
"loss": 0.4018,
"step": 590
},
{
"epoch": 1.8153846153846154,
"eval_loss": 0.7671994566917419,
"eval_runtime": 1.1677,
"eval_samples_per_second": 97.627,
"eval_steps_per_second": 5.138,
"step": 590
},
{
"epoch": 1.8461538461538463,
"grad_norm": 2.186009407043457,
"learning_rate": 1.8414043614905782e-05,
"loss": 0.3936,
"step": 600
},
{
"epoch": 1.8461538461538463,
"eval_loss": 0.7763370871543884,
"eval_runtime": 1.1743,
"eval_samples_per_second": 97.078,
"eval_steps_per_second": 5.109,
"step": 600
},
{
"epoch": 1.876923076923077,
"grad_norm": 2.5505406856536865,
"learning_rate": 1.8330118222641192e-05,
"loss": 0.4963,
"step": 610
},
{
"epoch": 1.876923076923077,
"eval_loss": 0.7642151713371277,
"eval_runtime": 1.1648,
"eval_samples_per_second": 97.867,
"eval_steps_per_second": 5.151,
"step": 610
},
{
"epoch": 1.9076923076923076,
"grad_norm": 2.2109644412994385,
"learning_rate": 1.824423079560116e-05,
"loss": 0.4821,
"step": 620
},
{
"epoch": 1.9076923076923076,
"eval_loss": 0.7671634554862976,
"eval_runtime": 1.1645,
"eval_samples_per_second": 97.898,
"eval_steps_per_second": 5.153,
"step": 620
},
{
"epoch": 1.9384615384615385,
"grad_norm": 2.702033519744873,
"learning_rate": 1.8156401563284724e-05,
"loss": 0.4216,
"step": 630
},
{
"epoch": 1.9384615384615385,
"eval_loss": 0.7776830792427063,
"eval_runtime": 1.1704,
"eval_samples_per_second": 97.399,
"eval_steps_per_second": 5.126,
"step": 630
},
{
"epoch": 1.9692307692307693,
"grad_norm": 2.2329928874969482,
"learning_rate": 1.8066651212554126e-05,
"loss": 0.4422,
"step": 640
},
{
"epoch": 1.9692307692307693,
"eval_loss": 0.7678297162055969,
"eval_runtime": 1.171,
"eval_samples_per_second": 97.351,
"eval_steps_per_second": 5.124,
"step": 640
},
{
"epoch": 2.0,
"grad_norm": 1.3216522932052612,
"learning_rate": 1.797500088276232e-05,
"loss": 0.3888,
"step": 650
},
{
"epoch": 2.0,
"eval_loss": 0.7601237297058105,
"eval_runtime": 1.1955,
"eval_samples_per_second": 95.354,
"eval_steps_per_second": 5.019,
"step": 650
},
{
"epoch": 2.0307692307692307,
"grad_norm": 1.9767566919326782,
"learning_rate": 1.7881472160773912e-05,
"loss": 0.2158,
"step": 660
},
{
"epoch": 2.0307692307692307,
"eval_loss": 0.8567830920219421,
"eval_runtime": 1.1711,
"eval_samples_per_second": 97.344,
"eval_steps_per_second": 5.123,
"step": 660
},
{
"epoch": 2.0615384615384613,
"grad_norm": 1.724826693534851,
"learning_rate": 1.7786087075880698e-05,
"loss": 0.1589,
"step": 670
},
{
"epoch": 2.0615384615384613,
"eval_loss": 0.8708633780479431,
"eval_runtime": 1.1707,
"eval_samples_per_second": 97.379,
"eval_steps_per_second": 5.125,
"step": 670
},
{
"epoch": 2.0923076923076924,
"grad_norm": 1.9074368476867676,
"learning_rate": 1.7688868094613e-05,
"loss": 0.1974,
"step": 680
},
{
"epoch": 2.0923076923076924,
"eval_loss": 0.8557892441749573,
"eval_runtime": 1.1707,
"eval_samples_per_second": 97.382,
"eval_steps_per_second": 5.125,
"step": 680
},
{
"epoch": 2.123076923076923,
"grad_norm": 1.2026904821395874,
"learning_rate": 1.7589838115448005e-05,
"loss": 0.1832,
"step": 690
},
{
"epoch": 2.123076923076923,
"eval_loss": 0.861406147480011,
"eval_runtime": 1.1692,
"eval_samples_per_second": 97.502,
"eval_steps_per_second": 5.132,
"step": 690
},
{
"epoch": 2.1538461538461537,
"grad_norm": 1.541394829750061,
"learning_rate": 1.748902046341637e-05,
"loss": 0.1835,
"step": 700
},
{
"epoch": 2.1538461538461537,
"eval_loss": 0.8678261041641235,
"eval_runtime": 1.1824,
"eval_samples_per_second": 96.41,
"eval_steps_per_second": 5.074,
"step": 700
},
{
"epoch": 2.184615384615385,
"grad_norm": 2.980255126953125,
"learning_rate": 1.7386438884608366e-05,
"loss": 0.1555,
"step": 710
},
{
"epoch": 2.184615384615385,
"eval_loss": 0.8728282451629639,
"eval_runtime": 1.1698,
"eval_samples_per_second": 97.454,
"eval_steps_per_second": 5.129,
"step": 710
},
{
"epoch": 2.2153846153846155,
"grad_norm": 1.7097492218017578,
"learning_rate": 1.7282117540580833e-05,
"loss": 0.1789,
"step": 720
},
{
"epoch": 2.2153846153846155,
"eval_loss": 0.8826200366020203,
"eval_runtime": 1.1687,
"eval_samples_per_second": 97.541,
"eval_steps_per_second": 5.134,
"step": 720
},
{
"epoch": 2.246153846153846,
"grad_norm": 2.0315046310424805,
"learning_rate": 1.7176081002666295e-05,
"loss": 0.1825,
"step": 730
},
{
"epoch": 2.246153846153846,
"eval_loss": 0.8737024068832397,
"eval_runtime": 1.1727,
"eval_samples_per_second": 97.211,
"eval_steps_per_second": 5.116,
"step": 730
},
{
"epoch": 2.276923076923077,
"grad_norm": 1.5558000802993774,
"learning_rate": 1.706835424618555e-05,
"loss": 0.1906,
"step": 740
},
{
"epoch": 2.276923076923077,
"eval_loss": 0.8740295767784119,
"eval_runtime": 1.1746,
"eval_samples_per_second": 97.054,
"eval_steps_per_second": 5.108,
"step": 740
},
{
"epoch": 2.3076923076923075,
"grad_norm": 2.007802724838257,
"learning_rate": 1.695896264456509e-05,
"loss": 0.2171,
"step": 750
},
{
"epoch": 2.3076923076923075,
"eval_loss": 0.8841921091079712,
"eval_runtime": 1.1694,
"eval_samples_per_second": 97.487,
"eval_steps_per_second": 5.131,
"step": 750
},
{
"epoch": 2.3384615384615386,
"grad_norm": 2.0817086696624756,
"learning_rate": 1.6847931963360796e-05,
"loss": 0.1993,
"step": 760
},
{
"epoch": 2.3384615384615386,
"eval_loss": 0.8875246644020081,
"eval_runtime": 1.1683,
"eval_samples_per_second": 97.576,
"eval_steps_per_second": 5.136,
"step": 760
},
{
"epoch": 2.3692307692307693,
"grad_norm": 1.752740740776062,
"learning_rate": 1.6735288354189225e-05,
"loss": 0.1928,
"step": 770
},
{
"epoch": 2.3692307692307693,
"eval_loss": 0.8896489143371582,
"eval_runtime": 1.1654,
"eval_samples_per_second": 97.821,
"eval_steps_per_second": 5.148,
"step": 770
},
{
"epoch": 2.4,
"grad_norm": 1.6042486429214478,
"learning_rate": 1.6621058348568008e-05,
"loss": 0.2059,
"step": 780
},
{
"epoch": 2.4,
"eval_loss": 0.8732131719589233,
"eval_runtime": 1.1715,
"eval_samples_per_second": 97.315,
"eval_steps_per_second": 5.122,
"step": 780
},
{
"epoch": 2.430769230769231,
"grad_norm": 2.1651504039764404,
"learning_rate": 1.6505268851666717e-05,
"loss": 0.2101,
"step": 790
},
{
"epoch": 2.430769230769231,
"eval_loss": 0.8754842877388,
"eval_runtime": 1.1681,
"eval_samples_per_second": 97.595,
"eval_steps_per_second": 5.137,
"step": 790
},
{
"epoch": 2.4615384615384617,
"grad_norm": 2.9434351921081543,
"learning_rate": 1.6387947135969796e-05,
"loss": 0.2305,
"step": 800
},
{
"epoch": 2.4615384615384617,
"eval_loss": 0.9017049074172974,
"eval_runtime": 1.1681,
"eval_samples_per_second": 97.598,
"eval_steps_per_second": 5.137,
"step": 800
},
{
"epoch": 2.4923076923076923,
"grad_norm": 1.2757177352905273,
"learning_rate": 1.6269120834852892e-05,
"loss": 0.1837,
"step": 810
},
{
"epoch": 2.4923076923076923,
"eval_loss": 0.8789340853691101,
"eval_runtime": 1.1681,
"eval_samples_per_second": 97.593,
"eval_steps_per_second": 5.136,
"step": 810
},
{
"epoch": 2.523076923076923,
"grad_norm": 2.2374846935272217,
"learning_rate": 1.6148817936074267e-05,
"loss": 0.1846,
"step": 820
},
{
"epoch": 2.523076923076923,
"eval_loss": 0.8868066668510437,
"eval_runtime": 1.1703,
"eval_samples_per_second": 97.411,
"eval_steps_per_second": 5.127,
"step": 820
},
{
"epoch": 2.5538461538461537,
"grad_norm": 2.112977981567383,
"learning_rate": 1.6027066775182664e-05,
"loss": 0.215,
"step": 830
},
{
"epoch": 2.5538461538461537,
"eval_loss": 0.8842012286186218,
"eval_runtime": 1.168,
"eval_samples_per_second": 97.604,
"eval_steps_per_second": 5.137,
"step": 830
},
{
"epoch": 2.5846153846153848,
"grad_norm": 1.7254525423049927,
"learning_rate": 1.5903896028843316e-05,
"loss": 0.2021,
"step": 840
},
{
"epoch": 2.5846153846153848,
"eval_loss": 0.8752718567848206,
"eval_runtime": 1.172,
"eval_samples_per_second": 97.267,
"eval_steps_per_second": 5.119,
"step": 840
},
{
"epoch": 2.6153846153846154,
"grad_norm": 2.7533833980560303,
"learning_rate": 1.5779334708083585e-05,
"loss": 0.2087,
"step": 850
},
{
"epoch": 2.6153846153846154,
"eval_loss": 0.881074070930481,
"eval_runtime": 1.1742,
"eval_samples_per_second": 97.086,
"eval_steps_per_second": 5.11,
"step": 850
},
{
"epoch": 2.646153846153846,
"grad_norm": 1.364652395248413,
"learning_rate": 1.565341215145983e-05,
"loss": 0.205,
"step": 860
},
{
"epoch": 2.646153846153846,
"eval_loss": 0.8765241503715515,
"eval_runtime": 1.1685,
"eval_samples_per_second": 97.561,
"eval_steps_per_second": 5.135,
"step": 860
},
{
"epoch": 2.676923076923077,
"grad_norm": 2.8074488639831543,
"learning_rate": 1.5526158018147168e-05,
"loss": 0.1872,
"step": 870
},
{
"epoch": 2.676923076923077,
"eval_loss": 0.8835176229476929,
"eval_runtime": 1.1654,
"eval_samples_per_second": 97.823,
"eval_steps_per_second": 5.149,
"step": 870
},
{
"epoch": 2.707692307692308,
"grad_norm": 1.8859968185424805,
"learning_rate": 1.5397602280953695e-05,
"loss": 0.197,
"step": 880
},
{
"epoch": 2.707692307692308,
"eval_loss": 0.8719732761383057,
"eval_runtime": 1.1708,
"eval_samples_per_second": 97.368,
"eval_steps_per_second": 5.125,
"step": 880
},
{
"epoch": 2.7384615384615385,
"grad_norm": 1.2429983615875244,
"learning_rate": 1.526777521926084e-05,
"loss": 0.1932,
"step": 890
},
{
"epoch": 2.7384615384615385,
"eval_loss": 0.8760843276977539,
"eval_runtime": 1.1705,
"eval_samples_per_second": 97.392,
"eval_steps_per_second": 5.126,
"step": 890
},
{
"epoch": 2.769230769230769,
"grad_norm": 1.7974516153335571,
"learning_rate": 1.5136707411891483e-05,
"loss": 0.2008,
"step": 900
},
{
"epoch": 2.769230769230769,
"eval_loss": 0.8759164214134216,
"eval_runtime": 1.1739,
"eval_samples_per_second": 97.116,
"eval_steps_per_second": 5.111,
"step": 900
},
{
"epoch": 2.8,
"grad_norm": 1.4502002000808716,
"learning_rate": 1.5004429729907619e-05,
"loss": 0.1998,
"step": 910
},
{
"epoch": 2.8,
"eval_loss": 0.8729867935180664,
"eval_runtime": 1.1659,
"eval_samples_per_second": 97.775,
"eval_steps_per_second": 5.146,
"step": 910
},
{
"epoch": 2.830769230769231,
"grad_norm": 1.3150732517242432,
"learning_rate": 1.4870973329339112e-05,
"loss": 0.1936,
"step": 920
},
{
"epoch": 2.830769230769231,
"eval_loss": 0.8798808455467224,
"eval_runtime": 1.1664,
"eval_samples_per_second": 97.737,
"eval_steps_per_second": 5.144,
"step": 920
},
{
"epoch": 2.8615384615384616,
"grad_norm": 1.8317633867263794,
"learning_rate": 1.4736369643845346e-05,
"loss": 0.1951,
"step": 930
},
{
"epoch": 2.8615384615384616,
"eval_loss": 0.900560736656189,
"eval_runtime": 1.1713,
"eval_samples_per_second": 97.324,
"eval_steps_per_second": 5.122,
"step": 930
},
{
"epoch": 2.8923076923076922,
"grad_norm": 2.284203290939331,
"learning_rate": 1.4600650377311523e-05,
"loss": 0.1884,
"step": 940
},
{
"epoch": 2.8923076923076922,
"eval_loss": 0.8905934691429138,
"eval_runtime": 1.1697,
"eval_samples_per_second": 97.462,
"eval_steps_per_second": 5.13,
"step": 940
},
{
"epoch": 2.9230769230769234,
"grad_norm": 1.9384973049163818,
"learning_rate": 1.446384749638128e-05,
"loss": 0.1881,
"step": 950
},
{
"epoch": 2.9230769230769234,
"eval_loss": 0.8698312640190125,
"eval_runtime": 1.1727,
"eval_samples_per_second": 97.213,
"eval_steps_per_second": 5.116,
"step": 950
},
{
"epoch": 2.953846153846154,
"grad_norm": 1.7943904399871826,
"learning_rate": 1.4325993222927414e-05,
"loss": 0.2166,
"step": 960
},
{
"epoch": 2.953846153846154,
"eval_loss": 0.8718281984329224,
"eval_runtime": 1.1666,
"eval_samples_per_second": 97.724,
"eval_steps_per_second": 5.143,
"step": 960
},
{
"epoch": 2.9846153846153847,
"grad_norm": 2.0599663257598877,
"learning_rate": 1.4187120026462508e-05,
"loss": 0.2082,
"step": 970
},
{
"epoch": 2.9846153846153847,
"eval_loss": 0.8722580075263977,
"eval_runtime": 1.1673,
"eval_samples_per_second": 97.658,
"eval_steps_per_second": 5.14,
"step": 970
},
{
"epoch": 3.0153846153846153,
"grad_norm": 1.2260268926620483,
"learning_rate": 1.4047260616491225e-05,
"loss": 0.1631,
"step": 980
},
{
"epoch": 3.0153846153846153,
"eval_loss": 0.8925275206565857,
"eval_runtime": 1.1687,
"eval_samples_per_second": 97.541,
"eval_steps_per_second": 5.134,
"step": 980
},
{
"epoch": 3.046153846153846,
"grad_norm": 1.544405460357666,
"learning_rate": 1.3906447934806074e-05,
"loss": 0.1024,
"step": 990
},
{
"epoch": 3.046153846153846,
"eval_loss": 0.9651603102684021,
"eval_runtime": 1.1687,
"eval_samples_per_second": 97.54,
"eval_steps_per_second": 5.134,
"step": 990
},
{
"epoch": 3.076923076923077,
"grad_norm": 1.8806118965148926,
"learning_rate": 1.3764715147728451e-05,
"loss": 0.1111,
"step": 1000
},
{
"epoch": 3.076923076923077,
"eval_loss": 0.9773316383361816,
"eval_runtime": 1.1719,
"eval_samples_per_second": 97.274,
"eval_steps_per_second": 5.12,
"step": 1000
},
{
"epoch": 3.1076923076923078,
"grad_norm": 1.2032195329666138,
"learning_rate": 1.3622095638296827e-05,
"loss": 0.1011,
"step": 1010
},
{
"epoch": 3.1076923076923078,
"eval_loss": 0.9423761367797852,
"eval_runtime": 1.1686,
"eval_samples_per_second": 97.556,
"eval_steps_per_second": 5.135,
"step": 1010
},
{
"epoch": 3.1384615384615384,
"grad_norm": 1.2994767427444458,
"learning_rate": 1.3478622998403861e-05,
"loss": 0.1078,
"step": 1020
},
{
"epoch": 3.1384615384615384,
"eval_loss": 0.9416558146476746,
"eval_runtime": 1.1698,
"eval_samples_per_second": 97.455,
"eval_steps_per_second": 5.129,
"step": 1020
},
{
"epoch": 3.169230769230769,
"grad_norm": 1.061914324760437,
"learning_rate": 1.3334331020884328e-05,
"loss": 0.0991,
"step": 1030
},
{
"epoch": 3.169230769230769,
"eval_loss": 0.9638768434524536,
"eval_runtime": 1.1664,
"eval_samples_per_second": 97.741,
"eval_steps_per_second": 5.144,
"step": 1030
},
{
"epoch": 3.2,
"grad_norm": 1.4495635032653809,
"learning_rate": 1.318925369155574e-05,
"loss": 0.0979,
"step": 1040
},
{
"epoch": 3.2,
"eval_loss": 0.9724640846252441,
"eval_runtime": 1.1718,
"eval_samples_per_second": 97.282,
"eval_steps_per_second": 5.12,
"step": 1040
},
{
"epoch": 3.230769230769231,
"grad_norm": 1.5616148710250854,
"learning_rate": 1.3043425181213471e-05,
"loss": 0.1109,
"step": 1050
},
{
"epoch": 3.230769230769231,
"eval_loss": 0.972594141960144,
"eval_runtime": 1.169,
"eval_samples_per_second": 97.515,
"eval_steps_per_second": 5.132,
"step": 1050
},
{
"epoch": 3.2615384615384615,
"grad_norm": 2.1167333126068115,
"learning_rate": 1.2896879837582356e-05,
"loss": 0.1047,
"step": 1060
},
{
"epoch": 3.2615384615384615,
"eval_loss": 0.9514709711074829,
"eval_runtime": 1.1679,
"eval_samples_per_second": 97.612,
"eval_steps_per_second": 5.137,
"step": 1060
},
{
"epoch": 3.292307692307692,
"grad_norm": 1.5117723941802979,
"learning_rate": 1.2749652177226592e-05,
"loss": 0.1075,
"step": 1070
},
{
"epoch": 3.292307692307692,
"eval_loss": 0.9520531296730042,
"eval_runtime": 1.1746,
"eval_samples_per_second": 97.058,
"eval_steps_per_second": 5.108,
"step": 1070
},
{
"epoch": 3.3230769230769233,
"grad_norm": 1.231341004371643,
"learning_rate": 1.2601776877419876e-05,
"loss": 0.1021,
"step": 1080
},
{
"epoch": 3.3230769230769233,
"eval_loss": 0.9573836922645569,
"eval_runtime": 1.1666,
"eval_samples_per_second": 97.723,
"eval_steps_per_second": 5.143,
"step": 1080
},
{
"epoch": 3.353846153846154,
"grad_norm": 1.5957950353622437,
"learning_rate": 1.2453288767977686e-05,
"loss": 0.1069,
"step": 1090
},
{
"epoch": 3.353846153846154,
"eval_loss": 0.9602900147438049,
"eval_runtime": 1.169,
"eval_samples_per_second": 97.519,
"eval_steps_per_second": 5.133,
"step": 1090
},
{
"epoch": 3.3846153846153846,
"grad_norm": 0.7704668641090393,
"learning_rate": 1.2304222823053653e-05,
"loss": 0.0969,
"step": 1100
},
{
"epoch": 3.3846153846153846,
"eval_loss": 0.9564027190208435,
"eval_runtime": 1.1725,
"eval_samples_per_second": 97.232,
"eval_steps_per_second": 5.117,
"step": 1100
},
{
"epoch": 3.4153846153846152,
"grad_norm": 1.638713002204895,
"learning_rate": 1.2154614152901916e-05,
"loss": 0.1136,
"step": 1110
},
{
"epoch": 3.4153846153846152,
"eval_loss": 0.9655954241752625,
"eval_runtime": 1.1671,
"eval_samples_per_second": 97.68,
"eval_steps_per_second": 5.141,
"step": 1110
},
{
"epoch": 3.4461538461538463,
"grad_norm": 1.2840876579284668,
"learning_rate": 1.2004497995607415e-05,
"loss": 0.1097,
"step": 1120
},
{
"epoch": 3.4461538461538463,
"eval_loss": 0.9535898566246033,
"eval_runtime": 1.1708,
"eval_samples_per_second": 97.368,
"eval_steps_per_second": 5.125,
"step": 1120
},
{
"epoch": 3.476923076923077,
"grad_norm": 1.021537184715271,
"learning_rate": 1.1853909708786111e-05,
"loss": 0.1048,
"step": 1130
},
{
"epoch": 3.476923076923077,
"eval_loss": 0.952057421207428,
"eval_runtime": 1.1712,
"eval_samples_per_second": 97.336,
"eval_steps_per_second": 5.123,
"step": 1130
},
{
"epoch": 3.5076923076923077,
"grad_norm": 2.217410087585449,
"learning_rate": 1.1702884761257003e-05,
"loss": 0.118,
"step": 1140
},
{
"epoch": 3.5076923076923077,
"eval_loss": 0.9404497146606445,
"eval_runtime": 1.1731,
"eval_samples_per_second": 97.178,
"eval_steps_per_second": 5.115,
"step": 1140
},
{
"epoch": 3.5384615384615383,
"grad_norm": 1.5386874675750732,
"learning_rate": 1.1551458724688e-05,
"loss": 0.1053,
"step": 1150
},
{
"epoch": 3.5384615384615383,
"eval_loss": 0.9536699056625366,
"eval_runtime": 1.1718,
"eval_samples_per_second": 97.284,
"eval_steps_per_second": 5.12,
"step": 1150
},
{
"epoch": 3.569230769230769,
"grad_norm": 0.8064364790916443,
"learning_rate": 1.1399667265217522e-05,
"loss": 0.1061,
"step": 1160
},
{
"epoch": 3.569230769230769,
"eval_loss": 0.9544848799705505,
"eval_runtime": 1.1669,
"eval_samples_per_second": 97.693,
"eval_steps_per_second": 5.142,
"step": 1160
},
{
"epoch": 3.6,
"grad_norm": 0.917335569858551,
"learning_rate": 1.1247546135053904e-05,
"loss": 0.102,
"step": 1170
},
{
"epoch": 3.6,
"eval_loss": 0.9585210084915161,
"eval_runtime": 1.17,
"eval_samples_per_second": 97.434,
"eval_steps_per_second": 5.128,
"step": 1170
},
{
"epoch": 3.6307692307692307,
"grad_norm": 1.0991976261138916,
"learning_rate": 1.1095131164054476e-05,
"loss": 0.1072,
"step": 1180
},
{
"epoch": 3.6307692307692307,
"eval_loss": 0.9673210978507996,
"eval_runtime": 1.1692,
"eval_samples_per_second": 97.5,
"eval_steps_per_second": 5.132,
"step": 1180
},
{
"epoch": 3.6615384615384614,
"grad_norm": 1.2590489387512207,
"learning_rate": 1.0942458251286384e-05,
"loss": 0.1007,
"step": 1190
},
{
"epoch": 3.6615384615384614,
"eval_loss": 0.9589065313339233,
"eval_runtime": 1.1693,
"eval_samples_per_second": 97.491,
"eval_steps_per_second": 5.131,
"step": 1190
},
{
"epoch": 3.6923076923076925,
"grad_norm": 0.555060625076294,
"learning_rate": 1.078956335657109e-05,
"loss": 0.1045,
"step": 1200
},
{
"epoch": 3.6923076923076925,
"eval_loss": 0.9449043273925781,
"eval_runtime": 1.1697,
"eval_samples_per_second": 97.457,
"eval_steps_per_second": 5.129,
"step": 1200
},
{
"epoch": 3.723076923076923,
"grad_norm": 1.4285695552825928,
"learning_rate": 1.0636482492014603e-05,
"loss": 0.1042,
"step": 1210
},
{
"epoch": 3.723076923076923,
"eval_loss": 0.9505679607391357,
"eval_runtime": 1.1646,
"eval_samples_per_second": 97.886,
"eval_steps_per_second": 5.152,
"step": 1210
},
{
"epoch": 3.753846153846154,
"grad_norm": 1.0818232297897339,
"learning_rate": 1.0483251713525335e-05,
"loss": 0.1066,
"step": 1220
},
{
"epoch": 3.753846153846154,
"eval_loss": 0.958114743232727,
"eval_runtime": 1.1679,
"eval_samples_per_second": 97.609,
"eval_steps_per_second": 5.137,
"step": 1220
},
{
"epoch": 3.7846153846153845,
"grad_norm": 1.027013897895813,
"learning_rate": 1.0329907112321685e-05,
"loss": 0.1103,
"step": 1230
},
{
"epoch": 3.7846153846153845,
"eval_loss": 0.9584141969680786,
"eval_runtime": 1.1717,
"eval_samples_per_second": 97.292,
"eval_steps_per_second": 5.121,
"step": 1230
},
{
"epoch": 3.815384615384615,
"grad_norm": 1.2633482217788696,
"learning_rate": 1.0176484806431288e-05,
"loss": 0.1135,
"step": 1240
},
{
"epoch": 3.815384615384615,
"eval_loss": 0.9456351399421692,
"eval_runtime": 1.1713,
"eval_samples_per_second": 97.325,
"eval_steps_per_second": 5.122,
"step": 1240
},
{
"epoch": 3.8461538461538463,
"grad_norm": 1.5647493600845337,
"learning_rate": 1.002302093218396e-05,
"loss": 0.102,
"step": 1250
},
{
"epoch": 3.8461538461538463,
"eval_loss": 0.9635660648345947,
"eval_runtime": 1.1695,
"eval_samples_per_second": 97.479,
"eval_steps_per_second": 5.13,
"step": 1250
},
{
"epoch": 3.876923076923077,
"grad_norm": 1.1846381425857544,
"learning_rate": 9.869551635700321e-06,
"loss": 0.0996,
"step": 1260
},
{
"epoch": 3.876923076923077,
"eval_loss": 0.9652856588363647,
"eval_runtime": 1.1667,
"eval_samples_per_second": 97.709,
"eval_steps_per_second": 5.143,
"step": 1260
},
{
"epoch": 3.9076923076923076,
"grad_norm": 1.5169893503189087,
"learning_rate": 9.716113064378113e-06,
"loss": 0.0995,
"step": 1270
},
{
"epoch": 3.9076923076923076,
"eval_loss": 0.9649413228034973,
"eval_runtime": 1.1669,
"eval_samples_per_second": 97.691,
"eval_steps_per_second": 5.142,
"step": 1270
},
{
"epoch": 3.9384615384615387,
"grad_norm": 1.013720154762268,
"learning_rate": 9.562741358378239e-06,
"loss": 0.1124,
"step": 1280
},
{
"epoch": 3.9384615384615387,
"eval_loss": 0.9643102288246155,
"eval_runtime": 1.1719,
"eval_samples_per_second": 97.276,
"eval_steps_per_second": 5.12,
"step": 1280
},
{
"epoch": 3.9692307692307693,
"grad_norm": 1.1078283786773682,
"learning_rate": 9.409472642112454e-06,
"loss": 0.096,
"step": 1290
},
{
"epoch": 3.9692307692307693,
"eval_loss": 0.9525210857391357,
"eval_runtime": 1.1921,
"eval_samples_per_second": 95.632,
"eval_steps_per_second": 5.033,
"step": 1290
},
{
"epoch": 4.0,
"grad_norm": 1.2566571235656738,
"learning_rate": 9.256343015734842e-06,
"loss": 0.1133,
"step": 1300
},
{
"epoch": 4.0,
"eval_loss": 0.9479546546936035,
"eval_runtime": 1.1715,
"eval_samples_per_second": 97.308,
"eval_steps_per_second": 5.121,
"step": 1300
},
{
"epoch": 4.030769230769231,
"grad_norm": 1.1318272352218628,
"learning_rate": 9.103388546638929e-06,
"loss": 0.069,
"step": 1310
},
{
"epoch": 4.030769230769231,
"eval_loss": 0.966812789440155,
"eval_runtime": 1.1672,
"eval_samples_per_second": 97.667,
"eval_steps_per_second": 5.14,
"step": 1310
},
{
"epoch": 4.061538461538461,
"grad_norm": 0.9845967292785645,
"learning_rate": 8.950645260962572e-06,
"loss": 0.0716,
"step": 1320
},
{
"epoch": 4.061538461538461,
"eval_loss": 1.0023221969604492,
"eval_runtime": 1.17,
"eval_samples_per_second": 97.437,
"eval_steps_per_second": 5.128,
"step": 1320
},
{
"epoch": 4.092307692307692,
"grad_norm": 0.5891923904418945,
"learning_rate": 8.798149135102528e-06,
"loss": 0.0637,
"step": 1330
},
{
"epoch": 4.092307692307692,
"eval_loss": 1.0226399898529053,
"eval_runtime": 1.1693,
"eval_samples_per_second": 97.493,
"eval_steps_per_second": 5.131,
"step": 1330
},
{
"epoch": 4.123076923076923,
"grad_norm": 0.9881710410118103,
"learning_rate": 8.645936087240758e-06,
"loss": 0.0695,
"step": 1340
},
{
"epoch": 4.123076923076923,
"eval_loss": 1.0229477882385254,
"eval_runtime": 1.1678,
"eval_samples_per_second": 97.615,
"eval_steps_per_second": 5.138,
"step": 1340
},
{
"epoch": 4.153846153846154,
"grad_norm": 0.47998377680778503,
"learning_rate": 8.494041968884423e-06,
"loss": 0.0628,
"step": 1350
},
{
"epoch": 4.153846153846154,
"eval_loss": 1.0193408727645874,
"eval_runtime": 1.1721,
"eval_samples_per_second": 97.261,
"eval_steps_per_second": 5.119,
"step": 1350
},
{
"epoch": 4.184615384615385,
"grad_norm": 0.6444534063339233,
"learning_rate": 8.342502556421627e-06,
"loss": 0.0711,
"step": 1360
},
{
"epoch": 4.184615384615385,
"eval_loss": 1.0234897136688232,
"eval_runtime": 1.1655,
"eval_samples_per_second": 97.808,
"eval_steps_per_second": 5.148,
"step": 1360
},
{
"epoch": 4.2153846153846155,
"grad_norm": 1.1395206451416016,
"learning_rate": 8.19135354269479e-06,
"loss": 0.0662,
"step": 1370
},
{
"epoch": 4.2153846153846155,
"eval_loss": 1.027916669845581,
"eval_runtime": 1.1668,
"eval_samples_per_second": 97.702,
"eval_steps_per_second": 5.142,
"step": 1370
},
{
"epoch": 4.246153846153846,
"grad_norm": 0.6894501447677612,
"learning_rate": 8.040630528593753e-06,
"loss": 0.0748,
"step": 1380
},
{
"epoch": 4.246153846153846,
"eval_loss": 1.0266273021697998,
"eval_runtime": 1.1674,
"eval_samples_per_second": 97.65,
"eval_steps_per_second": 5.139,
"step": 1380
},
{
"epoch": 4.276923076923077,
"grad_norm": 0.6127138137817383,
"learning_rate": 7.890369014670512e-06,
"loss": 0.0571,
"step": 1390
},
{
"epoch": 4.276923076923077,
"eval_loss": 1.0378963947296143,
"eval_runtime": 1.169,
"eval_samples_per_second": 97.523,
"eval_steps_per_second": 5.133,
"step": 1390
},
{
"epoch": 4.3076923076923075,
"grad_norm": 1.168372392654419,
"learning_rate": 7.740604392777612e-06,
"loss": 0.067,
"step": 1400
},
{
"epoch": 4.3076923076923075,
"eval_loss": 1.0480735301971436,
"eval_runtime": 1.1695,
"eval_samples_per_second": 97.475,
"eval_steps_per_second": 5.13,
"step": 1400
},
{
"epoch": 4.338461538461538,
"grad_norm": 0.5983268022537231,
"learning_rate": 7.591371937732091e-06,
"loss": 0.0691,
"step": 1410
},
{
"epoch": 4.338461538461538,
"eval_loss": 1.0469272136688232,
"eval_runtime": 1.1663,
"eval_samples_per_second": 97.748,
"eval_steps_per_second": 5.145,
"step": 1410
},
{
"epoch": 4.36923076923077,
"grad_norm": 0.7790025472640991,
"learning_rate": 7.442706799007056e-06,
"loss": 0.0711,
"step": 1420
},
{
"epoch": 4.36923076923077,
"eval_loss": 1.0372542142868042,
"eval_runtime": 1.1684,
"eval_samples_per_second": 97.571,
"eval_steps_per_second": 5.135,
"step": 1420
},
{
"epoch": 4.4,
"grad_norm": 1.0238187313079834,
"learning_rate": 7.294643992452735e-06,
"loss": 0.0737,
"step": 1430
},
{
"epoch": 4.4,
"eval_loss": 1.03952157497406,
"eval_runtime": 1.1679,
"eval_samples_per_second": 97.615,
"eval_steps_per_second": 5.138,
"step": 1430
},
{
"epoch": 4.430769230769231,
"grad_norm": 1.3187087774276733,
"learning_rate": 7.147218392049026e-06,
"loss": 0.0673,
"step": 1440
},
{
"epoch": 4.430769230769231,
"eval_loss": 1.036614179611206,
"eval_runtime": 1.169,
"eval_samples_per_second": 97.522,
"eval_steps_per_second": 5.133,
"step": 1440
},
{
"epoch": 4.461538461538462,
"grad_norm": 1.0066938400268555,
"learning_rate": 7.000464721691438e-06,
"loss": 0.0671,
"step": 1450
},
{
"epoch": 4.461538461538462,
"eval_loss": 1.033462643623352,
"eval_runtime": 1.1706,
"eval_samples_per_second": 97.388,
"eval_steps_per_second": 5.126,
"step": 1450
},
{
"epoch": 4.492307692307692,
"grad_norm": 0.8664164543151855,
"learning_rate": 6.854417547012415e-06,
"loss": 0.0637,
"step": 1460
},
{
"epoch": 4.492307692307692,
"eval_loss": 1.0385520458221436,
"eval_runtime": 1.1667,
"eval_samples_per_second": 97.708,
"eval_steps_per_second": 5.143,
"step": 1460
},
{
"epoch": 4.523076923076923,
"grad_norm": 0.41295212507247925,
"learning_rate": 6.7091112672399e-06,
"loss": 0.0626,
"step": 1470
},
{
"epoch": 4.523076923076923,
"eval_loss": 1.0354348421096802,
"eval_runtime": 1.1643,
"eval_samples_per_second": 97.913,
"eval_steps_per_second": 5.153,
"step": 1470
},
{
"epoch": 4.553846153846154,
"grad_norm": 1.439942479133606,
"learning_rate": 6.564580107095133e-06,
"loss": 0.0675,
"step": 1480
},
{
"epoch": 4.553846153846154,
"eval_loss": 1.0380290746688843,
"eval_runtime": 1.1702,
"eval_samples_per_second": 97.415,
"eval_steps_per_second": 5.127,
"step": 1480
},
{
"epoch": 4.584615384615384,
"grad_norm": 0.6881715059280396,
"learning_rate": 6.4208581087315035e-06,
"loss": 0.0662,
"step": 1490
},
{
"epoch": 4.584615384615384,
"eval_loss": 1.029496192932129,
"eval_runtime": 1.1716,
"eval_samples_per_second": 97.305,
"eval_steps_per_second": 5.121,
"step": 1490
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.5975565314292908,
"learning_rate": 6.277979123716455e-06,
"loss": 0.0628,
"step": 1500
},
{
"epoch": 4.615384615384615,
"eval_loss": 1.0207117795944214,
"eval_runtime": 1.17,
"eval_samples_per_second": 97.434,
"eval_steps_per_second": 5.128,
"step": 1500
},
{
"epoch": 4.6461538461538465,
"grad_norm": 0.7314721345901489,
"learning_rate": 6.13597680505823e-06,
"loss": 0.0748,
"step": 1510
},
{
"epoch": 4.6461538461538465,
"eval_loss": 1.021004557609558,
"eval_runtime": 1.1652,
"eval_samples_per_second": 97.839,
"eval_steps_per_second": 5.149,
"step": 1510
},
{
"epoch": 4.676923076923077,
"grad_norm": 1.1508655548095703,
"learning_rate": 5.994884599279443e-06,
"loss": 0.0707,
"step": 1520
},
{
"epoch": 4.676923076923077,
"eval_loss": 1.023290753364563,
"eval_runtime": 1.1651,
"eval_samples_per_second": 97.845,
"eval_steps_per_second": 5.15,
"step": 1520
},
{
"epoch": 4.707692307692308,
"grad_norm": 0.9571641087532043,
"learning_rate": 5.854735738539203e-06,
"loss": 0.0672,
"step": 1530
},
{
"epoch": 4.707692307692308,
"eval_loss": 1.026742935180664,
"eval_runtime": 1.1684,
"eval_samples_per_second": 97.565,
"eval_steps_per_second": 5.135,
"step": 1530
},
{
"epoch": 4.7384615384615385,
"grad_norm": 0.8185418248176575,
"learning_rate": 5.715563232805825e-06,
"loss": 0.0606,
"step": 1540
},
{
"epoch": 4.7384615384615385,
"eval_loss": 1.032530426979065,
"eval_runtime": 1.1692,
"eval_samples_per_second": 97.5,
"eval_steps_per_second": 5.132,
"step": 1540
},
{
"epoch": 4.769230769230769,
"grad_norm": 0.9479151368141174,
"learning_rate": 5.577399862081789e-06,
"loss": 0.0632,
"step": 1550
},
{
"epoch": 4.769230769230769,
"eval_loss": 1.039183497428894,
"eval_runtime": 1.1755,
"eval_samples_per_second": 96.981,
"eval_steps_per_second": 5.104,
"step": 1550
}
],
"logging_steps": 10,
"max_steps": 2275,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 50,
"total_flos": 9.60159616681902e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}