qwen1.5-14B-RM-Lora / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.997411003236246,
"eval_steps": 50,
"global_step": 4632,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008629989212513484,
"grad_norm": 20.681591033935547,
"learning_rate": 2.1551724137931036e-07,
"loss": 1.0408,
"step": 10
},
{
"epoch": 0.017259978425026967,
"grad_norm": 96.75000762939453,
"learning_rate": 4.3103448275862073e-07,
"loss": 1.047,
"step": 20
},
{
"epoch": 0.025889967637540454,
"grad_norm": 22.108104705810547,
"learning_rate": 6.465517241379311e-07,
"loss": 1.0718,
"step": 30
},
{
"epoch": 0.034519956850053934,
"grad_norm": 40.05157470703125,
"learning_rate": 8.620689655172415e-07,
"loss": 1.0488,
"step": 40
},
{
"epoch": 0.043149946062567425,
"grad_norm": 15.964655876159668,
"learning_rate": 1.0775862068965518e-06,
"loss": 1.075,
"step": 50
},
{
"epoch": 0.043149946062567425,
"eval_accuracy": 0.49320388349514566,
"eval_loss": 1.018173336982727,
"eval_runtime": 322.676,
"eval_samples_per_second": 1.596,
"eval_steps_per_second": 1.596,
"step": 50
},
{
"epoch": 0.05177993527508091,
"grad_norm": 27.802989959716797,
"learning_rate": 1.2931034482758623e-06,
"loss": 1.1389,
"step": 60
},
{
"epoch": 0.06040992448759439,
"grad_norm": 28.11711883544922,
"learning_rate": 1.5086206896551726e-06,
"loss": 1.1116,
"step": 70
},
{
"epoch": 0.06903991370010787,
"grad_norm": 22.176109313964844,
"learning_rate": 1.724137931034483e-06,
"loss": 1.0697,
"step": 80
},
{
"epoch": 0.07766990291262135,
"grad_norm": 41.33392333984375,
"learning_rate": 1.9396551724137932e-06,
"loss": 1.0242,
"step": 90
},
{
"epoch": 0.08629989212513485,
"grad_norm": 34.400508880615234,
"learning_rate": 2.1551724137931035e-06,
"loss": 1.0505,
"step": 100
},
{
"epoch": 0.08629989212513485,
"eval_accuracy": 0.5009708737864078,
"eval_loss": 0.9943639039993286,
"eval_runtime": 321.8255,
"eval_samples_per_second": 1.6,
"eval_steps_per_second": 1.6,
"step": 100
},
{
"epoch": 0.09492988133764833,
"grad_norm": 28.23130989074707,
"learning_rate": 2.370689655172414e-06,
"loss": 1.0073,
"step": 110
},
{
"epoch": 0.10355987055016182,
"grad_norm": 36.090736389160156,
"learning_rate": 2.5862068965517246e-06,
"loss": 0.9802,
"step": 120
},
{
"epoch": 0.1121898597626753,
"grad_norm": 58.96036148071289,
"learning_rate": 2.8017241379310345e-06,
"loss": 0.9827,
"step": 130
},
{
"epoch": 0.12081984897518878,
"grad_norm": 18.94993782043457,
"learning_rate": 3.017241379310345e-06,
"loss": 1.0015,
"step": 140
},
{
"epoch": 0.12944983818770225,
"grad_norm": 32.874114990234375,
"learning_rate": 3.2327586206896555e-06,
"loss": 0.9387,
"step": 150
},
{
"epoch": 0.12944983818770225,
"eval_accuracy": 0.5048543689320388,
"eval_loss": 0.9101472496986389,
"eval_runtime": 321.9422,
"eval_samples_per_second": 1.6,
"eval_steps_per_second": 1.6,
"step": 150
},
{
"epoch": 0.13807982740021574,
"grad_norm": 14.486083030700684,
"learning_rate": 3.448275862068966e-06,
"loss": 0.9255,
"step": 160
},
{
"epoch": 0.14670981661272922,
"grad_norm": 26.06964111328125,
"learning_rate": 3.663793103448276e-06,
"loss": 0.8775,
"step": 170
},
{
"epoch": 0.1553398058252427,
"grad_norm": 23.44382667541504,
"learning_rate": 3.8793103448275865e-06,
"loss": 0.8675,
"step": 180
},
{
"epoch": 0.16396979503775622,
"grad_norm": 22.29359245300293,
"learning_rate": 4.094827586206897e-06,
"loss": 0.9728,
"step": 190
},
{
"epoch": 0.1725997842502697,
"grad_norm": 38.14244842529297,
"learning_rate": 4.310344827586207e-06,
"loss": 0.92,
"step": 200
},
{
"epoch": 0.1725997842502697,
"eval_accuracy": 0.5048543689320388,
"eval_loss": 0.9019931554794312,
"eval_runtime": 321.9115,
"eval_samples_per_second": 1.6,
"eval_steps_per_second": 1.6,
"step": 200
},
{
"epoch": 0.18122977346278318,
"grad_norm": 64.9331283569336,
"learning_rate": 4.525862068965518e-06,
"loss": 0.9633,
"step": 210
},
{
"epoch": 0.18985976267529667,
"grad_norm": 39.31247329711914,
"learning_rate": 4.741379310344828e-06,
"loss": 0.9646,
"step": 220
},
{
"epoch": 0.19848975188781015,
"grad_norm": 26.192481994628906,
"learning_rate": 4.9568965517241384e-06,
"loss": 0.9956,
"step": 230
},
{
"epoch": 0.20711974110032363,
"grad_norm": 33.946685791015625,
"learning_rate": 5.172413793103449e-06,
"loss": 0.8929,
"step": 240
},
{
"epoch": 0.21574973031283712,
"grad_norm": 20.04779624938965,
"learning_rate": 5.38793103448276e-06,
"loss": 0.9531,
"step": 250
},
{
"epoch": 0.21574973031283712,
"eval_accuracy": 0.5223300970873787,
"eval_loss": 0.886761486530304,
"eval_runtime": 321.7179,
"eval_samples_per_second": 1.601,
"eval_steps_per_second": 1.601,
"step": 250
},
{
"epoch": 0.2243797195253506,
"grad_norm": 53.125587463378906,
"learning_rate": 5.603448275862069e-06,
"loss": 0.9716,
"step": 260
},
{
"epoch": 0.23300970873786409,
"grad_norm": 43.821533203125,
"learning_rate": 5.81896551724138e-06,
"loss": 0.9407,
"step": 270
},
{
"epoch": 0.24163969795037757,
"grad_norm": 47.41954803466797,
"learning_rate": 6.03448275862069e-06,
"loss": 0.9464,
"step": 280
},
{
"epoch": 0.25026968716289105,
"grad_norm": 29.925968170166016,
"learning_rate": 6.25e-06,
"loss": 0.9151,
"step": 290
},
{
"epoch": 0.2588996763754045,
"grad_norm": 23.372934341430664,
"learning_rate": 6.465517241379311e-06,
"loss": 0.849,
"step": 300
},
{
"epoch": 0.2588996763754045,
"eval_accuracy": 0.5339805825242718,
"eval_loss": 0.856666088104248,
"eval_runtime": 321.7027,
"eval_samples_per_second": 1.601,
"eval_steps_per_second": 1.601,
"step": 300
},
{
"epoch": 0.267529665587918,
"grad_norm": 22.651479721069336,
"learning_rate": 6.681034482758622e-06,
"loss": 1.0237,
"step": 310
},
{
"epoch": 0.2761596548004315,
"grad_norm": 17.50941276550293,
"learning_rate": 6.896551724137932e-06,
"loss": 0.8401,
"step": 320
},
{
"epoch": 0.284789644012945,
"grad_norm": 51.20744323730469,
"learning_rate": 7.1120689655172415e-06,
"loss": 0.9366,
"step": 330
},
{
"epoch": 0.29341963322545844,
"grad_norm": 23.283870697021484,
"learning_rate": 7.327586206896552e-06,
"loss": 0.8198,
"step": 340
},
{
"epoch": 0.30204962243797195,
"grad_norm": 24.28423500061035,
"learning_rate": 7.543103448275862e-06,
"loss": 0.8897,
"step": 350
},
{
"epoch": 0.30204962243797195,
"eval_accuracy": 0.5262135922330097,
"eval_loss": 0.8523032069206238,
"eval_runtime": 321.7555,
"eval_samples_per_second": 1.601,
"eval_steps_per_second": 1.601,
"step": 350
},
{
"epoch": 0.3106796116504854,
"grad_norm": 27.711999893188477,
"learning_rate": 7.758620689655173e-06,
"loss": 0.8352,
"step": 360
},
{
"epoch": 0.3193096008629989,
"grad_norm": 25.017581939697266,
"learning_rate": 7.974137931034484e-06,
"loss": 0.7918,
"step": 370
},
{
"epoch": 0.32793959007551243,
"grad_norm": 33.27495193481445,
"learning_rate": 8.189655172413794e-06,
"loss": 0.9004,
"step": 380
},
{
"epoch": 0.3365695792880259,
"grad_norm": 17.355253219604492,
"learning_rate": 8.405172413793105e-06,
"loss": 0.8079,
"step": 390
},
{
"epoch": 0.3451995685005394,
"grad_norm": 33.237518310546875,
"learning_rate": 8.620689655172414e-06,
"loss": 0.8512,
"step": 400
},
{
"epoch": 0.3451995685005394,
"eval_accuracy": 0.5262135922330097,
"eval_loss": 0.8104857206344604,
"eval_runtime": 321.6492,
"eval_samples_per_second": 1.601,
"eval_steps_per_second": 1.601,
"step": 400
},
{
"epoch": 0.35382955771305286,
"grad_norm": 31.926298141479492,
"learning_rate": 8.836206896551725e-06,
"loss": 0.8049,
"step": 410
},
{
"epoch": 0.36245954692556637,
"grad_norm": 18.511268615722656,
"learning_rate": 9.051724137931036e-06,
"loss": 0.7887,
"step": 420
},
{
"epoch": 0.3710895361380798,
"grad_norm": 12.080615043640137,
"learning_rate": 9.267241379310346e-06,
"loss": 0.8286,
"step": 430
},
{
"epoch": 0.37971952535059333,
"grad_norm": 22.48563003540039,
"learning_rate": 9.482758620689655e-06,
"loss": 0.8201,
"step": 440
},
{
"epoch": 0.3883495145631068,
"grad_norm": 25.83173179626465,
"learning_rate": 9.698275862068966e-06,
"loss": 0.7854,
"step": 450
},
{
"epoch": 0.3883495145631068,
"eval_accuracy": 0.5106796116504855,
"eval_loss": 0.7994323372840881,
"eval_runtime": 321.4421,
"eval_samples_per_second": 1.602,
"eval_steps_per_second": 1.602,
"step": 450
},
{
"epoch": 0.3969795037756203,
"grad_norm": 41.783851623535156,
"learning_rate": 9.913793103448277e-06,
"loss": 0.8339,
"step": 460
},
{
"epoch": 0.40560949298813376,
"grad_norm": 12.72182846069336,
"learning_rate": 9.9999488687872e-06,
"loss": 0.8063,
"step": 470
},
{
"epoch": 0.41423948220064727,
"grad_norm": 28.933361053466797,
"learning_rate": 9.999636404051638e-06,
"loss": 0.8554,
"step": 480
},
{
"epoch": 0.4228694714131607,
"grad_norm": 48.14093017578125,
"learning_rate": 9.999039898540166e-06,
"loss": 0.9297,
"step": 490
},
{
"epoch": 0.43149946062567424,
"grad_norm": 27.8731746673584,
"learning_rate": 9.998159386141626e-06,
"loss": 0.8147,
"step": 500
},
{
"epoch": 0.43149946062567424,
"eval_accuracy": 0.5398058252427185,
"eval_loss": 0.7859384417533875,
"eval_runtime": 321.5871,
"eval_samples_per_second": 1.601,
"eval_steps_per_second": 1.601,
"step": 500
},
{
"epoch": 0.4401294498381877,
"grad_norm": 17.547481536865234,
"learning_rate": 9.996994916879941e-06,
"loss": 0.8449,
"step": 510
},
{
"epoch": 0.4487594390507012,
"grad_norm": 33.447723388671875,
"learning_rate": 9.995546556911271e-06,
"loss": 0.779,
"step": 520
},
{
"epoch": 0.45738942826321466,
"grad_norm": 41.81571578979492,
"learning_rate": 9.99381438852026e-06,
"loss": 0.7262,
"step": 530
},
{
"epoch": 0.46601941747572817,
"grad_norm": 40.82163619995117,
"learning_rate": 9.991798510115351e-06,
"loss": 0.8282,
"step": 540
},
{
"epoch": 0.4746494066882416,
"grad_norm": 55.30727767944336,
"learning_rate": 9.989499036223209e-06,
"loss": 0.8075,
"step": 550
},
{
"epoch": 0.4746494066882416,
"eval_accuracy": 0.5553398058252427,
"eval_loss": 0.7565743923187256,
"eval_runtime": 321.511,
"eval_samples_per_second": 1.602,
"eval_steps_per_second": 1.602,
"step": 550
},
{
"epoch": 0.48327939590075514,
"grad_norm": 51.085289001464844,
"learning_rate": 9.986916097482204e-06,
"loss": 0.7747,
"step": 560
},
{
"epoch": 0.4919093851132686,
"grad_norm": 65.66133880615234,
"learning_rate": 9.98404984063499e-06,
"loss": 0.7563,
"step": 570
},
{
"epoch": 0.5005393743257821,
"grad_norm": 11.704032897949219,
"learning_rate": 9.980900428520171e-06,
"loss": 0.7819,
"step": 580
},
{
"epoch": 0.5091693635382956,
"grad_norm": 27.524673461914062,
"learning_rate": 9.977468040063054e-06,
"loss": 0.7777,
"step": 590
},
{
"epoch": 0.517799352750809,
"grad_norm": 22.56294822692871,
"learning_rate": 9.973752870265473e-06,
"loss": 0.8282,
"step": 600
},
{
"epoch": 0.517799352750809,
"eval_accuracy": 0.5145631067961165,
"eval_loss": 0.7454360127449036,
"eval_runtime": 321.3773,
"eval_samples_per_second": 1.602,
"eval_steps_per_second": 1.602,
"step": 600
},
{
"epoch": 0.5264293419633226,
"grad_norm": 24.327606201171875,
"learning_rate": 9.96975513019472e-06,
"loss": 0.7907,
"step": 610
},
{
"epoch": 0.535059331175836,
"grad_norm": 18.27765655517578,
"learning_rate": 9.965475046971548e-06,
"loss": 0.8475,
"step": 620
},
{
"epoch": 0.5436893203883495,
"grad_norm": 23.742115020751953,
"learning_rate": 9.960912863757273e-06,
"loss": 0.7363,
"step": 630
},
{
"epoch": 0.552319309600863,
"grad_norm": 11.194246292114258,
"learning_rate": 9.956068839739955e-06,
"loss": 0.8291,
"step": 640
},
{
"epoch": 0.5609492988133765,
"grad_norm": 23.568937301635742,
"learning_rate": 9.950943250119674e-06,
"loss": 0.7524,
"step": 650
},
{
"epoch": 0.5609492988133765,
"eval_accuracy": 0.49902912621359224,
"eval_loss": 0.7317044138908386,
"eval_runtime": 321.3686,
"eval_samples_per_second": 1.603,
"eval_steps_per_second": 1.603,
"step": 650
},
{
"epoch": 0.56957928802589,
"grad_norm": 11.3060302734375,
"learning_rate": 9.945536386092893e-06,
"loss": 0.7319,
"step": 660
},
{
"epoch": 0.5782092772384034,
"grad_norm": 29.552515029907227,
"learning_rate": 9.939848554835927e-06,
"loss": 0.6644,
"step": 670
},
{
"epoch": 0.5868392664509169,
"grad_norm": 23.357723236083984,
"learning_rate": 9.93388007948747e-06,
"loss": 0.8749,
"step": 680
},
{
"epoch": 0.5954692556634305,
"grad_norm": 18.92988395690918,
"learning_rate": 9.927631299130254e-06,
"loss": 0.8157,
"step": 690
},
{
"epoch": 0.6040992448759439,
"grad_norm": 18.492721557617188,
"learning_rate": 9.921102568771781e-06,
"loss": 0.7338,
"step": 700
},
{
"epoch": 0.6040992448759439,
"eval_accuracy": 0.5339805825242718,
"eval_loss": 0.7266865968704224,
"eval_runtime": 321.4222,
"eval_samples_per_second": 1.602,
"eval_steps_per_second": 1.602,
"step": 700
},
{
"epoch": 0.6127292340884574,
"grad_norm": 24.050262451171875,
"learning_rate": 9.914294259324149e-06,
"loss": 0.7609,
"step": 710
},
{
"epoch": 0.6213592233009708,
"grad_norm": 8.642351150512695,
"learning_rate": 9.907206757582987e-06,
"loss": 0.7681,
"step": 720
},
{
"epoch": 0.6299892125134844,
"grad_norm": 20.86747932434082,
"learning_rate": 9.899840466205473e-06,
"loss": 0.8052,
"step": 730
},
{
"epoch": 0.6386192017259978,
"grad_norm": 44.50579833984375,
"learning_rate": 9.892195803687464e-06,
"loss": 0.739,
"step": 740
},
{
"epoch": 0.6472491909385113,
"grad_norm": 20.538475036621094,
"learning_rate": 9.884273204339716e-06,
"loss": 0.7909,
"step": 750
},
{
"epoch": 0.6472491909385113,
"eval_accuracy": 0.5611650485436893,
"eval_loss": 0.7110950350761414,
"eval_runtime": 321.0742,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 750
},
{
"epoch": 0.6558791801510249,
"grad_norm": 53.17654037475586,
"learning_rate": 9.876073118263216e-06,
"loss": 0.8172,
"step": 760
},
{
"epoch": 0.6645091693635383,
"grad_norm": 26.998899459838867,
"learning_rate": 9.867596011323602e-06,
"loss": 0.7901,
"step": 770
},
{
"epoch": 0.6731391585760518,
"grad_norm": 45.38533020019531,
"learning_rate": 9.858842365124702e-06,
"loss": 0.7284,
"step": 780
},
{
"epoch": 0.6817691477885652,
"grad_norm": 28.952617645263672,
"learning_rate": 9.849812676981172e-06,
"loss": 0.7501,
"step": 790
},
{
"epoch": 0.6903991370010788,
"grad_norm": 19.87049102783203,
"learning_rate": 9.840507459890244e-06,
"loss": 0.7783,
"step": 800
},
{
"epoch": 0.6903991370010788,
"eval_accuracy": 0.5300970873786408,
"eval_loss": 0.7211207151412964,
"eval_runtime": 320.8034,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 800
},
{
"epoch": 0.6990291262135923,
"grad_norm": 15.508710861206055,
"learning_rate": 9.830927242502575e-06,
"loss": 0.6965,
"step": 810
},
{
"epoch": 0.7076591154261057,
"grad_norm": 36.019798278808594,
"learning_rate": 9.821072569092223e-06,
"loss": 0.77,
"step": 820
},
{
"epoch": 0.7162891046386192,
"grad_norm": 13.119162559509277,
"learning_rate": 9.810943999525714e-06,
"loss": 0.7158,
"step": 830
},
{
"epoch": 0.7249190938511327,
"grad_norm": 20.22465705871582,
"learning_rate": 9.800542109230247e-06,
"loss": 0.6938,
"step": 840
},
{
"epoch": 0.7335490830636462,
"grad_norm": 33.313209533691406,
"learning_rate": 9.78986748916099e-06,
"loss": 0.7895,
"step": 850
},
{
"epoch": 0.7335490830636462,
"eval_accuracy": 0.5592233009708738,
"eval_loss": 0.7069711685180664,
"eval_runtime": 321.285,
"eval_samples_per_second": 1.603,
"eval_steps_per_second": 1.603,
"step": 850
},
{
"epoch": 0.7421790722761596,
"grad_norm": 9.106620788574219,
"learning_rate": 9.778920745767524e-06,
"loss": 0.6717,
"step": 860
},
{
"epoch": 0.7508090614886731,
"grad_norm": 34.899375915527344,
"learning_rate": 9.767702500959365e-06,
"loss": 0.7353,
"step": 870
},
{
"epoch": 0.7594390507011867,
"grad_norm": 29.355737686157227,
"learning_rate": 9.756213392070654e-06,
"loss": 0.7315,
"step": 880
},
{
"epoch": 0.7680690399137001,
"grad_norm": 16.923168182373047,
"learning_rate": 9.744454071823936e-06,
"loss": 0.6777,
"step": 890
},
{
"epoch": 0.7766990291262136,
"grad_norm": 7.441469192504883,
"learning_rate": 9.732425208293083e-06,
"loss": 0.6881,
"step": 900
},
{
"epoch": 0.7766990291262136,
"eval_accuracy": 0.537864077669903,
"eval_loss": 0.7709933519363403,
"eval_runtime": 321.2302,
"eval_samples_per_second": 1.603,
"eval_steps_per_second": 1.603,
"step": 900
},
{
"epoch": 0.785329018338727,
"grad_norm": 17.159208297729492,
"learning_rate": 9.720127484865336e-06,
"loss": 0.7973,
"step": 910
},
{
"epoch": 0.7939590075512406,
"grad_norm": 29.373632431030273,
"learning_rate": 9.707561600202481e-06,
"loss": 0.6946,
"step": 920
},
{
"epoch": 0.8025889967637541,
"grad_norm": 40.986690521240234,
"learning_rate": 9.694728268201162e-06,
"loss": 0.7697,
"step": 930
},
{
"epoch": 0.8112189859762675,
"grad_norm": 10.117018699645996,
"learning_rate": 9.681628217952308e-06,
"loss": 0.7183,
"step": 940
},
{
"epoch": 0.819848975188781,
"grad_norm": 45.013118743896484,
"learning_rate": 9.668262193699731e-06,
"loss": 0.7137,
"step": 950
},
{
"epoch": 0.819848975188781,
"eval_accuracy": 0.5805825242718446,
"eval_loss": 0.6908486485481262,
"eval_runtime": 321.1671,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 950
},
{
"epoch": 0.8284789644012945,
"grad_norm": 22.911548614501953,
"learning_rate": 9.65463095479783e-06,
"loss": 0.7166,
"step": 960
},
{
"epoch": 0.837108953613808,
"grad_norm": 9.517961502075195,
"learning_rate": 9.640735275668453e-06,
"loss": 0.7713,
"step": 970
},
{
"epoch": 0.8457389428263214,
"grad_norm": 19.63594627380371,
"learning_rate": 9.62657594575691e-06,
"loss": 0.7101,
"step": 980
},
{
"epoch": 0.8543689320388349,
"grad_norm": 27.475940704345703,
"learning_rate": 9.6121537694871e-06,
"loss": 0.741,
"step": 990
},
{
"epoch": 0.8629989212513485,
"grad_norm": 13.922393798828125,
"learning_rate": 9.597469566215841e-06,
"loss": 0.6924,
"step": 1000
},
{
"epoch": 0.8629989212513485,
"eval_accuracy": 0.6,
"eval_loss": 0.6857309341430664,
"eval_runtime": 321.1313,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1000
},
{
"epoch": 0.8716289104638619,
"grad_norm": 8.671666145324707,
"learning_rate": 9.582524170186294e-06,
"loss": 0.6936,
"step": 1010
},
{
"epoch": 0.8802588996763754,
"grad_norm": 11.311553001403809,
"learning_rate": 9.567318430480579e-06,
"loss": 0.6853,
"step": 1020
},
{
"epoch": 0.8888888888888888,
"grad_norm": 6.2082648277282715,
"learning_rate": 9.55185321097154e-06,
"loss": 0.6846,
"step": 1030
},
{
"epoch": 0.8975188781014024,
"grad_norm": 35.873565673828125,
"learning_rate": 9.536129390273659e-06,
"loss": 0.7125,
"step": 1040
},
{
"epoch": 0.9061488673139159,
"grad_norm": 3.9832065105438232,
"learning_rate": 9.520147861693138e-06,
"loss": 0.7275,
"step": 1050
},
{
"epoch": 0.9061488673139159,
"eval_accuracy": 0.5766990291262136,
"eval_loss": 0.6835415959358215,
"eval_runtime": 321.1452,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1050
},
{
"epoch": 0.9147788565264293,
"grad_norm": 9.8655424118042,
"learning_rate": 9.503909533177162e-06,
"loss": 0.7286,
"step": 1060
},
{
"epoch": 0.9234088457389428,
"grad_norm": 14.413016319274902,
"learning_rate": 9.487415327262303e-06,
"loss": 0.7012,
"step": 1070
},
{
"epoch": 0.9320388349514563,
"grad_norm": 22.791946411132812,
"learning_rate": 9.470666181022114e-06,
"loss": 0.7057,
"step": 1080
},
{
"epoch": 0.9406688241639698,
"grad_norm": 7.595472812652588,
"learning_rate": 9.453663046013889e-06,
"loss": 0.7165,
"step": 1090
},
{
"epoch": 0.9492988133764833,
"grad_norm": 6.206796169281006,
"learning_rate": 9.436406888224603e-06,
"loss": 0.67,
"step": 1100
},
{
"epoch": 0.9492988133764833,
"eval_accuracy": 0.570873786407767,
"eval_loss": 0.6888366341590881,
"eval_runtime": 321.1897,
"eval_samples_per_second": 1.603,
"eval_steps_per_second": 1.603,
"step": 1100
},
{
"epoch": 0.9579288025889967,
"grad_norm": 9.740569114685059,
"learning_rate": 9.418898688016042e-06,
"loss": 0.7177,
"step": 1110
},
{
"epoch": 0.9665587918015103,
"grad_norm": 9.868525505065918,
"learning_rate": 9.40113944006909e-06,
"loss": 0.6841,
"step": 1120
},
{
"epoch": 0.9751887810140237,
"grad_norm": 10.188973426818848,
"learning_rate": 9.383130153327231e-06,
"loss": 0.6808,
"step": 1130
},
{
"epoch": 0.9838187702265372,
"grad_norm": 5.215792655944824,
"learning_rate": 9.36487185093922e-06,
"loss": 0.7059,
"step": 1140
},
{
"epoch": 0.9924487594390508,
"grad_norm": 5.438614845275879,
"learning_rate": 9.34636557020097e-06,
"loss": 0.6787,
"step": 1150
},
{
"epoch": 0.9924487594390508,
"eval_accuracy": 0.596116504854369,
"eval_loss": 0.6860348582267761,
"eval_runtime": 320.9468,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 1150
},
{
"epoch": 1.0010787486515642,
"grad_norm": 7.045734405517578,
"learning_rate": 9.327612362496601e-06,
"loss": 0.6904,
"step": 1160
},
{
"epoch": 1.0097087378640777,
"grad_norm": 21.833343505859375,
"learning_rate": 9.308613293238722e-06,
"loss": 0.7516,
"step": 1170
},
{
"epoch": 1.0183387270765911,
"grad_norm": 4.44768762588501,
"learning_rate": 9.2893694418079e-06,
"loss": 0.7105,
"step": 1180
},
{
"epoch": 1.0269687162891046,
"grad_norm": 12.016294479370117,
"learning_rate": 9.269881901491335e-06,
"loss": 0.67,
"step": 1190
},
{
"epoch": 1.035598705501618,
"grad_norm": 5.096578598022461,
"learning_rate": 9.250151779420756e-06,
"loss": 0.7012,
"step": 1200
},
{
"epoch": 1.035598705501618,
"eval_accuracy": 0.570873786407767,
"eval_loss": 0.6847370266914368,
"eval_runtime": 320.5183,
"eval_samples_per_second": 1.607,
"eval_steps_per_second": 1.607,
"step": 1200
},
{
"epoch": 1.0442286947141317,
"grad_norm": 11.158854484558105,
"learning_rate": 9.230180196509506e-06,
"loss": 0.6726,
"step": 1210
},
{
"epoch": 1.0528586839266452,
"grad_norm": 7.818958282470703,
"learning_rate": 9.209968287388878e-06,
"loss": 0.6737,
"step": 1220
},
{
"epoch": 1.0614886731391586,
"grad_norm": 4.283718109130859,
"learning_rate": 9.189517200343643e-06,
"loss": 0.6421,
"step": 1230
},
{
"epoch": 1.070118662351672,
"grad_norm": 6.186824321746826,
"learning_rate": 9.168828097246819e-06,
"loss": 0.7709,
"step": 1240
},
{
"epoch": 1.0787486515641855,
"grad_norm": 5.761249542236328,
"learning_rate": 9.147902153493659e-06,
"loss": 0.6765,
"step": 1250
},
{
"epoch": 1.0787486515641855,
"eval_accuracy": 0.5786407766990291,
"eval_loss": 0.6961000561714172,
"eval_runtime": 320.4513,
"eval_samples_per_second": 1.607,
"eval_steps_per_second": 1.607,
"step": 1250
},
{
"epoch": 1.087378640776699,
"grad_norm": 5.015466213226318,
"learning_rate": 9.126740557934874e-06,
"loss": 0.6551,
"step": 1260
},
{
"epoch": 1.0960086299892124,
"grad_norm": 8.18385124206543,
"learning_rate": 9.105344512809097e-06,
"loss": 0.6606,
"step": 1270
},
{
"epoch": 1.104638619201726,
"grad_norm": 3.6305551528930664,
"learning_rate": 9.083715233674572e-06,
"loss": 0.7058,
"step": 1280
},
{
"epoch": 1.1132686084142396,
"grad_norm": 9.872076034545898,
"learning_rate": 9.061853949340104e-06,
"loss": 0.6577,
"step": 1290
},
{
"epoch": 1.121898597626753,
"grad_norm": 4.889667510986328,
"learning_rate": 9.039761901795241e-06,
"loss": 0.7052,
"step": 1300
},
{
"epoch": 1.121898597626753,
"eval_accuracy": 0.6058252427184466,
"eval_loss": 0.6881099939346313,
"eval_runtime": 320.8035,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 1300
},
{
"epoch": 1.1305285868392665,
"grad_norm": 3.392106056213379,
"learning_rate": 9.017440346139718e-06,
"loss": 0.681,
"step": 1310
},
{
"epoch": 1.13915857605178,
"grad_norm": 5.220512866973877,
"learning_rate": 8.994890550512152e-06,
"loss": 0.7117,
"step": 1320
},
{
"epoch": 1.1477885652642934,
"grad_norm": 11.190145492553711,
"learning_rate": 8.972113796017992e-06,
"loss": 0.7058,
"step": 1330
},
{
"epoch": 1.1564185544768069,
"grad_norm": 3.2504310607910156,
"learning_rate": 8.949111376656741e-06,
"loss": 0.6867,
"step": 1340
},
{
"epoch": 1.1650485436893203,
"grad_norm": 3.312730073928833,
"learning_rate": 8.925884599248437e-06,
"loss": 0.6804,
"step": 1350
},
{
"epoch": 1.1650485436893203,
"eval_accuracy": 0.6097087378640776,
"eval_loss": 0.6778111457824707,
"eval_runtime": 320.8442,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 1350
},
{
"epoch": 1.173678532901834,
"grad_norm": 3.8169898986816406,
"learning_rate": 8.902434783359417e-06,
"loss": 0.6812,
"step": 1360
},
{
"epoch": 1.1823085221143474,
"grad_norm": 13.139059066772461,
"learning_rate": 8.878763261227337e-06,
"loss": 0.7111,
"step": 1370
},
{
"epoch": 1.190938511326861,
"grad_norm": 8.938994407653809,
"learning_rate": 8.854871377685496e-06,
"loss": 0.6762,
"step": 1380
},
{
"epoch": 1.1995685005393744,
"grad_norm": 7.517580509185791,
"learning_rate": 8.830760490086427e-06,
"loss": 0.6817,
"step": 1390
},
{
"epoch": 1.2081984897518878,
"grad_norm": 5.75648307800293,
"learning_rate": 8.806431968224784e-06,
"loss": 0.6644,
"step": 1400
},
{
"epoch": 1.2081984897518878,
"eval_accuracy": 0.6194174757281553,
"eval_loss": 0.6810408234596252,
"eval_runtime": 320.9626,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 1400
},
{
"epoch": 1.2168284789644013,
"grad_norm": 6.445542812347412,
"learning_rate": 8.781887194259523e-06,
"loss": 0.6684,
"step": 1410
},
{
"epoch": 1.2254584681769147,
"grad_norm": 5.923236846923828,
"learning_rate": 8.757127562635374e-06,
"loss": 0.6802,
"step": 1420
},
{
"epoch": 1.2340884573894282,
"grad_norm": 5.63727331161499,
"learning_rate": 8.732154480003625e-06,
"loss": 0.7045,
"step": 1430
},
{
"epoch": 1.2427184466019416,
"grad_norm": 5.639196872711182,
"learning_rate": 8.706969365142202e-06,
"loss": 0.6916,
"step": 1440
},
{
"epoch": 1.2513484358144553,
"grad_norm": 6.068101406097412,
"learning_rate": 8.681573648875064e-06,
"loss": 0.6566,
"step": 1450
},
{
"epoch": 1.2513484358144553,
"eval_accuracy": 0.6135922330097088,
"eval_loss": 0.6820415258407593,
"eval_runtime": 320.9166,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 1450
},
{
"epoch": 1.2599784250269688,
"grad_norm": 5.288263320922852,
"learning_rate": 8.655968773990922e-06,
"loss": 0.6696,
"step": 1460
},
{
"epoch": 1.2686084142394822,
"grad_norm": 9.293752670288086,
"learning_rate": 8.630156195161264e-06,
"loss": 0.6407,
"step": 1470
},
{
"epoch": 1.2772384034519957,
"grad_norm": 14.672719955444336,
"learning_rate": 8.604137378857713e-06,
"loss": 0.6507,
"step": 1480
},
{
"epoch": 1.2858683926645091,
"grad_norm": 9.176056861877441,
"learning_rate": 8.577913803268719e-06,
"loss": 0.7229,
"step": 1490
},
{
"epoch": 1.2944983818770226,
"grad_norm": 12.57158374786377,
"learning_rate": 8.551486958215569e-06,
"loss": 0.7024,
"step": 1500
},
{
"epoch": 1.2944983818770226,
"eval_accuracy": 0.6116504854368932,
"eval_loss": 0.6744683384895325,
"eval_runtime": 321.1558,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1500
},
{
"epoch": 1.303128371089536,
"grad_norm": 6.8445305824279785,
"learning_rate": 8.524858345067757e-06,
"loss": 0.6842,
"step": 1510
},
{
"epoch": 1.3117583603020497,
"grad_norm": 5.6327643394470215,
"learning_rate": 8.498029476657686e-06,
"loss": 0.6904,
"step": 1520
},
{
"epoch": 1.3203883495145632,
"grad_norm": 10.025938987731934,
"learning_rate": 8.471001877194708e-06,
"loss": 0.6733,
"step": 1530
},
{
"epoch": 1.3290183387270766,
"grad_norm": 6.761681079864502,
"learning_rate": 8.443777082178556e-06,
"loss": 0.6767,
"step": 1540
},
{
"epoch": 1.33764832793959,
"grad_norm": 5.284752368927002,
"learning_rate": 8.416356638312082e-06,
"loss": 0.7241,
"step": 1550
},
{
"epoch": 1.33764832793959,
"eval_accuracy": 0.6135922330097088,
"eval_loss": 0.6697773933410645,
"eval_runtime": 321.0762,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1550
},
{
"epoch": 1.3462783171521036,
"grad_norm": 5.520620346069336,
"learning_rate": 8.388742103413397e-06,
"loss": 0.6738,
"step": 1560
},
{
"epoch": 1.354908306364617,
"grad_norm": 4.6568098068237305,
"learning_rate": 8.360935046327373e-06,
"loss": 0.671,
"step": 1570
},
{
"epoch": 1.3635382955771305,
"grad_norm": 4.777432441711426,
"learning_rate": 8.332937046836503e-06,
"loss": 0.69,
"step": 1580
},
{
"epoch": 1.3721682847896441,
"grad_norm": 8.115592956542969,
"learning_rate": 8.304749695571157e-06,
"loss": 0.6583,
"step": 1590
},
{
"epoch": 1.3807982740021574,
"grad_norm": 11.980337142944336,
"learning_rate": 8.276374593919213e-06,
"loss": 0.7378,
"step": 1600
},
{
"epoch": 1.3807982740021574,
"eval_accuracy": 0.6058252427184466,
"eval_loss": 0.6734395027160645,
"eval_runtime": 320.9778,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1600
},
{
"epoch": 1.389428263214671,
"grad_norm": 3.5900051593780518,
"learning_rate": 8.247813353935073e-06,
"loss": 0.664,
"step": 1610
},
{
"epoch": 1.3980582524271845,
"grad_norm": 14.644140243530273,
"learning_rate": 8.219067598248087e-06,
"loss": 0.6718,
"step": 1620
},
{
"epoch": 1.406688241639698,
"grad_norm": 6.659509658813477,
"learning_rate": 8.190138959970366e-06,
"loss": 0.6476,
"step": 1630
},
{
"epoch": 1.4153182308522114,
"grad_norm": 5.535285949707031,
"learning_rate": 8.161029082603994e-06,
"loss": 0.642,
"step": 1640
},
{
"epoch": 1.4239482200647249,
"grad_norm": 7.590597152709961,
"learning_rate": 8.131739619947667e-06,
"loss": 0.6584,
"step": 1650
},
{
"epoch": 1.4239482200647249,
"eval_accuracy": 0.6,
"eval_loss": 0.6994197964668274,
"eval_runtime": 321.0664,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1650
},
{
"epoch": 1.4325782092772383,
"grad_norm": 13.075584411621094,
"learning_rate": 8.102272236002729e-06,
"loss": 0.7239,
"step": 1660
},
{
"epoch": 1.4412081984897518,
"grad_norm": 6.066156387329102,
"learning_rate": 8.072628604878638e-06,
"loss": 0.7182,
"step": 1670
},
{
"epoch": 1.4498381877022655,
"grad_norm": 4.588730335235596,
"learning_rate": 8.042810410697861e-06,
"loss": 0.717,
"step": 1680
},
{
"epoch": 1.458468176914779,
"grad_norm": 3.397918224334717,
"learning_rate": 8.012819347500189e-06,
"loss": 0.6567,
"step": 1690
},
{
"epoch": 1.4670981661272924,
"grad_norm": 8.24763298034668,
"learning_rate": 7.982657119146495e-06,
"loss": 0.6724,
"step": 1700
},
{
"epoch": 1.4670981661272924,
"eval_accuracy": 0.6097087378640776,
"eval_loss": 0.6715120077133179,
"eval_runtime": 321.0917,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1700
},
{
"epoch": 1.4757281553398058,
"grad_norm": 8.984458923339844,
"learning_rate": 7.952325439221944e-06,
"loss": 0.6653,
"step": 1710
},
{
"epoch": 1.4843581445523193,
"grad_norm": 8.375741958618164,
"learning_rate": 7.921826030938623e-06,
"loss": 0.722,
"step": 1720
},
{
"epoch": 1.4929881337648327,
"grad_norm": 8.309843063354492,
"learning_rate": 7.891160627037653e-06,
"loss": 0.7034,
"step": 1730
},
{
"epoch": 1.5016181229773462,
"grad_norm": 7.065859794616699,
"learning_rate": 7.860330969690749e-06,
"loss": 0.6338,
"step": 1740
},
{
"epoch": 1.5102481121898599,
"grad_norm": 5.86482048034668,
"learning_rate": 7.829338810401238e-06,
"loss": 0.6774,
"step": 1750
},
{
"epoch": 1.5102481121898599,
"eval_accuracy": 0.6135922330097088,
"eval_loss": 0.669984757900238,
"eval_runtime": 321.0227,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1750
},
{
"epoch": 1.5188781014023731,
"grad_norm": 6.1000075340271,
"learning_rate": 7.798185909904552e-06,
"loss": 0.6813,
"step": 1760
},
{
"epoch": 1.5275080906148868,
"grad_norm": 8.106244087219238,
"learning_rate": 7.766874038068202e-06,
"loss": 0.7138,
"step": 1770
},
{
"epoch": 1.5361380798274002,
"grad_norm": 5.946533203125,
"learning_rate": 7.735404973791223e-06,
"loss": 0.7025,
"step": 1780
},
{
"epoch": 1.5447680690399137,
"grad_norm": 6.442516326904297,
"learning_rate": 7.703780504903107e-06,
"loss": 0.6643,
"step": 1790
},
{
"epoch": 1.5533980582524272,
"grad_norm": 6.0701985359191895,
"learning_rate": 7.672002428062245e-06,
"loss": 0.6653,
"step": 1800
},
{
"epoch": 1.5533980582524272,
"eval_accuracy": 0.6097087378640776,
"eval_loss": 0.6695827841758728,
"eval_runtime": 321.0661,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1800
},
{
"epoch": 1.5620280474649406,
"grad_norm": 10.973797798156738,
"learning_rate": 7.640072548653843e-06,
"loss": 0.6681,
"step": 1810
},
{
"epoch": 1.5706580366774543,
"grad_norm": 9.289361000061035,
"learning_rate": 7.607992680687362e-06,
"loss": 0.6297,
"step": 1820
},
{
"epoch": 1.5792880258899675,
"grad_norm": 6.6282148361206055,
"learning_rate": 7.575764646693447e-06,
"loss": 0.706,
"step": 1830
},
{
"epoch": 1.5879180151024812,
"grad_norm": 4.8196702003479,
"learning_rate": 7.5433902776204015e-06,
"loss": 0.6669,
"step": 1840
},
{
"epoch": 1.5965480043149944,
"grad_norm": 8.807297706604004,
"learning_rate": 7.510871412730157e-06,
"loss": 0.6641,
"step": 1850
},
{
"epoch": 1.5965480043149944,
"eval_accuracy": 0.5980582524271845,
"eval_loss": 0.6732643246650696,
"eval_runtime": 321.036,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1850
},
{
"epoch": 1.6051779935275081,
"grad_norm": 4.137267589569092,
"learning_rate": 7.478209899493787e-06,
"loss": 0.6345,
"step": 1860
},
{
"epoch": 1.6138079827400216,
"grad_norm": 7.294461250305176,
"learning_rate": 7.445407593486535e-06,
"loss": 0.6899,
"step": 1870
},
{
"epoch": 1.622437971952535,
"grad_norm": 8.29757308959961,
"learning_rate": 7.41246635828241e-06,
"loss": 0.6848,
"step": 1880
},
{
"epoch": 1.6310679611650487,
"grad_norm": 10.072659492492676,
"learning_rate": 7.379388065348305e-06,
"loss": 0.6829,
"step": 1890
},
{
"epoch": 1.639697950377562,
"grad_norm": 8.695294380187988,
"learning_rate": 7.346174593937676e-06,
"loss": 0.7241,
"step": 1900
},
{
"epoch": 1.639697950377562,
"eval_accuracy": 0.596116504854369,
"eval_loss": 0.6652901768684387,
"eval_runtime": 321.0146,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1900
},
{
"epoch": 1.6483279395900756,
"grad_norm": 3.542787790298462,
"learning_rate": 7.31282783098378e-06,
"loss": 0.6428,
"step": 1910
},
{
"epoch": 1.6569579288025889,
"grad_norm": 6.900018215179443,
"learning_rate": 7.279349670992464e-06,
"loss": 0.6494,
"step": 1920
},
{
"epoch": 1.6655879180151025,
"grad_norm": 7.8714189529418945,
"learning_rate": 7.245742015934547e-06,
"loss": 0.5778,
"step": 1930
},
{
"epoch": 1.674217907227616,
"grad_norm": 4.089023590087891,
"learning_rate": 7.212006775137761e-06,
"loss": 0.6912,
"step": 1940
},
{
"epoch": 1.6828478964401294,
"grad_norm": 5.432620048522949,
"learning_rate": 7.178145865178268e-06,
"loss": 0.6496,
"step": 1950
},
{
"epoch": 1.6828478964401294,
"eval_accuracy": 0.6116504854368932,
"eval_loss": 0.6761239767074585,
"eval_runtime": 320.9902,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 1950
},
{
"epoch": 1.691477885652643,
"grad_norm": 4.092471122741699,
"learning_rate": 7.144161209771788e-06,
"loss": 0.6757,
"step": 1960
},
{
"epoch": 1.7001078748651564,
"grad_norm": 6.498571872711182,
"learning_rate": 7.110054739664303e-06,
"loss": 0.6111,
"step": 1970
},
{
"epoch": 1.70873786407767,
"grad_norm": 9.238410949707031,
"learning_rate": 7.075828392522362e-06,
"loss": 0.5998,
"step": 1980
},
{
"epoch": 1.7173678532901833,
"grad_norm": 5.266243934631348,
"learning_rate": 7.04148411282301e-06,
"loss": 0.655,
"step": 1990
},
{
"epoch": 1.725997842502697,
"grad_norm": 8.122797966003418,
"learning_rate": 7.0070238517433e-06,
"loss": 0.662,
"step": 2000
},
{
"epoch": 1.725997842502697,
"eval_accuracy": 0.6038834951456311,
"eval_loss": 0.6728688478469849,
"eval_runtime": 320.9753,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 2000
},
{
"epoch": 1.7346278317152104,
"grad_norm": 8.114389419555664,
"learning_rate": 6.972449567049463e-06,
"loss": 0.6923,
"step": 2010
},
{
"epoch": 1.7432578209277239,
"grad_norm": 6.447281837463379,
"learning_rate": 6.9377632229856665e-06,
"loss": 0.6625,
"step": 2020
},
{
"epoch": 1.7518878101402373,
"grad_norm": 8.996492385864258,
"learning_rate": 6.902966790162425e-06,
"loss": 0.6919,
"step": 2030
},
{
"epoch": 1.7605177993527508,
"grad_norm": 5.145361423492432,
"learning_rate": 6.868062245444655e-06,
"loss": 0.6468,
"step": 2040
},
{
"epoch": 1.7691477885652644,
"grad_norm": 6.459311008453369,
"learning_rate": 6.833051571839347e-06,
"loss": 0.7049,
"step": 2050
},
{
"epoch": 1.7691477885652644,
"eval_accuracy": 0.6135922330097088,
"eval_loss": 0.6757835149765015,
"eval_runtime": 320.6068,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2050
},
{
"epoch": 1.7777777777777777,
"grad_norm": 8.930355072021484,
"learning_rate": 6.797936758382924e-06,
"loss": 0.6384,
"step": 2060
},
{
"epoch": 1.7864077669902914,
"grad_norm": 8.780126571655273,
"learning_rate": 6.762719800028231e-06,
"loss": 0.6169,
"step": 2070
},
{
"epoch": 1.7950377562028046,
"grad_norm": 7.830219745635986,
"learning_rate": 6.727402697531193e-06,
"loss": 0.6596,
"step": 2080
},
{
"epoch": 1.8036677454153183,
"grad_norm": 4.703182697296143,
"learning_rate": 6.69198745733716e-06,
"loss": 0.6964,
"step": 2090
},
{
"epoch": 1.8122977346278317,
"grad_norm": 4.655829906463623,
"learning_rate": 6.656476091466901e-06,
"loss": 0.6483,
"step": 2100
},
{
"epoch": 1.8122977346278317,
"eval_accuracy": 0.6135922330097088,
"eval_loss": 0.6741885542869568,
"eval_runtime": 320.6691,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2100
},
{
"epoch": 1.8209277238403452,
"grad_norm": 4.25952672958374,
"learning_rate": 6.620870617402312e-06,
"loss": 0.6732,
"step": 2110
},
{
"epoch": 1.8295577130528586,
"grad_norm": 6.7814226150512695,
"learning_rate": 6.585173057971787e-06,
"loss": 0.6674,
"step": 2120
},
{
"epoch": 1.838187702265372,
"grad_norm": 4.3662638664245605,
"learning_rate": 6.5493854412352985e-06,
"loss": 0.6807,
"step": 2130
},
{
"epoch": 1.8468176914778858,
"grad_norm": 5.596447467803955,
"learning_rate": 6.5135098003691865e-06,
"loss": 0.6637,
"step": 2140
},
{
"epoch": 1.855447680690399,
"grad_norm": 4.839741230010986,
"learning_rate": 6.477548173550635e-06,
"loss": 0.678,
"step": 2150
},
{
"epoch": 1.855447680690399,
"eval_accuracy": 0.6310679611650486,
"eval_loss": 0.6695934534072876,
"eval_runtime": 320.6467,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2150
},
{
"epoch": 1.8640776699029127,
"grad_norm": 11.375150680541992,
"learning_rate": 6.441502603841892e-06,
"loss": 0.6592,
"step": 2160
},
{
"epoch": 1.8727076591154261,
"grad_norm": 6.302811145782471,
"learning_rate": 6.405375139074194e-06,
"loss": 0.6413,
"step": 2170
},
{
"epoch": 1.8813376483279396,
"grad_norm": 9.698513984680176,
"learning_rate": 6.369167831731419e-06,
"loss": 0.6304,
"step": 2180
},
{
"epoch": 1.889967637540453,
"grad_norm": 9.770709991455078,
"learning_rate": 6.332882738833485e-06,
"loss": 0.6144,
"step": 2190
},
{
"epoch": 1.8985976267529665,
"grad_norm": 10.665081977844238,
"learning_rate": 6.296521921819489e-06,
"loss": 0.678,
"step": 2200
},
{
"epoch": 1.8985976267529665,
"eval_accuracy": 0.6233009708737864,
"eval_loss": 0.6689735054969788,
"eval_runtime": 320.6295,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2200
},
{
"epoch": 1.9072276159654802,
"grad_norm": 8.206169128417969,
"learning_rate": 6.260087446430582e-06,
"loss": 0.6622,
"step": 2210
},
{
"epoch": 1.9158576051779934,
"grad_norm": 11.89337158203125,
"learning_rate": 6.223581382592625e-06,
"loss": 0.6567,
"step": 2220
},
{
"epoch": 1.924487594390507,
"grad_norm": 4.916356086730957,
"learning_rate": 6.18700580429857e-06,
"loss": 0.6634,
"step": 2230
},
{
"epoch": 1.9331175836030206,
"grad_norm": 9.565736770629883,
"learning_rate": 6.150362789490654e-06,
"loss": 0.6532,
"step": 2240
},
{
"epoch": 1.941747572815534,
"grad_norm": 10.54036808013916,
"learning_rate": 6.113654419942334e-06,
"loss": 0.6953,
"step": 2250
},
{
"epoch": 1.941747572815534,
"eval_accuracy": 0.625242718446602,
"eval_loss": 0.6624494791030884,
"eval_runtime": 320.6343,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2250
},
{
"epoch": 1.9503775620280475,
"grad_norm": 12.351181983947754,
"learning_rate": 6.0768827811400166e-06,
"loss": 0.71,
"step": 2260
},
{
"epoch": 1.959007551240561,
"grad_norm": 6.94906759262085,
"learning_rate": 6.040049962164585e-06,
"loss": 0.6464,
"step": 2270
},
{
"epoch": 1.9676375404530746,
"grad_norm": 6.037535667419434,
"learning_rate": 6.0031580555727005e-06,
"loss": 0.6598,
"step": 2280
},
{
"epoch": 1.9762675296655878,
"grad_norm": 11.901267051696777,
"learning_rate": 5.9662091572779325e-06,
"loss": 0.6292,
"step": 2290
},
{
"epoch": 1.9848975188781015,
"grad_norm": 7.471567153930664,
"learning_rate": 5.929205366431679e-06,
"loss": 0.6969,
"step": 2300
},
{
"epoch": 1.9848975188781015,
"eval_accuracy": 0.6368932038834951,
"eval_loss": 0.6725260019302368,
"eval_runtime": 320.5652,
"eval_samples_per_second": 1.607,
"eval_steps_per_second": 1.607,
"step": 2300
},
{
"epoch": 1.9935275080906147,
"grad_norm": 4.360079765319824,
"learning_rate": 5.892148785303905e-06,
"loss": 0.6386,
"step": 2310
},
{
"epoch": 2.0021574973031284,
"grad_norm": 7.370548725128174,
"learning_rate": 5.855041519163718e-06,
"loss": 0.5936,
"step": 2320
},
{
"epoch": 2.0107874865156417,
"grad_norm": 11.645364761352539,
"learning_rate": 5.817885676159754e-06,
"loss": 0.7021,
"step": 2330
},
{
"epoch": 2.0194174757281553,
"grad_norm": 9.975643157958984,
"learning_rate": 5.78068336720041e-06,
"loss": 0.62,
"step": 2340
},
{
"epoch": 2.028047464940669,
"grad_norm": 8.763169288635254,
"learning_rate": 5.743436705833922e-06,
"loss": 0.6492,
"step": 2350
},
{
"epoch": 2.028047464940669,
"eval_accuracy": 0.6485436893203883,
"eval_loss": 0.656815767288208,
"eval_runtime": 320.6788,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2350
},
{
"epoch": 2.0366774541531822,
"grad_norm": 6.766859531402588,
"learning_rate": 5.706147808128288e-06,
"loss": 0.6385,
"step": 2360
},
{
"epoch": 2.045307443365696,
"grad_norm": 7.149226665496826,
"learning_rate": 5.668818792551052e-06,
"loss": 0.5838,
"step": 2370
},
{
"epoch": 2.053937432578209,
"grad_norm": 6.320857048034668,
"learning_rate": 5.6314517798489395e-06,
"loss": 0.655,
"step": 2380
},
{
"epoch": 2.062567421790723,
"grad_norm": 12.915064811706543,
"learning_rate": 5.594048892927382e-06,
"loss": 0.7095,
"step": 2390
},
{
"epoch": 2.071197411003236,
"grad_norm": 7.46158504486084,
"learning_rate": 5.556612256729909e-06,
"loss": 0.6572,
"step": 2400
},
{
"epoch": 2.071197411003236,
"eval_accuracy": 0.6446601941747573,
"eval_loss": 0.669795036315918,
"eval_runtime": 320.7237,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2400
},
{
"epoch": 2.0798274002157497,
"grad_norm": 9.09875202178955,
"learning_rate": 5.519143998117424e-06,
"loss": 0.6518,
"step": 2410
},
{
"epoch": 2.0884573894282634,
"grad_norm": 9.286842346191406,
"learning_rate": 5.48164624574737e-06,
"loss": 0.6492,
"step": 2420
},
{
"epoch": 2.0970873786407767,
"grad_norm": 5.891538143157959,
"learning_rate": 5.444121129952799e-06,
"loss": 0.648,
"step": 2430
},
{
"epoch": 2.1057173678532903,
"grad_norm": 11.724071502685547,
"learning_rate": 5.406570782621341e-06,
"loss": 0.6533,
"step": 2440
},
{
"epoch": 2.1143473570658036,
"grad_norm": 8.159801483154297,
"learning_rate": 5.368997337074088e-06,
"loss": 0.6204,
"step": 2450
},
{
"epoch": 2.1143473570658036,
"eval_accuracy": 0.654368932038835,
"eval_loss": 0.6549546122550964,
"eval_runtime": 320.7153,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2450
},
{
"epoch": 2.1229773462783172,
"grad_norm": 8.30516529083252,
"learning_rate": 5.331402927944392e-06,
"loss": 0.5746,
"step": 2460
},
{
"epoch": 2.1316073354908305,
"grad_norm": 6.368971824645996,
"learning_rate": 5.293789691056601e-06,
"loss": 0.6352,
"step": 2470
},
{
"epoch": 2.140237324703344,
"grad_norm": 18.369422912597656,
"learning_rate": 5.256159763304703e-06,
"loss": 0.6815,
"step": 2480
},
{
"epoch": 2.148867313915858,
"grad_norm": 7.470778465270996,
"learning_rate": 5.218515282530934e-06,
"loss": 0.5849,
"step": 2490
},
{
"epoch": 2.157497303128371,
"grad_norm": 8.369938850402832,
"learning_rate": 5.180858387404325e-06,
"loss": 0.6479,
"step": 2500
},
{
"epoch": 2.157497303128371,
"eval_accuracy": 0.6446601941747573,
"eval_loss": 0.6610180735588074,
"eval_runtime": 320.6988,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2500
},
{
"epoch": 2.1661272923408847,
"grad_norm": 12.514945030212402,
"learning_rate": 5.143191217299189e-06,
"loss": 0.5588,
"step": 2510
},
{
"epoch": 2.174757281553398,
"grad_norm": 10.213220596313477,
"learning_rate": 5.10551591217359e-06,
"loss": 0.6862,
"step": 2520
},
{
"epoch": 2.1833872707659117,
"grad_norm": 10.838960647583008,
"learning_rate": 5.067834612447755e-06,
"loss": 0.6218,
"step": 2530
},
{
"epoch": 2.192017259978425,
"grad_norm": 8.767598152160645,
"learning_rate": 5.0301494588824795e-06,
"loss": 0.5711,
"step": 2540
},
{
"epoch": 2.2006472491909386,
"grad_norm": 6.138967514038086,
"learning_rate": 4.9924625924575095e-06,
"loss": 0.6954,
"step": 2550
},
{
"epoch": 2.2006472491909386,
"eval_accuracy": 0.6679611650485436,
"eval_loss": 0.6637104153633118,
"eval_runtime": 320.7599,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2550
},
{
"epoch": 2.209277238403452,
"grad_norm": 10.984577178955078,
"learning_rate": 4.954776154249896e-06,
"loss": 0.6567,
"step": 2560
},
{
"epoch": 2.2179072276159655,
"grad_norm": 8.720921516418457,
"learning_rate": 4.9170922853123635e-06,
"loss": 0.6283,
"step": 2570
},
{
"epoch": 2.226537216828479,
"grad_norm": 10.784737586975098,
"learning_rate": 4.879413126551675e-06,
"loss": 0.6072,
"step": 2580
},
{
"epoch": 2.2351672060409924,
"grad_norm": 6.139902114868164,
"learning_rate": 4.84174081860699e-06,
"loss": 0.5966,
"step": 2590
},
{
"epoch": 2.243797195253506,
"grad_norm": 7.9166083335876465,
"learning_rate": 4.8040775017282644e-06,
"loss": 0.5668,
"step": 2600
},
{
"epoch": 2.243797195253506,
"eval_accuracy": 0.658252427184466,
"eval_loss": 0.6660070419311523,
"eval_runtime": 320.7212,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2600
},
{
"epoch": 2.2524271844660193,
"grad_norm": 6.704747676849365,
"learning_rate": 4.766425315654648e-06,
"loss": 0.5675,
"step": 2610
},
{
"epoch": 2.261057173678533,
"grad_norm": 6.141285419464111,
"learning_rate": 4.728786399492923e-06,
"loss": 0.6543,
"step": 2620
},
{
"epoch": 2.269687162891046,
"grad_norm": 16.798852920532227,
"learning_rate": 4.69116289159598e-06,
"loss": 0.5984,
"step": 2630
},
{
"epoch": 2.27831715210356,
"grad_norm": 7.124361038208008,
"learning_rate": 4.653556929441332e-06,
"loss": 0.5777,
"step": 2640
},
{
"epoch": 2.286947141316073,
"grad_norm": 13.590773582458496,
"learning_rate": 4.61597064950967e-06,
"loss": 0.6185,
"step": 2650
},
{
"epoch": 2.286947141316073,
"eval_accuracy": 0.6679611650485436,
"eval_loss": 0.6793263554573059,
"eval_runtime": 320.6049,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2650
},
{
"epoch": 2.295577130528587,
"grad_norm": 8.081377983093262,
"learning_rate": 4.578406187163503e-06,
"loss": 0.5651,
"step": 2660
},
{
"epoch": 2.3042071197411005,
"grad_norm": 6.233886241912842,
"learning_rate": 4.540865676525828e-06,
"loss": 0.6087,
"step": 2670
},
{
"epoch": 2.3128371089536137,
"grad_norm": 5.7994489669799805,
"learning_rate": 4.503351250358893e-06,
"loss": 0.6153,
"step": 2680
},
{
"epoch": 2.3214670981661274,
"grad_norm": 21.2513427734375,
"learning_rate": 4.465865039943023e-06,
"loss": 0.5765,
"step": 2690
},
{
"epoch": 2.3300970873786406,
"grad_norm": 13.356746673583984,
"learning_rate": 4.428409174955548e-06,
"loss": 0.5314,
"step": 2700
},
{
"epoch": 2.3300970873786406,
"eval_accuracy": 0.6718446601941748,
"eval_loss": 0.6751753091812134,
"eval_runtime": 320.6989,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2700
},
{
"epoch": 2.3387270765911543,
"grad_norm": 10.287054061889648,
"learning_rate": 4.3909857833498015e-06,
"loss": 0.6288,
"step": 2710
},
{
"epoch": 2.347357065803668,
"grad_norm": 8.844134330749512,
"learning_rate": 4.353596991234228e-06,
"loss": 0.6502,
"step": 2720
},
{
"epoch": 2.355987055016181,
"grad_norm": 18.77345848083496,
"learning_rate": 4.3162449227516015e-06,
"loss": 0.6461,
"step": 2730
},
{
"epoch": 2.364617044228695,
"grad_norm": 5.465780258178711,
"learning_rate": 4.278931699958337e-06,
"loss": 0.5786,
"step": 2740
},
{
"epoch": 2.373247033441208,
"grad_norm": 9.964437484741211,
"learning_rate": 4.241659442703937e-06,
"loss": 0.6406,
"step": 2750
},
{
"epoch": 2.373247033441208,
"eval_accuracy": 0.6563106796116505,
"eval_loss": 0.6680858731269836,
"eval_runtime": 320.7173,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2750
},
{
"epoch": 2.381877022653722,
"grad_norm": 16.344274520874023,
"learning_rate": 4.2044302685105635e-06,
"loss": 0.6201,
"step": 2760
},
{
"epoch": 2.390507011866235,
"grad_norm": 6.842400074005127,
"learning_rate": 4.167246292452724e-06,
"loss": 0.5944,
"step": 2770
},
{
"epoch": 2.3991370010787487,
"grad_norm": 15.446759223937988,
"learning_rate": 4.130109627037124e-06,
"loss": 0.5883,
"step": 2780
},
{
"epoch": 2.407766990291262,
"grad_norm": 8.021566390991211,
"learning_rate": 4.093022382082639e-06,
"loss": 0.6618,
"step": 2790
},
{
"epoch": 2.4163969795037756,
"grad_norm": 10.198580741882324,
"learning_rate": 4.0559866646004546e-06,
"loss": 0.7011,
"step": 2800
},
{
"epoch": 2.4163969795037756,
"eval_accuracy": 0.6679611650485436,
"eval_loss": 0.6721732020378113,
"eval_runtime": 320.5897,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2800
},
{
"epoch": 2.4250269687162893,
"grad_norm": 7.147483825683594,
"learning_rate": 4.0190045786743656e-06,
"loss": 0.5454,
"step": 2810
},
{
"epoch": 2.4336569579288025,
"grad_norm": 6.587264060974121,
"learning_rate": 3.982078225341232e-06,
"loss": 0.5114,
"step": 2820
},
{
"epoch": 2.4422869471413162,
"grad_norm": 9.162304878234863,
"learning_rate": 3.945209702471622e-06,
"loss": 0.712,
"step": 2830
},
{
"epoch": 2.4509169363538295,
"grad_norm": 8.858553886413574,
"learning_rate": 3.908401104650621e-06,
"loss": 0.6119,
"step": 2840
},
{
"epoch": 2.459546925566343,
"grad_norm": 7.771361827850342,
"learning_rate": 3.871654523058831e-06,
"loss": 0.6195,
"step": 2850
},
{
"epoch": 2.459546925566343,
"eval_accuracy": 0.6757281553398058,
"eval_loss": 0.6643590927124023,
"eval_runtime": 320.706,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2850
},
{
"epoch": 2.4681769147788564,
"grad_norm": 7.508529186248779,
"learning_rate": 3.834972045353575e-06,
"loss": 0.6087,
"step": 2860
},
{
"epoch": 2.47680690399137,
"grad_norm": 9.493097305297852,
"learning_rate": 3.798355755550292e-06,
"loss": 0.6224,
"step": 2870
},
{
"epoch": 2.4854368932038833,
"grad_norm": 7.044253826141357,
"learning_rate": 3.7618077339041244e-06,
"loss": 0.6495,
"step": 2880
},
{
"epoch": 2.494066882416397,
"grad_norm": 6.932374954223633,
"learning_rate": 3.725330056791753e-06,
"loss": 0.627,
"step": 2890
},
{
"epoch": 2.5026968716289106,
"grad_norm": 8.32701301574707,
"learning_rate": 3.6889247965934195e-06,
"loss": 0.6675,
"step": 2900
},
{
"epoch": 2.5026968716289106,
"eval_accuracy": 0.6601941747572816,
"eval_loss": 0.6530495285987854,
"eval_runtime": 320.625,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2900
},
{
"epoch": 2.511326860841424,
"grad_norm": 7.712283134460449,
"learning_rate": 3.6525940215751987e-06,
"loss": 0.6522,
"step": 2910
},
{
"epoch": 2.5199568500539375,
"grad_norm": 8.3215913772583,
"learning_rate": 3.6163397957714895e-06,
"loss": 0.6759,
"step": 2920
},
{
"epoch": 2.528586839266451,
"grad_norm": 6.627832412719727,
"learning_rate": 3.5801641788677576e-06,
"loss": 0.6035,
"step": 2930
},
{
"epoch": 2.5372168284789645,
"grad_norm": 11.45533561706543,
"learning_rate": 3.5440692260835162e-06,
"loss": 0.6256,
"step": 2940
},
{
"epoch": 2.545846817691478,
"grad_norm": 6.252264499664307,
"learning_rate": 3.508056988055564e-06,
"loss": 0.5796,
"step": 2950
},
{
"epoch": 2.545846817691478,
"eval_accuracy": 0.6601941747572816,
"eval_loss": 0.6489056348800659,
"eval_runtime": 320.6022,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 2950
},
{
"epoch": 2.5544768069039914,
"grad_norm": 10.386983871459961,
"learning_rate": 3.4721295107214835e-06,
"loss": 0.4864,
"step": 2960
},
{
"epoch": 2.5631067961165046,
"grad_norm": 8.145389556884766,
"learning_rate": 3.4362888352034153e-06,
"loss": 0.6728,
"step": 2970
},
{
"epoch": 2.5717367853290183,
"grad_norm": 6.486176013946533,
"learning_rate": 3.4005369976920837e-06,
"loss": 0.6055,
"step": 2980
},
{
"epoch": 2.580366774541532,
"grad_norm": 10.21779727935791,
"learning_rate": 3.3648760293311267e-06,
"loss": 0.6123,
"step": 2990
},
{
"epoch": 2.588996763754045,
"grad_norm": 8.619269371032715,
"learning_rate": 3.3293079561016957e-06,
"loss": 0.6148,
"step": 3000
},
{
"epoch": 2.588996763754045,
"eval_accuracy": 0.6679611650485436,
"eval_loss": 0.6675190329551697,
"eval_runtime": 320.4804,
"eval_samples_per_second": 1.607,
"eval_steps_per_second": 1.607,
"step": 3000
},
{
"epoch": 2.597626752966559,
"grad_norm": 14.024328231811523,
"learning_rate": 3.2938347987073576e-06,
"loss": 0.6054,
"step": 3010
},
{
"epoch": 2.606256742179072,
"grad_norm": 13.966845512390137,
"learning_rate": 3.2584585724592967e-06,
"loss": 0.5767,
"step": 3020
},
{
"epoch": 2.614886731391586,
"grad_norm": 6.929962635040283,
"learning_rate": 3.223181287161812e-06,
"loss": 0.5214,
"step": 3030
},
{
"epoch": 2.6235167206040995,
"grad_norm": 9.28740406036377,
"learning_rate": 3.1880049469981468e-06,
"loss": 0.5823,
"step": 3040
},
{
"epoch": 2.6321467098166127,
"grad_norm": 22.37981414794922,
"learning_rate": 3.1529315504166147e-06,
"loss": 0.6293,
"step": 3050
},
{
"epoch": 2.6321467098166127,
"eval_accuracy": 0.6368932038834951,
"eval_loss": 0.6685478091239929,
"eval_runtime": 321.0635,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 3050
},
{
"epoch": 2.6407766990291264,
"grad_norm": 17.161617279052734,
"learning_rate": 3.117963090017071e-06,
"loss": 0.5728,
"step": 3060
},
{
"epoch": 2.6494066882416396,
"grad_norm": 19.009254455566406,
"learning_rate": 3.08310155243771e-06,
"loss": 0.7621,
"step": 3070
},
{
"epoch": 2.6580366774541533,
"grad_norm": 12.797933578491211,
"learning_rate": 3.048348918242191e-06,
"loss": 0.5567,
"step": 3080
},
{
"epoch": 2.6666666666666665,
"grad_norm": 10.396708488464355,
"learning_rate": 3.013707161807128e-06,
"loss": 0.6592,
"step": 3090
},
{
"epoch": 2.67529665587918,
"grad_norm": 8.590036392211914,
"learning_rate": 2.9791782512099098e-06,
"loss": 0.6095,
"step": 3100
},
{
"epoch": 2.67529665587918,
"eval_accuracy": 0.6621359223300971,
"eval_loss": 0.6717608571052551,
"eval_runtime": 321.0303,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 3100
},
{
"epoch": 2.6839266450916934,
"grad_norm": 11.886474609375,
"learning_rate": 2.944764148116902e-06,
"loss": 0.4862,
"step": 3110
},
{
"epoch": 2.692556634304207,
"grad_norm": 15.282882690429688,
"learning_rate": 2.9104668076719876e-06,
"loss": 0.5833,
"step": 3120
},
{
"epoch": 2.701186623516721,
"grad_norm": 15.11883544921875,
"learning_rate": 2.8762881783855025e-06,
"loss": 0.5887,
"step": 3130
},
{
"epoch": 2.709816612729234,
"grad_norm": 9.773431777954102,
"learning_rate": 2.8422302020235252e-06,
"loss": 0.6644,
"step": 3140
},
{
"epoch": 2.7184466019417477,
"grad_norm": 16.19442367553711,
"learning_rate": 2.808294813497563e-06,
"loss": 0.5422,
"step": 3150
},
{
"epoch": 2.7184466019417477,
"eval_accuracy": 0.6485436893203883,
"eval_loss": 0.6904874444007874,
"eval_runtime": 321.1401,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 3150
},
{
"epoch": 2.727076591154261,
"grad_norm": 16.843564987182617,
"learning_rate": 2.7744839407546374e-06,
"loss": 0.6523,
"step": 3160
},
{
"epoch": 2.7357065803667746,
"grad_norm": 18.18024253845215,
"learning_rate": 2.7407995046677377e-06,
"loss": 0.5283,
"step": 3170
},
{
"epoch": 2.7443365695792883,
"grad_norm": 20.41519546508789,
"learning_rate": 2.7072434189266945e-06,
"loss": 0.5934,
"step": 3180
},
{
"epoch": 2.7529665587918015,
"grad_norm": 14.765863418579102,
"learning_rate": 2.6738175899294703e-06,
"loss": 0.6699,
"step": 3190
},
{
"epoch": 2.7615965480043148,
"grad_norm": 17.99534034729004,
"learning_rate": 2.640523916673838e-06,
"loss": 0.6089,
"step": 3200
},
{
"epoch": 2.7615965480043148,
"eval_accuracy": 0.654368932038835,
"eval_loss": 0.6814106106758118,
"eval_runtime": 321.1084,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 3200
},
{
"epoch": 2.7702265372168284,
"grad_norm": 5.104621887207031,
"learning_rate": 2.607364290649501e-06,
"loss": 0.6884,
"step": 3210
},
{
"epoch": 2.778856526429342,
"grad_norm": 17.406665802001953,
"learning_rate": 2.574340595730633e-06,
"loss": 0.6264,
"step": 3220
},
{
"epoch": 2.7874865156418553,
"grad_norm": 8.697972297668457,
"learning_rate": 2.541454708068855e-06,
"loss": 0.5552,
"step": 3230
},
{
"epoch": 2.796116504854369,
"grad_norm": 7.472986698150635,
"learning_rate": 2.5087084959866403e-06,
"loss": 0.596,
"step": 3240
},
{
"epoch": 2.8047464940668823,
"grad_norm": 11.333291053771973,
"learning_rate": 2.476103819871166e-06,
"loss": 0.6238,
"step": 3250
},
{
"epoch": 2.8047464940668823,
"eval_accuracy": 0.6466019417475728,
"eval_loss": 0.6738768815994263,
"eval_runtime": 321.0019,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 3250
},
{
"epoch": 2.813376483279396,
"grad_norm": 15.323911666870117,
"learning_rate": 2.44364253206864e-06,
"loss": 0.6472,
"step": 3260
},
{
"epoch": 2.8220064724919096,
"grad_norm": 14.362588882446289,
"learning_rate": 2.4113264767790433e-06,
"loss": 0.6375,
"step": 3270
},
{
"epoch": 2.830636461704423,
"grad_norm": 11.027913093566895,
"learning_rate": 2.379157489951367e-06,
"loss": 0.6185,
"step": 3280
},
{
"epoch": 2.839266450916936,
"grad_norm": 8.004063606262207,
"learning_rate": 2.3471373991793116e-06,
"loss": 0.6608,
"step": 3290
},
{
"epoch": 2.8478964401294498,
"grad_norm": 11.401987075805664,
"learning_rate": 2.315268023597447e-06,
"loss": 0.7386,
"step": 3300
},
{
"epoch": 2.8478964401294498,
"eval_accuracy": 0.6485436893203883,
"eval_loss": 0.6621807813644409,
"eval_runtime": 321.0895,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 3300
},
{
"epoch": 2.8565264293419634,
"grad_norm": 11.381020545959473,
"learning_rate": 2.2835511737778687e-06,
"loss": 0.5386,
"step": 3310
},
{
"epoch": 2.8651564185544767,
"grad_norm": 14.900254249572754,
"learning_rate": 2.2519886516273365e-06,
"loss": 0.6754,
"step": 3320
},
{
"epoch": 2.8737864077669903,
"grad_norm": 10.069350242614746,
"learning_rate": 2.220582250284905e-06,
"loss": 0.6129,
"step": 3330
},
{
"epoch": 2.8824163969795036,
"grad_norm": 8.782756805419922,
"learning_rate": 2.189333754020046e-06,
"loss": 0.6185,
"step": 3340
},
{
"epoch": 2.8910463861920173,
"grad_norm": 8.9526948928833,
"learning_rate": 2.158244938131277e-06,
"loss": 0.6166,
"step": 3350
},
{
"epoch": 2.8910463861920173,
"eval_accuracy": 0.654368932038835,
"eval_loss": 0.6567447781562805,
"eval_runtime": 320.6468,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3350
},
{
"epoch": 2.899676375404531,
"grad_norm": 6.0573625564575195,
"learning_rate": 2.12731756884532e-06,
"loss": 0.6601,
"step": 3360
},
{
"epoch": 2.908306364617044,
"grad_norm": 15.11607837677002,
"learning_rate": 2.096553403216739e-06,
"loss": 0.7397,
"step": 3370
},
{
"epoch": 2.916936353829558,
"grad_norm": 7.567427635192871,
"learning_rate": 2.0659541890281236e-06,
"loss": 0.5167,
"step": 3380
},
{
"epoch": 2.925566343042071,
"grad_norm": 11.045202255249023,
"learning_rate": 2.0355216646908016e-06,
"loss": 0.6497,
"step": 3390
},
{
"epoch": 2.9341963322545848,
"grad_norm": 14.782462120056152,
"learning_rate": 2.0052575591460636e-06,
"loss": 0.5866,
"step": 3400
},
{
"epoch": 2.9341963322545848,
"eval_accuracy": 0.6504854368932039,
"eval_loss": 0.6615984439849854,
"eval_runtime": 320.6259,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3400
},
{
"epoch": 2.9428263214670984,
"grad_norm": 5.701985836029053,
"learning_rate": 1.975163591766946e-06,
"loss": 0.6723,
"step": 3410
},
{
"epoch": 2.9514563106796117,
"grad_norm": 10.19908618927002,
"learning_rate": 1.9452414722605432e-06,
"loss": 0.592,
"step": 3420
},
{
"epoch": 2.960086299892125,
"grad_norm": 8.34867000579834,
"learning_rate": 1.915492900570887e-06,
"loss": 0.6623,
"step": 3430
},
{
"epoch": 2.9687162891046386,
"grad_norm": 14.363434791564941,
"learning_rate": 1.885919566782352e-06,
"loss": 0.6295,
"step": 3440
},
{
"epoch": 2.9773462783171523,
"grad_norm": 9.90467357635498,
"learning_rate": 1.8565231510236531e-06,
"loss": 0.6348,
"step": 3450
},
{
"epoch": 2.9773462783171523,
"eval_accuracy": 0.6563106796116505,
"eval_loss": 0.6633828282356262,
"eval_runtime": 320.6481,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3450
},
{
"epoch": 2.9859762675296655,
"grad_norm": 13.353963851928711,
"learning_rate": 1.8273053233723843e-06,
"loss": 0.5338,
"step": 3460
},
{
"epoch": 2.994606256742179,
"grad_norm": 14.00833797454834,
"learning_rate": 1.798267743760142e-06,
"loss": 0.633,
"step": 3470
},
{
"epoch": 3.0032362459546924,
"grad_norm": 14.501118659973145,
"learning_rate": 1.7694120618782169e-06,
"loss": 0.5085,
"step": 3480
},
{
"epoch": 3.011866235167206,
"grad_norm": 9.27495002746582,
"learning_rate": 1.7407399170838802e-06,
"loss": 0.5477,
"step": 3490
},
{
"epoch": 3.0204962243797193,
"grad_norm": 12.652294158935547,
"learning_rate": 1.7122529383072346e-06,
"loss": 0.5907,
"step": 3500
},
{
"epoch": 3.0204962243797193,
"eval_accuracy": 0.658252427184466,
"eval_loss": 0.6642096042633057,
"eval_runtime": 320.7217,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3500
},
{
"epoch": 3.029126213592233,
"grad_norm": 12.352764129638672,
"learning_rate": 1.68395274395868e-06,
"loss": 0.5256,
"step": 3510
},
{
"epoch": 3.0377562028047467,
"grad_norm": 6.0259222984313965,
"learning_rate": 1.6558409418369686e-06,
"loss": 0.4449,
"step": 3520
},
{
"epoch": 3.04638619201726,
"grad_norm": 4.154427528381348,
"learning_rate": 1.6279191290378566e-06,
"loss": 0.449,
"step": 3530
},
{
"epoch": 3.0550161812297736,
"grad_norm": 12.186491012573242,
"learning_rate": 1.6001888918633728e-06,
"loss": 0.4746,
"step": 3540
},
{
"epoch": 3.063646170442287,
"grad_norm": 9.144371032714844,
"learning_rate": 1.5726518057316969e-06,
"loss": 0.4985,
"step": 3550
},
{
"epoch": 3.063646170442287,
"eval_accuracy": 0.654368932038835,
"eval_loss": 0.6903661489486694,
"eval_runtime": 320.6325,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3550
},
{
"epoch": 3.0722761596548005,
"grad_norm": 14.253432273864746,
"learning_rate": 1.5453094350876563e-06,
"loss": 0.5309,
"step": 3560
},
{
"epoch": 3.0809061488673137,
"grad_norm": 14.948261260986328,
"learning_rate": 1.5181633333138456e-06,
"loss": 0.5263,
"step": 3570
},
{
"epoch": 3.0895361380798274,
"grad_norm": 9.058218955993652,
"learning_rate": 1.4912150426423766e-06,
"loss": 0.5077,
"step": 3580
},
{
"epoch": 3.098166127292341,
"grad_norm": 17.286836624145508,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.5556,
"step": 3590
},
{
"epoch": 3.1067961165048543,
"grad_norm": 9.762429237365723,
"learning_rate": 1.4379180072574335e-06,
"loss": 0.53,
"step": 3600
},
{
"epoch": 3.1067961165048543,
"eval_accuracy": 0.6466019417475728,
"eval_loss": 0.6925872564315796,
"eval_runtime": 320.6091,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3600
},
{
"epoch": 3.115426105717368,
"grad_norm": 15.105671882629395,
"learning_rate": 1.411572290470401e-06,
"loss": 0.5956,
"step": 3610
},
{
"epoch": 3.1240560949298812,
"grad_norm": 13.916862487792969,
"learning_rate": 1.3854304404665796e-06,
"loss": 0.5019,
"step": 3620
},
{
"epoch": 3.132686084142395,
"grad_norm": 14.544822692871094,
"learning_rate": 1.359493942424241e-06,
"loss": 0.5761,
"step": 3630
},
{
"epoch": 3.141316073354908,
"grad_norm": 15.535740852355957,
"learning_rate": 1.3337642698551428e-06,
"loss": 0.4957,
"step": 3640
},
{
"epoch": 3.149946062567422,
"grad_norm": 13.230164527893066,
"learning_rate": 1.3082428845208155e-06,
"loss": 0.5728,
"step": 3650
},
{
"epoch": 3.149946062567422,
"eval_accuracy": 0.654368932038835,
"eval_loss": 0.6939272880554199,
"eval_runtime": 320.6286,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3650
},
{
"epoch": 3.158576051779935,
"grad_norm": 11.026480674743652,
"learning_rate": 1.2829312363495155e-06,
"loss": 0.5602,
"step": 3660
},
{
"epoch": 3.1672060409924487,
"grad_norm": 10.449764251708984,
"learning_rate": 1.2578307633538505e-06,
"loss": 0.6031,
"step": 3670
},
{
"epoch": 3.1758360302049624,
"grad_norm": 13.517521858215332,
"learning_rate": 1.232942891549083e-06,
"loss": 0.6053,
"step": 3680
},
{
"epoch": 3.1844660194174756,
"grad_norm": 10.760808944702148,
"learning_rate": 1.2082690348721204e-06,
"loss": 0.5024,
"step": 3690
},
{
"epoch": 3.1930960086299893,
"grad_norm": 14.012762069702148,
"learning_rate": 1.1838105951011758e-06,
"loss": 0.5011,
"step": 3700
},
{
"epoch": 3.1930960086299893,
"eval_accuracy": 0.6601941747572816,
"eval_loss": 0.6916132569313049,
"eval_runtime": 320.6627,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3700
},
{
"epoch": 3.2017259978425026,
"grad_norm": 11.190227508544922,
"learning_rate": 1.1595689617761363e-06,
"loss": 0.4906,
"step": 3710
},
{
"epoch": 3.2103559870550162,
"grad_norm": 17.964550018310547,
"learning_rate": 1.1355455121196234e-06,
"loss": 0.5705,
"step": 3720
},
{
"epoch": 3.2189859762675295,
"grad_norm": 21.885299682617188,
"learning_rate": 1.1117416109587403e-06,
"loss": 0.6581,
"step": 3730
},
{
"epoch": 3.227615965480043,
"grad_norm": 10.283282279968262,
"learning_rate": 1.0881586106475406e-06,
"loss": 0.6133,
"step": 3740
},
{
"epoch": 3.236245954692557,
"grad_norm": 8.597122192382812,
"learning_rate": 1.0647978509901946e-06,
"loss": 0.4987,
"step": 3750
},
{
"epoch": 3.236245954692557,
"eval_accuracy": 0.654368932038835,
"eval_loss": 0.6906397938728333,
"eval_runtime": 320.6953,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3750
},
{
"epoch": 3.24487594390507,
"grad_norm": 10.815213203430176,
"learning_rate": 1.0416606591648737e-06,
"loss": 0.6638,
"step": 3760
},
{
"epoch": 3.2535059331175837,
"grad_norm": 7.768321990966797,
"learning_rate": 1.018748349648348e-06,
"loss": 0.5556,
"step": 3770
},
{
"epoch": 3.262135922330097,
"grad_norm": 11.6558837890625,
"learning_rate": 9.960622241413137e-07,
"loss": 0.5817,
"step": 3780
},
{
"epoch": 3.2707659115426106,
"grad_norm": 14.339502334594727,
"learning_rate": 9.736035714944314e-07,
"loss": 0.5237,
"step": 3790
},
{
"epoch": 3.279395900755124,
"grad_norm": 15.16897964477539,
"learning_rate": 9.513736676351104e-07,
"loss": 0.5909,
"step": 3800
},
{
"epoch": 3.279395900755124,
"eval_accuracy": 0.658252427184466,
"eval_loss": 0.6882277727127075,
"eval_runtime": 320.663,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3800
},
{
"epoch": 3.2880258899676376,
"grad_norm": 13.602522850036621,
"learning_rate": 9.293737754950166e-07,
"loss": 0.5828,
"step": 3810
},
{
"epoch": 3.2966558791801512,
"grad_norm": 17.136140823364258,
"learning_rate": 9.076051449383294e-07,
"loss": 0.6515,
"step": 3820
},
{
"epoch": 3.3052858683926645,
"grad_norm": 13.352173805236816,
"learning_rate": 8.860690126907229e-07,
"loss": 0.5751,
"step": 3830
},
{
"epoch": 3.313915857605178,
"grad_norm": 21.102169036865234,
"learning_rate": 8.64766602269112e-07,
"loss": 0.6061,
"step": 3840
},
{
"epoch": 3.3225458468176914,
"grad_norm": 23.22005844116211,
"learning_rate": 8.436991239121451e-07,
"loss": 0.5194,
"step": 3850
},
{
"epoch": 3.3225458468176914,
"eval_accuracy": 0.6524271844660194,
"eval_loss": 0.6874131560325623,
"eval_runtime": 320.7489,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 3850
},
{
"epoch": 3.331175836030205,
"grad_norm": 8.979095458984375,
"learning_rate": 8.22867774511435e-07,
"loss": 0.5395,
"step": 3860
},
{
"epoch": 3.3398058252427183,
"grad_norm": 9.126049041748047,
"learning_rate": 8.022737375435735e-07,
"loss": 0.566,
"step": 3870
},
{
"epoch": 3.348435814455232,
"grad_norm": 8.811643600463867,
"learning_rate": 7.81918183002891e-07,
"loss": 0.5703,
"step": 3880
},
{
"epoch": 3.357065803667745,
"grad_norm": 9.9462308883667,
"learning_rate": 7.618022673349834e-07,
"loss": 0.5318,
"step": 3890
},
{
"epoch": 3.365695792880259,
"grad_norm": 15.365378379821777,
"learning_rate": 7.419271333710154e-07,
"loss": 0.5925,
"step": 3900
},
{
"epoch": 3.365695792880259,
"eval_accuracy": 0.6601941747572816,
"eval_loss": 0.685357391834259,
"eval_runtime": 320.5481,
"eval_samples_per_second": 1.607,
"eval_steps_per_second": 1.607,
"step": 3900
},
{
"epoch": 3.3743257820927726,
"grad_norm": 13.633624076843262,
"learning_rate": 7.222939102627919e-07,
"loss": 0.6622,
"step": 3910
},
{
"epoch": 3.382955771305286,
"grad_norm": 14.377915382385254,
"learning_rate": 7.029037134186112e-07,
"loss": 0.4916,
"step": 3920
},
{
"epoch": 3.3915857605177995,
"grad_norm": 11.740239143371582,
"learning_rate": 6.837576444398913e-07,
"loss": 0.5409,
"step": 3930
},
{
"epoch": 3.4002157497303127,
"grad_norm": 10.254107475280762,
"learning_rate": 6.648567910585874e-07,
"loss": 0.6555,
"step": 3940
},
{
"epoch": 3.4088457389428264,
"grad_norm": 16.456100463867188,
"learning_rate": 6.46202227075401e-07,
"loss": 0.4709,
"step": 3950
},
{
"epoch": 3.4088457389428264,
"eval_accuracy": 0.6621359223300971,
"eval_loss": 0.6879016160964966,
"eval_runtime": 320.8657,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 3950
},
{
"epoch": 3.4174757281553396,
"grad_norm": 6.954639911651611,
"learning_rate": 6.277950122987631e-07,
"loss": 0.542,
"step": 3960
},
{
"epoch": 3.4261057173678533,
"grad_norm": 16.155237197875977,
"learning_rate": 6.096361924846333e-07,
"loss": 0.6621,
"step": 3970
},
{
"epoch": 3.4347357065803665,
"grad_norm": 10.976309776306152,
"learning_rate": 5.917267992770881e-07,
"loss": 0.5217,
"step": 3980
},
{
"epoch": 3.44336569579288,
"grad_norm": 17.910186767578125,
"learning_rate": 5.740678501497049e-07,
"loss": 0.669,
"step": 3990
},
{
"epoch": 3.451995685005394,
"grad_norm": 16.26474952697754,
"learning_rate": 5.566603483477607e-07,
"loss": 0.5317,
"step": 4000
},
{
"epoch": 3.451995685005394,
"eval_accuracy": 0.6601941747572816,
"eval_loss": 0.6886419057846069,
"eval_runtime": 320.5766,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 4000
},
{
"epoch": 3.460625674217907,
"grad_norm": 22.223215103149414,
"learning_rate": 5.395052828312359e-07,
"loss": 0.5363,
"step": 4010
},
{
"epoch": 3.469255663430421,
"grad_norm": 8.730759620666504,
"learning_rate": 5.226036282186286e-07,
"loss": 0.6681,
"step": 4020
},
{
"epoch": 3.477885652642934,
"grad_norm": 8.632150650024414,
"learning_rate": 5.059563447315829e-07,
"loss": 0.5089,
"step": 4030
},
{
"epoch": 3.4865156418554477,
"grad_norm": 9.663848876953125,
"learning_rate": 4.895643781403375e-07,
"loss": 0.4644,
"step": 4040
},
{
"epoch": 3.4951456310679614,
"grad_norm": 11.52153205871582,
"learning_rate": 4.73428659709998e-07,
"loss": 0.5821,
"step": 4050
},
{
"epoch": 3.4951456310679614,
"eval_accuracy": 0.6660194174757281,
"eval_loss": 0.6889378428459167,
"eval_runtime": 320.9557,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 4050
},
{
"epoch": 3.5037756202804746,
"grad_norm": 17.435976028442383,
"learning_rate": 4.575501061476195e-07,
"loss": 0.5951,
"step": 4060
},
{
"epoch": 3.512405609492988,
"grad_norm": 13.329899787902832,
"learning_rate": 4.4192961955013766e-07,
"loss": 0.5985,
"step": 4070
},
{
"epoch": 3.5210355987055015,
"grad_norm": 10.234993934631348,
"learning_rate": 4.265680873531136e-07,
"loss": 0.5232,
"step": 4080
},
{
"epoch": 3.529665587918015,
"grad_norm": 13.122269630432129,
"learning_rate": 4.1146638228031557e-07,
"loss": 0.5554,
"step": 4090
},
{
"epoch": 3.5382955771305284,
"grad_norm": 10.752240180969238,
"learning_rate": 3.966253622941385e-07,
"loss": 0.5887,
"step": 4100
},
{
"epoch": 3.5382955771305284,
"eval_accuracy": 0.6640776699029126,
"eval_loss": 0.6890589594841003,
"eval_runtime": 321.1286,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 4100
},
{
"epoch": 3.546925566343042,
"grad_norm": 13.36107063293457,
"learning_rate": 3.820458705468633e-07,
"loss": 0.5101,
"step": 4110
},
{
"epoch": 3.5555555555555554,
"grad_norm": 11.969443321228027,
"learning_rate": 3.677287353327519e-07,
"loss": 0.6162,
"step": 4120
},
{
"epoch": 3.564185544768069,
"grad_norm": 15.6027250289917,
"learning_rate": 3.536747700409932e-07,
"loss": 0.6591,
"step": 4130
},
{
"epoch": 3.5728155339805827,
"grad_norm": 10.335657119750977,
"learning_rate": 3.3988477310948785e-07,
"loss": 0.5749,
"step": 4140
},
{
"epoch": 3.581445523193096,
"grad_norm": 7.062427043914795,
"learning_rate": 3.2635952797949566e-07,
"loss": 0.5362,
"step": 4150
},
{
"epoch": 3.581445523193096,
"eval_accuracy": 0.6640776699029126,
"eval_loss": 0.6879053711891174,
"eval_runtime": 321.1587,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 4150
},
{
"epoch": 3.5900755124056096,
"grad_norm": 9.053596496582031,
"learning_rate": 3.1309980305111674e-07,
"loss": 0.5753,
"step": 4160
},
{
"epoch": 3.598705501618123,
"grad_norm": 9.732317924499512,
"learning_rate": 3.0010635163964186e-07,
"loss": 0.5671,
"step": 4170
},
{
"epoch": 3.6073354908306365,
"grad_norm": 14.350728034973145,
"learning_rate": 2.8737991193275805e-07,
"loss": 0.525,
"step": 4180
},
{
"epoch": 3.61596548004315,
"grad_norm": 12.92699146270752,
"learning_rate": 2.7492120694860237e-07,
"loss": 0.5276,
"step": 4190
},
{
"epoch": 3.6245954692556634,
"grad_norm": 8.268197059631348,
"learning_rate": 2.627309444946929e-07,
"loss": 0.4971,
"step": 4200
},
{
"epoch": 3.6245954692556634,
"eval_accuracy": 0.6640776699029126,
"eval_loss": 0.6887635588645935,
"eval_runtime": 320.9246,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 4200
},
{
"epoch": 3.6332254584681767,
"grad_norm": 9.3760404586792,
"learning_rate": 2.5080981712771344e-07,
"loss": 0.4793,
"step": 4210
},
{
"epoch": 3.6418554476806904,
"grad_norm": 17.867101669311523,
"learning_rate": 2.391585021141668e-07,
"loss": 0.4916,
"step": 4220
},
{
"epoch": 3.650485436893204,
"grad_norm": 9.685575485229492,
"learning_rate": 2.2777766139190084e-07,
"loss": 0.54,
"step": 4230
},
{
"epoch": 3.6591154261057173,
"grad_norm": 20.8098201751709,
"learning_rate": 2.1666794153249792e-07,
"loss": 0.6402,
"step": 4240
},
{
"epoch": 3.667745415318231,
"grad_norm": 9.999732971191406,
"learning_rate": 2.0582997370454882e-07,
"loss": 0.5009,
"step": 4250
},
{
"epoch": 3.667745415318231,
"eval_accuracy": 0.6640776699029126,
"eval_loss": 0.6899433732032776,
"eval_runtime": 321.085,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 4250
},
{
"epoch": 3.676375404530744,
"grad_norm": 13.440372467041016,
"learning_rate": 1.9526437363778404e-07,
"loss": 0.7073,
"step": 4260
},
{
"epoch": 3.685005393743258,
"grad_norm": 12.25793170928955,
"learning_rate": 1.8497174158810361e-07,
"loss": 0.6589,
"step": 4270
},
{
"epoch": 3.6936353829557715,
"grad_norm": 20.834096908569336,
"learning_rate": 1.749526623034681e-07,
"loss": 0.6127,
"step": 4280
},
{
"epoch": 3.7022653721682848,
"grad_norm": 14.255398750305176,
"learning_rate": 1.6520770499068083e-07,
"loss": 0.4761,
"step": 4290
},
{
"epoch": 3.710895361380798,
"grad_norm": 6.590888977050781,
"learning_rate": 1.557374232830483e-07,
"loss": 0.5813,
"step": 4300
},
{
"epoch": 3.710895361380798,
"eval_accuracy": 0.6621359223300971,
"eval_loss": 0.6886661648750305,
"eval_runtime": 321.0071,
"eval_samples_per_second": 1.604,
"eval_steps_per_second": 1.604,
"step": 4300
},
{
"epoch": 3.7195253505933117,
"grad_norm": 7.404444694519043,
"learning_rate": 1.4654235520892958e-07,
"loss": 0.5689,
"step": 4310
},
{
"epoch": 3.7281553398058254,
"grad_norm": 18.861854553222656,
"learning_rate": 1.3762302316116527e-07,
"loss": 0.4723,
"step": 4320
},
{
"epoch": 3.7367853290183386,
"grad_norm": 20.41657257080078,
"learning_rate": 1.289799338674036e-07,
"loss": 0.6008,
"step": 4330
},
{
"epoch": 3.7454153182308523,
"grad_norm": 11.25420093536377,
"learning_rate": 1.2061357836131104e-07,
"loss": 0.5452,
"step": 4340
},
{
"epoch": 3.7540453074433655,
"grad_norm": 13.756759643554688,
"learning_rate": 1.1252443195467311e-07,
"loss": 0.6147,
"step": 4350
},
{
"epoch": 3.7540453074433655,
"eval_accuracy": 0.6640776699029126,
"eval_loss": 0.6891469955444336,
"eval_runtime": 320.9449,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 4350
},
{
"epoch": 3.762675296655879,
"grad_norm": 13.715859413146973,
"learning_rate": 1.0471295421039251e-07,
"loss": 0.5173,
"step": 4360
},
{
"epoch": 3.771305285868393,
"grad_norm": 7.733090400695801,
"learning_rate": 9.71795889163818e-08,
"loss": 0.6093,
"step": 4370
},
{
"epoch": 3.779935275080906,
"grad_norm": 7.727634429931641,
"learning_rate": 8.992476406034845e-08,
"loss": 0.5655,
"step": 4380
},
{
"epoch": 3.7885652642934198,
"grad_norm": 8.828600883483887,
"learning_rate": 8.294889180548104e-08,
"loss": 0.7,
"step": 4390
},
{
"epoch": 3.797195253505933,
"grad_norm": 8.170161247253418,
"learning_rate": 7.625236846703243e-08,
"loss": 0.6033,
"step": 4400
},
{
"epoch": 3.797195253505933,
"eval_accuracy": 0.6640776699029126,
"eval_loss": 0.6890521049499512,
"eval_runtime": 320.8322,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 4400
},
{
"epoch": 3.8058252427184467,
"grad_norm": 10.907033920288086,
"learning_rate": 6.983557448980549e-08,
"loss": 0.5508,
"step": 4410
},
{
"epoch": 3.81445523193096,
"grad_norm": 16.888439178466797,
"learning_rate": 6.369887442653877e-08,
"loss": 0.5819,
"step": 4420
},
{
"epoch": 3.8230852211434736,
"grad_norm": 20.531522750854492,
"learning_rate": 5.7842616917193064e-08,
"loss": 0.4267,
"step": 4430
},
{
"epoch": 3.831715210355987,
"grad_norm": 8.410703659057617,
"learning_rate": 5.226713466915001e-08,
"loss": 0.5266,
"step": 4440
},
{
"epoch": 3.8403451995685005,
"grad_norm": 6.310892105102539,
"learning_rate": 4.697274443830335e-08,
"loss": 0.565,
"step": 4450
},
{
"epoch": 3.8403451995685005,
"eval_accuracy": 0.6660194174757281,
"eval_loss": 0.6890508532524109,
"eval_runtime": 320.9035,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 1.605,
"step": 4450
},
{
"epoch": 3.848975188781014,
"grad_norm": 28.219768524169922,
"learning_rate": 4.195974701106775e-08,
"loss": 0.5493,
"step": 4460
},
{
"epoch": 3.8576051779935274,
"grad_norm": 19.05866241455078,
"learning_rate": 3.722842718728969e-08,
"loss": 0.5646,
"step": 4470
},
{
"epoch": 3.866235167206041,
"grad_norm": 8.093132019042969,
"learning_rate": 3.277905376406654e-08,
"loss": 0.5774,
"step": 4480
},
{
"epoch": 3.8748651564185543,
"grad_norm": 10.243422508239746,
"learning_rate": 2.8611879520476503e-08,
"loss": 0.6114,
"step": 4490
},
{
"epoch": 3.883495145631068,
"grad_norm": 9.737555503845215,
"learning_rate": 2.4727141203216286e-08,
"loss": 0.5044,
"step": 4500
},
{
"epoch": 3.883495145631068,
"eval_accuracy": 0.6640776699029126,
"eval_loss": 0.6893202662467957,
"eval_runtime": 321.2665,
"eval_samples_per_second": 1.603,
"eval_steps_per_second": 1.603,
"step": 4500
},
{
"epoch": 3.8921251348435817,
"grad_norm": 15.192139625549316,
"learning_rate": 2.1125059513152357e-08,
"loss": 0.5512,
"step": 4510
},
{
"epoch": 3.900755124056095,
"grad_norm": 23.43290901184082,
"learning_rate": 1.7805839092781553e-08,
"loss": 0.633,
"step": 4520
},
{
"epoch": 3.909385113268608,
"grad_norm": 13.518702507019043,
"learning_rate": 1.4769668514605374e-08,
"loss": 0.5216,
"step": 4530
},
{
"epoch": 3.918015102481122,
"grad_norm": 11.329241752624512,
"learning_rate": 1.2016720270417448e-08,
"loss": 0.5502,
"step": 4540
},
{
"epoch": 3.9266450916936355,
"grad_norm": 20.290353775024414,
"learning_rate": 9.547150761501922e-09,
"loss": 0.613,
"step": 4550
},
{
"epoch": 3.9266450916936355,
"eval_accuracy": 0.6660194174757281,
"eval_loss": 0.68938148021698,
"eval_runtime": 320.6069,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 4550
},
{
"epoch": 3.9352750809061487,
"grad_norm": 10.623443603515625,
"learning_rate": 7.3611002897489015e-09,
"loss": 0.5943,
"step": 4560
},
{
"epoch": 3.9439050701186624,
"grad_norm": 13.714851379394531,
"learning_rate": 5.458693049684161e-09,
"loss": 0.5628,
"step": 4570
},
{
"epoch": 3.9525350593311757,
"grad_norm": 20.694622039794922,
"learning_rate": 3.8400371214131205e-09,
"loss": 0.5538,
"step": 4580
},
{
"epoch": 3.9611650485436893,
"grad_norm": 14.463215827941895,
"learning_rate": 2.5052244644802048e-09,
"loss": 0.64,
"step": 4590
},
{
"epoch": 3.969795037756203,
"grad_norm": 7.637043476104736,
"learning_rate": 1.4543309126446858e-09,
"loss": 0.4614,
"step": 4600
},
{
"epoch": 3.969795037756203,
"eval_accuracy": 0.6640776699029126,
"eval_loss": 0.6896011829376221,
"eval_runtime": 320.6166,
"eval_samples_per_second": 1.606,
"eval_steps_per_second": 1.606,
"step": 4600
},
{
"epoch": 3.9784250269687162,
"grad_norm": 12.583084106445312,
"learning_rate": 6.874161695719084e-10,
"loss": 0.5865,
"step": 4610
},
{
"epoch": 3.98705501618123,
"grad_norm": 16.6655216217041,
"learning_rate": 2.045238054415588e-10,
"loss": 0.5533,
"step": 4620
},
{
"epoch": 3.995685005393743,
"grad_norm": 26.88420867919922,
"learning_rate": 5.681254474088072e-12,
"loss": 0.6292,
"step": 4630
},
{
"epoch": 3.997411003236246,
"step": 4632,
"total_flos": 0.0,
"train_loss": 0.6694142627746947,
"train_runtime": 66014.9203,
"train_samples_per_second": 0.281,
"train_steps_per_second": 0.07
}
],
"logging_steps": 10,
"max_steps": 4632,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}