{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993049349617714, "eval_steps": 500, "global_step": 1258, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007943600436898023, "grad_norm": 11.12151660776111, "learning_rate": 1.5873015873015874e-07, "loss": 1.8009, "step": 1 }, { "epoch": 0.003971800218449012, "grad_norm": 10.10420314385476, "learning_rate": 7.936507936507937e-07, "loss": 1.7719, "step": 5 }, { "epoch": 0.007943600436898023, "grad_norm": 2.7149708608696828, "learning_rate": 1.5873015873015873e-06, "loss": 1.7086, "step": 10 }, { "epoch": 0.011915400655347037, "grad_norm": 1.570269172415534, "learning_rate": 2.380952380952381e-06, "loss": 1.6525, "step": 15 }, { "epoch": 0.015887200873796047, "grad_norm": 1.166276790017584, "learning_rate": 3.1746031746031746e-06, "loss": 1.6423, "step": 20 }, { "epoch": 0.01985900109224506, "grad_norm": 1.0260020187790146, "learning_rate": 3.968253968253968e-06, "loss": 1.6245, "step": 25 }, { "epoch": 0.023830801310694073, "grad_norm": 0.9521643373391829, "learning_rate": 4.761904761904762e-06, "loss": 1.6242, "step": 30 }, { "epoch": 0.027802601529143083, "grad_norm": 0.9087502803225287, "learning_rate": 5.555555555555557e-06, "loss": 1.6067, "step": 35 }, { "epoch": 0.03177440174759209, "grad_norm": 0.9276432527674262, "learning_rate": 6.349206349206349e-06, "loss": 1.5848, "step": 40 }, { "epoch": 0.035746201966041107, "grad_norm": 0.8919073066787582, "learning_rate": 7.1428571428571436e-06, "loss": 1.6, "step": 45 }, { "epoch": 0.03971800218449012, "grad_norm": 0.965858397020833, "learning_rate": 7.936507936507936e-06, "loss": 1.6023, "step": 50 }, { "epoch": 0.04368980240293913, "grad_norm": 0.9487708676860586, "learning_rate": 8.730158730158731e-06, "loss": 1.5826, "step": 55 }, { "epoch": 0.04766160262138815, "grad_norm": 0.9151332727520083, "learning_rate": 9.523809523809525e-06, "loss": 1.5978, "step": 60 }, { "epoch": 0.05163340283983715, "grad_norm": 0.917185703733017, "learning_rate": 1.031746031746032e-05, "loss": 1.6003, "step": 65 }, { "epoch": 0.055605203058286166, "grad_norm": 0.9194628312444804, "learning_rate": 1.1111111111111113e-05, "loss": 1.5856, "step": 70 }, { "epoch": 0.05957700327673518, "grad_norm": 0.8901707069616845, "learning_rate": 1.1904761904761905e-05, "loss": 1.5846, "step": 75 }, { "epoch": 0.06354880349518419, "grad_norm": 0.966608805341671, "learning_rate": 1.2698412698412699e-05, "loss": 1.6089, "step": 80 }, { "epoch": 0.0675206037136332, "grad_norm": 0.9313133663663362, "learning_rate": 1.3492063492063494e-05, "loss": 1.5815, "step": 85 }, { "epoch": 0.07149240393208221, "grad_norm": 0.9808540327178217, "learning_rate": 1.4285714285714287e-05, "loss": 1.5816, "step": 90 }, { "epoch": 0.07546420415053123, "grad_norm": 0.9026305570096459, "learning_rate": 1.507936507936508e-05, "loss": 1.5958, "step": 95 }, { "epoch": 0.07943600436898024, "grad_norm": 0.9788483223436265, "learning_rate": 1.5873015873015872e-05, "loss": 1.5911, "step": 100 }, { "epoch": 0.08340780458742925, "grad_norm": 0.9515538442938523, "learning_rate": 1.6666666666666667e-05, "loss": 1.5865, "step": 105 }, { "epoch": 0.08737960480587827, "grad_norm": 0.926406626131289, "learning_rate": 1.7460317460317463e-05, "loss": 1.5793, "step": 110 }, { "epoch": 0.09135140502432727, "grad_norm": 0.922601693661366, "learning_rate": 1.8253968253968254e-05, "loss": 1.5822, "step": 115 }, { "epoch": 0.0953232052427763, "grad_norm": 0.9585781023399166, "learning_rate": 1.904761904761905e-05, "loss": 1.5718, "step": 120 }, { "epoch": 0.0992950054612253, "grad_norm": 1.0121070868569275, "learning_rate": 1.9841269841269845e-05, "loss": 1.5773, "step": 125 }, { "epoch": 0.1032668056796743, "grad_norm": 1.0425213086708742, "learning_rate": 1.999938384153589e-05, "loss": 1.585, "step": 130 }, { "epoch": 0.10723860589812333, "grad_norm": 0.9511949716486409, "learning_rate": 1.999688082790923e-05, "loss": 1.5868, "step": 135 }, { "epoch": 0.11121040611657233, "grad_norm": 0.9590697791262225, "learning_rate": 1.9992452930796544e-05, "loss": 1.5776, "step": 140 }, { "epoch": 0.11518220633502135, "grad_norm": 0.932339341553828, "learning_rate": 1.9986101002782376e-05, "loss": 1.5789, "step": 145 }, { "epoch": 0.11915400655347036, "grad_norm": 0.9529377325330747, "learning_rate": 1.997782626692034e-05, "loss": 1.5814, "step": 150 }, { "epoch": 0.12312580677191937, "grad_norm": 0.9257334084732001, "learning_rate": 1.9967630316497663e-05, "loss": 1.5659, "step": 155 }, { "epoch": 0.12709760699036837, "grad_norm": 0.9578201119584917, "learning_rate": 1.995551511472836e-05, "loss": 1.5844, "step": 160 }, { "epoch": 0.1310694072088174, "grad_norm": 0.9483988649883341, "learning_rate": 1.994148299437524e-05, "loss": 1.559, "step": 165 }, { "epoch": 0.1350412074272664, "grad_norm": 0.9752973142389638, "learning_rate": 1.9925536657300734e-05, "loss": 1.5783, "step": 170 }, { "epoch": 0.13901300764571542, "grad_norm": 0.9082694825570907, "learning_rate": 1.990767917394666e-05, "loss": 1.5716, "step": 175 }, { "epoch": 0.14298480786416443, "grad_norm": 0.9870799951805275, "learning_rate": 1.9887913982743e-05, "loss": 1.5705, "step": 180 }, { "epoch": 0.14695660808261343, "grad_norm": 0.8978375791866258, "learning_rate": 1.986624488944585e-05, "loss": 1.5738, "step": 185 }, { "epoch": 0.15092840830106247, "grad_norm": 0.9206959902444366, "learning_rate": 1.984267606640462e-05, "loss": 1.5729, "step": 190 }, { "epoch": 0.15490020851951147, "grad_norm": 0.9532760851392515, "learning_rate": 1.9817212051758667e-05, "loss": 1.5674, "step": 195 }, { "epoch": 0.15887200873796048, "grad_norm": 0.8995123548574, "learning_rate": 1.978985774856346e-05, "loss": 1.5683, "step": 200 }, { "epoch": 0.16284380895640949, "grad_norm": 0.9731444458977212, "learning_rate": 1.9760618423846526e-05, "loss": 1.5738, "step": 205 }, { "epoch": 0.1668156091748585, "grad_norm": 0.9813203114593326, "learning_rate": 1.9729499707593284e-05, "loss": 1.5826, "step": 210 }, { "epoch": 0.17078740939330753, "grad_norm": 0.9301814711446849, "learning_rate": 1.9696507591663003e-05, "loss": 1.5565, "step": 215 }, { "epoch": 0.17475920961175653, "grad_norm": 0.9084286747837647, "learning_rate": 1.9661648428635066e-05, "loss": 1.5621, "step": 220 }, { "epoch": 0.17873100983020554, "grad_norm": 0.9292775329242645, "learning_rate": 1.962492893058582e-05, "loss": 1.5533, "step": 225 }, { "epoch": 0.18270281004865455, "grad_norm": 0.9282763244366917, "learning_rate": 1.9586356167796145e-05, "loss": 1.5801, "step": 230 }, { "epoch": 0.18667461026710355, "grad_norm": 0.9219988029696228, "learning_rate": 1.954593756739009e-05, "loss": 1.5802, "step": 235 }, { "epoch": 0.1906464104855526, "grad_norm": 0.9871605765917675, "learning_rate": 1.9503680911904822e-05, "loss": 1.5817, "step": 240 }, { "epoch": 0.1946182107040016, "grad_norm": 0.9551024433790934, "learning_rate": 1.9459594337792063e-05, "loss": 1.571, "step": 245 }, { "epoch": 0.1985900109224506, "grad_norm": 0.9155920092825396, "learning_rate": 1.9413686333851465e-05, "loss": 1.5694, "step": 250 }, { "epoch": 0.2025618111408996, "grad_norm": 0.9097167589577474, "learning_rate": 1.9365965739596086e-05, "loss": 1.556, "step": 255 }, { "epoch": 0.2065336113593486, "grad_norm": 0.9023225707856134, "learning_rate": 1.9316441743550375e-05, "loss": 1.5762, "step": 260 }, { "epoch": 0.21050541157779765, "grad_norm": 0.9077263206493167, "learning_rate": 1.9265123881480912e-05, "loss": 1.5706, "step": 265 }, { "epoch": 0.21447721179624665, "grad_norm": 0.8937248614200829, "learning_rate": 1.9212022034560332e-05, "loss": 1.567, "step": 270 }, { "epoch": 0.21844901201469566, "grad_norm": 0.9122061973911317, "learning_rate": 1.91571464274647e-05, "loss": 1.5742, "step": 275 }, { "epoch": 0.22242081223314467, "grad_norm": 0.9625286348265168, "learning_rate": 1.91005076264048e-05, "loss": 1.5653, "step": 280 }, { "epoch": 0.22639261245159367, "grad_norm": 0.9485988527754959, "learning_rate": 1.9042116537091583e-05, "loss": 1.555, "step": 285 }, { "epoch": 0.2303644126700427, "grad_norm": 0.9576868955487222, "learning_rate": 1.898198440263633e-05, "loss": 1.5624, "step": 290 }, { "epoch": 0.2343362128884917, "grad_norm": 0.947081159201408, "learning_rate": 1.8920122801385785e-05, "loss": 1.5567, "step": 295 }, { "epoch": 0.23830801310694072, "grad_norm": 0.9810396328169484, "learning_rate": 1.8856543644692767e-05, "loss": 1.5552, "step": 300 }, { "epoch": 0.24227981332538973, "grad_norm": 0.9296206498184163, "learning_rate": 1.8791259174622668e-05, "loss": 1.5791, "step": 305 }, { "epoch": 0.24625161354383873, "grad_norm": 0.9647483866705726, "learning_rate": 1.8724281961596255e-05, "loss": 1.5604, "step": 310 }, { "epoch": 0.25022341376228774, "grad_norm": 0.8990721005609152, "learning_rate": 1.865562490196924e-05, "loss": 1.5648, "step": 315 }, { "epoch": 0.25419521398073675, "grad_norm": 0.8752521530386431, "learning_rate": 1.8585301215549152e-05, "loss": 1.575, "step": 320 }, { "epoch": 0.2581670141991858, "grad_norm": 0.9677847810573758, "learning_rate": 1.8513324443049826e-05, "loss": 1.5752, "step": 325 }, { "epoch": 0.2621388144176348, "grad_norm": 0.8650992533303118, "learning_rate": 1.8439708443484212e-05, "loss": 1.5576, "step": 330 }, { "epoch": 0.2661106146360838, "grad_norm": 0.8721763870519639, "learning_rate": 1.836446739149581e-05, "loss": 1.55, "step": 335 }, { "epoch": 0.2700824148545328, "grad_norm": 0.9043150779726963, "learning_rate": 1.8287615774629372e-05, "loss": 1.5736, "step": 340 }, { "epoch": 0.27405421507298183, "grad_norm": 0.872060546407473, "learning_rate": 1.820916839054137e-05, "loss": 1.5739, "step": 345 }, { "epoch": 0.27802601529143084, "grad_norm": 0.9490597285856667, "learning_rate": 1.8129140344150698e-05, "loss": 1.5656, "step": 350 }, { "epoch": 0.28199781550987985, "grad_norm": 0.9293959378534794, "learning_rate": 1.8047547044730266e-05, "loss": 1.5601, "step": 355 }, { "epoch": 0.28596961572832885, "grad_norm": 0.8875229973252522, "learning_rate": 1.796440420293996e-05, "loss": 1.5595, "step": 360 }, { "epoch": 0.28994141594677786, "grad_norm": 0.9241685744107659, "learning_rate": 1.7879727827801587e-05, "loss": 1.5681, "step": 365 }, { "epoch": 0.29391321616522686, "grad_norm": 0.9255387788030182, "learning_rate": 1.7793534223616354e-05, "loss": 1.5613, "step": 370 }, { "epoch": 0.2978850163836759, "grad_norm": 0.9447825641314226, "learning_rate": 1.7705839986825502e-05, "loss": 1.5726, "step": 375 }, { "epoch": 0.30185681660212493, "grad_norm": 0.879507513143992, "learning_rate": 1.7616662002814704e-05, "loss": 1.5419, "step": 380 }, { "epoch": 0.30582861682057394, "grad_norm": 0.89862098882536, "learning_rate": 1.752601744266278e-05, "loss": 1.5516, "step": 385 }, { "epoch": 0.30980041703902295, "grad_norm": 0.9106190928925514, "learning_rate": 1.7433923759835468e-05, "loss": 1.5565, "step": 390 }, { "epoch": 0.31377221725747195, "grad_norm": 0.9288737547498717, "learning_rate": 1.7340398686824755e-05, "loss": 1.5732, "step": 395 }, { "epoch": 0.31774401747592096, "grad_norm": 0.9087033425211577, "learning_rate": 1.7245460231734537e-05, "loss": 1.5492, "step": 400 }, { "epoch": 0.32171581769436997, "grad_norm": 0.8999862928001036, "learning_rate": 1.7149126674813174e-05, "loss": 1.5695, "step": 405 }, { "epoch": 0.32568761791281897, "grad_norm": 0.9119036306365369, "learning_rate": 1.7051416564933677e-05, "loss": 1.5507, "step": 410 }, { "epoch": 0.329659418131268, "grad_norm": 0.8975417094048892, "learning_rate": 1.6952348716022112e-05, "loss": 1.5902, "step": 415 }, { "epoch": 0.333631218349717, "grad_norm": 0.8840046538984045, "learning_rate": 1.6851942203435056e-05, "loss": 1.5592, "step": 420 }, { "epoch": 0.33760301856816605, "grad_norm": 0.9029659180561906, "learning_rate": 1.6750216360286634e-05, "loss": 1.5829, "step": 425 }, { "epoch": 0.34157481878661505, "grad_norm": 0.8585662547884547, "learning_rate": 1.664719077372597e-05, "loss": 1.5576, "step": 430 }, { "epoch": 0.34554661900506406, "grad_norm": 0.8785123954008921, "learning_rate": 1.6563847811650376e-05, "loss": 1.5683, "step": 435 }, { "epoch": 0.34951841922351307, "grad_norm": 0.8697699611475651, "learning_rate": 1.64585328429674e-05, "loss": 1.5448, "step": 440 }, { "epoch": 0.3534902194419621, "grad_norm": 0.8843659106432961, "learning_rate": 1.635197429406901e-05, "loss": 1.5726, "step": 445 }, { "epoch": 0.3574620196604111, "grad_norm": 0.9485513906574914, "learning_rate": 1.6244192682634143e-05, "loss": 1.5465, "step": 450 }, { "epoch": 0.3614338198788601, "grad_norm": 0.9407868053462191, "learning_rate": 1.6135208761840457e-05, "loss": 1.5591, "step": 455 }, { "epoch": 0.3654056200973091, "grad_norm": 0.9465851646246182, "learning_rate": 1.602504351636838e-05, "loss": 1.5534, "step": 460 }, { "epoch": 0.3693774203157581, "grad_norm": 0.9186695674403026, "learning_rate": 1.591371815836051e-05, "loss": 1.5543, "step": 465 }, { "epoch": 0.3733492205342071, "grad_norm": 0.8884315626992574, "learning_rate": 1.580125412333728e-05, "loss": 1.5402, "step": 470 }, { "epoch": 0.37732102075265617, "grad_norm": 0.8733398363412331, "learning_rate": 1.5687673066069568e-05, "loss": 1.552, "step": 475 }, { "epoch": 0.3812928209711052, "grad_norm": 0.8992808033952767, "learning_rate": 1.5572996856409094e-05, "loss": 1.5638, "step": 480 }, { "epoch": 0.3852646211895542, "grad_norm": 0.8937067889490861, "learning_rate": 1.5457247575077445e-05, "loss": 1.5406, "step": 485 }, { "epoch": 0.3892364214080032, "grad_norm": 0.9108964394640126, "learning_rate": 1.534044750941444e-05, "loss": 1.5472, "step": 490 }, { "epoch": 0.3932082216264522, "grad_norm": 0.8793030627405377, "learning_rate": 1.5222619149086746e-05, "loss": 1.5412, "step": 495 }, { "epoch": 0.3971800218449012, "grad_norm": 0.8782946871477869, "learning_rate": 1.5103785181757533e-05, "loss": 1.5396, "step": 500 }, { "epoch": 0.4011518220633502, "grad_norm": 0.8674461660159545, "learning_rate": 1.4983968488718005e-05, "loss": 1.5426, "step": 505 }, { "epoch": 0.4051236222817992, "grad_norm": 0.8976341288348811, "learning_rate": 1.4863192140481624e-05, "loss": 1.5537, "step": 510 }, { "epoch": 0.4090954225002482, "grad_norm": 0.8431858627506479, "learning_rate": 1.4741479392341941e-05, "loss": 1.5586, "step": 515 }, { "epoch": 0.4130672227186972, "grad_norm": 0.9421670179657328, "learning_rate": 1.4618853679894813e-05, "loss": 1.5202, "step": 520 }, { "epoch": 0.4170390229371463, "grad_norm": 0.8878585099095585, "learning_rate": 1.4495338614525927e-05, "loss": 1.5507, "step": 525 }, { "epoch": 0.4210108231555953, "grad_norm": 0.9642989519892418, "learning_rate": 1.437095797886445e-05, "loss": 1.5488, "step": 530 }, { "epoch": 0.4249826233740443, "grad_norm": 0.9246966546921916, "learning_rate": 1.4245735722203736e-05, "loss": 1.5401, "step": 535 }, { "epoch": 0.4289544235924933, "grad_norm": 0.9529371824264209, "learning_rate": 1.4119695955889925e-05, "loss": 1.5495, "step": 540 }, { "epoch": 0.4329262238109423, "grad_norm": 0.8825791672603804, "learning_rate": 1.3992862948679332e-05, "loss": 1.5491, "step": 545 }, { "epoch": 0.4368980240293913, "grad_norm": 0.8881684368996328, "learning_rate": 1.3865261122065551e-05, "loss": 1.5482, "step": 550 }, { "epoch": 0.4408698242478403, "grad_norm": 0.8423283640939411, "learning_rate": 1.3736915045577122e-05, "loss": 1.5488, "step": 555 }, { "epoch": 0.44484162446628933, "grad_norm": 0.8255688998685623, "learning_rate": 1.3607849432046717e-05, "loss": 1.5478, "step": 560 }, { "epoch": 0.44881342468473834, "grad_norm": 0.8357255814716047, "learning_rate": 1.3478089132852717e-05, "loss": 1.5598, "step": 565 }, { "epoch": 0.45278522490318734, "grad_norm": 0.8217394668509155, "learning_rate": 1.3347659133134118e-05, "loss": 1.5141, "step": 570 }, { "epoch": 0.4567570251216364, "grad_norm": 0.8370380332545342, "learning_rate": 1.3216584546979702e-05, "loss": 1.5338, "step": 575 }, { "epoch": 0.4607288253400854, "grad_norm": 0.9499613626096974, "learning_rate": 1.3084890612592325e-05, "loss": 1.5633, "step": 580 }, { "epoch": 0.4647006255585344, "grad_norm": 0.9043571179729512, "learning_rate": 1.2979106570683663e-05, "loss": 1.5624, "step": 585 }, { "epoch": 0.4686724257769834, "grad_norm": 0.837192808810411, "learning_rate": 1.2846361787292137e-05, "loss": 1.5514, "step": 590 }, { "epoch": 0.47264422599543243, "grad_norm": 0.893985182264359, "learning_rate": 1.2713068941470547e-05, "loss": 1.5609, "step": 595 }, { "epoch": 0.47661602621388144, "grad_norm": 0.9569825734990193, "learning_rate": 1.2579253698544124e-05, "loss": 1.5421, "step": 600 }, { "epoch": 0.48058782643233044, "grad_norm": 0.8770646136960255, "learning_rate": 1.2444941824424825e-05, "loss": 1.5392, "step": 605 }, { "epoch": 0.48455962665077945, "grad_norm": 0.9065759679781771, "learning_rate": 1.2310159180650158e-05, "loss": 1.5277, "step": 610 }, { "epoch": 0.48853142686922846, "grad_norm": 0.8727166144942986, "learning_rate": 1.2174931719403568e-05, "loss": 1.5206, "step": 615 }, { "epoch": 0.49250322708767746, "grad_norm": 0.8374618600702926, "learning_rate": 1.2039285478517417e-05, "loss": 1.5363, "step": 620 }, { "epoch": 0.4964750273061265, "grad_norm": 0.8559233592367627, "learning_rate": 1.1903246576459398e-05, "loss": 1.5188, "step": 625 }, { "epoch": 0.5004468275245755, "grad_norm": 0.8928106567260038, "learning_rate": 1.1766841207303498e-05, "loss": 1.5388, "step": 630 }, { "epoch": 0.5044186277430245, "grad_norm": 0.9205521273045262, "learning_rate": 1.1630095635686359e-05, "loss": 1.5246, "step": 635 }, { "epoch": 0.5083904279614735, "grad_norm": 0.8459927952590032, "learning_rate": 1.1493036191750067e-05, "loss": 1.5597, "step": 640 }, { "epoch": 0.5123622281799225, "grad_norm": 0.8954533380109134, "learning_rate": 1.1355689266072314e-05, "loss": 1.5407, "step": 645 }, { "epoch": 0.5163340283983716, "grad_norm": 0.8801173721660838, "learning_rate": 1.1218081304584959e-05, "loss": 1.5358, "step": 650 }, { "epoch": 0.5203058286168206, "grad_norm": 0.8777403433212762, "learning_rate": 1.1080238803481878e-05, "loss": 1.5529, "step": 655 }, { "epoch": 0.5242776288352696, "grad_norm": 0.8804685666128383, "learning_rate": 1.0942188304117184e-05, "loss": 1.5373, "step": 660 }, { "epoch": 0.5282494290537186, "grad_norm": 0.8340381127557642, "learning_rate": 1.0803956387894715e-05, "loss": 1.5454, "step": 665 }, { "epoch": 0.5322212292721676, "grad_norm": 0.8855144262951853, "learning_rate": 1.066556967114984e-05, "loss": 1.5283, "step": 670 }, { "epoch": 0.5361930294906166, "grad_norm": 0.8545713944344788, "learning_rate": 1.0527054800024537e-05, "loss": 1.5434, "step": 675 }, { "epoch": 0.5401648297090657, "grad_norm": 0.8489068751890607, "learning_rate": 1.0388438445336677e-05, "loss": 1.5134, "step": 680 }, { "epoch": 0.5441366299275147, "grad_norm": 0.8830252810636811, "learning_rate": 1.0249747297444659e-05, "loss": 1.5412, "step": 685 }, { "epoch": 0.5481084301459637, "grad_norm": 0.815565252068504, "learning_rate": 1.0111008061108176e-05, "loss": 1.5327, "step": 690 }, { "epoch": 0.5520802303644127, "grad_norm": 0.8249011581838526, "learning_rate": 9.972247450346272e-06, "loss": 1.5083, "step": 695 }, { "epoch": 0.5560520305828617, "grad_norm": 0.8855145666062009, "learning_rate": 9.833492183293616e-06, "loss": 1.5481, "step": 700 }, { "epoch": 0.5600238308013107, "grad_norm": 0.8735055764957946, "learning_rate": 9.69476897705595e-06, "loss": 1.5224, "step": 705 }, { "epoch": 0.5639956310197597, "grad_norm": 0.8651835408338951, "learning_rate": 9.55610454256575e-06, "loss": 1.5291, "step": 710 }, { "epoch": 0.5679674312382087, "grad_norm": 0.8804958974715262, "learning_rate": 9.417525579439094e-06, "loss": 1.5248, "step": 715 }, { "epoch": 0.5719392314566577, "grad_norm": 0.8450121942474024, "learning_rate": 9.279058770834679e-06, "loss": 1.5264, "step": 720 }, { "epoch": 0.5759110316751067, "grad_norm": 0.8556582918574475, "learning_rate": 9.140730778316037e-06, "loss": 1.5464, "step": 725 }, { "epoch": 0.5798828318935557, "grad_norm": 0.8851588599533484, "learning_rate": 9.002568236717863e-06, "loss": 1.5389, "step": 730 }, { "epoch": 0.5838546321120047, "grad_norm": 0.8608369842804535, "learning_rate": 8.864597749017566e-06, "loss": 1.5392, "step": 735 }, { "epoch": 0.5878264323304537, "grad_norm": 0.8184232636661748, "learning_rate": 8.72684588121287e-06, "loss": 1.558, "step": 740 }, { "epoch": 0.5917982325489027, "grad_norm": 0.84877451025036, "learning_rate": 8.589339157206583e-06, "loss": 1.5388, "step": 745 }, { "epoch": 0.5957700327673519, "grad_norm": 0.8092058683281641, "learning_rate": 8.452104053699474e-06, "loss": 1.5313, "step": 750 }, { "epoch": 0.5997418329858009, "grad_norm": 0.8452233779619619, "learning_rate": 8.315166995092206e-06, "loss": 1.5259, "step": 755 }, { "epoch": 0.6037136332042499, "grad_norm": 0.8484399724532287, "learning_rate": 8.178554348397388e-06, "loss": 1.5193, "step": 760 }, { "epoch": 0.6076854334226989, "grad_norm": 0.8144552151209362, "learning_rate": 8.042292418162611e-06, "loss": 1.5046, "step": 765 }, { "epoch": 0.6116572336411479, "grad_norm": 0.8339533354115302, "learning_rate": 7.906407441405586e-06, "loss": 1.5372, "step": 770 }, { "epoch": 0.6156290338595969, "grad_norm": 0.8305252567757494, "learning_rate": 7.770925582562228e-06, "loss": 1.5365, "step": 775 }, { "epoch": 0.6196008340780459, "grad_norm": 0.8327672905609981, "learning_rate": 7.635872928448734e-06, "loss": 1.5326, "step": 780 }, { "epoch": 0.6235726342964949, "grad_norm": 0.844461561584183, "learning_rate": 7.501275483238619e-06, "loss": 1.543, "step": 785 }, { "epoch": 0.6275444345149439, "grad_norm": 0.8619627758469628, "learning_rate": 7.367159163455648e-06, "loss": 1.5259, "step": 790 }, { "epoch": 0.6315162347333929, "grad_norm": 0.8366660180458262, "learning_rate": 7.2335497929836565e-06, "loss": 1.5465, "step": 795 }, { "epoch": 0.6354880349518419, "grad_norm": 0.8803323174183961, "learning_rate": 7.10047309809418e-06, "loss": 1.5412, "step": 800 }, { "epoch": 0.6394598351702909, "grad_norm": 0.8544630082563632, "learning_rate": 6.967954702492939e-06, "loss": 1.5207, "step": 805 }, { "epoch": 0.6434316353887399, "grad_norm": 0.8328235652984776, "learning_rate": 6.8360201223860024e-06, "loss": 1.5407, "step": 810 }, { "epoch": 0.6474034356071889, "grad_norm": 0.8462106579634066, "learning_rate": 6.704694761566697e-06, "loss": 1.5217, "step": 815 }, { "epoch": 0.6513752358256379, "grad_norm": 0.8277784955761321, "learning_rate": 6.574003906524149e-06, "loss": 1.5389, "step": 820 }, { "epoch": 0.655347036044087, "grad_norm": 0.8532449534893878, "learning_rate": 6.443972721574409e-06, "loss": 1.5046, "step": 825 }, { "epoch": 0.659318836262536, "grad_norm": 0.8230178573303815, "learning_rate": 6.314626244015099e-06, "loss": 1.5062, "step": 830 }, { "epoch": 0.663290636480985, "grad_norm": 0.84801479235042, "learning_rate": 6.18598937930452e-06, "loss": 1.5203, "step": 835 }, { "epoch": 0.667262436699434, "grad_norm": 0.8397075331547054, "learning_rate": 6.058086896266149e-06, "loss": 1.5242, "step": 840 }, { "epoch": 0.671234236917883, "grad_norm": 0.8597360676196729, "learning_rate": 5.930943422319453e-06, "loss": 1.5055, "step": 845 }, { "epoch": 0.6752060371363321, "grad_norm": 0.8775604658344065, "learning_rate": 5.80458343873789e-06, "loss": 1.5257, "step": 850 }, { "epoch": 0.6791778373547811, "grad_norm": 0.8235956970430471, "learning_rate": 5.679031275935104e-06, "loss": 1.5312, "step": 855 }, { "epoch": 0.6831496375732301, "grad_norm": 0.8867364985685439, "learning_rate": 5.55431110878014e-06, "loss": 1.5074, "step": 860 }, { "epoch": 0.6871214377916791, "grad_norm": 0.861522055731343, "learning_rate": 5.430446951942597e-06, "loss": 1.538, "step": 865 }, { "epoch": 0.6910932380101281, "grad_norm": 0.8275026007168744, "learning_rate": 5.307462655268651e-06, "loss": 1.5146, "step": 870 }, { "epoch": 0.6950650382285771, "grad_norm": 1.0158409774185204, "learning_rate": 5.185381899188811e-06, "loss": 1.5276, "step": 875 }, { "epoch": 0.6990368384470261, "grad_norm": 0.8341799420785135, "learning_rate": 5.064228190158274e-06, "loss": 1.5281, "step": 880 }, { "epoch": 0.7030086386654751, "grad_norm": 0.8162862561088432, "learning_rate": 4.944024856130813e-06, "loss": 1.5093, "step": 885 }, { "epoch": 0.7069804388839241, "grad_norm": 0.8479134502646151, "learning_rate": 4.824795042066997e-06, "loss": 1.5455, "step": 890 }, { "epoch": 0.7109522391023732, "grad_norm": 0.8433313440901115, "learning_rate": 4.706561705477687e-06, "loss": 1.5226, "step": 895 }, { "epoch": 0.7149240393208222, "grad_norm": 0.8370841428358435, "learning_rate": 4.5893476120035895e-06, "loss": 1.5412, "step": 900 }, { "epoch": 0.7188958395392712, "grad_norm": 0.8424960564588163, "learning_rate": 4.473175331031765e-06, "loss": 1.5175, "step": 905 }, { "epoch": 0.7228676397577202, "grad_norm": 0.8032981360581487, "learning_rate": 4.358067231349942e-06, "loss": 1.5276, "step": 910 }, { "epoch": 0.7268394399761692, "grad_norm": 0.7960926551931078, "learning_rate": 4.244045476839439e-06, "loss": 1.5167, "step": 915 }, { "epoch": 0.7308112401946182, "grad_norm": 0.838320712946578, "learning_rate": 4.131132022207537e-06, "loss": 1.5445, "step": 920 }, { "epoch": 0.7347830404130672, "grad_norm": 0.8306388787438427, "learning_rate": 4.019348608760137e-06, "loss": 1.5374, "step": 925 }, { "epoch": 0.7387548406315162, "grad_norm": 0.8194701201913649, "learning_rate": 3.908716760215513e-06, "loss": 1.5204, "step": 930 }, { "epoch": 0.7427266408499652, "grad_norm": 0.8194871368550529, "learning_rate": 3.799257778559955e-06, "loss": 1.5292, "step": 935 }, { "epoch": 0.7466984410684142, "grad_norm": 0.824217253107322, "learning_rate": 3.6909927399460942e-06, "loss": 1.5336, "step": 940 }, { "epoch": 0.7506702412868632, "grad_norm": 0.8142575339219603, "learning_rate": 3.5839424906347274e-06, "loss": 1.5092, "step": 945 }, { "epoch": 0.7546420415053123, "grad_norm": 0.807677248718189, "learning_rate": 3.4781276429809153e-06, "loss": 1.5314, "step": 950 }, { "epoch": 0.7586138417237613, "grad_norm": 0.8089021775387019, "learning_rate": 3.3735685714650925e-06, "loss": 1.5235, "step": 955 }, { "epoch": 0.7625856419422103, "grad_norm": 0.8366198862903579, "learning_rate": 3.270285408769991e-06, "loss": 1.5381, "step": 960 }, { "epoch": 0.7665574421606594, "grad_norm": 0.8405851483984087, "learning_rate": 3.168298041904141e-06, "loss": 1.5217, "step": 965 }, { "epoch": 0.7705292423791084, "grad_norm": 0.8520055208027894, "learning_rate": 3.0676261083726466e-06, "loss": 1.5293, "step": 970 }, { "epoch": 0.7745010425975574, "grad_norm": 0.8355631433834037, "learning_rate": 2.968288992396009e-06, "loss": 1.5132, "step": 975 }, { "epoch": 0.7784728428160064, "grad_norm": 0.8291513738741879, "learning_rate": 2.870305821177747e-06, "loss": 1.5268, "step": 980 }, { "epoch": 0.7824446430344554, "grad_norm": 0.8378530571179096, "learning_rate": 2.773695461221464e-06, "loss": 1.5098, "step": 985 }, { "epoch": 0.7864164432529044, "grad_norm": 0.8199379802303441, "learning_rate": 2.678476514698146e-06, "loss": 1.5431, "step": 990 }, { "epoch": 0.7903882434713534, "grad_norm": 0.8149428301792468, "learning_rate": 2.584667315864334e-06, "loss": 1.5524, "step": 995 }, { "epoch": 0.7943600436898024, "grad_norm": 0.8127178564027231, "learning_rate": 2.492285927531893e-06, "loss": 1.5246, "step": 1000 }, { "epoch": 0.7983318439082514, "grad_norm": 0.8296085136845848, "learning_rate": 2.4013501375900604e-06, "loss": 1.5428, "step": 1005 }, { "epoch": 0.8023036441267004, "grad_norm": 0.8331298630227156, "learning_rate": 2.3118774555803915e-06, "loss": 1.5073, "step": 1010 }, { "epoch": 0.8062754443451494, "grad_norm": 0.8084677789049869, "learning_rate": 2.2238851093253476e-06, "loss": 1.518, "step": 1015 }, { "epoch": 0.8102472445635984, "grad_norm": 0.7995439438308841, "learning_rate": 2.1373900416110973e-06, "loss": 1.5272, "step": 1020 }, { "epoch": 0.8142190447820474, "grad_norm": 0.8236450629839644, "learning_rate": 2.0524089069252106e-06, "loss": 1.5028, "step": 1025 }, { "epoch": 0.8181908450004964, "grad_norm": 0.8275359166427305, "learning_rate": 1.9689580682498553e-06, "loss": 1.5268, "step": 1030 }, { "epoch": 0.8221626452189454, "grad_norm": 0.8081276364031936, "learning_rate": 1.887053593911149e-06, "loss": 1.5427, "step": 1035 }, { "epoch": 0.8261344454373944, "grad_norm": 0.8237710475989976, "learning_rate": 1.806711254485215e-06, "loss": 1.5389, "step": 1040 }, { "epoch": 0.8301062456558436, "grad_norm": 0.8384266531879961, "learning_rate": 1.727946519761583e-06, "loss": 1.5015, "step": 1045 }, { "epoch": 0.8340780458742926, "grad_norm": 0.8364585808120162, "learning_rate": 1.6507745557645127e-06, "loss": 1.5009, "step": 1050 }, { "epoch": 0.8380498460927416, "grad_norm": 0.8244749141974579, "learning_rate": 1.575210221832799e-06, "loss": 1.525, "step": 1055 }, { "epoch": 0.8420216463111906, "grad_norm": 0.8040790359125747, "learning_rate": 1.5012680677586222e-06, "loss": 1.5134, "step": 1060 }, { "epoch": 0.8459934465296396, "grad_norm": 0.7975576330551882, "learning_rate": 1.4432918921243055e-06, "loss": 1.5128, "step": 1065 }, { "epoch": 0.8499652467480886, "grad_norm": 0.7987138186718241, "learning_rate": 1.3723053285030463e-06, "loss": 1.5146, "step": 1070 }, { "epoch": 0.8539370469665376, "grad_norm": 0.8041473257969093, "learning_rate": 1.3029800137534632e-06, "loss": 1.56, "step": 1075 }, { "epoch": 0.8579088471849866, "grad_norm": 0.8017665713122328, "learning_rate": 1.235329296354526e-06, "loss": 1.5104, "step": 1080 }, { "epoch": 0.8618806474034356, "grad_norm": 0.8106278967681714, "learning_rate": 1.1693662023441577e-06, "loss": 1.5272, "step": 1085 }, { "epoch": 0.8658524476218846, "grad_norm": 0.8102683680589345, "learning_rate": 1.1051034328110776e-06, "loss": 1.5276, "step": 1090 }, { "epoch": 0.8698242478403336, "grad_norm": 0.8245536236761711, "learning_rate": 1.0425533614492412e-06, "loss": 1.5436, "step": 1095 }, { "epoch": 0.8737960480587826, "grad_norm": 0.8040355908066084, "learning_rate": 9.817280321752898e-07, "loss": 1.5007, "step": 1100 }, { "epoch": 0.8777678482772316, "grad_norm": 0.8123936825847906, "learning_rate": 9.226391568095306e-07, "loss": 1.5176, "step": 1105 }, { "epoch": 0.8817396484956807, "grad_norm": 0.7956886909023395, "learning_rate": 8.652981128208315e-07, "loss": 1.5135, "step": 1110 }, { "epoch": 0.8857114487141297, "grad_norm": 0.8198537899032718, "learning_rate": 8.097159411359135e-07, "loss": 1.5309, "step": 1115 }, { "epoch": 0.8896832489325787, "grad_norm": 0.782162948876607, "learning_rate": 7.559033440134311e-07, "loss": 1.5287, "step": 1120 }, { "epoch": 0.8936550491510277, "grad_norm": 0.8040565507060439, "learning_rate": 7.038706829832808e-07, "loss": 1.519, "step": 1125 }, { "epoch": 0.8976268493694767, "grad_norm": 0.8259422263740301, "learning_rate": 6.536279768514952e-07, "loss": 1.5137, "step": 1130 }, { "epoch": 0.9015986495879257, "grad_norm": 0.8095546356558041, "learning_rate": 6.051848997711395e-07, "loss": 1.5288, "step": 1135 }, { "epoch": 0.9055704498063747, "grad_norm": 0.8093720744796288, "learning_rate": 5.585507793795763e-07, "loss": 1.5212, "step": 1140 }, { "epoch": 0.9095422500248238, "grad_norm": 0.8077656938263478, "learning_rate": 5.137345950024309e-07, "loss": 1.4942, "step": 1145 }, { "epoch": 0.9135140502432728, "grad_norm": 0.8113816503121841, "learning_rate": 4.7074497592465074e-07, "loss": 1.5361, "step": 1150 }, { "epoch": 0.9174858504617218, "grad_norm": 0.813231569130407, "learning_rate": 4.2959019972893644e-07, "loss": 1.5306, "step": 1155 }, { "epoch": 0.9214576506801708, "grad_norm": 0.8099017485840051, "learning_rate": 3.9027819070191706e-07, "loss": 1.5137, "step": 1160 }, { "epoch": 0.9254294508986198, "grad_norm": 0.7881148123811212, "learning_rate": 3.5281651830833987e-07, "loss": 1.5193, "step": 1165 }, { "epoch": 0.9294012511170688, "grad_norm": 0.8097066087342274, "learning_rate": 3.1721239573357264e-07, "loss": 1.53, "step": 1170 }, { "epoch": 0.9333730513355178, "grad_norm": 0.7843276174810709, "learning_rate": 2.834726784947273e-07, "loss": 1.5407, "step": 1175 }, { "epoch": 0.9373448515539669, "grad_norm": 0.8036612590725093, "learning_rate": 2.5160386312063855e-07, "loss": 1.5143, "step": 1180 }, { "epoch": 0.9413166517724159, "grad_norm": 0.7981854209700985, "learning_rate": 2.2161208590096407e-07, "loss": 1.517, "step": 1185 }, { "epoch": 0.9452884519908649, "grad_norm": 0.7979530312793264, "learning_rate": 1.9350312170465234e-07, "loss": 1.5233, "step": 1190 }, { "epoch": 0.9492602522093139, "grad_norm": 0.8079703791008118, "learning_rate": 1.672823828680037e-07, "loss": 1.5193, "step": 1195 }, { "epoch": 0.9532320524277629, "grad_norm": 0.8283052457474909, "learning_rate": 1.4295491815253138e-07, "loss": 1.5306, "step": 1200 }, { "epoch": 0.9572038526462119, "grad_norm": 0.8204454900067728, "learning_rate": 1.205254117728316e-07, "loss": 1.5186, "step": 1205 }, { "epoch": 0.9611756528646609, "grad_norm": 0.8209623860337765, "learning_rate": 9.999818249464389e-08, "loss": 1.5262, "step": 1210 }, { "epoch": 0.9651474530831099, "grad_norm": 0.8116412111952241, "learning_rate": 8.137718280328166e-08, "loss": 1.5243, "step": 1215 }, { "epoch": 0.9691192533015589, "grad_norm": 0.7858348914688461, "learning_rate": 6.46659981425879e-08, "loss": 1.5169, "step": 1220 }, { "epoch": 0.9730910535200079, "grad_norm": 0.7965100492008761, "learning_rate": 4.9867846224559423e-08, "loss": 1.5238, "step": 1225 }, { "epoch": 0.9770628537384569, "grad_norm": 0.7942704227422176, "learning_rate": 3.6985576409787064e-08, "loss": 1.5131, "step": 1230 }, { "epoch": 0.9810346539569059, "grad_norm": 0.8182170297859925, "learning_rate": 2.6021669158811104e-08, "loss": 1.5142, "step": 1235 }, { "epoch": 0.9850064541753549, "grad_norm": 0.7890859967844505, "learning_rate": 1.697823555451561e-08, "loss": 1.5156, "step": 1240 }, { "epoch": 0.988978254393804, "grad_norm": 0.792384402297532, "learning_rate": 9.857016895642446e-09, "loss": 1.5025, "step": 1245 }, { "epoch": 0.992950054612253, "grad_norm": 0.7972540802101755, "learning_rate": 4.6593843615050374e-09, "loss": 1.5062, "step": 1250 }, { "epoch": 0.9969218548307021, "grad_norm": 0.7796868385026204, "learning_rate": 1.386338747972893e-09, "loss": 1.526, "step": 1255 }, { "epoch": 0.9993049349617714, "eval_loss": 1.5326974391937256, "eval_runtime": 248.3752, "eval_samples_per_second": 107.676, "eval_steps_per_second": 4.489, "step": 1258 }, { "epoch": 0.9993049349617714, "step": 1258, "total_flos": 106140763422720.0, "train_loss": 1.5477893754295022, "train_runtime": 7899.7649, "train_samples_per_second": 30.594, "train_steps_per_second": 0.159 } ], "logging_steps": 5, "max_steps": 1258, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 106140763422720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }