{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 761, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001314060446780552, "grad_norm": 0.20481063425540924, "learning_rate": 2.5974025974025976e-06, "loss": 1.4972, "step": 1 }, { "epoch": 0.006570302233902759, "grad_norm": 0.2685558497905731, "learning_rate": 1.2987012987012986e-05, "loss": 1.4328, "step": 5 }, { "epoch": 0.013140604467805518, "grad_norm": 0.15764696896076202, "learning_rate": 2.5974025974025972e-05, "loss": 1.4448, "step": 10 }, { "epoch": 0.01971090670170828, "grad_norm": 0.10870245844125748, "learning_rate": 3.8961038961038966e-05, "loss": 1.3786, "step": 15 }, { "epoch": 0.026281208935611037, "grad_norm": 0.12210144102573395, "learning_rate": 5.1948051948051944e-05, "loss": 1.3637, "step": 20 }, { "epoch": 0.0328515111695138, "grad_norm": 0.14269188046455383, "learning_rate": 6.493506493506494e-05, "loss": 1.3934, "step": 25 }, { "epoch": 0.03942181340341656, "grad_norm": 0.13970889151096344, "learning_rate": 7.792207792207793e-05, "loss": 1.3698, "step": 30 }, { "epoch": 0.045992115637319315, "grad_norm": 0.13595248758792877, "learning_rate": 9.090909090909092e-05, "loss": 1.3411, "step": 35 }, { "epoch": 0.052562417871222074, "grad_norm": 0.12064177542924881, "learning_rate": 0.00010389610389610389, "loss": 1.3198, "step": 40 }, { "epoch": 0.05913272010512484, "grad_norm": 0.13153237104415894, "learning_rate": 0.00011688311688311689, "loss": 1.3558, "step": 45 }, { "epoch": 0.0657030223390276, "grad_norm": 0.1267818659543991, "learning_rate": 0.00012987012987012987, "loss": 1.3438, "step": 50 }, { "epoch": 0.07227332457293036, "grad_norm": 0.14356698095798492, "learning_rate": 0.00014285714285714287, "loss": 1.2924, "step": 55 }, { "epoch": 0.07884362680683311, "grad_norm": 0.12211916595697403, "learning_rate": 0.00015584415584415587, "loss": 1.2536, "step": 60 }, { "epoch": 0.08541392904073587, "grad_norm": 0.12076783180236816, "learning_rate": 0.00016883116883116884, "loss": 1.2783, "step": 65 }, { "epoch": 0.09198423127463863, "grad_norm": 0.13078632950782776, "learning_rate": 0.00018181818181818183, "loss": 1.3032, "step": 70 }, { "epoch": 0.09855453350854139, "grad_norm": 0.13422036170959473, "learning_rate": 0.0001948051948051948, "loss": 1.2695, "step": 75 }, { "epoch": 0.10512483574244415, "grad_norm": 0.12621906399726868, "learning_rate": 0.00019999050722505993, "loss": 1.304, "step": 80 }, { "epoch": 0.1116951379763469, "grad_norm": 0.12420738488435745, "learning_rate": 0.00019993250234920636, "loss": 1.2315, "step": 85 }, { "epoch": 0.11826544021024968, "grad_norm": 0.13390915095806122, "learning_rate": 0.000199821796913812, "loss": 1.2631, "step": 90 }, { "epoch": 0.12483574244415244, "grad_norm": 0.12965141236782074, "learning_rate": 0.000199658449300667, "loss": 1.2757, "step": 95 }, { "epoch": 0.1314060446780552, "grad_norm": 0.12207651138305664, "learning_rate": 0.00019944254565302217, "loss": 1.2275, "step": 100 }, { "epoch": 0.13797634691195795, "grad_norm": 0.12977205216884613, "learning_rate": 0.00019917419983016025, "loss": 1.2648, "step": 105 }, { "epoch": 0.1445466491458607, "grad_norm": 0.11946344375610352, "learning_rate": 0.00019885355334735082, "loss": 1.2639, "step": 110 }, { "epoch": 0.15111695137976347, "grad_norm": 0.12259474396705627, "learning_rate": 0.00019848077530122083, "loss": 1.2436, "step": 115 }, { "epoch": 0.15768725361366623, "grad_norm": 0.12387403845787048, "learning_rate": 0.00019805606228057916, "loss": 1.2109, "step": 120 }, { "epoch": 0.164257555847569, "grad_norm": 0.12060413509607315, "learning_rate": 0.00019757963826274357, "loss": 1.2373, "step": 125 }, { "epoch": 0.17082785808147175, "grad_norm": 0.11977066099643707, "learning_rate": 0.00019705175449542358, "loss": 1.2104, "step": 130 }, { "epoch": 0.1773981603153745, "grad_norm": 0.11596687883138657, "learning_rate": 0.00019647268936422206, "loss": 1.2233, "step": 135 }, { "epoch": 0.18396846254927726, "grad_norm": 0.1179521232843399, "learning_rate": 0.0001958427482458253, "loss": 1.2743, "step": 140 }, { "epoch": 0.19053876478318002, "grad_norm": 0.12631933391094208, "learning_rate": 0.0001951622633469592, "loss": 1.2922, "step": 145 }, { "epoch": 0.19710906701708278, "grad_norm": 0.12004056572914124, "learning_rate": 0.00019443159352919623, "loss": 1.2317, "step": 150 }, { "epoch": 0.20367936925098554, "grad_norm": 0.1243131160736084, "learning_rate": 0.0001936511241197055, "loss": 1.2963, "step": 155 }, { "epoch": 0.2102496714848883, "grad_norm": 0.11765783280134201, "learning_rate": 0.00019282126670804614, "loss": 1.275, "step": 160 }, { "epoch": 0.21681997371879105, "grad_norm": 0.11682084202766418, "learning_rate": 0.0001919424589291108, "loss": 1.2308, "step": 165 }, { "epoch": 0.2233902759526938, "grad_norm": 0.12230474501848221, "learning_rate": 0.00019101516423233368, "loss": 1.2608, "step": 170 }, { "epoch": 0.22996057818659657, "grad_norm": 0.11786483973264694, "learning_rate": 0.00019003987163728535, "loss": 1.2516, "step": 175 }, { "epoch": 0.23653088042049936, "grad_norm": 0.11655290424823761, "learning_rate": 0.00018901709547578245, "loss": 1.2032, "step": 180 }, { "epoch": 0.24310118265440211, "grad_norm": 0.11199444532394409, "learning_rate": 0.0001879473751206489, "loss": 1.2352, "step": 185 }, { "epoch": 0.24967148488830487, "grad_norm": 0.12412846833467484, "learning_rate": 0.0001868312747012715, "loss": 1.2113, "step": 190 }, { "epoch": 0.25624178712220763, "grad_norm": 0.11748431622982025, "learning_rate": 0.00018566938280609966, "loss": 1.2238, "step": 195 }, { "epoch": 0.2628120893561104, "grad_norm": 0.12820030748844147, "learning_rate": 0.0001844623121722465, "loss": 1.2296, "step": 200 }, { "epoch": 0.26938239159001315, "grad_norm": 0.11599507927894592, "learning_rate": 0.00018321069936235503, "loss": 1.2363, "step": 205 }, { "epoch": 0.2759526938239159, "grad_norm": 0.13053999841213226, "learning_rate": 0.0001819152044288992, "loss": 1.2052, "step": 210 }, { "epoch": 0.28252299605781866, "grad_norm": 0.11889121681451797, "learning_rate": 0.00018057651056609784, "loss": 1.2022, "step": 215 }, { "epoch": 0.2890932982917214, "grad_norm": 0.1182330921292305, "learning_rate": 0.00017919532374962416, "loss": 1.2397, "step": 220 }, { "epoch": 0.2956636005256242, "grad_norm": 0.11528757214546204, "learning_rate": 0.0001777723723643014, "loss": 1.2275, "step": 225 }, { "epoch": 0.30223390275952694, "grad_norm": 0.12166588008403778, "learning_rate": 0.00017630840681998066, "loss": 1.2883, "step": 230 }, { "epoch": 0.3088042049934297, "grad_norm": 0.11569472402334213, "learning_rate": 0.00017480419915580356, "loss": 1.1974, "step": 235 }, { "epoch": 0.31537450722733246, "grad_norm": 0.1137891337275505, "learning_rate": 0.00017326054263305847, "loss": 1.2236, "step": 240 }, { "epoch": 0.3219448094612352, "grad_norm": 0.1182858943939209, "learning_rate": 0.00017167825131684513, "loss": 1.2156, "step": 245 }, { "epoch": 0.328515111695138, "grad_norm": 0.11363399028778076, "learning_rate": 0.00017005815964676787, "loss": 1.2398, "step": 250 }, { "epoch": 0.33508541392904073, "grad_norm": 0.11693256348371506, "learning_rate": 0.00016840112199688432, "loss": 1.2672, "step": 255 }, { "epoch": 0.3416557161629435, "grad_norm": 0.10189284384250641, "learning_rate": 0.00016670801222514134, "loss": 1.2125, "step": 260 }, { "epoch": 0.34822601839684625, "grad_norm": 0.11468140780925751, "learning_rate": 0.000164979723212536, "loss": 1.2938, "step": 265 }, { "epoch": 0.354796320630749, "grad_norm": 0.11262702941894531, "learning_rate": 0.00016321716639224434, "loss": 1.1908, "step": 270 }, { "epoch": 0.36136662286465177, "grad_norm": 0.11424372345209122, "learning_rate": 0.0001614212712689668, "loss": 1.2362, "step": 275 }, { "epoch": 0.3679369250985545, "grad_norm": 0.10987983644008636, "learning_rate": 0.00015959298492874288, "loss": 1.2357, "step": 280 }, { "epoch": 0.3745072273324573, "grad_norm": 0.1153380423784256, "learning_rate": 0.00015773327153949465, "loss": 1.2273, "step": 285 }, { "epoch": 0.38107752956636004, "grad_norm": 0.1153273954987526, "learning_rate": 0.0001558431118425614, "loss": 1.2516, "step": 290 }, { "epoch": 0.3876478318002628, "grad_norm": 0.1217270940542221, "learning_rate": 0.0001539235026354946, "loss": 1.2697, "step": 295 }, { "epoch": 0.39421813403416556, "grad_norm": 0.1220201700925827, "learning_rate": 0.00015197545624638504, "loss": 1.2384, "step": 300 }, { "epoch": 0.4007884362680683, "grad_norm": 0.10586865991353989, "learning_rate": 0.00015000000000000001, "loss": 1.1821, "step": 305 }, { "epoch": 0.4073587385019711, "grad_norm": 0.12206688523292542, "learning_rate": 0.00014799817567601157, "loss": 1.2562, "step": 310 }, { "epoch": 0.41392904073587383, "grad_norm": 0.1146216094493866, "learning_rate": 0.00014597103895960226, "loss": 1.2326, "step": 315 }, { "epoch": 0.4204993429697766, "grad_norm": 0.11471971124410629, "learning_rate": 0.00014391965888473703, "loss": 1.2526, "step": 320 }, { "epoch": 0.42706964520367935, "grad_norm": 0.11247406154870987, "learning_rate": 0.00014184511727039612, "loss": 1.2698, "step": 325 }, { "epoch": 0.4336399474375821, "grad_norm": 0.11814497411251068, "learning_rate": 0.00013974850815006503, "loss": 1.1875, "step": 330 }, { "epoch": 0.44021024967148487, "grad_norm": 0.11835132539272308, "learning_rate": 0.00013763093719478358, "loss": 1.2311, "step": 335 }, { "epoch": 0.4467805519053876, "grad_norm": 0.1311633437871933, "learning_rate": 0.00013549352113005728, "loss": 1.2106, "step": 340 }, { "epoch": 0.4533508541392904, "grad_norm": 0.12060287594795227, "learning_rate": 0.00013333738714693956, "loss": 1.1733, "step": 345 }, { "epoch": 0.45992115637319314, "grad_norm": 0.115968257188797, "learning_rate": 0.00013116367230759415, "loss": 1.1933, "step": 350 }, { "epoch": 0.4664914586070959, "grad_norm": 0.1167786568403244, "learning_rate": 0.0001289735229456525, "loss": 1.2408, "step": 355 }, { "epoch": 0.4730617608409987, "grad_norm": 0.1175895482301712, "learning_rate": 0.00012676809406168133, "loss": 1.2432, "step": 360 }, { "epoch": 0.47963206307490147, "grad_norm": 0.1053074523806572, "learning_rate": 0.00012454854871407994, "loss": 1.1954, "step": 365 }, { "epoch": 0.48620236530880423, "grad_norm": 0.1283629983663559, "learning_rate": 0.00012231605740572766, "loss": 1.23, "step": 370 }, { "epoch": 0.492772667542707, "grad_norm": 0.11125301569700241, "learning_rate": 0.00012007179746670592, "loss": 1.1961, "step": 375 }, { "epoch": 0.49934296977660975, "grad_norm": 0.12729716300964355, "learning_rate": 0.00011781695243341932, "loss": 1.2646, "step": 380 }, { "epoch": 0.5059132720105125, "grad_norm": 0.11228667199611664, "learning_rate": 0.00011555271142444433, "loss": 1.2453, "step": 385 }, { "epoch": 0.5124835742444153, "grad_norm": 0.11379769444465637, "learning_rate": 0.00011328026851343367, "loss": 1.2444, "step": 390 }, { "epoch": 0.519053876478318, "grad_norm": 0.11798006296157837, "learning_rate": 0.00011100082209940795, "loss": 1.2136, "step": 395 }, { "epoch": 0.5256241787122208, "grad_norm": 0.11399506032466888, "learning_rate": 0.00010871557427476583, "loss": 1.188, "step": 400 }, { "epoch": 0.5321944809461235, "grad_norm": 0.11886433511972427, "learning_rate": 0.00010642573019134703, "loss": 1.2405, "step": 405 }, { "epoch": 0.5387647831800263, "grad_norm": 0.11915737390518188, "learning_rate": 0.00010413249742488131, "loss": 1.2365, "step": 410 }, { "epoch": 0.545335085413929, "grad_norm": 0.1189686506986618, "learning_rate": 0.00010183708533815974, "loss": 1.2662, "step": 415 }, { "epoch": 0.5519053876478318, "grad_norm": 0.11237179487943649, "learning_rate": 9.954070444326293e-05, "loss": 1.2118, "step": 420 }, { "epoch": 0.5584756898817346, "grad_norm": 0.11251773685216904, "learning_rate": 9.724456576318381e-05, "loss": 1.2266, "step": 425 }, { "epoch": 0.5650459921156373, "grad_norm": 0.1128716990351677, "learning_rate": 9.49498801931804e-05, "loss": 1.2424, "step": 430 }, { "epoch": 0.5716162943495401, "grad_norm": 0.11235509812831879, "learning_rate": 9.265785786219647e-05, "loss": 1.1903, "step": 435 }, { "epoch": 0.5781865965834428, "grad_norm": 0.11456587165594101, "learning_rate": 9.036970749468584e-05, "loss": 1.168, "step": 440 }, { "epoch": 0.5847568988173456, "grad_norm": 0.11781252175569534, "learning_rate": 8.808663577317764e-05, "loss": 1.3052, "step": 445 }, { "epoch": 0.5913272010512484, "grad_norm": 0.1393735557794571, "learning_rate": 8.580984670191848e-05, "loss": 1.1935, "step": 450 }, { "epoch": 0.5978975032851511, "grad_norm": 0.11526591330766678, "learning_rate": 8.35405409719266e-05, "loss": 1.1925, "step": 455 }, { "epoch": 0.6044678055190539, "grad_norm": 0.11947058141231537, "learning_rate": 8.127991532779401e-05, "loss": 1.2224, "step": 460 }, { "epoch": 0.6110381077529566, "grad_norm": 0.11296655237674713, "learning_rate": 7.902916193656898e-05, "loss": 1.1984, "step": 465 }, { "epoch": 0.6176084099868594, "grad_norm": 0.11227685958147049, "learning_rate": 7.678946775905324e-05, "loss": 1.1633, "step": 470 }, { "epoch": 0.6241787122207622, "grad_norm": 0.11681170761585236, "learning_rate": 7.456201392384436e-05, "loss": 1.2832, "step": 475 }, { "epoch": 0.6307490144546649, "grad_norm": 0.11576228588819504, "learning_rate": 7.234797510445411e-05, "loss": 1.1869, "step": 480 }, { "epoch": 0.6373193166885677, "grad_norm": 0.11033356189727783, "learning_rate": 7.014851889983057e-05, "loss": 1.2293, "step": 485 }, { "epoch": 0.6438896189224704, "grad_norm": 0.12529700994491577, "learning_rate": 6.79648052186115e-05, "loss": 1.2044, "step": 490 }, { "epoch": 0.6504599211563732, "grad_norm": 0.11405730992555618, "learning_rate": 6.579798566743314e-05, "loss": 1.2509, "step": 495 }, { "epoch": 0.657030223390276, "grad_norm": 0.11706886440515518, "learning_rate": 6.3649202943617e-05, "loss": 1.1843, "step": 500 }, { "epoch": 0.6636005256241787, "grad_norm": 0.11757861822843552, "learning_rate": 6.151959023255545e-05, "loss": 1.1951, "step": 505 }, { "epoch": 0.6701708278580815, "grad_norm": 0.1282133162021637, "learning_rate": 5.941027061011303e-05, "loss": 1.1663, "step": 510 }, { "epoch": 0.6767411300919842, "grad_norm": 0.11773235350847244, "learning_rate": 5.732235645035964e-05, "loss": 1.1371, "step": 515 }, { "epoch": 0.683311432325887, "grad_norm": 0.1168578639626503, "learning_rate": 5.52569488389472e-05, "loss": 1.2345, "step": 520 }, { "epoch": 0.6898817345597897, "grad_norm": 0.14125367999076843, "learning_rate": 5.321513699243924e-05, "loss": 1.2284, "step": 525 }, { "epoch": 0.6964520367936925, "grad_norm": 0.11345361173152924, "learning_rate": 5.1197997683900214e-05, "loss": 1.2032, "step": 530 }, { "epoch": 0.7030223390275953, "grad_norm": 0.11965469270944595, "learning_rate": 4.920659467504659e-05, "loss": 1.2142, "step": 535 }, { "epoch": 0.709592641261498, "grad_norm": 0.11439390480518341, "learning_rate": 4.7241978155259925e-05, "loss": 1.2083, "step": 540 }, { "epoch": 0.7161629434954008, "grad_norm": 0.11237988620996475, "learning_rate": 4.530518418775733e-05, "loss": 1.2324, "step": 545 }, { "epoch": 0.7227332457293035, "grad_norm": 0.1146211326122284, "learning_rate": 4.3397234163211483e-05, "loss": 1.1968, "step": 550 }, { "epoch": 0.7293035479632063, "grad_norm": 0.11229883134365082, "learning_rate": 4.151913426110864e-05, "loss": 1.2254, "step": 555 }, { "epoch": 0.735873850197109, "grad_norm": 0.11867476254701614, "learning_rate": 3.967187491912813e-05, "loss": 1.1785, "step": 560 }, { "epoch": 0.7424441524310118, "grad_norm": 0.1171465814113617, "learning_rate": 3.7856430310823545e-05, "loss": 1.1859, "step": 565 }, { "epoch": 0.7490144546649146, "grad_norm": 0.11713968962430954, "learning_rate": 3.607375783188125e-05, "loss": 1.2049, "step": 570 }, { "epoch": 0.7555847568988173, "grad_norm": 0.11092919111251831, "learning_rate": 3.4324797595226565e-05, "loss": 1.2334, "step": 575 }, { "epoch": 0.7621550591327201, "grad_norm": 0.11737989634275436, "learning_rate": 3.261047193524439e-05, "loss": 1.2249, "step": 580 }, { "epoch": 0.7687253613666228, "grad_norm": 0.11774250119924545, "learning_rate": 3.093168492137557e-05, "loss": 1.2134, "step": 585 }, { "epoch": 0.7752956636005256, "grad_norm": 0.11658301949501038, "learning_rate": 2.9289321881345254e-05, "loss": 1.2749, "step": 590 }, { "epoch": 0.7818659658344284, "grad_norm": 0.11728201061487198, "learning_rate": 2.7684248934275325e-05, "loss": 1.2288, "step": 595 }, { "epoch": 0.7884362680683311, "grad_norm": 0.11490175873041153, "learning_rate": 2.6117312533926362e-05, "loss": 1.1958, "step": 600 }, { "epoch": 0.7950065703022339, "grad_norm": 0.12242759019136429, "learning_rate": 2.4589339022310386e-05, "loss": 1.2922, "step": 605 }, { "epoch": 0.8015768725361366, "grad_norm": 0.11296830326318741, "learning_rate": 2.3101134193910024e-05, "loss": 1.1516, "step": 610 }, { "epoch": 0.8081471747700394, "grad_norm": 0.11919889599084854, "learning_rate": 2.165348287073339e-05, "loss": 1.1924, "step": 615 }, { "epoch": 0.8147174770039421, "grad_norm": 0.14579623937606812, "learning_rate": 2.02471484884291e-05, "loss": 1.2701, "step": 620 }, { "epoch": 0.8212877792378449, "grad_norm": 0.13354499638080597, "learning_rate": 1.888287269367979e-05, "loss": 1.1735, "step": 625 }, { "epoch": 0.8278580814717477, "grad_norm": 0.12233620136976242, "learning_rate": 1.756137495308594e-05, "loss": 1.2578, "step": 630 }, { "epoch": 0.8344283837056504, "grad_norm": 0.11252877116203308, "learning_rate": 1.6283352173747145e-05, "loss": 1.1774, "step": 635 }, { "epoch": 0.8409986859395532, "grad_norm": 0.145619198679924, "learning_rate": 1.5049478335739886e-05, "loss": 1.1973, "step": 640 }, { "epoch": 0.8475689881734559, "grad_norm": 0.17927619814872742, "learning_rate": 1.3860404136686411e-05, "loss": 1.237, "step": 645 }, { "epoch": 0.8541392904073587, "grad_norm": 0.1277463138103485, "learning_rate": 1.2716756648601857e-05, "loss": 1.1906, "step": 650 }, { "epoch": 0.8607095926412615, "grad_norm": 0.11752137541770935, "learning_rate": 1.1619138987200562e-05, "loss": 1.2196, "step": 655 }, { "epoch": 0.8672798948751642, "grad_norm": 0.1263829469680786, "learning_rate": 1.056812999383604e-05, "loss": 1.2017, "step": 660 }, { "epoch": 0.873850197109067, "grad_norm": 0.11384302377700806, "learning_rate": 9.564283930242257e-06, "loss": 1.2104, "step": 665 }, { "epoch": 0.8804204993429697, "grad_norm": 0.12480375170707703, "learning_rate": 8.608130186237329e-06, "loss": 1.2602, "step": 670 }, { "epoch": 0.8869908015768725, "grad_norm": 0.1103396862745285, "learning_rate": 7.700173000543742e-06, "loss": 1.2786, "step": 675 }, { "epoch": 0.8935611038107752, "grad_norm": 0.11293721199035645, "learning_rate": 6.840891194872112e-06, "loss": 1.182, "step": 680 }, { "epoch": 0.900131406044678, "grad_norm": 0.11655032634735107, "learning_rate": 6.030737921409169e-06, "loss": 1.2231, "step": 685 }, { "epoch": 0.9067017082785808, "grad_norm": 0.11646699905395508, "learning_rate": 5.270140423842607e-06, "loss": 1.2144, "step": 690 }, { "epoch": 0.9132720105124835, "grad_norm": 0.1152067482471466, "learning_rate": 4.559499812049251e-06, "loss": 1.2346, "step": 695 }, { "epoch": 0.9198423127463863, "grad_norm": 0.1144418865442276, "learning_rate": 3.899190850565115e-06, "loss": 1.1944, "step": 700 }, { "epoch": 0.926412614980289, "grad_norm": 0.12021008878946304, "learning_rate": 3.2895617609489336e-06, "loss": 1.2049, "step": 705 }, { "epoch": 0.9329829172141918, "grad_norm": 0.11006554961204529, "learning_rate": 2.730934038143607e-06, "loss": 1.1771, "step": 710 }, { "epoch": 0.9395532194480947, "grad_norm": 0.12349164485931396, "learning_rate": 2.22360228093208e-06, "loss": 1.2575, "step": 715 }, { "epoch": 0.9461235216819974, "grad_norm": 0.11703501641750336, "learning_rate": 1.7678340365772206e-06, "loss": 1.2332, "step": 720 }, { "epoch": 0.9526938239159002, "grad_norm": 0.12201967090368271, "learning_rate": 1.3638696597277679e-06, "loss": 1.2461, "step": 725 }, { "epoch": 0.9592641261498029, "grad_norm": 0.10865501314401627, "learning_rate": 1.0119221856644712e-06, "loss": 1.2051, "step": 730 }, { "epoch": 0.9658344283837057, "grad_norm": 0.1180211752653122, "learning_rate": 7.121772179535135e-07, "loss": 1.222, "step": 735 }, { "epoch": 0.9724047306176085, "grad_norm": 0.11063376069068909, "learning_rate": 4.647928305662852e-07, "loss": 1.1723, "step": 740 }, { "epoch": 0.9789750328515112, "grad_norm": 0.1198456659913063, "learning_rate": 2.6989948451726643e-07, "loss": 1.2553, "step": 745 }, { "epoch": 0.985545335085414, "grad_norm": 0.1216738149523735, "learning_rate": 1.2759995906392874e-07, "loss": 1.238, "step": 750 }, { "epoch": 0.9921156373193167, "grad_norm": 0.11744906008243561, "learning_rate": 3.796929750485845e-08, "loss": 1.1762, "step": 755 }, { "epoch": 0.9986859395532195, "grad_norm": 0.12220453470945358, "learning_rate": 1.0547676048688892e-09, "loss": 1.2372, "step": 760 }, { "epoch": 1.0, "eval_loss": 1.2166202068328857, "eval_runtime": 482.0207, "eval_samples_per_second": 27.918, "eval_steps_per_second": 1.747, "step": 761 }, { "epoch": 1.0, "step": 761, "total_flos": 4.2537508369596416e+17, "train_loss": 1.2369335692439536, "train_runtime": 2094.6914, "train_samples_per_second": 5.81, "train_steps_per_second": 0.363 } ], "logging_steps": 5, "max_steps": 761, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.2537508369596416e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }