{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.47914153807761095, "eval_steps": 3000, "global_step": 10800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002218247861470421, "grad_norm": 0.15810145437717438, "learning_rate": 2e-05, "loss": 2.8627, "step": 50 }, { "epoch": 0.004436495722940842, "grad_norm": 0.1590433567762375, "learning_rate": 4e-05, "loss": 2.8607, "step": 100 }, { "epoch": 0.006654743584411263, "grad_norm": 0.15798641741275787, "learning_rate": 6e-05, "loss": 2.8623, "step": 150 }, { "epoch": 0.008872991445881684, "grad_norm": 0.16127805411815643, "learning_rate": 8e-05, "loss": 2.8608, "step": 200 }, { "epoch": 0.011091239307352105, "grad_norm": 0.1587396264076233, "learning_rate": 0.0001, "loss": 2.8608, "step": 250 }, { "epoch": 0.013309487168822525, "grad_norm": 0.160736083984375, "learning_rate": 0.00012, "loss": 2.8563, "step": 300 }, { "epoch": 0.015527735030292948, "grad_norm": 0.16256989538669586, "learning_rate": 0.00014, "loss": 2.8549, "step": 350 }, { "epoch": 0.01774598289176337, "grad_norm": 0.16194568574428558, "learning_rate": 0.00016, "loss": 2.8557, "step": 400 }, { "epoch": 0.01996423075323379, "grad_norm": 0.15836463868618011, "learning_rate": 0.00018, "loss": 2.8545, "step": 450 }, { "epoch": 0.02218247861470421, "grad_norm": 0.16059577465057373, "learning_rate": 0.0002, "loss": 2.8522, "step": 500 }, { "epoch": 0.024400726476174632, "grad_norm": 0.16031378507614136, "learning_rate": 0.00022000000000000003, "loss": 2.8481, "step": 550 }, { "epoch": 0.02661897433764505, "grad_norm": 0.16000501811504364, "learning_rate": 0.00024, "loss": 2.8431, "step": 600 }, { "epoch": 0.028837222199115473, "grad_norm": 0.15952646732330322, "learning_rate": 0.00026000000000000003, "loss": 2.8475, "step": 650 }, { "epoch": 0.031055470060585896, "grad_norm": 0.16443726420402527, "learning_rate": 0.00028, "loss": 2.8452, "step": 700 }, { "epoch": 0.033273717922056315, "grad_norm": 0.1644088476896286, "learning_rate": 0.00030000000000000003, "loss": 2.8458, "step": 750 }, { "epoch": 0.03549196578352674, "grad_norm": 0.16272033751010895, "learning_rate": 0.00032, "loss": 2.8435, "step": 800 }, { "epoch": 0.03771021364499716, "grad_norm": 0.16485804319381714, "learning_rate": 0.00034, "loss": 2.8481, "step": 850 }, { "epoch": 0.03992846150646758, "grad_norm": 0.1669188290834427, "learning_rate": 0.00036, "loss": 2.8555, "step": 900 }, { "epoch": 0.042146709367938, "grad_norm": 0.16288943588733673, "learning_rate": 0.00038, "loss": 2.851, "step": 950 }, { "epoch": 0.04436495722940842, "grad_norm": 0.1651136726140976, "learning_rate": 0.0004, "loss": 2.8443, "step": 1000 }, { "epoch": 0.04658320509087884, "grad_norm": 0.16190673410892487, "learning_rate": 0.00039999468202328424, "loss": 2.8398, "step": 1050 }, { "epoch": 0.048801452952349264, "grad_norm": 0.1649934947490692, "learning_rate": 0.00039997872837594555, "loss": 2.8371, "step": 1100 }, { "epoch": 0.051019700813819686, "grad_norm": 0.16184477508068085, "learning_rate": 0.00039995213990639536, "loss": 2.8347, "step": 1150 }, { "epoch": 0.0532379486752901, "grad_norm": 0.1629864126443863, "learning_rate": 0.0003999149180286022, "loss": 2.834, "step": 1200 }, { "epoch": 0.055456196536760524, "grad_norm": 0.1627526730298996, "learning_rate": 0.00039986706472201685, "loss": 2.8309, "step": 1250 }, { "epoch": 0.05767444439823095, "grad_norm": 0.1642647087574005, "learning_rate": 0.000399808582531467, "loss": 2.8352, "step": 1300 }, { "epoch": 0.05989269225970137, "grad_norm": 0.16397783160209656, "learning_rate": 0.000399739474567022, "loss": 2.8317, "step": 1350 }, { "epoch": 0.06211094012117179, "grad_norm": 0.16319701075553894, "learning_rate": 0.00039965974450382726, "loss": 2.8322, "step": 1400 }, { "epoch": 0.06432918798264221, "grad_norm": 0.16067005693912506, "learning_rate": 0.000399569396581909, "loss": 2.8279, "step": 1450 }, { "epoch": 0.06654743584411263, "grad_norm": 0.16118553280830383, "learning_rate": 0.00039946843560594866, "loss": 2.8323, "step": 1500 }, { "epoch": 0.06876568370558306, "grad_norm": 0.16291728615760803, "learning_rate": 0.0003993568669450274, "loss": 2.8301, "step": 1550 }, { "epoch": 0.07098393156705347, "grad_norm": 0.1590035855770111, "learning_rate": 0.0003992346965323407, "loss": 2.8214, "step": 1600 }, { "epoch": 0.07320217942852389, "grad_norm": 0.16236472129821777, "learning_rate": 0.00039910193086488253, "loss": 2.8242, "step": 1650 }, { "epoch": 0.07542042728999432, "grad_norm": 0.1617489606142044, "learning_rate": 0.0003989585770031003, "loss": 2.8231, "step": 1700 }, { "epoch": 0.07763867515146473, "grad_norm": 0.15960238873958588, "learning_rate": 0.000398804642570519, "loss": 2.8248, "step": 1750 }, { "epoch": 0.07985692301293516, "grad_norm": 0.16391754150390625, "learning_rate": 0.0003986401357533358, "loss": 2.8222, "step": 1800 }, { "epoch": 0.08207517087440558, "grad_norm": 0.16161847114562988, "learning_rate": 0.000398465065299985, "loss": 2.8153, "step": 1850 }, { "epoch": 0.084293418735876, "grad_norm": 0.16447125375270844, "learning_rate": 0.00039827944052067265, "loss": 2.818, "step": 1900 }, { "epoch": 0.08651166659734642, "grad_norm": 0.16384591162204742, "learning_rate": 0.0003980832712868812, "loss": 2.8093, "step": 1950 }, { "epoch": 0.08872991445881684, "grad_norm": 0.16317427158355713, "learning_rate": 0.0003978765680308447, "loss": 2.8113, "step": 2000 }, { "epoch": 0.09094816232028727, "grad_norm": 0.16197824478149414, "learning_rate": 0.00039765934174499436, "loss": 2.8134, "step": 2050 }, { "epoch": 0.09316641018175768, "grad_norm": 0.16196754574775696, "learning_rate": 0.00039743160398137344, "loss": 2.8147, "step": 2100 }, { "epoch": 0.0953846580432281, "grad_norm": 0.16696424782276154, "learning_rate": 0.00039719336685102314, "loss": 2.811, "step": 2150 }, { "epoch": 0.09760290590469853, "grad_norm": 0.16266262531280518, "learning_rate": 0.0003969446430233386, "loss": 2.8103, "step": 2200 }, { "epoch": 0.09982115376616894, "grad_norm": 0.16161397099494934, "learning_rate": 0.0003966854457253951, "loss": 2.8017, "step": 2250 }, { "epoch": 0.10203940162763937, "grad_norm": 0.1631053388118744, "learning_rate": 0.0003964157887412445, "loss": 2.8034, "step": 2300 }, { "epoch": 0.10425764948910979, "grad_norm": 0.16185788810253143, "learning_rate": 0.00039613568641118255, "loss": 2.8027, "step": 2350 }, { "epoch": 0.1064758973505802, "grad_norm": 0.16428661346435547, "learning_rate": 0.00039584515363098584, "loss": 2.8031, "step": 2400 }, { "epoch": 0.10869414521205063, "grad_norm": 0.1625480055809021, "learning_rate": 0.00039554420585112, "loss": 2.7968, "step": 2450 }, { "epoch": 0.11091239307352105, "grad_norm": 0.1638619303703308, "learning_rate": 0.0003952328590759179, "loss": 2.8007, "step": 2500 }, { "epoch": 0.11313064093499148, "grad_norm": 0.16504357755184174, "learning_rate": 0.0003949111298627286, "loss": 2.7921, "step": 2550 }, { "epoch": 0.1153488887964619, "grad_norm": 0.16429375112056732, "learning_rate": 0.0003945790353210367, "loss": 2.7951, "step": 2600 }, { "epoch": 0.11756713665793232, "grad_norm": 0.166097030043602, "learning_rate": 0.0003942365931115526, "loss": 2.7948, "step": 2650 }, { "epoch": 0.11978538451940274, "grad_norm": 0.16275139153003693, "learning_rate": 0.0003938838214452733, "loss": 2.79, "step": 2700 }, { "epoch": 0.12200363238087315, "grad_norm": 0.16379590332508087, "learning_rate": 0.0003935207390825137, "loss": 2.7896, "step": 2750 }, { "epoch": 0.12422188024234358, "grad_norm": 0.16332408785820007, "learning_rate": 0.0003931473653319095, "loss": 2.7848, "step": 2800 }, { "epoch": 0.126440128103814, "grad_norm": 0.16235879063606262, "learning_rate": 0.00039276372004938987, "loss": 2.7836, "step": 2850 }, { "epoch": 0.12865837596528443, "grad_norm": 0.1654053032398224, "learning_rate": 0.00039236982363712145, "loss": 2.7845, "step": 2900 }, { "epoch": 0.13087662382675483, "grad_norm": 0.16393068432807922, "learning_rate": 0.00039196569704242376, "loss": 2.7796, "step": 2950 }, { "epoch": 0.13309487168822526, "grad_norm": 0.16517628729343414, "learning_rate": 0.0003915513617566551, "loss": 2.7738, "step": 3000 }, { "epoch": 0.13309487168822526, "eval_accuracy": 0.4247227650219834, "eval_loss": 2.8996665477752686, "eval_runtime": 243.2366, "eval_samples_per_second": 8.222, "eval_steps_per_second": 1.028, "step": 3000 }, { "epoch": 0.1353131195496957, "grad_norm": 0.16540038585662842, "learning_rate": 0.00039112683981406936, "loss": 2.7708, "step": 3050 }, { "epoch": 0.13753136741116612, "grad_norm": 0.16403205692768097, "learning_rate": 0.00039069215379064465, "loss": 2.7709, "step": 3100 }, { "epoch": 0.13974961527263652, "grad_norm": 0.16498889029026031, "learning_rate": 0.0003902473268028826, "loss": 2.7683, "step": 3150 }, { "epoch": 0.14196786313410695, "grad_norm": 0.16713927686214447, "learning_rate": 0.00038979238250657863, "loss": 2.7578, "step": 3200 }, { "epoch": 0.14418611099557738, "grad_norm": 0.16905058920383453, "learning_rate": 0.00038932734509556467, "loss": 2.7602, "step": 3250 }, { "epoch": 0.14640435885704778, "grad_norm": 0.16431044042110443, "learning_rate": 0.0003888522393004219, "loss": 2.7685, "step": 3300 }, { "epoch": 0.1486226067185182, "grad_norm": 0.163705512881279, "learning_rate": 0.00038836709038716583, "loss": 2.8434, "step": 3350 }, { "epoch": 0.15084085457998864, "grad_norm": 0.1622520387172699, "learning_rate": 0.0003878719241559027, "loss": 2.8349, "step": 3400 }, { "epoch": 0.15305910244145907, "grad_norm": 0.16072827577590942, "learning_rate": 0.00038736676693945746, "loss": 2.8369, "step": 3450 }, { "epoch": 0.15527735030292947, "grad_norm": 0.16206832230091095, "learning_rate": 0.0003868516456019733, "loss": 2.8404, "step": 3500 }, { "epoch": 0.1574955981643999, "grad_norm": 0.16148249804973602, "learning_rate": 0.0003863265875374829, "loss": 2.836, "step": 3550 }, { "epoch": 0.15971384602587033, "grad_norm": 0.16401226818561554, "learning_rate": 0.0003857916206684519, "loss": 2.8369, "step": 3600 }, { "epoch": 0.16193209388734073, "grad_norm": 0.15987250208854675, "learning_rate": 0.00038524677344429386, "loss": 2.8363, "step": 3650 }, { "epoch": 0.16415034174881116, "grad_norm": 0.16117645800113678, "learning_rate": 0.00038469207483985725, "loss": 2.8426, "step": 3700 }, { "epoch": 0.1663685896102816, "grad_norm": 0.16374363005161285, "learning_rate": 0.00038412755435388474, "loss": 2.8416, "step": 3750 }, { "epoch": 0.168586837471752, "grad_norm": 0.16495129466056824, "learning_rate": 0.0003835532420074444, "loss": 2.8396, "step": 3800 }, { "epoch": 0.17080508533322242, "grad_norm": 0.16315814852714539, "learning_rate": 0.0003829691683423329, "loss": 2.8358, "step": 3850 }, { "epoch": 0.17302333319469285, "grad_norm": 0.16098596155643463, "learning_rate": 0.00038237536441945193, "loss": 2.8354, "step": 3900 }, { "epoch": 0.17524158105616328, "grad_norm": 0.16226187348365784, "learning_rate": 0.00038177186181715577, "loss": 2.8352, "step": 3950 }, { "epoch": 0.17745982891763368, "grad_norm": 0.15939714014530182, "learning_rate": 0.00038115869262957233, "loss": 2.835, "step": 4000 }, { "epoch": 0.1796780767791041, "grad_norm": 0.1622256189584732, "learning_rate": 0.00038053588946489615, "loss": 2.8391, "step": 4050 }, { "epoch": 0.18189632464057454, "grad_norm": 0.16176052391529083, "learning_rate": 0.0003799034854436545, "loss": 2.8371, "step": 4100 }, { "epoch": 0.18411457250204494, "grad_norm": 0.16316720843315125, "learning_rate": 0.0003792615141969462, "loss": 2.8365, "step": 4150 }, { "epoch": 0.18633282036351537, "grad_norm": 0.16207610070705414, "learning_rate": 0.0003786100098646524, "loss": 2.8346, "step": 4200 }, { "epoch": 0.1885510682249858, "grad_norm": 0.1638234406709671, "learning_rate": 0.000377949007093622, "loss": 2.8319, "step": 4250 }, { "epoch": 0.1907693160864562, "grad_norm": 0.1628599315881729, "learning_rate": 0.0003772785410358283, "loss": 2.8369, "step": 4300 }, { "epoch": 0.19298756394792663, "grad_norm": 0.1653933823108673, "learning_rate": 0.00037659864734650026, "loss": 2.8304, "step": 4350 }, { "epoch": 0.19520581180939706, "grad_norm": 0.16370812058448792, "learning_rate": 0.0003759093621822259, "loss": 2.8369, "step": 4400 }, { "epoch": 0.19742405967086749, "grad_norm": 0.1629050225019455, "learning_rate": 0.0003752107221990298, "loss": 2.8339, "step": 4450 }, { "epoch": 0.1996423075323379, "grad_norm": 0.1610899269580841, "learning_rate": 0.00037450276455042354, "loss": 2.829, "step": 4500 }, { "epoch": 0.20186055539380832, "grad_norm": 0.1629941165447235, "learning_rate": 0.00037378552688543005, "loss": 2.8351, "step": 4550 }, { "epoch": 0.20407880325527875, "grad_norm": 0.16439735889434814, "learning_rate": 0.0003730590473465814, "loss": 2.8316, "step": 4600 }, { "epoch": 0.20629705111674915, "grad_norm": 0.16572241485118866, "learning_rate": 0.00037232336456789023, "loss": 2.8335, "step": 4650 }, { "epoch": 0.20851529897821958, "grad_norm": 0.16380038857460022, "learning_rate": 0.00037157851767279543, "loss": 2.8286, "step": 4700 }, { "epoch": 0.21073354683969, "grad_norm": 0.16284549236297607, "learning_rate": 0.00037082454627208156, "loss": 2.8301, "step": 4750 }, { "epoch": 0.2129517947011604, "grad_norm": 0.16597482562065125, "learning_rate": 0.0003700614904617721, "loss": 2.8323, "step": 4800 }, { "epoch": 0.21517004256263084, "grad_norm": 0.16378666460514069, "learning_rate": 0.0003692893908209973, "loss": 2.8299, "step": 4850 }, { "epoch": 0.21738829042410127, "grad_norm": 0.1630343496799469, "learning_rate": 0.0003685082884098363, "loss": 2.8333, "step": 4900 }, { "epoch": 0.2196065382855717, "grad_norm": 0.16490814089775085, "learning_rate": 0.00036771822476713346, "loss": 2.8307, "step": 4950 }, { "epoch": 0.2218247861470421, "grad_norm": 0.1655721366405487, "learning_rate": 0.00036691924190828935, "loss": 2.8301, "step": 5000 }, { "epoch": 0.22404303400851253, "grad_norm": 0.16776245832443237, "learning_rate": 0.0003661113823230264, "loss": 2.8228, "step": 5050 }, { "epoch": 0.22626128186998296, "grad_norm": 0.1626349687576294, "learning_rate": 0.00036529468897312926, "loss": 2.8262, "step": 5100 }, { "epoch": 0.22847952973145336, "grad_norm": 0.16331753134727478, "learning_rate": 0.00036446920529016, "loss": 2.8282, "step": 5150 }, { "epoch": 0.2306977775929238, "grad_norm": 0.1676001250743866, "learning_rate": 0.00036363497517314877, "loss": 2.8313, "step": 5200 }, { "epoch": 0.23291602545439422, "grad_norm": 0.16441357135772705, "learning_rate": 0.000362792042986259, "loss": 2.8278, "step": 5250 }, { "epoch": 0.23513427331586464, "grad_norm": 0.16601622104644775, "learning_rate": 0.000361940453556428, "loss": 2.8303, "step": 5300 }, { "epoch": 0.23735252117733505, "grad_norm": 0.1679011583328247, "learning_rate": 0.0003610802521709833, "loss": 2.8252, "step": 5350 }, { "epoch": 0.23957076903880548, "grad_norm": 0.1650955229997635, "learning_rate": 0.0003602114845752345, "loss": 2.8299, "step": 5400 }, { "epoch": 0.2417890169002759, "grad_norm": 0.16651777923107147, "learning_rate": 0.00035933419697004, "loss": 2.832, "step": 5450 }, { "epoch": 0.2440072647617463, "grad_norm": 0.166709303855896, "learning_rate": 0.00035844843600935024, "loss": 2.8262, "step": 5500 }, { "epoch": 0.24622551262321674, "grad_norm": 0.16586416959762573, "learning_rate": 0.000357554248797727, "loss": 2.8255, "step": 5550 }, { "epoch": 0.24844376048468717, "grad_norm": 0.1647614985704422, "learning_rate": 0.00035665168288783795, "loss": 2.8298, "step": 5600 }, { "epoch": 0.2506620083461576, "grad_norm": 0.16310204565525055, "learning_rate": 0.000355740786277928, "loss": 2.8273, "step": 5650 }, { "epoch": 0.252880256207628, "grad_norm": 0.1629767119884491, "learning_rate": 0.00035482160740926683, "loss": 2.8231, "step": 5700 }, { "epoch": 0.2550985040690984, "grad_norm": 0.16427451372146606, "learning_rate": 0.00035389419516357253, "loss": 2.8188, "step": 5750 }, { "epoch": 0.25731675193056885, "grad_norm": 0.1655891388654709, "learning_rate": 0.0003529585988604125, "loss": 2.8258, "step": 5800 }, { "epoch": 0.25953499979203926, "grad_norm": 0.16402335464954376, "learning_rate": 0.0003520148682545803, "loss": 2.8254, "step": 5850 }, { "epoch": 0.26175324765350966, "grad_norm": 0.1638861894607544, "learning_rate": 0.0003510630535334497, "loss": 2.8298, "step": 5900 }, { "epoch": 0.2639714955149801, "grad_norm": 0.16864845156669617, "learning_rate": 0.0003501032053143061, "loss": 2.8238, "step": 5950 }, { "epoch": 0.2661897433764505, "grad_norm": 0.16578635573387146, "learning_rate": 0.0003491353746416541, "loss": 2.8225, "step": 6000 }, { "epoch": 0.2661897433764505, "eval_accuracy": 0.4264626282364436, "eval_loss": 2.8843319416046143, "eval_runtime": 242.3694, "eval_samples_per_second": 8.252, "eval_steps_per_second": 1.031, "step": 6000 }, { "epoch": 0.268407991237921, "grad_norm": 0.16673897206783295, "learning_rate": 0.00034815961298450377, "loss": 2.823, "step": 6050 }, { "epoch": 0.2706262390993914, "grad_norm": 0.16588376462459564, "learning_rate": 0.0003471759722336326, "loss": 2.8193, "step": 6100 }, { "epoch": 0.2728444869608618, "grad_norm": 0.16813361644744873, "learning_rate": 0.00034618450469882687, "loss": 2.8267, "step": 6150 }, { "epoch": 0.27506273482233223, "grad_norm": 0.16656942665576935, "learning_rate": 0.0003451852631060991, "loss": 2.8219, "step": 6200 }, { "epoch": 0.27728098268380263, "grad_norm": 0.1666443794965744, "learning_rate": 0.0003441783005948846, "loss": 2.8233, "step": 6250 }, { "epoch": 0.27949923054527304, "grad_norm": 0.1673704832792282, "learning_rate": 0.0003431636707152152, "loss": 2.824, "step": 6300 }, { "epoch": 0.2817174784067435, "grad_norm": 0.16707104444503784, "learning_rate": 0.00034214142742487177, "loss": 2.8221, "step": 6350 }, { "epoch": 0.2839357262682139, "grad_norm": 0.16775397956371307, "learning_rate": 0.0003411116250865143, "loss": 2.8234, "step": 6400 }, { "epoch": 0.2861539741296843, "grad_norm": 0.16813720762729645, "learning_rate": 0.0003400743184647915, "loss": 2.8258, "step": 6450 }, { "epoch": 0.28837222199115475, "grad_norm": 0.16362161934375763, "learning_rate": 0.00033902956272342783, "loss": 2.8232, "step": 6500 }, { "epoch": 0.29059046985262516, "grad_norm": 0.16950780153274536, "learning_rate": 0.00033797741342229054, "loss": 2.821, "step": 6550 }, { "epoch": 0.29280871771409556, "grad_norm": 0.1657160073518753, "learning_rate": 0.00033691792651443435, "loss": 2.8181, "step": 6600 }, { "epoch": 0.295026965575566, "grad_norm": 0.1689310073852539, "learning_rate": 0.0003358511583431264, "loss": 2.8257, "step": 6650 }, { "epoch": 0.2972452134370364, "grad_norm": 0.16674135625362396, "learning_rate": 0.00033477716563884956, "loss": 2.8209, "step": 6700 }, { "epoch": 0.2994634612985068, "grad_norm": 0.16600748896598816, "learning_rate": 0.00033369600551628586, "loss": 2.8227, "step": 6750 }, { "epoch": 0.3016817091599773, "grad_norm": 0.16666853427886963, "learning_rate": 0.0003326077354712789, "loss": 2.8199, "step": 6800 }, { "epoch": 0.3038999570214477, "grad_norm": 0.1671936959028244, "learning_rate": 0.00033151241337777624, "loss": 2.82, "step": 6850 }, { "epoch": 0.30611820488291813, "grad_norm": 0.1675061583518982, "learning_rate": 0.00033041009748475166, "loss": 2.8246, "step": 6900 }, { "epoch": 0.30833645274438853, "grad_norm": 0.16512750089168549, "learning_rate": 0.0003293008464131079, "loss": 2.8178, "step": 6950 }, { "epoch": 0.31055470060585894, "grad_norm": 0.1670486181974411, "learning_rate": 0.0003281847191525585, "loss": 2.8185, "step": 7000 }, { "epoch": 0.3127729484673294, "grad_norm": 0.1692744940519333, "learning_rate": 0.0003270617750584913, "loss": 2.8184, "step": 7050 }, { "epoch": 0.3149911963287998, "grad_norm": 0.16573506593704224, "learning_rate": 0.0003259320738488119, "loss": 2.823, "step": 7100 }, { "epoch": 0.3172094441902702, "grad_norm": 0.17004618048667908, "learning_rate": 0.00032479567560076745, "loss": 2.8174, "step": 7150 }, { "epoch": 0.31942769205174065, "grad_norm": 0.16867642104625702, "learning_rate": 0.00032365264074775223, "loss": 2.8183, "step": 7200 }, { "epoch": 0.32164593991321105, "grad_norm": 0.16543437540531158, "learning_rate": 0.00032250303007609366, "loss": 2.8178, "step": 7250 }, { "epoch": 0.32386418777468146, "grad_norm": 0.16606374084949493, "learning_rate": 0.0003213469047218194, "loss": 2.8182, "step": 7300 }, { "epoch": 0.3260824356361519, "grad_norm": 0.1708928942680359, "learning_rate": 0.0003201843261674067, "loss": 2.8194, "step": 7350 }, { "epoch": 0.3283006834976223, "grad_norm": 0.16661237180233002, "learning_rate": 0.00031901535623851245, "loss": 2.8226, "step": 7400 }, { "epoch": 0.3305189313590927, "grad_norm": 0.16710756719112396, "learning_rate": 0.0003178400571006852, "loss": 2.8187, "step": 7450 }, { "epoch": 0.3327371792205632, "grad_norm": 0.16679760813713074, "learning_rate": 0.00031665849125605937, "loss": 2.8163, "step": 7500 }, { "epoch": 0.3349554270820336, "grad_norm": 0.16872857511043549, "learning_rate": 0.00031547072154003154, "loss": 2.8147, "step": 7550 }, { "epoch": 0.337173674943504, "grad_norm": 0.1672954261302948, "learning_rate": 0.0003142768111179187, "loss": 2.8167, "step": 7600 }, { "epoch": 0.33939192280497443, "grad_norm": 0.16654394567012787, "learning_rate": 0.00031307682348159907, "loss": 2.816, "step": 7650 }, { "epoch": 0.34161017066644483, "grad_norm": 0.16810841858386993, "learning_rate": 0.00031187082244613567, "loss": 2.8139, "step": 7700 }, { "epoch": 0.34382841852791524, "grad_norm": 0.1682497262954712, "learning_rate": 0.00031065887214638284, "loss": 2.8157, "step": 7750 }, { "epoch": 0.3460466663893857, "grad_norm": 0.17154847085475922, "learning_rate": 0.00030944103703357524, "loss": 2.8143, "step": 7800 }, { "epoch": 0.3482649142508561, "grad_norm": 0.16658836603164673, "learning_rate": 0.00030821738187190075, "loss": 2.8143, "step": 7850 }, { "epoch": 0.35048316211232655, "grad_norm": 0.16820305585861206, "learning_rate": 0.00030698797173505586, "loss": 2.8157, "step": 7900 }, { "epoch": 0.35270140997379695, "grad_norm": 0.16843385994434357, "learning_rate": 0.0003057528720027853, "loss": 2.8103, "step": 7950 }, { "epoch": 0.35491965783526735, "grad_norm": 0.17145898938179016, "learning_rate": 0.0003045121483574054, "loss": 2.8161, "step": 8000 }, { "epoch": 0.3571379056967378, "grad_norm": 0.1709701269865036, "learning_rate": 0.00030326586678031066, "loss": 2.8134, "step": 8050 }, { "epoch": 0.3593561535582082, "grad_norm": 0.16859866678714752, "learning_rate": 0.0003020140935484653, "loss": 2.818, "step": 8100 }, { "epoch": 0.3615744014196786, "grad_norm": 0.16738031804561615, "learning_rate": 0.00030075689523087804, "loss": 2.8128, "step": 8150 }, { "epoch": 0.36379264928114907, "grad_norm": 0.1693500131368637, "learning_rate": 0.00029949433868506293, "loss": 2.8138, "step": 8200 }, { "epoch": 0.3660108971426195, "grad_norm": 0.16915106773376465, "learning_rate": 0.00029822649105348294, "loss": 2.8209, "step": 8250 }, { "epoch": 0.3682291450040899, "grad_norm": 0.17108069360256195, "learning_rate": 0.00029695341975998006, "loss": 2.8174, "step": 8300 }, { "epoch": 0.37044739286556033, "grad_norm": 0.16659317910671234, "learning_rate": 0.00029567519250618907, "loss": 2.8153, "step": 8350 }, { "epoch": 0.37266564072703073, "grad_norm": 0.16678906977176666, "learning_rate": 0.0002943918772679379, "loss": 2.8163, "step": 8400 }, { "epoch": 0.37488388858850114, "grad_norm": 0.16928167641162872, "learning_rate": 0.00029310354229163197, "loss": 2.8165, "step": 8450 }, { "epoch": 0.3771021364499716, "grad_norm": 0.1695391833782196, "learning_rate": 0.0002918102560906254, "loss": 2.8197, "step": 8500 }, { "epoch": 0.379320384311442, "grad_norm": 0.17006346583366394, "learning_rate": 0.0002905120874415772, "loss": 2.8172, "step": 8550 }, { "epoch": 0.3815386321729124, "grad_norm": 0.16821132600307465, "learning_rate": 0.0002892091053807939, "loss": 2.8137, "step": 8600 }, { "epoch": 0.38375688003438285, "grad_norm": 0.17077401280403137, "learning_rate": 0.000287901379200558, "loss": 2.8174, "step": 8650 }, { "epoch": 0.38597512789585325, "grad_norm": 0.17006562650203705, "learning_rate": 0.0002865889784454435, "loss": 2.813, "step": 8700 }, { "epoch": 0.3881933757573237, "grad_norm": 0.16847462952136993, "learning_rate": 0.0002852719729086167, "loss": 2.8158, "step": 8750 }, { "epoch": 0.3904116236187941, "grad_norm": 0.16790613532066345, "learning_rate": 0.0002839504326281256, "loss": 2.816, "step": 8800 }, { "epoch": 0.3926298714802645, "grad_norm": 0.16898341476917267, "learning_rate": 0.00028262442788317446, "loss": 2.8143, "step": 8850 }, { "epoch": 0.39484811934173497, "grad_norm": 0.17099575698375702, "learning_rate": 0.00028129402919038695, "loss": 2.812, "step": 8900 }, { "epoch": 0.3970663672032054, "grad_norm": 0.17063932120800018, "learning_rate": 0.00027995930730005577, "loss": 2.815, "step": 8950 }, { "epoch": 0.3992846150646758, "grad_norm": 0.1704034060239792, "learning_rate": 0.00027862033319238025, "loss": 2.8144, "step": 9000 }, { "epoch": 0.3992846150646758, "eval_accuracy": 0.42786541279921836, "eval_loss": 2.8759515285491943, "eval_runtime": 250.6732, "eval_samples_per_second": 7.979, "eval_steps_per_second": 0.997, "step": 9000 }, { "epoch": 0.40150286292614623, "grad_norm": 0.1675969511270523, "learning_rate": 0.0002772771780736917, "loss": 2.8128, "step": 9050 }, { "epoch": 0.40372111078761663, "grad_norm": 0.1697956621646881, "learning_rate": 0.0002759299133726665, "loss": 2.8121, "step": 9100 }, { "epoch": 0.40593935864908703, "grad_norm": 0.1710100620985031, "learning_rate": 0.00027457861073652785, "loss": 2.8156, "step": 9150 }, { "epoch": 0.4081576065105575, "grad_norm": 0.16877809166908264, "learning_rate": 0.00027322334202723527, "loss": 2.815, "step": 9200 }, { "epoch": 0.4103758543720279, "grad_norm": 0.17122440040111542, "learning_rate": 0.0002718641793176631, "loss": 2.8119, "step": 9250 }, { "epoch": 0.4125941022334983, "grad_norm": 0.16771045327186584, "learning_rate": 0.0002705011948877679, "loss": 2.808, "step": 9300 }, { "epoch": 0.41481235009496875, "grad_norm": 0.16941729187965393, "learning_rate": 0.0002691344612207442, "loss": 2.8121, "step": 9350 }, { "epoch": 0.41703059795643915, "grad_norm": 0.1719992607831955, "learning_rate": 0.00026776405099917014, "loss": 2.8094, "step": 9400 }, { "epoch": 0.41924884581790955, "grad_norm": 0.1693243533372879, "learning_rate": 0.00026639003710114223, "loss": 2.8103, "step": 9450 }, { "epoch": 0.42146709367938, "grad_norm": 0.17014500498771667, "learning_rate": 0.0002650124925963998, "loss": 2.8129, "step": 9500 }, { "epoch": 0.4236853415408504, "grad_norm": 0.1709510087966919, "learning_rate": 0.00026363149074243867, "loss": 2.8084, "step": 9550 }, { "epoch": 0.4259035894023208, "grad_norm": 0.16937118768692017, "learning_rate": 0.0002622471049806159, "loss": 2.814, "step": 9600 }, { "epoch": 0.42812183726379127, "grad_norm": 0.1713036149740219, "learning_rate": 0.00026085940893224403, "loss": 2.8162, "step": 9650 }, { "epoch": 0.4303400851252617, "grad_norm": 0.17020347714424133, "learning_rate": 0.0002594684763946758, "loss": 2.8116, "step": 9700 }, { "epoch": 0.43255833298673213, "grad_norm": 0.16786696016788483, "learning_rate": 0.0002580743813373796, "loss": 2.8111, "step": 9750 }, { "epoch": 0.43477658084820253, "grad_norm": 0.17273075878620148, "learning_rate": 0.00025667719789800606, "loss": 2.8131, "step": 9800 }, { "epoch": 0.43699482870967293, "grad_norm": 0.16986466944217682, "learning_rate": 0.00025527700037844515, "loss": 2.8139, "step": 9850 }, { "epoch": 0.4392130765711434, "grad_norm": 0.17129731178283691, "learning_rate": 0.00025387386324087494, "loss": 2.8125, "step": 9900 }, { "epoch": 0.4414313244326138, "grad_norm": 0.16890868544578552, "learning_rate": 0.00025246786110380163, "loss": 2.8142, "step": 9950 }, { "epoch": 0.4436495722940842, "grad_norm": 0.17167522013187408, "learning_rate": 0.00025105906873809154, "loss": 2.8142, "step": 10000 }, { "epoch": 0.44586782015555465, "grad_norm": 0.17136669158935547, "learning_rate": 0.0002496475610629947, "loss": 2.8112, "step": 10050 }, { "epoch": 0.44808606801702505, "grad_norm": 0.16926760971546173, "learning_rate": 0.00024823341314216056, "loss": 2.8156, "step": 10100 }, { "epoch": 0.45030431587849545, "grad_norm": 0.16898435354232788, "learning_rate": 0.00024681670017964627, "loss": 2.8079, "step": 10150 }, { "epoch": 0.4525225637399659, "grad_norm": 0.17237040400505066, "learning_rate": 0.0002453974975159173, "loss": 2.813, "step": 10200 }, { "epoch": 0.4547408116014363, "grad_norm": 0.16995486617088318, "learning_rate": 0.00024397588062384095, "loss": 2.8117, "step": 10250 }, { "epoch": 0.4569590594629067, "grad_norm": 0.17290563881397247, "learning_rate": 0.00024255192510467245, "loss": 2.8121, "step": 10300 }, { "epoch": 0.45917730732437717, "grad_norm": 0.17059782147407532, "learning_rate": 0.00024112570668403472, "loss": 2.8138, "step": 10350 }, { "epoch": 0.4613955551858476, "grad_norm": 0.17196382582187653, "learning_rate": 0.00023969730120789132, "loss": 2.8095, "step": 10400 }, { "epoch": 0.463613803047318, "grad_norm": 0.16942380368709564, "learning_rate": 0.00023826678463851285, "loss": 2.8124, "step": 10450 }, { "epoch": 0.46583205090878843, "grad_norm": 0.17288681864738464, "learning_rate": 0.00023683423305043749, "loss": 2.813, "step": 10500 }, { "epoch": 0.46805029877025883, "grad_norm": 0.17040428519248962, "learning_rate": 0.00023539972262642502, "loss": 2.8141, "step": 10550 }, { "epoch": 0.4702685466317293, "grad_norm": 0.17321184277534485, "learning_rate": 0.00023396332965340585, "loss": 2.8146, "step": 10600 }, { "epoch": 0.4724867944931997, "grad_norm": 0.17026926577091217, "learning_rate": 0.00023252513051842373, "loss": 2.8086, "step": 10650 }, { "epoch": 0.4747050423546701, "grad_norm": 0.1710352748632431, "learning_rate": 0.00023108520170457398, "loss": 2.8099, "step": 10700 }, { "epoch": 0.47692329021614055, "grad_norm": 0.17067080736160278, "learning_rate": 0.00022964361978693542, "loss": 2.8099, "step": 10750 }, { "epoch": 0.47914153807761095, "grad_norm": 0.17244164645671844, "learning_rate": 0.0002282004614284989, "loss": 2.8054, "step": 10800 } ], "logging_steps": 50, "max_steps": 22540, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.5864483085358727e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }