{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 15630, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06397952655150352, "grad_norm": 1.3913878202438354, "learning_rate": 0.0002980806142034549, "loss": 2.6653, "step": 100 }, { "epoch": 0.12795905310300704, "grad_norm": 1.3103582859039307, "learning_rate": 0.00029616122840690973, "loss": 2.292, "step": 200 }, { "epoch": 0.19193857965451055, "grad_norm": 1.3559526205062866, "learning_rate": 0.00029424184261036466, "loss": 2.2125, "step": 300 }, { "epoch": 0.2559181062060141, "grad_norm": 1.1802483797073364, "learning_rate": 0.00029232245681381954, "loss": 2.1921, "step": 400 }, { "epoch": 0.3198976327575176, "grad_norm": 1.3195720911026, "learning_rate": 0.0002904030710172744, "loss": 2.1725, "step": 500 }, { "epoch": 0.3838771593090211, "grad_norm": 1.3061714172363281, "learning_rate": 0.00028848368522072936, "loss": 2.1612, "step": 600 }, { "epoch": 0.44785668586052463, "grad_norm": 1.3104487657546997, "learning_rate": 0.00028656429942418423, "loss": 2.141, "step": 700 }, { "epoch": 0.5118362124120281, "grad_norm": 1.2008849382400513, "learning_rate": 0.0002846449136276391, "loss": 2.1019, "step": 800 }, { "epoch": 0.5758157389635317, "grad_norm": 1.0927375555038452, "learning_rate": 0.000282725527831094, "loss": 2.0864, "step": 900 }, { "epoch": 0.6397952655150352, "grad_norm": 1.2209320068359375, "learning_rate": 0.0002808061420345489, "loss": 2.1023, "step": 1000 }, { "epoch": 0.7037747920665387, "grad_norm": 1.180240511894226, "learning_rate": 0.0002788867562380038, "loss": 2.0917, "step": 1100 }, { "epoch": 0.7677543186180422, "grad_norm": 1.134225845336914, "learning_rate": 0.0002769673704414587, "loss": 2.0871, "step": 1200 }, { "epoch": 0.8317338451695457, "grad_norm": 1.1944783926010132, "learning_rate": 0.0002750479846449136, "loss": 2.0509, "step": 1300 }, { "epoch": 0.8957133717210493, "grad_norm": 1.3126178979873657, "learning_rate": 0.0002731285988483685, "loss": 2.0499, "step": 1400 }, { "epoch": 0.9596928982725528, "grad_norm": 1.114277720451355, "learning_rate": 0.0002712092130518234, "loss": 2.035, "step": 1500 }, { "epoch": 1.0236724248240563, "grad_norm": 1.2892206907272339, "learning_rate": 0.0002692898272552783, "loss": 1.9784, "step": 1600 }, { "epoch": 1.0876519513755598, "grad_norm": 1.3515739440917969, "learning_rate": 0.0002673704414587332, "loss": 1.9109, "step": 1700 }, { "epoch": 1.1516314779270633, "grad_norm": 1.367629885673523, "learning_rate": 0.00026545105566218806, "loss": 1.947, "step": 1800 }, { "epoch": 1.2156110044785668, "grad_norm": 1.246567726135254, "learning_rate": 0.00026353166986564294, "loss": 1.92, "step": 1900 }, { "epoch": 1.2795905310300704, "grad_norm": 1.420345664024353, "learning_rate": 0.0002616122840690979, "loss": 1.9131, "step": 2000 }, { "epoch": 1.3435700575815739, "grad_norm": 1.4400845766067505, "learning_rate": 0.00025969289827255276, "loss": 1.9205, "step": 2100 }, { "epoch": 1.4075495841330774, "grad_norm": 1.2077809572219849, "learning_rate": 0.00025777351247600763, "loss": 1.9198, "step": 2200 }, { "epoch": 1.471529110684581, "grad_norm": 1.3657140731811523, "learning_rate": 0.00025585412667946257, "loss": 1.9221, "step": 2300 }, { "epoch": 1.5355086372360844, "grad_norm": 1.4031739234924316, "learning_rate": 0.00025393474088291745, "loss": 1.9228, "step": 2400 }, { "epoch": 1.599488163787588, "grad_norm": 1.3224505186080933, "learning_rate": 0.0002520153550863723, "loss": 1.9413, "step": 2500 }, { "epoch": 1.6634676903390915, "grad_norm": 1.4395138025283813, "learning_rate": 0.00025009596928982726, "loss": 1.9295, "step": 2600 }, { "epoch": 1.727447216890595, "grad_norm": 1.3267974853515625, "learning_rate": 0.00024817658349328214, "loss": 1.9122, "step": 2700 }, { "epoch": 1.7914267434420985, "grad_norm": 1.3772944211959839, "learning_rate": 0.000246257197696737, "loss": 1.9057, "step": 2800 }, { "epoch": 1.855406269993602, "grad_norm": 1.3348450660705566, "learning_rate": 0.0002443378119001919, "loss": 1.9048, "step": 2900 }, { "epoch": 1.9193857965451055, "grad_norm": 1.3936595916748047, "learning_rate": 0.0002424184261036468, "loss": 1.9272, "step": 3000 }, { "epoch": 1.983365323096609, "grad_norm": 1.3806579113006592, "learning_rate": 0.0002404990403071017, "loss": 1.929, "step": 3100 }, { "epoch": 2.0473448496481126, "grad_norm": 1.6080211400985718, "learning_rate": 0.00023857965451055659, "loss": 1.795, "step": 3200 }, { "epoch": 2.111324376199616, "grad_norm": 1.6093289852142334, "learning_rate": 0.0002366602687140115, "loss": 1.7608, "step": 3300 }, { "epoch": 2.1753039027511196, "grad_norm": 1.5963257551193237, "learning_rate": 0.00023474088291746637, "loss": 1.7549, "step": 3400 }, { "epoch": 2.239283429302623, "grad_norm": 1.5618189573287964, "learning_rate": 0.00023282149712092128, "loss": 1.7706, "step": 3500 }, { "epoch": 2.3032629558541267, "grad_norm": 1.5841214656829834, "learning_rate": 0.00023090211132437618, "loss": 1.7643, "step": 3600 }, { "epoch": 2.36724248240563, "grad_norm": 1.5369797945022583, "learning_rate": 0.00022898272552783106, "loss": 1.7765, "step": 3700 }, { "epoch": 2.4312220089571337, "grad_norm": 1.6150139570236206, "learning_rate": 0.00022706333973128597, "loss": 1.7698, "step": 3800 }, { "epoch": 2.495201535508637, "grad_norm": 1.6394102573394775, "learning_rate": 0.00022514395393474085, "loss": 1.7918, "step": 3900 }, { "epoch": 2.5591810620601407, "grad_norm": 1.5699377059936523, "learning_rate": 0.00022322456813819575, "loss": 1.7881, "step": 4000 }, { "epoch": 2.6231605886116443, "grad_norm": 1.5585079193115234, "learning_rate": 0.0002213243761996161, "loss": 1.7885, "step": 4100 }, { "epoch": 2.6871401151631478, "grad_norm": 1.5128830671310425, "learning_rate": 0.000219404990403071, "loss": 1.7878, "step": 4200 }, { "epoch": 2.7511196417146513, "grad_norm": 1.584663987159729, "learning_rate": 0.00021748560460652588, "loss": 1.8021, "step": 4300 }, { "epoch": 2.815099168266155, "grad_norm": 1.570940375328064, "learning_rate": 0.00021556621880998078, "loss": 1.7732, "step": 4400 }, { "epoch": 2.8790786948176583, "grad_norm": 1.6013740301132202, "learning_rate": 0.00021364683301343566, "loss": 1.7939, "step": 4500 }, { "epoch": 2.943058221369162, "grad_norm": 1.5516438484191895, "learning_rate": 0.00021172744721689057, "loss": 1.7894, "step": 4600 }, { "epoch": 3.0070377479206654, "grad_norm": 1.5310258865356445, "learning_rate": 0.00020980806142034547, "loss": 1.7542, "step": 4700 }, { "epoch": 3.071017274472169, "grad_norm": 1.7053136825561523, "learning_rate": 0.00020788867562380035, "loss": 1.5635, "step": 4800 }, { "epoch": 3.1349968010236724, "grad_norm": 1.8617521524429321, "learning_rate": 0.00020596928982725526, "loss": 1.6037, "step": 4900 }, { "epoch": 3.198976327575176, "grad_norm": 1.928120732307434, "learning_rate": 0.00020404990403071014, "loss": 1.6123, "step": 5000 }, { "epoch": 3.2629558541266794, "grad_norm": 1.8797775506973267, "learning_rate": 0.00020213051823416504, "loss": 1.6345, "step": 5100 }, { "epoch": 3.326935380678183, "grad_norm": 2.0564582347869873, "learning_rate": 0.00020021113243761995, "loss": 1.6234, "step": 5200 }, { "epoch": 3.3909149072296865, "grad_norm": 1.7450950145721436, "learning_rate": 0.00019829174664107483, "loss": 1.6299, "step": 5300 }, { "epoch": 3.45489443378119, "grad_norm": 1.975203514099121, "learning_rate": 0.00019637236084452974, "loss": 1.6572, "step": 5400 }, { "epoch": 3.5188739603326935, "grad_norm": 2.0146219730377197, "learning_rate": 0.00019445297504798461, "loss": 1.645, "step": 5500 }, { "epoch": 3.582853486884197, "grad_norm": 1.8149664402008057, "learning_rate": 0.00019253358925143952, "loss": 1.664, "step": 5600 }, { "epoch": 3.6468330134357005, "grad_norm": 1.8936662673950195, "learning_rate": 0.00019061420345489443, "loss": 1.6702, "step": 5700 }, { "epoch": 3.710812539987204, "grad_norm": 1.8857460021972656, "learning_rate": 0.0001886948176583493, "loss": 1.6703, "step": 5800 }, { "epoch": 3.7747920665387076, "grad_norm": 1.9245814085006714, "learning_rate": 0.0001867754318618042, "loss": 1.6834, "step": 5900 }, { "epoch": 3.838771593090211, "grad_norm": 1.8571895360946655, "learning_rate": 0.0001848560460652591, "loss": 1.6663, "step": 6000 }, { "epoch": 3.9027511196417146, "grad_norm": 1.8339492082595825, "learning_rate": 0.00018297504798464492, "loss": 1.6835, "step": 6100 }, { "epoch": 3.966730646193218, "grad_norm": 1.839705228805542, "learning_rate": 0.00018105566218809977, "loss": 1.6786, "step": 6200 }, { "epoch": 4.030710172744722, "grad_norm": 1.8872791528701782, "learning_rate": 0.00017913627639155468, "loss": 1.5791, "step": 6300 }, { "epoch": 4.094689699296225, "grad_norm": 2.0222487449645996, "learning_rate": 0.0001772168905950096, "loss": 1.4462, "step": 6400 }, { "epoch": 4.158669225847729, "grad_norm": 2.354410409927368, "learning_rate": 0.00017529750479846446, "loss": 1.476, "step": 6500 }, { "epoch": 4.222648752399232, "grad_norm": 2.227118968963623, "learning_rate": 0.00017337811900191937, "loss": 1.4802, "step": 6600 }, { "epoch": 4.286628278950736, "grad_norm": 2.2707509994506836, "learning_rate": 0.00017145873320537425, "loss": 1.4982, "step": 6700 }, { "epoch": 4.350607805502239, "grad_norm": 2.224393844604492, "learning_rate": 0.00016953934740882915, "loss": 1.5118, "step": 6800 }, { "epoch": 4.414587332053743, "grad_norm": 2.305133104324341, "learning_rate": 0.00016761996161228406, "loss": 1.5273, "step": 6900 }, { "epoch": 4.478566858605246, "grad_norm": 2.1448874473571777, "learning_rate": 0.00016570057581573894, "loss": 1.5241, "step": 7000 }, { "epoch": 4.54254638515675, "grad_norm": 2.392244577407837, "learning_rate": 0.00016378119001919384, "loss": 1.5195, "step": 7100 }, { "epoch": 4.606525911708253, "grad_norm": 2.300020456314087, "learning_rate": 0.00016186180422264872, "loss": 1.5219, "step": 7200 }, { "epoch": 4.670505438259757, "grad_norm": 2.117234230041504, "learning_rate": 0.00015994241842610363, "loss": 1.5345, "step": 7300 }, { "epoch": 4.73448496481126, "grad_norm": 2.0821878910064697, "learning_rate": 0.00015802303262955853, "loss": 1.5184, "step": 7400 }, { "epoch": 4.798464491362764, "grad_norm": 2.112713575363159, "learning_rate": 0.0001561036468330134, "loss": 1.5384, "step": 7500 }, { "epoch": 4.862444017914267, "grad_norm": 2.12935209274292, "learning_rate": 0.00015418426103646832, "loss": 1.55, "step": 7600 }, { "epoch": 4.926423544465771, "grad_norm": 2.2870752811431885, "learning_rate": 0.0001522648752399232, "loss": 1.5497, "step": 7700 }, { "epoch": 4.990403071017274, "grad_norm": 2.1214969158172607, "learning_rate": 0.0001503454894433781, "loss": 1.5619, "step": 7800 }, { "epoch": 5.054382597568778, "grad_norm": 2.3006513118743896, "learning_rate": 0.000148426103646833, "loss": 1.378, "step": 7900 }, { "epoch": 5.1183621241202815, "grad_norm": 2.307389259338379, "learning_rate": 0.0001465067178502879, "loss": 1.3495, "step": 8000 }, { "epoch": 5.182341650671785, "grad_norm": 2.3815455436706543, "learning_rate": 0.00014460652591170823, "loss": 1.3522, "step": 8100 }, { "epoch": 5.2463211772232885, "grad_norm": 2.3949851989746094, "learning_rate": 0.00014268714011516314, "loss": 1.3573, "step": 8200 }, { "epoch": 5.310300703774792, "grad_norm": 2.5533056259155273, "learning_rate": 0.00014076775431861804, "loss": 1.3681, "step": 8300 }, { "epoch": 5.3742802303262955, "grad_norm": 2.4939053058624268, "learning_rate": 0.00013884836852207292, "loss": 1.3961, "step": 8400 }, { "epoch": 5.438259756877799, "grad_norm": 2.5872833728790283, "learning_rate": 0.00013692898272552783, "loss": 1.3816, "step": 8500 }, { "epoch": 5.502239283429303, "grad_norm": 2.54913330078125, "learning_rate": 0.0001350095969289827, "loss": 1.3818, "step": 8600 }, { "epoch": 5.566218809980806, "grad_norm": 2.5345094203948975, "learning_rate": 0.0001330902111324376, "loss": 1.3996, "step": 8700 }, { "epoch": 5.63019833653231, "grad_norm": 2.3987200260162354, "learning_rate": 0.00013117082533589252, "loss": 1.4175, "step": 8800 }, { "epoch": 5.694177863083813, "grad_norm": 2.412726402282715, "learning_rate": 0.0001292514395393474, "loss": 1.4087, "step": 8900 }, { "epoch": 5.758157389635317, "grad_norm": 2.340156078338623, "learning_rate": 0.0001273320537428023, "loss": 1.4168, "step": 9000 }, { "epoch": 5.82213691618682, "grad_norm": 2.49423885345459, "learning_rate": 0.00012541266794625718, "loss": 1.4223, "step": 9100 }, { "epoch": 5.886116442738324, "grad_norm": 2.406388759613037, "learning_rate": 0.0001234932821497121, "loss": 1.4104, "step": 9200 }, { "epoch": 5.950095969289827, "grad_norm": 2.5181610584259033, "learning_rate": 0.00012157389635316698, "loss": 1.4153, "step": 9300 }, { "epoch": 6.014075495841331, "grad_norm": 2.5460805892944336, "learning_rate": 0.00011965451055662187, "loss": 1.3861, "step": 9400 }, { "epoch": 6.078055022392834, "grad_norm": 2.3237457275390625, "learning_rate": 0.00011773512476007676, "loss": 1.2058, "step": 9500 }, { "epoch": 6.142034548944338, "grad_norm": 3.00022554397583, "learning_rate": 0.00011581573896353166, "loss": 1.2308, "step": 9600 }, { "epoch": 6.206014075495841, "grad_norm": 2.6109488010406494, "learning_rate": 0.00011389635316698655, "loss": 1.23, "step": 9700 }, { "epoch": 6.269993602047345, "grad_norm": 2.581364393234253, "learning_rate": 0.00011197696737044146, "loss": 1.2524, "step": 9800 }, { "epoch": 6.333973128598848, "grad_norm": 3.0848801136016846, "learning_rate": 0.00011005758157389635, "loss": 1.2621, "step": 9900 }, { "epoch": 6.397952655150352, "grad_norm": 2.758669853210449, "learning_rate": 0.00010813819577735124, "loss": 1.2708, "step": 10000 }, { "epoch": 6.461932181701855, "grad_norm": 2.635228395462036, "learning_rate": 0.00010623800383877158, "loss": 1.264, "step": 10100 }, { "epoch": 6.525911708253359, "grad_norm": 2.6656181812286377, "learning_rate": 0.00010431861804222647, "loss": 1.2714, "step": 10200 }, { "epoch": 6.589891234804862, "grad_norm": 2.8166842460632324, "learning_rate": 0.00010239923224568137, "loss": 1.2688, "step": 10300 }, { "epoch": 6.653870761356366, "grad_norm": 2.9492697715759277, "learning_rate": 0.00010047984644913627, "loss": 1.2851, "step": 10400 }, { "epoch": 6.717850287907869, "grad_norm": 2.6956639289855957, "learning_rate": 9.856046065259116e-05, "loss": 1.2725, "step": 10500 }, { "epoch": 6.781829814459373, "grad_norm": 2.9519875049591064, "learning_rate": 9.664107485604606e-05, "loss": 1.2819, "step": 10600 }, { "epoch": 6.8458093410108765, "grad_norm": 3.0412800312042236, "learning_rate": 9.472168905950095e-05, "loss": 1.2827, "step": 10700 }, { "epoch": 6.90978886756238, "grad_norm": 2.9258697032928467, "learning_rate": 9.280230326295584e-05, "loss": 1.3168, "step": 10800 }, { "epoch": 6.9737683941138835, "grad_norm": 2.7609777450561523, "learning_rate": 9.088291746641075e-05, "loss": 1.2928, "step": 10900 }, { "epoch": 7.037747920665387, "grad_norm": 2.8244144916534424, "learning_rate": 8.896353166986564e-05, "loss": 1.1978, "step": 11000 }, { "epoch": 7.1017274472168905, "grad_norm": 2.602998971939087, "learning_rate": 8.704414587332053e-05, "loss": 1.1216, "step": 11100 }, { "epoch": 7.165706973768394, "grad_norm": 3.0034470558166504, "learning_rate": 8.512476007677542e-05, "loss": 1.118, "step": 11200 }, { "epoch": 7.229686500319898, "grad_norm": 2.9190104007720947, "learning_rate": 8.320537428023032e-05, "loss": 1.1329, "step": 11300 }, { "epoch": 7.293666026871401, "grad_norm": 2.9674625396728516, "learning_rate": 8.128598848368522e-05, "loss": 1.1161, "step": 11400 }, { "epoch": 7.357645553422905, "grad_norm": 2.8053972721099854, "learning_rate": 7.936660268714012e-05, "loss": 1.1276, "step": 11500 }, { "epoch": 7.421625079974408, "grad_norm": 3.0427043437957764, "learning_rate": 7.744721689059501e-05, "loss": 1.1491, "step": 11600 }, { "epoch": 7.485604606525912, "grad_norm": 2.797395706176758, "learning_rate": 7.55278310940499e-05, "loss": 1.147, "step": 11700 }, { "epoch": 7.549584133077415, "grad_norm": 3.1447503566741943, "learning_rate": 7.360844529750479e-05, "loss": 1.1619, "step": 11800 }, { "epoch": 7.613563659628919, "grad_norm": 2.775707483291626, "learning_rate": 7.168905950095969e-05, "loss": 1.1663, "step": 11900 }, { "epoch": 7.677543186180422, "grad_norm": 2.8020401000976562, "learning_rate": 6.976967370441458e-05, "loss": 1.1463, "step": 12000 }, { "epoch": 7.741522712731926, "grad_norm": 2.949965715408325, "learning_rate": 6.786948176583492e-05, "loss": 1.1642, "step": 12100 }, { "epoch": 7.805502239283429, "grad_norm": 3.1794426441192627, "learning_rate": 6.595009596928982e-05, "loss": 1.1794, "step": 12200 }, { "epoch": 7.869481765834933, "grad_norm": 2.9939463138580322, "learning_rate": 6.403071017274472e-05, "loss": 1.1599, "step": 12300 }, { "epoch": 7.933461292386436, "grad_norm": 2.9616475105285645, "learning_rate": 6.211132437619961e-05, "loss": 1.1723, "step": 12400 }, { "epoch": 7.99744081893794, "grad_norm": 2.9027538299560547, "learning_rate": 6.019193857965451e-05, "loss": 1.1658, "step": 12500 }, { "epoch": 8.061420345489443, "grad_norm": 2.4901044368743896, "learning_rate": 5.8272552783109394e-05, "loss": 1.0281, "step": 12600 }, { "epoch": 8.125399872040948, "grad_norm": 2.961620807647705, "learning_rate": 5.63531669865643e-05, "loss": 1.0274, "step": 12700 }, { "epoch": 8.18937939859245, "grad_norm": 2.9003262519836426, "learning_rate": 5.445297504798464e-05, "loss": 1.0273, "step": 12800 }, { "epoch": 8.253358925143955, "grad_norm": 2.755610704421997, "learning_rate": 5.253358925143953e-05, "loss": 1.0497, "step": 12900 }, { "epoch": 8.317338451695457, "grad_norm": 3.1691670417785645, "learning_rate": 5.0614203454894425e-05, "loss": 1.0219, "step": 13000 }, { "epoch": 8.381317978246962, "grad_norm": 2.919013738632202, "learning_rate": 4.8694817658349324e-05, "loss": 1.0288, "step": 13100 }, { "epoch": 8.445297504798464, "grad_norm": 2.7318263053894043, "learning_rate": 4.677543186180422e-05, "loss": 1.031, "step": 13200 }, { "epoch": 8.509277031349969, "grad_norm": 3.0156123638153076, "learning_rate": 4.4856046065259116e-05, "loss": 1.0343, "step": 13300 }, { "epoch": 8.573256557901471, "grad_norm": 3.454901695251465, "learning_rate": 4.293666026871401e-05, "loss": 1.0488, "step": 13400 }, { "epoch": 8.637236084452976, "grad_norm": 2.997831106185913, "learning_rate": 4.10172744721689e-05, "loss": 1.0418, "step": 13500 }, { "epoch": 8.701215611004478, "grad_norm": 2.9801297187805176, "learning_rate": 3.90978886756238e-05, "loss": 1.0546, "step": 13600 }, { "epoch": 8.765195137555983, "grad_norm": 2.8405773639678955, "learning_rate": 3.717850287907869e-05, "loss": 1.0525, "step": 13700 }, { "epoch": 8.829174664107486, "grad_norm": 2.915435552597046, "learning_rate": 3.5259117082533585e-05, "loss": 1.0496, "step": 13800 }, { "epoch": 8.89315419065899, "grad_norm": 3.022799015045166, "learning_rate": 3.333973128598848e-05, "loss": 1.0335, "step": 13900 }, { "epoch": 8.957133717210493, "grad_norm": 3.1525983810424805, "learning_rate": 3.142034548944338e-05, "loss": 1.0552, "step": 14000 }, { "epoch": 9.021113243761997, "grad_norm": 3.046138048171997, "learning_rate": 2.9500959692898273e-05, "loss": 1.0329, "step": 14100 }, { "epoch": 9.0850927703135, "grad_norm": 3.1273550987243652, "learning_rate": 2.7581573896353165e-05, "loss": 0.9363, "step": 14200 }, { "epoch": 9.149072296865004, "grad_norm": 3.0737597942352295, "learning_rate": 2.5662188099808057e-05, "loss": 0.9493, "step": 14300 }, { "epoch": 9.213051823416507, "grad_norm": 3.15415620803833, "learning_rate": 2.3742802303262953e-05, "loss": 0.9476, "step": 14400 }, { "epoch": 9.277031349968011, "grad_norm": 3.037520170211792, "learning_rate": 2.182341650671785e-05, "loss": 0.949, "step": 14500 }, { "epoch": 9.341010876519514, "grad_norm": 3.049983024597168, "learning_rate": 1.9904030710172745e-05, "loss": 0.9481, "step": 14600 }, { "epoch": 9.404990403071018, "grad_norm": 2.831089973449707, "learning_rate": 1.7984644913627637e-05, "loss": 0.9504, "step": 14700 }, { "epoch": 9.46896992962252, "grad_norm": 2.960285186767578, "learning_rate": 1.6065259117082533e-05, "loss": 0.9482, "step": 14800 }, { "epoch": 9.532949456174025, "grad_norm": 3.3185222148895264, "learning_rate": 1.4145873320537426e-05, "loss": 0.9486, "step": 14900 }, { "epoch": 9.596928982725528, "grad_norm": 2.8582775592803955, "learning_rate": 1.2226487523992321e-05, "loss": 0.9576, "step": 15000 }, { "epoch": 9.660908509277032, "grad_norm": 2.862841844558716, "learning_rate": 1.0307101727447216e-05, "loss": 0.9449, "step": 15100 }, { "epoch": 9.724888035828535, "grad_norm": 3.2306597232818604, "learning_rate": 8.387715930902111e-06, "loss": 0.9437, "step": 15200 }, { "epoch": 9.78886756238004, "grad_norm": 3.0353362560272217, "learning_rate": 6.468330134357005e-06, "loss": 0.9623, "step": 15300 }, { "epoch": 9.852847088931542, "grad_norm": 2.7248127460479736, "learning_rate": 4.5489443378119e-06, "loss": 0.9592, "step": 15400 }, { "epoch": 9.916826615483046, "grad_norm": 3.1457252502441406, "learning_rate": 2.6295585412667943e-06, "loss": 0.9516, "step": 15500 }, { "epoch": 9.980806142034549, "grad_norm": 3.1073553562164307, "learning_rate": 7.101727447216889e-07, "loss": 0.9634, "step": 15600 } ], "logging_steps": 100, "max_steps": 15630, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.43407558825345e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }