{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2166,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0004616805170821791,
      "grad_norm": 15.672144611335952,
      "learning_rate": 9.216589861751152e-08,
      "loss": 1.3168,
      "step": 1
    },
    {
      "epoch": 0.0023084025854108957,
      "grad_norm": 14.680930201512727,
      "learning_rate": 4.608294930875577e-07,
      "loss": 1.2513,
      "step": 5
    },
    {
      "epoch": 0.0046168051708217915,
      "grad_norm": 8.024031806354767,
      "learning_rate": 9.216589861751154e-07,
      "loss": 1.1965,
      "step": 10
    },
    {
      "epoch": 0.006925207756232687,
      "grad_norm": 5.48716759630272,
      "learning_rate": 1.382488479262673e-06,
      "loss": 1.0985,
      "step": 15
    },
    {
      "epoch": 0.009233610341643583,
      "grad_norm": 5.068857847099135,
      "learning_rate": 1.8433179723502307e-06,
      "loss": 1.0416,
      "step": 20
    },
    {
      "epoch": 0.011542012927054479,
      "grad_norm": 5.784416390233076,
      "learning_rate": 2.3041474654377884e-06,
      "loss": 1.0414,
      "step": 25
    },
    {
      "epoch": 0.013850415512465374,
      "grad_norm": 5.088059220204017,
      "learning_rate": 2.764976958525346e-06,
      "loss": 1.0977,
      "step": 30
    },
    {
      "epoch": 0.016158818097876268,
      "grad_norm": 4.831013532516624,
      "learning_rate": 3.225806451612903e-06,
      "loss": 1.0864,
      "step": 35
    },
    {
      "epoch": 0.018467220683287166,
      "grad_norm": 5.01522341202905,
      "learning_rate": 3.6866359447004615e-06,
      "loss": 1.0875,
      "step": 40
    },
    {
      "epoch": 0.02077562326869806,
      "grad_norm": 5.2743430965797495,
      "learning_rate": 4.147465437788019e-06,
      "loss": 1.0582,
      "step": 45
    },
    {
      "epoch": 0.023084025854108958,
      "grad_norm": 4.868797420697276,
      "learning_rate": 4.608294930875577e-06,
      "loss": 1.0706,
      "step": 50
    },
    {
      "epoch": 0.025392428439519853,
      "grad_norm": 4.866881384682284,
      "learning_rate": 5.0691244239631346e-06,
      "loss": 1.0874,
      "step": 55
    },
    {
      "epoch": 0.027700831024930747,
      "grad_norm": 4.942215298185941,
      "learning_rate": 5.529953917050692e-06,
      "loss": 1.0765,
      "step": 60
    },
    {
      "epoch": 0.030009233610341645,
      "grad_norm": 4.510624758005134,
      "learning_rate": 5.9907834101382485e-06,
      "loss": 1.0651,
      "step": 65
    },
    {
      "epoch": 0.032317636195752536,
      "grad_norm": 4.961160516804359,
      "learning_rate": 6.451612903225806e-06,
      "loss": 1.0826,
      "step": 70
    },
    {
      "epoch": 0.03462603878116344,
      "grad_norm": 4.999318871040395,
      "learning_rate": 6.912442396313365e-06,
      "loss": 1.0966,
      "step": 75
    },
    {
      "epoch": 0.03693444136657433,
      "grad_norm": 4.639315170945839,
      "learning_rate": 7.373271889400923e-06,
      "loss": 1.0934,
      "step": 80
    },
    {
      "epoch": 0.039242843951985226,
      "grad_norm": 4.79620290699333,
      "learning_rate": 7.83410138248848e-06,
      "loss": 1.0932,
      "step": 85
    },
    {
      "epoch": 0.04155124653739612,
      "grad_norm": 4.957993386602933,
      "learning_rate": 8.294930875576038e-06,
      "loss": 1.1032,
      "step": 90
    },
    {
      "epoch": 0.043859649122807015,
      "grad_norm": 4.669607524515842,
      "learning_rate": 8.755760368663595e-06,
      "loss": 1.0875,
      "step": 95
    },
    {
      "epoch": 0.046168051708217916,
      "grad_norm": 4.602861109332021,
      "learning_rate": 9.216589861751153e-06,
      "loss": 1.0809,
      "step": 100
    },
    {
      "epoch": 0.04847645429362881,
      "grad_norm": 4.548726172146522,
      "learning_rate": 9.67741935483871e-06,
      "loss": 1.1218,
      "step": 105
    },
    {
      "epoch": 0.050784856879039705,
      "grad_norm": 4.724149571099335,
      "learning_rate": 1.0138248847926269e-05,
      "loss": 1.1007,
      "step": 110
    },
    {
      "epoch": 0.0530932594644506,
      "grad_norm": 5.309010432204349,
      "learning_rate": 1.0599078341013826e-05,
      "loss": 1.1368,
      "step": 115
    },
    {
      "epoch": 0.055401662049861494,
      "grad_norm": 4.839305137997795,
      "learning_rate": 1.1059907834101385e-05,
      "loss": 1.1055,
      "step": 120
    },
    {
      "epoch": 0.05771006463527239,
      "grad_norm": 4.5796294161615565,
      "learning_rate": 1.152073732718894e-05,
      "loss": 1.1155,
      "step": 125
    },
    {
      "epoch": 0.06001846722068329,
      "grad_norm": 4.706959812240538,
      "learning_rate": 1.1981566820276497e-05,
      "loss": 1.1387,
      "step": 130
    },
    {
      "epoch": 0.062326869806094184,
      "grad_norm": 4.492165983938348,
      "learning_rate": 1.2442396313364056e-05,
      "loss": 1.1733,
      "step": 135
    },
    {
      "epoch": 0.06463527239150507,
      "grad_norm": 4.746032213098828,
      "learning_rate": 1.2903225806451613e-05,
      "loss": 1.1375,
      "step": 140
    },
    {
      "epoch": 0.06694367497691597,
      "grad_norm": 4.713817907356248,
      "learning_rate": 1.3364055299539171e-05,
      "loss": 1.158,
      "step": 145
    },
    {
      "epoch": 0.06925207756232687,
      "grad_norm": 4.342905646572964,
      "learning_rate": 1.382488479262673e-05,
      "loss": 1.1607,
      "step": 150
    },
    {
      "epoch": 0.07156048014773776,
      "grad_norm": 4.502102336400582,
      "learning_rate": 1.4285714285714287e-05,
      "loss": 1.1382,
      "step": 155
    },
    {
      "epoch": 0.07386888273314866,
      "grad_norm": 4.300393542300411,
      "learning_rate": 1.4746543778801846e-05,
      "loss": 1.1518,
      "step": 160
    },
    {
      "epoch": 0.07617728531855955,
      "grad_norm": 4.400546990325483,
      "learning_rate": 1.5207373271889403e-05,
      "loss": 1.1436,
      "step": 165
    },
    {
      "epoch": 0.07848568790397045,
      "grad_norm": 4.77590791643038,
      "learning_rate": 1.566820276497696e-05,
      "loss": 1.2173,
      "step": 170
    },
    {
      "epoch": 0.08079409048938135,
      "grad_norm": 4.32969974114785,
      "learning_rate": 1.6129032258064517e-05,
      "loss": 1.1654,
      "step": 175
    },
    {
      "epoch": 0.08310249307479224,
      "grad_norm": 5.285074448558262,
      "learning_rate": 1.6589861751152075e-05,
      "loss": 1.2185,
      "step": 180
    },
    {
      "epoch": 0.08541089566020314,
      "grad_norm": 6.312179413881035,
      "learning_rate": 1.705069124423963e-05,
      "loss": 1.2063,
      "step": 185
    },
    {
      "epoch": 0.08771929824561403,
      "grad_norm": 4.351482667809684,
      "learning_rate": 1.751152073732719e-05,
      "loss": 1.1814,
      "step": 190
    },
    {
      "epoch": 0.09002770083102493,
      "grad_norm": 4.468079454686115,
      "learning_rate": 1.7972350230414748e-05,
      "loss": 1.2058,
      "step": 195
    },
    {
      "epoch": 0.09233610341643583,
      "grad_norm": 5.51425273025908,
      "learning_rate": 1.8433179723502307e-05,
      "loss": 1.1646,
      "step": 200
    },
    {
      "epoch": 0.09464450600184672,
      "grad_norm": 4.661323669253999,
      "learning_rate": 1.8894009216589862e-05,
      "loss": 1.1689,
      "step": 205
    },
    {
      "epoch": 0.09695290858725762,
      "grad_norm": 726.888849011745,
      "learning_rate": 1.935483870967742e-05,
      "loss": 1.8785,
      "step": 210
    },
    {
      "epoch": 0.09926131117266851,
      "grad_norm": 5.867844131835615,
      "learning_rate": 1.981566820276498e-05,
      "loss": 1.2661,
      "step": 215
    },
    {
      "epoch": 0.10156971375807941,
      "grad_norm": 6.2684199277472015,
      "learning_rate": 1.9999883080288618e-05,
      "loss": 1.2545,
      "step": 220
    },
    {
      "epoch": 0.1038781163434903,
      "grad_norm": 5.426004523317811,
      "learning_rate": 1.999916858084231e-05,
      "loss": 1.2259,
      "step": 225
    },
    {
      "epoch": 0.1061865189289012,
      "grad_norm": 4.617593739028291,
      "learning_rate": 1.999780458369908e-05,
      "loss": 1.177,
      "step": 230
    },
    {
      "epoch": 0.1084949215143121,
      "grad_norm": 4.412649452769939,
      "learning_rate": 1.9995791177457598e-05,
      "loss": 1.2127,
      "step": 235
    },
    {
      "epoch": 0.11080332409972299,
      "grad_norm": 4.3422685444059965,
      "learning_rate": 1.9993128492899012e-05,
      "loss": 1.2398,
      "step": 240
    },
    {
      "epoch": 0.11311172668513389,
      "grad_norm": 4.837187367612426,
      "learning_rate": 1.9989816702978447e-05,
      "loss": 1.2189,
      "step": 245
    },
    {
      "epoch": 0.11542012927054478,
      "grad_norm": 4.203624655101154,
      "learning_rate": 1.998585602281378e-05,
      "loss": 1.1641,
      "step": 250
    },
    {
      "epoch": 0.11772853185595568,
      "grad_norm": 4.172078683953242,
      "learning_rate": 1.9981246709671668e-05,
      "loss": 1.217,
      "step": 255
    },
    {
      "epoch": 0.12003693444136658,
      "grad_norm": 4.445815868978626,
      "learning_rate": 1.9975989062950828e-05,
      "loss": 1.2198,
      "step": 260
    },
    {
      "epoch": 0.12234533702677747,
      "grad_norm": 4.5591861583880045,
      "learning_rate": 1.9970083424162598e-05,
      "loss": 1.2971,
      "step": 265
    },
    {
      "epoch": 0.12465373961218837,
      "grad_norm": 8.794456155689286,
      "learning_rate": 1.9963530176908752e-05,
      "loss": 1.2543,
      "step": 270
    },
    {
      "epoch": 0.12696214219759927,
      "grad_norm": 4.296337355363852,
      "learning_rate": 1.9956329746856583e-05,
      "loss": 1.1902,
      "step": 275
    },
    {
      "epoch": 0.12927054478301014,
      "grad_norm": 4.210037276606183,
      "learning_rate": 1.9948482601711245e-05,
      "loss": 1.2119,
      "step": 280
    },
    {
      "epoch": 0.13157894736842105,
      "grad_norm": 4.525819047133829,
      "learning_rate": 1.9939989251185386e-05,
      "loss": 1.2267,
      "step": 285
    },
    {
      "epoch": 0.13388734995383195,
      "grad_norm": 4.617114491344834,
      "learning_rate": 1.993085024696604e-05,
      "loss": 1.253,
      "step": 290
    },
    {
      "epoch": 0.13619575253924285,
      "grad_norm": 4.306884827311129,
      "learning_rate": 1.992106618267878e-05,
      "loss": 1.2968,
      "step": 295
    },
    {
      "epoch": 0.13850415512465375,
      "grad_norm": 4.06156778374873,
      "learning_rate": 1.9910637693849166e-05,
      "loss": 1.2523,
      "step": 300
    },
    {
      "epoch": 0.14081255771006462,
      "grad_norm": 4.1725194301873225,
      "learning_rate": 1.9899565457861463e-05,
      "loss": 1.2465,
      "step": 305
    },
    {
      "epoch": 0.14312096029547552,
      "grad_norm": 6.1582094432239005,
      "learning_rate": 1.988785019391465e-05,
      "loss": 1.2893,
      "step": 310
    },
    {
      "epoch": 0.14542936288088643,
      "grad_norm": 4.468419480755536,
      "learning_rate": 1.987549266297568e-05,
      "loss": 1.2684,
      "step": 315
    },
    {
      "epoch": 0.14773776546629733,
      "grad_norm": 4.663314719431853,
      "learning_rate": 1.986249366773009e-05,
      "loss": 1.2472,
      "step": 320
    },
    {
      "epoch": 0.15004616805170823,
      "grad_norm": 4.557295583444763,
      "learning_rate": 1.9848854052529822e-05,
      "loss": 1.2856,
      "step": 325
    },
    {
      "epoch": 0.1523545706371191,
      "grad_norm": 4.128322557091226,
      "learning_rate": 1.9834574703338406e-05,
      "loss": 1.2717,
      "step": 330
    },
    {
      "epoch": 0.15466297322253,
      "grad_norm": 4.265562249971871,
      "learning_rate": 1.9819656547673393e-05,
      "loss": 1.2614,
      "step": 335
    },
    {
      "epoch": 0.1569713758079409,
      "grad_norm": 4.189648461283852,
      "learning_rate": 1.9804100554546127e-05,
      "loss": 1.2221,
      "step": 340
    },
    {
      "epoch": 0.1592797783933518,
      "grad_norm": 4.753781028146877,
      "learning_rate": 1.9787907734398785e-05,
      "loss": 1.2641,
      "step": 345
    },
    {
      "epoch": 0.1615881809787627,
      "grad_norm": 4.624571477954883,
      "learning_rate": 1.9771079139038765e-05,
      "loss": 1.3082,
      "step": 350
    },
    {
      "epoch": 0.16389658356417358,
      "grad_norm": 4.51192227446534,
      "learning_rate": 1.9753615861570338e-05,
      "loss": 1.3116,
      "step": 355
    },
    {
      "epoch": 0.16620498614958448,
      "grad_norm": 4.392313959090253,
      "learning_rate": 1.9735519036323656e-05,
      "loss": 1.2304,
      "step": 360
    },
    {
      "epoch": 0.16851338873499538,
      "grad_norm": 4.979240339208881,
      "learning_rate": 1.9716789838781095e-05,
      "loss": 1.2682,
      "step": 365
    },
    {
      "epoch": 0.17082179132040629,
      "grad_norm": 4.96937836441046,
      "learning_rate": 1.9697429485500862e-05,
      "loss": 1.3054,
      "step": 370
    },
    {
      "epoch": 0.1731301939058172,
      "grad_norm": 3.935739346153204,
      "learning_rate": 1.9677439234038004e-05,
      "loss": 1.2704,
      "step": 375
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 4.366123456450803,
      "learning_rate": 1.96568203828627e-05,
      "loss": 1.236,
      "step": 380
    },
    {
      "epoch": 0.17774699907663896,
      "grad_norm": 4.003638705307624,
      "learning_rate": 1.963557427127594e-05,
      "loss": 1.2134,
      "step": 385
    },
    {
      "epoch": 0.18005540166204986,
      "grad_norm": 4.711836278485082,
      "learning_rate": 1.9613702279322518e-05,
      "loss": 1.2424,
      "step": 390
    },
    {
      "epoch": 0.18236380424746076,
      "grad_norm": 4.7756346414851345,
      "learning_rate": 1.95912058277014e-05,
      "loss": 1.2513,
      "step": 395
    },
    {
      "epoch": 0.18467220683287167,
      "grad_norm": 4.055556447653374,
      "learning_rate": 1.9568086377673422e-05,
      "loss": 1.2305,
      "step": 400
    },
    {
      "epoch": 0.18698060941828254,
      "grad_norm": 3.9870929086001605,
      "learning_rate": 1.9544345430966398e-05,
      "loss": 1.2766,
      "step": 405
    },
    {
      "epoch": 0.18928901200369344,
      "grad_norm": 4.3683569271591525,
      "learning_rate": 1.951998452967756e-05,
      "loss": 1.2701,
      "step": 410
    },
    {
      "epoch": 0.19159741458910434,
      "grad_norm": 4.282177503327308,
      "learning_rate": 1.9495005256173398e-05,
      "loss": 1.2173,
      "step": 415
    },
    {
      "epoch": 0.19390581717451524,
      "grad_norm": 4.122228465513596,
      "learning_rate": 1.9469409232986876e-05,
      "loss": 1.293,
      "step": 420
    },
    {
      "epoch": 0.19621421975992612,
      "grad_norm": 4.391730062186428,
      "learning_rate": 1.9443198122712036e-05,
      "loss": 1.3013,
      "step": 425
    },
    {
      "epoch": 0.19852262234533702,
      "grad_norm": 4.2533205751093,
      "learning_rate": 1.9416373627896002e-05,
      "loss": 1.2478,
      "step": 430
    },
    {
      "epoch": 0.20083102493074792,
      "grad_norm": 4.982151398275928,
      "learning_rate": 1.9388937490928402e-05,
      "loss": 1.289,
      "step": 435
    },
    {
      "epoch": 0.20313942751615882,
      "grad_norm": 4.254393940238592,
      "learning_rate": 1.9360891493928186e-05,
      "loss": 1.2773,
      "step": 440
    },
    {
      "epoch": 0.20544783010156972,
      "grad_norm": 4.812233488623846,
      "learning_rate": 1.933223745862786e-05,
      "loss": 1.2571,
      "step": 445
    },
    {
      "epoch": 0.2077562326869806,
      "grad_norm": 4.193819364681046,
      "learning_rate": 1.930297724625516e-05,
      "loss": 1.3167,
      "step": 450
    },
    {
      "epoch": 0.2100646352723915,
      "grad_norm": 4.318967687699199,
      "learning_rate": 1.9273112757412165e-05,
      "loss": 1.2578,
      "step": 455
    },
    {
      "epoch": 0.2123730378578024,
      "grad_norm": 4.021438837096732,
      "learning_rate": 1.9242645931951833e-05,
      "loss": 1.2703,
      "step": 460
    },
    {
      "epoch": 0.2146814404432133,
      "grad_norm": 3.9988355301981344,
      "learning_rate": 1.921157874885199e-05,
      "loss": 1.2702,
      "step": 465
    },
    {
      "epoch": 0.2169898430286242,
      "grad_norm": 3.866018897785007,
      "learning_rate": 1.91799132260868e-05,
      "loss": 1.2651,
      "step": 470
    },
    {
      "epoch": 0.21929824561403508,
      "grad_norm": 4.228145732575894,
      "learning_rate": 1.9147651420495696e-05,
      "loss": 1.2429,
      "step": 475
    },
    {
      "epoch": 0.22160664819944598,
      "grad_norm": 4.16044625111994,
      "learning_rate": 1.9114795427649735e-05,
      "loss": 1.2263,
      "step": 480
    },
    {
      "epoch": 0.22391505078485688,
      "grad_norm": 3.7071606709047678,
      "learning_rate": 1.9081347381715535e-05,
      "loss": 1.2592,
      "step": 485
    },
    {
      "epoch": 0.22622345337026778,
      "grad_norm": 4.093983584879632,
      "learning_rate": 1.904730945531661e-05,
      "loss": 1.2819,
      "step": 490
    },
    {
      "epoch": 0.22853185595567868,
      "grad_norm": 4.247421291613911,
      "learning_rate": 1.901268385939226e-05,
      "loss": 1.3118,
      "step": 495
    },
    {
      "epoch": 0.23084025854108955,
      "grad_norm": 4.088704419142061,
      "learning_rate": 1.8977472843053962e-05,
      "loss": 1.2529,
      "step": 500
    },
    {
      "epoch": 0.23314866112650046,
      "grad_norm": 3.9526614218286698,
      "learning_rate": 1.8941678693439272e-05,
      "loss": 1.2254,
      "step": 505
    },
    {
      "epoch": 0.23545706371191136,
      "grad_norm": 3.767319095108075,
      "learning_rate": 1.8905303735563274e-05,
      "loss": 1.2705,
      "step": 510
    },
    {
      "epoch": 0.23776546629732226,
      "grad_norm": 4.1464464034097,
      "learning_rate": 1.886835033216755e-05,
      "loss": 1.2841,
      "step": 515
    },
    {
      "epoch": 0.24007386888273316,
      "grad_norm": 4.154511161776497,
      "learning_rate": 1.88308208835667e-05,
      "loss": 1.2715,
      "step": 520
    },
    {
      "epoch": 0.24238227146814403,
      "grad_norm": 4.815166096996458,
      "learning_rate": 1.8792717827492446e-05,
      "loss": 1.3034,
      "step": 525
    },
    {
      "epoch": 0.24469067405355494,
      "grad_norm": 22.245546847367528,
      "learning_rate": 1.8754043638935283e-05,
      "loss": 1.2532,
      "step": 530
    },
    {
      "epoch": 0.24699907663896584,
      "grad_norm": 4.177323522295811,
      "learning_rate": 1.871480082998371e-05,
      "loss": 1.2501,
      "step": 535
    },
    {
      "epoch": 0.24930747922437674,
      "grad_norm": 3.9426463777773346,
      "learning_rate": 1.867499194966106e-05,
      "loss": 1.2683,
      "step": 540
    },
    {
      "epoch": 0.2516158818097876,
      "grad_norm": 3.912690873331932,
      "learning_rate": 1.8634619583759933e-05,
      "loss": 1.2874,
      "step": 545
    },
    {
      "epoch": 0.25392428439519854,
      "grad_norm": 3.972529239438344,
      "learning_rate": 1.8593686354674223e-05,
      "loss": 1.2698,
      "step": 550
    },
    {
      "epoch": 0.2562326869806094,
      "grad_norm": 3.958572886167977,
      "learning_rate": 1.8552194921228793e-05,
      "loss": 1.2293,
      "step": 555
    },
    {
      "epoch": 0.2585410895660203,
      "grad_norm": 3.7553829117034767,
      "learning_rate": 1.851014797850676e-05,
      "loss": 1.2818,
      "step": 560
    },
    {
      "epoch": 0.2608494921514312,
      "grad_norm": 4.352268879736511,
      "learning_rate": 1.8467548257674453e-05,
      "loss": 1.2552,
      "step": 565
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 5.014139215739045,
      "learning_rate": 1.8424398525803983e-05,
      "loss": 1.2228,
      "step": 570
    },
    {
      "epoch": 0.265466297322253,
      "grad_norm": 4.192590762422093,
      "learning_rate": 1.8380701585693526e-05,
      "loss": 1.2526,
      "step": 575
    },
    {
      "epoch": 0.2677746999076639,
      "grad_norm": 4.209340122955672,
      "learning_rate": 1.8336460275685267e-05,
      "loss": 1.2681,
      "step": 580
    },
    {
      "epoch": 0.27008310249307477,
      "grad_norm": 3.801129619164067,
      "learning_rate": 1.8291677469481025e-05,
      "loss": 1.2623,
      "step": 585
    },
    {
      "epoch": 0.2723915050784857,
      "grad_norm": 5.60448449703679,
      "learning_rate": 1.8246356075955594e-05,
      "loss": 1.2778,
      "step": 590
    },
    {
      "epoch": 0.27469990766389657,
      "grad_norm": 3.8415685450636143,
      "learning_rate": 1.820049903896782e-05,
      "loss": 1.2546,
      "step": 595
    },
    {
      "epoch": 0.2770083102493075,
      "grad_norm": 3.766423848242755,
      "learning_rate": 1.8154109337169326e-05,
      "loss": 1.2994,
      "step": 600
    },
    {
      "epoch": 0.2793167128347184,
      "grad_norm": 3.8445299977202363,
      "learning_rate": 1.8107189983811094e-05,
      "loss": 1.2779,
      "step": 605
    },
    {
      "epoch": 0.28162511542012925,
      "grad_norm": 4.20182793655244,
      "learning_rate": 1.8059744026547713e-05,
      "loss": 1.2794,
      "step": 610
    },
    {
      "epoch": 0.2839335180055402,
      "grad_norm": 3.6927184982852554,
      "learning_rate": 1.8011774547239403e-05,
      "loss": 1.2217,
      "step": 615
    },
    {
      "epoch": 0.28624192059095105,
      "grad_norm": 3.906241578603264,
      "learning_rate": 1.796328466175186e-05,
      "loss": 1.3162,
      "step": 620
    },
    {
      "epoch": 0.288550323176362,
      "grad_norm": 3.7221850266429675,
      "learning_rate": 1.791427751975385e-05,
      "loss": 1.2591,
      "step": 625
    },
    {
      "epoch": 0.29085872576177285,
      "grad_norm": 4.11815775927983,
      "learning_rate": 1.786475630451262e-05,
      "loss": 1.2572,
      "step": 630
    },
    {
      "epoch": 0.2931671283471837,
      "grad_norm": 3.8995508626898454,
      "learning_rate": 1.781472423268713e-05,
      "loss": 1.2604,
      "step": 635
    },
    {
      "epoch": 0.29547553093259465,
      "grad_norm": 4.5219499712986035,
      "learning_rate": 1.776418455411913e-05,
      "loss": 1.298,
      "step": 640
    },
    {
      "epoch": 0.29778393351800553,
      "grad_norm": 4.5899598168207785,
      "learning_rate": 1.7713140551622032e-05,
      "loss": 1.2664,
      "step": 645
    },
    {
      "epoch": 0.30009233610341646,
      "grad_norm": 4.641570078800192,
      "learning_rate": 1.7661595540767714e-05,
      "loss": 1.2689,
      "step": 650
    },
    {
      "epoch": 0.30240073868882733,
      "grad_norm": 4.383087991217795,
      "learning_rate": 1.7609552869671126e-05,
      "loss": 1.2551,
      "step": 655
    },
    {
      "epoch": 0.3047091412742382,
      "grad_norm": 3.9687899547292576,
      "learning_rate": 1.7557015918772822e-05,
      "loss": 1.2379,
      "step": 660
    },
    {
      "epoch": 0.30701754385964913,
      "grad_norm": 4.133840300932013,
      "learning_rate": 1.750398810061939e-05,
      "loss": 1.2779,
      "step": 665
    },
    {
      "epoch": 0.30932594644506,
      "grad_norm": 3.84778329275165,
      "learning_rate": 1.745047285964179e-05,
      "loss": 1.2306,
      "step": 670
    },
    {
      "epoch": 0.31163434903047094,
      "grad_norm": 4.054603771464119,
      "learning_rate": 1.7396473671931597e-05,
      "loss": 1.2089,
      "step": 675
    },
    {
      "epoch": 0.3139427516158818,
      "grad_norm": 4.013882196193361,
      "learning_rate": 1.7341994045015245e-05,
      "loss": 1.2225,
      "step": 680
    },
    {
      "epoch": 0.3162511542012927,
      "grad_norm": 4.076399340438248,
      "learning_rate": 1.7287037517626174e-05,
      "loss": 1.3166,
      "step": 685
    },
    {
      "epoch": 0.3185595567867036,
      "grad_norm": 3.991144267549364,
      "learning_rate": 1.7231607659474972e-05,
      "loss": 1.2706,
      "step": 690
    },
    {
      "epoch": 0.3208679593721145,
      "grad_norm": 3.592102167186549,
      "learning_rate": 1.7175708071017503e-05,
      "loss": 1.2066,
      "step": 695
    },
    {
      "epoch": 0.3231763619575254,
      "grad_norm": 4.2490266329322655,
      "learning_rate": 1.7119342383221055e-05,
      "loss": 1.3011,
      "step": 700
    },
    {
      "epoch": 0.3254847645429363,
      "grad_norm": 3.7487591296204266,
      "learning_rate": 1.7062514257328474e-05,
      "loss": 1.2587,
      "step": 705
    },
    {
      "epoch": 0.32779316712834716,
      "grad_norm": 3.6111287365523466,
      "learning_rate": 1.7005227384620336e-05,
      "loss": 1.2626,
      "step": 710
    },
    {
      "epoch": 0.3301015697137581,
      "grad_norm": 3.8624035554609892,
      "learning_rate": 1.6947485486175223e-05,
      "loss": 1.266,
      "step": 715
    },
    {
      "epoch": 0.33240997229916897,
      "grad_norm": 4.191574332500623,
      "learning_rate": 1.688929231262797e-05,
      "loss": 1.2275,
      "step": 720
    },
    {
      "epoch": 0.3347183748845799,
      "grad_norm": 3.931766819485826,
      "learning_rate": 1.683065164392606e-05,
      "loss": 1.2525,
      "step": 725
    },
    {
      "epoch": 0.33702677746999077,
      "grad_norm": 3.8224846577065685,
      "learning_rate": 1.6771567289084122e-05,
      "loss": 1.228,
      "step": 730
    },
    {
      "epoch": 0.33933518005540164,
      "grad_norm": 3.7975499971303024,
      "learning_rate": 1.6712043085936473e-05,
      "loss": 1.2121,
      "step": 735
    },
    {
      "epoch": 0.34164358264081257,
      "grad_norm": 3.7233983105114326,
      "learning_rate": 1.6652082900887858e-05,
      "loss": 1.2439,
      "step": 740
    },
    {
      "epoch": 0.34395198522622344,
      "grad_norm": 4.0496534376278674,
      "learning_rate": 1.6591690628662305e-05,
      "loss": 1.3064,
      "step": 745
    },
    {
      "epoch": 0.3462603878116344,
      "grad_norm": 4.397682055950332,
      "learning_rate": 1.6530870192050134e-05,
      "loss": 1.2433,
      "step": 750
    },
    {
      "epoch": 0.34856879039704525,
      "grad_norm": 3.999160650641557,
      "learning_rate": 1.6469625541653152e-05,
      "loss": 1.2117,
      "step": 755
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 4.475385002364299,
      "learning_rate": 1.6407960655628055e-05,
      "loss": 1.203,
      "step": 760
    },
    {
      "epoch": 0.35318559556786705,
      "grad_norm": 3.5042875341184416,
      "learning_rate": 1.6345879539428e-05,
      "loss": 1.2567,
      "step": 765
    },
    {
      "epoch": 0.3554939981532779,
      "grad_norm": 3.678612416780679,
      "learning_rate": 1.6283386225542467e-05,
      "loss": 1.2276,
      "step": 770
    },
    {
      "epoch": 0.35780240073868885,
      "grad_norm": 5.063348081613382,
      "learning_rate": 1.622048477323529e-05,
      "loss": 1.2297,
      "step": 775
    },
    {
      "epoch": 0.3601108033240997,
      "grad_norm": 4.04397764374825,
      "learning_rate": 1.6157179268281007e-05,
      "loss": 1.2498,
      "step": 780
    },
    {
      "epoch": 0.3624192059095106,
      "grad_norm": 3.7786600086660553,
      "learning_rate": 1.6093473822699467e-05,
      "loss": 1.2156,
      "step": 785
    },
    {
      "epoch": 0.36472760849492153,
      "grad_norm": 3.726670143436363,
      "learning_rate": 1.6029372574488732e-05,
      "loss": 1.248,
      "step": 790
    },
    {
      "epoch": 0.3670360110803324,
      "grad_norm": 3.6023664901819115,
      "learning_rate": 1.5964879687356286e-05,
      "loss": 1.2762,
      "step": 795
    },
    {
      "epoch": 0.36934441366574333,
      "grad_norm": 3.684618843127009,
      "learning_rate": 1.589999935044859e-05,
      "loss": 1.2269,
      "step": 800
    },
    {
      "epoch": 0.3716528162511542,
      "grad_norm": 3.6119834291134465,
      "learning_rate": 1.5834735778078968e-05,
      "loss": 1.2078,
      "step": 805
    },
    {
      "epoch": 0.3739612188365651,
      "grad_norm": 3.66332363718426,
      "learning_rate": 1.5769093209453876e-05,
      "loss": 1.2713,
      "step": 810
    },
    {
      "epoch": 0.376269621421976,
      "grad_norm": 4.137676249046753,
      "learning_rate": 1.5703075908397523e-05,
      "loss": 1.2816,
      "step": 815
    },
    {
      "epoch": 0.3785780240073869,
      "grad_norm": 3.8481468093108475,
      "learning_rate": 1.563668816307494e-05,
      "loss": 1.2203,
      "step": 820
    },
    {
      "epoch": 0.3808864265927978,
      "grad_norm": 3.7158307301305156,
      "learning_rate": 1.556993428571342e-05,
      "loss": 1.2163,
      "step": 825
    },
    {
      "epoch": 0.3831948291782087,
      "grad_norm": 3.851222452502614,
      "learning_rate": 1.550281861232243e-05,
      "loss": 1.243,
      "step": 830
    },
    {
      "epoch": 0.38550323176361956,
      "grad_norm": 3.6817891692377978,
      "learning_rate": 1.5435345502411956e-05,
      "loss": 1.2821,
      "step": 835
    },
    {
      "epoch": 0.3878116343490305,
      "grad_norm": 3.9683025462284998,
      "learning_rate": 1.536751933870934e-05,
      "loss": 1.2019,
      "step": 840
    },
    {
      "epoch": 0.39012003693444136,
      "grad_norm": 3.94265762295689,
      "learning_rate": 1.5299344526874576e-05,
      "loss": 1.2774,
      "step": 845
    },
    {
      "epoch": 0.39242843951985223,
      "grad_norm": 4.123641725136207,
      "learning_rate": 1.5230825495214184e-05,
      "loss": 1.2352,
      "step": 850
    },
    {
      "epoch": 0.39473684210526316,
      "grad_norm": 3.9570109790957653,
      "learning_rate": 1.5161966694393516e-05,
      "loss": 1.215,
      "step": 855
    },
    {
      "epoch": 0.39704524469067404,
      "grad_norm": 3.6427091867450714,
      "learning_rate": 1.5092772597147707e-05,
      "loss": 1.2202,
      "step": 860
    },
    {
      "epoch": 0.39935364727608497,
      "grad_norm": 3.8425754107191796,
      "learning_rate": 1.5023247697991114e-05,
      "loss": 1.2432,
      "step": 865
    },
    {
      "epoch": 0.40166204986149584,
      "grad_norm": 3.759319372367797,
      "learning_rate": 1.4953396512925398e-05,
      "loss": 1.1838,
      "step": 870
    },
    {
      "epoch": 0.4039704524469067,
      "grad_norm": 3.872324982369786,
      "learning_rate": 1.4883223579146167e-05,
      "loss": 1.2331,
      "step": 875
    },
    {
      "epoch": 0.40627885503231764,
      "grad_norm": 3.8616658245003435,
      "learning_rate": 1.4812733454748283e-05,
      "loss": 1.2277,
      "step": 880
    },
    {
      "epoch": 0.4085872576177285,
      "grad_norm": 3.5624714154298163,
      "learning_rate": 1.4741930718429772e-05,
      "loss": 1.2051,
      "step": 885
    },
    {
      "epoch": 0.41089566020313945,
      "grad_norm": 3.6961173549363924,
      "learning_rate": 1.4670819969194416e-05,
      "loss": 1.2309,
      "step": 890
    },
    {
      "epoch": 0.4132040627885503,
      "grad_norm": 3.5654510220296847,
      "learning_rate": 1.4599405826053039e-05,
      "loss": 1.1884,
      "step": 895
    },
    {
      "epoch": 0.4155124653739612,
      "grad_norm": 4.205884899208378,
      "learning_rate": 1.4527692927723465e-05,
      "loss": 1.2223,
      "step": 900
    },
    {
      "epoch": 0.4178208679593721,
      "grad_norm": 3.9431786244545997,
      "learning_rate": 1.4455685932329204e-05,
      "loss": 1.2389,
      "step": 905
    },
    {
      "epoch": 0.420129270544783,
      "grad_norm": 3.579703652121505,
      "learning_rate": 1.4383389517096899e-05,
      "loss": 1.2429,
      "step": 910
    },
    {
      "epoch": 0.4224376731301939,
      "grad_norm": 3.7807582830713105,
      "learning_rate": 1.4310808378052506e-05,
      "loss": 1.1874,
      "step": 915
    },
    {
      "epoch": 0.4247460757156048,
      "grad_norm": 3.9020463886513914,
      "learning_rate": 1.4237947229716262e-05,
      "loss": 1.2587,
      "step": 920
    },
    {
      "epoch": 0.42705447830101567,
      "grad_norm": 3.7663448915088633,
      "learning_rate": 1.4164810804796464e-05,
      "loss": 1.184,
      "step": 925
    },
    {
      "epoch": 0.4293628808864266,
      "grad_norm": 3.7907471270783937,
      "learning_rate": 1.409140385388203e-05,
      "loss": 1.2445,
      "step": 930
    },
    {
      "epoch": 0.4316712834718375,
      "grad_norm": 3.791543245723202,
      "learning_rate": 1.4017731145133955e-05,
      "loss": 1.2527,
      "step": 935
    },
    {
      "epoch": 0.4339796860572484,
      "grad_norm": 3.8566751713668666,
      "learning_rate": 1.3943797463975575e-05,
      "loss": 1.2048,
      "step": 940
    },
    {
      "epoch": 0.4362880886426593,
      "grad_norm": 3.943257567360323,
      "learning_rate": 1.3869607612781733e-05,
      "loss": 1.2773,
      "step": 945
    },
    {
      "epoch": 0.43859649122807015,
      "grad_norm": 3.53206021655625,
      "learning_rate": 1.3795166410566834e-05,
      "loss": 1.2066,
      "step": 950
    },
    {
      "epoch": 0.4409048938134811,
      "grad_norm": 3.8322607840339504,
      "learning_rate": 1.372047869267184e-05,
      "loss": 1.2104,
      "step": 955
    },
    {
      "epoch": 0.44321329639889195,
      "grad_norm": 4.982802180271467,
      "learning_rate": 1.364554931045018e-05,
      "loss": 1.2782,
      "step": 960
    },
    {
      "epoch": 0.4455216989843029,
      "grad_norm": 4.121927772157904,
      "learning_rate": 1.3570383130952627e-05,
      "loss": 1.2221,
      "step": 965
    },
    {
      "epoch": 0.44783010156971376,
      "grad_norm": 3.5401426054616674,
      "learning_rate": 1.349498503661116e-05,
      "loss": 1.249,
      "step": 970
    },
    {
      "epoch": 0.45013850415512463,
      "grad_norm": 3.8347876039826647,
      "learning_rate": 1.3419359924921833e-05,
      "loss": 1.2736,
      "step": 975
    },
    {
      "epoch": 0.45244690674053556,
      "grad_norm": 4.86416192250325,
      "learning_rate": 1.3343512708126642e-05,
      "loss": 1.2032,
      "step": 980
    },
    {
      "epoch": 0.45475530932594643,
      "grad_norm": 3.8508803970513004,
      "learning_rate": 1.326744831289447e-05,
      "loss": 1.2465,
      "step": 985
    },
    {
      "epoch": 0.45706371191135736,
      "grad_norm": 3.276661833625774,
      "learning_rate": 1.3191171680001048e-05,
      "loss": 1.1905,
      "step": 990
    },
    {
      "epoch": 0.45937211449676824,
      "grad_norm": 3.6488550777243933,
      "learning_rate": 1.3114687764008048e-05,
      "loss": 1.1991,
      "step": 995
    },
    {
      "epoch": 0.4616805170821791,
      "grad_norm": 3.9637997706000223,
      "learning_rate": 1.3038001532941249e-05,
      "loss": 1.1994,
      "step": 1000
    },
    {
      "epoch": 0.46398891966759004,
      "grad_norm": 3.7798295608326447,
      "learning_rate": 1.2961117967967844e-05,
      "loss": 1.2327,
      "step": 1005
    },
    {
      "epoch": 0.4662973222530009,
      "grad_norm": 3.742363753899004,
      "learning_rate": 1.2884042063072881e-05,
      "loss": 1.2415,
      "step": 1010
    },
    {
      "epoch": 0.46860572483841184,
      "grad_norm": 4.00995610689072,
      "learning_rate": 1.280677882473488e-05,
      "loss": 1.2449,
      "step": 1015
    },
    {
      "epoch": 0.4709141274238227,
      "grad_norm": 3.7802768150285284,
      "learning_rate": 1.272933327160063e-05,
      "loss": 1.2055,
      "step": 1020
    },
    {
      "epoch": 0.4732225300092336,
      "grad_norm": 3.979719082398227,
      "learning_rate": 1.2651710434159223e-05,
      "loss": 1.1452,
      "step": 1025
    },
    {
      "epoch": 0.4755309325946445,
      "grad_norm": 3.7987734509998012,
      "learning_rate": 1.2573915354415274e-05,
      "loss": 1.2266,
      "step": 1030
    },
    {
      "epoch": 0.4778393351800554,
      "grad_norm": 3.4449265105850344,
      "learning_rate": 1.2495953085561426e-05,
      "loss": 1.1678,
      "step": 1035
    },
    {
      "epoch": 0.4801477377654663,
      "grad_norm": 4.703831538180476,
      "learning_rate": 1.241782869165012e-05,
      "loss": 1.1893,
      "step": 1040
    },
    {
      "epoch": 0.4824561403508772,
      "grad_norm": 3.56138065098868,
      "learning_rate": 1.2339547247264658e-05,
      "loss": 1.2285,
      "step": 1045
    },
    {
      "epoch": 0.48476454293628807,
      "grad_norm": 3.8664090630676147,
      "learning_rate": 1.2261113837189587e-05,
      "loss": 1.1995,
      "step": 1050
    },
    {
      "epoch": 0.487072945521699,
      "grad_norm": 3.6587622685467553,
      "learning_rate": 1.2182533556080402e-05,
      "loss": 1.2456,
      "step": 1055
    },
    {
      "epoch": 0.48938134810710987,
      "grad_norm": 3.4219623018934615,
      "learning_rate": 1.2103811508132642e-05,
      "loss": 1.1904,
      "step": 1060
    },
    {
      "epoch": 0.4916897506925208,
      "grad_norm": 3.91141223990254,
      "learning_rate": 1.2024952806750321e-05,
      "loss": 1.1811,
      "step": 1065
    },
    {
      "epoch": 0.4939981532779317,
      "grad_norm": 3.707066130468398,
      "learning_rate": 1.1945962574213814e-05,
      "loss": 1.212,
      "step": 1070
    },
    {
      "epoch": 0.49630655586334255,
      "grad_norm": 3.5782501836947653,
      "learning_rate": 1.1866845941347118e-05,
      "loss": 1.2255,
      "step": 1075
    },
    {
      "epoch": 0.4986149584487535,
      "grad_norm": 4.303350644777213,
      "learning_rate": 1.1787608047184583e-05,
      "loss": 1.1376,
      "step": 1080
    },
    {
      "epoch": 0.5009233610341643,
      "grad_norm": 3.419543860379626,
      "learning_rate": 1.1708254038637115e-05,
      "loss": 1.1872,
      "step": 1085
    },
    {
      "epoch": 0.5032317636195752,
      "grad_norm": 3.586294780528409,
      "learning_rate": 1.1628789070157836e-05,
      "loss": 1.2114,
      "step": 1090
    },
    {
      "epoch": 0.5055401662049861,
      "grad_norm": 3.6647616517214496,
      "learning_rate": 1.1549218303407305e-05,
      "loss": 1.2088,
      "step": 1095
    },
    {
      "epoch": 0.5078485687903971,
      "grad_norm": 3.6209405687157794,
      "learning_rate": 1.1469546906918219e-05,
      "loss": 1.1535,
      "step": 1100
    },
    {
      "epoch": 0.510156971375808,
      "grad_norm": 3.4760951984933777,
      "learning_rate": 1.1389780055759689e-05,
      "loss": 1.1692,
      "step": 1105
    },
    {
      "epoch": 0.5124653739612188,
      "grad_norm": 3.523587148397925,
      "learning_rate": 1.1309922931201114e-05,
      "loss": 1.1795,
      "step": 1110
    },
    {
      "epoch": 0.5147737765466297,
      "grad_norm": 3.399747435026194,
      "learning_rate": 1.1229980720375609e-05,
      "loss": 1.1913,
      "step": 1115
    },
    {
      "epoch": 0.5170821791320406,
      "grad_norm": 3.802970464768176,
      "learning_rate": 1.114995861594308e-05,
      "loss": 1.1692,
      "step": 1120
    },
    {
      "epoch": 0.5193905817174516,
      "grad_norm": 3.571347595436078,
      "learning_rate": 1.1069861815752944e-05,
      "loss": 1.1575,
      "step": 1125
    },
    {
      "epoch": 0.5216989843028624,
      "grad_norm": 3.702241350827994,
      "learning_rate": 1.0989695522506486e-05,
      "loss": 1.1776,
      "step": 1130
    },
    {
      "epoch": 0.5240073868882733,
      "grad_norm": 4.396145181294285,
      "learning_rate": 1.0909464943418926e-05,
      "loss": 1.2055,
      "step": 1135
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 3.402649511273165,
      "learning_rate": 1.0829175289881188e-05,
      "loss": 1.2024,
      "step": 1140
    },
    {
      "epoch": 0.528624192059095,
      "grad_norm": 3.321901777095843,
      "learning_rate": 1.074883177712138e-05,
      "loss": 1.1317,
      "step": 1145
    },
    {
      "epoch": 0.530932594644506,
      "grad_norm": 4.575011114858196,
      "learning_rate": 1.0668439623866043e-05,
      "loss": 1.1516,
      "step": 1150
    },
    {
      "epoch": 0.5332409972299169,
      "grad_norm": 3.428811319179132,
      "learning_rate": 1.0588004052001177e-05,
      "loss": 1.1326,
      "step": 1155
    },
    {
      "epoch": 0.5355493998153278,
      "grad_norm": 3.758823500740248,
      "learning_rate": 1.0507530286233042e-05,
      "loss": 1.1523,
      "step": 1160
    },
    {
      "epoch": 0.5378578024007387,
      "grad_norm": 3.828420445656179,
      "learning_rate": 1.0427023553748792e-05,
      "loss": 1.215,
      "step": 1165
    },
    {
      "epoch": 0.5401662049861495,
      "grad_norm": 3.872474623427253,
      "learning_rate": 1.0346489083876928e-05,
      "loss": 1.1798,
      "step": 1170
    },
    {
      "epoch": 0.5424746075715605,
      "grad_norm": 4.343223419966708,
      "learning_rate": 1.0265932107747656e-05,
      "loss": 1.1964,
      "step": 1175
    },
    {
      "epoch": 0.5447830101569714,
      "grad_norm": 3.4458152638291533,
      "learning_rate": 1.0185357857953064e-05,
      "loss": 1.188,
      "step": 1180
    },
    {
      "epoch": 0.5470914127423823,
      "grad_norm": 3.3343026801443765,
      "learning_rate": 1.0104771568207266e-05,
      "loss": 1.1524,
      "step": 1185
    },
    {
      "epoch": 0.5493998153277931,
      "grad_norm": 3.8325280372919774,
      "learning_rate": 1.0024178473006418e-05,
      "loss": 1.1445,
      "step": 1190
    },
    {
      "epoch": 0.551708217913204,
      "grad_norm": 3.913934401934443,
      "learning_rate": 9.943583807288746e-06,
      "loss": 1.1497,
      "step": 1195
    },
    {
      "epoch": 0.554016620498615,
      "grad_norm": 3.8771337742661585,
      "learning_rate": 9.862992806094473e-06,
      "loss": 1.1584,
      "step": 1200
    },
    {
      "epoch": 0.5563250230840259,
      "grad_norm": 3.385706053842486,
      "learning_rate": 9.782410704225793e-06,
      "loss": 1.133,
      "step": 1205
    },
    {
      "epoch": 0.5586334256694367,
      "grad_norm": 3.228558572718497,
      "learning_rate": 9.701842735906855e-06,
      "loss": 1.1714,
      "step": 1210
    },
    {
      "epoch": 0.5609418282548476,
      "grad_norm": 3.376489834368575,
      "learning_rate": 9.621294134443747e-06,
      "loss": 1.1782,
      "step": 1215
    },
    {
      "epoch": 0.5632502308402585,
      "grad_norm": 4.101023970778267,
      "learning_rate": 9.54077013188459e-06,
      "loss": 1.1679,
      "step": 1220
    },
    {
      "epoch": 0.5655586334256695,
      "grad_norm": 3.459693677788322,
      "learning_rate": 9.460275958679674e-06,
      "loss": 1.2272,
      "step": 1225
    },
    {
      "epoch": 0.5678670360110804,
      "grad_norm": 3.5741244509053556,
      "learning_rate": 9.379816843341715e-06,
      "loss": 1.1679,
      "step": 1230
    },
    {
      "epoch": 0.5701754385964912,
      "grad_norm": 14.959841662019736,
      "learning_rate": 9.299398012106246e-06,
      "loss": 1.1557,
      "step": 1235
    },
    {
      "epoch": 0.5724838411819021,
      "grad_norm": 3.479142794568544,
      "learning_rate": 9.219024688592136e-06,
      "loss": 1.191,
      "step": 1240
    },
    {
      "epoch": 0.574792243767313,
      "grad_norm": 3.4791994405128195,
      "learning_rate": 9.138702093462286e-06,
      "loss": 1.1632,
      "step": 1245
    },
    {
      "epoch": 0.577100646352724,
      "grad_norm": 3.378297795269278,
      "learning_rate": 9.058435444084543e-06,
      "loss": 1.2058,
      "step": 1250
    },
    {
      "epoch": 0.5794090489381348,
      "grad_norm": 3.3312286796444948,
      "learning_rate": 8.978229954192775e-06,
      "loss": 1.2072,
      "step": 1255
    },
    {
      "epoch": 0.5817174515235457,
      "grad_norm": 3.2936946867277497,
      "learning_rate": 8.898090833548226e-06,
      "loss": 1.1479,
      "step": 1260
    },
    {
      "epoch": 0.5840258541089566,
      "grad_norm": 3.5657195698986306,
      "learning_rate": 8.818023287601117e-06,
      "loss": 1.1579,
      "step": 1265
    },
    {
      "epoch": 0.5863342566943675,
      "grad_norm": 3.85534125869907,
      "learning_rate": 8.738032517152523e-06,
      "loss": 1.1748,
      "step": 1270
    },
    {
      "epoch": 0.5886426592797784,
      "grad_norm": 3.3807308381583585,
      "learning_rate": 8.658123718016548e-06,
      "loss": 1.1365,
      "step": 1275
    },
    {
      "epoch": 0.5909510618651893,
      "grad_norm": 3.75547737356039,
      "learning_rate": 8.578302080682844e-06,
      "loss": 1.1657,
      "step": 1280
    },
    {
      "epoch": 0.5932594644506002,
      "grad_norm": 3.334058557259955,
      "learning_rate": 8.498572789979446e-06,
      "loss": 1.1653,
      "step": 1285
    },
    {
      "epoch": 0.5955678670360111,
      "grad_norm": 3.596795067568704,
      "learning_rate": 8.418941024735997e-06,
      "loss": 1.1909,
      "step": 1290
    },
    {
      "epoch": 0.5978762696214219,
      "grad_norm": 3.754106205642103,
      "learning_rate": 8.33941195744737e-06,
      "loss": 1.1595,
      "step": 1295
    },
    {
      "epoch": 0.6001846722068329,
      "grad_norm": 3.3575559431036988,
      "learning_rate": 8.259990753937662e-06,
      "loss": 1.1378,
      "step": 1300
    },
    {
      "epoch": 0.6024930747922438,
      "grad_norm": 4.011372021010383,
      "learning_rate": 8.18068257302466e-06,
      "loss": 1.1832,
      "step": 1305
    },
    {
      "epoch": 0.6048014773776547,
      "grad_norm": 3.404379906541828,
      "learning_rate": 8.101492566184757e-06,
      "loss": 1.1592,
      "step": 1310
    },
    {
      "epoch": 0.6071098799630655,
      "grad_norm": 3.498132121516362,
      "learning_rate": 8.022425877218321e-06,
      "loss": 1.1591,
      "step": 1315
    },
    {
      "epoch": 0.6094182825484764,
      "grad_norm": 3.523586580045349,
      "learning_rate": 7.943487641915595e-06,
      "loss": 1.1525,
      "step": 1320
    },
    {
      "epoch": 0.6117266851338874,
      "grad_norm": 3.6229726894839858,
      "learning_rate": 7.864682987723082e-06,
      "loss": 1.1618,
      "step": 1325
    },
    {
      "epoch": 0.6140350877192983,
      "grad_norm": 3.696989469787097,
      "learning_rate": 7.78601703341051e-06,
      "loss": 1.1824,
      "step": 1330
    },
    {
      "epoch": 0.6163434903047091,
      "grad_norm": 3.567967173775001,
      "learning_rate": 7.70749488873833e-06,
      "loss": 1.1792,
      "step": 1335
    },
    {
      "epoch": 0.61865189289012,
      "grad_norm": 3.399928397766497,
      "learning_rate": 7.629121654125808e-06,
      "loss": 1.1438,
      "step": 1340
    },
    {
      "epoch": 0.6209602954755309,
      "grad_norm": 3.6344006441397414,
      "learning_rate": 7.550902420319742e-06,
      "loss": 1.1591,
      "step": 1345
    },
    {
      "epoch": 0.6232686980609419,
      "grad_norm": 3.538106316840523,
      "learning_rate": 7.472842268063776e-06,
      "loss": 1.1311,
      "step": 1350
    },
    {
      "epoch": 0.6255771006463527,
      "grad_norm": 3.661558906665894,
      "learning_rate": 7.394946267768381e-06,
      "loss": 1.1621,
      "step": 1355
    },
    {
      "epoch": 0.6278855032317636,
      "grad_norm": 3.6197107279149954,
      "learning_rate": 7.317219479181517e-06,
      "loss": 1.1028,
      "step": 1360
    },
    {
      "epoch": 0.6301939058171745,
      "grad_norm": 3.4094252840241355,
      "learning_rate": 7.23966695105996e-06,
      "loss": 1.119,
      "step": 1365
    },
    {
      "epoch": 0.6325023084025854,
      "grad_norm": 3.4085855538144467,
      "learning_rate": 7.162293720841378e-06,
      "loss": 1.1438,
      "step": 1370
    },
    {
      "epoch": 0.6348107109879964,
      "grad_norm": 4.073406312500022,
      "learning_rate": 7.085104814317101e-06,
      "loss": 1.1729,
      "step": 1375
    },
    {
      "epoch": 0.6371191135734072,
      "grad_norm": 3.572178264074241,
      "learning_rate": 7.008105245305699e-06,
      "loss": 1.1661,
      "step": 1380
    },
    {
      "epoch": 0.6394275161588181,
      "grad_norm": 3.81528951625221,
      "learning_rate": 6.931300015327274e-06,
      "loss": 1.1571,
      "step": 1385
    },
    {
      "epoch": 0.641735918744229,
      "grad_norm": 3.2846636335941763,
      "learning_rate": 6.854694113278614e-06,
      "loss": 1.154,
      "step": 1390
    },
    {
      "epoch": 0.6440443213296398,
      "grad_norm": 3.2544013227776007,
      "learning_rate": 6.7782925151091224e-06,
      "loss": 1.0823,
      "step": 1395
    },
    {
      "epoch": 0.6463527239150508,
      "grad_norm": 3.482450898904014,
      "learning_rate": 6.702100183497613e-06,
      "loss": 1.1803,
      "step": 1400
    },
    {
      "epoch": 0.6486611265004617,
      "grad_norm": 3.412256349030684,
      "learning_rate": 6.62612206752995e-06,
      "loss": 1.1643,
      "step": 1405
    },
    {
      "epoch": 0.6509695290858726,
      "grad_norm": 3.75196899322532,
      "learning_rate": 6.550363102377588e-06,
      "loss": 1.1117,
      "step": 1410
    },
    {
      "epoch": 0.6532779316712835,
      "grad_norm": 3.3485189294016258,
      "learning_rate": 6.474828208976998e-06,
      "loss": 1.1466,
      "step": 1415
    },
    {
      "epoch": 0.6555863342566943,
      "grad_norm": 3.4421443761863104,
      "learning_rate": 6.3995222937100455e-06,
      "loss": 1.1468,
      "step": 1420
    },
    {
      "epoch": 0.6578947368421053,
      "grad_norm": 3.4653107797221683,
      "learning_rate": 6.324450248085265e-06,
      "loss": 1.1418,
      "step": 1425
    },
    {
      "epoch": 0.6602031394275162,
      "grad_norm": 3.450235228111911,
      "learning_rate": 6.249616948420161e-06,
      "loss": 1.1393,
      "step": 1430
    },
    {
      "epoch": 0.6625115420129271,
      "grad_norm": 3.648594332616919,
      "learning_rate": 6.175027255524446e-06,
      "loss": 1.1263,
      "step": 1435
    },
    {
      "epoch": 0.6648199445983379,
      "grad_norm": 3.50804118935427,
      "learning_rate": 6.100686014384315e-06,
      "loss": 1.1497,
      "step": 1440
    },
    {
      "epoch": 0.6671283471837488,
      "grad_norm": 3.407303145023877,
      "learning_rate": 6.026598053847743e-06,
      "loss": 1.1217,
      "step": 1445
    },
    {
      "epoch": 0.6694367497691598,
      "grad_norm": 3.6049741156075426,
      "learning_rate": 5.952768186310813e-06,
      "loss": 1.2134,
      "step": 1450
    },
    {
      "epoch": 0.6717451523545707,
      "grad_norm": 3.347553717603198,
      "learning_rate": 5.879201207405136e-06,
      "loss": 1.1189,
      "step": 1455
    },
    {
      "epoch": 0.6740535549399815,
      "grad_norm": 3.7624263901785087,
      "learning_rate": 5.805901895686344e-06,
      "loss": 1.1217,
      "step": 1460
    },
    {
      "epoch": 0.6763619575253924,
      "grad_norm": 3.6359056480115193,
      "learning_rate": 5.732875012323712e-06,
      "loss": 1.1275,
      "step": 1465
    },
    {
      "epoch": 0.6786703601108033,
      "grad_norm": 3.5660085050284946,
      "learning_rate": 5.660125300790873e-06,
      "loss": 1.153,
      "step": 1470
    },
    {
      "epoch": 0.6809787626962143,
      "grad_norm": 3.4188915438262946,
      "learning_rate": 5.58765748655772e-06,
      "loss": 1.126,
      "step": 1475
    },
    {
      "epoch": 0.6832871652816251,
      "grad_norm": 3.7409360766713995,
      "learning_rate": 5.5154762767834605e-06,
      "loss": 1.1312,
      "step": 1480
    },
    {
      "epoch": 0.685595567867036,
      "grad_norm": 3.5176838276710787,
      "learning_rate": 5.443586360010859e-06,
      "loss": 1.118,
      "step": 1485
    },
    {
      "epoch": 0.6879039704524469,
      "grad_norm": 3.940112737940071,
      "learning_rate": 5.3719924058616975e-06,
      "loss": 1.1084,
      "step": 1490
    },
    {
      "epoch": 0.6902123730378578,
      "grad_norm": 3.5725656073039516,
      "learning_rate": 5.30069906473345e-06,
      "loss": 1.1462,
      "step": 1495
    },
    {
      "epoch": 0.6925207756232687,
      "grad_norm": 3.585328985764251,
      "learning_rate": 5.2297109674972166e-06,
      "loss": 1.1275,
      "step": 1500
    },
    {
      "epoch": 0.6948291782086796,
      "grad_norm": 3.7150630899084276,
      "learning_rate": 5.159032725196946e-06,
      "loss": 1.1573,
      "step": 1505
    },
    {
      "epoch": 0.6971375807940905,
      "grad_norm": 3.4991531847637893,
      "learning_rate": 5.088668928749891e-06,
      "loss": 1.1339,
      "step": 1510
    },
    {
      "epoch": 0.6994459833795014,
      "grad_norm": 3.337315702796277,
      "learning_rate": 5.0186241486484245e-06,
      "loss": 1.1121,
      "step": 1515
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 3.166495977462906,
      "learning_rate": 4.948902934663158e-06,
      "loss": 1.1207,
      "step": 1520
    },
    {
      "epoch": 0.7040627885503232,
      "grad_norm": 3.269883473204096,
      "learning_rate": 4.879509815547413e-06,
      "loss": 1.1067,
      "step": 1525
    },
    {
      "epoch": 0.7063711911357341,
      "grad_norm": 3.2549683491943138,
      "learning_rate": 4.810449298743051e-06,
      "loss": 1.0858,
      "step": 1530
    },
    {
      "epoch": 0.708679593721145,
      "grad_norm": 3.673192940396545,
      "learning_rate": 4.741725870087693e-06,
      "loss": 1.1674,
      "step": 1535
    },
    {
      "epoch": 0.7109879963065558,
      "grad_norm": 3.295243146355197,
      "learning_rate": 4.673343993523347e-06,
      "loss": 1.1087,
      "step": 1540
    },
    {
      "epoch": 0.7132963988919667,
      "grad_norm": 3.4162872942710867,
      "learning_rate": 4.605308110806436e-06,
      "loss": 1.1224,
      "step": 1545
    },
    {
      "epoch": 0.7156048014773777,
      "grad_norm": 3.3989883160652865,
      "learning_rate": 4.537622641219309e-06,
      "loss": 1.1307,
      "step": 1550
    },
    {
      "epoch": 0.7179132040627886,
      "grad_norm": 3.2956559559454663,
      "learning_rate": 4.47029198128316e-06,
      "loss": 1.0944,
      "step": 1555
    },
    {
      "epoch": 0.7202216066481995,
      "grad_norm": 3.3797718456778765,
      "learning_rate": 4.403320504472463e-06,
      "loss": 1.1426,
      "step": 1560
    },
    {
      "epoch": 0.7225300092336103,
      "grad_norm": 3.1832339015639826,
      "learning_rate": 4.336712560930891e-06,
      "loss": 1.1223,
      "step": 1565
    },
    {
      "epoch": 0.7248384118190212,
      "grad_norm": 3.40273969921815,
      "learning_rate": 4.270472477188755e-06,
      "loss": 1.1151,
      "step": 1570
    },
    {
      "epoch": 0.7271468144044322,
      "grad_norm": 3.32953363908172,
      "learning_rate": 4.204604555881967e-06,
      "loss": 1.1055,
      "step": 1575
    },
    {
      "epoch": 0.7294552169898431,
      "grad_norm": 3.363759228103856,
      "learning_rate": 4.139113075472565e-06,
      "loss": 1.15,
      "step": 1580
    },
    {
      "epoch": 0.7317636195752539,
      "grad_norm": 3.5692625205390214,
      "learning_rate": 4.074002289970801e-06,
      "loss": 1.1249,
      "step": 1585
    },
    {
      "epoch": 0.7340720221606648,
      "grad_norm": 3.6298117912857895,
      "learning_rate": 4.009276428658836e-06,
      "loss": 1.0911,
      "step": 1590
    },
    {
      "epoch": 0.7363804247460757,
      "grad_norm": 3.501911680130801,
      "learning_rate": 3.944939695816005e-06,
      "loss": 1.0591,
      "step": 1595
    },
    {
      "epoch": 0.7386888273314867,
      "grad_norm": 3.314254645913856,
      "learning_rate": 3.8809962704457375e-06,
      "loss": 1.122,
      "step": 1600
    },
    {
      "epoch": 0.7409972299168975,
      "grad_norm": 3.56145944415269,
      "learning_rate": 3.81745030600411e-06,
      "loss": 1.1036,
      "step": 1605
    },
    {
      "epoch": 0.7433056325023084,
      "grad_norm": 3.4910849192084235,
      "learning_rate": 3.75430593013006e-06,
      "loss": 1.1353,
      "step": 1610
    },
    {
      "epoch": 0.7456140350877193,
      "grad_norm": 3.325715619787326,
      "learning_rate": 3.6915672443772644e-06,
      "loss": 1.1538,
      "step": 1615
    },
    {
      "epoch": 0.7479224376731302,
      "grad_norm": 3.5950013679874724,
      "learning_rate": 3.62923832394774e-06,
      "loss": 1.0909,
      "step": 1620
    },
    {
      "epoch": 0.7502308402585411,
      "grad_norm": 3.1524005532212334,
      "learning_rate": 3.56732321742712e-06,
      "loss": 1.1125,
      "step": 1625
    },
    {
      "epoch": 0.752539242843952,
      "grad_norm": 3.6760451234626124,
      "learning_rate": 3.5058259465216828e-06,
      "loss": 1.1039,
      "step": 1630
    },
    {
      "epoch": 0.7548476454293629,
      "grad_norm": 3.341546948891595,
      "learning_rate": 3.444750505797123e-06,
      "loss": 1.0531,
      "step": 1635
    },
    {
      "epoch": 0.7571560480147738,
      "grad_norm": 3.35627649123262,
      "learning_rate": 3.384100862419096e-06,
      "loss": 1.0931,
      "step": 1640
    },
    {
      "epoch": 0.7594644506001846,
      "grad_norm": 3.6221419833131527,
      "learning_rate": 3.3238809558955054e-06,
      "loss": 1.0797,
      "step": 1645
    },
    {
      "epoch": 0.7617728531855956,
      "grad_norm": 3.3487671296828267,
      "learning_rate": 3.2640946978206266e-06,
      "loss": 1.0812,
      "step": 1650
    },
    {
      "epoch": 0.7640812557710065,
      "grad_norm": 3.441031645390376,
      "learning_rate": 3.2047459716210306e-06,
      "loss": 1.1155,
      "step": 1655
    },
    {
      "epoch": 0.7663896583564174,
      "grad_norm": 3.4825057106301096,
      "learning_rate": 3.145838632303325e-06,
      "loss": 1.096,
      "step": 1660
    },
    {
      "epoch": 0.7686980609418282,
      "grad_norm": 3.4525699686491875,
      "learning_rate": 3.087376506203763e-06,
      "loss": 1.145,
      "step": 1665
    },
    {
      "epoch": 0.7710064635272391,
      "grad_norm": 3.2639030957505715,
      "learning_rate": 3.0293633907396903e-06,
      "loss": 1.0711,
      "step": 1670
    },
    {
      "epoch": 0.7733148661126501,
      "grad_norm": 3.247147491878351,
      "learning_rate": 2.971803054162903e-06,
      "loss": 1.0367,
      "step": 1675
    },
    {
      "epoch": 0.775623268698061,
      "grad_norm": 3.3628039668359824,
      "learning_rate": 2.914699235314855e-06,
      "loss": 1.1311,
      "step": 1680
    },
    {
      "epoch": 0.7779316712834718,
      "grad_norm": 3.294560766749018,
      "learning_rate": 2.858055643383818e-06,
      "loss": 1.1303,
      "step": 1685
    },
    {
      "epoch": 0.7802400738688827,
      "grad_norm": 3.252460881051861,
      "learning_rate": 2.8018759576639478e-06,
      "loss": 1.0894,
      "step": 1690
    },
    {
      "epoch": 0.7825484764542936,
      "grad_norm": 3.6541818791755083,
      "learning_rate": 2.7461638273162895e-06,
      "loss": 1.1416,
      "step": 1695
    },
    {
      "epoch": 0.7848568790397045,
      "grad_norm": 3.3018114290440286,
      "learning_rate": 2.6909228711317526e-06,
      "loss": 1.0898,
      "step": 1700
    },
    {
      "epoch": 0.7871652816251155,
      "grad_norm": 3.5110479717681704,
      "learning_rate": 2.6361566772960466e-06,
      "loss": 1.0887,
      "step": 1705
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 3.469571849173682,
      "learning_rate": 2.5818688031566132e-06,
      "loss": 1.0182,
      "step": 1710
    },
    {
      "epoch": 0.7917820867959372,
      "grad_norm": 3.761287355693432,
      "learning_rate": 2.5280627749915544e-06,
      "loss": 1.1246,
      "step": 1715
    },
    {
      "epoch": 0.7940904893813481,
      "grad_norm": 3.7171990367681866,
      "learning_rate": 2.4747420877805905e-06,
      "loss": 1.1008,
      "step": 1720
    },
    {
      "epoch": 0.796398891966759,
      "grad_norm": 3.583342537171837,
      "learning_rate": 2.421910204978033e-06,
      "loss": 1.092,
      "step": 1725
    },
    {
      "epoch": 0.7987072945521699,
      "grad_norm": 3.3105866570237343,
      "learning_rate": 2.369570558287819e-06,
      "loss": 1.0495,
      "step": 1730
    },
    {
      "epoch": 0.8010156971375808,
      "grad_norm": 3.453250654565143,
      "learning_rate": 2.3177265474406084e-06,
      "loss": 1.0952,
      "step": 1735
    },
    {
      "epoch": 0.8033240997229917,
      "grad_norm": 3.2111312681793294,
      "learning_rate": 2.2663815399729495e-06,
      "loss": 1.0756,
      "step": 1740
    },
    {
      "epoch": 0.8056325023084026,
      "grad_norm": 3.398739502823191,
      "learning_rate": 2.215538871008538e-06,
      "loss": 1.0855,
      "step": 1745
    },
    {
      "epoch": 0.8079409048938134,
      "grad_norm": 3.4089573083048883,
      "learning_rate": 2.1652018430415923e-06,
      "loss": 1.0707,
      "step": 1750
    },
    {
      "epoch": 0.8102493074792244,
      "grad_norm": 3.7996382043873744,
      "learning_rate": 2.115373725722326e-06,
      "loss": 1.1419,
      "step": 1755
    },
    {
      "epoch": 0.8125577100646353,
      "grad_norm": 3.4303103622199203,
      "learning_rate": 2.066057755644587e-06,
      "loss": 1.1101,
      "step": 1760
    },
    {
      "epoch": 0.8148661126500462,
      "grad_norm": 3.3758394994097363,
      "learning_rate": 2.0172571361356007e-06,
      "loss": 1.0975,
      "step": 1765
    },
    {
      "epoch": 0.817174515235457,
      "grad_norm": 3.2901551940425673,
      "learning_rate": 1.9689750370479134e-06,
      "loss": 1.0797,
      "step": 1770
    },
    {
      "epoch": 0.8194829178208679,
      "grad_norm": 3.661068632899665,
      "learning_rate": 1.921214594553488e-06,
      "loss": 1.1287,
      "step": 1775
    },
    {
      "epoch": 0.8217913204062789,
      "grad_norm": 3.5442080312978415,
      "learning_rate": 1.8739789109399954e-06,
      "loss": 1.1514,
      "step": 1780
    },
    {
      "epoch": 0.8240997229916898,
      "grad_norm": 3.3534741257777325,
      "learning_rate": 1.8272710544093019e-06,
      "loss": 1.0824,
      "step": 1785
    },
    {
      "epoch": 0.8264081255771006,
      "grad_norm": 3.570055818522298,
      "learning_rate": 1.7810940588781811e-06,
      "loss": 1.1313,
      "step": 1790
    },
    {
      "epoch": 0.8287165281625115,
      "grad_norm": 3.3907592881825352,
      "learning_rate": 1.7354509237812334e-06,
      "loss": 1.0458,
      "step": 1795
    },
    {
      "epoch": 0.8310249307479224,
      "grad_norm": 3.7660635086416794,
      "learning_rate": 1.690344613876066e-06,
      "loss": 1.109,
      "step": 1800
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 20.624336407323348,
      "learning_rate": 1.64577805905072e-06,
      "loss": 1.0872,
      "step": 1805
    },
    {
      "epoch": 0.8356417359187442,
      "grad_norm": 3.38434013035599,
      "learning_rate": 1.601754154133347e-06,
      "loss": 1.0943,
      "step": 1810
    },
    {
      "epoch": 0.8379501385041551,
      "grad_norm": 3.3282071197431318,
      "learning_rate": 1.558275758704183e-06,
      "loss": 1.0983,
      "step": 1815
    },
    {
      "epoch": 0.840258541089566,
      "grad_norm": 3.4156960745203286,
      "learning_rate": 1.5153456969098013e-06,
      "loss": 1.0381,
      "step": 1820
    },
    {
      "epoch": 0.8425669436749769,
      "grad_norm": 3.3418973703656274,
      "learning_rate": 1.4729667572796735e-06,
      "loss": 1.1452,
      "step": 1825
    },
    {
      "epoch": 0.8448753462603878,
      "grad_norm": 3.333897377962453,
      "learning_rate": 1.431141692545036e-06,
      "loss": 1.1076,
      "step": 1830
    },
    {
      "epoch": 0.8471837488457987,
      "grad_norm": 3.402941306050666,
      "learning_rate": 1.389873219460085e-06,
      "loss": 1.0869,
      "step": 1835
    },
    {
      "epoch": 0.8494921514312096,
      "grad_norm": 3.3313186519496423,
|
"learning_rate": 1.349164018625513e-06, |
|
"loss": 1.0765, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.8518005540166205, |
|
"grad_norm": 3.6011720414080566, |
|
"learning_rate": 1.3090167343143911e-06, |
|
"loss": 1.0846, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.8541089566020313, |
|
"grad_norm": 3.629326020817196, |
|
"learning_rate": 1.2694339743004037e-06, |
|
"loss": 1.1088, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.8564173591874423, |
|
"grad_norm": 3.6305906598709767, |
|
"learning_rate": 1.2304183096884626e-06, |
|
"loss": 1.0875, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.8587257617728532, |
|
"grad_norm": 3.35865168543221, |
|
"learning_rate": 1.1919722747477024e-06, |
|
"loss": 1.1143, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.8610341643582641, |
|
"grad_norm": 3.3889339992199177, |
|
"learning_rate": 1.1540983667468686e-06, |
|
"loss": 1.0916, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.863342566943675, |
|
"grad_norm": 3.3133014347890324, |
|
"learning_rate": 1.1167990457920985e-06, |
|
"loss": 1.0877, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.8656509695290858, |
|
"grad_norm": 3.415023896862017, |
|
"learning_rate": 1.0800767346671347e-06, |
|
"loss": 1.0284, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.8679593721144968, |
|
"grad_norm": 3.322962975732958, |
|
"learning_rate": 1.043933818675944e-06, |
|
"loss": 1.0782, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.8702677746999077, |
|
"grad_norm": 3.583896655771928, |
|
"learning_rate": 1.008372645487785e-06, |
|
"loss": 1.08, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.8725761772853186, |
|
"grad_norm": 3.3057678718948726, |
|
"learning_rate": 9.733955249847183e-07, |
|
"loss": 1.1034, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.8748845798707294, |
|
"grad_norm": 3.4387092657320997, |
|
"learning_rate": 9.390047291115567e-07, |
|
"loss": 1.0915, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.8771929824561403, |
|
"grad_norm": 3.8029482282950324, |
|
"learning_rate": 9.052024917282987e-07, |
|
"loss": 1.057, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8795013850415513, |
|
"grad_norm": 3.3990790831971465, |
|
"learning_rate": 8.719910084650262e-07, |
|
"loss": 1.0725, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.8818097876269622, |
|
"grad_norm": 3.262416726762208, |
|
"learning_rate": 8.393724365792866e-07, |
|
"loss": 1.1028, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.884118190212373, |
|
"grad_norm": 3.551691283783414, |
|
"learning_rate": 8.073488948159691e-07, |
|
"loss": 1.0546, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.8864265927977839, |
|
"grad_norm": 3.5211563130144197, |
|
"learning_rate": 7.759224632696793e-07, |
|
"loss": 1.1024, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.8887349953831948, |
|
"grad_norm": 3.5958803804208976, |
|
"learning_rate": 7.450951832496233e-07, |
|
"loss": 1.0698, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.8910433979686058, |
|
"grad_norm": 4.107811963680795, |
|
"learning_rate": 7.148690571470251e-07, |
|
"loss": 1.0613, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.8933518005540166, |
|
"grad_norm": 3.6280688174940416, |
|
"learning_rate": 6.852460483050494e-07, |
|
"loss": 1.0987, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.8956602031394275, |
|
"grad_norm": 3.4197153407779055, |
|
"learning_rate": 6.562280808912768e-07, |
|
"loss": 1.081, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.8979686057248384, |
|
"grad_norm": 3.3975321682078494, |
|
"learning_rate": 6.278170397727179e-07, |
|
"loss": 1.0881, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.9002770083102493, |
|
"grad_norm": 3.385824657440924, |
|
"learning_rate": 6.000147703933845e-07, |
|
"loss": 1.0725, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.9025854108956602, |
|
"grad_norm": 3.733106691108023, |
|
"learning_rate": 5.728230786544153e-07, |
|
"loss": 1.0886, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.9048938134810711, |
|
"grad_norm": 3.3831515124529288, |
|
"learning_rate": 5.46243730796776e-07, |
|
"loss": 1.0854, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.907202216066482, |
|
"grad_norm": 3.4106013139907065, |
|
"learning_rate": 5.202784532865302e-07, |
|
"loss": 1.114, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.9095106186518929, |
|
"grad_norm": 3.130011381325973, |
|
"learning_rate": 4.949289327026952e-07, |
|
"loss": 1.0873, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.9118190212373037, |
|
"grad_norm": 3.3600219468750394, |
|
"learning_rate": 4.7019681562769816e-07, |
|
"loss": 1.0689, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.9141274238227147, |
|
"grad_norm": 3.379655670825615, |
|
"learning_rate": 4.460837085404113e-07, |
|
"loss": 1.0874, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.9164358264081256, |
|
"grad_norm": 3.324809563310868, |
|
"learning_rate": 4.225911777118097e-07, |
|
"loss": 1.0894, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.9187442289935365, |
|
"grad_norm": 3.4668181744618196, |
|
"learning_rate": 3.9972074910323066e-07, |
|
"loss": 1.0896, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.9210526315789473, |
|
"grad_norm": 3.4175120046363276, |
|
"learning_rate": 3.7747390826725736e-07, |
|
"loss": 1.0608, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.9233610341643582, |
|
"grad_norm": 3.365932789028912, |
|
"learning_rate": 3.5585210025122166e-07, |
|
"loss": 1.0465, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9256694367497692, |
|
"grad_norm": 3.3721429412301442, |
|
"learning_rate": 3.3485672950334447e-07, |
|
"loss": 1.0782, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.9279778393351801, |
|
"grad_norm": 3.402893452692765, |
|
"learning_rate": 3.1448915978150365e-07, |
|
"loss": 1.0575, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.930286241920591, |
|
"grad_norm": 3.3246351042614606, |
|
"learning_rate": 2.947507140646588e-07, |
|
"loss": 1.093, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.9325946445060018, |
|
"grad_norm": 3.42392243323848, |
|
"learning_rate": 2.756426744669105e-07, |
|
"loss": 1.0709, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.9349030470914127, |
|
"grad_norm": 3.3870385964627565, |
|
"learning_rate": 2.57166282154222e-07, |
|
"loss": 1.0944, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.9372114496768237, |
|
"grad_norm": 3.4345800654530128, |
|
"learning_rate": 2.393227372638018e-07, |
|
"loss": 1.0829, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.9395198522622346, |
|
"grad_norm": 3.2304527099741094, |
|
"learning_rate": 2.221131988261438e-07, |
|
"loss": 1.0663, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.9418282548476454, |
|
"grad_norm": 3.4212248154000324, |
|
"learning_rate": 2.055387846897472e-07, |
|
"loss": 1.0608, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.9441366574330563, |
|
"grad_norm": 3.3424495231710356, |
|
"learning_rate": 1.8960057144850163e-07, |
|
"loss": 1.0513, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.9464450600184672, |
|
"grad_norm": 8.42913586604929, |
|
"learning_rate": 1.742995943717607e-07, |
|
"loss": 1.0698, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.9487534626038782, |
|
"grad_norm": 4.03605470816158, |
|
"learning_rate": 1.5963684733709462e-07, |
|
"loss": 1.0787, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.951061865189289, |
|
"grad_norm": 3.572766321551919, |
|
"learning_rate": 1.4561328276573415e-07, |
|
"loss": 1.0625, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.9533702677746999, |
|
"grad_norm": 3.213406555168112, |
|
"learning_rate": 1.3222981156070126e-07, |
|
"loss": 1.0861, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.9556786703601108, |
|
"grad_norm": 3.216022210724082, |
|
"learning_rate": 1.1948730304764622e-07, |
|
"loss": 1.0572, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.9579870729455217, |
|
"grad_norm": 3.8142801195990237, |
|
"learning_rate": 1.073865849183786e-07, |
|
"loss": 1.1151, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.9602954755309326, |
|
"grad_norm": 3.2011503381896413, |
|
"learning_rate": 9.592844317710238e-08, |
|
"loss": 1.0585, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.9626038781163435, |
|
"grad_norm": 3.3780038857652226, |
|
"learning_rate": 8.511362208936447e-08, |
|
"loss": 1.0591, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.9649122807017544, |
|
"grad_norm": 3.3212612452494295, |
|
"learning_rate": 7.494282413371135e-08, |
|
"loss": 1.0787, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.9672206832871653, |
|
"grad_norm": 3.797857330316498, |
|
"learning_rate": 6.541670995605321e-08, |
|
"loss": 1.0859, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.9695290858725761, |
|
"grad_norm": 3.153773745189338, |
|
"learning_rate": 5.653589832675943e-08, |
|
"loss": 1.0983, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9718374884579871, |
|
"grad_norm": 3.4652822167549906, |
|
"learning_rate": 4.830096610045854e-08, |
|
"loss": 1.0713, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.974145891043398, |
|
"grad_norm": 3.6601967632905796, |
|
"learning_rate": 4.071244817857589e-08, |
|
"loss": 1.1118, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.9764542936288089, |
|
"grad_norm": 3.135385406063897, |
|
"learning_rate": 3.3770837474584874e-08, |
|
"loss": 1.072, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.9787626962142197, |
|
"grad_norm": 3.4884714571677784, |
|
"learning_rate": 2.747658488199023e-08, |
|
"loss": 1.0738, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.9810710987996306, |
|
"grad_norm": 3.7803263448925706, |
|
"learning_rate": 2.1830099245040427e-08, |
|
"loss": 1.0549, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.9833795013850416, |
|
"grad_norm": 3.3045019552603585, |
|
"learning_rate": 1.683174733216997e-08, |
|
"loss": 1.1129, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.9856879039704525, |
|
"grad_norm": 3.2212668180182784, |
|
"learning_rate": 1.248185381217848e-08, |
|
"loss": 1.0777, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.9879963065558633, |
|
"grad_norm": 3.324768260260177, |
|
"learning_rate": 8.780701233139789e-09, |
|
"loss": 1.0503, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.9903047091412742, |
|
"grad_norm": 3.214869100486745, |
|
"learning_rate": 5.728530004051047e-09, |
|
"loss": 1.0367, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.9926131117266851, |
|
"grad_norm": 3.3583215428853666, |
|
"learning_rate": 3.325538379211901e-09, |
|
"loss": 1.0554, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.9949215143120961, |
|
"grad_norm": 4.075445923312751, |
|
"learning_rate": 1.5718824453525572e-09, |
|
"loss": 1.1222, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.997229916897507, |
|
"grad_norm": 3.3599147688364903, |
|
"learning_rate": 4.676761114941197e-10, |
|
"loss": 1.0646, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.9995383194829178, |
|
"grad_norm": 3.4496692841727543, |
|
"learning_rate": 1.2991101545622998e-11, |
|
"loss": 1.1038, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.1177629232406616, |
|
"eval_runtime": 1154.8442, |
|
"eval_samples_per_second": 26.579, |
|
"eval_steps_per_second": 0.831, |
|
"step": 2166 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2166, |
|
"total_flos": 113379083550720.0, |
|
"train_loss": 1.171416565762112, |
|
"train_runtime": 11018.7418, |
|
"train_samples_per_second": 6.29, |
|
"train_steps_per_second": 0.197 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2166, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 113379083550720.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |