{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999342321604735, "eval_steps": 500, "global_step": 7602, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013153567905294311, "grad_norm": 0.4799177944660187, "learning_rate": 2.628120893561104e-07, "loss": 1.4291, "step": 1 }, { "epoch": 0.0006576783952647156, "grad_norm": 0.45898500084877014, "learning_rate": 1.314060446780552e-06, "loss": 1.3658, "step": 5 }, { "epoch": 0.0013153567905294311, "grad_norm": 0.4707576632499695, "learning_rate": 2.628120893561104e-06, "loss": 1.3594, "step": 10 }, { "epoch": 0.0019730351857941467, "grad_norm": 0.45364558696746826, "learning_rate": 3.942181340341656e-06, "loss": 1.3382, "step": 15 }, { "epoch": 0.0026307135810588623, "grad_norm": 0.5603415966033936, "learning_rate": 5.256241787122208e-06, "loss": 1.3606, "step": 20 }, { "epoch": 0.003288391976323578, "grad_norm": 0.6410068869590759, "learning_rate": 6.5703022339027605e-06, "loss": 1.3912, "step": 25 }, { "epoch": 0.003946070371588293, "grad_norm": 0.33453282713890076, "learning_rate": 7.884362680683311e-06, "loss": 1.3794, "step": 30 }, { "epoch": 0.0046037487668530086, "grad_norm": 0.18057125806808472, "learning_rate": 9.198423127463864e-06, "loss": 1.3199, "step": 35 }, { "epoch": 0.005261427162117725, "grad_norm": 0.17045703530311584, "learning_rate": 1.0512483574244416e-05, "loss": 1.3108, "step": 40 }, { "epoch": 0.00591910555738244, "grad_norm": 0.22115591168403625, "learning_rate": 1.1826544021024969e-05, "loss": 1.3122, "step": 45 }, { "epoch": 0.006576783952647156, "grad_norm": 0.15453889966011047, "learning_rate": 1.3140604467805521e-05, "loss": 1.2511, "step": 50 }, { "epoch": 0.007234462347911871, "grad_norm": 0.1349402368068695, "learning_rate": 1.4454664914586072e-05, "loss": 1.2744, "step": 55 }, { "epoch": 0.007892140743176587, "grad_norm": 0.15142956376075745, "learning_rate": 1.5768725361366622e-05, "loss": 1.3097, "step": 60 }, { "epoch": 0.008549819138441302, "grad_norm": 0.15905340015888214, "learning_rate": 1.7082785808147177e-05, "loss": 1.2587, "step": 65 }, { "epoch": 0.009207497533706017, "grad_norm": 0.15091343224048615, "learning_rate": 1.8396846254927727e-05, "loss": 1.273, "step": 70 }, { "epoch": 0.009865175928970734, "grad_norm": 0.1557701677083969, "learning_rate": 1.9710906701708278e-05, "loss": 1.2358, "step": 75 }, { "epoch": 0.01052285432423545, "grad_norm": 0.14513249695301056, "learning_rate": 2.1024967148488832e-05, "loss": 1.2053, "step": 80 }, { "epoch": 0.011180532719500164, "grad_norm": 0.1486293226480484, "learning_rate": 2.2339027595269383e-05, "loss": 1.2114, "step": 85 }, { "epoch": 0.01183821111476488, "grad_norm": 0.1422361135482788, "learning_rate": 2.3653088042049937e-05, "loss": 1.2254, "step": 90 }, { "epoch": 0.012495889510029596, "grad_norm": 0.14687830209732056, "learning_rate": 2.4967148488830488e-05, "loss": 1.2639, "step": 95 }, { "epoch": 0.013153567905294311, "grad_norm": 0.13631942868232727, "learning_rate": 2.6281208935611042e-05, "loss": 1.2598, "step": 100 }, { "epoch": 0.013811246300559027, "grad_norm": 0.14525571465492249, "learning_rate": 2.7595269382391593e-05, "loss": 1.2044, "step": 105 }, { "epoch": 0.014468924695823742, "grad_norm": 0.13871049880981445, "learning_rate": 2.8909329829172143e-05, "loss": 1.1883, "step": 110 }, { "epoch": 0.015126603091088459, "grad_norm": 0.14974988996982574, "learning_rate": 3.0223390275952694e-05, "loss": 1.2094, "step": 115 }, { "epoch": 0.015784281486353174, "grad_norm": 0.15352840721607208, "learning_rate": 3.1537450722733245e-05, "loss": 1.1789, "step": 120 }, { "epoch": 0.01644195988161789, "grad_norm": 0.15584270656108856, "learning_rate": 3.2851511169513796e-05, "loss": 1.1763, "step": 125 }, { "epoch": 0.017099638276882604, "grad_norm": 0.29917633533477783, "learning_rate": 3.416557161629435e-05, "loss": 1.1161, "step": 130 }, { "epoch": 0.01775731667214732, "grad_norm": 0.14615362882614136, "learning_rate": 3.5479632063074904e-05, "loss": 1.1634, "step": 135 }, { "epoch": 0.018414995067412034, "grad_norm": 0.15577350556850433, "learning_rate": 3.6793692509855455e-05, "loss": 1.1472, "step": 140 }, { "epoch": 0.01907267346267675, "grad_norm": 0.17378628253936768, "learning_rate": 3.8107752956636005e-05, "loss": 1.1386, "step": 145 }, { "epoch": 0.019730351857941468, "grad_norm": 0.16961731016635895, "learning_rate": 3.9421813403416556e-05, "loss": 1.1962, "step": 150 }, { "epoch": 0.020388030253206183, "grad_norm": 0.16368459165096283, "learning_rate": 4.073587385019711e-05, "loss": 1.1726, "step": 155 }, { "epoch": 0.0210457086484709, "grad_norm": 0.1629062443971634, "learning_rate": 4.2049934296977664e-05, "loss": 1.1509, "step": 160 }, { "epoch": 0.021703387043735613, "grad_norm": 0.16831117868423462, "learning_rate": 4.3363994743758215e-05, "loss": 1.2265, "step": 165 }, { "epoch": 0.02236106543900033, "grad_norm": 0.15755963325500488, "learning_rate": 4.4678055190538766e-05, "loss": 1.1464, "step": 170 }, { "epoch": 0.023018743834265044, "grad_norm": 0.18842089176177979, "learning_rate": 4.5992115637319317e-05, "loss": 1.1707, "step": 175 }, { "epoch": 0.02367642222952976, "grad_norm": 0.1670859456062317, "learning_rate": 4.7306176084099874e-05, "loss": 1.1807, "step": 180 }, { "epoch": 0.024334100624794474, "grad_norm": 0.15638850629329681, "learning_rate": 4.8620236530880425e-05, "loss": 1.1935, "step": 185 }, { "epoch": 0.024991779020059193, "grad_norm": 0.16719070076942444, "learning_rate": 4.9934296977660976e-05, "loss": 1.1525, "step": 190 }, { "epoch": 0.025649457415323908, "grad_norm": 0.17287735641002655, "learning_rate": 5.1248357424441526e-05, "loss": 1.1524, "step": 195 }, { "epoch": 0.026307135810588623, "grad_norm": 0.18613961338996887, "learning_rate": 5.2562417871222084e-05, "loss": 1.1958, "step": 200 }, { "epoch": 0.026964814205853338, "grad_norm": 0.16728590428829193, "learning_rate": 5.3876478318002635e-05, "loss": 1.1276, "step": 205 }, { "epoch": 0.027622492601118053, "grad_norm": 0.18466223776340485, "learning_rate": 5.5190538764783185e-05, "loss": 1.189, "step": 210 }, { "epoch": 0.02828017099638277, "grad_norm": 0.1594204306602478, "learning_rate": 5.6504599211563736e-05, "loss": 1.1539, "step": 215 }, { "epoch": 0.028937849391647483, "grad_norm": 0.17637033760547638, "learning_rate": 5.781865965834429e-05, "loss": 1.2125, "step": 220 }, { "epoch": 0.0295955277869122, "grad_norm": 0.18302315473556519, "learning_rate": 5.913272010512484e-05, "loss": 1.1749, "step": 225 }, { "epoch": 0.030253206182176917, "grad_norm": 0.17645341157913208, "learning_rate": 6.044678055190539e-05, "loss": 1.1166, "step": 230 }, { "epoch": 0.030910884577441632, "grad_norm": 0.17943893373012543, "learning_rate": 6.176084099868594e-05, "loss": 1.1059, "step": 235 }, { "epoch": 0.03156856297270635, "grad_norm": 0.1531655490398407, "learning_rate": 6.307490144546649e-05, "loss": 1.1292, "step": 240 }, { "epoch": 0.03222624136797106, "grad_norm": 0.17501339316368103, "learning_rate": 6.438896189224704e-05, "loss": 1.1318, "step": 245 }, { "epoch": 0.03288391976323578, "grad_norm": 0.15884102880954742, "learning_rate": 6.570302233902759e-05, "loss": 1.1356, "step": 250 }, { "epoch": 0.033541598158500496, "grad_norm": 0.18064704537391663, "learning_rate": 6.701708278580816e-05, "loss": 1.1633, "step": 255 }, { "epoch": 0.03419927655376521, "grad_norm": 0.18719318509101868, "learning_rate": 6.83311432325887e-05, "loss": 1.1549, "step": 260 }, { "epoch": 0.03485695494902993, "grad_norm": 0.16811800003051758, "learning_rate": 6.964520367936926e-05, "loss": 1.1272, "step": 265 }, { "epoch": 0.03551463334429464, "grad_norm": 0.16309702396392822, "learning_rate": 7.095926412614981e-05, "loss": 1.1014, "step": 270 }, { "epoch": 0.03617231173955936, "grad_norm": 0.16379152238368988, "learning_rate": 7.227332457293036e-05, "loss": 1.0398, "step": 275 }, { "epoch": 0.03682999013482407, "grad_norm": 0.16131411492824554, "learning_rate": 7.358738501971091e-05, "loss": 1.1617, "step": 280 }, { "epoch": 0.03748766853008879, "grad_norm": 0.18855397403240204, "learning_rate": 7.490144546649146e-05, "loss": 1.1225, "step": 285 }, { "epoch": 0.0381453469253535, "grad_norm": 0.15875910222530365, "learning_rate": 7.621550591327201e-05, "loss": 1.108, "step": 290 }, { "epoch": 0.03880302532061822, "grad_norm": 0.17261070013046265, "learning_rate": 7.752956636005256e-05, "loss": 1.1385, "step": 295 }, { "epoch": 0.039460703715882936, "grad_norm": 0.17602774500846863, "learning_rate": 7.884362680683311e-05, "loss": 1.1339, "step": 300 }, { "epoch": 0.04011838211114765, "grad_norm": 0.15580470860004425, "learning_rate": 8.015768725361366e-05, "loss": 1.1182, "step": 305 }, { "epoch": 0.040776060506412366, "grad_norm": 0.18294861912727356, "learning_rate": 8.147174770039421e-05, "loss": 1.091, "step": 310 }, { "epoch": 0.04143373890167708, "grad_norm": 0.1641063541173935, "learning_rate": 8.278580814717476e-05, "loss": 1.1292, "step": 315 }, { "epoch": 0.0420914172969418, "grad_norm": 0.15849268436431885, "learning_rate": 8.409986859395533e-05, "loss": 1.1542, "step": 320 }, { "epoch": 0.04274909569220651, "grad_norm": 0.15354867279529572, "learning_rate": 8.541392904073588e-05, "loss": 1.1646, "step": 325 }, { "epoch": 0.04340677408747123, "grad_norm": 0.15696944296360016, "learning_rate": 8.672798948751643e-05, "loss": 1.1679, "step": 330 }, { "epoch": 0.044064452482735945, "grad_norm": 0.20342762768268585, "learning_rate": 8.804204993429698e-05, "loss": 1.1806, "step": 335 }, { "epoch": 0.04472213087800066, "grad_norm": 0.1570473313331604, "learning_rate": 8.935611038107753e-05, "loss": 1.1565, "step": 340 }, { "epoch": 0.045379809273265376, "grad_norm": 0.15503478050231934, "learning_rate": 9.067017082785808e-05, "loss": 1.1796, "step": 345 }, { "epoch": 0.04603748766853009, "grad_norm": 0.20772993564605713, "learning_rate": 9.198423127463863e-05, "loss": 1.2196, "step": 350 }, { "epoch": 0.046695166063794806, "grad_norm": 0.15664535760879517, "learning_rate": 9.329829172141918e-05, "loss": 1.1418, "step": 355 }, { "epoch": 0.04735284445905952, "grad_norm": 0.15098999440670013, "learning_rate": 9.461235216819975e-05, "loss": 1.1951, "step": 360 }, { "epoch": 0.048010522854324236, "grad_norm": 0.14299260079860687, "learning_rate": 9.59264126149803e-05, "loss": 1.146, "step": 365 }, { "epoch": 0.04866820124958895, "grad_norm": 0.14739377796649933, "learning_rate": 9.724047306176085e-05, "loss": 1.1514, "step": 370 }, { "epoch": 0.04932587964485367, "grad_norm": 0.15700824558734894, "learning_rate": 9.85545335085414e-05, "loss": 1.1065, "step": 375 }, { "epoch": 0.049983558040118385, "grad_norm": 0.14551231265068054, "learning_rate": 9.986859395532195e-05, "loss": 1.1613, "step": 380 }, { "epoch": 0.0506412364353831, "grad_norm": 0.14373187720775604, "learning_rate": 0.0001011826544021025, "loss": 1.1132, "step": 385 }, { "epoch": 0.051298914830647815, "grad_norm": 0.13231082260608673, "learning_rate": 0.00010249671484888305, "loss": 1.1639, "step": 390 }, { "epoch": 0.05195659322591253, "grad_norm": 0.14773692190647125, "learning_rate": 0.0001038107752956636, "loss": 1.1646, "step": 395 }, { "epoch": 0.052614271621177246, "grad_norm": 0.14072872698307037, "learning_rate": 0.00010512483574244417, "loss": 1.1914, "step": 400 }, { "epoch": 0.05327195001644196, "grad_norm": 0.1455809473991394, "learning_rate": 0.00010643889618922472, "loss": 1.1632, "step": 405 }, { "epoch": 0.053929628411706676, "grad_norm": 0.15249772369861603, "learning_rate": 0.00010775295663600527, "loss": 1.1416, "step": 410 }, { "epoch": 0.05458730680697139, "grad_norm": 0.1612495630979538, "learning_rate": 0.00010906701708278582, "loss": 1.1134, "step": 415 }, { "epoch": 0.055244985202236106, "grad_norm": 0.1365536004304886, "learning_rate": 0.00011038107752956637, "loss": 1.1524, "step": 420 }, { "epoch": 0.055902663597500825, "grad_norm": 0.15040118992328644, "learning_rate": 0.00011169513797634692, "loss": 1.1665, "step": 425 }, { "epoch": 0.05656034199276554, "grad_norm": 0.14957711100578308, "learning_rate": 0.00011300919842312747, "loss": 1.1422, "step": 430 }, { "epoch": 0.057218020388030255, "grad_norm": 0.14276188611984253, "learning_rate": 0.00011432325886990802, "loss": 1.1059, "step": 435 }, { "epoch": 0.05787569878329497, "grad_norm": 0.14642347395420074, "learning_rate": 0.00011563731931668857, "loss": 1.0887, "step": 440 }, { "epoch": 0.058533377178559685, "grad_norm": 0.14467675983905792, "learning_rate": 0.00011695137976346912, "loss": 1.1074, "step": 445 }, { "epoch": 0.0591910555738244, "grad_norm": 0.13578906655311584, "learning_rate": 0.00011826544021024967, "loss": 1.1606, "step": 450 }, { "epoch": 0.059848733969089116, "grad_norm": 0.16568222641944885, "learning_rate": 0.00011957950065703023, "loss": 1.0569, "step": 455 }, { "epoch": 0.060506412364353834, "grad_norm": 0.1401658058166504, "learning_rate": 0.00012089356110381078, "loss": 1.1312, "step": 460 }, { "epoch": 0.061164090759618546, "grad_norm": 0.1948261708021164, "learning_rate": 0.00012220762155059134, "loss": 1.1269, "step": 465 }, { "epoch": 0.061821769154883265, "grad_norm": 0.1355002373456955, "learning_rate": 0.00012352168199737188, "loss": 1.1555, "step": 470 }, { "epoch": 0.062479447550147976, "grad_norm": 0.1482207477092743, "learning_rate": 0.00012483574244415244, "loss": 1.1414, "step": 475 }, { "epoch": 0.0631371259454127, "grad_norm": 0.1601637899875641, "learning_rate": 0.00012614980289093298, "loss": 1.1644, "step": 480 }, { "epoch": 0.06379480434067741, "grad_norm": 0.1337427943944931, "learning_rate": 0.00012746386333771354, "loss": 1.1487, "step": 485 }, { "epoch": 0.06445248273594212, "grad_norm": 0.1297510266304016, "learning_rate": 0.00012877792378449408, "loss": 1.123, "step": 490 }, { "epoch": 0.06511016113120684, "grad_norm": 0.13838523626327515, "learning_rate": 0.00013009198423127465, "loss": 1.1165, "step": 495 }, { "epoch": 0.06576783952647156, "grad_norm": 0.13319997489452362, "learning_rate": 0.00013140604467805518, "loss": 1.1146, "step": 500 }, { "epoch": 0.06642551792173627, "grad_norm": 0.13041220605373383, "learning_rate": 0.00013272010512483575, "loss": 1.1105, "step": 505 }, { "epoch": 0.06708319631700099, "grad_norm": 0.13738314807415009, "learning_rate": 0.0001340341655716163, "loss": 1.1024, "step": 510 }, { "epoch": 0.0677408747122657, "grad_norm": 0.13976356387138367, "learning_rate": 0.00013534822601839685, "loss": 1.1287, "step": 515 }, { "epoch": 0.06839855310753042, "grad_norm": 0.12786519527435303, "learning_rate": 0.0001366622864651774, "loss": 1.1212, "step": 520 }, { "epoch": 0.06905623150279513, "grad_norm": 0.1392868459224701, "learning_rate": 0.00013797634691195795, "loss": 1.126, "step": 525 }, { "epoch": 0.06971390989805985, "grad_norm": 0.14690658450126648, "learning_rate": 0.00013929040735873851, "loss": 1.1136, "step": 530 }, { "epoch": 0.07037158829332456, "grad_norm": 0.13067905604839325, "learning_rate": 0.00014060446780551905, "loss": 1.1154, "step": 535 }, { "epoch": 0.07102926668858928, "grad_norm": 0.1261102259159088, "learning_rate": 0.00014191852825229962, "loss": 1.1557, "step": 540 }, { "epoch": 0.071686945083854, "grad_norm": 0.13624908030033112, "learning_rate": 0.00014323258869908015, "loss": 1.1458, "step": 545 }, { "epoch": 0.07234462347911871, "grad_norm": 0.13739381730556488, "learning_rate": 0.00014454664914586072, "loss": 1.1581, "step": 550 }, { "epoch": 0.07300230187438343, "grad_norm": 0.14173921942710876, "learning_rate": 0.00014586070959264125, "loss": 1.1516, "step": 555 }, { "epoch": 0.07365998026964814, "grad_norm": 0.13173076510429382, "learning_rate": 0.00014717477003942182, "loss": 1.1597, "step": 560 }, { "epoch": 0.07431765866491286, "grad_norm": 0.12557290494441986, "learning_rate": 0.00014848883048620236, "loss": 1.0877, "step": 565 }, { "epoch": 0.07497533706017757, "grad_norm": 0.13105574250221252, "learning_rate": 0.00014980289093298292, "loss": 1.1254, "step": 570 }, { "epoch": 0.07563301545544229, "grad_norm": 0.13010239601135254, "learning_rate": 0.00015111695137976348, "loss": 1.0972, "step": 575 }, { "epoch": 0.076290693850707, "grad_norm": 0.12758758664131165, "learning_rate": 0.00015243101182654402, "loss": 1.1419, "step": 580 }, { "epoch": 0.07694837224597172, "grad_norm": 0.12518388032913208, "learning_rate": 0.00015374507227332459, "loss": 1.1434, "step": 585 }, { "epoch": 0.07760605064123643, "grad_norm": 0.14056430757045746, "learning_rate": 0.00015505913272010512, "loss": 1.1813, "step": 590 }, { "epoch": 0.07826372903650115, "grad_norm": 0.136140838265419, "learning_rate": 0.0001563731931668857, "loss": 1.142, "step": 595 }, { "epoch": 0.07892140743176587, "grad_norm": 0.15038864314556122, "learning_rate": 0.00015768725361366622, "loss": 1.1603, "step": 600 }, { "epoch": 0.07957908582703058, "grad_norm": 0.12900781631469727, "learning_rate": 0.0001590013140604468, "loss": 1.1768, "step": 605 }, { "epoch": 0.0802367642222953, "grad_norm": 0.1280013471841812, "learning_rate": 0.00016031537450722733, "loss": 1.1281, "step": 610 }, { "epoch": 0.08089444261756001, "grad_norm": 0.1196935772895813, "learning_rate": 0.0001616294349540079, "loss": 1.1317, "step": 615 }, { "epoch": 0.08155212101282473, "grad_norm": 0.13040530681610107, "learning_rate": 0.00016294349540078843, "loss": 1.1047, "step": 620 }, { "epoch": 0.08220979940808945, "grad_norm": 0.12405379116535187, "learning_rate": 0.000164257555847569, "loss": 1.1208, "step": 625 }, { "epoch": 0.08286747780335416, "grad_norm": 0.13238492608070374, "learning_rate": 0.00016557161629434953, "loss": 1.046, "step": 630 }, { "epoch": 0.08352515619861887, "grad_norm": 0.12712593376636505, "learning_rate": 0.0001668856767411301, "loss": 1.1405, "step": 635 }, { "epoch": 0.0841828345938836, "grad_norm": 0.1272333562374115, "learning_rate": 0.00016819973718791066, "loss": 1.1087, "step": 640 }, { "epoch": 0.08484051298914831, "grad_norm": 0.11601302027702332, "learning_rate": 0.0001695137976346912, "loss": 1.084, "step": 645 }, { "epoch": 0.08549819138441302, "grad_norm": 0.12852762639522552, "learning_rate": 0.00017082785808147176, "loss": 1.1118, "step": 650 }, { "epoch": 0.08615586977967774, "grad_norm": 0.1362471580505371, "learning_rate": 0.0001721419185282523, "loss": 1.0933, "step": 655 }, { "epoch": 0.08681354817494245, "grad_norm": 0.13344067335128784, "learning_rate": 0.00017345597897503286, "loss": 1.1748, "step": 660 }, { "epoch": 0.08747122657020717, "grad_norm": 0.13904009759426117, "learning_rate": 0.0001747700394218134, "loss": 1.177, "step": 665 }, { "epoch": 0.08812890496547189, "grad_norm": 0.1249956339597702, "learning_rate": 0.00017608409986859396, "loss": 1.0976, "step": 670 }, { "epoch": 0.0887865833607366, "grad_norm": 0.12863720953464508, "learning_rate": 0.0001773981603153745, "loss": 1.0997, "step": 675 }, { "epoch": 0.08944426175600131, "grad_norm": 0.13097426295280457, "learning_rate": 0.00017871222076215506, "loss": 1.1402, "step": 680 }, { "epoch": 0.09010194015126603, "grad_norm": 0.12504078447818756, "learning_rate": 0.0001800262812089356, "loss": 1.1226, "step": 685 }, { "epoch": 0.09075961854653075, "grad_norm": 0.12640883028507233, "learning_rate": 0.00018134034165571616, "loss": 1.1206, "step": 690 }, { "epoch": 0.09141729694179546, "grad_norm": 0.1491176187992096, "learning_rate": 0.0001826544021024967, "loss": 1.1073, "step": 695 }, { "epoch": 0.09207497533706017, "grad_norm": 0.12444590032100677, "learning_rate": 0.00018396846254927727, "loss": 1.0999, "step": 700 }, { "epoch": 0.0927326537323249, "grad_norm": 0.11708039045333862, "learning_rate": 0.0001852825229960578, "loss": 1.126, "step": 705 }, { "epoch": 0.09339033212758961, "grad_norm": 0.12827545404434204, "learning_rate": 0.00018659658344283837, "loss": 1.1032, "step": 710 }, { "epoch": 0.09404801052285433, "grad_norm": 0.1356835514307022, "learning_rate": 0.00018791064388961893, "loss": 1.1592, "step": 715 }, { "epoch": 0.09470568891811904, "grad_norm": 0.12291291356086731, "learning_rate": 0.0001892247043363995, "loss": 1.1341, "step": 720 }, { "epoch": 0.09536336731338375, "grad_norm": 0.12390495836734772, "learning_rate": 0.00019053876478318003, "loss": 1.1511, "step": 725 }, { "epoch": 0.09602104570864847, "grad_norm": 0.11830148845911026, "learning_rate": 0.0001918528252299606, "loss": 1.1327, "step": 730 }, { "epoch": 0.09667872410391319, "grad_norm": 0.12577317655086517, "learning_rate": 0.00019316688567674116, "loss": 1.165, "step": 735 }, { "epoch": 0.0973364024991779, "grad_norm": 0.12031582742929459, "learning_rate": 0.0001944809461235217, "loss": 1.0973, "step": 740 }, { "epoch": 0.09799408089444261, "grad_norm": 0.13613493740558624, "learning_rate": 0.00019579500657030226, "loss": 1.1013, "step": 745 }, { "epoch": 0.09865175928970733, "grad_norm": 0.12341351062059402, "learning_rate": 0.0001971090670170828, "loss": 1.1239, "step": 750 }, { "epoch": 0.09930943768497205, "grad_norm": 0.11620868742465973, "learning_rate": 0.00019842312746386336, "loss": 1.0826, "step": 755 }, { "epoch": 0.09996711608023677, "grad_norm": 0.13111752271652222, "learning_rate": 0.0001997371879106439, "loss": 1.0975, "step": 760 }, { "epoch": 0.10062479447550148, "grad_norm": 0.11321892589330673, "learning_rate": 0.00019999983128626912, "loss": 1.0441, "step": 765 }, { "epoch": 0.1012824728707662, "grad_norm": 0.12926174700260162, "learning_rate": 0.00019999914588771304, "loss": 1.1259, "step": 770 }, { "epoch": 0.10194015126603091, "grad_norm": 0.1263921558856964, "learning_rate": 0.00019999793326333444, "loss": 1.1246, "step": 775 }, { "epoch": 0.10259782966129563, "grad_norm": 0.1566065400838852, "learning_rate": 0.00019999619341952668, "loss": 1.1602, "step": 780 }, { "epoch": 0.10325550805656034, "grad_norm": 0.12098898738622665, "learning_rate": 0.00019999392636546265, "loss": 1.1006, "step": 785 }, { "epoch": 0.10391318645182505, "grad_norm": 0.13054217398166656, "learning_rate": 0.00019999113211309507, "loss": 1.1546, "step": 790 }, { "epoch": 0.10457086484708977, "grad_norm": 0.11896365135908127, "learning_rate": 0.000199987810677156, "loss": 1.1148, "step": 795 }, { "epoch": 0.10522854324235449, "grad_norm": 0.12782897055149078, "learning_rate": 0.00019998396207515708, "loss": 1.1123, "step": 800 }, { "epoch": 0.10588622163761921, "grad_norm": 0.11801337450742722, "learning_rate": 0.00019997958632738934, "loss": 1.1098, "step": 805 }, { "epoch": 0.10654390003288391, "grad_norm": 0.12314503639936447, "learning_rate": 0.00019997468345692298, "loss": 1.1273, "step": 810 }, { "epoch": 0.10720157842814863, "grad_norm": 0.12287040799856186, "learning_rate": 0.0001999692534896075, "loss": 1.1222, "step": 815 }, { "epoch": 0.10785925682341335, "grad_norm": 0.12676484882831573, "learning_rate": 0.00019996329645407125, "loss": 1.0955, "step": 820 }, { "epoch": 0.10851693521867807, "grad_norm": 0.12635618448257446, "learning_rate": 0.0001999568123817216, "loss": 1.1409, "step": 825 }, { "epoch": 0.10917461361394278, "grad_norm": 0.1186482235789299, "learning_rate": 0.0001999498013067445, "loss": 1.08, "step": 830 }, { "epoch": 0.1098322920092075, "grad_norm": 0.13690640032291412, "learning_rate": 0.0001999422632661045, "loss": 1.1221, "step": 835 }, { "epoch": 0.11048997040447221, "grad_norm": 0.11385052651166916, "learning_rate": 0.0001999341982995444, "loss": 1.1117, "step": 840 }, { "epoch": 0.11114764879973693, "grad_norm": 0.12471459060907364, "learning_rate": 0.00019992560644958518, "loss": 1.1225, "step": 845 }, { "epoch": 0.11180532719500165, "grad_norm": 0.11578110605478287, "learning_rate": 0.00019991648776152566, "loss": 1.1584, "step": 850 }, { "epoch": 0.11246300559026635, "grad_norm": 0.13787905871868134, "learning_rate": 0.00019990684228344229, "loss": 1.0887, "step": 855 }, { "epoch": 0.11312068398553107, "grad_norm": 0.12279576063156128, "learning_rate": 0.00019989667006618906, "loss": 1.1013, "step": 860 }, { "epoch": 0.11377836238079579, "grad_norm": 0.11744140088558197, "learning_rate": 0.00019988597116339692, "loss": 1.1313, "step": 865 }, { "epoch": 0.11443604077606051, "grad_norm": 0.1250988095998764, "learning_rate": 0.0001998747456314737, "loss": 1.0567, "step": 870 }, { "epoch": 0.11509371917132523, "grad_norm": 0.12108694761991501, "learning_rate": 0.00019986299352960383, "loss": 1.1273, "step": 875 }, { "epoch": 0.11575139756658993, "grad_norm": 0.12365109473466873, "learning_rate": 0.000199850714919748, "loss": 1.0778, "step": 880 }, { "epoch": 0.11640907596185465, "grad_norm": 0.11948968470096588, "learning_rate": 0.0001998379098666427, "loss": 1.1347, "step": 885 }, { "epoch": 0.11706675435711937, "grad_norm": 0.12305375188589096, "learning_rate": 0.00019982457843780013, "loss": 1.1336, "step": 890 }, { "epoch": 0.11772443275238409, "grad_norm": 0.12142252177000046, "learning_rate": 0.00019981072070350756, "loss": 1.1451, "step": 895 }, { "epoch": 0.1183821111476488, "grad_norm": 0.12632034718990326, "learning_rate": 0.0001997963367368272, "loss": 1.1729, "step": 900 }, { "epoch": 0.11903978954291351, "grad_norm": 0.16316013038158417, "learning_rate": 0.00019978142661359564, "loss": 1.1747, "step": 905 }, { "epoch": 0.11969746793817823, "grad_norm": 0.11593782901763916, "learning_rate": 0.00019976599041242362, "loss": 1.1679, "step": 910 }, { "epoch": 0.12035514633344295, "grad_norm": 0.13732707500457764, "learning_rate": 0.00019975002821469545, "loss": 1.1392, "step": 915 }, { "epoch": 0.12101282472870767, "grad_norm": 0.13103729486465454, "learning_rate": 0.00019973354010456872, "loss": 1.1115, "step": 920 }, { "epoch": 0.12167050312397237, "grad_norm": 0.1280241757631302, "learning_rate": 0.00019971652616897366, "loss": 1.051, "step": 925 }, { "epoch": 0.12232818151923709, "grad_norm": 0.11974916607141495, "learning_rate": 0.00019969898649761298, "loss": 1.102, "step": 930 }, { "epoch": 0.12298585991450181, "grad_norm": 0.11828111112117767, "learning_rate": 0.00019968092118296111, "loss": 1.0902, "step": 935 }, { "epoch": 0.12364353830976653, "grad_norm": 0.11457942426204681, "learning_rate": 0.00019966233032026387, "loss": 1.1179, "step": 940 }, { "epoch": 0.12430121670503123, "grad_norm": 0.11767230927944183, "learning_rate": 0.00019964321400753793, "loss": 1.0958, "step": 945 }, { "epoch": 0.12495889510029595, "grad_norm": 0.11972962319850922, "learning_rate": 0.00019962357234557028, "loss": 1.1349, "step": 950 }, { "epoch": 0.12561657349556066, "grad_norm": 0.16924403607845306, "learning_rate": 0.00019960340543791772, "loss": 1.0738, "step": 955 }, { "epoch": 0.1262742518908254, "grad_norm": 0.12789861857891083, "learning_rate": 0.00019958271339090634, "loss": 1.1722, "step": 960 }, { "epoch": 0.1269319302860901, "grad_norm": 0.11652809381484985, "learning_rate": 0.0001995614963136308, "loss": 1.1724, "step": 965 }, { "epoch": 0.12758960868135483, "grad_norm": 0.1378277689218521, "learning_rate": 0.00019953975431795406, "loss": 1.1424, "step": 970 }, { "epoch": 0.12824728707661953, "grad_norm": 0.12053310126066208, "learning_rate": 0.00019951748751850646, "loss": 1.1405, "step": 975 }, { "epoch": 0.12890496547188424, "grad_norm": 0.14564210176467896, "learning_rate": 0.00019949469603268533, "loss": 1.0982, "step": 980 }, { "epoch": 0.12956264386714897, "grad_norm": 0.11273706704378128, "learning_rate": 0.00019947137998065434, "loss": 1.0957, "step": 985 }, { "epoch": 0.13022032226241367, "grad_norm": 0.12744776904582977, "learning_rate": 0.0001994475394853427, "loss": 1.1258, "step": 990 }, { "epoch": 0.1308780006576784, "grad_norm": 0.12046480178833008, "learning_rate": 0.0001994231746724448, "loss": 1.104, "step": 995 }, { "epoch": 0.1315356790529431, "grad_norm": 0.12932738661766052, "learning_rate": 0.0001993982856704193, "loss": 1.0814, "step": 1000 }, { "epoch": 0.13219335744820782, "grad_norm": 0.12433269619941711, "learning_rate": 0.00019937287261048855, "loss": 1.1319, "step": 1005 }, { "epoch": 0.13285103584347255, "grad_norm": 0.11687958985567093, "learning_rate": 0.00019934693562663792, "loss": 1.1702, "step": 1010 }, { "epoch": 0.13350871423873725, "grad_norm": 0.14447098970413208, "learning_rate": 0.0001993204748556151, "loss": 1.1685, "step": 1015 }, { "epoch": 0.13416639263400199, "grad_norm": 0.11485927551984787, "learning_rate": 0.0001992934904369292, "loss": 1.0495, "step": 1020 }, { "epoch": 0.1348240710292667, "grad_norm": 0.12658756971359253, "learning_rate": 0.00019926598251285036, "loss": 1.1036, "step": 1025 }, { "epoch": 0.1354817494245314, "grad_norm": 0.12388040870428085, "learning_rate": 0.00019923795122840863, "loss": 1.0913, "step": 1030 }, { "epoch": 0.13613942781979613, "grad_norm": 0.11855945736169815, "learning_rate": 0.00019920939673139347, "loss": 1.1066, "step": 1035 }, { "epoch": 0.13679710621506083, "grad_norm": 0.11995045840740204, "learning_rate": 0.00019918031917235288, "loss": 1.1213, "step": 1040 }, { "epoch": 0.13745478461032556, "grad_norm": 0.11373840272426605, "learning_rate": 0.00019915071870459246, "loss": 1.105, "step": 1045 }, { "epoch": 0.13811246300559027, "grad_norm": 0.14510244131088257, "learning_rate": 0.000199120595484175, "loss": 1.1303, "step": 1050 }, { "epoch": 0.13877014140085497, "grad_norm": 0.11649101227521896, "learning_rate": 0.00019908994966991915, "loss": 1.0902, "step": 1055 }, { "epoch": 0.1394278197961197, "grad_norm": 0.13140636682510376, "learning_rate": 0.00019905878142339895, "loss": 1.1494, "step": 1060 }, { "epoch": 0.1400854981913844, "grad_norm": 0.11751606315374374, "learning_rate": 0.0001990270909089429, "loss": 1.1209, "step": 1065 }, { "epoch": 0.14074317658664912, "grad_norm": 0.16028735041618347, "learning_rate": 0.00019899487829363292, "loss": 1.0867, "step": 1070 }, { "epoch": 0.14140085498191385, "grad_norm": 0.1145075261592865, "learning_rate": 0.00019896214374730376, "loss": 1.1452, "step": 1075 }, { "epoch": 0.14205853337717855, "grad_norm": 0.11406519263982773, "learning_rate": 0.00019892888744254183, "loss": 1.1904, "step": 1080 }, { "epoch": 0.14271621177244329, "grad_norm": 0.12424487620592117, "learning_rate": 0.00019889510955468448, "loss": 1.1109, "step": 1085 }, { "epoch": 0.143373890167708, "grad_norm": 0.12217444181442261, "learning_rate": 0.000198860810261819, "loss": 1.0874, "step": 1090 }, { "epoch": 0.1440315685629727, "grad_norm": 0.12621872127056122, "learning_rate": 0.0001988259897447816, "loss": 1.1355, "step": 1095 }, { "epoch": 0.14468924695823743, "grad_norm": 0.11648758500814438, "learning_rate": 0.00019879064818715663, "loss": 1.1217, "step": 1100 }, { "epoch": 0.14534692535350213, "grad_norm": 0.11290526390075684, "learning_rate": 0.00019875478577527554, "loss": 1.1118, "step": 1105 }, { "epoch": 0.14600460374876686, "grad_norm": 0.1197567954659462, "learning_rate": 0.00019871840269821575, "loss": 1.1358, "step": 1110 }, { "epoch": 0.14666228214403157, "grad_norm": 0.11461668461561203, "learning_rate": 0.0001986814991478, "loss": 1.0991, "step": 1115 }, { "epoch": 0.14731996053929627, "grad_norm": 0.12228855490684509, "learning_rate": 0.0001986440753185949, "loss": 1.081, "step": 1120 }, { "epoch": 0.147977638934561, "grad_norm": 0.11870820820331573, "learning_rate": 0.00019860613140791026, "loss": 1.0855, "step": 1125 }, { "epoch": 0.1486353173298257, "grad_norm": 0.11865003407001495, "learning_rate": 0.0001985676676157979, "loss": 1.1224, "step": 1130 }, { "epoch": 0.14929299572509044, "grad_norm": 0.13282229006290436, "learning_rate": 0.00019852868414505057, "loss": 1.1738, "step": 1135 }, { "epoch": 0.14995067412035515, "grad_norm": 0.12200037389993668, "learning_rate": 0.00019848918120120095, "loss": 1.0806, "step": 1140 }, { "epoch": 0.15060835251561985, "grad_norm": 0.12677037715911865, "learning_rate": 0.0001984491589925206, "loss": 1.1048, "step": 1145 }, { "epoch": 0.15126603091088459, "grad_norm": 0.12437453120946884, "learning_rate": 0.00019840861773001863, "loss": 1.0785, "step": 1150 }, { "epoch": 0.1519237093061493, "grad_norm": 0.12875019013881683, "learning_rate": 0.00019836755762744096, "loss": 1.1012, "step": 1155 }, { "epoch": 0.152581387701414, "grad_norm": 0.12883338332176208, "learning_rate": 0.00019832597890126884, "loss": 1.1224, "step": 1160 }, { "epoch": 0.15323906609667873, "grad_norm": 0.11353100836277008, "learning_rate": 0.000198283881770718, "loss": 1.105, "step": 1165 }, { "epoch": 0.15389674449194343, "grad_norm": 0.12303536385297775, "learning_rate": 0.00019824126645773718, "loss": 1.0635, "step": 1170 }, { "epoch": 0.15455442288720816, "grad_norm": 0.1319465935230255, "learning_rate": 0.00019819813318700736, "loss": 1.1003, "step": 1175 }, { "epoch": 0.15521210128247287, "grad_norm": 0.11699216067790985, "learning_rate": 0.0001981544821859401, "loss": 1.1156, "step": 1180 }, { "epoch": 0.15586977967773757, "grad_norm": 0.11667801439762115, "learning_rate": 0.0001981103136846768, "loss": 1.0939, "step": 1185 }, { "epoch": 0.1565274580730023, "grad_norm": 0.12407808005809784, "learning_rate": 0.00019806562791608716, "loss": 1.0931, "step": 1190 }, { "epoch": 0.157185136468267, "grad_norm": 0.11887746304273605, "learning_rate": 0.00019802042511576809, "loss": 1.1478, "step": 1195 }, { "epoch": 0.15784281486353174, "grad_norm": 0.12790104746818542, "learning_rate": 0.00019797470552204244, "loss": 1.1552, "step": 1200 }, { "epoch": 0.15850049325879645, "grad_norm": 0.12999314069747925, "learning_rate": 0.00019792846937595785, "loss": 1.1308, "step": 1205 }, { "epoch": 0.15915817165406115, "grad_norm": 0.10982532799243927, "learning_rate": 0.0001978817169212853, "loss": 1.0761, "step": 1210 }, { "epoch": 0.15981585004932589, "grad_norm": 0.11761903762817383, "learning_rate": 0.0001978344484045178, "loss": 1.1124, "step": 1215 }, { "epoch": 0.1604735284445906, "grad_norm": 0.11974523961544037, "learning_rate": 0.0001977866640748694, "loss": 1.0798, "step": 1220 }, { "epoch": 0.16113120683985532, "grad_norm": 0.12951800227165222, "learning_rate": 0.00019773836418427357, "loss": 1.1949, "step": 1225 }, { "epoch": 0.16178888523512003, "grad_norm": 0.12468104064464569, "learning_rate": 0.00019768954898738192, "loss": 1.1324, "step": 1230 }, { "epoch": 0.16244656363038473, "grad_norm": 0.11467801779508591, "learning_rate": 0.00019764021874156304, "loss": 1.1009, "step": 1235 }, { "epoch": 0.16310424202564947, "grad_norm": 0.15334735810756683, "learning_rate": 0.0001975903737069009, "loss": 1.1001, "step": 1240 }, { "epoch": 0.16376192042091417, "grad_norm": 0.12227438390254974, "learning_rate": 0.0001975400141461937, "loss": 1.0333, "step": 1245 }, { "epoch": 0.1644195988161789, "grad_norm": 0.11797935515642166, "learning_rate": 0.00019748914032495228, "loss": 1.109, "step": 1250 }, { "epoch": 0.1650772772114436, "grad_norm": 0.13183729350566864, "learning_rate": 0.0001974377525113988, "loss": 1.0635, "step": 1255 }, { "epoch": 0.1657349556067083, "grad_norm": 0.12748801708221436, "learning_rate": 0.00019738585097646546, "loss": 1.1602, "step": 1260 }, { "epoch": 0.16639263400197304, "grad_norm": 0.12365809082984924, "learning_rate": 0.00019733343599379288, "loss": 1.1143, "step": 1265 }, { "epoch": 0.16705031239723775, "grad_norm": 0.1184631735086441, "learning_rate": 0.00019728050783972875, "loss": 1.068, "step": 1270 }, { "epoch": 0.16770799079250245, "grad_norm": 0.12313895672559738, "learning_rate": 0.00019722706679332634, "loss": 1.1828, "step": 1275 }, { "epoch": 0.1683656691877672, "grad_norm": 0.1195981502532959, "learning_rate": 0.00019717311313634304, "loss": 1.0987, "step": 1280 }, { "epoch": 0.1690233475830319, "grad_norm": 0.12029096484184265, "learning_rate": 0.0001971186471532389, "loss": 1.0801, "step": 1285 }, { "epoch": 0.16968102597829662, "grad_norm": 0.11706339567899704, "learning_rate": 0.00019706366913117514, "loss": 1.1372, "step": 1290 }, { "epoch": 0.17033870437356133, "grad_norm": 0.12753020226955414, "learning_rate": 0.00019700817936001247, "loss": 1.1202, "step": 1295 }, { "epoch": 0.17099638276882603, "grad_norm": 0.11744942516088486, "learning_rate": 0.0001969521781323099, "loss": 1.0617, "step": 1300 }, { "epoch": 0.17165406116409077, "grad_norm": 0.11762962490320206, "learning_rate": 0.00019689566574332282, "loss": 1.1466, "step": 1305 }, { "epoch": 0.17231173955935547, "grad_norm": 0.11810944974422455, "learning_rate": 0.0001968386424910017, "loss": 1.1556, "step": 1310 }, { "epoch": 0.1729694179546202, "grad_norm": 0.12276309728622437, "learning_rate": 0.0001967811086759904, "loss": 1.118, "step": 1315 }, { "epoch": 0.1736270963498849, "grad_norm": 0.1150173768401146, "learning_rate": 0.00019672306460162466, "loss": 1.0738, "step": 1320 }, { "epoch": 0.1742847747451496, "grad_norm": 0.11996825784444809, "learning_rate": 0.00019666451057393042, "loss": 1.1204, "step": 1325 }, { "epoch": 0.17494245314041434, "grad_norm": 0.12643462419509888, "learning_rate": 0.00019660544690162232, "loss": 1.0874, "step": 1330 }, { "epoch": 0.17560013153567905, "grad_norm": 0.11835413426160812, "learning_rate": 0.00019654587389610192, "loss": 1.1755, "step": 1335 }, { "epoch": 0.17625780993094378, "grad_norm": 0.13048353791236877, "learning_rate": 0.00019648579187145617, "loss": 1.1142, "step": 1340 }, { "epoch": 0.1769154883262085, "grad_norm": 0.12002298980951309, "learning_rate": 0.00019642520114445574, "loss": 1.0639, "step": 1345 }, { "epoch": 0.1775731667214732, "grad_norm": 0.1215134859085083, "learning_rate": 0.00019636410203455328, "loss": 1.0399, "step": 1350 }, { "epoch": 0.17823084511673792, "grad_norm": 0.12047736346721649, "learning_rate": 0.00019630249486388187, "loss": 1.1155, "step": 1355 }, { "epoch": 0.17888852351200263, "grad_norm": 0.1188744530081749, "learning_rate": 0.00019624037995725314, "loss": 1.0875, "step": 1360 }, { "epoch": 0.17954620190726733, "grad_norm": 0.11916454881429672, "learning_rate": 0.0001961777576421558, "loss": 1.1241, "step": 1365 }, { "epoch": 0.18020388030253207, "grad_norm": 0.12034516036510468, "learning_rate": 0.00019611462824875358, "loss": 1.0638, "step": 1370 }, { "epoch": 0.18086155869779677, "grad_norm": 0.11693466454744339, "learning_rate": 0.00019605099210988388, "loss": 1.0939, "step": 1375 }, { "epoch": 0.1815192370930615, "grad_norm": 0.11898714303970337, "learning_rate": 0.0001959868495610557, "loss": 1.1179, "step": 1380 }, { "epoch": 0.1821769154883262, "grad_norm": 0.1338665634393692, "learning_rate": 0.00019592220094044794, "loss": 1.0716, "step": 1385 }, { "epoch": 0.1828345938835909, "grad_norm": 0.11845030635595322, "learning_rate": 0.00019585704658890785, "loss": 1.1033, "step": 1390 }, { "epoch": 0.18349227227885564, "grad_norm": 0.13372300565242767, "learning_rate": 0.00019579138684994884, "loss": 1.0648, "step": 1395 }, { "epoch": 0.18414995067412035, "grad_norm": 0.14201444387435913, "learning_rate": 0.00019572522206974898, "loss": 1.1115, "step": 1400 }, { "epoch": 0.18480762906938508, "grad_norm": 0.1381274312734604, "learning_rate": 0.00019565855259714909, "loss": 1.0665, "step": 1405 }, { "epoch": 0.1854653074646498, "grad_norm": 0.116342693567276, "learning_rate": 0.00019559137878365083, "loss": 1.0856, "step": 1410 }, { "epoch": 0.1861229858599145, "grad_norm": 0.12727460265159607, "learning_rate": 0.00019552370098341491, "loss": 1.092, "step": 1415 }, { "epoch": 0.18678066425517922, "grad_norm": 0.11715777218341827, "learning_rate": 0.0001954555195532592, "loss": 1.1158, "step": 1420 }, { "epoch": 0.18743834265044393, "grad_norm": 0.11991927027702332, "learning_rate": 0.0001953868348526569, "loss": 1.0911, "step": 1425 }, { "epoch": 0.18809602104570866, "grad_norm": 0.12746894359588623, "learning_rate": 0.00019531764724373457, "loss": 1.1397, "step": 1430 }, { "epoch": 0.18875369944097337, "grad_norm": 0.11954222619533539, "learning_rate": 0.00019524795709127031, "loss": 1.0781, "step": 1435 }, { "epoch": 0.18941137783623807, "grad_norm": 0.12972930073738098, "learning_rate": 0.00019517776476269167, "loss": 1.09, "step": 1440 }, { "epoch": 0.1900690562315028, "grad_norm": 0.129747211933136, "learning_rate": 0.00019510707062807395, "loss": 1.1337, "step": 1445 }, { "epoch": 0.1907267346267675, "grad_norm": 0.12582586705684662, "learning_rate": 0.000195035875060138, "loss": 1.0997, "step": 1450 }, { "epoch": 0.1913844130220322, "grad_norm": 0.12693369388580322, "learning_rate": 0.00019496417843424847, "loss": 1.1361, "step": 1455 }, { "epoch": 0.19204209141729695, "grad_norm": 0.11949343979358673, "learning_rate": 0.00019489198112841177, "loss": 1.0901, "step": 1460 }, { "epoch": 0.19269976981256165, "grad_norm": 0.15593942999839783, "learning_rate": 0.00019481928352327392, "loss": 1.1119, "step": 1465 }, { "epoch": 0.19335744820782638, "grad_norm": 0.11990107595920563, "learning_rate": 0.00019474608600211877, "loss": 1.0819, "step": 1470 }, { "epoch": 0.1940151266030911, "grad_norm": 0.12292806804180145, "learning_rate": 0.00019467238895086588, "loss": 1.0749, "step": 1475 }, { "epoch": 0.1946728049983558, "grad_norm": 0.1304595023393631, "learning_rate": 0.0001945981927580684, "loss": 1.0918, "step": 1480 }, { "epoch": 0.19533048339362052, "grad_norm": 0.1331252157688141, "learning_rate": 0.00019452349781491117, "loss": 1.0981, "step": 1485 }, { "epoch": 0.19598816178888523, "grad_norm": 0.12859852612018585, "learning_rate": 0.00019444830451520857, "loss": 1.1412, "step": 1490 }, { "epoch": 0.19664584018414996, "grad_norm": 0.12999925017356873, "learning_rate": 0.00019437261325540254, "loss": 1.1169, "step": 1495 }, { "epoch": 0.19730351857941467, "grad_norm": 0.12481727451086044, "learning_rate": 0.00019429642443456028, "loss": 1.1133, "step": 1500 }, { "epoch": 0.19796119697467937, "grad_norm": 0.1232612133026123, "learning_rate": 0.00019421973845437242, "loss": 1.1203, "step": 1505 }, { "epoch": 0.1986188753699441, "grad_norm": 0.11705848574638367, "learning_rate": 0.0001941425557191507, "loss": 1.146, "step": 1510 }, { "epoch": 0.1992765537652088, "grad_norm": 0.1201818585395813, "learning_rate": 0.00019406487663582584, "loss": 1.103, "step": 1515 }, { "epoch": 0.19993423216047354, "grad_norm": 0.11374238133430481, "learning_rate": 0.0001939867016139456, "loss": 1.066, "step": 1520 }, { "epoch": 0.20059191055573825, "grad_norm": 0.13211093842983246, "learning_rate": 0.00019390803106567235, "loss": 1.0908, "step": 1525 }, { "epoch": 0.20124958895100295, "grad_norm": 0.1177334114909172, "learning_rate": 0.0001938288654057811, "loss": 1.0917, "step": 1530 }, { "epoch": 0.20190726734626768, "grad_norm": 0.1153583824634552, "learning_rate": 0.00019374920505165722, "loss": 1.0796, "step": 1535 }, { "epoch": 0.2025649457415324, "grad_norm": 0.13302463293075562, "learning_rate": 0.00019366905042329422, "loss": 1.1038, "step": 1540 }, { "epoch": 0.20322262413679712, "grad_norm": 0.1290385127067566, "learning_rate": 0.00019358840194329168, "loss": 1.1203, "step": 1545 }, { "epoch": 0.20388030253206182, "grad_norm": 0.12041278928518295, "learning_rate": 0.00019350726003685277, "loss": 1.1293, "step": 1550 }, { "epoch": 0.20453798092732653, "grad_norm": 0.11505395174026489, "learning_rate": 0.00019342562513178228, "loss": 1.0947, "step": 1555 }, { "epoch": 0.20519565932259126, "grad_norm": 0.1646180897951126, "learning_rate": 0.00019334349765848417, "loss": 1.1429, "step": 1560 }, { "epoch": 0.20585333771785597, "grad_norm": 0.12679384648799896, "learning_rate": 0.00019326087804995935, "loss": 1.1402, "step": 1565 }, { "epoch": 0.20651101611312067, "grad_norm": 0.11802898347377777, "learning_rate": 0.00019317776674180348, "loss": 1.1366, "step": 1570 }, { "epoch": 0.2071686945083854, "grad_norm": 0.12060213088989258, "learning_rate": 0.0001930941641722046, "loss": 1.0664, "step": 1575 }, { "epoch": 0.2078263729036501, "grad_norm": 0.12272518873214722, "learning_rate": 0.00019301007078194077, "loss": 1.1001, "step": 1580 }, { "epoch": 0.20848405129891484, "grad_norm": 0.12174446135759354, "learning_rate": 0.00019292548701437787, "loss": 1.108, "step": 1585 }, { "epoch": 0.20914172969417955, "grad_norm": 0.11479714512825012, "learning_rate": 0.0001928404133154672, "loss": 1.1117, "step": 1590 }, { "epoch": 0.20979940808944425, "grad_norm": 0.12429360300302505, "learning_rate": 0.00019275485013374303, "loss": 1.0902, "step": 1595 }, { "epoch": 0.21045708648470898, "grad_norm": 0.13622327148914337, "learning_rate": 0.00019266879792032045, "loss": 1.061, "step": 1600 }, { "epoch": 0.2111147648799737, "grad_norm": 0.11813180893659592, "learning_rate": 0.00019258225712889284, "loss": 1.0919, "step": 1605 }, { "epoch": 0.21177244327523842, "grad_norm": 0.11803152412176132, "learning_rate": 0.00019249522821572946, "loss": 1.1313, "step": 1610 }, { "epoch": 0.21243012167050312, "grad_norm": 0.12054797261953354, "learning_rate": 0.00019240771163967312, "loss": 1.1278, "step": 1615 }, { "epoch": 0.21308780006576783, "grad_norm": 0.12006231397390366, "learning_rate": 0.0001923197078621378, "loss": 1.1295, "step": 1620 }, { "epoch": 0.21374547846103256, "grad_norm": 0.12404396384954453, "learning_rate": 0.00019223121734710606, "loss": 1.1517, "step": 1625 }, { "epoch": 0.21440315685629727, "grad_norm": 0.13037464022636414, "learning_rate": 0.00019214224056112676, "loss": 1.0898, "step": 1630 }, { "epoch": 0.215060835251562, "grad_norm": 0.11906000226736069, "learning_rate": 0.00019205277797331246, "loss": 1.0873, "step": 1635 }, { "epoch": 0.2157185136468267, "grad_norm": 0.12592414021492004, "learning_rate": 0.0001919628300553371, "loss": 1.0578, "step": 1640 }, { "epoch": 0.2163761920420914, "grad_norm": 0.11958621442317963, "learning_rate": 0.00019187239728143336, "loss": 1.0701, "step": 1645 }, { "epoch": 0.21703387043735614, "grad_norm": 0.17514668405056, "learning_rate": 0.00019178148012839025, "loss": 1.1118, "step": 1650 }, { "epoch": 0.21769154883262085, "grad_norm": 0.12871570885181427, "learning_rate": 0.00019169007907555055, "loss": 1.1456, "step": 1655 }, { "epoch": 0.21834922722788555, "grad_norm": 0.12771371006965637, "learning_rate": 0.0001915981946048084, "loss": 1.1309, "step": 1660 }, { "epoch": 0.21900690562315028, "grad_norm": 0.12694258987903595, "learning_rate": 0.00019150582720060648, "loss": 1.1242, "step": 1665 }, { "epoch": 0.219664584018415, "grad_norm": 0.12098956853151321, "learning_rate": 0.00019141297734993383, "loss": 1.0824, "step": 1670 }, { "epoch": 0.22032226241367972, "grad_norm": 0.1316050887107849, "learning_rate": 0.00019131964554232304, "loss": 1.1925, "step": 1675 }, { "epoch": 0.22097994080894443, "grad_norm": 0.12222263962030411, "learning_rate": 0.00019122583226984766, "loss": 1.0834, "step": 1680 }, { "epoch": 0.22163761920420913, "grad_norm": 0.12348196655511856, "learning_rate": 0.0001911315380271197, "loss": 1.0941, "step": 1685 }, { "epoch": 0.22229529759947386, "grad_norm": 0.12133897840976715, "learning_rate": 0.00019103676331128704, "loss": 1.147, "step": 1690 }, { "epoch": 0.22295297599473857, "grad_norm": 0.12322820723056793, "learning_rate": 0.0001909415086220307, "loss": 1.0264, "step": 1695 }, { "epoch": 0.2236106543900033, "grad_norm": 0.13303636014461517, "learning_rate": 0.00019084577446156232, "loss": 1.1142, "step": 1700 }, { "epoch": 0.224268332785268, "grad_norm": 0.12800318002700806, "learning_rate": 0.00019074956133462138, "loss": 1.1268, "step": 1705 }, { "epoch": 0.2249260111805327, "grad_norm": 0.1284877508878708, "learning_rate": 0.0001906528697484727, "loss": 1.1192, "step": 1710 }, { "epoch": 0.22558368957579744, "grad_norm": 0.1423519402742386, "learning_rate": 0.00019055570021290366, "loss": 1.1189, "step": 1715 }, { "epoch": 0.22624136797106215, "grad_norm": 0.11997201293706894, "learning_rate": 0.00019045805324022146, "loss": 1.128, "step": 1720 }, { "epoch": 0.22689904636632688, "grad_norm": 0.12833046913146973, "learning_rate": 0.00019035992934525057, "loss": 1.1004, "step": 1725 }, { "epoch": 0.22755672476159158, "grad_norm": 0.12759844958782196, "learning_rate": 0.00019026132904532994, "loss": 1.0568, "step": 1730 }, { "epoch": 0.2282144031568563, "grad_norm": 0.12050973623991013, "learning_rate": 0.00019016225286031023, "loss": 1.0802, "step": 1735 }, { "epoch": 0.22887208155212102, "grad_norm": 0.11680714786052704, "learning_rate": 0.0001900627013125512, "loss": 1.0966, "step": 1740 }, { "epoch": 0.22952975994738573, "grad_norm": 0.1316251903772354, "learning_rate": 0.00018996267492691867, "loss": 1.0896, "step": 1745 }, { "epoch": 0.23018743834265046, "grad_norm": 0.13125015795230865, "learning_rate": 0.00018986217423078223, "loss": 1.0942, "step": 1750 }, { "epoch": 0.23084511673791516, "grad_norm": 0.1381591111421585, "learning_rate": 0.00018976119975401193, "loss": 1.0482, "step": 1755 }, { "epoch": 0.23150279513317987, "grad_norm": 0.1331150233745575, "learning_rate": 0.00018965975202897583, "loss": 1.1587, "step": 1760 }, { "epoch": 0.2321604735284446, "grad_norm": 0.1459963470697403, "learning_rate": 0.00018955783159053718, "loss": 1.0793, "step": 1765 }, { "epoch": 0.2328181519237093, "grad_norm": 0.12813477218151093, "learning_rate": 0.00018945543897605133, "loss": 1.1036, "step": 1770 }, { "epoch": 0.233475830318974, "grad_norm": 0.11975600570440292, "learning_rate": 0.00018935257472536327, "loss": 1.0894, "step": 1775 }, { "epoch": 0.23413350871423874, "grad_norm": 0.12801101803779602, "learning_rate": 0.00018924923938080452, "loss": 1.0417, "step": 1780 }, { "epoch": 0.23479118710950345, "grad_norm": 0.1303645819425583, "learning_rate": 0.00018914543348719034, "loss": 1.1069, "step": 1785 }, { "epoch": 0.23544886550476818, "grad_norm": 0.13825669884681702, "learning_rate": 0.00018904115759181686, "loss": 1.1008, "step": 1790 }, { "epoch": 0.23610654390003288, "grad_norm": 0.12449214607477188, "learning_rate": 0.00018893641224445825, "loss": 1.1054, "step": 1795 }, { "epoch": 0.2367642222952976, "grad_norm": 0.11941149085760117, "learning_rate": 0.00018883119799736377, "loss": 1.1639, "step": 1800 }, { "epoch": 0.23742190069056232, "grad_norm": 0.12320604920387268, "learning_rate": 0.0001887255154052548, "loss": 1.1219, "step": 1805 }, { "epoch": 0.23807957908582703, "grad_norm": 0.11824801564216614, "learning_rate": 0.00018861936502532199, "loss": 1.1067, "step": 1810 }, { "epoch": 0.23873725748109176, "grad_norm": 0.13560426235198975, "learning_rate": 0.00018851274741722235, "loss": 1.0901, "step": 1815 }, { "epoch": 0.23939493587635646, "grad_norm": 0.11675427854061127, "learning_rate": 0.0001884056631430763, "loss": 1.1061, "step": 1820 }, { "epoch": 0.24005261427162117, "grad_norm": 0.1280919313430786, "learning_rate": 0.00018829811276746455, "loss": 1.1528, "step": 1825 }, { "epoch": 0.2407102926668859, "grad_norm": 0.11827636510133743, "learning_rate": 0.0001881900968574253, "loss": 1.1239, "step": 1830 }, { "epoch": 0.2413679710621506, "grad_norm": 0.125891774892807, "learning_rate": 0.0001880816159824512, "loss": 1.094, "step": 1835 }, { "epoch": 0.24202564945741534, "grad_norm": 0.13762013614177704, "learning_rate": 0.00018797267071448632, "loss": 1.1377, "step": 1840 }, { "epoch": 0.24268332785268004, "grad_norm": 0.12209563702344894, "learning_rate": 0.00018786326162792313, "loss": 1.1413, "step": 1845 }, { "epoch": 0.24334100624794475, "grad_norm": 0.1265508383512497, "learning_rate": 0.00018775338929959956, "loss": 1.092, "step": 1850 }, { "epoch": 0.24399868464320948, "grad_norm": 0.12232159823179245, "learning_rate": 0.00018764305430879577, "loss": 1.0657, "step": 1855 }, { "epoch": 0.24465636303847418, "grad_norm": 0.1251397579908371, "learning_rate": 0.00018753225723723137, "loss": 1.1311, "step": 1860 }, { "epoch": 0.2453140414337389, "grad_norm": 0.12585444748401642, "learning_rate": 0.0001874209986690621, "loss": 1.0816, "step": 1865 }, { "epoch": 0.24597171982900362, "grad_norm": 0.12100059539079666, "learning_rate": 0.00018730927919087683, "loss": 1.0707, "step": 1870 }, { "epoch": 0.24662939822426833, "grad_norm": 0.1218976080417633, "learning_rate": 0.00018719709939169453, "loss": 1.1258, "step": 1875 }, { "epoch": 0.24728707661953306, "grad_norm": 0.11606669425964355, "learning_rate": 0.00018708445986296123, "loss": 1.1306, "step": 1880 }, { "epoch": 0.24794475501479776, "grad_norm": 0.12821166217327118, "learning_rate": 0.00018697136119854652, "loss": 1.1069, "step": 1885 }, { "epoch": 0.24860243341006247, "grad_norm": 0.12565293908119202, "learning_rate": 0.00018685780399474097, "loss": 1.1381, "step": 1890 }, { "epoch": 0.2492601118053272, "grad_norm": 0.12476211041212082, "learning_rate": 0.00018674378885025257, "loss": 1.0562, "step": 1895 }, { "epoch": 0.2499177902005919, "grad_norm": 0.12470985949039459, "learning_rate": 0.00018662931636620372, "loss": 1.0666, "step": 1900 }, { "epoch": 0.2505754685958566, "grad_norm": 0.13284392654895782, "learning_rate": 0.00018651438714612807, "loss": 1.1636, "step": 1905 }, { "epoch": 0.2512331469911213, "grad_norm": 0.13404981791973114, "learning_rate": 0.00018639900179596736, "loss": 1.1032, "step": 1910 }, { "epoch": 0.2518908253863861, "grad_norm": 0.13734029233455658, "learning_rate": 0.0001862831609240681, "loss": 1.1021, "step": 1915 }, { "epoch": 0.2525485037816508, "grad_norm": 0.1278238594532013, "learning_rate": 0.0001861668651411785, "loss": 1.1484, "step": 1920 }, { "epoch": 0.2532061821769155, "grad_norm": 0.12127093225717545, "learning_rate": 0.00018605011506044522, "loss": 1.0989, "step": 1925 }, { "epoch": 0.2538638605721802, "grad_norm": 0.1347067505121231, "learning_rate": 0.0001859329112974101, "loss": 1.1079, "step": 1930 }, { "epoch": 0.2545215389674449, "grad_norm": 0.13483388721942902, "learning_rate": 0.00018581525447000692, "loss": 1.0849, "step": 1935 }, { "epoch": 0.25517921736270965, "grad_norm": 0.1620136797428131, "learning_rate": 0.0001856971451985581, "loss": 1.1491, "step": 1940 }, { "epoch": 0.25583689575797436, "grad_norm": 0.12785854935646057, "learning_rate": 0.0001855785841057716, "loss": 1.0731, "step": 1945 }, { "epoch": 0.25649457415323906, "grad_norm": 0.1255158632993698, "learning_rate": 0.0001854595718167374, "loss": 1.0924, "step": 1950 }, { "epoch": 0.25715225254850377, "grad_norm": 0.12754568457603455, "learning_rate": 0.00018534010895892441, "loss": 1.0944, "step": 1955 }, { "epoch": 0.2578099309437685, "grad_norm": 0.11563219130039215, "learning_rate": 0.00018522019616217707, "loss": 1.0766, "step": 1960 }, { "epoch": 0.25846760933903323, "grad_norm": 0.1720672845840454, "learning_rate": 0.00018509983405871197, "loss": 1.0909, "step": 1965 }, { "epoch": 0.25912528773429794, "grad_norm": 0.1237102597951889, "learning_rate": 0.00018497902328311463, "loss": 1.0821, "step": 1970 }, { "epoch": 0.25978296612956264, "grad_norm": 0.12221318483352661, "learning_rate": 0.0001848577644723361, "loss": 1.1133, "step": 1975 }, { "epoch": 0.26044064452482735, "grad_norm": 0.13074050843715668, "learning_rate": 0.00018473605826568957, "loss": 1.071, "step": 1980 }, { "epoch": 0.26109832292009205, "grad_norm": 0.1402166187763214, "learning_rate": 0.00018461390530484715, "loss": 1.0884, "step": 1985 }, { "epoch": 0.2617560013153568, "grad_norm": 0.12232601642608643, "learning_rate": 0.0001844913062338362, "loss": 1.0633, "step": 1990 }, { "epoch": 0.2624136797106215, "grad_norm": 0.131879061460495, "learning_rate": 0.0001843682616990363, "loss": 1.0897, "step": 1995 }, { "epoch": 0.2630713581058862, "grad_norm": 0.13571526110172272, "learning_rate": 0.00018424477234917547, "loss": 1.105, "step": 2000 }, { "epoch": 0.2637290365011509, "grad_norm": 0.1231071874499321, "learning_rate": 0.000184120838835327, "loss": 1.142, "step": 2005 }, { "epoch": 0.26438671489641563, "grad_norm": 0.12111788988113403, "learning_rate": 0.00018399646181090602, "loss": 1.1332, "step": 2010 }, { "epoch": 0.2650443932916804, "grad_norm": 0.11962269991636276, "learning_rate": 0.0001838716419316659, "loss": 1.1747, "step": 2015 }, { "epoch": 0.2657020716869451, "grad_norm": 0.12085366994142532, "learning_rate": 0.00018374637985569493, "loss": 1.0673, "step": 2020 }, { "epoch": 0.2663597500822098, "grad_norm": 0.12849266827106476, "learning_rate": 0.00018362067624341265, "loss": 1.0734, "step": 2025 }, { "epoch": 0.2670174284774745, "grad_norm": 0.12889696657657623, "learning_rate": 0.00018349453175756675, "loss": 1.1032, "step": 2030 }, { "epoch": 0.2676751068727392, "grad_norm": 0.13377971947193146, "learning_rate": 0.00018336794706322914, "loss": 1.1102, "step": 2035 }, { "epoch": 0.26833278526800397, "grad_norm": 0.1247139647603035, "learning_rate": 0.00018324092282779274, "loss": 1.1104, "step": 2040 }, { "epoch": 0.2689904636632687, "grad_norm": 0.13827171921730042, "learning_rate": 0.00018311345972096786, "loss": 1.0371, "step": 2045 }, { "epoch": 0.2696481420585334, "grad_norm": 0.1240691989660263, "learning_rate": 0.0001829855584147786, "loss": 1.1777, "step": 2050 }, { "epoch": 0.2703058204537981, "grad_norm": 0.12523069977760315, "learning_rate": 0.00018285721958355948, "loss": 1.0792, "step": 2055 }, { "epoch": 0.2709634988490628, "grad_norm": 0.12097758799791336, "learning_rate": 0.00018272844390395174, "loss": 1.1189, "step": 2060 }, { "epoch": 0.27162117724432755, "grad_norm": 0.12591056525707245, "learning_rate": 0.00018259923205489977, "loss": 1.1208, "step": 2065 }, { "epoch": 0.27227885563959225, "grad_norm": 0.12412361800670624, "learning_rate": 0.00018246958471764766, "loss": 1.1015, "step": 2070 }, { "epoch": 0.27293653403485696, "grad_norm": 0.12062951177358627, "learning_rate": 0.00018233950257573545, "loss": 1.1121, "step": 2075 }, { "epoch": 0.27359421243012166, "grad_norm": 0.13038983941078186, "learning_rate": 0.0001822089863149957, "loss": 1.086, "step": 2080 }, { "epoch": 0.27425189082538637, "grad_norm": 0.13188335299491882, "learning_rate": 0.0001820780366235497, "loss": 1.1359, "step": 2085 }, { "epoch": 0.27490956922065113, "grad_norm": 0.12430387735366821, "learning_rate": 0.0001819466541918039, "loss": 1.1225, "step": 2090 }, { "epoch": 0.27556724761591583, "grad_norm": 0.1407085657119751, "learning_rate": 0.00018181483971244634, "loss": 1.121, "step": 2095 }, { "epoch": 0.27622492601118054, "grad_norm": 0.12808369100093842, "learning_rate": 0.0001816825938804429, "loss": 1.0892, "step": 2100 }, { "epoch": 0.27688260440644524, "grad_norm": 0.12325534224510193, "learning_rate": 0.0001815499173930338, "loss": 1.0897, "step": 2105 }, { "epoch": 0.27754028280170995, "grad_norm": 0.1301654726266861, "learning_rate": 0.00018141681094972962, "loss": 1.0929, "step": 2110 }, { "epoch": 0.27819796119697465, "grad_norm": 0.13606461882591248, "learning_rate": 0.00018128327525230797, "loss": 1.1128, "step": 2115 }, { "epoch": 0.2788556395922394, "grad_norm": 0.12887029349803925, "learning_rate": 0.00018114931100480947, "loss": 1.0772, "step": 2120 }, { "epoch": 0.2795133179875041, "grad_norm": 0.12754212319850922, "learning_rate": 0.0001810149189135343, "loss": 1.1168, "step": 2125 }, { "epoch": 0.2801709963827688, "grad_norm": 0.11819833517074585, "learning_rate": 0.00018088009968703832, "loss": 1.1045, "step": 2130 }, { "epoch": 0.2808286747780335, "grad_norm": 0.12352098524570465, "learning_rate": 0.00018074485403612938, "loss": 1.1169, "step": 2135 }, { "epoch": 0.28148635317329823, "grad_norm": 0.12033966928720474, "learning_rate": 0.00018060918267386354, "loss": 1.0514, "step": 2140 }, { "epoch": 0.282144031568563, "grad_norm": 0.12967579066753387, "learning_rate": 0.00018047308631554142, "loss": 1.1468, "step": 2145 }, { "epoch": 0.2828017099638277, "grad_norm": 0.11962036043405533, "learning_rate": 0.0001803365656787042, "loss": 1.109, "step": 2150 }, { "epoch": 0.2834593883590924, "grad_norm": 0.12012016773223877, "learning_rate": 0.00018019962148313015, "loss": 1.0201, "step": 2155 }, { "epoch": 0.2841170667543571, "grad_norm": 0.12632569670677185, "learning_rate": 0.0001800622544508306, "loss": 1.1081, "step": 2160 }, { "epoch": 0.2847747451496218, "grad_norm": 0.12119702249765396, "learning_rate": 0.00017992446530604617, "loss": 1.0752, "step": 2165 }, { "epoch": 0.28543242354488657, "grad_norm": 0.11959727853536606, "learning_rate": 0.00017978625477524303, "loss": 1.0895, "step": 2170 }, { "epoch": 0.2860901019401513, "grad_norm": 0.13609668612480164, "learning_rate": 0.00017964762358710902, "loss": 1.1415, "step": 2175 }, { "epoch": 0.286747780335416, "grad_norm": 0.122723788022995, "learning_rate": 0.00017950857247254977, "loss": 1.0856, "step": 2180 }, { "epoch": 0.2874054587306807, "grad_norm": 0.12313158810138702, "learning_rate": 0.00017936910216468497, "loss": 1.1343, "step": 2185 }, { "epoch": 0.2880631371259454, "grad_norm": 0.12104373425245285, "learning_rate": 0.00017922921339884437, "loss": 1.1, "step": 2190 }, { "epoch": 0.28872081552121015, "grad_norm": 0.12202125787734985, "learning_rate": 0.00017908890691256394, "loss": 1.0944, "step": 2195 }, { "epoch": 0.28937849391647485, "grad_norm": 0.12844744324684143, "learning_rate": 0.00017894818344558203, "loss": 1.0825, "step": 2200 }, { "epoch": 0.29003617231173956, "grad_norm": 0.12723222374916077, "learning_rate": 0.00017880704373983547, "loss": 1.0711, "step": 2205 }, { "epoch": 0.29069385070700426, "grad_norm": 0.12941156327724457, "learning_rate": 0.00017866548853945555, "loss": 1.1113, "step": 2210 }, { "epoch": 0.29135152910226897, "grad_norm": 0.1239108070731163, "learning_rate": 0.00017852351859076423, "loss": 1.1712, "step": 2215 }, { "epoch": 0.29200920749753373, "grad_norm": 0.12376103550195694, "learning_rate": 0.00017838113464227012, "loss": 1.0943, "step": 2220 }, { "epoch": 0.29266688589279843, "grad_norm": 0.13005106151103973, "learning_rate": 0.00017823833744466457, "loss": 1.1207, "step": 2225 }, { "epoch": 0.29332456428806314, "grad_norm": 0.13091644644737244, "learning_rate": 0.00017809512775081776, "loss": 1.0951, "step": 2230 }, { "epoch": 0.29398224268332784, "grad_norm": 0.12207649648189545, "learning_rate": 0.00017795150631577458, "loss": 1.1287, "step": 2235 }, { "epoch": 0.29463992107859255, "grad_norm": 0.13389946520328522, "learning_rate": 0.0001778074738967508, "loss": 1.1162, "step": 2240 }, { "epoch": 0.2952975994738573, "grad_norm": 0.11950133740901947, "learning_rate": 0.000177663031253129, "loss": 1.07, "step": 2245 }, { "epoch": 0.295955277869122, "grad_norm": 0.12282822281122208, "learning_rate": 0.00017751817914645458, "loss": 1.1491, "step": 2250 }, { "epoch": 0.2966129562643867, "grad_norm": 0.12404743582010269, "learning_rate": 0.00017737291834043185, "loss": 1.14, "step": 2255 }, { "epoch": 0.2972706346596514, "grad_norm": 0.11625643074512482, "learning_rate": 0.00017722724960091978, "loss": 1.1, "step": 2260 }, { "epoch": 0.2979283130549161, "grad_norm": 0.12960922718048096, "learning_rate": 0.00017708117369592816, "loss": 1.159, "step": 2265 }, { "epoch": 0.2985859914501809, "grad_norm": 0.1396075040102005, "learning_rate": 0.00017693469139561343, "loss": 1.1348, "step": 2270 }, { "epoch": 0.2992436698454456, "grad_norm": 0.13064976036548615, "learning_rate": 0.00017678780347227472, "loss": 1.1051, "step": 2275 }, { "epoch": 0.2999013482407103, "grad_norm": 0.11721637099981308, "learning_rate": 0.00017664051070034965, "loss": 1.1215, "step": 2280 }, { "epoch": 0.300559026635975, "grad_norm": 0.12197437882423401, "learning_rate": 0.0001764928138564105, "loss": 1.146, "step": 2285 }, { "epoch": 0.3012167050312397, "grad_norm": 0.12351740896701813, "learning_rate": 0.00017634471371915972, "loss": 1.0756, "step": 2290 }, { "epoch": 0.30187438342650447, "grad_norm": 0.1192743256688118, "learning_rate": 0.0001761962110694262, "loss": 1.1501, "step": 2295 }, { "epoch": 0.30253206182176917, "grad_norm": 0.12094896286725998, "learning_rate": 0.00017604730669016093, "loss": 1.1407, "step": 2300 }, { "epoch": 0.3031897402170339, "grad_norm": 0.12669968605041504, "learning_rate": 0.00017589800136643292, "loss": 1.0965, "step": 2305 }, { "epoch": 0.3038474186122986, "grad_norm": 0.12862138450145721, "learning_rate": 0.00017574829588542511, "loss": 1.081, "step": 2310 }, { "epoch": 0.3045050970075633, "grad_norm": 0.11855939030647278, "learning_rate": 0.00017559819103643028, "loss": 1.0657, "step": 2315 }, { "epoch": 0.305162775402828, "grad_norm": 0.12152007967233658, "learning_rate": 0.00017544768761084667, "loss": 1.0894, "step": 2320 }, { "epoch": 0.30582045379809275, "grad_norm": 0.13230516016483307, "learning_rate": 0.00017529678640217395, "loss": 1.127, "step": 2325 }, { "epoch": 0.30647813219335746, "grad_norm": 0.12552602589130402, "learning_rate": 0.00017514548820600907, "loss": 1.0499, "step": 2330 }, { "epoch": 0.30713581058862216, "grad_norm": 0.12331748753786087, "learning_rate": 0.000174993793820042, "loss": 1.0793, "step": 2335 }, { "epoch": 0.30779348898388686, "grad_norm": 0.12079697847366333, "learning_rate": 0.0001748417040440516, "loss": 1.1465, "step": 2340 }, { "epoch": 0.30845116737915157, "grad_norm": 0.11901120096445084, "learning_rate": 0.00017468921967990118, "loss": 1.0978, "step": 2345 }, { "epoch": 0.30910884577441633, "grad_norm": 0.12657709419727325, "learning_rate": 0.00017453634153153465, "loss": 1.0987, "step": 2350 }, { "epoch": 0.30976652416968103, "grad_norm": 0.12214319407939911, "learning_rate": 0.0001743830704049719, "loss": 1.0886, "step": 2355 }, { "epoch": 0.31042420256494574, "grad_norm": 0.12423741817474365, "learning_rate": 0.00017422940710830483, "loss": 1.1275, "step": 2360 }, { "epoch": 0.31108188096021044, "grad_norm": 0.13517656922340393, "learning_rate": 0.00017407535245169298, "loss": 1.1016, "step": 2365 }, { "epoch": 0.31173955935547515, "grad_norm": 0.11949760466814041, "learning_rate": 0.00017392090724735912, "loss": 1.073, "step": 2370 }, { "epoch": 0.3123972377507399, "grad_norm": 0.12661853432655334, "learning_rate": 0.00017376607230958527, "loss": 1.1103, "step": 2375 }, { "epoch": 0.3130549161460046, "grad_norm": 0.11653164029121399, "learning_rate": 0.00017361084845470814, "loss": 1.061, "step": 2380 }, { "epoch": 0.3137125945412693, "grad_norm": 0.12816305458545685, "learning_rate": 0.000173455236501115, "loss": 1.1077, "step": 2385 }, { "epoch": 0.314370272936534, "grad_norm": 0.125166118144989, "learning_rate": 0.0001732992372692393, "loss": 1.1164, "step": 2390 }, { "epoch": 0.31502795133179873, "grad_norm": 0.12271994352340698, "learning_rate": 0.00017314285158155618, "loss": 1.0721, "step": 2395 }, { "epoch": 0.3156856297270635, "grad_norm": 0.1264958381652832, "learning_rate": 0.00017298608026257848, "loss": 1.1455, "step": 2400 }, { "epoch": 0.3163433081223282, "grad_norm": 0.14083532989025116, "learning_rate": 0.00017282892413885215, "loss": 1.1067, "step": 2405 }, { "epoch": 0.3170009865175929, "grad_norm": 0.16365505754947662, "learning_rate": 0.0001726713840389519, "loss": 1.0725, "step": 2410 }, { "epoch": 0.3176586649128576, "grad_norm": 0.12512172758579254, "learning_rate": 0.00017251346079347695, "loss": 1.0633, "step": 2415 }, { "epoch": 0.3183163433081223, "grad_norm": 0.18074463307857513, "learning_rate": 0.00017235515523504648, "loss": 1.1212, "step": 2420 }, { "epoch": 0.31897402170338707, "grad_norm": 0.12725958228111267, "learning_rate": 0.00017219646819829546, "loss": 1.102, "step": 2425 }, { "epoch": 0.31963170009865177, "grad_norm": 0.12555329501628876, "learning_rate": 0.00017203740051987003, "loss": 1.1415, "step": 2430 }, { "epoch": 0.3202893784939165, "grad_norm": 0.12739256024360657, "learning_rate": 0.00017187795303842327, "loss": 1.0995, "step": 2435 }, { "epoch": 0.3209470568891812, "grad_norm": 0.12488582730293274, "learning_rate": 0.00017171812659461062, "loss": 1.1552, "step": 2440 }, { "epoch": 0.3216047352844459, "grad_norm": 0.12375622987747192, "learning_rate": 0.00017155792203108557, "loss": 1.1402, "step": 2445 }, { "epoch": 0.32226241367971065, "grad_norm": 0.13598953187465668, "learning_rate": 0.00017139734019249513, "loss": 1.1427, "step": 2450 }, { "epoch": 0.32292009207497535, "grad_norm": 0.1290683001279831, "learning_rate": 0.00017123638192547555, "loss": 1.0885, "step": 2455 }, { "epoch": 0.32357777047024006, "grad_norm": 0.17497040331363678, "learning_rate": 0.0001710750480786475, "loss": 1.1031, "step": 2460 }, { "epoch": 0.32423544886550476, "grad_norm": 0.1226261556148529, "learning_rate": 0.00017091333950261208, "loss": 1.1078, "step": 2465 }, { "epoch": 0.32489312726076947, "grad_norm": 0.13011561334133148, "learning_rate": 0.0001707512570499458, "loss": 1.1397, "step": 2470 }, { "epoch": 0.3255508056560342, "grad_norm": 0.12647975981235504, "learning_rate": 0.0001705888015751966, "loss": 1.137, "step": 2475 }, { "epoch": 0.32620848405129893, "grad_norm": 0.1256851702928543, "learning_rate": 0.000170425973934879, "loss": 1.0913, "step": 2480 }, { "epoch": 0.32686616244656364, "grad_norm": 0.12304725497961044, "learning_rate": 0.00017026277498746964, "loss": 1.0895, "step": 2485 }, { "epoch": 0.32752384084182834, "grad_norm": 0.13261272013187408, "learning_rate": 0.00017009920559340292, "loss": 1.108, "step": 2490 }, { "epoch": 0.32818151923709304, "grad_norm": 0.12658004462718964, "learning_rate": 0.00016993526661506628, "loss": 1.0607, "step": 2495 }, { "epoch": 0.3288391976323578, "grad_norm": 0.13057458400726318, "learning_rate": 0.00016977095891679568, "loss": 1.1051, "step": 2500 }, { "epoch": 0.3294968760276225, "grad_norm": 0.12512914836406708, "learning_rate": 0.00016960628336487122, "loss": 1.0935, "step": 2505 }, { "epoch": 0.3301545544228872, "grad_norm": 0.14195580780506134, "learning_rate": 0.00016944124082751225, "loss": 1.0894, "step": 2510 }, { "epoch": 0.3308122328181519, "grad_norm": 0.1212921291589737, "learning_rate": 0.00016927583217487318, "loss": 1.0765, "step": 2515 }, { "epoch": 0.3314699112134166, "grad_norm": 0.1422162652015686, "learning_rate": 0.0001691100582790385, "loss": 1.0866, "step": 2520 }, { "epoch": 0.33212758960868133, "grad_norm": 0.12980493903160095, "learning_rate": 0.0001689439200140185, "loss": 1.075, "step": 2525 }, { "epoch": 0.3327852680039461, "grad_norm": 0.1282375603914261, "learning_rate": 0.0001687774182557445, "loss": 1.0824, "step": 2530 }, { "epoch": 0.3334429463992108, "grad_norm": 0.1268918216228485, "learning_rate": 0.00016861055388206424, "loss": 1.0582, "step": 2535 }, { "epoch": 0.3341006247944755, "grad_norm": 0.126429945230484, "learning_rate": 0.00016844332777273734, "loss": 1.0932, "step": 2540 }, { "epoch": 0.3347583031897402, "grad_norm": 0.12956790626049042, "learning_rate": 0.00016827574080943052, "loss": 1.0958, "step": 2545 }, { "epoch": 0.3354159815850049, "grad_norm": 0.13723298907279968, "learning_rate": 0.0001681077938757131, "loss": 1.0754, "step": 2550 }, { "epoch": 0.33607365998026967, "grad_norm": 0.1244669184088707, "learning_rate": 0.0001679394878570522, "loss": 1.0982, "step": 2555 }, { "epoch": 0.3367313383755344, "grad_norm": 0.13264647126197815, "learning_rate": 0.00016777082364080825, "loss": 1.1032, "step": 2560 }, { "epoch": 0.3373890167707991, "grad_norm": 0.12390298396348953, "learning_rate": 0.00016760180211623012, "loss": 1.071, "step": 2565 }, { "epoch": 0.3380466951660638, "grad_norm": 0.12482525408267975, "learning_rate": 0.00016743242417445052, "loss": 1.0383, "step": 2570 }, { "epoch": 0.3387043735613285, "grad_norm": 0.13515181839466095, "learning_rate": 0.00016726269070848143, "loss": 1.1296, "step": 2575 }, { "epoch": 0.33936205195659325, "grad_norm": 0.12122641503810883, "learning_rate": 0.00016709260261320899, "loss": 1.1106, "step": 2580 }, { "epoch": 0.34001973035185795, "grad_norm": 0.12400903552770615, "learning_rate": 0.0001669221607853893, "loss": 1.0857, "step": 2585 }, { "epoch": 0.34067740874712266, "grad_norm": 0.1242712140083313, "learning_rate": 0.00016675136612364342, "loss": 1.0954, "step": 2590 }, { "epoch": 0.34133508714238736, "grad_norm": 0.1238674446940422, "learning_rate": 0.00016658021952845254, "loss": 1.1159, "step": 2595 }, { "epoch": 0.34199276553765207, "grad_norm": 0.16190621256828308, "learning_rate": 0.00016640872190215337, "loss": 1.0816, "step": 2600 }, { "epoch": 0.3426504439329168, "grad_norm": 0.12380421906709671, "learning_rate": 0.0001662368741489335, "loss": 1.0925, "step": 2605 }, { "epoch": 0.34330812232818153, "grad_norm": 0.12329338490962982, "learning_rate": 0.00016606467717482633, "loss": 1.074, "step": 2610 }, { "epoch": 0.34396580072344624, "grad_norm": 0.12777286767959595, "learning_rate": 0.00016589213188770657, "loss": 1.0925, "step": 2615 }, { "epoch": 0.34462347911871094, "grad_norm": 0.11988986283540726, "learning_rate": 0.00016571923919728526, "loss": 1.0737, "step": 2620 }, { "epoch": 0.34528115751397565, "grad_norm": 0.1298821121454239, "learning_rate": 0.0001655460000151051, "loss": 1.1112, "step": 2625 }, { "epoch": 0.3459388359092404, "grad_norm": 0.12213220447301865, "learning_rate": 0.00016537241525453565, "loss": 1.1152, "step": 2630 }, { "epoch": 0.3465965143045051, "grad_norm": 0.13164806365966797, "learning_rate": 0.00016519848583076833, "loss": 1.1107, "step": 2635 }, { "epoch": 0.3472541926997698, "grad_norm": 0.1268683671951294, "learning_rate": 0.00016502421266081186, "loss": 1.1066, "step": 2640 }, { "epoch": 0.3479118710950345, "grad_norm": 0.12438154220581055, "learning_rate": 0.00016484959666348723, "loss": 1.0989, "step": 2645 }, { "epoch": 0.3485695494902992, "grad_norm": 0.1333337426185608, "learning_rate": 0.0001646746387594229, "loss": 1.1507, "step": 2650 }, { "epoch": 0.349227227885564, "grad_norm": 0.15039579570293427, "learning_rate": 0.00016449933987105006, "loss": 1.1035, "step": 2655 }, { "epoch": 0.3498849062808287, "grad_norm": 0.13221368193626404, "learning_rate": 0.00016432370092259754, "loss": 1.1134, "step": 2660 }, { "epoch": 0.3505425846760934, "grad_norm": 0.12188355624675751, "learning_rate": 0.0001641477228400872, "loss": 1.137, "step": 2665 }, { "epoch": 0.3512002630713581, "grad_norm": 0.1230362057685852, "learning_rate": 0.00016397140655132882, "loss": 1.0566, "step": 2670 }, { "epoch": 0.3518579414666228, "grad_norm": 0.1330370157957077, "learning_rate": 0.00016379475298591536, "loss": 1.1074, "step": 2675 }, { "epoch": 0.35251561986188756, "grad_norm": 0.12705188989639282, "learning_rate": 0.00016361776307521794, "loss": 1.0819, "step": 2680 }, { "epoch": 0.35317329825715227, "grad_norm": 0.12731413543224335, "learning_rate": 0.00016344043775238113, "loss": 1.0922, "step": 2685 }, { "epoch": 0.353830976652417, "grad_norm": 0.1292489618062973, "learning_rate": 0.00016326277795231776, "loss": 1.0522, "step": 2690 }, { "epoch": 0.3544886550476817, "grad_norm": 0.12442190200090408, "learning_rate": 0.00016308478461170417, "loss": 1.1163, "step": 2695 }, { "epoch": 0.3551463334429464, "grad_norm": 0.12585832178592682, "learning_rate": 0.00016290645866897524, "loss": 1.0989, "step": 2700 }, { "epoch": 0.3558040118382111, "grad_norm": 0.1383058875799179, "learning_rate": 0.00016272780106431939, "loss": 1.0987, "step": 2705 }, { "epoch": 0.35646169023347585, "grad_norm": 0.12392554432153702, "learning_rate": 0.0001625488127396737, "loss": 1.0608, "step": 2710 }, { "epoch": 0.35711936862874055, "grad_norm": 0.13014213740825653, "learning_rate": 0.00016236949463871894, "loss": 1.1146, "step": 2715 }, { "epoch": 0.35777704702400526, "grad_norm": 0.12385332584381104, "learning_rate": 0.00016218984770687448, "loss": 1.1072, "step": 2720 }, { "epoch": 0.35843472541926996, "grad_norm": 0.13289488852024078, "learning_rate": 0.00016200987289129347, "loss": 1.1155, "step": 2725 }, { "epoch": 0.35909240381453467, "grad_norm": 0.13012099266052246, "learning_rate": 0.00016182957114085764, "loss": 1.1128, "step": 2730 }, { "epoch": 0.3597500822097994, "grad_norm": 0.12161566317081451, "learning_rate": 0.0001616489434061725, "loss": 1.0788, "step": 2735 }, { "epoch": 0.36040776060506413, "grad_norm": 0.1252036839723587, "learning_rate": 0.00016146799063956232, "loss": 1.0717, "step": 2740 }, { "epoch": 0.36106543900032884, "grad_norm": 0.12470807135105133, "learning_rate": 0.00016128671379506493, "loss": 1.1193, "step": 2745 }, { "epoch": 0.36172311739559354, "grad_norm": 0.12273968011140823, "learning_rate": 0.00016110511382842678, "loss": 1.0805, "step": 2750 }, { "epoch": 0.36238079579085825, "grad_norm": 0.12397629022598267, "learning_rate": 0.00016092319169709805, "loss": 1.107, "step": 2755 }, { "epoch": 0.363038474186123, "grad_norm": 0.12711986899375916, "learning_rate": 0.0001607409483602273, "loss": 1.1507, "step": 2760 }, { "epoch": 0.3636961525813877, "grad_norm": 0.12929007411003113, "learning_rate": 0.0001605583847786567, "loss": 1.2242, "step": 2765 }, { "epoch": 0.3643538309766524, "grad_norm": 0.12682737410068512, "learning_rate": 0.00016037550191491686, "loss": 1.1165, "step": 2770 }, { "epoch": 0.3650115093719171, "grad_norm": 0.14706994593143463, "learning_rate": 0.0001601923007332216, "loss": 1.1261, "step": 2775 }, { "epoch": 0.3656691877671818, "grad_norm": 0.11725219339132309, "learning_rate": 0.0001600087821994632, "loss": 1.0876, "step": 2780 }, { "epoch": 0.3663268661624466, "grad_norm": 0.12180408835411072, "learning_rate": 0.00015982494728120701, "loss": 1.1388, "step": 2785 }, { "epoch": 0.3669845445577113, "grad_norm": 0.11758913844823837, "learning_rate": 0.0001596407969476864, "loss": 1.1425, "step": 2790 }, { "epoch": 0.367642222952976, "grad_norm": 0.12176872044801712, "learning_rate": 0.00015945633216979784, "loss": 1.0863, "step": 2795 }, { "epoch": 0.3682999013482407, "grad_norm": 0.13060422241687775, "learning_rate": 0.00015927155392009558, "loss": 1.0995, "step": 2800 }, { "epoch": 0.3689575797435054, "grad_norm": 0.1483689397573471, "learning_rate": 0.00015908646317278654, "loss": 1.1121, "step": 2805 }, { "epoch": 0.36961525813877016, "grad_norm": 0.1250036656856537, "learning_rate": 0.00015890106090372532, "loss": 1.1364, "step": 2810 }, { "epoch": 0.37027293653403487, "grad_norm": 0.1355031430721283, "learning_rate": 0.0001587153480904089, "loss": 1.0922, "step": 2815 }, { "epoch": 0.3709306149292996, "grad_norm": 0.11843118071556091, "learning_rate": 0.00015852932571197147, "loss": 1.1006, "step": 2820 }, { "epoch": 0.3715882933245643, "grad_norm": 0.1321127563714981, "learning_rate": 0.00015834299474917955, "loss": 1.1225, "step": 2825 }, { "epoch": 0.372245971719829, "grad_norm": 0.1261935979127884, "learning_rate": 0.00015815635618442636, "loss": 1.1064, "step": 2830 }, { "epoch": 0.37290365011509374, "grad_norm": 0.1262204498052597, "learning_rate": 0.0001579694110017271, "loss": 1.1287, "step": 2835 }, { "epoch": 0.37356132851035845, "grad_norm": 0.11575670540332794, "learning_rate": 0.00015778216018671345, "loss": 1.1442, "step": 2840 }, { "epoch": 0.37421900690562315, "grad_norm": 0.14254365861415863, "learning_rate": 0.00015759460472662847, "loss": 1.122, "step": 2845 }, { "epoch": 0.37487668530088786, "grad_norm": 0.14602047204971313, "learning_rate": 0.00015740674561032144, "loss": 1.1501, "step": 2850 }, { "epoch": 0.37553436369615256, "grad_norm": 0.12559400498867035, "learning_rate": 0.00015721858382824254, "loss": 1.1225, "step": 2855 }, { "epoch": 0.3761920420914173, "grad_norm": 0.14463557302951813, "learning_rate": 0.0001570301203724378, "loss": 1.1054, "step": 2860 }, { "epoch": 0.376849720486682, "grad_norm": 0.12643663585186005, "learning_rate": 0.00015684135623654375, "loss": 1.0898, "step": 2865 }, { "epoch": 0.37750739888194673, "grad_norm": 0.12367218732833862, "learning_rate": 0.00015665229241578206, "loss": 1.0963, "step": 2870 }, { "epoch": 0.37816507727721144, "grad_norm": 0.12368357926607132, "learning_rate": 0.0001564629299069546, "loss": 1.138, "step": 2875 }, { "epoch": 0.37882275567247614, "grad_norm": 0.12159838527441025, "learning_rate": 0.00015627326970843792, "loss": 1.1424, "step": 2880 }, { "epoch": 0.3794804340677409, "grad_norm": 0.12276262789964676, "learning_rate": 0.00015608331282017805, "loss": 1.067, "step": 2885 }, { "epoch": 0.3801381124630056, "grad_norm": 0.13890202343463898, "learning_rate": 0.00015589306024368539, "loss": 1.0702, "step": 2890 }, { "epoch": 0.3807957908582703, "grad_norm": 0.12923169136047363, "learning_rate": 0.00015570251298202922, "loss": 1.1435, "step": 2895 }, { "epoch": 0.381453469253535, "grad_norm": 0.1323583722114563, "learning_rate": 0.00015551167203983244, "loss": 1.1147, "step": 2900 }, { "epoch": 0.3821111476487997, "grad_norm": 0.1250571459531784, "learning_rate": 0.00015532053842326636, "loss": 1.128, "step": 2905 }, { "epoch": 0.3827688260440644, "grad_norm": 0.1353861540555954, "learning_rate": 0.00015512911314004545, "loss": 1.1294, "step": 2910 }, { "epoch": 0.3834265044393292, "grad_norm": 0.14497464895248413, "learning_rate": 0.00015493739719942177, "loss": 1.0923, "step": 2915 }, { "epoch": 0.3840841828345939, "grad_norm": 0.12390535324811935, "learning_rate": 0.00015474539161217994, "loss": 1.1167, "step": 2920 }, { "epoch": 0.3847418612298586, "grad_norm": 0.12288369238376617, "learning_rate": 0.00015455309739063165, "loss": 1.1155, "step": 2925 }, { "epoch": 0.3853995396251233, "grad_norm": 0.11992330104112625, "learning_rate": 0.0001543605155486103, "loss": 1.077, "step": 2930 }, { "epoch": 0.386057218020388, "grad_norm": 0.12463527172803879, "learning_rate": 0.00015416764710146578, "loss": 1.0804, "step": 2935 }, { "epoch": 0.38671489641565276, "grad_norm": 0.18415139615535736, "learning_rate": 0.00015397449306605912, "loss": 1.0035, "step": 2940 }, { "epoch": 0.38737257481091747, "grad_norm": 0.1352183222770691, "learning_rate": 0.0001537810544607568, "loss": 1.1183, "step": 2945 }, { "epoch": 0.3880302532061822, "grad_norm": 0.11789307743310928, "learning_rate": 0.000153587332305426, "loss": 1.0749, "step": 2950 }, { "epoch": 0.3886879316014469, "grad_norm": 0.12985996901988983, "learning_rate": 0.0001533933276214285, "loss": 1.0881, "step": 2955 }, { "epoch": 0.3893456099967116, "grad_norm": 0.12444959580898285, "learning_rate": 0.00015319904143161595, "loss": 1.12, "step": 2960 }, { "epoch": 0.39000328839197634, "grad_norm": 0.11867447942495346, "learning_rate": 0.00015300447476032403, "loss": 1.1156, "step": 2965 }, { "epoch": 0.39066096678724105, "grad_norm": 0.12951654195785522, "learning_rate": 0.00015280962863336716, "loss": 1.0566, "step": 2970 }, { "epoch": 0.39131864518250575, "grad_norm": 0.14193734526634216, "learning_rate": 0.00015261450407803333, "loss": 1.0833, "step": 2975 }, { "epoch": 0.39197632357777046, "grad_norm": 0.13168539106845856, "learning_rate": 0.0001524191021230783, "loss": 1.1158, "step": 2980 }, { "epoch": 0.39263400197303516, "grad_norm": 0.1329220086336136, "learning_rate": 0.0001522234237987204, "loss": 1.1273, "step": 2985 }, { "epoch": 0.3932916803682999, "grad_norm": 0.1280069202184677, "learning_rate": 0.0001520274701366352, "loss": 1.042, "step": 2990 }, { "epoch": 0.3939493587635646, "grad_norm": 0.13296009600162506, "learning_rate": 0.0001518312421699497, "loss": 1.1395, "step": 2995 }, { "epoch": 0.39460703715882933, "grad_norm": 0.11756201833486557, "learning_rate": 0.00015163474093323736, "loss": 1.0452, "step": 3000 }, { "epoch": 0.39526471555409404, "grad_norm": 0.12450161576271057, "learning_rate": 0.00015143796746251224, "loss": 1.1517, "step": 3005 }, { "epoch": 0.39592239394935874, "grad_norm": 0.12644536793231964, "learning_rate": 0.00015124092279522375, "loss": 1.0741, "step": 3010 }, { "epoch": 0.3965800723446235, "grad_norm": 0.12840372323989868, "learning_rate": 0.00015104360797025114, "loss": 1.1077, "step": 3015 }, { "epoch": 0.3972377507398882, "grad_norm": 0.12556473910808563, "learning_rate": 0.00015084602402789802, "loss": 1.1092, "step": 3020 }, { "epoch": 0.3978954291351529, "grad_norm": 0.11911559104919434, "learning_rate": 0.0001506481720098868, "loss": 1.0669, "step": 3025 }, { "epoch": 0.3985531075304176, "grad_norm": 0.1171235591173172, "learning_rate": 0.0001504500529593534, "loss": 1.1161, "step": 3030 }, { "epoch": 0.3992107859256823, "grad_norm": 0.1283843219280243, "learning_rate": 0.00015025166792084147, "loss": 1.109, "step": 3035 }, { "epoch": 0.3998684643209471, "grad_norm": 0.12909185886383057, "learning_rate": 0.00015005301794029712, "loss": 1.0807, "step": 3040 }, { "epoch": 0.4005261427162118, "grad_norm": 0.12499774992465973, "learning_rate": 0.00014985410406506325, "loss": 1.0629, "step": 3045 }, { "epoch": 0.4011838211114765, "grad_norm": 0.12170391529798508, "learning_rate": 0.0001496549273438742, "loss": 1.0798, "step": 3050 }, { "epoch": 0.4018414995067412, "grad_norm": 0.128540500998497, "learning_rate": 0.00014945548882685, "loss": 1.1395, "step": 3055 }, { "epoch": 0.4024991779020059, "grad_norm": 0.12214395403862, "learning_rate": 0.00014925578956549105, "loss": 1.1439, "step": 3060 }, { "epoch": 0.40315685629727066, "grad_norm": 0.13780651986598969, "learning_rate": 0.00014905583061267233, "loss": 1.1373, "step": 3065 }, { "epoch": 0.40381453469253537, "grad_norm": 0.1346084028482437, "learning_rate": 0.0001488556130226381, "loss": 1.1271, "step": 3070 }, { "epoch": 0.40447221308780007, "grad_norm": 0.12058104574680328, "learning_rate": 0.00014865513785099625, "loss": 1.0951, "step": 3075 }, { "epoch": 0.4051298914830648, "grad_norm": 0.12147203832864761, "learning_rate": 0.00014845440615471266, "loss": 1.1005, "step": 3080 }, { "epoch": 0.4057875698783295, "grad_norm": 0.12585145235061646, "learning_rate": 0.0001482534189921057, "loss": 1.1075, "step": 3085 }, { "epoch": 0.40644524827359424, "grad_norm": 0.12978211045265198, "learning_rate": 0.00014805217742284069, "loss": 1.1205, "step": 3090 }, { "epoch": 0.40710292666885894, "grad_norm": 0.12356739491224289, "learning_rate": 0.0001478506825079241, "loss": 1.0938, "step": 3095 }, { "epoch": 0.40776060506412365, "grad_norm": 0.12132400274276733, "learning_rate": 0.00014764893530969835, "loss": 1.0844, "step": 3100 }, { "epoch": 0.40841828345938835, "grad_norm": 0.1217535063624382, "learning_rate": 0.00014744693689183578, "loss": 1.0439, "step": 3105 }, { "epoch": 0.40907596185465306, "grad_norm": 0.11531001329421997, "learning_rate": 0.00014724468831933332, "loss": 1.0825, "step": 3110 }, { "epoch": 0.40973364024991776, "grad_norm": 0.13191929459571838, "learning_rate": 0.00014704219065850678, "loss": 1.0972, "step": 3115 }, { "epoch": 0.4103913186451825, "grad_norm": 0.12195173650979996, "learning_rate": 0.00014683944497698525, "loss": 1.0734, "step": 3120 }, { "epoch": 0.41104899704044723, "grad_norm": 0.12433221191167831, "learning_rate": 0.0001466364523437054, "loss": 1.0471, "step": 3125 }, { "epoch": 0.41170667543571193, "grad_norm": 0.12673376500606537, "learning_rate": 0.00014643321382890604, "loss": 1.0495, "step": 3130 }, { "epoch": 0.41236435383097664, "grad_norm": 0.12182612717151642, "learning_rate": 0.00014622973050412214, "loss": 1.0864, "step": 3135 }, { "epoch": 0.41302203222624134, "grad_norm": 0.11998345702886581, "learning_rate": 0.0001460260034421796, "loss": 1.0994, "step": 3140 }, { "epoch": 0.4136797106215061, "grad_norm": 0.12182337790727615, "learning_rate": 0.0001458220337171892, "loss": 1.0823, "step": 3145 }, { "epoch": 0.4143373890167708, "grad_norm": 0.12792836129665375, "learning_rate": 0.0001456178224045412, "loss": 1.1291, "step": 3150 }, { "epoch": 0.4149950674120355, "grad_norm": 0.13150392472743988, "learning_rate": 0.0001454133705808996, "loss": 1.0571, "step": 3155 }, { "epoch": 0.4156527458073002, "grad_norm": 0.1239900216460228, "learning_rate": 0.00014520867932419642, "loss": 1.1053, "step": 3160 }, { "epoch": 0.4163104242025649, "grad_norm": 0.12728357315063477, "learning_rate": 0.00014500374971362597, "loss": 1.0773, "step": 3165 }, { "epoch": 0.4169681025978297, "grad_norm": 0.14005187153816223, "learning_rate": 0.00014479858282963936, "loss": 1.1883, "step": 3170 }, { "epoch": 0.4176257809930944, "grad_norm": 0.12414336949586868, "learning_rate": 0.0001445931797539386, "loss": 1.1252, "step": 3175 }, { "epoch": 0.4182834593883591, "grad_norm": 0.12211406230926514, "learning_rate": 0.00014438754156947104, "loss": 1.1409, "step": 3180 }, { "epoch": 0.4189411377836238, "grad_norm": 0.1264246553182602, "learning_rate": 0.00014418166936042351, "loss": 1.1444, "step": 3185 }, { "epoch": 0.4195988161788885, "grad_norm": 0.14197120070457458, "learning_rate": 0.00014397556421221673, "loss": 1.1084, "step": 3190 }, { "epoch": 0.42025649457415326, "grad_norm": 0.12400893867015839, "learning_rate": 0.00014376922721149953, "loss": 1.069, "step": 3195 }, { "epoch": 0.42091417296941797, "grad_norm": 0.12539450824260712, "learning_rate": 0.00014356265944614316, "loss": 1.1325, "step": 3200 }, { "epoch": 0.42157185136468267, "grad_norm": 0.13113568723201752, "learning_rate": 0.00014335586200523554, "loss": 1.0709, "step": 3205 }, { "epoch": 0.4222295297599474, "grad_norm": 0.1276724487543106, "learning_rate": 0.00014314883597907547, "loss": 1.0833, "step": 3210 }, { "epoch": 0.4228872081552121, "grad_norm": 0.13035115599632263, "learning_rate": 0.00014294158245916697, "loss": 1.0753, "step": 3215 }, { "epoch": 0.42354488655047684, "grad_norm": 0.1298818439245224, "learning_rate": 0.00014273410253821343, "loss": 1.107, "step": 3220 }, { "epoch": 0.42420256494574154, "grad_norm": 0.16027629375457764, "learning_rate": 0.0001425263973101119, "loss": 1.0454, "step": 3225 }, { "epoch": 0.42486024334100625, "grad_norm": 0.13499340415000916, "learning_rate": 0.00014231846786994735, "loss": 1.1252, "step": 3230 }, { "epoch": 0.42551792173627095, "grad_norm": 0.12805627286434174, "learning_rate": 0.00014211031531398681, "loss": 1.1043, "step": 3235 }, { "epoch": 0.42617560013153566, "grad_norm": 0.1284942775964737, "learning_rate": 0.00014190194073967374, "loss": 1.1226, "step": 3240 }, { "epoch": 0.4268332785268004, "grad_norm": 0.12042582780122757, "learning_rate": 0.00014169334524562202, "loss": 1.1221, "step": 3245 }, { "epoch": 0.4274909569220651, "grad_norm": 0.13211563229560852, "learning_rate": 0.00014148452993161044, "loss": 1.1008, "step": 3250 }, { "epoch": 0.42814863531732983, "grad_norm": 0.12737567722797394, "learning_rate": 0.00014127549589857657, "loss": 1.1002, "step": 3255 }, { "epoch": 0.42880631371259453, "grad_norm": 0.12149865180253983, "learning_rate": 0.00014106624424861127, "loss": 1.0709, "step": 3260 }, { "epoch": 0.42946399210785924, "grad_norm": 0.12509702146053314, "learning_rate": 0.00014085677608495267, "loss": 1.0674, "step": 3265 }, { "epoch": 0.430121670503124, "grad_norm": 0.130250483751297, "learning_rate": 0.00014064709251198048, "loss": 1.1223, "step": 3270 }, { "epoch": 0.4307793488983887, "grad_norm": 0.12997671961784363, "learning_rate": 0.00014043719463521006, "loss": 1.0932, "step": 3275 }, { "epoch": 0.4314370272936534, "grad_norm": 0.11667392402887344, "learning_rate": 0.00014022708356128667, "loss": 1.0609, "step": 3280 }, { "epoch": 0.4320947056889181, "grad_norm": 0.11944939941167831, "learning_rate": 0.00014001676039797963, "loss": 1.1311, "step": 3285 }, { "epoch": 0.4327523840841828, "grad_norm": 0.1317392736673355, "learning_rate": 0.00013980622625417644, "loss": 1.1634, "step": 3290 }, { "epoch": 0.4334100624794476, "grad_norm": 0.11947982013225555, "learning_rate": 0.00013959548223987692, "loss": 1.0331, "step": 3295 }, { "epoch": 0.4340677408747123, "grad_norm": 0.13477420806884766, "learning_rate": 0.00013938452946618747, "loss": 1.0521, "step": 3300 }, { "epoch": 0.434725419269977, "grad_norm": 0.12390883266925812, "learning_rate": 0.00013917336904531504, "loss": 1.0423, "step": 3305 }, { "epoch": 0.4353830976652417, "grad_norm": 0.12213205546140671, "learning_rate": 0.0001389620020905614, "loss": 1.095, "step": 3310 }, { "epoch": 0.4360407760605064, "grad_norm": 0.12726520001888275, "learning_rate": 0.0001387504297163173, "loss": 1.1031, "step": 3315 }, { "epoch": 0.4366984544557711, "grad_norm": 0.1460600644350052, "learning_rate": 0.0001385386530380564, "loss": 1.0563, "step": 3320 }, { "epoch": 0.43735613285103586, "grad_norm": 0.12723597884178162, "learning_rate": 0.00013832667317232957, "loss": 1.0956, "step": 3325 }, { "epoch": 0.43801381124630057, "grad_norm": 0.15137435495853424, "learning_rate": 0.00013811449123675898, "loss": 1.1428, "step": 3330 }, { "epoch": 0.43867148964156527, "grad_norm": 0.12733370065689087, "learning_rate": 0.0001379021083500321, "loss": 1.0866, "step": 3335 }, { "epoch": 0.43932916803683, "grad_norm": 0.13179180026054382, "learning_rate": 0.00013768952563189598, "loss": 1.1577, "step": 3340 }, { "epoch": 0.4399868464320947, "grad_norm": 0.1282440572977066, "learning_rate": 0.0001374767442031511, "loss": 1.0989, "step": 3345 }, { "epoch": 0.44064452482735944, "grad_norm": 0.12464404851198196, "learning_rate": 0.0001372637651856457, "loss": 1.1176, "step": 3350 }, { "epoch": 0.44130220322262415, "grad_norm": 0.13569608330726624, "learning_rate": 0.0001370505897022698, "loss": 1.0791, "step": 3355 }, { "epoch": 0.44195988161788885, "grad_norm": 0.12432218343019485, "learning_rate": 0.00013683721887694912, "loss": 1.086, "step": 3360 }, { "epoch": 0.44261756001315355, "grad_norm": 0.1288546621799469, "learning_rate": 0.0001366236538346394, "loss": 1.089, "step": 3365 }, { "epoch": 0.44327523840841826, "grad_norm": 0.12247249484062195, "learning_rate": 0.0001364098957013203, "loss": 1.0728, "step": 3370 }, { "epoch": 0.443932916803683, "grad_norm": 0.12455036491155624, "learning_rate": 0.00013619594560398954, "loss": 1.0879, "step": 3375 }, { "epoch": 0.4445905951989477, "grad_norm": 0.1363576352596283, "learning_rate": 0.0001359818046706569, "loss": 1.1117, "step": 3380 }, { "epoch": 0.44524827359421243, "grad_norm": 0.11969294399023056, "learning_rate": 0.0001357674740303383, "loss": 1.0286, "step": 3385 }, { "epoch": 0.44590595198947713, "grad_norm": 0.1257266402244568, "learning_rate": 0.0001355529548130499, "loss": 1.1575, "step": 3390 }, { "epoch": 0.44656363038474184, "grad_norm": 0.12998563051223755, "learning_rate": 0.00013533824814980207, "loss": 1.0898, "step": 3395 }, { "epoch": 0.4472213087800066, "grad_norm": 0.1394844353199005, "learning_rate": 0.0001351233551725934, "loss": 1.107, "step": 3400 }, { "epoch": 0.4478789871752713, "grad_norm": 0.13212339580059052, "learning_rate": 0.00013490827701440492, "loss": 1.1068, "step": 3405 }, { "epoch": 0.448536665570536, "grad_norm": 0.12933598458766937, "learning_rate": 0.00013469301480919385, "loss": 1.0736, "step": 3410 }, { "epoch": 0.4491943439658007, "grad_norm": 0.13337726891040802, "learning_rate": 0.0001344775696918878, "loss": 1.098, "step": 3415 }, { "epoch": 0.4498520223610654, "grad_norm": 0.1277601271867752, "learning_rate": 0.0001342619427983788, "loss": 1.1233, "step": 3420 }, { "epoch": 0.4505097007563302, "grad_norm": 0.13602183759212494, "learning_rate": 0.0001340461352655172, "loss": 1.1101, "step": 3425 }, { "epoch": 0.4511673791515949, "grad_norm": 0.12428414821624756, "learning_rate": 0.00013383014823110581, "loss": 1.0925, "step": 3430 }, { "epoch": 0.4518250575468596, "grad_norm": 0.12462227791547775, "learning_rate": 0.00013361398283389365, "loss": 1.0967, "step": 3435 }, { "epoch": 0.4524827359421243, "grad_norm": 0.12934766709804535, "learning_rate": 0.00013339764021357041, "loss": 1.1215, "step": 3440 }, { "epoch": 0.453140414337389, "grad_norm": 0.1186889186501503, "learning_rate": 0.00013318112151075988, "loss": 1.0971, "step": 3445 }, { "epoch": 0.45379809273265376, "grad_norm": 0.1325155645608902, "learning_rate": 0.00013296442786701434, "loss": 1.076, "step": 3450 }, { "epoch": 0.45445577112791846, "grad_norm": 0.12837158143520355, "learning_rate": 0.00013274756042480843, "loss": 1.1157, "step": 3455 }, { "epoch": 0.45511344952318317, "grad_norm": 0.12614837288856506, "learning_rate": 0.00013253052032753302, "loss": 1.0957, "step": 3460 }, { "epoch": 0.45577112791844787, "grad_norm": 0.12727420032024384, "learning_rate": 0.0001323133087194894, "loss": 1.0594, "step": 3465 }, { "epoch": 0.4564288063137126, "grad_norm": 0.1287621557712555, "learning_rate": 0.00013209592674588293, "loss": 1.1056, "step": 3470 }, { "epoch": 0.45708648470897734, "grad_norm": 0.12691043317317963, "learning_rate": 0.0001318783755528174, "loss": 1.1341, "step": 3475 }, { "epoch": 0.45774416310424204, "grad_norm": 0.12944604456424713, "learning_rate": 0.00013166065628728862, "loss": 1.0734, "step": 3480 }, { "epoch": 0.45840184149950675, "grad_norm": 0.1291738599538803, "learning_rate": 0.00013144277009717855, "loss": 1.126, "step": 3485 }, { "epoch": 0.45905951989477145, "grad_norm": 0.1381511688232422, "learning_rate": 0.00013122471813124933, "loss": 1.1032, "step": 3490 }, { "epoch": 0.45971719829003616, "grad_norm": 0.13690516352653503, "learning_rate": 0.00013100650153913704, "loss": 1.0974, "step": 3495 }, { "epoch": 0.4603748766853009, "grad_norm": 0.1291900873184204, "learning_rate": 0.00013078812147134574, "loss": 1.0688, "step": 3500 }, { "epoch": 0.4610325550805656, "grad_norm": 0.11823274195194244, "learning_rate": 0.0001305695790792413, "loss": 1.112, "step": 3505 }, { "epoch": 0.4616902334758303, "grad_norm": 0.12258408218622208, "learning_rate": 0.0001303508755150455, "loss": 1.1238, "step": 3510 }, { "epoch": 0.46234791187109503, "grad_norm": 0.1247965544462204, "learning_rate": 0.00013013201193182993, "loss": 1.0884, "step": 3515 }, { "epoch": 0.46300559026635973, "grad_norm": 0.12965965270996094, "learning_rate": 0.0001299129894835097, "loss": 1.0893, "step": 3520 }, { "epoch": 0.46366326866162444, "grad_norm": 0.12263607978820801, "learning_rate": 0.00012969380932483765, "loss": 1.0891, "step": 3525 }, { "epoch": 0.4643209470568892, "grad_norm": 0.11778580397367477, "learning_rate": 0.00012947447261139793, "loss": 1.0513, "step": 3530 }, { "epoch": 0.4649786254521539, "grad_norm": 0.13151909410953522, "learning_rate": 0.00012925498049960035, "loss": 1.1106, "step": 3535 }, { "epoch": 0.4656363038474186, "grad_norm": 0.14801956713199615, "learning_rate": 0.00012903533414667376, "loss": 1.1005, "step": 3540 }, { "epoch": 0.4662939822426833, "grad_norm": 0.12768159806728363, "learning_rate": 0.00012881553471066039, "loss": 1.0821, "step": 3545 }, { "epoch": 0.466951660637948, "grad_norm": 0.12660740315914154, "learning_rate": 0.00012859558335040955, "loss": 1.0799, "step": 3550 }, { "epoch": 0.4676093390332128, "grad_norm": 0.13079871237277985, "learning_rate": 0.00012837548122557148, "loss": 1.1052, "step": 3555 }, { "epoch": 0.4682670174284775, "grad_norm": 0.11994558572769165, "learning_rate": 0.00012815522949659132, "loss": 1.1254, "step": 3560 }, { "epoch": 0.4689246958237422, "grad_norm": 0.12230885773897171, "learning_rate": 0.00012793482932470305, "loss": 1.0928, "step": 3565 }, { "epoch": 0.4695823742190069, "grad_norm": 0.12666869163513184, "learning_rate": 0.00012771428187192312, "loss": 1.076, "step": 3570 }, { "epoch": 0.4702400526142716, "grad_norm": 0.13335050642490387, "learning_rate": 0.00012749358830104463, "loss": 1.0779, "step": 3575 }, { "epoch": 0.47089773100953636, "grad_norm": 0.12296222895383835, "learning_rate": 0.000127272749775631, "loss": 1.096, "step": 3580 }, { "epoch": 0.47155540940480106, "grad_norm": 0.12848436832427979, "learning_rate": 0.00012705176746000992, "loss": 1.08, "step": 3585 }, { "epoch": 0.47221308780006577, "grad_norm": 0.13130496442317963, "learning_rate": 0.0001268306425192672, "loss": 1.0807, "step": 3590 }, { "epoch": 0.47287076619533047, "grad_norm": 0.12951184809207916, "learning_rate": 0.00012660937611924048, "loss": 1.0464, "step": 3595 }, { "epoch": 0.4735284445905952, "grad_norm": 0.13656343519687653, "learning_rate": 0.00012638796942651345, "loss": 1.1131, "step": 3600 }, { "epoch": 0.47418612298585994, "grad_norm": 0.1269528865814209, "learning_rate": 0.00012616642360840927, "loss": 1.098, "step": 3605 }, { "epoch": 0.47484380138112464, "grad_norm": 0.12353064864873886, "learning_rate": 0.00012594473983298468, "loss": 1.0564, "step": 3610 }, { "epoch": 0.47550147977638935, "grad_norm": 0.1453634351491928, "learning_rate": 0.00012572291926902376, "loss": 1.0651, "step": 3615 }, { "epoch": 0.47615915817165405, "grad_norm": 0.1334347277879715, "learning_rate": 0.00012550096308603184, "loss": 1.1059, "step": 3620 }, { "epoch": 0.47681683656691876, "grad_norm": 0.12640570104122162, "learning_rate": 0.00012527887245422918, "loss": 1.0656, "step": 3625 }, { "epoch": 0.4774745149621835, "grad_norm": 0.15378595888614655, "learning_rate": 0.00012505664854454496, "loss": 1.1319, "step": 3630 }, { "epoch": 0.4781321933574482, "grad_norm": 0.12921735644340515, "learning_rate": 0.000124834292528611, "loss": 1.0895, "step": 3635 }, { "epoch": 0.4787898717527129, "grad_norm": 0.1252502202987671, "learning_rate": 0.00012461180557875572, "loss": 1.0861, "step": 3640 }, { "epoch": 0.47944755014797763, "grad_norm": 0.12254251539707184, "learning_rate": 0.00012438918886799772, "loss": 1.113, "step": 3645 }, { "epoch": 0.48010522854324233, "grad_norm": 0.1281600445508957, "learning_rate": 0.0001241664435700398, "loss": 1.1301, "step": 3650 }, { "epoch": 0.4807629069385071, "grad_norm": 0.12778957188129425, "learning_rate": 0.00012394357085926275, "loss": 1.1324, "step": 3655 }, { "epoch": 0.4814205853337718, "grad_norm": 0.1261673867702484, "learning_rate": 0.0001237205719107191, "loss": 1.0696, "step": 3660 }, { "epoch": 0.4820782637290365, "grad_norm": 0.12591837346553802, "learning_rate": 0.00012349744790012693, "loss": 1.1219, "step": 3665 }, { "epoch": 0.4827359421243012, "grad_norm": 0.12479467689990997, "learning_rate": 0.00012327420000386364, "loss": 1.0605, "step": 3670 }, { "epoch": 0.4833936205195659, "grad_norm": 0.12552618980407715, "learning_rate": 0.00012305082939895992, "loss": 1.0853, "step": 3675 }, { "epoch": 0.4840512989148307, "grad_norm": 0.1314186304807663, "learning_rate": 0.00012282733726309326, "loss": 1.0707, "step": 3680 }, { "epoch": 0.4847089773100954, "grad_norm": 0.12562698125839233, "learning_rate": 0.000122603724774582, "loss": 1.0583, "step": 3685 }, { "epoch": 0.4853666557053601, "grad_norm": 0.13361532986164093, "learning_rate": 0.00012237999311237905, "loss": 1.1069, "step": 3690 }, { "epoch": 0.4860243341006248, "grad_norm": 0.13389931619167328, "learning_rate": 0.00012215614345606547, "loss": 1.047, "step": 3695 }, { "epoch": 0.4866820124958895, "grad_norm": 0.12122887372970581, "learning_rate": 0.00012193217698584465, "loss": 1.073, "step": 3700 }, { "epoch": 0.4873396908911542, "grad_norm": 0.1348717361688614, "learning_rate": 0.00012170809488253567, "loss": 1.0474, "step": 3705 }, { "epoch": 0.48799736928641896, "grad_norm": 0.14813917875289917, "learning_rate": 0.00012148389832756732, "loss": 1.0855, "step": 3710 }, { "epoch": 0.48865504768168366, "grad_norm": 0.11762000620365143, "learning_rate": 0.0001212595885029719, "loss": 1.0761, "step": 3715 }, { "epoch": 0.48931272607694837, "grad_norm": 0.12465359270572662, "learning_rate": 0.00012103516659137875, "loss": 1.1398, "step": 3720 }, { "epoch": 0.4899704044722131, "grad_norm": 0.16188563406467438, "learning_rate": 0.00012081063377600828, "loss": 1.1318, "step": 3725 }, { "epoch": 0.4906280828674778, "grad_norm": 0.12848694622516632, "learning_rate": 0.00012058599124066561, "loss": 1.1845, "step": 3730 }, { "epoch": 0.49128576126274254, "grad_norm": 0.1271568387746811, "learning_rate": 0.00012036124016973422, "loss": 1.0288, "step": 3735 }, { "epoch": 0.49194343965800724, "grad_norm": 0.12502196431159973, "learning_rate": 0.00012013638174817004, "loss": 1.0749, "step": 3740 }, { "epoch": 0.49260111805327195, "grad_norm": 0.12536533176898956, "learning_rate": 0.00011991141716149477, "loss": 1.0729, "step": 3745 }, { "epoch": 0.49325879644853665, "grad_norm": 0.12105745822191238, "learning_rate": 0.00011968634759578997, "loss": 1.0911, "step": 3750 }, { "epoch": 0.49391647484380136, "grad_norm": 0.13725827634334564, "learning_rate": 0.00011946117423769061, "loss": 1.1244, "step": 3755 }, { "epoch": 0.4945741532390661, "grad_norm": 0.12375085055828094, "learning_rate": 0.00011923589827437896, "loss": 1.098, "step": 3760 }, { "epoch": 0.4952318316343308, "grad_norm": 0.12235120683908463, "learning_rate": 0.00011901052089357818, "loss": 1.14, "step": 3765 }, { "epoch": 0.4958895100295955, "grad_norm": 0.19725097715854645, "learning_rate": 0.0001187850432835462, "loss": 1.0934, "step": 3770 }, { "epoch": 0.49654718842486023, "grad_norm": 0.13043101131916046, "learning_rate": 0.00011855946663306934, "loss": 1.0853, "step": 3775 }, { "epoch": 0.49720486682012494, "grad_norm": 0.12537997961044312, "learning_rate": 0.0001183337921314561, "loss": 1.0512, "step": 3780 }, { "epoch": 0.4978625452153897, "grad_norm": 0.13492323458194733, "learning_rate": 0.00011810802096853091, "loss": 1.1728, "step": 3785 }, { "epoch": 0.4985202236106544, "grad_norm": 0.12620237469673157, "learning_rate": 0.0001178821543346278, "loss": 1.0924, "step": 3790 }, { "epoch": 0.4991779020059191, "grad_norm": 0.1316300481557846, "learning_rate": 0.00011765619342058411, "loss": 1.1039, "step": 3795 }, { "epoch": 0.4998355804011838, "grad_norm": 0.14139185845851898, "learning_rate": 0.00011743013941773444, "loss": 1.1026, "step": 3800 }, { "epoch": 0.5004932587964486, "grad_norm": 0.12787555158138275, "learning_rate": 0.00011720399351790387, "loss": 1.0962, "step": 3805 }, { "epoch": 0.5011509371917132, "grad_norm": 0.12925289571285248, "learning_rate": 0.00011697775691340229, "loss": 1.0949, "step": 3810 }, { "epoch": 0.501808615586978, "grad_norm": 0.12285643815994263, "learning_rate": 0.00011675143079701767, "loss": 1.0274, "step": 3815 }, { "epoch": 0.5024662939822426, "grad_norm": 0.14616061747074127, "learning_rate": 0.00011652501636200985, "loss": 1.036, "step": 3820 }, { "epoch": 0.5031239723775074, "grad_norm": 0.1477762907743454, "learning_rate": 0.0001162985148021045, "loss": 1.102, "step": 3825 }, { "epoch": 0.5037816507727721, "grad_norm": 0.15864263474941254, "learning_rate": 0.00011607192731148646, "loss": 1.1251, "step": 3830 }, { "epoch": 0.5044393291680368, "grad_norm": 0.11946552991867065, "learning_rate": 0.00011584525508479371, "loss": 1.1445, "step": 3835 }, { "epoch": 0.5050970075633016, "grad_norm": 0.12886208295822144, "learning_rate": 0.00011561849931711102, "loss": 1.1131, "step": 3840 }, { "epoch": 0.5057546859585662, "grad_norm": 0.12497090548276901, "learning_rate": 0.00011539166120396347, "loss": 1.0961, "step": 3845 }, { "epoch": 0.506412364353831, "grad_norm": 0.1256610006093979, "learning_rate": 0.00011516474194131044, "loss": 1.0921, "step": 3850 }, { "epoch": 0.5070700427490957, "grad_norm": 0.11142367124557495, "learning_rate": 0.0001149377427255391, "loss": 1.1241, "step": 3855 }, { "epoch": 0.5077277211443604, "grad_norm": 0.12402156740427017, "learning_rate": 0.00011471066475345814, "loss": 1.14, "step": 3860 }, { "epoch": 0.5083853995396251, "grad_norm": 0.12801967561244965, "learning_rate": 0.00011448350922229147, "loss": 1.0967, "step": 3865 }, { "epoch": 0.5090430779348898, "grad_norm": 0.1194251999258995, "learning_rate": 0.00011425627732967202, "loss": 1.0192, "step": 3870 }, { "epoch": 0.5097007563301545, "grad_norm": 0.1363779902458191, "learning_rate": 0.00011402897027363513, "loss": 1.1172, "step": 3875 }, { "epoch": 0.5103584347254193, "grad_norm": 0.1349778026342392, "learning_rate": 0.00011380158925261257, "loss": 1.1123, "step": 3880 }, { "epoch": 0.511016113120684, "grad_norm": 0.12127676606178284, "learning_rate": 0.00011357413546542607, "loss": 1.0482, "step": 3885 }, { "epoch": 0.5116737915159487, "grad_norm": 0.12605348229408264, "learning_rate": 0.00011334661011128096, "loss": 1.0773, "step": 3890 }, { "epoch": 0.5123314699112134, "grad_norm": 0.1431836038827896, "learning_rate": 0.00011311901438975989, "loss": 1.0888, "step": 3895 }, { "epoch": 0.5129891483064781, "grad_norm": 0.13068416714668274, "learning_rate": 0.00011289134950081654, "loss": 1.1275, "step": 3900 }, { "epoch": 0.5136468267017429, "grad_norm": 0.13922950625419617, "learning_rate": 0.00011266361664476921, "loss": 1.1153, "step": 3905 }, { "epoch": 0.5143045050970075, "grad_norm": 0.1311306357383728, "learning_rate": 0.00011243581702229462, "loss": 1.0799, "step": 3910 }, { "epoch": 0.5149621834922723, "grad_norm": 0.12552884221076965, "learning_rate": 0.00011220795183442145, "loss": 1.1509, "step": 3915 }, { "epoch": 0.515619861887537, "grad_norm": 0.1237616091966629, "learning_rate": 0.00011198002228252405, "loss": 1.0999, "step": 3920 }, { "epoch": 0.5162775402828017, "grad_norm": 0.13869383931159973, "learning_rate": 0.00011175202956831619, "loss": 1.1293, "step": 3925 }, { "epoch": 0.5169352186780665, "grad_norm": 0.1433591991662979, "learning_rate": 0.00011152397489384454, "loss": 1.1498, "step": 3930 }, { "epoch": 0.5175928970733311, "grad_norm": 0.1644476354122162, "learning_rate": 0.00011129585946148253, "loss": 1.0723, "step": 3935 }, { "epoch": 0.5182505754685959, "grad_norm": 0.12420455366373062, "learning_rate": 0.00011106768447392391, "loss": 1.119, "step": 3940 }, { "epoch": 0.5189082538638605, "grad_norm": 0.12628088891506195, "learning_rate": 0.00011083945113417638, "loss": 1.023, "step": 3945 }, { "epoch": 0.5195659322591253, "grad_norm": 0.1276852786540985, "learning_rate": 0.00011061116064555532, "loss": 1.1178, "step": 3950 }, { "epoch": 0.52022361065439, "grad_norm": 0.13520023226737976, "learning_rate": 0.00011038281421167747, "loss": 1.0918, "step": 3955 }, { "epoch": 0.5208812890496547, "grad_norm": 0.15304797887802124, "learning_rate": 0.00011015441303645442, "loss": 1.0428, "step": 3960 }, { "epoch": 0.5215389674449195, "grad_norm": 0.12026746571063995, "learning_rate": 0.00010992595832408647, "loss": 1.1081, "step": 3965 }, { "epoch": 0.5221966458401841, "grad_norm": 0.13084089756011963, "learning_rate": 0.00010969745127905608, "loss": 1.0643, "step": 3970 }, { "epoch": 0.5228543242354489, "grad_norm": 0.13253986835479736, "learning_rate": 0.00010946889310612176, "loss": 1.1111, "step": 3975 }, { "epoch": 0.5235120026307136, "grad_norm": 0.12795184552669525, "learning_rate": 0.0001092402850103115, "loss": 1.0887, "step": 3980 }, { "epoch": 0.5241696810259783, "grad_norm": 0.12404151260852814, "learning_rate": 0.00010901162819691643, "loss": 1.1272, "step": 3985 }, { "epoch": 0.524827359421243, "grad_norm": 0.12153102457523346, "learning_rate": 0.00010878292387148472, "loss": 1.0378, "step": 3990 }, { "epoch": 0.5254850378165077, "grad_norm": 0.12357180565595627, "learning_rate": 0.0001085541732398149, "loss": 1.0972, "step": 3995 }, { "epoch": 0.5261427162117724, "grad_norm": 0.12835635244846344, "learning_rate": 0.00010832537750794963, "loss": 1.0708, "step": 4000 }, { "epoch": 0.5268003946070372, "grad_norm": 0.12693968415260315, "learning_rate": 0.00010809653788216936, "loss": 1.0732, "step": 4005 }, { "epoch": 0.5274580730023019, "grad_norm": 0.1281595081090927, "learning_rate": 0.0001078676555689861, "loss": 1.0015, "step": 4010 }, { "epoch": 0.5281157513975666, "grad_norm": 0.1419307291507721, "learning_rate": 0.00010763873177513677, "loss": 1.0486, "step": 4015 }, { "epoch": 0.5287734297928313, "grad_norm": 0.12419544160366058, "learning_rate": 0.00010740976770757698, "loss": 1.1406, "step": 4020 }, { "epoch": 0.529431108188096, "grad_norm": 0.13191036880016327, "learning_rate": 0.00010718076457347483, "loss": 1.1435, "step": 4025 }, { "epoch": 0.5300887865833608, "grad_norm": 0.12010519951581955, "learning_rate": 0.0001069517235802042, "loss": 1.0575, "step": 4030 }, { "epoch": 0.5307464649786254, "grad_norm": 0.12629194557666779, "learning_rate": 0.00010672264593533872, "loss": 1.0761, "step": 4035 }, { "epoch": 0.5314041433738902, "grad_norm": 0.1302013397216797, "learning_rate": 0.00010649353284664516, "loss": 1.0621, "step": 4040 }, { "epoch": 0.5320618217691548, "grad_norm": 0.13380949199199677, "learning_rate": 0.00010626438552207723, "loss": 1.0892, "step": 4045 }, { "epoch": 0.5327195001644196, "grad_norm": 0.13530650734901428, "learning_rate": 0.00010603520516976915, "loss": 1.1088, "step": 4050 }, { "epoch": 0.5333771785596844, "grad_norm": 0.12939399480819702, "learning_rate": 0.00010580599299802913, "loss": 1.092, "step": 4055 }, { "epoch": 0.534034856954949, "grad_norm": 0.12343386560678482, "learning_rate": 0.00010557675021533337, "loss": 1.1005, "step": 4060 }, { "epoch": 0.5346925353502138, "grad_norm": 0.1252499222755432, "learning_rate": 0.00010534747803031927, "loss": 1.0733, "step": 4065 }, { "epoch": 0.5353502137454784, "grad_norm": 0.1284775584936142, "learning_rate": 0.00010511817765177933, "loss": 1.0829, "step": 4070 }, { "epoch": 0.5360078921407432, "grad_norm": 0.1305878609418869, "learning_rate": 0.00010488885028865471, "loss": 1.057, "step": 4075 }, { "epoch": 0.5366655705360079, "grad_norm": 0.1279468685388565, "learning_rate": 0.0001046594971500288, "loss": 1.1057, "step": 4080 }, { "epoch": 0.5373232489312726, "grad_norm": 0.12850476801395416, "learning_rate": 0.00010443011944512087, "loss": 1.1265, "step": 4085 }, { "epoch": 0.5379809273265374, "grad_norm": 0.1240396574139595, "learning_rate": 0.00010420071838327978, "loss": 1.0793, "step": 4090 }, { "epoch": 0.538638605721802, "grad_norm": 0.13395895063877106, "learning_rate": 0.00010397129517397746, "loss": 1.0788, "step": 4095 }, { "epoch": 0.5392962841170668, "grad_norm": 0.15064816176891327, "learning_rate": 0.00010374185102680267, "loss": 1.0859, "step": 4100 }, { "epoch": 0.5399539625123315, "grad_norm": 0.1287085860967636, "learning_rate": 0.00010351238715145453, "loss": 1.1039, "step": 4105 }, { "epoch": 0.5406116409075962, "grad_norm": 0.12358924746513367, "learning_rate": 0.00010328290475773614, "loss": 1.0849, "step": 4110 }, { "epoch": 0.5412693193028609, "grad_norm": 0.11661688983440399, "learning_rate": 0.00010305340505554835, "loss": 1.0532, "step": 4115 }, { "epoch": 0.5419269976981256, "grad_norm": 0.12571509182453156, "learning_rate": 0.00010282388925488314, "loss": 1.1239, "step": 4120 }, { "epoch": 0.5425846760933903, "grad_norm": 0.12855681777000427, "learning_rate": 0.00010259435856581738, "loss": 1.1209, "step": 4125 }, { "epoch": 0.5432423544886551, "grad_norm": 0.14639022946357727, "learning_rate": 0.00010236481419850652, "loss": 1.1086, "step": 4130 }, { "epoch": 0.5439000328839197, "grad_norm": 0.14279168844223022, "learning_rate": 0.00010213525736317806, "loss": 1.125, "step": 4135 }, { "epoch": 0.5445577112791845, "grad_norm": 0.11954186856746674, "learning_rate": 0.00010190568927012528, "loss": 1.1666, "step": 4140 }, { "epoch": 0.5452153896744492, "grad_norm": 0.12374676764011383, "learning_rate": 0.0001016761111297007, "loss": 1.1627, "step": 4145 }, { "epoch": 0.5458730680697139, "grad_norm": 0.13391058146953583, "learning_rate": 0.00010144652415231002, "loss": 1.1057, "step": 4150 }, { "epoch": 0.5465307464649787, "grad_norm": 0.13291941583156586, "learning_rate": 0.00010121692954840529, "loss": 1.1473, "step": 4155 }, { "epoch": 0.5471884248602433, "grad_norm": 0.1344461441040039, "learning_rate": 0.00010098732852847894, "loss": 1.1237, "step": 4160 }, { "epoch": 0.5478461032555081, "grad_norm": 0.12248563021421432, "learning_rate": 0.00010075772230305721, "loss": 1.0535, "step": 4165 }, { "epoch": 0.5485037816507727, "grad_norm": 0.1296956092119217, "learning_rate": 0.00010052811208269365, "loss": 1.1294, "step": 4170 }, { "epoch": 0.5491614600460375, "grad_norm": 0.13205696642398834, "learning_rate": 0.00010029849907796311, "loss": 1.0943, "step": 4175 }, { "epoch": 0.5498191384413023, "grad_norm": 0.14146430790424347, "learning_rate": 0.00010006888449945488, "loss": 1.072, "step": 4180 }, { "epoch": 0.5504768168365669, "grad_norm": 0.18991486728191376, "learning_rate": 9.983926955776668e-05, "loss": 1.0586, "step": 4185 }, { "epoch": 0.5511344952318317, "grad_norm": 0.1339097023010254, "learning_rate": 9.960965546349811e-05, "loss": 1.1189, "step": 4190 }, { "epoch": 0.5517921736270963, "grad_norm": 0.1323249191045761, "learning_rate": 9.938004342724432e-05, "loss": 1.1165, "step": 4195 }, { "epoch": 0.5524498520223611, "grad_norm": 0.11826788634061813, "learning_rate": 9.915043465958957e-05, "loss": 1.0306, "step": 4200 }, { "epoch": 0.5531075304176257, "grad_norm": 0.12944374978542328, "learning_rate": 9.892083037110094e-05, "loss": 1.0816, "step": 4205 }, { "epoch": 0.5537652088128905, "grad_norm": 0.1280798614025116, "learning_rate": 9.869123177232186e-05, "loss": 1.1049, "step": 4210 }, { "epoch": 0.5544228872081552, "grad_norm": 0.1301519274711609, "learning_rate": 9.846164007376575e-05, "loss": 1.0996, "step": 4215 }, { "epoch": 0.5550805656034199, "grad_norm": 0.12969233095645905, "learning_rate": 9.82320564859097e-05, "loss": 1.0966, "step": 4220 }, { "epoch": 0.5557382439986847, "grad_norm": 0.12612180411815643, "learning_rate": 9.800248221918804e-05, "loss": 1.123, "step": 4225 }, { "epoch": 0.5563959223939493, "grad_norm": 0.13390159606933594, "learning_rate": 9.77729184839858e-05, "loss": 1.0792, "step": 4230 }, { "epoch": 0.5570536007892141, "grad_norm": 0.12435296177864075, "learning_rate": 9.754336649063275e-05, "loss": 1.1266, "step": 4235 }, { "epoch": 0.5577112791844788, "grad_norm": 0.1283431351184845, "learning_rate": 9.731382744939655e-05, "loss": 1.1128, "step": 4240 }, { "epoch": 0.5583689575797435, "grad_norm": 0.11855222284793854, "learning_rate": 9.708430257047659e-05, "loss": 1.0759, "step": 4245 }, { "epoch": 0.5590266359750082, "grad_norm": 0.12805521488189697, "learning_rate": 9.685479306399774e-05, "loss": 1.0544, "step": 4250 }, { "epoch": 0.5596843143702729, "grad_norm": 0.12418148666620255, "learning_rate": 9.662530014000363e-05, "loss": 1.0999, "step": 4255 }, { "epoch": 0.5603419927655376, "grad_norm": 0.1269996017217636, "learning_rate": 9.639582500845058e-05, "loss": 1.1174, "step": 4260 }, { "epoch": 0.5609996711608024, "grad_norm": 0.12534643709659576, "learning_rate": 9.616636887920108e-05, "loss": 1.1123, "step": 4265 }, { "epoch": 0.561657349556067, "grad_norm": 0.12626636028289795, "learning_rate": 9.593693296201744e-05, "loss": 1.1355, "step": 4270 }, { "epoch": 0.5623150279513318, "grad_norm": 0.1723729372024536, "learning_rate": 9.570751846655533e-05, "loss": 1.0674, "step": 4275 }, { "epoch": 0.5629727063465965, "grad_norm": 0.13262727856636047, "learning_rate": 9.547812660235764e-05, "loss": 1.0927, "step": 4280 }, { "epoch": 0.5636303847418612, "grad_norm": 0.14280182123184204, "learning_rate": 9.524875857884776e-05, "loss": 1.0847, "step": 4285 }, { "epoch": 0.564288063137126, "grad_norm": 0.12689192593097687, "learning_rate": 9.501941560532349e-05, "loss": 1.0726, "step": 4290 }, { "epoch": 0.5649457415323906, "grad_norm": 0.1304570436477661, "learning_rate": 9.479009889095057e-05, "loss": 1.0815, "step": 4295 }, { "epoch": 0.5656034199276554, "grad_norm": 0.13616132736206055, "learning_rate": 9.456080964475624e-05, "loss": 1.1187, "step": 4300 }, { "epoch": 0.56626109832292, "grad_norm": 0.13223066926002502, "learning_rate": 9.433154907562289e-05, "loss": 1.0651, "step": 4305 }, { "epoch": 0.5669187767181848, "grad_norm": 0.1338409036397934, "learning_rate": 9.410231839228185e-05, "loss": 1.053, "step": 4310 }, { "epoch": 0.5675764551134496, "grad_norm": 0.1230703741312027, "learning_rate": 9.387311880330675e-05, "loss": 1.0388, "step": 4315 }, { "epoch": 0.5682341335087142, "grad_norm": 0.12398272007703781, "learning_rate": 9.364395151710732e-05, "loss": 1.0579, "step": 4320 }, { "epoch": 0.568891811903979, "grad_norm": 0.12621094286441803, "learning_rate": 9.3414817741923e-05, "loss": 1.0732, "step": 4325 }, { "epoch": 0.5695494902992436, "grad_norm": 0.13278800249099731, "learning_rate": 9.318571868581656e-05, "loss": 1.1154, "step": 4330 }, { "epoch": 0.5702071686945084, "grad_norm": 0.13054777681827545, "learning_rate": 9.295665555666769e-05, "loss": 1.0566, "step": 4335 }, { "epoch": 0.5708648470897731, "grad_norm": 0.12582288682460785, "learning_rate": 9.272762956216664e-05, "loss": 1.0486, "step": 4340 }, { "epoch": 0.5715225254850378, "grad_norm": 0.13897186517715454, "learning_rate": 9.249864190980794e-05, "loss": 1.0838, "step": 4345 }, { "epoch": 0.5721802038803026, "grad_norm": 0.12586842477321625, "learning_rate": 9.226969380688395e-05, "loss": 1.0854, "step": 4350 }, { "epoch": 0.5728378822755672, "grad_norm": 0.13024652004241943, "learning_rate": 9.204078646047843e-05, "loss": 1.0965, "step": 4355 }, { "epoch": 0.573495560670832, "grad_norm": 0.1406366527080536, "learning_rate": 9.181192107746043e-05, "loss": 1.0248, "step": 4360 }, { "epoch": 0.5741532390660967, "grad_norm": 0.12962545454502106, "learning_rate": 9.158309886447757e-05, "loss": 1.1433, "step": 4365 }, { "epoch": 0.5748109174613614, "grad_norm": 0.12938588857650757, "learning_rate": 9.135432102794994e-05, "loss": 1.1608, "step": 4370 }, { "epoch": 0.5754685958566261, "grad_norm": 0.13744384050369263, "learning_rate": 9.112558877406377e-05, "loss": 1.0718, "step": 4375 }, { "epoch": 0.5761262742518908, "grad_norm": 0.1436116099357605, "learning_rate": 9.089690330876479e-05, "loss": 1.1672, "step": 4380 }, { "epoch": 0.5767839526471555, "grad_norm": 0.12727364897727966, "learning_rate": 9.066826583775211e-05, "loss": 1.0975, "step": 4385 }, { "epoch": 0.5774416310424203, "grad_norm": 0.1289556473493576, "learning_rate": 9.043967756647188e-05, "loss": 1.0613, "step": 4390 }, { "epoch": 0.578099309437685, "grad_norm": 0.11940281838178635, "learning_rate": 9.021113970011074e-05, "loss": 1.1276, "step": 4395 }, { "epoch": 0.5787569878329497, "grad_norm": 0.1333748996257782, "learning_rate": 8.998265344358961e-05, "loss": 1.0706, "step": 4400 }, { "epoch": 0.5794146662282144, "grad_norm": 0.13023719191551208, "learning_rate": 8.975422000155737e-05, "loss": 1.048, "step": 4405 }, { "epoch": 0.5800723446234791, "grad_norm": 0.11700793355703354, "learning_rate": 8.95258405783844e-05, "loss": 1.1044, "step": 4410 }, { "epoch": 0.5807300230187439, "grad_norm": 0.12123888731002808, "learning_rate": 8.929751637815623e-05, "loss": 1.1481, "step": 4415 }, { "epoch": 0.5813877014140085, "grad_norm": 0.13541606068611145, "learning_rate": 8.906924860466733e-05, "loss": 1.0368, "step": 4420 }, { "epoch": 0.5820453798092733, "grad_norm": 0.12696059048175812, "learning_rate": 8.884103846141462e-05, "loss": 1.0893, "step": 4425 }, { "epoch": 0.5827030582045379, "grad_norm": 0.1310996115207672, "learning_rate": 8.861288715159112e-05, "loss": 1.0896, "step": 4430 }, { "epoch": 0.5833607365998027, "grad_norm": 0.1255674809217453, "learning_rate": 8.838479587807984e-05, "loss": 1.0606, "step": 4435 }, { "epoch": 0.5840184149950675, "grad_norm": 0.1342833936214447, "learning_rate": 8.815676584344704e-05, "loss": 1.1027, "step": 4440 }, { "epoch": 0.5846760933903321, "grad_norm": 0.1314217895269394, "learning_rate": 8.792879824993626e-05, "loss": 1.0953, "step": 4445 }, { "epoch": 0.5853337717855969, "grad_norm": 0.1255432367324829, "learning_rate": 8.770089429946176e-05, "loss": 1.0813, "step": 4450 }, { "epoch": 0.5859914501808615, "grad_norm": 0.1117679700255394, "learning_rate": 8.747305519360231e-05, "loss": 1.091, "step": 4455 }, { "epoch": 0.5866491285761263, "grad_norm": 0.13899491727352142, "learning_rate": 8.724528213359476e-05, "loss": 1.1019, "step": 4460 }, { "epoch": 0.587306806971391, "grad_norm": 0.12620800733566284, "learning_rate": 8.701757632032775e-05, "loss": 1.1046, "step": 4465 }, { "epoch": 0.5879644853666557, "grad_norm": 0.12983182072639465, "learning_rate": 8.678993895433538e-05, "loss": 1.0714, "step": 4470 }, { "epoch": 0.5886221637619204, "grad_norm": 0.12854884564876556, "learning_rate": 8.65623712357909e-05, "loss": 1.0514, "step": 4475 }, { "epoch": 0.5892798421571851, "grad_norm": 0.12460478395223618, "learning_rate": 8.633487436450027e-05, "loss": 1.0697, "step": 4480 }, { "epoch": 0.5899375205524499, "grad_norm": 0.1330973207950592, "learning_rate": 8.610744953989608e-05, "loss": 1.0535, "step": 4485 }, { "epoch": 0.5905951989477146, "grad_norm": 0.12497002631425858, "learning_rate": 8.58800979610309e-05, "loss": 1.0914, "step": 4490 }, { "epoch": 0.5912528773429793, "grad_norm": 0.12324236333370209, "learning_rate": 8.56528208265712e-05, "loss": 1.0681, "step": 4495 }, { "epoch": 0.591910555738244, "grad_norm": 0.12551970779895782, "learning_rate": 8.5425619334791e-05, "loss": 1.042, "step": 4500 }, { "epoch": 0.5925682341335087, "grad_norm": 0.13206686079502106, "learning_rate": 8.519849468356547e-05, "loss": 1.123, "step": 4505 }, { "epoch": 0.5932259125287734, "grad_norm": 0.14632025361061096, "learning_rate": 8.497144807036457e-05, "loss": 1.1433, "step": 4510 }, { "epoch": 0.5938835909240382, "grad_norm": 0.11878155171871185, "learning_rate": 8.474448069224698e-05, "loss": 1.0603, "step": 4515 }, { "epoch": 0.5945412693193028, "grad_norm": 0.1202649474143982, "learning_rate": 8.451759374585351e-05, "loss": 1.0799, "step": 4520 }, { "epoch": 0.5951989477145676, "grad_norm": 0.12806564569473267, "learning_rate": 8.429078842740093e-05, "loss": 1.0979, "step": 4525 }, { "epoch": 0.5958566261098323, "grad_norm": 0.13916490972042084, "learning_rate": 8.406406593267571e-05, "loss": 1.0651, "step": 4530 }, { "epoch": 0.596514304505097, "grad_norm": 0.1473415642976761, "learning_rate": 8.383742745702757e-05, "loss": 1.1485, "step": 4535 }, { "epoch": 0.5971719829003618, "grad_norm": 0.12505421042442322, "learning_rate": 8.361087419536327e-05, "loss": 1.0219, "step": 4540 }, { "epoch": 0.5978296612956264, "grad_norm": 0.1269635111093521, "learning_rate": 8.338440734214032e-05, "loss": 1.1456, "step": 4545 }, { "epoch": 0.5984873396908912, "grad_norm": 0.1218641921877861, "learning_rate": 8.315802809136068e-05, "loss": 1.1302, "step": 4550 }, { "epoch": 0.5991450180861558, "grad_norm": 0.12357281148433685, "learning_rate": 8.293173763656433e-05, "loss": 1.0469, "step": 4555 }, { "epoch": 0.5998026964814206, "grad_norm": 0.11598797887563705, "learning_rate": 8.270553717082327e-05, "loss": 1.12, "step": 4560 }, { "epoch": 0.6004603748766854, "grad_norm": 0.12966689467430115, "learning_rate": 8.24794278867349e-05, "loss": 1.1177, "step": 4565 }, { "epoch": 0.60111805327195, "grad_norm": 0.1571948081254959, "learning_rate": 8.225341097641592e-05, "loss": 1.107, "step": 4570 }, { "epoch": 0.6017757316672148, "grad_norm": 0.15326593816280365, "learning_rate": 8.202748763149603e-05, "loss": 1.0927, "step": 4575 }, { "epoch": 0.6024334100624794, "grad_norm": 0.13218168914318085, "learning_rate": 8.180165904311164e-05, "loss": 1.12, "step": 4580 }, { "epoch": 0.6030910884577442, "grad_norm": 0.12504245340824127, "learning_rate": 8.157592640189955e-05, "loss": 1.0672, "step": 4585 }, { "epoch": 0.6037487668530089, "grad_norm": 0.12856173515319824, "learning_rate": 8.135029089799067e-05, "loss": 1.1002, "step": 4590 }, { "epoch": 0.6044064452482736, "grad_norm": 0.1248098760843277, "learning_rate": 8.112475372100384e-05, "loss": 1.0543, "step": 4595 }, { "epoch": 0.6050641236435383, "grad_norm": 0.145641028881073, "learning_rate": 8.089931606003947e-05, "loss": 1.0766, "step": 4600 }, { "epoch": 0.605721802038803, "grad_norm": 0.1349770575761795, "learning_rate": 8.067397910367314e-05, "loss": 1.092, "step": 4605 }, { "epoch": 0.6063794804340678, "grad_norm": 0.12177833169698715, "learning_rate": 8.044874403994982e-05, "loss": 1.1211, "step": 4610 }, { "epoch": 0.6070371588293324, "grad_norm": 0.12176353484392166, "learning_rate": 8.022361205637692e-05, "loss": 1.0804, "step": 4615 }, { "epoch": 0.6076948372245972, "grad_norm": 0.1461487114429474, "learning_rate": 7.999858433991856e-05, "loss": 1.0988, "step": 4620 }, { "epoch": 0.6083525156198619, "grad_norm": 0.12258977442979813, "learning_rate": 7.977366207698915e-05, "loss": 1.0766, "step": 4625 }, { "epoch": 0.6090101940151266, "grad_norm": 0.13349422812461853, "learning_rate": 7.954884645344704e-05, "loss": 1.0994, "step": 4630 }, { "epoch": 0.6096678724103913, "grad_norm": 0.13774828612804413, "learning_rate": 7.932413865458834e-05, "loss": 1.0711, "step": 4635 }, { "epoch": 0.610325550805656, "grad_norm": 0.12503942847251892, "learning_rate": 7.909953986514079e-05, "loss": 1.1116, "step": 4640 }, { "epoch": 0.6109832292009207, "grad_norm": 0.12939752638339996, "learning_rate": 7.887505126925724e-05, "loss": 1.0989, "step": 4645 }, { "epoch": 0.6116409075961855, "grad_norm": 0.12965986132621765, "learning_rate": 7.865067405050968e-05, "loss": 1.0863, "step": 4650 }, { "epoch": 0.6122985859914502, "grad_norm": 0.13008306920528412, "learning_rate": 7.842640939188285e-05, "loss": 1.1533, "step": 4655 }, { "epoch": 0.6129562643867149, "grad_norm": 0.15885281562805176, "learning_rate": 7.820225847576806e-05, "loss": 1.0734, "step": 4660 }, { "epoch": 0.6136139427819796, "grad_norm": 0.13538002967834473, "learning_rate": 7.797822248395685e-05, "loss": 1.0641, "step": 4665 }, { "epoch": 0.6142716211772443, "grad_norm": 0.13269497454166412, "learning_rate": 7.775430259763499e-05, "loss": 1.069, "step": 4670 }, { "epoch": 0.6149292995725091, "grad_norm": 0.12757182121276855, "learning_rate": 7.753049999737599e-05, "loss": 1.0667, "step": 4675 }, { "epoch": 0.6155869779677737, "grad_norm": 0.13657952845096588, "learning_rate": 7.730681586313498e-05, "loss": 1.0891, "step": 4680 }, { "epoch": 0.6162446563630385, "grad_norm": 0.1249275952577591, "learning_rate": 7.708325137424266e-05, "loss": 1.1633, "step": 4685 }, { "epoch": 0.6169023347583031, "grad_norm": 0.12819121778011322, "learning_rate": 7.685980770939872e-05, "loss": 1.1276, "step": 4690 }, { "epoch": 0.6175600131535679, "grad_norm": 0.1296698898077011, "learning_rate": 7.663648604666593e-05, "loss": 1.0588, "step": 4695 }, { "epoch": 0.6182176915488327, "grad_norm": 0.13147692382335663, "learning_rate": 7.641328756346386e-05, "loss": 1.0689, "step": 4700 }, { "epoch": 0.6188753699440973, "grad_norm": 0.1483958661556244, "learning_rate": 7.61902134365626e-05, "loss": 1.1611, "step": 4705 }, { "epoch": 0.6195330483393621, "grad_norm": 0.13763529062271118, "learning_rate": 7.596726484207656e-05, "loss": 1.1172, "step": 4710 }, { "epoch": 0.6201907267346267, "grad_norm": 0.1272275596857071, "learning_rate": 7.574444295545832e-05, "loss": 1.1087, "step": 4715 }, { "epoch": 0.6208484051298915, "grad_norm": 0.12362238019704819, "learning_rate": 7.552174895149252e-05, "loss": 1.0924, "step": 4720 }, { "epoch": 0.6215060835251562, "grad_norm": 0.11790554970502853, "learning_rate": 7.529918400428945e-05, "loss": 1.1028, "step": 4725 }, { "epoch": 0.6221637619204209, "grad_norm": 0.12725265324115753, "learning_rate": 7.507674928727894e-05, "loss": 1.0633, "step": 4730 }, { "epoch": 0.6228214403156856, "grad_norm": 0.12149564176797867, "learning_rate": 7.485444597320437e-05, "loss": 1.0808, "step": 4735 }, { "epoch": 0.6234791187109503, "grad_norm": 0.1267111599445343, "learning_rate": 7.463227523411618e-05, "loss": 1.0996, "step": 4740 }, { "epoch": 0.6241367971062151, "grad_norm": 0.13148045539855957, "learning_rate": 7.441023824136585e-05, "loss": 1.0556, "step": 4745 }, { "epoch": 0.6247944755014798, "grad_norm": 0.13412028551101685, "learning_rate": 7.41883361655998e-05, "loss": 1.0376, "step": 4750 }, { "epoch": 0.6254521538967445, "grad_norm": 0.12124128639698029, "learning_rate": 7.396657017675304e-05, "loss": 1.0577, "step": 4755 }, { "epoch": 0.6261098322920092, "grad_norm": 0.13998383283615112, "learning_rate": 7.374494144404309e-05, "loss": 1.1176, "step": 4760 }, { "epoch": 0.6267675106872739, "grad_norm": 0.12529389560222626, "learning_rate": 7.35234511359639e-05, "loss": 1.0413, "step": 4765 }, { "epoch": 0.6274251890825386, "grad_norm": 0.1399412751197815, "learning_rate": 7.33021004202795e-05, "loss": 1.0656, "step": 4770 }, { "epoch": 0.6280828674778034, "grad_norm": 0.1321883499622345, "learning_rate": 7.308089046401798e-05, "loss": 1.1419, "step": 4775 }, { "epoch": 0.628740545873068, "grad_norm": 0.12116825580596924, "learning_rate": 7.285982243346534e-05, "loss": 1.0302, "step": 4780 }, { "epoch": 0.6293982242683328, "grad_norm": 0.11946643888950348, "learning_rate": 7.263889749415926e-05, "loss": 1.0645, "step": 4785 }, { "epoch": 0.6300559026635975, "grad_norm": 0.12613585591316223, "learning_rate": 7.241811681088303e-05, "loss": 1.0661, "step": 4790 }, { "epoch": 0.6307135810588622, "grad_norm": 0.1241552084684372, "learning_rate": 7.219748154765935e-05, "loss": 1.1013, "step": 4795 }, { "epoch": 0.631371259454127, "grad_norm": 0.12678402662277222, "learning_rate": 7.197699286774428e-05, "loss": 1.0444, "step": 4800 }, { "epoch": 0.6320289378493916, "grad_norm": 0.13133499026298523, "learning_rate": 7.175665193362092e-05, "loss": 1.0908, "step": 4805 }, { "epoch": 0.6326866162446564, "grad_norm": 0.13088297843933105, "learning_rate": 7.15364599069936e-05, "loss": 1.0632, "step": 4810 }, { "epoch": 0.633344294639921, "grad_norm": 0.12425372004508972, "learning_rate": 7.131641794878138e-05, "loss": 1.0986, "step": 4815 }, { "epoch": 0.6340019730351858, "grad_norm": 0.12345828115940094, "learning_rate": 7.10965272191122e-05, "loss": 1.0686, "step": 4820 }, { "epoch": 0.6346596514304506, "grad_norm": 0.12331658601760864, "learning_rate": 7.087678887731672e-05, "loss": 1.0724, "step": 4825 }, { "epoch": 0.6353173298257152, "grad_norm": 0.11671179533004761, "learning_rate": 7.065720408192207e-05, "loss": 1.0843, "step": 4830 }, { "epoch": 0.63597500822098, "grad_norm": 0.12846963107585907, "learning_rate": 7.04377739906459e-05, "loss": 1.0987, "step": 4835 }, { "epoch": 0.6366326866162446, "grad_norm": 0.12462946027517319, "learning_rate": 7.021849976039016e-05, "loss": 1.0592, "step": 4840 }, { "epoch": 0.6372903650115094, "grad_norm": 0.12981706857681274, "learning_rate": 6.999938254723515e-05, "loss": 1.1017, "step": 4845 }, { "epoch": 0.6379480434067741, "grad_norm": 0.12490827590227127, "learning_rate": 6.978042350643324e-05, "loss": 1.0892, "step": 4850 }, { "epoch": 0.6386057218020388, "grad_norm": 0.1285996288061142, "learning_rate": 6.956162379240283e-05, "loss": 1.0583, "step": 4855 }, { "epoch": 0.6392634001973035, "grad_norm": 0.13999207317829132, "learning_rate": 6.93429845587225e-05, "loss": 1.1173, "step": 4860 }, { "epoch": 0.6399210785925682, "grad_norm": 0.15554587543010712, "learning_rate": 6.912450695812448e-05, "loss": 1.0802, "step": 4865 }, { "epoch": 0.640578756987833, "grad_norm": 0.14130368828773499, "learning_rate": 6.890619214248897e-05, "loss": 1.1111, "step": 4870 }, { "epoch": 0.6412364353830977, "grad_norm": 0.12835191190242767, "learning_rate": 6.868804126283789e-05, "loss": 1.1291, "step": 4875 }, { "epoch": 0.6418941137783624, "grad_norm": 0.12079683691263199, "learning_rate": 6.847005546932884e-05, "loss": 1.0617, "step": 4880 }, { "epoch": 0.6425517921736271, "grad_norm": 0.11975207179784775, "learning_rate": 6.825223591124897e-05, "loss": 1.0673, "step": 4885 }, { "epoch": 0.6432094705688918, "grad_norm": 0.12441540509462357, "learning_rate": 6.803458373700913e-05, "loss": 1.064, "step": 4890 }, { "epoch": 0.6438671489641565, "grad_norm": 0.12355376034975052, "learning_rate": 6.781710009413756e-05, "loss": 1.0684, "step": 4895 }, { "epoch": 0.6445248273594213, "grad_norm": 0.14830844104290009, "learning_rate": 6.759978612927393e-05, "loss": 1.0781, "step": 4900 }, { "epoch": 0.6451825057546859, "grad_norm": 0.12683749198913574, "learning_rate": 6.738264298816343e-05, "loss": 1.0452, "step": 4905 }, { "epoch": 0.6458401841499507, "grad_norm": 0.14045307040214539, "learning_rate": 6.716567181565057e-05, "loss": 1.0759, "step": 4910 }, { "epoch": 0.6464978625452154, "grad_norm": 0.13314536213874817, "learning_rate": 6.694887375567302e-05, "loss": 1.127, "step": 4915 }, { "epoch": 0.6471555409404801, "grad_norm": 0.12198376655578613, "learning_rate": 6.673224995125606e-05, "loss": 1.0772, "step": 4920 }, { "epoch": 0.6478132193357449, "grad_norm": 0.13734495639801025, "learning_rate": 6.651580154450606e-05, "loss": 1.1213, "step": 4925 }, { "epoch": 0.6484708977310095, "grad_norm": 0.12702366709709167, "learning_rate": 6.629952967660452e-05, "loss": 1.1223, "step": 4930 }, { "epoch": 0.6491285761262743, "grad_norm": 0.1282837688922882, "learning_rate": 6.608343548780249e-05, "loss": 1.0744, "step": 4935 }, { "epoch": 0.6497862545215389, "grad_norm": 0.1293090581893921, "learning_rate": 6.586752011741397e-05, "loss": 1.0743, "step": 4940 }, { "epoch": 0.6504439329168037, "grad_norm": 0.12564392387866974, "learning_rate": 6.565178470381027e-05, "loss": 1.0705, "step": 4945 }, { "epoch": 0.6511016113120685, "grad_norm": 0.13575321435928345, "learning_rate": 6.543623038441395e-05, "loss": 1.0452, "step": 4950 }, { "epoch": 0.6517592897073331, "grad_norm": 0.1340646892786026, "learning_rate": 6.522085829569273e-05, "loss": 1.1179, "step": 4955 }, { "epoch": 0.6524169681025979, "grad_norm": 0.14496612548828125, "learning_rate": 6.500566957315359e-05, "loss": 1.1424, "step": 4960 }, { "epoch": 0.6530746464978625, "grad_norm": 0.11920231580734253, "learning_rate": 6.479066535133668e-05, "loss": 1.0914, "step": 4965 }, { "epoch": 0.6537323248931273, "grad_norm": 0.1298276036977768, "learning_rate": 6.457584676380954e-05, "loss": 1.1127, "step": 4970 }, { "epoch": 0.654390003288392, "grad_norm": 0.1245148628950119, "learning_rate": 6.436121494316087e-05, "loss": 1.1606, "step": 4975 }, { "epoch": 0.6550476816836567, "grad_norm": 0.15223371982574463, "learning_rate": 6.414677102099465e-05, "loss": 1.1091, "step": 4980 }, { "epoch": 0.6557053600789214, "grad_norm": 0.12049220502376556, "learning_rate": 6.393251612792441e-05, "loss": 1.0651, "step": 4985 }, { "epoch": 0.6563630384741861, "grad_norm": 0.12289486825466156, "learning_rate": 6.371845139356681e-05, "loss": 1.1168, "step": 4990 }, { "epoch": 0.6570207168694508, "grad_norm": 0.12030717730522156, "learning_rate": 6.350457794653607e-05, "loss": 1.1171, "step": 4995 }, { "epoch": 0.6576783952647156, "grad_norm": 0.13632145524024963, "learning_rate": 6.329089691443787e-05, "loss": 1.0806, "step": 5000 }, { "epoch": 0.6583360736599803, "grad_norm": 0.12515822052955627, "learning_rate": 6.307740942386344e-05, "loss": 1.0289, "step": 5005 }, { "epoch": 0.658993752055245, "grad_norm": 0.15778982639312744, "learning_rate": 6.286411660038351e-05, "loss": 1.0915, "step": 5010 }, { "epoch": 0.6596514304505097, "grad_norm": 0.1267745941877365, "learning_rate": 6.265101956854261e-05, "loss": 1.1028, "step": 5015 }, { "epoch": 0.6603091088457744, "grad_norm": 0.12595635652542114, "learning_rate": 6.243811945185283e-05, "loss": 1.0829, "step": 5020 }, { "epoch": 0.6609667872410391, "grad_norm": 0.12369897216558456, "learning_rate": 6.222541737278818e-05, "loss": 1.1032, "step": 5025 }, { "epoch": 0.6616244656363038, "grad_norm": 0.1489342600107193, "learning_rate": 6.201291445277854e-05, "loss": 1.0493, "step": 5030 }, { "epoch": 0.6622821440315686, "grad_norm": 0.1278959959745407, "learning_rate": 6.180061181220378e-05, "loss": 1.0922, "step": 5035 }, { "epoch": 0.6629398224268332, "grad_norm": 0.12223947048187256, "learning_rate": 6.158851057038768e-05, "loss": 1.0892, "step": 5040 }, { "epoch": 0.663597500822098, "grad_norm": 0.14581537246704102, "learning_rate": 6.137661184559242e-05, "loss": 1.0391, "step": 5045 }, { "epoch": 0.6642551792173627, "grad_norm": 0.12690412998199463, "learning_rate": 6.116491675501234e-05, "loss": 1.0926, "step": 5050 }, { "epoch": 0.6649128576126274, "grad_norm": 0.12724992632865906, "learning_rate": 6.095342641476807e-05, "loss": 1.0755, "step": 5055 }, { "epoch": 0.6655705360078922, "grad_norm": 0.12413624674081802, "learning_rate": 6.074214193990091e-05, "loss": 1.0618, "step": 5060 }, { "epoch": 0.6662282144031568, "grad_norm": 0.13771510124206543, "learning_rate": 6.0531064444366646e-05, "loss": 1.1238, "step": 5065 }, { "epoch": 0.6668858927984216, "grad_norm": 0.12114086002111435, "learning_rate": 6.0320195041029814e-05, "loss": 1.0835, "step": 5070 }, { "epoch": 0.6675435711936862, "grad_norm": 0.12329627573490143, "learning_rate": 6.010953484165789e-05, "loss": 1.0896, "step": 5075 }, { "epoch": 0.668201249588951, "grad_norm": 0.13847826421260834, "learning_rate": 5.989908495691534e-05, "loss": 1.0873, "step": 5080 }, { "epoch": 0.6688589279842158, "grad_norm": 0.1346912533044815, "learning_rate": 5.968884649635774e-05, "loss": 1.1177, "step": 5085 }, { "epoch": 0.6695166063794804, "grad_norm": 0.12257359176874161, "learning_rate": 5.947882056842606e-05, "loss": 1.0963, "step": 5090 }, { "epoch": 0.6701742847747452, "grad_norm": 0.1313018649816513, "learning_rate": 5.926900828044064e-05, "loss": 1.106, "step": 5095 }, { "epoch": 0.6708319631700098, "grad_norm": 0.12532012164592743, "learning_rate": 5.905941073859553e-05, "loss": 1.0516, "step": 5100 }, { "epoch": 0.6714896415652746, "grad_norm": 0.12844298779964447, "learning_rate": 5.885002904795246e-05, "loss": 1.0915, "step": 5105 }, { "epoch": 0.6721473199605393, "grad_norm": 0.14367496967315674, "learning_rate": 5.86408643124353e-05, "loss": 1.0592, "step": 5110 }, { "epoch": 0.672804998355804, "grad_norm": 0.13090352714061737, "learning_rate": 5.8431917634823884e-05, "loss": 1.0982, "step": 5115 }, { "epoch": 0.6734626767510687, "grad_norm": 0.13327063620090485, "learning_rate": 5.822319011674846e-05, "loss": 1.086, "step": 5120 }, { "epoch": 0.6741203551463334, "grad_norm": 0.13549299538135529, "learning_rate": 5.801468285868387e-05, "loss": 1.0751, "step": 5125 }, { "epoch": 0.6747780335415982, "grad_norm": 0.16332031786441803, "learning_rate": 5.780639695994361e-05, "loss": 1.0884, "step": 5130 }, { "epoch": 0.6754357119368629, "grad_norm": 0.12516476213932037, "learning_rate": 5.7598333518673975e-05, "loss": 1.0595, "step": 5135 }, { "epoch": 0.6760933903321276, "grad_norm": 0.1265784204006195, "learning_rate": 5.739049363184865e-05, "loss": 1.1204, "step": 5140 }, { "epoch": 0.6767510687273923, "grad_norm": 0.13382838666439056, "learning_rate": 5.718287839526253e-05, "loss": 1.1167, "step": 5145 }, { "epoch": 0.677408747122657, "grad_norm": 0.12388251721858978, "learning_rate": 5.6975488903526065e-05, "loss": 1.0721, "step": 5150 }, { "epoch": 0.6780664255179217, "grad_norm": 0.12773160636425018, "learning_rate": 5.676832625005957e-05, "loss": 1.1316, "step": 5155 }, { "epoch": 0.6787241039131865, "grad_norm": 0.12280789762735367, "learning_rate": 5.656139152708737e-05, "loss": 1.0869, "step": 5160 }, { "epoch": 0.6793817823084511, "grad_norm": 0.1364842653274536, "learning_rate": 5.635468582563204e-05, "loss": 1.1223, "step": 5165 }, { "epoch": 0.6800394607037159, "grad_norm": 0.12486580014228821, "learning_rate": 5.61482102355088e-05, "loss": 1.0986, "step": 5170 }, { "epoch": 0.6806971390989806, "grad_norm": 0.12128328531980515, "learning_rate": 5.5941965845319554e-05, "loss": 1.055, "step": 5175 }, { "epoch": 0.6813548174942453, "grad_norm": 0.13697503507137299, "learning_rate": 5.573595374244724e-05, "loss": 1.1399, "step": 5180 }, { "epoch": 0.6820124958895101, "grad_norm": 0.1303701400756836, "learning_rate": 5.553017501305018e-05, "loss": 1.0574, "step": 5185 }, { "epoch": 0.6826701742847747, "grad_norm": 0.1283029168844223, "learning_rate": 5.532463074205621e-05, "loss": 1.0702, "step": 5190 }, { "epoch": 0.6833278526800395, "grad_norm": 0.15387962758541107, "learning_rate": 5.5119322013157016e-05, "loss": 1.1244, "step": 5195 }, { "epoch": 0.6839855310753041, "grad_norm": 0.14025266468524933, "learning_rate": 5.4914249908802565e-05, "loss": 1.0293, "step": 5200 }, { "epoch": 0.6846432094705689, "grad_norm": 0.12347293645143509, "learning_rate": 5.4709415510195194e-05, "loss": 1.1424, "step": 5205 }, { "epoch": 0.6853008878658337, "grad_norm": 0.12594734132289886, "learning_rate": 5.450481989728383e-05, "loss": 1.0843, "step": 5210 }, { "epoch": 0.6859585662610983, "grad_norm": 0.1268152892589569, "learning_rate": 5.430046414875873e-05, "loss": 1.0694, "step": 5215 }, { "epoch": 0.6866162446563631, "grad_norm": 0.12298762798309326, "learning_rate": 5.409634934204531e-05, "loss": 1.0908, "step": 5220 }, { "epoch": 0.6872739230516277, "grad_norm": 0.11927527189254761, "learning_rate": 5.389247655329874e-05, "loss": 1.0314, "step": 5225 }, { "epoch": 0.6879316014468925, "grad_norm": 0.14134618639945984, "learning_rate": 5.3688846857398204e-05, "loss": 1.0438, "step": 5230 }, { "epoch": 0.6885892798421572, "grad_norm": 0.12124871462583542, "learning_rate": 5.348546132794118e-05, "loss": 1.1111, "step": 5235 }, { "epoch": 0.6892469582374219, "grad_norm": 0.12315499782562256, "learning_rate": 5.328232103723787e-05, "loss": 1.1021, "step": 5240 }, { "epoch": 0.6899046366326866, "grad_norm": 0.12770317494869232, "learning_rate": 5.3079427056305464e-05, "loss": 1.0787, "step": 5245 }, { "epoch": 0.6905623150279513, "grad_norm": 0.12022534757852554, "learning_rate": 5.287678045486263e-05, "loss": 1.0158, "step": 5250 }, { "epoch": 0.691219993423216, "grad_norm": 0.13245612382888794, "learning_rate": 5.2674382301323686e-05, "loss": 1.077, "step": 5255 }, { "epoch": 0.6918776718184808, "grad_norm": 0.1252785176038742, "learning_rate": 5.2472233662792994e-05, "loss": 1.0942, "step": 5260 }, { "epoch": 0.6925353502137455, "grad_norm": 0.13478106260299683, "learning_rate": 5.2270335605059564e-05, "loss": 1.0932, "step": 5265 }, { "epoch": 0.6931930286090102, "grad_norm": 0.1371147334575653, "learning_rate": 5.206868919259116e-05, "loss": 1.1185, "step": 5270 }, { "epoch": 0.6938507070042749, "grad_norm": 0.13604164123535156, "learning_rate": 5.1867295488528824e-05, "loss": 1.066, "step": 5275 }, { "epoch": 0.6945083853995396, "grad_norm": 0.11950071901082993, "learning_rate": 5.166615555468122e-05, "loss": 1.1535, "step": 5280 }, { "epoch": 0.6951660637948044, "grad_norm": 0.1293693333864212, "learning_rate": 5.146527045151908e-05, "loss": 1.1291, "step": 5285 }, { "epoch": 0.695823742190069, "grad_norm": 0.11650462448596954, "learning_rate": 5.126464123816957e-05, "loss": 1.118, "step": 5290 }, { "epoch": 0.6964814205853338, "grad_norm": 0.12421900033950806, "learning_rate": 5.106426897241079e-05, "loss": 1.1269, "step": 5295 }, { "epoch": 0.6971390989805984, "grad_norm": 0.12671126425266266, "learning_rate": 5.086415471066608e-05, "loss": 1.0778, "step": 5300 }, { "epoch": 0.6977967773758632, "grad_norm": 0.11943601071834564, "learning_rate": 5.066429950799849e-05, "loss": 1.0613, "step": 5305 }, { "epoch": 0.698454455771128, "grad_norm": 0.11990439146757126, "learning_rate": 5.046470441810528e-05, "loss": 1.0816, "step": 5310 }, { "epoch": 0.6991121341663926, "grad_norm": 0.13803936541080475, "learning_rate": 5.026537049331228e-05, "loss": 1.1136, "step": 5315 }, { "epoch": 0.6997698125616574, "grad_norm": 0.12677443027496338, "learning_rate": 5.006629878456835e-05, "loss": 1.0909, "step": 5320 }, { "epoch": 0.700427490956922, "grad_norm": 0.12767791748046875, "learning_rate": 4.986749034143997e-05, "loss": 1.1022, "step": 5325 }, { "epoch": 0.7010851693521868, "grad_norm": 0.122572161257267, "learning_rate": 4.966894621210558e-05, "loss": 1.0933, "step": 5330 }, { "epoch": 0.7017428477474515, "grad_norm": 0.12715312838554382, "learning_rate": 4.9470667443349896e-05, "loss": 1.0568, "step": 5335 }, { "epoch": 0.7024005261427162, "grad_norm": 0.1509445458650589, "learning_rate": 4.927265508055884e-05, "loss": 1.1062, "step": 5340 }, { "epoch": 0.703058204537981, "grad_norm": 0.12912996113300323, "learning_rate": 4.907491016771361e-05, "loss": 1.0875, "step": 5345 }, { "epoch": 0.7037158829332456, "grad_norm": 0.13760599493980408, "learning_rate": 4.8877433747385326e-05, "loss": 1.0674, "step": 5350 }, { "epoch": 0.7043735613285104, "grad_norm": 0.12424129247665405, "learning_rate": 4.8680226860729585e-05, "loss": 1.1437, "step": 5355 }, { "epoch": 0.7050312397237751, "grad_norm": 0.12263917922973633, "learning_rate": 4.848329054748089e-05, "loss": 1.0494, "step": 5360 }, { "epoch": 0.7056889181190398, "grad_norm": 0.12154939025640488, "learning_rate": 4.828662584594721e-05, "loss": 1.0755, "step": 5365 }, { "epoch": 0.7063465965143045, "grad_norm": 0.12045510113239288, "learning_rate": 4.809023379300445e-05, "loss": 1.1459, "step": 5370 }, { "epoch": 0.7070042749095692, "grad_norm": 0.12804795801639557, "learning_rate": 4.7894115424091135e-05, "loss": 1.0111, "step": 5375 }, { "epoch": 0.707661953304834, "grad_norm": 0.11997427046298981, "learning_rate": 4.769827177320281e-05, "loss": 1.0409, "step": 5380 }, { "epoch": 0.7083196317000987, "grad_norm": 0.12534627318382263, "learning_rate": 4.7502703872886456e-05, "loss": 1.078, "step": 5385 }, { "epoch": 0.7089773100953634, "grad_norm": 0.12500308454036713, "learning_rate": 4.730741275423546e-05, "loss": 1.1349, "step": 5390 }, { "epoch": 0.7096349884906281, "grad_norm": 0.12224148958921432, "learning_rate": 4.7112399446883806e-05, "loss": 1.0617, "step": 5395 }, { "epoch": 0.7102926668858928, "grad_norm": 0.12461768835783005, "learning_rate": 4.6917664979000777e-05, "loss": 1.1302, "step": 5400 }, { "epoch": 0.7109503452811575, "grad_norm": 0.13118930160999298, "learning_rate": 4.6723210377285545e-05, "loss": 1.1041, "step": 5405 }, { "epoch": 0.7116080236764222, "grad_norm": 0.11962978541851044, "learning_rate": 4.652903666696173e-05, "loss": 1.1115, "step": 5410 }, { "epoch": 0.7122657020716869, "grad_norm": 0.13389453291893005, "learning_rate": 4.6335144871772e-05, "loss": 1.1347, "step": 5415 }, { "epoch": 0.7129233804669517, "grad_norm": 0.12964506447315216, "learning_rate": 4.6141536013972754e-05, "loss": 1.0762, "step": 5420 }, { "epoch": 0.7135810588622163, "grad_norm": 0.13209283351898193, "learning_rate": 4.5948211114328575e-05, "loss": 1.095, "step": 5425 }, { "epoch": 0.7142387372574811, "grad_norm": 0.1927478015422821, "learning_rate": 4.575517119210696e-05, "loss": 1.0934, "step": 5430 }, { "epoch": 0.7148964156527458, "grad_norm": 0.12653924524784088, "learning_rate": 4.556241726507292e-05, "loss": 1.0752, "step": 5435 }, { "epoch": 0.7155540940480105, "grad_norm": 0.12985467910766602, "learning_rate": 4.5369950349483616e-05, "loss": 1.0751, "step": 5440 }, { "epoch": 0.7162117724432753, "grad_norm": 0.12432250380516052, "learning_rate": 4.517777146008294e-05, "loss": 1.0938, "step": 5445 }, { "epoch": 0.7168694508385399, "grad_norm": 0.1240161657333374, "learning_rate": 4.498588161009641e-05, "loss": 1.0933, "step": 5450 }, { "epoch": 0.7175271292338047, "grad_norm": 0.12687431275844574, "learning_rate": 4.479428181122538e-05, "loss": 1.0698, "step": 5455 }, { "epoch": 0.7181848076290693, "grad_norm": 0.16903181374073029, "learning_rate": 4.4602973073642094e-05, "loss": 1.1258, "step": 5460 }, { "epoch": 0.7188424860243341, "grad_norm": 0.12974195182323456, "learning_rate": 4.441195640598432e-05, "loss": 1.1193, "step": 5465 }, { "epoch": 0.7195001644195989, "grad_norm": 0.1297825127840042, "learning_rate": 4.422123281534981e-05, "loss": 1.1068, "step": 5470 }, { "epoch": 0.7201578428148635, "grad_norm": 0.12946543097496033, "learning_rate": 4.403080330729118e-05, "loss": 1.0866, "step": 5475 }, { "epoch": 0.7208155212101283, "grad_norm": 0.12342122197151184, "learning_rate": 4.384066888581053e-05, "loss": 1.0726, "step": 5480 }, { "epoch": 0.7214731996053929, "grad_norm": 0.11856316030025482, "learning_rate": 4.365083055335419e-05, "loss": 1.1108, "step": 5485 }, { "epoch": 0.7221308780006577, "grad_norm": 0.12029319256544113, "learning_rate": 4.346128931080744e-05, "loss": 1.0804, "step": 5490 }, { "epoch": 0.7227885563959224, "grad_norm": 0.12537036836147308, "learning_rate": 4.327204615748914e-05, "loss": 1.1164, "step": 5495 }, { "epoch": 0.7234462347911871, "grad_norm": 0.1305924504995346, "learning_rate": 4.3083102091146654e-05, "loss": 1.1141, "step": 5500 }, { "epoch": 0.7241039131864518, "grad_norm": 0.13634340465068817, "learning_rate": 4.28944581079504e-05, "loss": 1.0766, "step": 5505 }, { "epoch": 0.7247615915817165, "grad_norm": 0.12790511548519135, "learning_rate": 4.270611520248855e-05, "loss": 1.0884, "step": 5510 }, { "epoch": 0.7254192699769813, "grad_norm": 0.12282072007656097, "learning_rate": 4.2518074367762154e-05, "loss": 1.1056, "step": 5515 }, { "epoch": 0.726076948372246, "grad_norm": 0.12589825689792633, "learning_rate": 4.2330336595179446e-05, "loss": 1.1011, "step": 5520 }, { "epoch": 0.7267346267675107, "grad_norm": 0.11987843364477158, "learning_rate": 4.2142902874550905e-05, "loss": 1.0911, "step": 5525 }, { "epoch": 0.7273923051627754, "grad_norm": 0.12477397918701172, "learning_rate": 4.1955774194083964e-05, "loss": 1.0406, "step": 5530 }, { "epoch": 0.7280499835580401, "grad_norm": 0.11790572851896286, "learning_rate": 4.176895154037775e-05, "loss": 1.1059, "step": 5535 }, { "epoch": 0.7287076619533048, "grad_norm": 0.12950554490089417, "learning_rate": 4.1582435898417936e-05, "loss": 1.0638, "step": 5540 }, { "epoch": 0.7293653403485696, "grad_norm": 0.1302611380815506, "learning_rate": 4.139622825157161e-05, "loss": 1.091, "step": 5545 }, { "epoch": 0.7300230187438342, "grad_norm": 0.1281176209449768, "learning_rate": 4.121032958158191e-05, "loss": 1.0339, "step": 5550 }, { "epoch": 0.730680697139099, "grad_norm": 0.13138709962368011, "learning_rate": 4.102474086856302e-05, "loss": 1.0746, "step": 5555 }, { "epoch": 0.7313383755343636, "grad_norm": 0.11881053447723389, "learning_rate": 4.083946309099489e-05, "loss": 1.0312, "step": 5560 }, { "epoch": 0.7319960539296284, "grad_norm": 0.12164081633090973, "learning_rate": 4.065449722571817e-05, "loss": 1.1014, "step": 5565 }, { "epoch": 0.7326537323248932, "grad_norm": 0.14139993488788605, "learning_rate": 4.046984424792895e-05, "loss": 1.0499, "step": 5570 }, { "epoch": 0.7333114107201578, "grad_norm": 0.1338062882423401, "learning_rate": 4.028550513117384e-05, "loss": 1.0524, "step": 5575 }, { "epoch": 0.7339690891154226, "grad_norm": 0.14831510186195374, "learning_rate": 4.010148084734445e-05, "loss": 1.0959, "step": 5580 }, { "epoch": 0.7346267675106872, "grad_norm": 0.13278047740459442, "learning_rate": 3.9917772366672624e-05, "loss": 1.0351, "step": 5585 }, { "epoch": 0.735284445905952, "grad_norm": 0.13070765137672424, "learning_rate": 3.973438065772523e-05, "loss": 1.0518, "step": 5590 }, { "epoch": 0.7359421243012167, "grad_norm": 0.12224950641393661, "learning_rate": 3.955130668739896e-05, "loss": 1.1184, "step": 5595 }, { "epoch": 0.7365998026964814, "grad_norm": 0.13142383098602295, "learning_rate": 3.936855142091529e-05, "loss": 1.0636, "step": 5600 }, { "epoch": 0.7372574810917462, "grad_norm": 0.13372188806533813, "learning_rate": 3.9186115821815416e-05, "loss": 1.1268, "step": 5605 }, { "epoch": 0.7379151594870108, "grad_norm": 0.1241203173995018, "learning_rate": 3.900400085195514e-05, "loss": 1.0681, "step": 5610 }, { "epoch": 0.7385728378822756, "grad_norm": 0.1251928061246872, "learning_rate": 3.882220747149981e-05, "loss": 1.0437, "step": 5615 }, { "epoch": 0.7392305162775403, "grad_norm": 0.12923811376094818, "learning_rate": 3.8640736638919205e-05, "loss": 1.0977, "step": 5620 }, { "epoch": 0.739888194672805, "grad_norm": 0.12261676043272018, "learning_rate": 3.845958931098268e-05, "loss": 1.1176, "step": 5625 }, { "epoch": 0.7405458730680697, "grad_norm": 0.12809810042381287, "learning_rate": 3.8278766442753876e-05, "loss": 1.0736, "step": 5630 }, { "epoch": 0.7412035514633344, "grad_norm": 0.12051175534725189, "learning_rate": 3.8098268987585685e-05, "loss": 1.1431, "step": 5635 }, { "epoch": 0.7418612298585991, "grad_norm": 0.13410921394824982, "learning_rate": 3.7918097897115536e-05, "loss": 1.0494, "step": 5640 }, { "epoch": 0.7425189082538639, "grad_norm": 0.1260906457901001, "learning_rate": 3.7738254121259995e-05, "loss": 1.0658, "step": 5645 }, { "epoch": 0.7431765866491286, "grad_norm": 0.13504593074321747, "learning_rate": 3.755873860821003e-05, "loss": 1.0938, "step": 5650 }, { "epoch": 0.7438342650443933, "grad_norm": 0.12676137685775757, "learning_rate": 3.7379552304425836e-05, "loss": 1.1051, "step": 5655 }, { "epoch": 0.744491943439658, "grad_norm": 0.11792674660682678, "learning_rate": 3.7200696154631957e-05, "loss": 1.0847, "step": 5660 }, { "epoch": 0.7451496218349227, "grad_norm": 0.13282136619091034, "learning_rate": 3.702217110181221e-05, "loss": 1.091, "step": 5665 }, { "epoch": 0.7458073002301875, "grad_norm": 0.15089426934719086, "learning_rate": 3.6843978087204876e-05, "loss": 1.1069, "step": 5670 }, { "epoch": 0.7464649786254521, "grad_norm": 0.12523071467876434, "learning_rate": 3.666611805029756e-05, "loss": 1.0831, "step": 5675 }, { "epoch": 0.7471226570207169, "grad_norm": 0.11661294847726822, "learning_rate": 3.648859192882229e-05, "loss": 1.0747, "step": 5680 }, { "epoch": 0.7477803354159815, "grad_norm": 0.12711404263973236, "learning_rate": 3.631140065875063e-05, "loss": 1.1339, "step": 5685 }, { "epoch": 0.7484380138112463, "grad_norm": 0.12335199862718582, "learning_rate": 3.613454517428867e-05, "loss": 1.1319, "step": 5690 }, { "epoch": 0.7490956922065111, "grad_norm": 0.12527814507484436, "learning_rate": 3.5958026407872136e-05, "loss": 1.0489, "step": 5695 }, { "epoch": 0.7497533706017757, "grad_norm": 0.13666552305221558, "learning_rate": 3.5781845290161585e-05, "loss": 1.0671, "step": 5700 }, { "epoch": 0.7504110489970405, "grad_norm": 0.12504583597183228, "learning_rate": 3.560600275003723e-05, "loss": 1.0834, "step": 5705 }, { "epoch": 0.7510687273923051, "grad_norm": 0.13556911051273346, "learning_rate": 3.5430499714594226e-05, "loss": 1.0748, "step": 5710 }, { "epoch": 0.7517264057875699, "grad_norm": 0.1435319036245346, "learning_rate": 3.525533710913789e-05, "loss": 1.0583, "step": 5715 }, { "epoch": 0.7523840841828346, "grad_norm": 0.12800288200378418, "learning_rate": 3.5080515857178544e-05, "loss": 1.0664, "step": 5720 }, { "epoch": 0.7530417625780993, "grad_norm": 0.1283649206161499, "learning_rate": 3.490603688042686e-05, "loss": 1.0855, "step": 5725 }, { "epoch": 0.753699440973364, "grad_norm": 0.13517865538597107, "learning_rate": 3.473190109878892e-05, "loss": 1.0972, "step": 5730 }, { "epoch": 0.7543571193686287, "grad_norm": 0.12633182108402252, "learning_rate": 3.455810943036136e-05, "loss": 1.0742, "step": 5735 }, { "epoch": 0.7550147977638935, "grad_norm": 0.13017481565475464, "learning_rate": 3.438466279142657e-05, "loss": 1.0886, "step": 5740 }, { "epoch": 0.7556724761591582, "grad_norm": 0.1217026337981224, "learning_rate": 3.421156209644779e-05, "loss": 1.0591, "step": 5745 }, { "epoch": 0.7563301545544229, "grad_norm": 0.13338933885097504, "learning_rate": 3.403880825806445e-05, "loss": 1.053, "step": 5750 }, { "epoch": 0.7569878329496876, "grad_norm": 0.1305582970380783, "learning_rate": 3.386640218708715e-05, "loss": 1.1054, "step": 5755 }, { "epoch": 0.7576455113449523, "grad_norm": 0.12431497126817703, "learning_rate": 3.3694344792492917e-05, "loss": 1.0709, "step": 5760 }, { "epoch": 0.758303189740217, "grad_norm": 0.12837854027748108, "learning_rate": 3.352263698142059e-05, "loss": 1.111, "step": 5765 }, { "epoch": 0.7589608681354818, "grad_norm": 0.1517431139945984, "learning_rate": 3.335127965916578e-05, "loss": 1.0912, "step": 5770 }, { "epoch": 0.7596185465307465, "grad_norm": 0.12348992377519608, "learning_rate": 3.318027372917628e-05, "loss": 1.0851, "step": 5775 }, { "epoch": 0.7602762249260112, "grad_norm": 0.12005271762609482, "learning_rate": 3.300962009304719e-05, "loss": 1.0215, "step": 5780 }, { "epoch": 0.7609339033212759, "grad_norm": 0.13298583030700684, "learning_rate": 3.2839319650516256e-05, "loss": 1.0975, "step": 5785 }, { "epoch": 0.7615915817165406, "grad_norm": 0.12193068861961365, "learning_rate": 3.266937329945898e-05, "loss": 1.086, "step": 5790 }, { "epoch": 0.7622492601118054, "grad_norm": 0.12363184243440628, "learning_rate": 3.2499781935884174e-05, "loss": 1.028, "step": 5795 }, { "epoch": 0.76290693850707, "grad_norm": 0.12665733695030212, "learning_rate": 3.233054645392888e-05, "loss": 1.1055, "step": 5800 }, { "epoch": 0.7635646169023348, "grad_norm": 0.128564715385437, "learning_rate": 3.216166774585388e-05, "loss": 1.0454, "step": 5805 }, { "epoch": 0.7642222952975994, "grad_norm": 0.1310684084892273, "learning_rate": 3.1993146702038955e-05, "loss": 1.0436, "step": 5810 }, { "epoch": 0.7648799736928642, "grad_norm": 0.15509584546089172, "learning_rate": 3.1824984210978123e-05, "loss": 1.0292, "step": 5815 }, { "epoch": 0.7655376520881289, "grad_norm": 0.12039491534233093, "learning_rate": 3.165718115927503e-05, "loss": 1.0883, "step": 5820 }, { "epoch": 0.7661953304833936, "grad_norm": 0.12884244322776794, "learning_rate": 3.1489738431638335e-05, "loss": 1.0921, "step": 5825 }, { "epoch": 0.7668530088786584, "grad_norm": 0.13951320946216583, "learning_rate": 3.132265691087678e-05, "loss": 1.0823, "step": 5830 }, { "epoch": 0.767510687273923, "grad_norm": 0.13622590899467468, "learning_rate": 3.11559374778948e-05, "loss": 1.0844, "step": 5835 }, { "epoch": 0.7681683656691878, "grad_norm": 0.1248575896024704, "learning_rate": 3.0989581011687896e-05, "loss": 1.0768, "step": 5840 }, { "epoch": 0.7688260440644524, "grad_norm": 0.12006829679012299, "learning_rate": 3.0823588389337754e-05, "loss": 1.0598, "step": 5845 }, { "epoch": 0.7694837224597172, "grad_norm": 0.12956292927265167, "learning_rate": 3.065796048600782e-05, "loss": 1.1936, "step": 5850 }, { "epoch": 0.770141400854982, "grad_norm": 0.12849761545658112, "learning_rate": 3.0492698174938638e-05, "loss": 1.1292, "step": 5855 }, { "epoch": 0.7707990792502466, "grad_norm": 0.12100765109062195, "learning_rate": 3.0327802327443212e-05, "loss": 1.0767, "step": 5860 }, { "epoch": 0.7714567576455114, "grad_norm": 0.13766878843307495, "learning_rate": 3.0163273812902472e-05, "loss": 1.1233, "step": 5865 }, { "epoch": 0.772114436040776, "grad_norm": 0.1391197144985199, "learning_rate": 2.999911349876059e-05, "loss": 1.0918, "step": 5870 }, { "epoch": 0.7727721144360408, "grad_norm": 0.1272421032190323, "learning_rate": 2.983532225052065e-05, "loss": 1.0508, "step": 5875 }, { "epoch": 0.7734297928313055, "grad_norm": 0.12618601322174072, "learning_rate": 2.967190093173965e-05, "loss": 1.1133, "step": 5880 }, { "epoch": 0.7740874712265702, "grad_norm": 0.12637467682361603, "learning_rate": 2.950885040402438e-05, "loss": 1.0992, "step": 5885 }, { "epoch": 0.7747451496218349, "grad_norm": 0.12542709708213806, "learning_rate": 2.93461715270267e-05, "loss": 1.0366, "step": 5890 }, { "epoch": 0.7754028280170996, "grad_norm": 0.12898610532283783, "learning_rate": 2.918386515843897e-05, "loss": 1.1121, "step": 5895 }, { "epoch": 0.7760605064123643, "grad_norm": 0.12418232858181, "learning_rate": 2.9021932153989585e-05, "loss": 1.0766, "step": 5900 }, { "epoch": 0.7767181848076291, "grad_norm": 0.1434275060892105, "learning_rate": 2.8860373367438442e-05, "loss": 1.0537, "step": 5905 }, { "epoch": 0.7773758632028938, "grad_norm": 0.12791401147842407, "learning_rate": 2.8699189650572465e-05, "loss": 1.0657, "step": 5910 }, { "epoch": 0.7780335415981585, "grad_norm": 0.1229763776063919, "learning_rate": 2.8538381853201057e-05, "loss": 1.0425, "step": 5915 }, { "epoch": 0.7786912199934232, "grad_norm": 0.12512493133544922, "learning_rate": 2.8377950823151756e-05, "loss": 1.0707, "step": 5920 }, { "epoch": 0.7793488983886879, "grad_norm": 0.1252317726612091, "learning_rate": 2.8217897406265558e-05, "loss": 1.011, "step": 5925 }, { "epoch": 0.7800065767839527, "grad_norm": 0.13542944192886353, "learning_rate": 2.8058222446392633e-05, "loss": 1.1348, "step": 5930 }, { "epoch": 0.7806642551792173, "grad_norm": 0.1242247000336647, "learning_rate": 2.789892678538778e-05, "loss": 1.0955, "step": 5935 }, { "epoch": 0.7813219335744821, "grad_norm": 0.12406536191701889, "learning_rate": 2.7740011263106047e-05, "loss": 1.1129, "step": 5940 }, { "epoch": 0.7819796119697467, "grad_norm": 0.13313288986682892, "learning_rate": 2.7581476717398215e-05, "loss": 1.0824, "step": 5945 }, { "epoch": 0.7826372903650115, "grad_norm": 0.12449498474597931, "learning_rate": 2.7423323984106573e-05, "loss": 1.0483, "step": 5950 }, { "epoch": 0.7832949687602763, "grad_norm": 0.13022910058498383, "learning_rate": 2.7265553897060214e-05, "loss": 1.075, "step": 5955 }, { "epoch": 0.7839526471555409, "grad_norm": 0.12742061913013458, "learning_rate": 2.7108167288070873e-05, "loss": 1.1042, "step": 5960 }, { "epoch": 0.7846103255508057, "grad_norm": 0.1270124316215515, "learning_rate": 2.6951164986928535e-05, "loss": 1.0423, "step": 5965 }, { "epoch": 0.7852680039460703, "grad_norm": 0.12378037720918655, "learning_rate": 2.679454782139693e-05, "loss": 1.1425, "step": 5970 }, { "epoch": 0.7859256823413351, "grad_norm": 0.1281309574842453, "learning_rate": 2.663831661720925e-05, "loss": 1.079, "step": 5975 }, { "epoch": 0.7865833607365998, "grad_norm": 0.12711021304130554, "learning_rate": 2.6482472198063767e-05, "loss": 1.0912, "step": 5980 }, { "epoch": 0.7872410391318645, "grad_norm": 0.12602056562900543, "learning_rate": 2.6327015385619523e-05, "loss": 1.0563, "step": 5985 }, { "epoch": 0.7878987175271293, "grad_norm": 0.13072004914283752, "learning_rate": 2.617194699949199e-05, "loss": 1.1225, "step": 5990 }, { "epoch": 0.7885563959223939, "grad_norm": 0.12265162169933319, "learning_rate": 2.6017267857248685e-05, "loss": 1.0655, "step": 5995 }, { "epoch": 0.7892140743176587, "grad_norm": 0.12272915989160538, "learning_rate": 2.5862978774405035e-05, "loss": 1.0768, "step": 6000 }, { "epoch": 0.7898717527129234, "grad_norm": 0.1362546980381012, "learning_rate": 2.5709080564419797e-05, "loss": 1.0752, "step": 6005 }, { "epoch": 0.7905294311081881, "grad_norm": 0.12533649802207947, "learning_rate": 2.5555574038691e-05, "loss": 1.0836, "step": 6010 }, { "epoch": 0.7911871095034528, "grad_norm": 0.12511833012104034, "learning_rate": 2.5402460006551654e-05, "loss": 1.0652, "step": 6015 }, { "epoch": 0.7918447878987175, "grad_norm": 0.13956275582313538, "learning_rate": 2.524973927526535e-05, "loss": 1.0535, "step": 6020 }, { "epoch": 0.7925024662939822, "grad_norm": 0.13039498031139374, "learning_rate": 2.50974126500221e-05, "loss": 1.0935, "step": 6025 }, { "epoch": 0.793160144689247, "grad_norm": 0.1342824548482895, "learning_rate": 2.494548093393404e-05, "loss": 1.0697, "step": 6030 }, { "epoch": 0.7938178230845117, "grad_norm": 0.12390636652708054, "learning_rate": 2.4793944928031286e-05, "loss": 1.0611, "step": 6035 }, { "epoch": 0.7944755014797764, "grad_norm": 0.12488777935504913, "learning_rate": 2.4642805431257566e-05, "loss": 1.072, "step": 6040 }, { "epoch": 0.7951331798750411, "grad_norm": 0.12500470876693726, "learning_rate": 2.44920632404662e-05, "loss": 1.1041, "step": 6045 }, { "epoch": 0.7957908582703058, "grad_norm": 0.12756222486495972, "learning_rate": 2.434171915041571e-05, "loss": 1.1253, "step": 6050 }, { "epoch": 0.7964485366655706, "grad_norm": 0.15743061900138855, "learning_rate": 2.4191773953765716e-05, "loss": 1.1549, "step": 6055 }, { "epoch": 0.7971062150608352, "grad_norm": 0.12112820148468018, "learning_rate": 2.4042228441072788e-05, "loss": 1.0701, "step": 6060 }, { "epoch": 0.7977638934561, "grad_norm": 0.1167597696185112, "learning_rate": 2.389308340078621e-05, "loss": 1.0453, "step": 6065 }, { "epoch": 0.7984215718513646, "grad_norm": 0.12690216302871704, "learning_rate": 2.3744339619243828e-05, "loss": 1.0751, "step": 6070 }, { "epoch": 0.7990792502466294, "grad_norm": 0.138803169131279, "learning_rate": 2.3595997880668064e-05, "loss": 1.0619, "step": 6075 }, { "epoch": 0.7997369286418942, "grad_norm": 0.1289168745279312, "learning_rate": 2.3448058967161445e-05, "loss": 1.0456, "step": 6080 }, { "epoch": 0.8003946070371588, "grad_norm": 0.1514570713043213, "learning_rate": 2.330052365870281e-05, "loss": 1.1591, "step": 6085 }, { "epoch": 0.8010522854324236, "grad_norm": 0.18199890851974487, "learning_rate": 2.31533927331431e-05, "loss": 1.0836, "step": 6090 }, { "epoch": 0.8017099638276882, "grad_norm": 0.12002385407686234, "learning_rate": 2.300666696620112e-05, "loss": 1.0344, "step": 6095 }, { "epoch": 0.802367642222953, "grad_norm": 0.12299599498510361, "learning_rate": 2.2860347131459636e-05, "loss": 1.0835, "step": 6100 }, { "epoch": 0.8030253206182177, "grad_norm": 0.12145963311195374, "learning_rate": 2.2714434000361172e-05, "loss": 1.0586, "step": 6105 }, { "epoch": 0.8036829990134824, "grad_norm": 0.1262415051460266, "learning_rate": 2.256892834220402e-05, "loss": 1.1157, "step": 6110 }, { "epoch": 0.8043406774087472, "grad_norm": 0.12122668325901031, "learning_rate": 2.2423830924138088e-05, "loss": 1.0905, "step": 6115 }, { "epoch": 0.8049983558040118, "grad_norm": 0.12394457310438156, "learning_rate": 2.2279142511161043e-05, "loss": 1.0825, "step": 6120 }, { "epoch": 0.8056560341992766, "grad_norm": 0.12165627628564835, "learning_rate": 2.2134863866114064e-05, "loss": 1.1071, "step": 6125 }, { "epoch": 0.8063137125945413, "grad_norm": 0.12368518859148026, "learning_rate": 2.1990995749677856e-05, "loss": 1.1327, "step": 6130 }, { "epoch": 0.806971390989806, "grad_norm": 0.12915311753749847, "learning_rate": 2.184753892036877e-05, "loss": 1.0584, "step": 6135 }, { "epoch": 0.8076290693850707, "grad_norm": 0.12574177980422974, "learning_rate": 2.170449413453478e-05, "loss": 1.0537, "step": 6140 }, { "epoch": 0.8082867477803354, "grad_norm": 0.13796794414520264, "learning_rate": 2.156186214635133e-05, "loss": 1.0685, "step": 6145 }, { "epoch": 0.8089444261756001, "grad_norm": 0.15391041338443756, "learning_rate": 2.1419643707817517e-05, "loss": 1.0759, "step": 6150 }, { "epoch": 0.8096021045708649, "grad_norm": 0.12321018427610397, "learning_rate": 2.127783956875209e-05, "loss": 1.1091, "step": 6155 }, { "epoch": 0.8102597829661295, "grad_norm": 0.1268635392189026, "learning_rate": 2.113645047678946e-05, "loss": 1.1375, "step": 6160 }, { "epoch": 0.8109174613613943, "grad_norm": 0.134885773062706, "learning_rate": 2.0995477177375777e-05, "loss": 1.0544, "step": 6165 }, { "epoch": 0.811575139756659, "grad_norm": 0.12396198511123657, "learning_rate": 2.085492041376509e-05, "loss": 1.0949, "step": 6170 }, { "epoch": 0.8122328181519237, "grad_norm": 0.12038164585828781, "learning_rate": 2.071478092701523e-05, "loss": 1.0738, "step": 6175 }, { "epoch": 0.8128904965471885, "grad_norm": 0.1275164932012558, "learning_rate": 2.057505945598409e-05, "loss": 1.119, "step": 6180 }, { "epoch": 0.8135481749424531, "grad_norm": 0.12457544356584549, "learning_rate": 2.0435756737325628e-05, "loss": 1.0458, "step": 6185 }, { "epoch": 0.8142058533377179, "grad_norm": 0.12855833768844604, "learning_rate": 2.0296873505486035e-05, "loss": 1.0729, "step": 6190 }, { "epoch": 0.8148635317329825, "grad_norm": 0.1232529878616333, "learning_rate": 2.0158410492699774e-05, "loss": 1.105, "step": 6195 }, { "epoch": 0.8155212101282473, "grad_norm": 0.12522192299365997, "learning_rate": 2.0020368428985936e-05, "loss": 1.1102, "step": 6200 }, { "epoch": 0.816178888523512, "grad_norm": 0.13071826100349426, "learning_rate": 1.9882748042144063e-05, "loss": 1.0719, "step": 6205 }, { "epoch": 0.8168365669187767, "grad_norm": 0.12266150861978531, "learning_rate": 1.9745550057750574e-05, "loss": 1.067, "step": 6210 }, { "epoch": 0.8174942453140415, "grad_norm": 0.12871146202087402, "learning_rate": 1.960877519915487e-05, "loss": 1.0851, "step": 6215 }, { "epoch": 0.8181519237093061, "grad_norm": 0.1294073760509491, "learning_rate": 1.9472424187475502e-05, "loss": 1.1056, "step": 6220 }, { "epoch": 0.8188096021045709, "grad_norm": 0.1266794204711914, "learning_rate": 1.9336497741596328e-05, "loss": 1.0634, "step": 6225 }, { "epoch": 0.8194672804998355, "grad_norm": 0.12367183715105057, "learning_rate": 1.920099657816279e-05, "loss": 1.0649, "step": 6230 }, { "epoch": 0.8201249588951003, "grad_norm": 0.1354343146085739, "learning_rate": 1.9065921411578114e-05, "loss": 1.1248, "step": 6235 }, { "epoch": 0.820782637290365, "grad_norm": 0.12939414381980896, "learning_rate": 1.8931272953999523e-05, "loss": 1.0727, "step": 6240 }, { "epoch": 0.8214403156856297, "grad_norm": 0.13352051377296448, "learning_rate": 1.8797051915334563e-05, "loss": 1.1093, "step": 6245 }, { "epoch": 0.8220979940808945, "grad_norm": 0.1339080035686493, "learning_rate": 1.8663259003237254e-05, "loss": 1.0804, "step": 6250 }, { "epoch": 0.8227556724761591, "grad_norm": 0.12919768691062927, "learning_rate": 1.8529894923104363e-05, "loss": 1.0978, "step": 6255 }, { "epoch": 0.8234133508714239, "grad_norm": 0.12053103744983673, "learning_rate": 1.8396960378071747e-05, "loss": 1.0413, "step": 6260 }, { "epoch": 0.8240710292666886, "grad_norm": 0.11699766665697098, "learning_rate": 1.8264456069010692e-05, "loss": 1.0589, "step": 6265 }, { "epoch": 0.8247287076619533, "grad_norm": 0.12871229648590088, "learning_rate": 1.8132382694524086e-05, "loss": 1.0447, "step": 6270 }, { "epoch": 0.825386386057218, "grad_norm": 0.13642272353172302, "learning_rate": 1.8000740950942807e-05, "loss": 1.0844, "step": 6275 }, { "epoch": 0.8260440644524827, "grad_norm": 0.13707904517650604, "learning_rate": 1.7869531532322037e-05, "loss": 1.135, "step": 6280 }, { "epoch": 0.8267017428477474, "grad_norm": 0.12285296618938446, "learning_rate": 1.7738755130437633e-05, "loss": 1.0369, "step": 6285 }, { "epoch": 0.8273594212430122, "grad_norm": 0.11796989291906357, "learning_rate": 1.7608412434782427e-05, "loss": 1.071, "step": 6290 }, { "epoch": 0.8280170996382769, "grad_norm": 0.12308959662914276, "learning_rate": 1.747850413256267e-05, "loss": 1.0327, "step": 6295 }, { "epoch": 0.8286747780335416, "grad_norm": 0.123761385679245, "learning_rate": 1.7349030908694308e-05, "loss": 1.0899, "step": 6300 }, { "epoch": 0.8293324564288063, "grad_norm": 0.12833473086357117, "learning_rate": 1.7219993445799443e-05, "loss": 1.0884, "step": 6305 }, { "epoch": 0.829990134824071, "grad_norm": 0.1321207880973816, "learning_rate": 1.70913924242027e-05, "loss": 1.0936, "step": 6310 }, { "epoch": 0.8306478132193358, "grad_norm": 0.13119511306285858, "learning_rate": 1.696322852192769e-05, "loss": 1.1488, "step": 6315 }, { "epoch": 0.8313054916146004, "grad_norm": 0.12500077486038208, "learning_rate": 1.683550241469333e-05, "loss": 1.097, "step": 6320 }, { "epoch": 0.8319631700098652, "grad_norm": 0.1229422464966774, "learning_rate": 1.6708214775910446e-05, "loss": 1.1105, "step": 6325 }, { "epoch": 0.8326208484051298, "grad_norm": 0.13275906443595886, "learning_rate": 1.6581366276678034e-05, "loss": 1.0957, "step": 6330 }, { "epoch": 0.8332785268003946, "grad_norm": 0.125226691365242, "learning_rate": 1.6454957585779827e-05, "loss": 1.0866, "step": 6335 }, { "epoch": 0.8339362051956594, "grad_norm": 0.12462674826383591, "learning_rate": 1.6328989369680837e-05, "loss": 1.1224, "step": 6340 }, { "epoch": 0.834593883590924, "grad_norm": 0.1262083500623703, "learning_rate": 1.6203462292523673e-05, "loss": 1.0634, "step": 6345 }, { "epoch": 0.8352515619861888, "grad_norm": 0.12686122953891754, "learning_rate": 1.6078377016125145e-05, "loss": 1.1067, "step": 6350 }, { "epoch": 0.8359092403814534, "grad_norm": 0.19298367202281952, "learning_rate": 1.595373419997277e-05, "loss": 1.0901, "step": 6355 }, { "epoch": 0.8365669187767182, "grad_norm": 0.1409919261932373, "learning_rate": 1.582953450122128e-05, "loss": 1.1006, "step": 6360 }, { "epoch": 0.8372245971719829, "grad_norm": 0.12460668385028839, "learning_rate": 1.570577857468911e-05, "loss": 1.0723, "step": 6365 }, { "epoch": 0.8378822755672476, "grad_norm": 0.12689638137817383, "learning_rate": 1.5582467072855088e-05, "loss": 1.0519, "step": 6370 }, { "epoch": 0.8385399539625124, "grad_norm": 0.12708817422389984, "learning_rate": 1.5459600645854853e-05, "loss": 1.1147, "step": 6375 }, { "epoch": 0.839197632357777, "grad_norm": 0.1359519064426422, "learning_rate": 1.533717994147742e-05, "loss": 1.0811, "step": 6380 }, { "epoch": 0.8398553107530418, "grad_norm": 0.1492021381855011, "learning_rate": 1.5215205605161876e-05, "loss": 1.0821, "step": 6385 }, { "epoch": 0.8405129891483065, "grad_norm": 0.1319791078567505, "learning_rate": 1.5093678279993928e-05, "loss": 1.0833, "step": 6390 }, { "epoch": 0.8411706675435712, "grad_norm": 0.12496403604745865, "learning_rate": 1.4972598606702504e-05, "loss": 1.0336, "step": 6395 }, { "epoch": 0.8418283459388359, "grad_norm": 0.12768518924713135, "learning_rate": 1.4851967223656327e-05, "loss": 1.0245, "step": 6400 }, { "epoch": 0.8424860243341006, "grad_norm": 0.12619437277317047, "learning_rate": 1.4731784766860645e-05, "loss": 1.0966, "step": 6405 }, { "epoch": 0.8431437027293653, "grad_norm": 0.13095134496688843, "learning_rate": 1.4612051869953824e-05, "loss": 1.0603, "step": 6410 }, { "epoch": 0.8438013811246301, "grad_norm": 0.13002678751945496, "learning_rate": 1.4492769164203967e-05, "loss": 1.053, "step": 6415 }, { "epoch": 0.8444590595198948, "grad_norm": 0.12430264055728912, "learning_rate": 1.4373937278505733e-05, "loss": 1.036, "step": 6420 }, { "epoch": 0.8451167379151595, "grad_norm": 0.11899828910827637, "learning_rate": 1.425555683937685e-05, "loss": 1.0772, "step": 6425 }, { "epoch": 0.8457744163104242, "grad_norm": 0.12336203455924988, "learning_rate": 1.4137628470954823e-05, "loss": 1.2016, "step": 6430 }, { "epoch": 0.8464320947056889, "grad_norm": 0.14179807901382446, "learning_rate": 1.4020152794993845e-05, "loss": 1.0869, "step": 6435 }, { "epoch": 0.8470897731009537, "grad_norm": 0.1420190930366516, "learning_rate": 1.3903130430861289e-05, "loss": 1.1364, "step": 6440 }, { "epoch": 0.8477474514962183, "grad_norm": 0.1326138824224472, "learning_rate": 1.3786561995534519e-05, "loss": 1.0957, "step": 6445 }, { "epoch": 0.8484051298914831, "grad_norm": 0.1243433952331543, "learning_rate": 1.3670448103597733e-05, "loss": 1.0783, "step": 6450 }, { "epoch": 0.8490628082867477, "grad_norm": 0.12677830457687378, "learning_rate": 1.355478936723853e-05, "loss": 1.0601, "step": 6455 }, { "epoch": 0.8497204866820125, "grad_norm": 0.14486576616764069, "learning_rate": 1.343958639624483e-05, "loss": 1.1037, "step": 6460 }, { "epoch": 0.8503781650772773, "grad_norm": 0.13399994373321533, "learning_rate": 1.3324839798001688e-05, "loss": 1.1105, "step": 6465 }, { "epoch": 0.8510358434725419, "grad_norm": 0.11788889020681381, "learning_rate": 1.3210550177487934e-05, "loss": 1.0683, "step": 6470 }, { "epoch": 0.8516935218678067, "grad_norm": 0.15149006247520447, "learning_rate": 1.3096718137273113e-05, "loss": 1.0817, "step": 6475 }, { "epoch": 0.8523512002630713, "grad_norm": 0.1239786222577095, "learning_rate": 1.2983344277514264e-05, "loss": 1.1039, "step": 6480 }, { "epoch": 0.8530088786583361, "grad_norm": 0.13700757920742035, "learning_rate": 1.2870429195952748e-05, "loss": 1.0775, "step": 6485 }, { "epoch": 0.8536665570536008, "grad_norm": 0.12752103805541992, "learning_rate": 1.2757973487911112e-05, "loss": 1.1008, "step": 6490 }, { "epoch": 0.8543242354488655, "grad_norm": 0.12763357162475586, "learning_rate": 1.2645977746289972e-05, "loss": 1.0727, "step": 6495 }, { "epoch": 0.8549819138441302, "grad_norm": 0.12177115678787231, "learning_rate": 1.2534442561564863e-05, "loss": 1.109, "step": 6500 }, { "epoch": 0.8556395922393949, "grad_norm": 0.12414301186800003, "learning_rate": 1.242336852178304e-05, "loss": 1.0783, "step": 6505 }, { "epoch": 0.8562972706346597, "grad_norm": 0.13752809166908264, "learning_rate": 1.231275621256054e-05, "loss": 1.0349, "step": 6510 }, { "epoch": 0.8569549490299244, "grad_norm": 0.12942947447299957, "learning_rate": 1.2202606217079037e-05, "loss": 1.0601, "step": 6515 }, { "epoch": 0.8576126274251891, "grad_norm": 0.12161218374967575, "learning_rate": 1.209291911608269e-05, "loss": 1.0858, "step": 6520 }, { "epoch": 0.8582703058204538, "grad_norm": 0.12335877865552902, "learning_rate": 1.1983695487875168e-05, "loss": 1.0785, "step": 6525 }, { "epoch": 0.8589279842157185, "grad_norm": 0.12881027162075043, "learning_rate": 1.1874935908316543e-05, "loss": 1.0563, "step": 6530 }, { "epoch": 0.8595856626109832, "grad_norm": 0.13974754512310028, "learning_rate": 1.1766640950820317e-05, "loss": 1.0699, "step": 6535 }, { "epoch": 0.860243341006248, "grad_norm": 0.1257486641407013, "learning_rate": 1.1658811186350316e-05, "loss": 1.1036, "step": 6540 }, { "epoch": 0.8609010194015126, "grad_norm": 0.12332329154014587, "learning_rate": 1.1551447183417808e-05, "loss": 1.0944, "step": 6545 }, { "epoch": 0.8615586977967774, "grad_norm": 0.12767980992794037, "learning_rate": 1.1444549508078372e-05, "loss": 1.1416, "step": 6550 }, { "epoch": 0.862216376192042, "grad_norm": 0.12986689805984497, "learning_rate": 1.1338118723928904e-05, "loss": 1.111, "step": 6555 }, { "epoch": 0.8628740545873068, "grad_norm": 0.13037726283073425, "learning_rate": 1.1232155392104837e-05, "loss": 1.0524, "step": 6560 }, { "epoch": 0.8635317329825716, "grad_norm": 0.12034355849027634, "learning_rate": 1.1126660071276972e-05, "loss": 1.1082, "step": 6565 }, { "epoch": 0.8641894113778362, "grad_norm": 0.12144242972135544, "learning_rate": 1.1021633317648616e-05, "loss": 1.0757, "step": 6570 }, { "epoch": 0.864847089773101, "grad_norm": 0.13375237584114075, "learning_rate": 1.0917075684952726e-05, "loss": 1.0919, "step": 6575 }, { "epoch": 0.8655047681683656, "grad_norm": 0.11908838897943497, "learning_rate": 1.0812987724448775e-05, "loss": 1.109, "step": 6580 }, { "epoch": 0.8661624465636304, "grad_norm": 0.1239566057920456, "learning_rate": 1.070936998492007e-05, "loss": 1.0914, "step": 6585 }, { "epoch": 0.8668201249588952, "grad_norm": 0.11583433300256729, "learning_rate": 1.0606223012670791e-05, "loss": 1.0826, "step": 6590 }, { "epoch": 0.8674778033541598, "grad_norm": 0.1269976645708084, "learning_rate": 1.0503547351523036e-05, "loss": 1.1559, "step": 6595 }, { "epoch": 0.8681354817494246, "grad_norm": 0.13367483019828796, "learning_rate": 1.0401343542814025e-05, "loss": 1.1103, "step": 6600 }, { "epoch": 0.8687931601446892, "grad_norm": 0.12513022124767303, "learning_rate": 1.0299612125393233e-05, "loss": 1.1129, "step": 6605 }, { "epoch": 0.869450838539954, "grad_norm": 0.12229305505752563, "learning_rate": 1.0198353635619551e-05, "loss": 1.0573, "step": 6610 }, { "epoch": 0.8701085169352186, "grad_norm": 0.12285857647657394, "learning_rate": 1.0097568607358421e-05, "loss": 1.1456, "step": 6615 }, { "epoch": 0.8707661953304834, "grad_norm": 0.13464665412902832, "learning_rate": 9.997257571979102e-06, "loss": 1.1222, "step": 6620 }, { "epoch": 0.8714238737257481, "grad_norm": 0.1342402994632721, "learning_rate": 9.897421058351809e-06, "loss": 1.1232, "step": 6625 }, { "epoch": 0.8720815521210128, "grad_norm": 0.124883733689785, "learning_rate": 9.798059592844855e-06, "loss": 1.0465, "step": 6630 }, { "epoch": 0.8727392305162776, "grad_norm": 0.12607227265834808, "learning_rate": 9.699173699322073e-06, "loss": 1.0561, "step": 6635 }, { "epoch": 0.8733969089115422, "grad_norm": 0.13045276701450348, "learning_rate": 9.600763899139865e-06, "loss": 1.0981, "step": 6640 }, { "epoch": 0.874054587306807, "grad_norm": 0.1471523642539978, "learning_rate": 9.502830711144528e-06, "loss": 1.1118, "step": 6645 }, { "epoch": 0.8747122657020717, "grad_norm": 0.1303831934928894, "learning_rate": 9.405374651669552e-06, "loss": 1.0877, "step": 6650 }, { "epoch": 0.8753699440973364, "grad_norm": 0.11664953827857971, "learning_rate": 9.308396234532823e-06, "loss": 1.1333, "step": 6655 }, { "epoch": 0.8760276224926011, "grad_norm": 0.13784165680408478, "learning_rate": 9.211895971033968e-06, "loss": 1.0668, "step": 6660 }, { "epoch": 0.8766853008878658, "grad_norm": 0.13570618629455566, "learning_rate": 9.115874369951615e-06, "loss": 1.0505, "step": 6665 }, { "epoch": 0.8773429792831305, "grad_norm": 0.12611496448516846, "learning_rate": 9.020331937540815e-06, "loss": 1.0584, "step": 6670 }, { "epoch": 0.8780006576783953, "grad_norm": 0.13073426485061646, "learning_rate": 8.925269177530238e-06, "loss": 1.055, "step": 6675 }, { "epoch": 0.87865833607366, "grad_norm": 0.13663744926452637, "learning_rate": 8.83068659111952e-06, "loss": 1.098, "step": 6680 }, { "epoch": 0.8793160144689247, "grad_norm": 0.11919693648815155, "learning_rate": 8.73658467697681e-06, "loss": 1.1012, "step": 6685 }, { "epoch": 0.8799736928641894, "grad_norm": 0.1184806302189827, "learning_rate": 8.642963931235904e-06, "loss": 1.0686, "step": 6690 }, { "epoch": 0.8806313712594541, "grad_norm": 0.12958255410194397, "learning_rate": 8.549824847493748e-06, "loss": 1.0574, "step": 6695 }, { "epoch": 0.8812890496547189, "grad_norm": 0.13822266459465027, "learning_rate": 8.457167916807907e-06, "loss": 1.0702, "step": 6700 }, { "epoch": 0.8819467280499835, "grad_norm": 0.1490565687417984, "learning_rate": 8.364993627693752e-06, "loss": 1.0506, "step": 6705 }, { "epoch": 0.8826044064452483, "grad_norm": 0.13421525061130524, "learning_rate": 8.27330246612208e-06, "loss": 1.1077, "step": 6710 }, { "epoch": 0.8832620848405129, "grad_norm": 0.12697631120681763, "learning_rate": 8.182094915516546e-06, "loss": 1.0822, "step": 6715 }, { "epoch": 0.8839197632357777, "grad_norm": 0.1256536841392517, "learning_rate": 8.091371456751006e-06, "loss": 1.0873, "step": 6720 }, { "epoch": 0.8845774416310425, "grad_norm": 0.13120722770690918, "learning_rate": 8.001132568147029e-06, "loss": 1.0635, "step": 6725 }, { "epoch": 0.8852351200263071, "grad_norm": 0.13410533964633942, "learning_rate": 7.911378725471419e-06, "loss": 1.0713, "step": 6730 }, { "epoch": 0.8858927984215719, "grad_norm": 0.12565448880195618, "learning_rate": 7.822110401933635e-06, "loss": 1.1031, "step": 6735 }, { "epoch": 0.8865504768168365, "grad_norm": 0.1179744154214859, "learning_rate": 7.733328068183343e-06, "loss": 1.1267, "step": 6740 }, { "epoch": 0.8872081552121013, "grad_norm": 0.12747325003147125, "learning_rate": 7.645032192307976e-06, "loss": 1.0478, "step": 6745 }, { "epoch": 0.887865833607366, "grad_norm": 0.13428014516830444, "learning_rate": 7.557223239830147e-06, "loss": 1.0664, "step": 6750 }, { "epoch": 0.8885235120026307, "grad_norm": 0.13759776949882507, "learning_rate": 7.469901673705238e-06, "loss": 1.0979, "step": 6755 }, { "epoch": 0.8891811903978954, "grad_norm": 0.12811830639839172, "learning_rate": 7.383067954319078e-06, "loss": 1.135, "step": 6760 }, { "epoch": 0.8898388687931601, "grad_norm": 0.13517077267169952, "learning_rate": 7.296722539485356e-06, "loss": 1.0651, "step": 6765 }, { "epoch": 0.8904965471884249, "grad_norm": 0.1311229020357132, "learning_rate": 7.2108658844432856e-06, "loss": 1.1017, "step": 6770 }, { "epoch": 0.8911542255836896, "grad_norm": 0.13094618916511536, "learning_rate": 7.125498441855183e-06, "loss": 1.1234, "step": 6775 }, { "epoch": 0.8918119039789543, "grad_norm": 0.1234104186296463, "learning_rate": 7.040620661804087e-06, "loss": 1.1003, "step": 6780 }, { "epoch": 0.892469582374219, "grad_norm": 0.1251104176044464, "learning_rate": 6.9562329917914e-06, "loss": 1.0802, "step": 6785 }, { "epoch": 0.8931272607694837, "grad_norm": 0.12144535034894943, "learning_rate": 6.872335876734503e-06, "loss": 1.112, "step": 6790 }, { "epoch": 0.8937849391647484, "grad_norm": 0.13212600350379944, "learning_rate": 6.788929758964446e-06, "loss": 1.1308, "step": 6795 }, { "epoch": 0.8944426175600132, "grad_norm": 0.12547363340854645, "learning_rate": 6.7060150782235935e-06, "loss": 1.0836, "step": 6800 }, { "epoch": 0.8951002959552778, "grad_norm": 0.14967547357082367, "learning_rate": 6.623592271663215e-06, "loss": 1.0622, "step": 6805 }, { "epoch": 0.8957579743505426, "grad_norm": 0.1382812112569809, "learning_rate": 6.541661773841401e-06, "loss": 1.0887, "step": 6810 }, { "epoch": 0.8964156527458073, "grad_norm": 0.15224428474903107, "learning_rate": 6.460224016720573e-06, "loss": 1.0914, "step": 6815 }, { "epoch": 0.897073331141072, "grad_norm": 0.1278597116470337, "learning_rate": 6.379279429665275e-06, "loss": 1.0723, "step": 6820 }, { "epoch": 0.8977310095363368, "grad_norm": 0.12243839353322983, "learning_rate": 6.2988284394399185e-06, "loss": 1.0864, "step": 6825 }, { "epoch": 0.8983886879316014, "grad_norm": 0.12724310159683228, "learning_rate": 6.218871470206533e-06, "loss": 1.1144, "step": 6830 }, { "epoch": 0.8990463663268662, "grad_norm": 0.1192445307970047, "learning_rate": 6.139408943522474e-06, "loss": 1.0633, "step": 6835 }, { "epoch": 0.8997040447221308, "grad_norm": 0.135670006275177, "learning_rate": 6.060441278338314e-06, "loss": 1.1398, "step": 6840 }, { "epoch": 0.9003617231173956, "grad_norm": 0.142990380525589, "learning_rate": 5.981968890995515e-06, "loss": 1.1055, "step": 6845 }, { "epoch": 0.9010194015126604, "grad_norm": 0.12928073108196259, "learning_rate": 5.9039921952243175e-06, "loss": 1.0566, "step": 6850 }, { "epoch": 0.901677079907925, "grad_norm": 0.14728300273418427, "learning_rate": 5.826511602141493e-06, "loss": 1.1261, "step": 6855 }, { "epoch": 0.9023347583031898, "grad_norm": 0.13266786932945251, "learning_rate": 5.749527520248221e-06, "loss": 1.0567, "step": 6860 }, { "epoch": 0.9029924366984544, "grad_norm": 0.11777203530073166, "learning_rate": 5.6730403554279275e-06, "loss": 1.0192, "step": 6865 }, { "epoch": 0.9036501150937192, "grad_norm": 0.12118373066186905, "learning_rate": 5.5970505109441576e-06, "loss": 1.0393, "step": 6870 }, { "epoch": 0.9043077934889839, "grad_norm": 0.1274781972169876, "learning_rate": 5.521558387438419e-06, "loss": 1.134, "step": 6875 }, { "epoch": 0.9049654718842486, "grad_norm": 0.13446952402591705, "learning_rate": 5.4465643829280324e-06, "loss": 1.0798, "step": 6880 }, { "epoch": 0.9056231502795133, "grad_norm": 0.12585683166980743, "learning_rate": 5.372068892804194e-06, "loss": 1.0795, "step": 6885 }, { "epoch": 0.906280828674778, "grad_norm": 0.13548751175403595, "learning_rate": 5.298072309829683e-06, "loss": 1.0729, "step": 6890 }, { "epoch": 0.9069385070700428, "grad_norm": 0.13813109695911407, "learning_rate": 5.2245750241369355e-06, "loss": 1.0521, "step": 6895 }, { "epoch": 0.9075961854653075, "grad_norm": 0.13424000144004822, "learning_rate": 5.151577423225928e-06, "loss": 1.1241, "step": 6900 }, { "epoch": 0.9082538638605722, "grad_norm": 0.13467825949192047, "learning_rate": 5.079079891962147e-06, "loss": 1.0949, "step": 6905 }, { "epoch": 0.9089115422558369, "grad_norm": 0.12294696271419525, "learning_rate": 5.007082812574559e-06, "loss": 1.1083, "step": 6910 }, { "epoch": 0.9095692206511016, "grad_norm": 0.12151917815208435, "learning_rate": 4.935586564653605e-06, "loss": 1.0728, "step": 6915 }, { "epoch": 0.9102268990463663, "grad_norm": 0.12292072176933289, "learning_rate": 4.864591525149198e-06, "loss": 1.0953, "step": 6920 }, { "epoch": 0.9108845774416311, "grad_norm": 0.1301289051771164, "learning_rate": 4.794098068368713e-06, "loss": 1.0908, "step": 6925 }, { "epoch": 0.9115422558368957, "grad_norm": 0.12175721675157547, "learning_rate": 4.724106565974995e-06, "loss": 1.051, "step": 6930 }, { "epoch": 0.9121999342321605, "grad_norm": 0.1283656358718872, "learning_rate": 4.654617386984517e-06, "loss": 1.0659, "step": 6935 }, { "epoch": 0.9128576126274252, "grad_norm": 0.12970061600208282, "learning_rate": 4.585630897765281e-06, "loss": 1.0706, "step": 6940 }, { "epoch": 0.9135152910226899, "grad_norm": 0.13352157175540924, "learning_rate": 4.5171474620349875e-06, "loss": 1.0976, "step": 6945 }, { "epoch": 0.9141729694179547, "grad_norm": 0.12641961872577667, "learning_rate": 4.44916744085907e-06, "loss": 1.0693, "step": 6950 }, { "epoch": 0.9148306478132193, "grad_norm": 0.1266348958015442, "learning_rate": 4.381691192648818e-06, "loss": 1.073, "step": 6955 }, { "epoch": 0.9154883262084841, "grad_norm": 0.12315046042203903, "learning_rate": 4.314719073159468e-06, "loss": 1.1138, "step": 6960 }, { "epoch": 0.9161460046037487, "grad_norm": 0.12821537256240845, "learning_rate": 4.24825143548836e-06, "loss": 1.021, "step": 6965 }, { "epoch": 0.9168036829990135, "grad_norm": 0.12609626352787018, "learning_rate": 4.182288630073028e-06, "loss": 1.0788, "step": 6970 }, { "epoch": 0.9174613613942783, "grad_norm": 0.12676601111888885, "learning_rate": 4.116831004689381e-06, "loss": 1.0912, "step": 6975 }, { "epoch": 0.9181190397895429, "grad_norm": 0.13219903409481049, "learning_rate": 4.05187890444989e-06, "loss": 1.1152, "step": 6980 }, { "epoch": 0.9187767181848077, "grad_norm": 0.12630099058151245, "learning_rate": 3.987432671801694e-06, "loss": 1.1112, "step": 6985 }, { "epoch": 0.9194343965800723, "grad_norm": 0.1281658113002777, "learning_rate": 3.923492646524885e-06, "loss": 1.1186, "step": 6990 }, { "epoch": 0.9200920749753371, "grad_norm": 0.12938450276851654, "learning_rate": 3.860059165730678e-06, "loss": 1.1492, "step": 6995 }, { "epoch": 0.9207497533706018, "grad_norm": 0.12509390711784363, "learning_rate": 3.79713256385964e-06, "loss": 1.1089, "step": 7000 }, { "epoch": 0.9214074317658665, "grad_norm": 0.12819696962833405, "learning_rate": 3.734713172679838e-06, "loss": 1.0953, "step": 7005 }, { "epoch": 0.9220651101611312, "grad_norm": 0.12569569051265717, "learning_rate": 3.6728013212852797e-06, "loss": 1.0832, "step": 7010 }, { "epoch": 0.9227227885563959, "grad_norm": 0.13164682686328888, "learning_rate": 3.6113973360940134e-06, "loss": 1.125, "step": 7015 }, { "epoch": 0.9233804669516607, "grad_norm": 0.1335838884115219, "learning_rate": 3.5505015408464627e-06, "loss": 1.1125, "step": 7020 }, { "epoch": 0.9240381453469253, "grad_norm": 0.13036727905273438, "learning_rate": 3.490114256603727e-06, "loss": 1.0643, "step": 7025 }, { "epoch": 0.9246958237421901, "grad_norm": 0.12281776964664459, "learning_rate": 3.4302358017458845e-06, "loss": 1.0435, "step": 7030 }, { "epoch": 0.9253535021374548, "grad_norm": 0.11984923481941223, "learning_rate": 3.3708664919703147e-06, "loss": 1.0652, "step": 7035 }, { "epoch": 0.9260111805327195, "grad_norm": 0.1248653307557106, "learning_rate": 3.3120066402899887e-06, "loss": 1.0823, "step": 7040 }, { "epoch": 0.9266688589279842, "grad_norm": 0.1274206042289734, "learning_rate": 3.2536565570319477e-06, "loss": 1.0864, "step": 7045 }, { "epoch": 0.9273265373232489, "grad_norm": 0.12672635912895203, "learning_rate": 3.1958165498355063e-06, "loss": 1.1243, "step": 7050 }, { "epoch": 0.9279842157185136, "grad_norm": 0.12420553714036942, "learning_rate": 3.1384869236506854e-06, "loss": 1.0381, "step": 7055 }, { "epoch": 0.9286418941137784, "grad_norm": 0.13672147691249847, "learning_rate": 3.0816679807367132e-06, "loss": 1.082, "step": 7060 }, { "epoch": 0.929299572509043, "grad_norm": 0.12322887778282166, "learning_rate": 3.0253600206602504e-06, "loss": 1.0682, "step": 7065 }, { "epoch": 0.9299572509043078, "grad_norm": 0.12149498611688614, "learning_rate": 2.969563340293957e-06, "loss": 1.1128, "step": 7070 }, { "epoch": 0.9306149292995725, "grad_norm": 0.12750664353370667, "learning_rate": 2.91427823381486e-06, "loss": 1.0487, "step": 7075 }, { "epoch": 0.9312726076948372, "grad_norm": 0.12178990244865417, "learning_rate": 2.859504992702777e-06, "loss": 1.1289, "step": 7080 }, { "epoch": 0.931930286090102, "grad_norm": 0.1273735910654068, "learning_rate": 2.8052439057388636e-06, "loss": 1.1021, "step": 7085 }, { "epoch": 0.9325879644853666, "grad_norm": 0.13085977733135223, "learning_rate": 2.7514952590040222e-06, "loss": 1.0876, "step": 7090 }, { "epoch": 0.9332456428806314, "grad_norm": 0.11890529841184616, "learning_rate": 2.6982593358774288e-06, "loss": 1.0612, "step": 7095 }, { "epoch": 0.933903321275896, "grad_norm": 0.11826446652412415, "learning_rate": 2.6455364170350106e-06, "loss": 1.1222, "step": 7100 }, { "epoch": 0.9345609996711608, "grad_norm": 0.13533374667167664, "learning_rate": 2.593326780447969e-06, "loss": 1.114, "step": 7105 }, { "epoch": 0.9352186780664256, "grad_norm": 0.142277792096138, "learning_rate": 2.541630701381359e-06, "loss": 1.1075, "step": 7110 }, { "epoch": 0.9358763564616902, "grad_norm": 0.12411517649888992, "learning_rate": 2.4904484523925573e-06, "loss": 1.087, "step": 7115 }, { "epoch": 0.936534034856955, "grad_norm": 0.1550077199935913, "learning_rate": 2.4397803033299415e-06, "loss": 1.0613, "step": 7120 }, { "epoch": 0.9371917132522196, "grad_norm": 0.11703425645828247, "learning_rate": 2.3896265213313562e-06, "loss": 1.0448, "step": 7125 }, { "epoch": 0.9378493916474844, "grad_norm": 0.12909579277038574, "learning_rate": 2.339987370822705e-06, "loss": 1.0388, "step": 7130 }, { "epoch": 0.9385070700427491, "grad_norm": 0.12960663437843323, "learning_rate": 2.2908631135166836e-06, "loss": 1.0741, "step": 7135 }, { "epoch": 0.9391647484380138, "grad_norm": 0.1335333287715912, "learning_rate": 2.2422540084112596e-06, "loss": 1.0847, "step": 7140 }, { "epoch": 0.9398224268332785, "grad_norm": 0.12828828394412994, "learning_rate": 2.1941603117883846e-06, "loss": 1.0485, "step": 7145 }, { "epoch": 0.9404801052285432, "grad_norm": 0.13621141016483307, "learning_rate": 2.1465822772126056e-06, "loss": 1.0639, "step": 7150 }, { "epoch": 0.941137783623808, "grad_norm": 0.1261686235666275, "learning_rate": 2.099520155529744e-06, "loss": 1.0903, "step": 7155 }, { "epoch": 0.9417954620190727, "grad_norm": 0.1290721446275711, "learning_rate": 2.0529741948655866e-06, "loss": 1.0896, "step": 7160 }, { "epoch": 0.9424531404143374, "grad_norm": 0.12567773461341858, "learning_rate": 2.006944640624542e-06, "loss": 1.0835, "step": 7165 }, { "epoch": 0.9431108188096021, "grad_norm": 0.1283196061849594, "learning_rate": 1.961431735488395e-06, "loss": 1.031, "step": 7170 }, { "epoch": 0.9437684972048668, "grad_norm": 0.12203031778335571, "learning_rate": 1.916435719414977e-06, "loss": 1.0699, "step": 7175 }, { "epoch": 0.9444261756001315, "grad_norm": 0.1307421177625656, "learning_rate": 1.8719568296369116e-06, "loss": 1.0813, "step": 7180 }, { "epoch": 0.9450838539953963, "grad_norm": 0.1171189621090889, "learning_rate": 1.8279953006604233e-06, "loss": 1.1123, "step": 7185 }, { "epoch": 0.9457415323906609, "grad_norm": 0.12556134164333344, "learning_rate": 1.78455136426402e-06, "loss": 1.1423, "step": 7190 }, { "epoch": 0.9463992107859257, "grad_norm": 0.14397232234477997, "learning_rate": 1.7416252494972918e-06, "loss": 1.1076, "step": 7195 }, { "epoch": 0.9470568891811904, "grad_norm": 0.11753013730049133, "learning_rate": 1.6992171826797465e-06, "loss": 1.1023, "step": 7200 }, { "epoch": 0.9477145675764551, "grad_norm": 0.12375151365995407, "learning_rate": 1.6573273873995655e-06, "loss": 1.0309, "step": 7205 }, { "epoch": 0.9483722459717199, "grad_norm": 0.12732307612895966, "learning_rate": 1.6159560845124488e-06, "loss": 1.0797, "step": 7210 }, { "epoch": 0.9490299243669845, "grad_norm": 0.12448208779096603, "learning_rate": 1.5751034921404617e-06, "loss": 1.0694, "step": 7215 }, { "epoch": 0.9496876027622493, "grad_norm": 0.13296104967594147, "learning_rate": 1.5347698256708675e-06, "loss": 1.1185, "step": 7220 }, { "epoch": 0.9503452811575139, "grad_norm": 0.13507743179798126, "learning_rate": 1.494955297754974e-06, "loss": 1.0458, "step": 7225 }, { "epoch": 0.9510029595527787, "grad_norm": 0.12835751473903656, "learning_rate": 1.455660118307045e-06, "loss": 1.1151, "step": 7230 }, { "epoch": 0.9516606379480435, "grad_norm": 0.13565976917743683, "learning_rate": 1.41688449450319e-06, "loss": 1.102, "step": 7235 }, { "epoch": 0.9523183163433081, "grad_norm": 0.12277020514011383, "learning_rate": 1.3786286307802432e-06, "loss": 1.0782, "step": 7240 }, { "epoch": 0.9529759947385729, "grad_norm": 0.1334349364042282, "learning_rate": 1.3408927288347417e-06, "loss": 1.0918, "step": 7245 }, { "epoch": 0.9536336731338375, "grad_norm": 0.14164070785045624, "learning_rate": 1.3036769876218047e-06, "loss": 1.0708, "step": 7250 }, { "epoch": 0.9542913515291023, "grad_norm": 0.12703892588615417, "learning_rate": 1.2669816033540672e-06, "loss": 1.0617, "step": 7255 }, { "epoch": 0.954949029924367, "grad_norm": 0.12203647196292877, "learning_rate": 1.2308067695007697e-06, "loss": 1.023, "step": 7260 }, { "epoch": 0.9556067083196317, "grad_norm": 0.12659655511379242, "learning_rate": 1.1951526767865818e-06, "loss": 1.0768, "step": 7265 }, { "epoch": 0.9562643867148964, "grad_norm": 0.1347762942314148, "learning_rate": 1.1600195131906911e-06, "loss": 1.0467, "step": 7270 }, { "epoch": 0.9569220651101611, "grad_norm": 0.1408466249704361, "learning_rate": 1.1254074639458045e-06, "loss": 1.1221, "step": 7275 }, { "epoch": 0.9575797435054259, "grad_norm": 0.12063086777925491, "learning_rate": 1.091316711537127e-06, "loss": 1.0667, "step": 7280 }, { "epoch": 0.9582374219006906, "grad_norm": 0.14076277613639832, "learning_rate": 1.0577474357014617e-06, "loss": 1.0629, "step": 7285 }, { "epoch": 0.9588951002959553, "grad_norm": 0.14120733737945557, "learning_rate": 1.0246998134261886e-06, "loss": 1.1094, "step": 7290 }, { "epoch": 0.95955277869122, "grad_norm": 0.12422536313533783, "learning_rate": 9.921740189484107e-07, "loss": 1.0735, "step": 7295 }, { "epoch": 0.9602104570864847, "grad_norm": 0.12713931500911713, "learning_rate": 9.601702237539866e-07, "loss": 1.0711, "step": 7300 }, { "epoch": 0.9608681354817494, "grad_norm": 0.12748518586158752, "learning_rate": 9.28688596576599e-07, "loss": 1.1069, "step": 7305 }, { "epoch": 0.9615258138770142, "grad_norm": 0.1322038769721985, "learning_rate": 8.977293033969658e-07, "loss": 1.0894, "step": 7310 }, { "epoch": 0.9621834922722788, "grad_norm": 0.11743160337209702, "learning_rate": 8.672925074418525e-07, "loss": 1.0787, "step": 7315 }, { "epoch": 0.9628411706675436, "grad_norm": 0.12396471947431564, "learning_rate": 8.373783691832726e-07, "loss": 1.0734, "step": 7320 }, { "epoch": 0.9634988490628082, "grad_norm": 0.1272258162498474, "learning_rate": 8.079870463376549e-07, "loss": 1.1123, "step": 7325 }, { "epoch": 0.964156527458073, "grad_norm": 0.1307511180639267, "learning_rate": 7.791186938649552e-07, "loss": 1.0562, "step": 7330 }, { "epoch": 0.9648142058533378, "grad_norm": 0.13303637504577637, "learning_rate": 7.507734639678687e-07, "loss": 1.1025, "step": 7335 }, { "epoch": 0.9654718842486024, "grad_norm": 0.12764723598957062, "learning_rate": 7.229515060910741e-07, "loss": 1.0536, "step": 7340 }, { "epoch": 0.9661295626438672, "grad_norm": 0.12426929175853729, "learning_rate": 6.95652966920346e-07, "loss": 1.1073, "step": 7345 }, { "epoch": 0.9667872410391318, "grad_norm": 0.13067397475242615, "learning_rate": 6.688779903818664e-07, "loss": 1.1007, "step": 7350 }, { "epoch": 0.9674449194343966, "grad_norm": 0.12941089272499084, "learning_rate": 6.426267176414369e-07, "loss": 1.0776, "step": 7355 }, { "epoch": 0.9681025978296613, "grad_norm": 0.12120051681995392, "learning_rate": 6.168992871037227e-07, "loss": 1.0984, "step": 7360 }, { "epoch": 0.968760276224926, "grad_norm": 0.13271667063236237, "learning_rate": 5.91695834411543e-07, "loss": 1.0592, "step": 7365 }, { "epoch": 0.9694179546201908, "grad_norm": 0.13047733902931213, "learning_rate": 5.670164924451382e-07, "loss": 1.0935, "step": 7370 }, { "epoch": 0.9700756330154554, "grad_norm": 0.1226421594619751, "learning_rate": 5.428613913214919e-07, "loss": 1.1125, "step": 7375 }, { "epoch": 0.9707333114107202, "grad_norm": 0.12262436747550964, "learning_rate": 5.192306583936102e-07, "loss": 1.0297, "step": 7380 }, { "epoch": 0.9713909898059849, "grad_norm": 0.1315319836139679, "learning_rate": 4.961244182498992e-07, "loss": 1.0621, "step": 7385 }, { "epoch": 0.9720486682012496, "grad_norm": 0.1439238339662552, "learning_rate": 4.73542792713455e-07, "loss": 1.0649, "step": 7390 }, { "epoch": 0.9727063465965143, "grad_norm": 0.13110719621181488, "learning_rate": 4.5148590084145294e-07, "loss": 1.0915, "step": 7395 }, { "epoch": 0.973364024991779, "grad_norm": 0.12557129561901093, "learning_rate": 4.2995385892452553e-07, "loss": 1.0541, "step": 7400 }, { "epoch": 0.9740217033870437, "grad_norm": 0.12438076734542847, "learning_rate": 4.0894678048611913e-07, "loss": 1.0836, "step": 7405 }, { "epoch": 0.9746793817823084, "grad_norm": 0.12633442878723145, "learning_rate": 3.88464776281916e-07, "loss": 1.0957, "step": 7410 }, { "epoch": 0.9753370601775732, "grad_norm": 0.1273554116487503, "learning_rate": 3.6850795429926865e-07, "loss": 1.0436, "step": 7415 }, { "epoch": 0.9759947385728379, "grad_norm": 0.11931316554546356, "learning_rate": 3.4907641975658876e-07, "loss": 1.0458, "step": 7420 }, { "epoch": 0.9766524169681026, "grad_norm": 0.12170768529176712, "learning_rate": 3.301702751028146e-07, "loss": 1.0764, "step": 7425 }, { "epoch": 0.9773100953633673, "grad_norm": 0.13220398128032684, "learning_rate": 3.1178962001687794e-07, "loss": 1.1188, "step": 7430 }, { "epoch": 0.977967773758632, "grad_norm": 0.1284247487783432, "learning_rate": 2.939345514071601e-07, "loss": 1.0551, "step": 7435 }, { "epoch": 0.9786254521538967, "grad_norm": 0.1260274201631546, "learning_rate": 2.766051634110145e-07, "loss": 1.0836, "step": 7440 }, { "epoch": 0.9792831305491615, "grad_norm": 0.12609007954597473, "learning_rate": 2.598015473942117e-07, "loss": 1.0765, "step": 7445 }, { "epoch": 0.9799408089444261, "grad_norm": 0.12903781235218048, "learning_rate": 2.4352379195051733e-07, "loss": 1.0652, "step": 7450 }, { "epoch": 0.9805984873396909, "grad_norm": 0.12213218212127686, "learning_rate": 2.2777198290119263e-07, "loss": 1.0491, "step": 7455 }, { "epoch": 0.9812561657349556, "grad_norm": 0.11681472510099411, "learning_rate": 2.1254620329456132e-07, "loss": 1.1194, "step": 7460 }, { "epoch": 0.9819138441302203, "grad_norm": 0.12463615834712982, "learning_rate": 1.978465334055324e-07, "loss": 1.0611, "step": 7465 }, { "epoch": 0.9825715225254851, "grad_norm": 0.13764384388923645, "learning_rate": 1.8367305073524466e-07, "loss": 1.1117, "step": 7470 }, { "epoch": 0.9832292009207497, "grad_norm": 0.1318746656179428, "learning_rate": 1.7002583001058948e-07, "loss": 1.1106, "step": 7475 }, { "epoch": 0.9838868793160145, "grad_norm": 0.12471514195203781, "learning_rate": 1.569049431838443e-07, "loss": 1.0913, "step": 7480 }, { "epoch": 0.9845445577112791, "grad_norm": 0.12861767411231995, "learning_rate": 1.4431045943231746e-07, "loss": 1.1446, "step": 7485 }, { "epoch": 0.9852022361065439, "grad_norm": 0.12710155546665192, "learning_rate": 1.3224244515792628e-07, "loss": 1.089, "step": 7490 }, { "epoch": 0.9858599145018087, "grad_norm": 0.12626656889915466, "learning_rate": 1.2070096398691945e-07, "loss": 1.0873, "step": 7495 }, { "epoch": 0.9865175928970733, "grad_norm": 0.1263827234506607, "learning_rate": 1.096860767694885e-07, "loss": 1.1289, "step": 7500 }, { "epoch": 0.9871752712923381, "grad_norm": 0.12817326188087463, "learning_rate": 9.91978415794459e-08, "loss": 1.0784, "step": 7505 }, { "epoch": 0.9878329496876027, "grad_norm": 0.12611013650894165, "learning_rate": 8.923631371394736e-08, "loss": 1.1583, "step": 7510 }, { "epoch": 0.9884906280828675, "grad_norm": 0.1285797357559204, "learning_rate": 7.98015456932033e-08, "loss": 1.055, "step": 7515 }, { "epoch": 0.9891483064781322, "grad_norm": 0.12791886925697327, "learning_rate": 7.089358726015682e-08, "loss": 1.1116, "step": 7520 }, { "epoch": 0.9898059848733969, "grad_norm": 0.13284216821193695, "learning_rate": 6.251248538028387e-08, "loss": 1.0815, "step": 7525 }, { "epoch": 0.9904636632686616, "grad_norm": 0.11763113737106323, "learning_rate": 5.4658284241271283e-08, "loss": 1.079, "step": 7530 }, { "epoch": 0.9911213416639263, "grad_norm": 0.1247347965836525, "learning_rate": 4.7331025252872475e-08, "loss": 1.0838, "step": 7535 }, { "epoch": 0.991779020059191, "grad_norm": 0.12390042841434479, "learning_rate": 4.053074704659654e-08, "loss": 1.0413, "step": 7540 }, { "epoch": 0.9924366984544558, "grad_norm": 0.12448058277368546, "learning_rate": 3.425748547559726e-08, "loss": 1.0017, "step": 7545 }, { "epoch": 0.9930943768497205, "grad_norm": 0.12865927815437317, "learning_rate": 2.851127361439554e-08, "loss": 1.0514, "step": 7550 }, { "epoch": 0.9937520552449852, "grad_norm": 0.11859242618083954, "learning_rate": 2.3292141758768372e-08, "loss": 1.0596, "step": 7555 }, { "epoch": 0.9944097336402499, "grad_norm": 0.12460646778345108, "learning_rate": 1.8600117425549014e-08, "loss": 1.0641, "step": 7560 }, { "epoch": 0.9950674120355146, "grad_norm": 0.1188538447022438, "learning_rate": 1.4435225352538162e-08, "loss": 1.0315, "step": 7565 }, { "epoch": 0.9957250904307794, "grad_norm": 0.14251664280891418, "learning_rate": 1.079748749828191e-08, "loss": 1.0872, "step": 7570 }, { "epoch": 0.996382768826044, "grad_norm": 0.13406243920326233, "learning_rate": 7.686923042060645e-09, "loss": 1.1053, "step": 7575 }, { "epoch": 0.9970404472213088, "grad_norm": 0.13371390104293823, "learning_rate": 5.103548383700307e-09, "loss": 1.0606, "step": 7580 }, { "epoch": 0.9976981256165735, "grad_norm": 0.12253880500793457, "learning_rate": 3.0473771435390872e-09, "loss": 1.0191, "step": 7585 }, { "epoch": 0.9983558040118382, "grad_norm": 0.12514935433864594, "learning_rate": 1.5184201623386074e-09, "loss": 1.094, "step": 7590 }, { "epoch": 0.999013482407103, "grad_norm": 0.12491371482610703, "learning_rate": 5.166855012062044e-10, "loss": 1.1313, "step": 7595 }, { "epoch": 0.9996711608023676, "grad_norm": 0.1304769665002823, "learning_rate": 4.217844161713558e-11, "loss": 1.1227, "step": 7600 }, { "epoch": 0.9999342321604735, "eval_loss": 1.089726448059082, "eval_runtime": 1166.2979, "eval_samples_per_second": 11.538, "eval_steps_per_second": 0.722, "step": 7602 }, { "epoch": 0.9999342321604735, "step": 7602, "total_flos": 1.127965466700651e+19, "train_loss": 1.1009578104553583, "train_runtime": 43291.062, "train_samples_per_second": 2.81, "train_steps_per_second": 0.176 } ], "logging_steps": 5, "max_steps": 7602, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.127965466700651e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }