{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.014954172696575, "eval_steps": 500, "global_step": 12500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00120598166907863, "grad_norm": 22.5, "learning_rate": 3e-06, "loss": 6.2443, "step": 5 }, { "epoch": 0.00241196333815726, "grad_norm": 19.75, "learning_rate": 6e-06, "loss": 6.2538, "step": 10 }, { "epoch": 0.00361794500723589, "grad_norm": 20.375, "learning_rate": 9e-06, "loss": 6.2059, "step": 15 }, { "epoch": 0.00482392667631452, "grad_norm": 15.0625, "learning_rate": 1.2e-05, "loss": 5.8969, "step": 20 }, { "epoch": 0.00602990834539315, "grad_norm": 10.8125, "learning_rate": 1.5e-05, "loss": 5.6284, "step": 25 }, { "epoch": 0.00723589001447178, "grad_norm": 11.8125, "learning_rate": 1.8e-05, "loss": 5.3795, "step": 30 }, { "epoch": 0.00844187168355041, "grad_norm": 6.21875, "learning_rate": 2.1e-05, "loss": 5.0908, "step": 35 }, { "epoch": 0.00964785335262904, "grad_norm": 7.6875, "learning_rate": 2.4e-05, "loss": 5.075, "step": 40 }, { "epoch": 0.01085383502170767, "grad_norm": 6.3125, "learning_rate": 2.7000000000000002e-05, "loss": 4.9373, "step": 45 }, { "epoch": 0.0120598166907863, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 4.6968, "step": 50 }, { "epoch": 0.01326579835986493, "grad_norm": 5.4375, "learning_rate": 3e-05, "loss": 4.6801, "step": 55 }, { "epoch": 0.01447178002894356, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.5958, "step": 60 }, { "epoch": 0.01567776169802219, "grad_norm": 5.25, "learning_rate": 3e-05, "loss": 4.6229, "step": 65 }, { "epoch": 0.01688374336710082, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.6261, "step": 70 }, { "epoch": 0.01808972503617945, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.5555, "step": 75 }, { "epoch": 0.01929570670525808, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.4618, "step": 80 }, { "epoch": 0.02050168837433671, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.4834, "step": 85 }, { "epoch": 0.02170767004341534, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.5056, "step": 90 }, { "epoch": 0.02291365171249397, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.3603, "step": 95 }, { "epoch": 0.0241196333815726, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 4.4798, "step": 100 }, { "epoch": 0.02532561505065123, "grad_norm": 6.0625, "learning_rate": 3e-05, "loss": 4.5368, "step": 105 }, { "epoch": 0.02653159671972986, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.3459, "step": 110 }, { "epoch": 0.02773757838880849, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.4357, "step": 115 }, { "epoch": 0.02894356005788712, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.4581, "step": 120 }, { "epoch": 0.03014954172696575, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.4183, "step": 125 }, { "epoch": 0.03135552339604438, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.4097, "step": 130 }, { "epoch": 0.03256150506512301, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.3926, "step": 135 }, { "epoch": 0.03376748673420164, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.4114, "step": 140 }, { "epoch": 0.03497346840328027, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.4665, "step": 145 }, { "epoch": 0.0361794500723589, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.2466, "step": 150 }, { "epoch": 0.03738543174143753, "grad_norm": 6.1875, "learning_rate": 3e-05, "loss": 4.4846, "step": 155 }, { "epoch": 0.03859141341051616, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.3888, "step": 160 }, { "epoch": 0.03979739507959479, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.3876, "step": 165 }, { "epoch": 0.04100337674867342, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.5379, "step": 170 }, { "epoch": 0.04220935841775205, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.3969, "step": 175 }, { "epoch": 0.04341534008683068, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.3758, "step": 180 }, { "epoch": 0.04462132175590931, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.4679, "step": 185 }, { "epoch": 0.04582730342498794, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.2303, "step": 190 }, { "epoch": 0.04703328509406657, "grad_norm": 2.703125, "learning_rate": 3e-05, "loss": 4.4293, "step": 195 }, { "epoch": 0.0482392667631452, "grad_norm": 2.375, "learning_rate": 3e-05, "loss": 4.3641, "step": 200 }, { "epoch": 0.04944524843222383, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.3563, "step": 205 }, { "epoch": 0.05065123010130246, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.2946, "step": 210 }, { "epoch": 0.05185721177038109, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.4174, "step": 215 }, { "epoch": 0.05306319343945972, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.3575, "step": 220 }, { "epoch": 0.05426917510853835, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.475, "step": 225 }, { "epoch": 0.05547515677761698, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.2004, "step": 230 }, { "epoch": 0.05668113844669561, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.4246, "step": 235 }, { "epoch": 0.05788712011577424, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.3133, "step": 240 }, { "epoch": 0.05909310178485287, "grad_norm": 5.3125, "learning_rate": 3e-05, "loss": 4.1943, "step": 245 }, { "epoch": 0.0602990834539315, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.2584, "step": 250 }, { "epoch": 0.06150506512301013, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.2586, "step": 255 }, { "epoch": 0.06271104679208876, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 4.2761, "step": 260 }, { "epoch": 0.0639170284611674, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.3398, "step": 265 }, { "epoch": 0.06512301013024602, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.234, "step": 270 }, { "epoch": 0.06632899179932465, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.2857, "step": 275 }, { "epoch": 0.06753497346840329, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.2967, "step": 280 }, { "epoch": 0.06874095513748191, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.3572, "step": 285 }, { "epoch": 0.06994693680656054, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.3631, "step": 290 }, { "epoch": 0.07115291847563918, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.1602, "step": 295 }, { "epoch": 0.0723589001447178, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.2951, "step": 300 }, { "epoch": 0.07356488181379643, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.3617, "step": 305 }, { "epoch": 0.07477086348287507, "grad_norm": 2.921875, "learning_rate": 3e-05, "loss": 4.2659, "step": 310 }, { "epoch": 0.07597684515195369, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.394, "step": 315 }, { "epoch": 0.07718282682103232, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.1805, "step": 320 }, { "epoch": 0.07838880849011096, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.2425, "step": 325 }, { "epoch": 0.07959479015918958, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.2048, "step": 330 }, { "epoch": 0.08080077182826821, "grad_norm": 2.609375, "learning_rate": 3e-05, "loss": 4.2608, "step": 335 }, { "epoch": 0.08200675349734685, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.2567, "step": 340 }, { "epoch": 0.08321273516642547, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.3995, "step": 345 }, { "epoch": 0.0844187168355041, "grad_norm": 5.8125, "learning_rate": 3e-05, "loss": 4.3725, "step": 350 }, { "epoch": 0.08562469850458274, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.3952, "step": 355 }, { "epoch": 0.08683068017366136, "grad_norm": 5.65625, "learning_rate": 3e-05, "loss": 4.252, "step": 360 }, { "epoch": 0.08803666184273999, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 4.3442, "step": 365 }, { "epoch": 0.08924264351181863, "grad_norm": 5.125, "learning_rate": 3e-05, "loss": 4.2784, "step": 370 }, { "epoch": 0.09044862518089725, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.1631, "step": 375 }, { "epoch": 0.09165460684997588, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.35, "step": 380 }, { "epoch": 0.09286058851905452, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.2332, "step": 385 }, { "epoch": 0.09406657018813314, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.2496, "step": 390 }, { "epoch": 0.09527255185721177, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 4.25, "step": 395 }, { "epoch": 0.0964785335262904, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.3593, "step": 400 }, { "epoch": 0.09768451519536903, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.3217, "step": 405 }, { "epoch": 0.09889049686444766, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.2979, "step": 410 }, { "epoch": 0.1000964785335263, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.3535, "step": 415 }, { "epoch": 0.10130246020260492, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.2129, "step": 420 }, { "epoch": 0.10250844187168355, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.2214, "step": 425 }, { "epoch": 0.10371442354076219, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.5381, "step": 430 }, { "epoch": 0.1049204052098408, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.1932, "step": 435 }, { "epoch": 0.10612638687891944, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.183, "step": 440 }, { "epoch": 0.10733236854799807, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.2582, "step": 445 }, { "epoch": 0.1085383502170767, "grad_norm": 2.765625, "learning_rate": 3e-05, "loss": 4.3375, "step": 450 }, { "epoch": 0.10974433188615533, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.2944, "step": 455 }, { "epoch": 0.11095031355523396, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.2933, "step": 460 }, { "epoch": 0.11215629522431259, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.3441, "step": 465 }, { "epoch": 0.11336227689339122, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 4.2824, "step": 470 }, { "epoch": 0.11456825856246985, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.2159, "step": 475 }, { "epoch": 0.11577424023154848, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.4054, "step": 480 }, { "epoch": 0.11698022190062711, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.4035, "step": 485 }, { "epoch": 0.11818620356970574, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.2271, "step": 490 }, { "epoch": 0.11939218523878437, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.237, "step": 495 }, { "epoch": 0.120598166907863, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.313, "step": 500 }, { "epoch": 0.12180414857694163, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.1403, "step": 505 }, { "epoch": 0.12301013024602026, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.1213, "step": 510 }, { "epoch": 0.12421611191509889, "grad_norm": 2.78125, "learning_rate": 3e-05, "loss": 4.2051, "step": 515 }, { "epoch": 0.12542209358417752, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.229, "step": 520 }, { "epoch": 0.12662807525325614, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.3853, "step": 525 }, { "epoch": 0.1278340569223348, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.2949, "step": 530 }, { "epoch": 0.12904003859141341, "grad_norm": 2.46875, "learning_rate": 3e-05, "loss": 4.1855, "step": 535 }, { "epoch": 0.13024602026049203, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.2316, "step": 540 }, { "epoch": 0.13145200192957068, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.2915, "step": 545 }, { "epoch": 0.1326579835986493, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.3255, "step": 550 }, { "epoch": 0.13386396526772792, "grad_norm": 6.71875, "learning_rate": 3e-05, "loss": 4.2751, "step": 555 }, { "epoch": 0.13506994693680657, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.3209, "step": 560 }, { "epoch": 0.1362759286058852, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.2309, "step": 565 }, { "epoch": 0.13748191027496381, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.1827, "step": 570 }, { "epoch": 0.13868789194404246, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.2377, "step": 575 }, { "epoch": 0.13989387361312108, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.3084, "step": 580 }, { "epoch": 0.1410998552821997, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.2637, "step": 585 }, { "epoch": 0.14230583695127835, "grad_norm": 2.796875, "learning_rate": 3e-05, "loss": 4.1406, "step": 590 }, { "epoch": 0.14351181862035697, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.2476, "step": 595 }, { "epoch": 0.1447178002894356, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.326, "step": 600 }, { "epoch": 0.14592378195851424, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.1569, "step": 605 }, { "epoch": 0.14712976362759286, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.2639, "step": 610 }, { "epoch": 0.14833574529667148, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.2063, "step": 615 }, { "epoch": 0.14954172696575013, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.1874, "step": 620 }, { "epoch": 0.15074770863482875, "grad_norm": 2.703125, "learning_rate": 3e-05, "loss": 4.1462, "step": 625 }, { "epoch": 0.15195369030390737, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 4.1116, "step": 630 }, { "epoch": 0.15315967197298602, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.2557, "step": 635 }, { "epoch": 0.15436565364206464, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.2758, "step": 640 }, { "epoch": 0.15557163531114326, "grad_norm": 2.765625, "learning_rate": 3e-05, "loss": 4.3447, "step": 645 }, { "epoch": 0.1567776169802219, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 4.2642, "step": 650 }, { "epoch": 0.15798359864930053, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.3163, "step": 655 }, { "epoch": 0.15918958031837915, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.3545, "step": 660 }, { "epoch": 0.1603955619874578, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.2177, "step": 665 }, { "epoch": 0.16160154365653642, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.0781, "step": 670 }, { "epoch": 0.16280752532561504, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.1701, "step": 675 }, { "epoch": 0.1640135069946937, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.2681, "step": 680 }, { "epoch": 0.1652194886637723, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.2961, "step": 685 }, { "epoch": 0.16642547033285093, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.2781, "step": 690 }, { "epoch": 0.16763145200192958, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.1237, "step": 695 }, { "epoch": 0.1688374336710082, "grad_norm": 3.953125, "learning_rate": 3e-05, "loss": 4.1292, "step": 700 }, { "epoch": 0.17004341534008682, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.3216, "step": 705 }, { "epoch": 0.17124939700916547, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 4.237, "step": 710 }, { "epoch": 0.1724553786782441, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.1728, "step": 715 }, { "epoch": 0.1736613603473227, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.3643, "step": 720 }, { "epoch": 0.17486734201640136, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.2338, "step": 725 }, { "epoch": 0.17607332368547998, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.2015, "step": 730 }, { "epoch": 0.1772793053545586, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.2167, "step": 735 }, { "epoch": 0.17848528702363725, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.1632, "step": 740 }, { "epoch": 0.17969126869271587, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.2327, "step": 745 }, { "epoch": 0.1808972503617945, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.3116, "step": 750 }, { "epoch": 0.18210323203087314, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.258, "step": 755 }, { "epoch": 0.18330921369995176, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.1931, "step": 760 }, { "epoch": 0.18451519536903038, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.1596, "step": 765 }, { "epoch": 0.18572117703810903, "grad_norm": 2.421875, "learning_rate": 3e-05, "loss": 4.1996, "step": 770 }, { "epoch": 0.18692715870718765, "grad_norm": 5.96875, "learning_rate": 3e-05, "loss": 4.1826, "step": 775 }, { "epoch": 0.18813314037626627, "grad_norm": 6.6875, "learning_rate": 3e-05, "loss": 4.2366, "step": 780 }, { "epoch": 0.18933912204534492, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.3849, "step": 785 }, { "epoch": 0.19054510371442354, "grad_norm": 2.765625, "learning_rate": 3e-05, "loss": 4.1976, "step": 790 }, { "epoch": 0.19175108538350216, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.2774, "step": 795 }, { "epoch": 0.1929570670525808, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.3555, "step": 800 }, { "epoch": 0.19416304872165943, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.2718, "step": 805 }, { "epoch": 0.19536903039073805, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.1201, "step": 810 }, { "epoch": 0.1965750120598167, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.1703, "step": 815 }, { "epoch": 0.19778099372889532, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.169, "step": 820 }, { "epoch": 0.19898697539797394, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.0994, "step": 825 }, { "epoch": 0.2001929570670526, "grad_norm": 2.6875, "learning_rate": 3e-05, "loss": 4.2446, "step": 830 }, { "epoch": 0.2013989387361312, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.1832, "step": 835 }, { "epoch": 0.20260492040520983, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.3235, "step": 840 }, { "epoch": 0.20381090207428848, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 4.3012, "step": 845 }, { "epoch": 0.2050168837433671, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.3252, "step": 850 }, { "epoch": 0.20622286541244572, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.1674, "step": 855 }, { "epoch": 0.20742884708152437, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.1337, "step": 860 }, { "epoch": 0.208634828750603, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.3821, "step": 865 }, { "epoch": 0.2098408104196816, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.3003, "step": 870 }, { "epoch": 0.21104679208876026, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.0001, "step": 875 }, { "epoch": 0.21225277375783888, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.2517, "step": 880 }, { "epoch": 0.2134587554269175, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.2317, "step": 885 }, { "epoch": 0.21466473709599615, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.2065, "step": 890 }, { "epoch": 0.21587071876507477, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.2405, "step": 895 }, { "epoch": 0.2170767004341534, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.2579, "step": 900 }, { "epoch": 0.21828268210323204, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.174, "step": 905 }, { "epoch": 0.21948866377231066, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.2155, "step": 910 }, { "epoch": 0.22069464544138928, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.1694, "step": 915 }, { "epoch": 0.22190062711046793, "grad_norm": 2.640625, "learning_rate": 3e-05, "loss": 4.1913, "step": 920 }, { "epoch": 0.22310660877954655, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0647, "step": 925 }, { "epoch": 0.22431259044862517, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.3277, "step": 930 }, { "epoch": 0.22551857211770382, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.0869, "step": 935 }, { "epoch": 0.22672455378678244, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.121, "step": 940 }, { "epoch": 0.22793053545586106, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.1617, "step": 945 }, { "epoch": 0.2291365171249397, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1112, "step": 950 }, { "epoch": 0.23034249879401833, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.2586, "step": 955 }, { "epoch": 0.23154848046309695, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.0603, "step": 960 }, { "epoch": 0.2327544621321756, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.2251, "step": 965 }, { "epoch": 0.23396044380125422, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.3597, "step": 970 }, { "epoch": 0.23516642547033284, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.3015, "step": 975 }, { "epoch": 0.2363724071394115, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.1425, "step": 980 }, { "epoch": 0.2375783888084901, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.2607, "step": 985 }, { "epoch": 0.23878437047756873, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.0987, "step": 990 }, { "epoch": 0.23999035214664738, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.2733, "step": 995 }, { "epoch": 0.241196333815726, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 4.1086, "step": 1000 }, { "epoch": 0.24240231548480462, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.2467, "step": 1005 }, { "epoch": 0.24360829715388327, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.2346, "step": 1010 }, { "epoch": 0.2448142788229619, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.2599, "step": 1015 }, { "epoch": 0.2460202604920405, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.1569, "step": 1020 }, { "epoch": 0.24722624216111916, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.2987, "step": 1025 }, { "epoch": 0.24843222383019778, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.1631, "step": 1030 }, { "epoch": 0.2496382054992764, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.1682, "step": 1035 }, { "epoch": 0.25084418716835505, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.216, "step": 1040 }, { "epoch": 0.2520501688374337, "grad_norm": 5.59375, "learning_rate": 3e-05, "loss": 4.1954, "step": 1045 }, { "epoch": 0.2532561505065123, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.1728, "step": 1050 }, { "epoch": 0.25446213217559094, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.206, "step": 1055 }, { "epoch": 0.2556681138446696, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 3.9989, "step": 1060 }, { "epoch": 0.2568740955137482, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.2045, "step": 1065 }, { "epoch": 0.25808007718282683, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.236, "step": 1070 }, { "epoch": 0.2592860588519055, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.1638, "step": 1075 }, { "epoch": 0.26049204052098407, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.2251, "step": 1080 }, { "epoch": 0.2616980221900627, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.2282, "step": 1085 }, { "epoch": 0.26290400385914137, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.1292, "step": 1090 }, { "epoch": 0.26410998552821996, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.1986, "step": 1095 }, { "epoch": 0.2653159671972986, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.1354, "step": 1100 }, { "epoch": 0.26652194886637726, "grad_norm": 2.234375, "learning_rate": 3e-05, "loss": 4.1516, "step": 1105 }, { "epoch": 0.26772793053545585, "grad_norm": 5.6875, "learning_rate": 3e-05, "loss": 4.055, "step": 1110 }, { "epoch": 0.2689339122045345, "grad_norm": 6.375, "learning_rate": 3e-05, "loss": 4.2714, "step": 1115 }, { "epoch": 0.27013989387361315, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.0749, "step": 1120 }, { "epoch": 0.27134587554269174, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.04, "step": 1125 }, { "epoch": 0.2725518572117704, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.1945, "step": 1130 }, { "epoch": 0.27375783888084904, "grad_norm": 2.609375, "learning_rate": 3e-05, "loss": 4.1164, "step": 1135 }, { "epoch": 0.27496382054992763, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.1937, "step": 1140 }, { "epoch": 0.2761698022190063, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 4.0479, "step": 1145 }, { "epoch": 0.2773757838880849, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 4.1835, "step": 1150 }, { "epoch": 0.2785817655571635, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.3251, "step": 1155 }, { "epoch": 0.27978774722624217, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.2993, "step": 1160 }, { "epoch": 0.2809937288953208, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.115, "step": 1165 }, { "epoch": 0.2821997105643994, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.2222, "step": 1170 }, { "epoch": 0.28340569223347806, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1045, "step": 1175 }, { "epoch": 0.2846116739025567, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.2198, "step": 1180 }, { "epoch": 0.2858176555716353, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.0839, "step": 1185 }, { "epoch": 0.28702363724071395, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.2363, "step": 1190 }, { "epoch": 0.2882296189097926, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 4.2227, "step": 1195 }, { "epoch": 0.2894356005788712, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.2365, "step": 1200 }, { "epoch": 0.29064158224794984, "grad_norm": 5.21875, "learning_rate": 3e-05, "loss": 4.1324, "step": 1205 }, { "epoch": 0.2918475639170285, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.2204, "step": 1210 }, { "epoch": 0.2930535455861071, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.1821, "step": 1215 }, { "epoch": 0.2942595272551857, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0904, "step": 1220 }, { "epoch": 0.2954655089242644, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.2362, "step": 1225 }, { "epoch": 0.29667149059334297, "grad_norm": 5.15625, "learning_rate": 3e-05, "loss": 4.1517, "step": 1230 }, { "epoch": 0.2978774722624216, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.145, "step": 1235 }, { "epoch": 0.29908345393150026, "grad_norm": 5.75, "learning_rate": 3e-05, "loss": 4.2132, "step": 1240 }, { "epoch": 0.30028943560057886, "grad_norm": 5.3125, "learning_rate": 3e-05, "loss": 4.1831, "step": 1245 }, { "epoch": 0.3014954172696575, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.1045, "step": 1250 }, { "epoch": 0.30270139893873615, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.1805, "step": 1255 }, { "epoch": 0.30390738060781475, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.1913, "step": 1260 }, { "epoch": 0.3051133622768934, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.0797, "step": 1265 }, { "epoch": 0.30631934394597204, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.1287, "step": 1270 }, { "epoch": 0.30752532561505064, "grad_norm": 6.09375, "learning_rate": 3e-05, "loss": 4.261, "step": 1275 }, { "epoch": 0.3087313072841293, "grad_norm": 3.09375, "learning_rate": 3e-05, "loss": 4.1196, "step": 1280 }, { "epoch": 0.30993728895320793, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 4.1841, "step": 1285 }, { "epoch": 0.3111432706222865, "grad_norm": 2.75, "learning_rate": 3e-05, "loss": 3.944, "step": 1290 }, { "epoch": 0.3123492522913652, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 4.1501, "step": 1295 }, { "epoch": 0.3135552339604438, "grad_norm": 2.546875, "learning_rate": 3e-05, "loss": 4.2362, "step": 1300 }, { "epoch": 0.3147612156295224, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.2518, "step": 1305 }, { "epoch": 0.31596719729860107, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.1319, "step": 1310 }, { "epoch": 0.3171731789676797, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.293, "step": 1315 }, { "epoch": 0.3183791606367583, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.2332, "step": 1320 }, { "epoch": 0.31958514230583696, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.0787, "step": 1325 }, { "epoch": 0.3207911239749156, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.1102, "step": 1330 }, { "epoch": 0.3219971056439942, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.1015, "step": 1335 }, { "epoch": 0.32320308731307285, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.1708, "step": 1340 }, { "epoch": 0.3244090689821515, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.2679, "step": 1345 }, { "epoch": 0.3256150506512301, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.1485, "step": 1350 }, { "epoch": 0.32682103232030874, "grad_norm": 6.0, "learning_rate": 3e-05, "loss": 4.3679, "step": 1355 }, { "epoch": 0.3280270139893874, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.1893, "step": 1360 }, { "epoch": 0.329232995658466, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.1619, "step": 1365 }, { "epoch": 0.3304389773275446, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.2143, "step": 1370 }, { "epoch": 0.3316449589966233, "grad_norm": 5.71875, "learning_rate": 3e-05, "loss": 4.2253, "step": 1375 }, { "epoch": 0.33285094066570187, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.1519, "step": 1380 }, { "epoch": 0.3340569223347805, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.1902, "step": 1385 }, { "epoch": 0.33526290400385916, "grad_norm": 2.578125, "learning_rate": 3e-05, "loss": 4.003, "step": 1390 }, { "epoch": 0.33646888567293776, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 4.2294, "step": 1395 }, { "epoch": 0.3376748673420164, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.1724, "step": 1400 }, { "epoch": 0.33888084901109505, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.3581, "step": 1405 }, { "epoch": 0.34008683068017365, "grad_norm": 3.09375, "learning_rate": 3e-05, "loss": 4.2843, "step": 1410 }, { "epoch": 0.3412928123492523, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.1434, "step": 1415 }, { "epoch": 0.34249879401833094, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 4.232, "step": 1420 }, { "epoch": 0.34370477568740954, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.1193, "step": 1425 }, { "epoch": 0.3449107573564882, "grad_norm": 2.328125, "learning_rate": 3e-05, "loss": 4.1568, "step": 1430 }, { "epoch": 0.34611673902556683, "grad_norm": 2.625, "learning_rate": 3e-05, "loss": 4.3296, "step": 1435 }, { "epoch": 0.3473227206946454, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.2405, "step": 1440 }, { "epoch": 0.3485287023637241, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.2386, "step": 1445 }, { "epoch": 0.3497346840328027, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.2121, "step": 1450 }, { "epoch": 0.3509406657018813, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.1053, "step": 1455 }, { "epoch": 0.35214664737095996, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.0892, "step": 1460 }, { "epoch": 0.3533526290400386, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.3391, "step": 1465 }, { "epoch": 0.3545586107091172, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.1439, "step": 1470 }, { "epoch": 0.35576459237819585, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.2866, "step": 1475 }, { "epoch": 0.3569705740472745, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.0751, "step": 1480 }, { "epoch": 0.3581765557163531, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1502, "step": 1485 }, { "epoch": 0.35938253738543174, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.2203, "step": 1490 }, { "epoch": 0.3605885190545104, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.1743, "step": 1495 }, { "epoch": 0.361794500723589, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.1114, "step": 1500 }, { "epoch": 0.36300048239266763, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.1696, "step": 1505 }, { "epoch": 0.3642064640617463, "grad_norm": 2.78125, "learning_rate": 3e-05, "loss": 4.1296, "step": 1510 }, { "epoch": 0.3654124457308249, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.0871, "step": 1515 }, { "epoch": 0.3666184273999035, "grad_norm": 5.125, "learning_rate": 3e-05, "loss": 4.1233, "step": 1520 }, { "epoch": 0.36782440906898217, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.2566, "step": 1525 }, { "epoch": 0.36903039073806077, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.3678, "step": 1530 }, { "epoch": 0.3702363724071394, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.1894, "step": 1535 }, { "epoch": 0.37144235407621806, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.1688, "step": 1540 }, { "epoch": 0.37264833574529665, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 4.1066, "step": 1545 }, { "epoch": 0.3738543174143753, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.1249, "step": 1550 }, { "epoch": 0.37506029908345395, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.213, "step": 1555 }, { "epoch": 0.37626628075253254, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.157, "step": 1560 }, { "epoch": 0.3774722624216112, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.0269, "step": 1565 }, { "epoch": 0.37867824409068984, "grad_norm": 2.625, "learning_rate": 3e-05, "loss": 4.0324, "step": 1570 }, { "epoch": 0.37988422575976843, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.2588, "step": 1575 }, { "epoch": 0.3810902074288471, "grad_norm": 2.578125, "learning_rate": 3e-05, "loss": 4.2777, "step": 1580 }, { "epoch": 0.38229618909792573, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.0556, "step": 1585 }, { "epoch": 0.3835021707670043, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1019, "step": 1590 }, { "epoch": 0.384708152436083, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.0898, "step": 1595 }, { "epoch": 0.3859141341051616, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.0325, "step": 1600 }, { "epoch": 0.3871201157742402, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.3301, "step": 1605 }, { "epoch": 0.38832609744331886, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.2186, "step": 1610 }, { "epoch": 0.3895320791123975, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.2252, "step": 1615 }, { "epoch": 0.3907380607814761, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.1065, "step": 1620 }, { "epoch": 0.39194404245055475, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.2158, "step": 1625 }, { "epoch": 0.3931500241196334, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.0941, "step": 1630 }, { "epoch": 0.394356005788712, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.1335, "step": 1635 }, { "epoch": 0.39556198745779064, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.1734, "step": 1640 }, { "epoch": 0.3967679691268693, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.1759, "step": 1645 }, { "epoch": 0.3979739507959479, "grad_norm": 2.453125, "learning_rate": 3e-05, "loss": 4.2206, "step": 1650 }, { "epoch": 0.39917993246502653, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.068, "step": 1655 }, { "epoch": 0.4003859141341052, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.2343, "step": 1660 }, { "epoch": 0.4015918958031838, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 4.2603, "step": 1665 }, { "epoch": 0.4027978774722624, "grad_norm": 2.640625, "learning_rate": 3e-05, "loss": 4.2034, "step": 1670 }, { "epoch": 0.40400385914134107, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 4.2815, "step": 1675 }, { "epoch": 0.40520984081041966, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.2653, "step": 1680 }, { "epoch": 0.4064158224794983, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.2602, "step": 1685 }, { "epoch": 0.40762180414857696, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.2528, "step": 1690 }, { "epoch": 0.40882778581765555, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.1146, "step": 1695 }, { "epoch": 0.4100337674867342, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.2133, "step": 1700 }, { "epoch": 0.41123974915581285, "grad_norm": 2.671875, "learning_rate": 3e-05, "loss": 4.2018, "step": 1705 }, { "epoch": 0.41244573082489144, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 3.9966, "step": 1710 }, { "epoch": 0.4136517124939701, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.209, "step": 1715 }, { "epoch": 0.41485769416304874, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 4.1959, "step": 1720 }, { "epoch": 0.41606367583212733, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.1882, "step": 1725 }, { "epoch": 0.417269657501206, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.0389, "step": 1730 }, { "epoch": 0.41847563917028463, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.1526, "step": 1735 }, { "epoch": 0.4196816208393632, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.2271, "step": 1740 }, { "epoch": 0.42088760250844187, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 3.9929, "step": 1745 }, { "epoch": 0.4220935841775205, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.2844, "step": 1750 }, { "epoch": 0.4232995658465991, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.1703, "step": 1755 }, { "epoch": 0.42450554751567776, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.0977, "step": 1760 }, { "epoch": 0.4257115291847564, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 4.1388, "step": 1765 }, { "epoch": 0.426917510853835, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.1038, "step": 1770 }, { "epoch": 0.42812349252291365, "grad_norm": 2.5625, "learning_rate": 3e-05, "loss": 4.113, "step": 1775 }, { "epoch": 0.4293294741919923, "grad_norm": 2.390625, "learning_rate": 3e-05, "loss": 4.1214, "step": 1780 }, { "epoch": 0.4305354558610709, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 4.1337, "step": 1785 }, { "epoch": 0.43174143753014954, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 4.1413, "step": 1790 }, { "epoch": 0.4329474191992282, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.166, "step": 1795 }, { "epoch": 0.4341534008683068, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.2672, "step": 1800 }, { "epoch": 0.43535938253738543, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 4.1397, "step": 1805 }, { "epoch": 0.4365653642064641, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.1451, "step": 1810 }, { "epoch": 0.4377713458755427, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.115, "step": 1815 }, { "epoch": 0.4389773275446213, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.073, "step": 1820 }, { "epoch": 0.44018330921369997, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.2127, "step": 1825 }, { "epoch": 0.44138929088277856, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.1417, "step": 1830 }, { "epoch": 0.4425952725518572, "grad_norm": 2.875, "learning_rate": 3e-05, "loss": 4.2536, "step": 1835 }, { "epoch": 0.44380125422093586, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.2157, "step": 1840 }, { "epoch": 0.44500723589001445, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.1808, "step": 1845 }, { "epoch": 0.4462132175590931, "grad_norm": 5.65625, "learning_rate": 3e-05, "loss": 4.0751, "step": 1850 }, { "epoch": 0.44741919922817175, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 4.2425, "step": 1855 }, { "epoch": 0.44862518089725034, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.2218, "step": 1860 }, { "epoch": 0.449831162566329, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0803, "step": 1865 }, { "epoch": 0.45103714423540764, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.249, "step": 1870 }, { "epoch": 0.45224312590448623, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.1166, "step": 1875 }, { "epoch": 0.4534491075735649, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.1878, "step": 1880 }, { "epoch": 0.45465508924264353, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.1737, "step": 1885 }, { "epoch": 0.4558610709117221, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 3.9819, "step": 1890 }, { "epoch": 0.45706705258080077, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.1822, "step": 1895 }, { "epoch": 0.4582730342498794, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.1979, "step": 1900 }, { "epoch": 0.459479015918958, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.1131, "step": 1905 }, { "epoch": 0.46068499758803666, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.1488, "step": 1910 }, { "epoch": 0.4618909792571153, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.1834, "step": 1915 }, { "epoch": 0.4630969609261939, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.2266, "step": 1920 }, { "epoch": 0.46430294259527255, "grad_norm": 2.875, "learning_rate": 3e-05, "loss": 4.2299, "step": 1925 }, { "epoch": 0.4655089242643512, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.1458, "step": 1930 }, { "epoch": 0.4667149059334298, "grad_norm": 2.78125, "learning_rate": 3e-05, "loss": 4.2016, "step": 1935 }, { "epoch": 0.46792088760250844, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.2645, "step": 1940 }, { "epoch": 0.4691268692715871, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.2563, "step": 1945 }, { "epoch": 0.4703328509406657, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.171, "step": 1950 }, { "epoch": 0.47153883260974433, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.2428, "step": 1955 }, { "epoch": 0.472744814278823, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.1909, "step": 1960 }, { "epoch": 0.47395079594790157, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.2549, "step": 1965 }, { "epoch": 0.4751567776169802, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.2075, "step": 1970 }, { "epoch": 0.47636275928605887, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.3145, "step": 1975 }, { "epoch": 0.47756874095513746, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.1468, "step": 1980 }, { "epoch": 0.4787747226242161, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.1315, "step": 1985 }, { "epoch": 0.47998070429329476, "grad_norm": 5.0, "learning_rate": 3e-05, "loss": 4.2904, "step": 1990 }, { "epoch": 0.48118668596237335, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.076, "step": 1995 }, { "epoch": 0.482392667631452, "grad_norm": 2.921875, "learning_rate": 3e-05, "loss": 4.2555, "step": 2000 }, { "epoch": 0.48359864930053065, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.1034, "step": 2005 }, { "epoch": 0.48480463096960924, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.0275, "step": 2010 }, { "epoch": 0.4860106126386879, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.1454, "step": 2015 }, { "epoch": 0.48721659430776654, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.2019, "step": 2020 }, { "epoch": 0.48842257597684513, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 4.179, "step": 2025 }, { "epoch": 0.4896285576459238, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.1085, "step": 2030 }, { "epoch": 0.4908345393150024, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.1489, "step": 2035 }, { "epoch": 0.492040520984081, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.1989, "step": 2040 }, { "epoch": 0.49324650265315967, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.066, "step": 2045 }, { "epoch": 0.4944524843222383, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.1999, "step": 2050 }, { "epoch": 0.4956584659913169, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.0279, "step": 2055 }, { "epoch": 0.49686444766039556, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.1165, "step": 2060 }, { "epoch": 0.4980704293294742, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.1403, "step": 2065 }, { "epoch": 0.4992764109985528, "grad_norm": 2.796875, "learning_rate": 3e-05, "loss": 4.18, "step": 2070 }, { "epoch": 0.5004823926676315, "grad_norm": 2.421875, "learning_rate": 3e-05, "loss": 4.0944, "step": 2075 }, { "epoch": 0.5016883743367101, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.0649, "step": 2080 }, { "epoch": 0.5028943560057887, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.1182, "step": 2085 }, { "epoch": 0.5041003376748674, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.1181, "step": 2090 }, { "epoch": 0.505306319343946, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.2296, "step": 2095 }, { "epoch": 0.5065123010130246, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.1816, "step": 2100 }, { "epoch": 0.5077182826821033, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.2295, "step": 2105 }, { "epoch": 0.5089242643511819, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.2045, "step": 2110 }, { "epoch": 0.5101302460202605, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.2348, "step": 2115 }, { "epoch": 0.5113362276893392, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.2069, "step": 2120 }, { "epoch": 0.5125422093584178, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.1736, "step": 2125 }, { "epoch": 0.5137481910274964, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.2593, "step": 2130 }, { "epoch": 0.5149541726965751, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.1689, "step": 2135 }, { "epoch": 0.5161601543656537, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.209, "step": 2140 }, { "epoch": 0.5173661360347322, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.1328, "step": 2145 }, { "epoch": 0.518572117703811, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.1345, "step": 2150 }, { "epoch": 0.5197780993728895, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.1546, "step": 2155 }, { "epoch": 0.5209840810419681, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 4.0774, "step": 2160 }, { "epoch": 0.5221900627110468, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0292, "step": 2165 }, { "epoch": 0.5233960443801254, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.1713, "step": 2170 }, { "epoch": 0.524602026049204, "grad_norm": 2.625, "learning_rate": 3e-05, "loss": 4.126, "step": 2175 }, { "epoch": 0.5258080077182827, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.1709, "step": 2180 }, { "epoch": 0.5270139893873613, "grad_norm": 2.71875, "learning_rate": 3e-05, "loss": 4.0871, "step": 2185 }, { "epoch": 0.5282199710564399, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.1486, "step": 2190 }, { "epoch": 0.5294259527255186, "grad_norm": 2.9375, "learning_rate": 3e-05, "loss": 4.1588, "step": 2195 }, { "epoch": 0.5306319343945972, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0876, "step": 2200 }, { "epoch": 0.5318379160636758, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0972, "step": 2205 }, { "epoch": 0.5330438977327545, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.1229, "step": 2210 }, { "epoch": 0.5342498794018331, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.1464, "step": 2215 }, { "epoch": 0.5354558610709117, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.0546, "step": 2220 }, { "epoch": 0.5366618427399904, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 4.1456, "step": 2225 }, { "epoch": 0.537867824409069, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.0351, "step": 2230 }, { "epoch": 0.5390738060781476, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.1626, "step": 2235 }, { "epoch": 0.5402797877472263, "grad_norm": 2.65625, "learning_rate": 3e-05, "loss": 4.0184, "step": 2240 }, { "epoch": 0.5414857694163049, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.1745, "step": 2245 }, { "epoch": 0.5426917510853835, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.1887, "step": 2250 }, { "epoch": 0.5438977327544622, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.1051, "step": 2255 }, { "epoch": 0.5451037144235408, "grad_norm": 6.34375, "learning_rate": 3e-05, "loss": 4.1245, "step": 2260 }, { "epoch": 0.5463096960926194, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.1345, "step": 2265 }, { "epoch": 0.5475156777616981, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.1925, "step": 2270 }, { "epoch": 0.5487216594307767, "grad_norm": 2.625, "learning_rate": 3e-05, "loss": 4.1781, "step": 2275 }, { "epoch": 0.5499276410998553, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.0196, "step": 2280 }, { "epoch": 0.551133622768934, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.2346, "step": 2285 }, { "epoch": 0.5523396044380126, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.1857, "step": 2290 }, { "epoch": 0.5535455861070911, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.136, "step": 2295 }, { "epoch": 0.5547515677761699, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 4.2876, "step": 2300 }, { "epoch": 0.5559575494452484, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.1898, "step": 2305 }, { "epoch": 0.557163531114327, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.036, "step": 2310 }, { "epoch": 0.5583695127834057, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.0531, "step": 2315 }, { "epoch": 0.5595754944524843, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.289, "step": 2320 }, { "epoch": 0.5607814761215629, "grad_norm": 2.765625, "learning_rate": 3e-05, "loss": 4.159, "step": 2325 }, { "epoch": 0.5619874577906416, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.1376, "step": 2330 }, { "epoch": 0.5631934394597202, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.0069, "step": 2335 }, { "epoch": 0.5643994211287988, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0897, "step": 2340 }, { "epoch": 0.5656054027978775, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.0205, "step": 2345 }, { "epoch": 0.5668113844669561, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.1753, "step": 2350 }, { "epoch": 0.5680173661360347, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.3916, "step": 2355 }, { "epoch": 0.5692233478051134, "grad_norm": 2.609375, "learning_rate": 3e-05, "loss": 4.1402, "step": 2360 }, { "epoch": 0.570429329474192, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 3.9654, "step": 2365 }, { "epoch": 0.5716353111432706, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.2347, "step": 2370 }, { "epoch": 0.5728412928123493, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.2693, "step": 2375 }, { "epoch": 0.5740472744814279, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.0152, "step": 2380 }, { "epoch": 0.5752532561505065, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.1378, "step": 2385 }, { "epoch": 0.5764592378195852, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.1269, "step": 2390 }, { "epoch": 0.5776652194886638, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.1937, "step": 2395 }, { "epoch": 0.5788712011577424, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.1337, "step": 2400 }, { "epoch": 0.5800771828268211, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.2105, "step": 2405 }, { "epoch": 0.5812831644958997, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.1123, "step": 2410 }, { "epoch": 0.5824891461649783, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 4.2527, "step": 2415 }, { "epoch": 0.583695127834057, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.1075, "step": 2420 }, { "epoch": 0.5849011095031356, "grad_norm": 2.65625, "learning_rate": 3e-05, "loss": 4.0709, "step": 2425 }, { "epoch": 0.5861070911722142, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.2959, "step": 2430 }, { "epoch": 0.5873130728412929, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.0766, "step": 2435 }, { "epoch": 0.5885190545103715, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.2177, "step": 2440 }, { "epoch": 0.58972503617945, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.1869, "step": 2445 }, { "epoch": 0.5909310178485288, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.1476, "step": 2450 }, { "epoch": 0.5921369995176073, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.1426, "step": 2455 }, { "epoch": 0.5933429811866859, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.2495, "step": 2460 }, { "epoch": 0.5945489628557646, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.0684, "step": 2465 }, { "epoch": 0.5957549445248432, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.199, "step": 2470 }, { "epoch": 0.5969609261939218, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0596, "step": 2475 }, { "epoch": 0.5981669078630005, "grad_norm": 2.6875, "learning_rate": 3e-05, "loss": 4.0264, "step": 2480 }, { "epoch": 0.5993728895320791, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.216, "step": 2485 }, { "epoch": 0.6005788712011577, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.085, "step": 2490 }, { "epoch": 0.6017848528702364, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 4.2301, "step": 2495 }, { "epoch": 0.602990834539315, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.139, "step": 2500 }, { "epoch": 0.6041968162083936, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.2281, "step": 2505 }, { "epoch": 0.6054027978774723, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 4.2202, "step": 2510 }, { "epoch": 0.6066087795465509, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.1567, "step": 2515 }, { "epoch": 0.6078147612156295, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.0756, "step": 2520 }, { "epoch": 0.6090207428847082, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.2095, "step": 2525 }, { "epoch": 0.6102267245537868, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.1841, "step": 2530 }, { "epoch": 0.6114327062228654, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.1841, "step": 2535 }, { "epoch": 0.6126386878919441, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.2975, "step": 2540 }, { "epoch": 0.6138446695610227, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.2581, "step": 2545 }, { "epoch": 0.6150506512301013, "grad_norm": 2.5, "learning_rate": 3e-05, "loss": 4.2376, "step": 2550 }, { "epoch": 0.61625663289918, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 4.0852, "step": 2555 }, { "epoch": 0.6174626145682586, "grad_norm": 2.796875, "learning_rate": 3e-05, "loss": 4.0458, "step": 2560 }, { "epoch": 0.6186685962373372, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 4.1143, "step": 2565 }, { "epoch": 0.6198745779064159, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 4.1963, "step": 2570 }, { "epoch": 0.6210805595754945, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 4.0839, "step": 2575 }, { "epoch": 0.622286541244573, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.2316, "step": 2580 }, { "epoch": 0.6234925229136518, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.1051, "step": 2585 }, { "epoch": 0.6246985045827304, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.0591, "step": 2590 }, { "epoch": 0.625904486251809, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.1764, "step": 2595 }, { "epoch": 0.6271104679208876, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.0678, "step": 2600 }, { "epoch": 0.6283164495899662, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.3058, "step": 2605 }, { "epoch": 0.6295224312590448, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.2234, "step": 2610 }, { "epoch": 0.6307284129281235, "grad_norm": 2.671875, "learning_rate": 3e-05, "loss": 4.2313, "step": 2615 }, { "epoch": 0.6319343945972021, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.1046, "step": 2620 }, { "epoch": 0.6331403762662807, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.2329, "step": 2625 }, { "epoch": 0.6343463579353594, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0876, "step": 2630 }, { "epoch": 0.635552339604438, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.1475, "step": 2635 }, { "epoch": 0.6367583212735166, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.0809, "step": 2640 }, { "epoch": 0.6379643029425953, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.0742, "step": 2645 }, { "epoch": 0.6391702846116739, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1174, "step": 2650 }, { "epoch": 0.6403762662807525, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.1518, "step": 2655 }, { "epoch": 0.6415822479498312, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.23, "step": 2660 }, { "epoch": 0.6427882296189098, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.3166, "step": 2665 }, { "epoch": 0.6439942112879884, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.1846, "step": 2670 }, { "epoch": 0.6452001929570671, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.215, "step": 2675 }, { "epoch": 0.6464061746261457, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 4.0283, "step": 2680 }, { "epoch": 0.6476121562952243, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.0859, "step": 2685 }, { "epoch": 0.648818137964303, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.0457, "step": 2690 }, { "epoch": 0.6500241196333816, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0543, "step": 2695 }, { "epoch": 0.6512301013024602, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.1125, "step": 2700 }, { "epoch": 0.6524360829715389, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.1797, "step": 2705 }, { "epoch": 0.6536420646406175, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.1836, "step": 2710 }, { "epoch": 0.6548480463096961, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.1151, "step": 2715 }, { "epoch": 0.6560540279787748, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.1353, "step": 2720 }, { "epoch": 0.6572600096478534, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.1144, "step": 2725 }, { "epoch": 0.658465991316932, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.2057, "step": 2730 }, { "epoch": 0.6596719729860107, "grad_norm": 2.75, "learning_rate": 3e-05, "loss": 4.2709, "step": 2735 }, { "epoch": 0.6608779546550893, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.2824, "step": 2740 }, { "epoch": 0.6620839363241678, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.1346, "step": 2745 }, { "epoch": 0.6632899179932465, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.2731, "step": 2750 }, { "epoch": 0.6644958996623251, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.1086, "step": 2755 }, { "epoch": 0.6657018813314037, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.077, "step": 2760 }, { "epoch": 0.6669078630004824, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.2347, "step": 2765 }, { "epoch": 0.668113844669561, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.1723, "step": 2770 }, { "epoch": 0.6693198263386396, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.195, "step": 2775 }, { "epoch": 0.6705258080077183, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.2575, "step": 2780 }, { "epoch": 0.6717317896767969, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.2252, "step": 2785 }, { "epoch": 0.6729377713458755, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.1504, "step": 2790 }, { "epoch": 0.6741437530149542, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.0045, "step": 2795 }, { "epoch": 0.6753497346840328, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.1425, "step": 2800 }, { "epoch": 0.6765557163531114, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.215, "step": 2805 }, { "epoch": 0.6777616980221901, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 3.9333, "step": 2810 }, { "epoch": 0.6789676796912687, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.1888, "step": 2815 }, { "epoch": 0.6801736613603473, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.2001, "step": 2820 }, { "epoch": 0.681379643029426, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.0495, "step": 2825 }, { "epoch": 0.6825856246985046, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.1673, "step": 2830 }, { "epoch": 0.6837916063675832, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 4.0732, "step": 2835 }, { "epoch": 0.6849975880366619, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1815, "step": 2840 }, { "epoch": 0.6862035697057405, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.1796, "step": 2845 }, { "epoch": 0.6874095513748191, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.0375, "step": 2850 }, { "epoch": 0.6886155330438978, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.112, "step": 2855 }, { "epoch": 0.6898215147129764, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 4.0995, "step": 2860 }, { "epoch": 0.691027496382055, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.1648, "step": 2865 }, { "epoch": 0.6922334780511337, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.0328, "step": 2870 }, { "epoch": 0.6934394597202123, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.0953, "step": 2875 }, { "epoch": 0.6946454413892909, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.1325, "step": 2880 }, { "epoch": 0.6958514230583696, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.146, "step": 2885 }, { "epoch": 0.6970574047274481, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 3.9722, "step": 2890 }, { "epoch": 0.6982633863965267, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.1843, "step": 2895 }, { "epoch": 0.6994693680656054, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.1012, "step": 2900 }, { "epoch": 0.700675349734684, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.1883, "step": 2905 }, { "epoch": 0.7018813314037626, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.0869, "step": 2910 }, { "epoch": 0.7030873130728413, "grad_norm": 5.25, "learning_rate": 3e-05, "loss": 4.1282, "step": 2915 }, { "epoch": 0.7042932947419199, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.0039, "step": 2920 }, { "epoch": 0.7054992764109985, "grad_norm": 2.65625, "learning_rate": 3e-05, "loss": 4.2039, "step": 2925 }, { "epoch": 0.7067052580800772, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.1585, "step": 2930 }, { "epoch": 0.7079112397491558, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.147, "step": 2935 }, { "epoch": 0.7091172214182344, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.1196, "step": 2940 }, { "epoch": 0.7103232030873131, "grad_norm": 2.75, "learning_rate": 3e-05, "loss": 4.0802, "step": 2945 }, { "epoch": 0.7115291847563917, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.064, "step": 2950 }, { "epoch": 0.7127351664254703, "grad_norm": 2.65625, "learning_rate": 3e-05, "loss": 4.0665, "step": 2955 }, { "epoch": 0.713941148094549, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.1405, "step": 2960 }, { "epoch": 0.7151471297636276, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.1095, "step": 2965 }, { "epoch": 0.7163531114327062, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.1906, "step": 2970 }, { "epoch": 0.7175590931017849, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0102, "step": 2975 }, { "epoch": 0.7187650747708635, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.2478, "step": 2980 }, { "epoch": 0.7199710564399421, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.1967, "step": 2985 }, { "epoch": 0.7211770381090208, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.0495, "step": 2990 }, { "epoch": 0.7223830197780994, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 4.2745, "step": 2995 }, { "epoch": 0.723589001447178, "grad_norm": 2.703125, "learning_rate": 3e-05, "loss": 4.0887, "step": 3000 }, { "epoch": 0.7247949831162567, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1064, "step": 3005 }, { "epoch": 0.7260009647853353, "grad_norm": 3.09375, "learning_rate": 3e-05, "loss": 4.075, "step": 3010 }, { "epoch": 0.7272069464544139, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.0182, "step": 3015 }, { "epoch": 0.7284129281234926, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.0842, "step": 3020 }, { "epoch": 0.7296189097925712, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.0547, "step": 3025 }, { "epoch": 0.7308248914616498, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.1655, "step": 3030 }, { "epoch": 0.7320308731307285, "grad_norm": 2.765625, "learning_rate": 3e-05, "loss": 4.1803, "step": 3035 }, { "epoch": 0.733236854799807, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.0956, "step": 3040 }, { "epoch": 0.7344428364688856, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 4.0712, "step": 3045 }, { "epoch": 0.7356488181379643, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1154, "step": 3050 }, { "epoch": 0.7368547998070429, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.1704, "step": 3055 }, { "epoch": 0.7380607814761215, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 4.1081, "step": 3060 }, { "epoch": 0.7392667631452002, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.0683, "step": 3065 }, { "epoch": 0.7404727448142788, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.0593, "step": 3070 }, { "epoch": 0.7416787264833574, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0845, "step": 3075 }, { "epoch": 0.7428847081524361, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.2632, "step": 3080 }, { "epoch": 0.7440906898215147, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1532, "step": 3085 }, { "epoch": 0.7452966714905933, "grad_norm": 3.09375, "learning_rate": 3e-05, "loss": 4.1585, "step": 3090 }, { "epoch": 0.746502653159672, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0654, "step": 3095 }, { "epoch": 0.7477086348287506, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.015, "step": 3100 }, { "epoch": 0.7489146164978292, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 3.9281, "step": 3105 }, { "epoch": 0.7501205981669079, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.2236, "step": 3110 }, { "epoch": 0.7513265798359865, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.2555, "step": 3115 }, { "epoch": 0.7525325615050651, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.2435, "step": 3120 }, { "epoch": 0.7537385431741438, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.2766, "step": 3125 }, { "epoch": 0.7549445248432224, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.0061, "step": 3130 }, { "epoch": 0.756150506512301, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.2062, "step": 3135 }, { "epoch": 0.7573564881813797, "grad_norm": 2.484375, "learning_rate": 3e-05, "loss": 4.105, "step": 3140 }, { "epoch": 0.7585624698504583, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.3122, "step": 3145 }, { "epoch": 0.7597684515195369, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.1549, "step": 3150 }, { "epoch": 0.7609744331886156, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 4.0874, "step": 3155 }, { "epoch": 0.7621804148576942, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 4.1952, "step": 3160 }, { "epoch": 0.7633863965267728, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0554, "step": 3165 }, { "epoch": 0.7645923781958515, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.0241, "step": 3170 }, { "epoch": 0.7657983598649301, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.0432, "step": 3175 }, { "epoch": 0.7670043415340086, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0681, "step": 3180 }, { "epoch": 0.7682103232030874, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.2807, "step": 3185 }, { "epoch": 0.769416304872166, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1062, "step": 3190 }, { "epoch": 0.7706222865412445, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.0742, "step": 3195 }, { "epoch": 0.7718282682103232, "grad_norm": 5.21875, "learning_rate": 3e-05, "loss": 4.0541, "step": 3200 }, { "epoch": 0.7730342498794018, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.0214, "step": 3205 }, { "epoch": 0.7742402315484804, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 3.8959, "step": 3210 }, { "epoch": 0.7754462132175591, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.3601, "step": 3215 }, { "epoch": 0.7766521948866377, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 4.3528, "step": 3220 }, { "epoch": 0.7778581765557163, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.1698, "step": 3225 }, { "epoch": 0.779064158224795, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.1279, "step": 3230 }, { "epoch": 0.7802701398938736, "grad_norm": 6.21875, "learning_rate": 3e-05, "loss": 4.1072, "step": 3235 }, { "epoch": 0.7814761215629522, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.1003, "step": 3240 }, { "epoch": 0.7826821032320309, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.1118, "step": 3245 }, { "epoch": 0.7838880849011095, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.2427, "step": 3250 }, { "epoch": 0.7850940665701881, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.1098, "step": 3255 }, { "epoch": 0.7863000482392668, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.1955, "step": 3260 }, { "epoch": 0.7875060299083454, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.2035, "step": 3265 }, { "epoch": 0.788712011577424, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.352, "step": 3270 }, { "epoch": 0.7899179932465027, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.0648, "step": 3275 }, { "epoch": 0.7911239749155813, "grad_norm": 2.5, "learning_rate": 3e-05, "loss": 4.055, "step": 3280 }, { "epoch": 0.7923299565846599, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.2687, "step": 3285 }, { "epoch": 0.7935359382537386, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.199, "step": 3290 }, { "epoch": 0.7947419199228172, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.1332, "step": 3295 }, { "epoch": 0.7959479015918958, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.1949, "step": 3300 }, { "epoch": 0.7971538832609745, "grad_norm": 5.375, "learning_rate": 3e-05, "loss": 4.0888, "step": 3305 }, { "epoch": 0.7983598649300531, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 4.1863, "step": 3310 }, { "epoch": 0.7995658465991317, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.1919, "step": 3315 }, { "epoch": 0.8007718282682104, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.1392, "step": 3320 }, { "epoch": 0.801977809937289, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0842, "step": 3325 }, { "epoch": 0.8031837916063675, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.156, "step": 3330 }, { "epoch": 0.8043897732754463, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.1954, "step": 3335 }, { "epoch": 0.8055957549445248, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.0855, "step": 3340 }, { "epoch": 0.8068017366136034, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 4.2085, "step": 3345 }, { "epoch": 0.8080077182826821, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.1738, "step": 3350 }, { "epoch": 0.8092136999517607, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0994, "step": 3355 }, { "epoch": 0.8104196816208393, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 4.1153, "step": 3360 }, { "epoch": 0.811625663289918, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.0873, "step": 3365 }, { "epoch": 0.8128316449589966, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.2071, "step": 3370 }, { "epoch": 0.8140376266280752, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.0511, "step": 3375 }, { "epoch": 0.8152436082971539, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.2248, "step": 3380 }, { "epoch": 0.8164495899662325, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.0855, "step": 3385 }, { "epoch": 0.8176555716353111, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.1643, "step": 3390 }, { "epoch": 0.8188615533043898, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.2038, "step": 3395 }, { "epoch": 0.8200675349734684, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.0179, "step": 3400 }, { "epoch": 0.821273516642547, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0906, "step": 3405 }, { "epoch": 0.8224794983116257, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.0746, "step": 3410 }, { "epoch": 0.8236854799807043, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.0359, "step": 3415 }, { "epoch": 0.8248914616497829, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.1236, "step": 3420 }, { "epoch": 0.8260974433188616, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.1128, "step": 3425 }, { "epoch": 0.8273034249879402, "grad_norm": 2.328125, "learning_rate": 3e-05, "loss": 4.1955, "step": 3430 }, { "epoch": 0.8285094066570188, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.1397, "step": 3435 }, { "epoch": 0.8297153883260975, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.1762, "step": 3440 }, { "epoch": 0.8309213699951761, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.0738, "step": 3445 }, { "epoch": 0.8321273516642547, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.1588, "step": 3450 }, { "epoch": 0.8333333333333334, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.1227, "step": 3455 }, { "epoch": 0.834539315002412, "grad_norm": 2.6875, "learning_rate": 3e-05, "loss": 4.183, "step": 3460 }, { "epoch": 0.8357452966714906, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 3.9831, "step": 3465 }, { "epoch": 0.8369512783405693, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.1618, "step": 3470 }, { "epoch": 0.8381572600096479, "grad_norm": 5.0, "learning_rate": 3e-05, "loss": 4.1736, "step": 3475 }, { "epoch": 0.8393632416787264, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.187, "step": 3480 }, { "epoch": 0.8405692233478052, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.1988, "step": 3485 }, { "epoch": 0.8417752050168837, "grad_norm": 2.640625, "learning_rate": 3e-05, "loss": 4.0244, "step": 3490 }, { "epoch": 0.8429811866859623, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 3.9979, "step": 3495 }, { "epoch": 0.844187168355041, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.0922, "step": 3500 }, { "epoch": 0.8453931500241196, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.073, "step": 3505 }, { "epoch": 0.8465991316931982, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0701, "step": 3510 }, { "epoch": 0.8478051133622769, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.2934, "step": 3515 }, { "epoch": 0.8490110950313555, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.084, "step": 3520 }, { "epoch": 0.8502170767004341, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.3179, "step": 3525 }, { "epoch": 0.8514230583695128, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.1636, "step": 3530 }, { "epoch": 0.8526290400385914, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.1177, "step": 3535 }, { "epoch": 0.85383502170767, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.1316, "step": 3540 }, { "epoch": 0.8550410033767487, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.021, "step": 3545 }, { "epoch": 0.8562469850458273, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.0359, "step": 3550 }, { "epoch": 0.8574529667149059, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.096, "step": 3555 }, { "epoch": 0.8586589483839846, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.1779, "step": 3560 }, { "epoch": 0.8598649300530632, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.0249, "step": 3565 }, { "epoch": 0.8610709117221418, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 3.9852, "step": 3570 }, { "epoch": 0.8622768933912205, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.2015, "step": 3575 }, { "epoch": 0.8634828750602991, "grad_norm": 6.34375, "learning_rate": 3e-05, "loss": 4.3137, "step": 3580 }, { "epoch": 0.8646888567293777, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 4.0761, "step": 3585 }, { "epoch": 0.8658948383984564, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.1935, "step": 3590 }, { "epoch": 0.867100820067535, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0718, "step": 3595 }, { "epoch": 0.8683068017366136, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.0554, "step": 3600 }, { "epoch": 0.8695127834056923, "grad_norm": 2.765625, "learning_rate": 3e-05, "loss": 4.0916, "step": 3605 }, { "epoch": 0.8707187650747709, "grad_norm": 2.875, "learning_rate": 3e-05, "loss": 4.088, "step": 3610 }, { "epoch": 0.8719247467438495, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 4.0514, "step": 3615 }, { "epoch": 0.8731307284129282, "grad_norm": 5.78125, "learning_rate": 3e-05, "loss": 4.1346, "step": 3620 }, { "epoch": 0.8743367100820068, "grad_norm": 2.4375, "learning_rate": 3e-05, "loss": 4.1326, "step": 3625 }, { "epoch": 0.8755426917510853, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 4.0424, "step": 3630 }, { "epoch": 0.876748673420164, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.221, "step": 3635 }, { "epoch": 0.8779546550892426, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.9499, "step": 3640 }, { "epoch": 0.8791606367583212, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 4.0092, "step": 3645 }, { "epoch": 0.8803666184273999, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.0644, "step": 3650 }, { "epoch": 0.8815726000964785, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.2003, "step": 3655 }, { "epoch": 0.8827785817655571, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 3.9038, "step": 3660 }, { "epoch": 0.8839845634346358, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.2161, "step": 3665 }, { "epoch": 0.8851905451037144, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.2232, "step": 3670 }, { "epoch": 0.886396526772793, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1059, "step": 3675 }, { "epoch": 0.8876025084418717, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.2917, "step": 3680 }, { "epoch": 0.8888084901109503, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 3.8746, "step": 3685 }, { "epoch": 0.8900144717800289, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.243, "step": 3690 }, { "epoch": 0.8912204534491076, "grad_norm": 2.625, "learning_rate": 3e-05, "loss": 4.0628, "step": 3695 }, { "epoch": 0.8924264351181862, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.0743, "step": 3700 }, { "epoch": 0.8936324167872648, "grad_norm": 2.609375, "learning_rate": 3e-05, "loss": 4.2276, "step": 3705 }, { "epoch": 0.8948383984563435, "grad_norm": 3.953125, "learning_rate": 3e-05, "loss": 4.1271, "step": 3710 }, { "epoch": 0.8960443801254221, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.0546, "step": 3715 }, { "epoch": 0.8972503617945007, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 4.1377, "step": 3720 }, { "epoch": 0.8984563434635794, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0211, "step": 3725 }, { "epoch": 0.899662325132658, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.1557, "step": 3730 }, { "epoch": 0.9008683068017366, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0704, "step": 3735 }, { "epoch": 0.9020742884708153, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.1882, "step": 3740 }, { "epoch": 0.9032802701398939, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 4.0423, "step": 3745 }, { "epoch": 0.9044862518089725, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.2532, "step": 3750 }, { "epoch": 0.9056922334780512, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.1152, "step": 3755 }, { "epoch": 0.9068982151471298, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.2205, "step": 3760 }, { "epoch": 0.9081041968162084, "grad_norm": 2.828125, "learning_rate": 3e-05, "loss": 4.0618, "step": 3765 }, { "epoch": 0.9093101784852871, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.2105, "step": 3770 }, { "epoch": 0.9105161601543657, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.048, "step": 3775 }, { "epoch": 0.9117221418234442, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 3.965, "step": 3780 }, { "epoch": 0.912928123492523, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0743, "step": 3785 }, { "epoch": 0.9141341051616015, "grad_norm": 2.75, "learning_rate": 3e-05, "loss": 4.0414, "step": 3790 }, { "epoch": 0.9153400868306801, "grad_norm": 2.671875, "learning_rate": 3e-05, "loss": 4.0869, "step": 3795 }, { "epoch": 0.9165460684997588, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.002, "step": 3800 }, { "epoch": 0.9177520501688374, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.1947, "step": 3805 }, { "epoch": 0.918958031837916, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.1552, "step": 3810 }, { "epoch": 0.9201640135069947, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.0425, "step": 3815 }, { "epoch": 0.9213699951760733, "grad_norm": 2.78125, "learning_rate": 3e-05, "loss": 4.1572, "step": 3820 }, { "epoch": 0.9225759768451519, "grad_norm": 2.921875, "learning_rate": 3e-05, "loss": 4.0533, "step": 3825 }, { "epoch": 0.9237819585142306, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.0589, "step": 3830 }, { "epoch": 0.9249879401833092, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.029, "step": 3835 }, { "epoch": 0.9261939218523878, "grad_norm": 2.921875, "learning_rate": 3e-05, "loss": 4.1819, "step": 3840 }, { "epoch": 0.9273999035214665, "grad_norm": 2.703125, "learning_rate": 3e-05, "loss": 4.1109, "step": 3845 }, { "epoch": 0.9286058851905451, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 3.9705, "step": 3850 }, { "epoch": 0.9298118668596237, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.239, "step": 3855 }, { "epoch": 0.9310178485287024, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.1412, "step": 3860 }, { "epoch": 0.932223830197781, "grad_norm": 2.828125, "learning_rate": 3e-05, "loss": 4.1011, "step": 3865 }, { "epoch": 0.9334298118668596, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.1593, "step": 3870 }, { "epoch": 0.9346357935359383, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.067, "step": 3875 }, { "epoch": 0.9358417752050169, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.1545, "step": 3880 }, { "epoch": 0.9370477568740955, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 3.979, "step": 3885 }, { "epoch": 0.9382537385431742, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1696, "step": 3890 }, { "epoch": 0.9394597202122528, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.0584, "step": 3895 }, { "epoch": 0.9406657018813314, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.1227, "step": 3900 }, { "epoch": 0.9418716835504101, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.1175, "step": 3905 }, { "epoch": 0.9430776652194887, "grad_norm": 2.84375, "learning_rate": 3e-05, "loss": 4.0724, "step": 3910 }, { "epoch": 0.9442836468885673, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.2071, "step": 3915 }, { "epoch": 0.945489628557646, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 4.0892, "step": 3920 }, { "epoch": 0.9466956102267245, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.1683, "step": 3925 }, { "epoch": 0.9479015918958031, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 3.9843, "step": 3930 }, { "epoch": 0.9491075735648818, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.0902, "step": 3935 }, { "epoch": 0.9503135552339604, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.2005, "step": 3940 }, { "epoch": 0.951519536903039, "grad_norm": 2.6875, "learning_rate": 3e-05, "loss": 4.1105, "step": 3945 }, { "epoch": 0.9527255185721177, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.003, "step": 3950 }, { "epoch": 0.9539315002411963, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 3.9763, "step": 3955 }, { "epoch": 0.9551374819102749, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.1505, "step": 3960 }, { "epoch": 0.9563434635793536, "grad_norm": 2.65625, "learning_rate": 3e-05, "loss": 3.9586, "step": 3965 }, { "epoch": 0.9575494452484322, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0434, "step": 3970 }, { "epoch": 0.9587554269175108, "grad_norm": 5.625, "learning_rate": 3e-05, "loss": 4.1737, "step": 3975 }, { "epoch": 0.9599614085865895, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.1195, "step": 3980 }, { "epoch": 0.9611673902556681, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 3.9995, "step": 3985 }, { "epoch": 0.9623733719247467, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 3.9289, "step": 3990 }, { "epoch": 0.9635793535938254, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.1749, "step": 3995 }, { "epoch": 0.964785335262904, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.1535, "step": 4000 }, { "epoch": 0.9659913169319826, "grad_norm": 2.515625, "learning_rate": 3e-05, "loss": 4.1935, "step": 4005 }, { "epoch": 0.9671972986010613, "grad_norm": 2.625, "learning_rate": 3e-05, "loss": 3.9899, "step": 4010 }, { "epoch": 0.9684032802701399, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.0252, "step": 4015 }, { "epoch": 0.9696092619392185, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.0276, "step": 4020 }, { "epoch": 0.9708152436082972, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.1245, "step": 4025 }, { "epoch": 0.9720212252773758, "grad_norm": 2.5, "learning_rate": 3e-05, "loss": 4.1081, "step": 4030 }, { "epoch": 0.9732272069464544, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.1967, "step": 4035 }, { "epoch": 0.9744331886155331, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 4.1422, "step": 4040 }, { "epoch": 0.9756391702846117, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0146, "step": 4045 }, { "epoch": 0.9768451519536903, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 3.9423, "step": 4050 }, { "epoch": 0.978051133622769, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.1214, "step": 4055 }, { "epoch": 0.9792571152918476, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 4.126, "step": 4060 }, { "epoch": 0.9804630969609262, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.1571, "step": 4065 }, { "epoch": 0.9816690786300049, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.2122, "step": 4070 }, { "epoch": 0.9828750602990834, "grad_norm": 5.15625, "learning_rate": 3e-05, "loss": 4.0405, "step": 4075 }, { "epoch": 0.984081041968162, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.0581, "step": 4080 }, { "epoch": 0.9852870236372407, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.0323, "step": 4085 }, { "epoch": 0.9864930053063193, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 3.9715, "step": 4090 }, { "epoch": 0.9876989869753979, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.0065, "step": 4095 }, { "epoch": 0.9889049686444766, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 4.2328, "step": 4100 }, { "epoch": 0.9901109503135552, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.0894, "step": 4105 }, { "epoch": 0.9913169319826338, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.0842, "step": 4110 }, { "epoch": 0.9925229136517125, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.0222, "step": 4115 }, { "epoch": 0.9937288953207911, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.1906, "step": 4120 }, { "epoch": 0.9949348769898697, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 3.9373, "step": 4125 }, { "epoch": 0.9961408586589484, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.0573, "step": 4130 }, { "epoch": 0.997346840328027, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 4.0869, "step": 4135 }, { "epoch": 0.9985528219971056, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.114, "step": 4140 }, { "epoch": 0.9997588036661843, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 4.1973, "step": 4145 }, { "epoch": 1.000964785335263, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.0131, "step": 4150 }, { "epoch": 1.0021707670043416, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.0084, "step": 4155 }, { "epoch": 1.0033767486734202, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.2257, "step": 4160 }, { "epoch": 1.0045827303424988, "grad_norm": 2.921875, "learning_rate": 3e-05, "loss": 4.0854, "step": 4165 }, { "epoch": 1.0057887120115774, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.1503, "step": 4170 }, { "epoch": 1.006994693680656, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.025, "step": 4175 }, { "epoch": 1.0082006753497348, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0242, "step": 4180 }, { "epoch": 1.0094066570188134, "grad_norm": 6.40625, "learning_rate": 3e-05, "loss": 4.203, "step": 4185 }, { "epoch": 1.010612638687892, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.2164, "step": 4190 }, { "epoch": 1.0118186203569706, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 3.9584, "step": 4195 }, { "epoch": 1.0130246020260492, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.1287, "step": 4200 }, { "epoch": 1.0142305836951278, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 3.9676, "step": 4205 }, { "epoch": 1.0154365653642066, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.1237, "step": 4210 }, { "epoch": 1.0166425470332852, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.1521, "step": 4215 }, { "epoch": 1.0178485287023638, "grad_norm": 2.421875, "learning_rate": 3e-05, "loss": 4.0903, "step": 4220 }, { "epoch": 1.0190545103714423, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.0297, "step": 4225 }, { "epoch": 1.020260492040521, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0136, "step": 4230 }, { "epoch": 1.0214664737095995, "grad_norm": 2.484375, "learning_rate": 3e-05, "loss": 4.1453, "step": 4235 }, { "epoch": 1.0226724553786783, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.0196, "step": 4240 }, { "epoch": 1.023878437047757, "grad_norm": 5.6875, "learning_rate": 3e-05, "loss": 4.0562, "step": 4245 }, { "epoch": 1.0250844187168355, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.1402, "step": 4250 }, { "epoch": 1.0262904003859141, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0279, "step": 4255 }, { "epoch": 1.0274963820549927, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.0851, "step": 4260 }, { "epoch": 1.0287023637240713, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.086, "step": 4265 }, { "epoch": 1.0299083453931501, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.1226, "step": 4270 }, { "epoch": 1.0311143270622287, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.017, "step": 4275 }, { "epoch": 1.0323203087313073, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.1188, "step": 4280 }, { "epoch": 1.033526290400386, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.0281, "step": 4285 }, { "epoch": 1.0347322720694645, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.187, "step": 4290 }, { "epoch": 1.035938253738543, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 3.8952, "step": 4295 }, { "epoch": 1.037144235407622, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.1883, "step": 4300 }, { "epoch": 1.0383502170767005, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.1133, "step": 4305 }, { "epoch": 1.039556198745779, "grad_norm": 5.96875, "learning_rate": 3e-05, "loss": 4.0701, "step": 4310 }, { "epoch": 1.0407621804148577, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.2537, "step": 4315 }, { "epoch": 1.0419681620839363, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0716, "step": 4320 }, { "epoch": 1.0431741437530149, "grad_norm": 2.65625, "learning_rate": 3e-05, "loss": 4.156, "step": 4325 }, { "epoch": 1.0443801254220937, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.0882, "step": 4330 }, { "epoch": 1.0455861070911723, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.0266, "step": 4335 }, { "epoch": 1.0467920887602509, "grad_norm": 5.875, "learning_rate": 3e-05, "loss": 4.1419, "step": 4340 }, { "epoch": 1.0479980704293295, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.1335, "step": 4345 }, { "epoch": 1.049204052098408, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.041, "step": 4350 }, { "epoch": 1.0504100337674867, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.0228, "step": 4355 }, { "epoch": 1.0516160154365655, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.1802, "step": 4360 }, { "epoch": 1.052821997105644, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.0947, "step": 4365 }, { "epoch": 1.0540279787747227, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.2415, "step": 4370 }, { "epoch": 1.0552339604438012, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 3.9798, "step": 4375 }, { "epoch": 1.0564399421128798, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 4.0757, "step": 4380 }, { "epoch": 1.0576459237819584, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.1394, "step": 4385 }, { "epoch": 1.0588519054510372, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 3.9464, "step": 4390 }, { "epoch": 1.0600578871201158, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 3.9927, "step": 4395 }, { "epoch": 1.0612638687891944, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 3.9352, "step": 4400 }, { "epoch": 1.062469850458273, "grad_norm": 2.78125, "learning_rate": 3e-05, "loss": 4.0786, "step": 4405 }, { "epoch": 1.0636758321273516, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.1314, "step": 4410 }, { "epoch": 1.0648818137964302, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.0288, "step": 4415 }, { "epoch": 1.066087795465509, "grad_norm": 2.9375, "learning_rate": 3e-05, "loss": 4.0097, "step": 4420 }, { "epoch": 1.0672937771345876, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.0171, "step": 4425 }, { "epoch": 1.0684997588036662, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.139, "step": 4430 }, { "epoch": 1.0697057404727448, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.1061, "step": 4435 }, { "epoch": 1.0709117221418234, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.9998, "step": 4440 }, { "epoch": 1.072117703810902, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.0178, "step": 4445 }, { "epoch": 1.0733236854799808, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.1711, "step": 4450 }, { "epoch": 1.0745296671490594, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.0605, "step": 4455 }, { "epoch": 1.075735648818138, "grad_norm": 5.6875, "learning_rate": 3e-05, "loss": 4.1019, "step": 4460 }, { "epoch": 1.0769416304872166, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 3.9648, "step": 4465 }, { "epoch": 1.0781476121562952, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0724, "step": 4470 }, { "epoch": 1.0793535938253738, "grad_norm": 5.125, "learning_rate": 3e-05, "loss": 3.9531, "step": 4475 }, { "epoch": 1.0805595754944526, "grad_norm": 5.3125, "learning_rate": 3e-05, "loss": 4.0394, "step": 4480 }, { "epoch": 1.0817655571635312, "grad_norm": 5.5625, "learning_rate": 3e-05, "loss": 4.0621, "step": 4485 }, { "epoch": 1.0829715388326098, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.1399, "step": 4490 }, { "epoch": 1.0841775205016884, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1697, "step": 4495 }, { "epoch": 1.085383502170767, "grad_norm": 5.46875, "learning_rate": 3e-05, "loss": 4.1787, "step": 4500 }, { "epoch": 1.0865894838398455, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.0595, "step": 4505 }, { "epoch": 1.0877954655089244, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.1744, "step": 4510 }, { "epoch": 1.089001447178003, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.0673, "step": 4515 }, { "epoch": 1.0902074288470815, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 3.9511, "step": 4520 }, { "epoch": 1.0914134105161601, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.1133, "step": 4525 }, { "epoch": 1.0926193921852387, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.0124, "step": 4530 }, { "epoch": 1.0938253738543173, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0508, "step": 4535 }, { "epoch": 1.0950313555233961, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 4.0289, "step": 4540 }, { "epoch": 1.0962373371924747, "grad_norm": 2.9375, "learning_rate": 3e-05, "loss": 4.1944, "step": 4545 }, { "epoch": 1.0974433188615533, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.0608, "step": 4550 }, { "epoch": 1.098649300530632, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.1398, "step": 4555 }, { "epoch": 1.0998552821997105, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.2017, "step": 4560 }, { "epoch": 1.101061263868789, "grad_norm": 3.09375, "learning_rate": 3e-05, "loss": 3.967, "step": 4565 }, { "epoch": 1.102267245537868, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.0359, "step": 4570 }, { "epoch": 1.1034732272069465, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.2121, "step": 4575 }, { "epoch": 1.104679208876025, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.1189, "step": 4580 }, { "epoch": 1.1058851905451037, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 4.0132, "step": 4585 }, { "epoch": 1.1070911722141823, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.1075, "step": 4590 }, { "epoch": 1.1082971538832609, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.0972, "step": 4595 }, { "epoch": 1.1095031355523397, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.1007, "step": 4600 }, { "epoch": 1.1107091172214183, "grad_norm": 5.0, "learning_rate": 3e-05, "loss": 4.099, "step": 4605 }, { "epoch": 1.111915098890497, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.074, "step": 4610 }, { "epoch": 1.1131210805595755, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.1763, "step": 4615 }, { "epoch": 1.114327062228654, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.0287, "step": 4620 }, { "epoch": 1.1155330438977327, "grad_norm": 2.78125, "learning_rate": 3e-05, "loss": 4.12, "step": 4625 }, { "epoch": 1.1167390255668115, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.2681, "step": 4630 }, { "epoch": 1.11794500723589, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.075, "step": 4635 }, { "epoch": 1.1191509889049687, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 4.0756, "step": 4640 }, { "epoch": 1.1203569705740473, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.0585, "step": 4645 }, { "epoch": 1.1215629522431259, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0099, "step": 4650 }, { "epoch": 1.1227689339122044, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.992, "step": 4655 }, { "epoch": 1.1239749155812833, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.0102, "step": 4660 }, { "epoch": 1.1251808972503619, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 4.0133, "step": 4665 }, { "epoch": 1.1263868789194404, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 4.2031, "step": 4670 }, { "epoch": 1.127592860588519, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.0605, "step": 4675 }, { "epoch": 1.1287988422575976, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.1167, "step": 4680 }, { "epoch": 1.1300048239266762, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.0597, "step": 4685 }, { "epoch": 1.1312108055957548, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.0291, "step": 4690 }, { "epoch": 1.1324167872648336, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.129, "step": 4695 }, { "epoch": 1.1336227689339122, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.163, "step": 4700 }, { "epoch": 1.1348287506029908, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.0971, "step": 4705 }, { "epoch": 1.1360347322720694, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.1331, "step": 4710 }, { "epoch": 1.137240713941148, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 3.9733, "step": 4715 }, { "epoch": 1.1384466956102268, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.0609, "step": 4720 }, { "epoch": 1.1396526772793054, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.1458, "step": 4725 }, { "epoch": 1.140858658948384, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.1225, "step": 4730 }, { "epoch": 1.1420646406174626, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 3.952, "step": 4735 }, { "epoch": 1.1432706222865412, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.0724, "step": 4740 }, { "epoch": 1.1444766039556198, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.0841, "step": 4745 }, { "epoch": 1.1456825856246984, "grad_norm": 5.53125, "learning_rate": 3e-05, "loss": 4.0742, "step": 4750 }, { "epoch": 1.1468885672937772, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0647, "step": 4755 }, { "epoch": 1.1480945489628558, "grad_norm": 3.953125, "learning_rate": 3e-05, "loss": 4.0807, "step": 4760 }, { "epoch": 1.1493005306319344, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.9932, "step": 4765 }, { "epoch": 1.150506512301013, "grad_norm": 2.5625, "learning_rate": 3e-05, "loss": 4.0109, "step": 4770 }, { "epoch": 1.1517124939700916, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 3.9304, "step": 4775 }, { "epoch": 1.1529184756391704, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 4.0111, "step": 4780 }, { "epoch": 1.154124457308249, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.1092, "step": 4785 }, { "epoch": 1.1553304389773276, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.1632, "step": 4790 }, { "epoch": 1.1565364206464062, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.0784, "step": 4795 }, { "epoch": 1.1577424023154848, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.2074, "step": 4800 }, { "epoch": 1.1589483839845633, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.2259, "step": 4805 }, { "epoch": 1.1601543656536422, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.025, "step": 4810 }, { "epoch": 1.1613603473227208, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 3.9329, "step": 4815 }, { "epoch": 1.1625663289917993, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 3.9496, "step": 4820 }, { "epoch": 1.163772310660878, "grad_norm": 2.78125, "learning_rate": 3e-05, "loss": 4.1522, "step": 4825 }, { "epoch": 1.1649782923299565, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.1553, "step": 4830 }, { "epoch": 1.1661842739990351, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.0963, "step": 4835 }, { "epoch": 1.167390255668114, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.0543, "step": 4840 }, { "epoch": 1.1685962373371925, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 3.877, "step": 4845 }, { "epoch": 1.1698022190062711, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.1312, "step": 4850 }, { "epoch": 1.1710082006753497, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 4.0766, "step": 4855 }, { "epoch": 1.1722141823444283, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9747, "step": 4860 }, { "epoch": 1.173420164013507, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.2538, "step": 4865 }, { "epoch": 1.1746261456825857, "grad_norm": 2.84375, "learning_rate": 3e-05, "loss": 4.0593, "step": 4870 }, { "epoch": 1.1758321273516643, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0721, "step": 4875 }, { "epoch": 1.177038109020743, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0717, "step": 4880 }, { "epoch": 1.1782440906898215, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.0056, "step": 4885 }, { "epoch": 1.1794500723589, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.0515, "step": 4890 }, { "epoch": 1.1806560540279787, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1549, "step": 4895 }, { "epoch": 1.1818620356970575, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 4.044, "step": 4900 }, { "epoch": 1.183068017366136, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.1535, "step": 4905 }, { "epoch": 1.1842739990352147, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 3.9842, "step": 4910 }, { "epoch": 1.1854799807042933, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.0685, "step": 4915 }, { "epoch": 1.1866859623733719, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 3.9961, "step": 4920 }, { "epoch": 1.1878919440424505, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.1142, "step": 4925 }, { "epoch": 1.1890979257115293, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.1558, "step": 4930 }, { "epoch": 1.1903039073806079, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.1491, "step": 4935 }, { "epoch": 1.1915098890496865, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.0457, "step": 4940 }, { "epoch": 1.192715870718765, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.2197, "step": 4945 }, { "epoch": 1.1939218523878437, "grad_norm": 2.390625, "learning_rate": 3e-05, "loss": 4.084, "step": 4950 }, { "epoch": 1.1951278340569222, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0203, "step": 4955 }, { "epoch": 1.196333815726001, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.0261, "step": 4960 }, { "epoch": 1.1975397973950797, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 3.9922, "step": 4965 }, { "epoch": 1.1987457790641582, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 3.9868, "step": 4970 }, { "epoch": 1.1999517607332368, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.1119, "step": 4975 }, { "epoch": 1.2011577424023154, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.0618, "step": 4980 }, { "epoch": 1.202363724071394, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 4.0839, "step": 4985 }, { "epoch": 1.2035697057404728, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.1701, "step": 4990 }, { "epoch": 1.2047756874095514, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 4.2366, "step": 4995 }, { "epoch": 1.20598166907863, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.0488, "step": 5000 }, { "epoch": 1.2071876507477086, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 3.9488, "step": 5005 }, { "epoch": 1.2083936324167872, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.1987, "step": 5010 }, { "epoch": 1.2095996140858658, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 4.0267, "step": 5015 }, { "epoch": 1.2108055957549446, "grad_norm": 2.765625, "learning_rate": 3e-05, "loss": 4.1341, "step": 5020 }, { "epoch": 1.2120115774240232, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.0292, "step": 5025 }, { "epoch": 1.2132175590931018, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0797, "step": 5030 }, { "epoch": 1.2144235407621804, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.0511, "step": 5035 }, { "epoch": 1.215629522431259, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.1113, "step": 5040 }, { "epoch": 1.2168355041003376, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.0921, "step": 5045 }, { "epoch": 1.2180414857694164, "grad_norm": 5.5, "learning_rate": 3e-05, "loss": 4.0496, "step": 5050 }, { "epoch": 1.219247467438495, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 4.0404, "step": 5055 }, { "epoch": 1.2204534491075736, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.0831, "step": 5060 }, { "epoch": 1.2216594307766522, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.0557, "step": 5065 }, { "epoch": 1.2228654124457308, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 3.8572, "step": 5070 }, { "epoch": 1.2240713941148094, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 4.1655, "step": 5075 }, { "epoch": 1.2252773757838882, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 3.9933, "step": 5080 }, { "epoch": 1.2264833574529668, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 3.9613, "step": 5085 }, { "epoch": 1.2276893391220454, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 4.0718, "step": 5090 }, { "epoch": 1.228895320791124, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 3.999, "step": 5095 }, { "epoch": 1.2301013024602026, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 3.9332, "step": 5100 }, { "epoch": 1.2313072841292811, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.052, "step": 5105 }, { "epoch": 1.23251326579836, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.0346, "step": 5110 }, { "epoch": 1.2337192474674386, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.1594, "step": 5115 }, { "epoch": 1.2349252291365171, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.2756, "step": 5120 }, { "epoch": 1.2361312108055957, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.0703, "step": 5125 }, { "epoch": 1.2373371924746743, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.0417, "step": 5130 }, { "epoch": 1.238543174143753, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0235, "step": 5135 }, { "epoch": 1.2397491558128317, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0859, "step": 5140 }, { "epoch": 1.2409551374819103, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 3.9445, "step": 5145 }, { "epoch": 1.242161119150989, "grad_norm": 2.53125, "learning_rate": 3e-05, "loss": 4.1462, "step": 5150 }, { "epoch": 1.2433671008200675, "grad_norm": 5.4375, "learning_rate": 3e-05, "loss": 4.1486, "step": 5155 }, { "epoch": 1.244573082489146, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0261, "step": 5160 }, { "epoch": 1.2457790641582247, "grad_norm": 5.15625, "learning_rate": 3e-05, "loss": 4.0704, "step": 5165 }, { "epoch": 1.2469850458273035, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.1459, "step": 5170 }, { "epoch": 1.248191027496382, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.0925, "step": 5175 }, { "epoch": 1.2493970091654607, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 3.9671, "step": 5180 }, { "epoch": 1.2506029908345393, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.1209, "step": 5185 }, { "epoch": 1.251808972503618, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 4.0154, "step": 5190 }, { "epoch": 1.2530149541726967, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.0826, "step": 5195 }, { "epoch": 1.2542209358417753, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 3.8896, "step": 5200 }, { "epoch": 1.255426917510854, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.1578, "step": 5205 }, { "epoch": 1.2566328991799325, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.0039, "step": 5210 }, { "epoch": 1.257838880849011, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.0763, "step": 5215 }, { "epoch": 1.2590448625180897, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.0125, "step": 5220 }, { "epoch": 1.2602508441871683, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.0794, "step": 5225 }, { "epoch": 1.2614568258562469, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.1165, "step": 5230 }, { "epoch": 1.2626628075253257, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.0182, "step": 5235 }, { "epoch": 1.2638687891944043, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.0168, "step": 5240 }, { "epoch": 1.2650747708634829, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0701, "step": 5245 }, { "epoch": 1.2662807525325614, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.0513, "step": 5250 }, { "epoch": 1.2674867342016403, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.8264, "step": 5255 }, { "epoch": 1.2686927158707189, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 4.1547, "step": 5260 }, { "epoch": 1.2698986975397974, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 3.9976, "step": 5265 }, { "epoch": 1.271104679208876, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 3.9339, "step": 5270 }, { "epoch": 1.2723106608779546, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 4.0799, "step": 5275 }, { "epoch": 1.2735166425470332, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 3.9947, "step": 5280 }, { "epoch": 1.2747226242161118, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 3.8924, "step": 5285 }, { "epoch": 1.2759286058851904, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.0693, "step": 5290 }, { "epoch": 1.2771345875542692, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 3.9979, "step": 5295 }, { "epoch": 1.2783405692233478, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.1561, "step": 5300 }, { "epoch": 1.2795465508924264, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.264, "step": 5305 }, { "epoch": 1.280752532561505, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 4.0551, "step": 5310 }, { "epoch": 1.2819585142305838, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.0058, "step": 5315 }, { "epoch": 1.2831644958996624, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 3.9965, "step": 5320 }, { "epoch": 1.284370477568741, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.078, "step": 5325 }, { "epoch": 1.2855764592378196, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 3.9244, "step": 5330 }, { "epoch": 1.2867824409068982, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.1328, "step": 5335 }, { "epoch": 1.2879884225759768, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.1223, "step": 5340 }, { "epoch": 1.2891944042450554, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.0503, "step": 5345 }, { "epoch": 1.290400385914134, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.0781, "step": 5350 }, { "epoch": 1.2916063675832128, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 4.1099, "step": 5355 }, { "epoch": 1.2928123492522914, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.0771, "step": 5360 }, { "epoch": 1.29401833092137, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 3.9172, "step": 5365 }, { "epoch": 1.2952243125904486, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.1189, "step": 5370 }, { "epoch": 1.2964302942595274, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 3.982, "step": 5375 }, { "epoch": 1.297636275928606, "grad_norm": 5.875, "learning_rate": 3e-05, "loss": 4.0818, "step": 5380 }, { "epoch": 1.2988422575976846, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 3.9988, "step": 5385 }, { "epoch": 1.3000482392667632, "grad_norm": 6.75, "learning_rate": 3e-05, "loss": 4.0758, "step": 5390 }, { "epoch": 1.3012542209358418, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.0294, "step": 5395 }, { "epoch": 1.3024602026049203, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.1552, "step": 5400 }, { "epoch": 1.303666184273999, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.0111, "step": 5405 }, { "epoch": 1.3048721659430775, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 3.9238, "step": 5410 }, { "epoch": 1.3060781476121563, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.0513, "step": 5415 }, { "epoch": 1.307284129281235, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.0781, "step": 5420 }, { "epoch": 1.3084901109503135, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.0511, "step": 5425 }, { "epoch": 1.3096960926193921, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0244, "step": 5430 }, { "epoch": 1.310902074288471, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 3.9475, "step": 5435 }, { "epoch": 1.3121080559575495, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 3.9267, "step": 5440 }, { "epoch": 1.3133140376266281, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.1677, "step": 5445 }, { "epoch": 1.3145200192957067, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.0992, "step": 5450 }, { "epoch": 1.3157260009647853, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.0578, "step": 5455 }, { "epoch": 1.316931982633864, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.1223, "step": 5460 }, { "epoch": 1.3181379643029425, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.0945, "step": 5465 }, { "epoch": 1.319343945972021, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.0013, "step": 5470 }, { "epoch": 1.3205499276411, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 3.9855, "step": 5475 }, { "epoch": 1.3217559093101785, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 3.9744, "step": 5480 }, { "epoch": 1.322961890979257, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.0379, "step": 5485 }, { "epoch": 1.3241678726483357, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1908, "step": 5490 }, { "epoch": 1.3253738543174145, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.0127, "step": 5495 }, { "epoch": 1.326579835986493, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.1413, "step": 5500 }, { "epoch": 1.3277858176555717, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1576, "step": 5505 }, { "epoch": 1.3289917993246503, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.0902, "step": 5510 }, { "epoch": 1.3301977809937289, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.0613, "step": 5515 }, { "epoch": 1.3314037626628075, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 4.1063, "step": 5520 }, { "epoch": 1.332609744331886, "grad_norm": 2.796875, "learning_rate": 3e-05, "loss": 4.0418, "step": 5525 }, { "epoch": 1.3338157260009647, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.0198, "step": 5530 }, { "epoch": 1.3350217076700435, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.9589, "step": 5535 }, { "epoch": 1.336227689339122, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.0921, "step": 5540 }, { "epoch": 1.3374336710082007, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.1048, "step": 5545 }, { "epoch": 1.3386396526772792, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.1445, "step": 5550 }, { "epoch": 1.339845634346358, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.1278, "step": 5555 }, { "epoch": 1.3410516160154367, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.123, "step": 5560 }, { "epoch": 1.3422575976845152, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.0924, "step": 5565 }, { "epoch": 1.3434635793535938, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.0382, "step": 5570 }, { "epoch": 1.3446695610226724, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0597, "step": 5575 }, { "epoch": 1.345875542691751, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.175, "step": 5580 }, { "epoch": 1.3470815243608296, "grad_norm": 5.4375, "learning_rate": 3e-05, "loss": 4.1842, "step": 5585 }, { "epoch": 1.3482875060299082, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.1211, "step": 5590 }, { "epoch": 1.349493487698987, "grad_norm": 2.875, "learning_rate": 3e-05, "loss": 4.1151, "step": 5595 }, { "epoch": 1.3506994693680656, "grad_norm": 2.71875, "learning_rate": 3e-05, "loss": 3.9768, "step": 5600 }, { "epoch": 1.3519054510371442, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.0144, "step": 5605 }, { "epoch": 1.3531114327062228, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.2033, "step": 5610 }, { "epoch": 1.3543174143753016, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 4.0453, "step": 5615 }, { "epoch": 1.3555233960443802, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.15, "step": 5620 }, { "epoch": 1.3567293777134588, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 3.9706, "step": 5625 }, { "epoch": 1.3579353593825374, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.0151, "step": 5630 }, { "epoch": 1.359141341051616, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.1179, "step": 5635 }, { "epoch": 1.3603473227206946, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.0877, "step": 5640 }, { "epoch": 1.3615533043897732, "grad_norm": 5.46875, "learning_rate": 3e-05, "loss": 3.9866, "step": 5645 }, { "epoch": 1.3627592860588518, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.0513, "step": 5650 }, { "epoch": 1.3639652677279306, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.0651, "step": 5655 }, { "epoch": 1.3651712493970092, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.9922, "step": 5660 }, { "epoch": 1.3663772310660878, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 3.9578, "step": 5665 }, { "epoch": 1.3675832127351664, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.1715, "step": 5670 }, { "epoch": 1.3687891944042452, "grad_norm": 2.6875, "learning_rate": 3e-05, "loss": 4.1772, "step": 5675 }, { "epoch": 1.3699951760733238, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.1823, "step": 5680 }, { "epoch": 1.3712011577424024, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 3.9954, "step": 5685 }, { "epoch": 1.372407139411481, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 3.9842, "step": 5690 }, { "epoch": 1.3736131210805596, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.05, "step": 5695 }, { "epoch": 1.3748191027496381, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.0843, "step": 5700 }, { "epoch": 1.3760250844187167, "grad_norm": 5.5, "learning_rate": 3e-05, "loss": 4.0568, "step": 5705 }, { "epoch": 1.3772310660877953, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 3.9894, "step": 5710 }, { "epoch": 1.3784370477568741, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 3.9011, "step": 5715 }, { "epoch": 1.3796430294259527, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0859, "step": 5720 }, { "epoch": 1.3808490110950313, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.2131, "step": 5725 }, { "epoch": 1.38205499276411, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 3.9654, "step": 5730 }, { "epoch": 1.3832609744331887, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 3.9865, "step": 5735 }, { "epoch": 1.3844669561022673, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0356, "step": 5740 }, { "epoch": 1.385672937771346, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 3.9057, "step": 5745 }, { "epoch": 1.3868789194404245, "grad_norm": 5.40625, "learning_rate": 3e-05, "loss": 4.1082, "step": 5750 }, { "epoch": 1.388084901109503, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.2088, "step": 5755 }, { "epoch": 1.3892908827785817, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.0625, "step": 5760 }, { "epoch": 1.3904968644476603, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0387, "step": 5765 }, { "epoch": 1.391702846116739, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0907, "step": 5770 }, { "epoch": 1.3929088277858177, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.0449, "step": 5775 }, { "epoch": 1.3941148094548963, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.0096, "step": 5780 }, { "epoch": 1.395320791123975, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.0399, "step": 5785 }, { "epoch": 1.3965267727930535, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.1171, "step": 5790 }, { "epoch": 1.3977327544621323, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 4.0642, "step": 5795 }, { "epoch": 1.398938736131211, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 3.987, "step": 5800 }, { "epoch": 1.4001447178002895, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.1565, "step": 5805 }, { "epoch": 1.401350699469368, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.0982, "step": 5810 }, { "epoch": 1.4025566811384467, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.0979, "step": 5815 }, { "epoch": 1.4037626628075253, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.2027, "step": 5820 }, { "epoch": 1.4049686444766039, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 4.1227, "step": 5825 }, { "epoch": 1.4061746261456824, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.1815, "step": 5830 }, { "epoch": 1.4073806078147613, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.2241, "step": 5835 }, { "epoch": 1.4085865894838399, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 3.9867, "step": 5840 }, { "epoch": 1.4097925711529185, "grad_norm": 5.15625, "learning_rate": 3e-05, "loss": 4.0721, "step": 5845 }, { "epoch": 1.410998552821997, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.1121, "step": 5850 }, { "epoch": 1.4122045344910759, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 3.9555, "step": 5855 }, { "epoch": 1.4134105161601545, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0313, "step": 5860 }, { "epoch": 1.414616497829233, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.0074, "step": 5865 }, { "epoch": 1.4158224794983116, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.1935, "step": 5870 }, { "epoch": 1.4170284611673902, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 3.9094, "step": 5875 }, { "epoch": 1.4182344428364688, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 3.9999, "step": 5880 }, { "epoch": 1.4194404245055474, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.1755, "step": 5885 }, { "epoch": 1.420646406174626, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 3.9492, "step": 5890 }, { "epoch": 1.4218523878437048, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 4.0462, "step": 5895 }, { "epoch": 1.4230583695127834, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.1595, "step": 5900 }, { "epoch": 1.424264351181862, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.0068, "step": 5905 }, { "epoch": 1.4254703328509406, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.0575, "step": 5910 }, { "epoch": 1.4266763145200194, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.006, "step": 5915 }, { "epoch": 1.427882296189098, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 3.9659, "step": 5920 }, { "epoch": 1.4290882778581766, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.0954, "step": 5925 }, { "epoch": 1.4302942595272552, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 3.9642, "step": 5930 }, { "epoch": 1.4315002411963338, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.0137, "step": 5935 }, { "epoch": 1.4327062228654124, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0934, "step": 5940 }, { "epoch": 1.433912204534491, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.1815, "step": 5945 }, { "epoch": 1.4351181862035696, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.043, "step": 5950 }, { "epoch": 1.4363241678726484, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 3.983, "step": 5955 }, { "epoch": 1.437530149541727, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.0582, "step": 5960 }, { "epoch": 1.4387361312108056, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.02, "step": 5965 }, { "epoch": 1.4399421128798842, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.078, "step": 5970 }, { "epoch": 1.441148094548963, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.0567, "step": 5975 }, { "epoch": 1.4423540762180416, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.0969, "step": 5980 }, { "epoch": 1.4435600578871202, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.1273, "step": 5985 }, { "epoch": 1.4447660395561988, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0941, "step": 5990 }, { "epoch": 1.4459720212252773, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 3.9656, "step": 5995 }, { "epoch": 1.447178002894356, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.1771, "step": 6000 }, { "epoch": 1.4483839845634345, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.1066, "step": 6005 }, { "epoch": 1.4495899662325131, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 3.9577, "step": 6010 }, { "epoch": 1.450795947901592, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.151, "step": 6015 }, { "epoch": 1.4520019295706705, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0168, "step": 6020 }, { "epoch": 1.4532079112397491, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.0785, "step": 6025 }, { "epoch": 1.4544138929088277, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.0594, "step": 6030 }, { "epoch": 1.4556198745779065, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0033, "step": 6035 }, { "epoch": 1.4568258562469851, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.0501, "step": 6040 }, { "epoch": 1.4580318379160637, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 3.9919, "step": 6045 }, { "epoch": 1.4592378195851423, "grad_norm": 2.703125, "learning_rate": 3e-05, "loss": 4.0784, "step": 6050 }, { "epoch": 1.460443801254221, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 3.994, "step": 6055 }, { "epoch": 1.4616497829232995, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.1935, "step": 6060 }, { "epoch": 1.462855764592378, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.1135, "step": 6065 }, { "epoch": 1.464061746261457, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 4.0636, "step": 6070 }, { "epoch": 1.4652677279305355, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.087, "step": 6075 }, { "epoch": 1.466473709599614, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.1237, "step": 6080 }, { "epoch": 1.4676796912686927, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.098, "step": 6085 }, { "epoch": 1.4688856729377713, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.2113, "step": 6090 }, { "epoch": 1.47009165460685, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.0715, "step": 6095 }, { "epoch": 1.4712976362759287, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.1509, "step": 6100 }, { "epoch": 1.4725036179450073, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 4.0719, "step": 6105 }, { "epoch": 1.4737095996140859, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 4.2094, "step": 6110 }, { "epoch": 1.4749155812831645, "grad_norm": 2.71875, "learning_rate": 3e-05, "loss": 4.1142, "step": 6115 }, { "epoch": 1.476121562952243, "grad_norm": 2.671875, "learning_rate": 3e-05, "loss": 4.1545, "step": 6120 }, { "epoch": 1.4773275446213217, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.0612, "step": 6125 }, { "epoch": 1.4785335262904005, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.1144, "step": 6130 }, { "epoch": 1.479739507959479, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.1587, "step": 6135 }, { "epoch": 1.4809454896285577, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.026, "step": 6140 }, { "epoch": 1.4821514712976362, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.0831, "step": 6145 }, { "epoch": 1.4833574529667148, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.9802, "step": 6150 }, { "epoch": 1.4845634346357937, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 3.997, "step": 6155 }, { "epoch": 1.4857694163048722, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.023, "step": 6160 }, { "epoch": 1.4869753979739508, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.073, "step": 6165 }, { "epoch": 1.4881813796430294, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0931, "step": 6170 }, { "epoch": 1.489387361312108, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.0449, "step": 6175 }, { "epoch": 1.4905933429811866, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.061, "step": 6180 }, { "epoch": 1.4917993246502652, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.1143, "step": 6185 }, { "epoch": 1.493005306319344, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 3.9468, "step": 6190 }, { "epoch": 1.4942112879884226, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.1211, "step": 6195 }, { "epoch": 1.4954172696575012, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 3.9431, "step": 6200 }, { "epoch": 1.4966232513265798, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 3.9994, "step": 6205 }, { "epoch": 1.4978292329956584, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0697, "step": 6210 }, { "epoch": 1.4990352146647372, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0316, "step": 6215 }, { "epoch": 1.5002411963338158, "grad_norm": 2.75, "learning_rate": 3e-05, "loss": 4.0617, "step": 6220 }, { "epoch": 1.5014471780028944, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 3.9171, "step": 6225 }, { "epoch": 1.502653159671973, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.0316, "step": 6230 }, { "epoch": 1.5038591413410516, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0101, "step": 6235 }, { "epoch": 1.5050651230101302, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.1336, "step": 6240 }, { "epoch": 1.5062711046792088, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.1254, "step": 6245 }, { "epoch": 1.5074770863482874, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.1024, "step": 6250 }, { "epoch": 1.5086830680173662, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 4.1214, "step": 6255 }, { "epoch": 1.5098890496864448, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.1627, "step": 6260 }, { "epoch": 1.5110950313555234, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0722, "step": 6265 }, { "epoch": 1.5123010130246022, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 4.0688, "step": 6270 }, { "epoch": 1.5135069946936808, "grad_norm": 2.796875, "learning_rate": 3e-05, "loss": 4.2566, "step": 6275 }, { "epoch": 1.5147129763627594, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 3.9691, "step": 6280 }, { "epoch": 1.515918958031838, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1557, "step": 6285 }, { "epoch": 1.5171249397009166, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.1215, "step": 6290 }, { "epoch": 1.5183309213699951, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.0001, "step": 6295 }, { "epoch": 1.5195369030390737, "grad_norm": 2.875, "learning_rate": 3e-05, "loss": 4.0917, "step": 6300 }, { "epoch": 1.5207428847081523, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 3.958, "step": 6305 }, { "epoch": 1.521948866377231, "grad_norm": 2.625, "learning_rate": 3e-05, "loss": 3.9232, "step": 6310 }, { "epoch": 1.5231548480463097, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 4.0597, "step": 6315 }, { "epoch": 1.5243608297153883, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0826, "step": 6320 }, { "epoch": 1.525566811384467, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 4.0404, "step": 6325 }, { "epoch": 1.5267727930535457, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.0571, "step": 6330 }, { "epoch": 1.5279787747226243, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.0195, "step": 6335 }, { "epoch": 1.529184756391703, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.1032, "step": 6340 }, { "epoch": 1.5303907380607815, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 3.9927, "step": 6345 }, { "epoch": 1.5315967197298601, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0574, "step": 6350 }, { "epoch": 1.5328027013989387, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 3.9402, "step": 6355 }, { "epoch": 1.5340086830680173, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0778, "step": 6360 }, { "epoch": 1.535214664737096, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 3.9871, "step": 6365 }, { "epoch": 1.5364206464061745, "grad_norm": 2.828125, "learning_rate": 3e-05, "loss": 4.0588, "step": 6370 }, { "epoch": 1.5376266280752533, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 3.9708, "step": 6375 }, { "epoch": 1.538832609744332, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 3.9978, "step": 6380 }, { "epoch": 1.5400385914134105, "grad_norm": 2.84375, "learning_rate": 3e-05, "loss": 3.9885, "step": 6385 }, { "epoch": 1.5412445730824893, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.0532, "step": 6390 }, { "epoch": 1.542450554751568, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.0937, "step": 6395 }, { "epoch": 1.5436565364206465, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.013, "step": 6400 }, { "epoch": 1.544862518089725, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.0434, "step": 6405 }, { "epoch": 1.5460684997588037, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.0633, "step": 6410 }, { "epoch": 1.5472744814278823, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.0284, "step": 6415 }, { "epoch": 1.5484804630969609, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.1968, "step": 6420 }, { "epoch": 1.5496864447660395, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 3.8845, "step": 6425 }, { "epoch": 1.550892426435118, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0771, "step": 6430 }, { "epoch": 1.5520984081041969, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.181, "step": 6435 }, { "epoch": 1.5533043897732755, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.0284, "step": 6440 }, { "epoch": 1.554510371442354, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.1828, "step": 6445 }, { "epoch": 1.5557163531114329, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.1427, "step": 6450 }, { "epoch": 1.5569223347805115, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 3.9848, "step": 6455 }, { "epoch": 1.55812831644959, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 3.9284, "step": 6460 }, { "epoch": 1.5593342981186686, "grad_norm": 3.09375, "learning_rate": 3e-05, "loss": 4.1667, "step": 6465 }, { "epoch": 1.5605402797877472, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.1201, "step": 6470 }, { "epoch": 1.5617462614568258, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 4.0102, "step": 6475 }, { "epoch": 1.5629522431259044, "grad_norm": 2.9375, "learning_rate": 3e-05, "loss": 3.9665, "step": 6480 }, { "epoch": 1.564158224794983, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 3.984, "step": 6485 }, { "epoch": 1.5653642064640616, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 3.9308, "step": 6490 }, { "epoch": 1.5665701881331404, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.086, "step": 6495 }, { "epoch": 1.567776169802219, "grad_norm": 2.84375, "learning_rate": 3e-05, "loss": 4.0952, "step": 6500 }, { "epoch": 1.5689821514712976, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.2551, "step": 6505 }, { "epoch": 1.5701881331403764, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 3.868, "step": 6510 }, { "epoch": 1.571394114809455, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.1807, "step": 6515 }, { "epoch": 1.5726000964785336, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1749, "step": 6520 }, { "epoch": 1.5738060781476122, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 3.9546, "step": 6525 }, { "epoch": 1.5750120598166908, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 3.9979, "step": 6530 }, { "epoch": 1.5762180414857694, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.0671, "step": 6535 }, { "epoch": 1.577424023154848, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.0977, "step": 6540 }, { "epoch": 1.5786300048239266, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0935, "step": 6545 }, { "epoch": 1.5798359864930052, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0934, "step": 6550 }, { "epoch": 1.581041968162084, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.0132, "step": 6555 }, { "epoch": 1.5822479498311626, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 4.1223, "step": 6560 }, { "epoch": 1.5834539315002412, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.0892, "step": 6565 }, { "epoch": 1.58465991316932, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.0163, "step": 6570 }, { "epoch": 1.5858658948383986, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.0977, "step": 6575 }, { "epoch": 1.5870718765074772, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.0438, "step": 6580 }, { "epoch": 1.5882778581765558, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.1282, "step": 6585 }, { "epoch": 1.5894838398456343, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0903, "step": 6590 }, { "epoch": 1.590689821514713, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.0423, "step": 6595 }, { "epoch": 1.5918958031837915, "grad_norm": 5.8125, "learning_rate": 3e-05, "loss": 4.1039, "step": 6600 }, { "epoch": 1.5931017848528701, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.2106, "step": 6605 }, { "epoch": 1.5943077665219487, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 3.9636, "step": 6610 }, { "epoch": 1.5955137481910275, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0388, "step": 6615 }, { "epoch": 1.5967197298601061, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0383, "step": 6620 }, { "epoch": 1.5979257115291847, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 3.9126, "step": 6625 }, { "epoch": 1.5991316931982635, "grad_norm": 7.21875, "learning_rate": 3e-05, "loss": 4.149, "step": 6630 }, { "epoch": 1.6003376748673421, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.9852, "step": 6635 }, { "epoch": 1.6015436565364207, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.1079, "step": 6640 }, { "epoch": 1.6027496382054993, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.1275, "step": 6645 }, { "epoch": 1.603955619874578, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.1051, "step": 6650 }, { "epoch": 1.6051616015436565, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.1882, "step": 6655 }, { "epoch": 1.606367583212735, "grad_norm": 5.75, "learning_rate": 3e-05, "loss": 4.0404, "step": 6660 }, { "epoch": 1.6075735648818137, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 3.9786, "step": 6665 }, { "epoch": 1.6087795465508923, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0706, "step": 6670 }, { "epoch": 1.609985528219971, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 4.1825, "step": 6675 }, { "epoch": 1.6111915098890497, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0734, "step": 6680 }, { "epoch": 1.6123974915581283, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.1595, "step": 6685 }, { "epoch": 1.613603473227207, "grad_norm": 5.96875, "learning_rate": 3e-05, "loss": 4.1883, "step": 6690 }, { "epoch": 1.6148094548962857, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.1794, "step": 6695 }, { "epoch": 1.6160154365653643, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.092, "step": 6700 }, { "epoch": 1.6172214182344429, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 3.8939, "step": 6705 }, { "epoch": 1.6184273999035215, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.0475, "step": 6710 }, { "epoch": 1.6196333815726, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1587, "step": 6715 }, { "epoch": 1.6208393632416787, "grad_norm": 2.4375, "learning_rate": 3e-05, "loss": 3.9752, "step": 6720 }, { "epoch": 1.6220453449107572, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.1102, "step": 6725 }, { "epoch": 1.6232513265798358, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0718, "step": 6730 }, { "epoch": 1.6244573082489147, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 3.9342, "step": 6735 }, { "epoch": 1.6256632899179932, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.1315, "step": 6740 }, { "epoch": 1.6268692715870718, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.0222, "step": 6745 }, { "epoch": 1.6280752532561507, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.042, "step": 6750 }, { "epoch": 1.6292812349252292, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.2125, "step": 6755 }, { "epoch": 1.6304872165943078, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.1834, "step": 6760 }, { "epoch": 1.6316931982633864, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.0591, "step": 6765 }, { "epoch": 1.632899179932465, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.1278, "step": 6770 }, { "epoch": 1.6341051616015436, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 3.9817, "step": 6775 }, { "epoch": 1.6353111432706222, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.0339, "step": 6780 }, { "epoch": 1.6365171249397008, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0388, "step": 6785 }, { "epoch": 1.6377231066087794, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 3.9839, "step": 6790 }, { "epoch": 1.6389290882778582, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.9976, "step": 6795 }, { "epoch": 1.6401350699469368, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.147, "step": 6800 }, { "epoch": 1.6413410516160154, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.1033, "step": 6805 }, { "epoch": 1.6425470332850942, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 4.2364, "step": 6810 }, { "epoch": 1.6437530149541728, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 4.1362, "step": 6815 }, { "epoch": 1.6449589966232514, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.1039, "step": 6820 }, { "epoch": 1.64616497829233, "grad_norm": 6.03125, "learning_rate": 3e-05, "loss": 3.9782, "step": 6825 }, { "epoch": 1.6473709599614086, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 3.9864, "step": 6830 }, { "epoch": 1.6485769416304872, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 4.0008, "step": 6835 }, { "epoch": 1.6497829232995658, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 3.9438, "step": 6840 }, { "epoch": 1.6509889049686444, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 3.9866, "step": 6845 }, { "epoch": 1.652194886637723, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.1417, "step": 6850 }, { "epoch": 1.6534008683068018, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.1054, "step": 6855 }, { "epoch": 1.6546068499758804, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.9584, "step": 6860 }, { "epoch": 1.655812831644959, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 4.1609, "step": 6865 }, { "epoch": 1.6570188133140378, "grad_norm": 5.21875, "learning_rate": 3e-05, "loss": 4.0505, "step": 6870 }, { "epoch": 1.6582247949831164, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 4.0725, "step": 6875 }, { "epoch": 1.659430776652195, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.2059, "step": 6880 }, { "epoch": 1.6606367583212736, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.1739, "step": 6885 }, { "epoch": 1.6618427399903521, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.0417, "step": 6890 }, { "epoch": 1.6630487216594307, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 4.1691, "step": 6895 }, { "epoch": 1.6642547033285093, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0885, "step": 6900 }, { "epoch": 1.665460684997588, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0171, "step": 6905 }, { "epoch": 1.6666666666666665, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.0843, "step": 6910 }, { "epoch": 1.6678726483357453, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 3.9988, "step": 6915 }, { "epoch": 1.669078630004824, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.2238, "step": 6920 }, { "epoch": 1.6702846116739025, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.1999, "step": 6925 }, { "epoch": 1.6714905933429813, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.1223, "step": 6930 }, { "epoch": 1.67269657501206, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.0065, "step": 6935 }, { "epoch": 1.6739025566811385, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0256, "step": 6940 }, { "epoch": 1.6751085383502171, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.0496, "step": 6945 }, { "epoch": 1.6763145200192957, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.1024, "step": 6950 }, { "epoch": 1.6775205016883743, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.9367, "step": 6955 }, { "epoch": 1.678726483357453, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0398, "step": 6960 }, { "epoch": 1.6799324650265315, "grad_norm": 5.125, "learning_rate": 3e-05, "loss": 4.1259, "step": 6965 }, { "epoch": 1.68113844669561, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.0163, "step": 6970 }, { "epoch": 1.682344428364689, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.1114, "step": 6975 }, { "epoch": 1.6835504100337675, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 3.9766, "step": 6980 }, { "epoch": 1.684756391702846, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.0283, "step": 6985 }, { "epoch": 1.685962373371925, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.1656, "step": 6990 }, { "epoch": 1.6871683550410035, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 3.9653, "step": 6995 }, { "epoch": 1.688374336710082, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 4.0377, "step": 7000 }, { "epoch": 1.6895803183791607, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.0417, "step": 7005 }, { "epoch": 1.6907863000482393, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.0365, "step": 7010 }, { "epoch": 1.6919922817173179, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 3.9801, "step": 7015 }, { "epoch": 1.6931982633863965, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0513, "step": 7020 }, { "epoch": 1.694404245055475, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 3.943, "step": 7025 }, { "epoch": 1.6956102267245536, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 4.089, "step": 7030 }, { "epoch": 1.6968162083936325, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 3.9218, "step": 7035 }, { "epoch": 1.698022190062711, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.1588, "step": 7040 }, { "epoch": 1.6992281717317896, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.0247, "step": 7045 }, { "epoch": 1.7004341534008685, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0594, "step": 7050 }, { "epoch": 1.701640135069947, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.005, "step": 7055 }, { "epoch": 1.7028461167390256, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0014, "step": 7060 }, { "epoch": 1.7040520984081042, "grad_norm": 5.75, "learning_rate": 3e-05, "loss": 3.988, "step": 7065 }, { "epoch": 1.7052580800771828, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.1577, "step": 7070 }, { "epoch": 1.7064640617462614, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 3.9659, "step": 7075 }, { "epoch": 1.70767004341534, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.179, "step": 7080 }, { "epoch": 1.7088760250844186, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0521, "step": 7085 }, { "epoch": 1.7100820067534972, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 3.9942, "step": 7090 }, { "epoch": 1.711287988422576, "grad_norm": 3.953125, "learning_rate": 3e-05, "loss": 4.0061, "step": 7095 }, { "epoch": 1.7124939700916546, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9869, "step": 7100 }, { "epoch": 1.7136999517607332, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 3.9908, "step": 7105 }, { "epoch": 1.714905933429812, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0848, "step": 7110 }, { "epoch": 1.7161119150988906, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.0721, "step": 7115 }, { "epoch": 1.7173178967679692, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.978, "step": 7120 }, { "epoch": 1.7185238784370478, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1058, "step": 7125 }, { "epoch": 1.7197298601061264, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.2165, "step": 7130 }, { "epoch": 1.720935841775205, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 3.9482, "step": 7135 }, { "epoch": 1.7221418234442836, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.2209, "step": 7140 }, { "epoch": 1.7233478051133622, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.9868, "step": 7145 }, { "epoch": 1.7245537867824408, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.0049, "step": 7150 }, { "epoch": 1.7257597684515196, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.9966, "step": 7155 }, { "epoch": 1.7269657501205982, "grad_norm": 2.796875, "learning_rate": 3e-05, "loss": 4.017, "step": 7160 }, { "epoch": 1.7281717317896768, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 3.9384, "step": 7165 }, { "epoch": 1.7293777134587556, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.0184, "step": 7170 }, { "epoch": 1.7305836951278342, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0153, "step": 7175 }, { "epoch": 1.7317896767969128, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.1265, "step": 7180 }, { "epoch": 1.7329956584659914, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0462, "step": 7185 }, { "epoch": 1.73420164013507, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 3.9652, "step": 7190 }, { "epoch": 1.7354076218041485, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.0858, "step": 7195 }, { "epoch": 1.7366136034732271, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.0721, "step": 7200 }, { "epoch": 1.7378195851423057, "grad_norm": 5.15625, "learning_rate": 3e-05, "loss": 4.0116, "step": 7205 }, { "epoch": 1.7390255668113843, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.0017, "step": 7210 }, { "epoch": 1.7402315484804631, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.0213, "step": 7215 }, { "epoch": 1.7414375301495417, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0262, "step": 7220 }, { "epoch": 1.7426435118186203, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.1149, "step": 7225 }, { "epoch": 1.7438494934876991, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.1181, "step": 7230 }, { "epoch": 1.7450554751567777, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.1266, "step": 7235 }, { "epoch": 1.7462614568258563, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 3.9725, "step": 7240 }, { "epoch": 1.747467438494935, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 3.9636, "step": 7245 }, { "epoch": 1.7486734201640135, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 3.84, "step": 7250 }, { "epoch": 1.749879401833092, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.0006, "step": 7255 }, { "epoch": 1.7510853835021707, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.3034, "step": 7260 }, { "epoch": 1.7522913651712493, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.1396, "step": 7265 }, { "epoch": 1.7534973468403279, "grad_norm": 2.75, "learning_rate": 3e-05, "loss": 4.1083, "step": 7270 }, { "epoch": 1.7547033285094067, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.06, "step": 7275 }, { "epoch": 1.7559093101784853, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 4.1536, "step": 7280 }, { "epoch": 1.7571152918475639, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.0071, "step": 7285 }, { "epoch": 1.7583212735166427, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.2143, "step": 7290 }, { "epoch": 1.7595272551857213, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.0663, "step": 7295 }, { "epoch": 1.7607332368547999, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.0093, "step": 7300 }, { "epoch": 1.7619392185238785, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.0823, "step": 7305 }, { "epoch": 1.763145200192957, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.0331, "step": 7310 }, { "epoch": 1.7643511818620357, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 3.9956, "step": 7315 }, { "epoch": 1.7655571635311142, "grad_norm": 2.75, "learning_rate": 3e-05, "loss": 3.9552, "step": 7320 }, { "epoch": 1.7667631452001928, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 3.9776, "step": 7325 }, { "epoch": 1.7679691268692714, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.1784, "step": 7330 }, { "epoch": 1.7691751085383502, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 4.0413, "step": 7335 }, { "epoch": 1.7703810902074288, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.0568, "step": 7340 }, { "epoch": 1.7715870718765074, "grad_norm": 2.640625, "learning_rate": 3e-05, "loss": 3.9845, "step": 7345 }, { "epoch": 1.7727930535455863, "grad_norm": 2.796875, "learning_rate": 3e-05, "loss": 3.9184, "step": 7350 }, { "epoch": 1.7739990352146648, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 3.8063, "step": 7355 }, { "epoch": 1.7752050168837434, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 4.2464, "step": 7360 }, { "epoch": 1.776410998552822, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.1772, "step": 7365 }, { "epoch": 1.7776169802219006, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.1838, "step": 7370 }, { "epoch": 1.7788229618909792, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.1762, "step": 7375 }, { "epoch": 1.7800289435600578, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 3.915, "step": 7380 }, { "epoch": 1.7812349252291364, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.0977, "step": 7385 }, { "epoch": 1.782440906898215, "grad_norm": 2.59375, "learning_rate": 3e-05, "loss": 4.0152, "step": 7390 }, { "epoch": 1.7836468885672938, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.1677, "step": 7395 }, { "epoch": 1.7848528702363724, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0483, "step": 7400 }, { "epoch": 1.786058851905451, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.0789, "step": 7405 }, { "epoch": 1.7872648335745298, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.1165, "step": 7410 }, { "epoch": 1.7884708152436084, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.3544, "step": 7415 }, { "epoch": 1.789676796912687, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 3.9878, "step": 7420 }, { "epoch": 1.7908827785817656, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.0127, "step": 7425 }, { "epoch": 1.7920887602508442, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.1269, "step": 7430 }, { "epoch": 1.7932947419199228, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.164, "step": 7435 }, { "epoch": 1.7945007235890014, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0352, "step": 7440 }, { "epoch": 1.79570670525808, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.1365, "step": 7445 }, { "epoch": 1.7969126869271586, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.0127, "step": 7450 }, { "epoch": 1.7981186685962374, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 3.9628, "step": 7455 }, { "epoch": 1.799324650265316, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.2594, "step": 7460 }, { "epoch": 1.8005306319343946, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.0675, "step": 7465 }, { "epoch": 1.8017366136034734, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 3.9905, "step": 7470 }, { "epoch": 1.802942595272552, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.0903, "step": 7475 }, { "epoch": 1.8041485769416306, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.1105, "step": 7480 }, { "epoch": 1.8053545586107091, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.0873, "step": 7485 }, { "epoch": 1.8065605402797877, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.1105, "step": 7490 }, { "epoch": 1.8077665219488663, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1356, "step": 7495 }, { "epoch": 1.808972503617945, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.0782, "step": 7500 }, { "epoch": 1.8101784852870235, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0084, "step": 7505 }, { "epoch": 1.8113844669561021, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0238, "step": 7510 }, { "epoch": 1.812590448625181, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.1001, "step": 7515 }, { "epoch": 1.8137964302942595, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 3.9716, "step": 7520 }, { "epoch": 1.8150024119633381, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.1145, "step": 7525 }, { "epoch": 1.816208393632417, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.0803, "step": 7530 }, { "epoch": 1.8174143753014955, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0458, "step": 7535 }, { "epoch": 1.8186203569705741, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.1574, "step": 7540 }, { "epoch": 1.8198263386396527, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.001, "step": 7545 }, { "epoch": 1.8210323203087313, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 3.9705, "step": 7550 }, { "epoch": 1.82223830197781, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0168, "step": 7555 }, { "epoch": 1.8234442836468885, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0009, "step": 7560 }, { "epoch": 1.824650265315967, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 4.0045, "step": 7565 }, { "epoch": 1.8258562469850457, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.0126, "step": 7570 }, { "epoch": 1.8270622286541245, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.1587, "step": 7575 }, { "epoch": 1.828268210323203, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.0435, "step": 7580 }, { "epoch": 1.8294741919922817, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 4.147, "step": 7585 }, { "epoch": 1.8306801736613605, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 4.0213, "step": 7590 }, { "epoch": 1.831886155330439, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.1076, "step": 7595 }, { "epoch": 1.8330921369995177, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 3.9983, "step": 7600 }, { "epoch": 1.8342981186685963, "grad_norm": 7.0, "learning_rate": 3e-05, "loss": 4.2011, "step": 7605 }, { "epoch": 1.8355041003376749, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 3.8075, "step": 7610 }, { "epoch": 1.8367100820067535, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.1202, "step": 7615 }, { "epoch": 1.837916063675832, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1142, "step": 7620 }, { "epoch": 1.8391220453449106, "grad_norm": 5.46875, "learning_rate": 3e-05, "loss": 4.076, "step": 7625 }, { "epoch": 1.8403280270139892, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.1685, "step": 7630 }, { "epoch": 1.841534008683068, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 3.972, "step": 7635 }, { "epoch": 1.8427399903521466, "grad_norm": 2.484375, "learning_rate": 3e-05, "loss": 3.9155, "step": 7640 }, { "epoch": 1.8439459720212252, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.0701, "step": 7645 }, { "epoch": 1.845151953690304, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 3.9197, "step": 7650 }, { "epoch": 1.8463579353593826, "grad_norm": 6.21875, "learning_rate": 3e-05, "loss": 3.9967, "step": 7655 }, { "epoch": 1.8475639170284612, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.1828, "step": 7660 }, { "epoch": 1.8487698986975398, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0495, "step": 7665 }, { "epoch": 1.8499758803666184, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.1876, "step": 7670 }, { "epoch": 1.851181862035697, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.1964, "step": 7675 }, { "epoch": 1.8523878437047756, "grad_norm": 5.0, "learning_rate": 3e-05, "loss": 4.0717, "step": 7680 }, { "epoch": 1.8535938253738542, "grad_norm": 5.625, "learning_rate": 3e-05, "loss": 4.0, "step": 7685 }, { "epoch": 1.8547998070429328, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.0193, "step": 7690 }, { "epoch": 1.8560057887120116, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 3.9337, "step": 7695 }, { "epoch": 1.8572117703810902, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.0295, "step": 7700 }, { "epoch": 1.8584177520501688, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0421, "step": 7705 }, { "epoch": 1.8596237337192476, "grad_norm": 5.125, "learning_rate": 3e-05, "loss": 4.018, "step": 7710 }, { "epoch": 1.8608297153883262, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 3.9413, "step": 7715 }, { "epoch": 1.8620356970574048, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.0171, "step": 7720 }, { "epoch": 1.8632416787264834, "grad_norm": 5.71875, "learning_rate": 3e-05, "loss": 4.2767, "step": 7725 }, { "epoch": 1.864447660395562, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0138, "step": 7730 }, { "epoch": 1.8656536420646406, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 4.2037, "step": 7735 }, { "epoch": 1.8668596237337192, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 3.9941, "step": 7740 }, { "epoch": 1.8680656054027978, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 3.9648, "step": 7745 }, { "epoch": 1.8692715870718764, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 3.9468, "step": 7750 }, { "epoch": 1.8704775687409552, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.1002, "step": 7755 }, { "epoch": 1.8716835504100338, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.0032, "step": 7760 }, { "epoch": 1.8728895320791124, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 3.9906, "step": 7765 }, { "epoch": 1.8740955137481912, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.1352, "step": 7770 }, { "epoch": 1.8753014954172698, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 3.974, "step": 7775 }, { "epoch": 1.8765074770863484, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0917, "step": 7780 }, { "epoch": 1.877713458755427, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9602, "step": 7785 }, { "epoch": 1.8789194404245055, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 3.916, "step": 7790 }, { "epoch": 1.8801254220935841, "grad_norm": 2.734375, "learning_rate": 3e-05, "loss": 4.0258, "step": 7795 }, { "epoch": 1.8813314037626627, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.1164, "step": 7800 }, { "epoch": 1.8825373854317413, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.9327, "step": 7805 }, { "epoch": 1.88374336710082, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 3.9971, "step": 7810 }, { "epoch": 1.8849493487698987, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.1488, "step": 7815 }, { "epoch": 1.8861553304389773, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.0744, "step": 7820 }, { "epoch": 1.887361312108056, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.2581, "step": 7825 }, { "epoch": 1.8885672937771347, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 3.8083, "step": 7830 }, { "epoch": 1.8897732754462133, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.1401, "step": 7835 }, { "epoch": 1.890979257115292, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.1043, "step": 7840 }, { "epoch": 1.8921852387843705, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 3.9081, "step": 7845 }, { "epoch": 1.893391220453449, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.158, "step": 7850 }, { "epoch": 1.8945972021225277, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.0827, "step": 7855 }, { "epoch": 1.8958031837916063, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.9377, "step": 7860 }, { "epoch": 1.8970091654606849, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.1199, "step": 7865 }, { "epoch": 1.8982151471297635, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 3.9251, "step": 7870 }, { "epoch": 1.8994211287988423, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.0988, "step": 7875 }, { "epoch": 1.9006271104679209, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.0778, "step": 7880 }, { "epoch": 1.9018330921369995, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.0134, "step": 7885 }, { "epoch": 1.9030390738060783, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.1007, "step": 7890 }, { "epoch": 1.9042450554751569, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 4.1435, "step": 7895 }, { "epoch": 1.9054510371442355, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0369, "step": 7900 }, { "epoch": 1.906657018813314, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.1594, "step": 7905 }, { "epoch": 1.9078630004823927, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.013, "step": 7910 }, { "epoch": 1.9090689821514712, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.0276, "step": 7915 }, { "epoch": 1.9102749638205498, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.0516, "step": 7920 }, { "epoch": 1.9114809454896284, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 3.9601, "step": 7925 }, { "epoch": 1.912686927158707, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 3.9503, "step": 7930 }, { "epoch": 1.9138929088277858, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 3.9998, "step": 7935 }, { "epoch": 1.9150988904968644, "grad_norm": 2.78125, "learning_rate": 3e-05, "loss": 3.9856, "step": 7940 }, { "epoch": 1.916304872165943, "grad_norm": 2.421875, "learning_rate": 3e-05, "loss": 4.0189, "step": 7945 }, { "epoch": 1.9175108538350218, "grad_norm": 2.953125, "learning_rate": 3e-05, "loss": 4.0314, "step": 7950 }, { "epoch": 1.9187168355041004, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.1042, "step": 7955 }, { "epoch": 1.919922817173179, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.0222, "step": 7960 }, { "epoch": 1.9211287988422576, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 4.0856, "step": 7965 }, { "epoch": 1.9223347805113362, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0166, "step": 7970 }, { "epoch": 1.9235407621804148, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.9457, "step": 7975 }, { "epoch": 1.9247467438494934, "grad_norm": 5.40625, "learning_rate": 3e-05, "loss": 4.0057, "step": 7980 }, { "epoch": 1.925952725518572, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.1287, "step": 7985 }, { "epoch": 1.9271587071876506, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0278, "step": 7990 }, { "epoch": 1.9283646888567294, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.9078, "step": 7995 }, { "epoch": 1.929570670525808, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.0745, "step": 8000 }, { "epoch": 1.9307766521948866, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 4.2001, "step": 8005 }, { "epoch": 1.9319826338639654, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 4.0104, "step": 8010 }, { "epoch": 1.933188615533044, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.0205, "step": 8015 }, { "epoch": 1.9343945972021226, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.0449, "step": 8020 }, { "epoch": 1.9356005788712012, "grad_norm": 6.21875, "learning_rate": 3e-05, "loss": 3.9851, "step": 8025 }, { "epoch": 1.9368065605402798, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.0748, "step": 8030 }, { "epoch": 1.9380125422093584, "grad_norm": 5.0, "learning_rate": 3e-05, "loss": 4.0179, "step": 8035 }, { "epoch": 1.939218523878437, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0333, "step": 8040 }, { "epoch": 1.9404245055475156, "grad_norm": 2.921875, "learning_rate": 3e-05, "loss": 4.0861, "step": 8045 }, { "epoch": 1.9416304872165941, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.0446, "step": 8050 }, { "epoch": 1.942836468885673, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 3.9874, "step": 8055 }, { "epoch": 1.9440424505547516, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0757, "step": 8060 }, { "epoch": 1.9452484322238301, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0877, "step": 8065 }, { "epoch": 1.946454413892909, "grad_norm": 5.96875, "learning_rate": 3e-05, "loss": 4.0475, "step": 8070 }, { "epoch": 1.9476603955619876, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.9971, "step": 8075 }, { "epoch": 1.9488663772310661, "grad_norm": 6.0, "learning_rate": 3e-05, "loss": 4.009, "step": 8080 }, { "epoch": 1.9500723589001447, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.0473, "step": 8085 }, { "epoch": 1.9512783405692233, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 4.1683, "step": 8090 }, { "epoch": 1.952484322238302, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 3.9103, "step": 8095 }, { "epoch": 1.9536903039073805, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 3.9012, "step": 8100 }, { "epoch": 1.9548962855764591, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.1357, "step": 8105 }, { "epoch": 1.9561022672455377, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 3.9128, "step": 8110 }, { "epoch": 1.9573082489146165, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 3.9138, "step": 8115 }, { "epoch": 1.9585142305836951, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.0067, "step": 8120 }, { "epoch": 1.9597202122527737, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 4.1704, "step": 8125 }, { "epoch": 1.9609261939218525, "grad_norm": 5.75, "learning_rate": 3e-05, "loss": 3.8835, "step": 8130 }, { "epoch": 1.9621321755909311, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.8896, "step": 8135 }, { "epoch": 1.9633381572600097, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0304, "step": 8140 }, { "epoch": 1.9645441389290883, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.1229, "step": 8145 }, { "epoch": 1.965750120598167, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.2015, "step": 8150 }, { "epoch": 1.9669561022672455, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 3.9445, "step": 8155 }, { "epoch": 1.968162083936324, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 3.9399, "step": 8160 }, { "epoch": 1.9693680656054027, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.0125, "step": 8165 }, { "epoch": 1.9705740472744813, "grad_norm": 3.953125, "learning_rate": 3e-05, "loss": 3.9463, "step": 8170 }, { "epoch": 1.97178002894356, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.1029, "step": 8175 }, { "epoch": 1.9729860106126387, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1188, "step": 8180 }, { "epoch": 1.9741919922817173, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 4.082, "step": 8185 }, { "epoch": 1.975397973950796, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 3.9607, "step": 8190 }, { "epoch": 1.9766039556198747, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 3.936, "step": 8195 }, { "epoch": 1.9778099372889533, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 3.9766, "step": 8200 }, { "epoch": 1.9790159189580319, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.1127, "step": 8205 }, { "epoch": 1.9802219006271105, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0685, "step": 8210 }, { "epoch": 1.981427882296189, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.1434, "step": 8215 }, { "epoch": 1.9826338639652676, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0038, "step": 8220 }, { "epoch": 1.9838398456343462, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 3.9862, "step": 8225 }, { "epoch": 1.9850458273034248, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 3.9644, "step": 8230 }, { "epoch": 1.9862518089725036, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 3.8832, "step": 8235 }, { "epoch": 1.9874577906415822, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 3.9464, "step": 8240 }, { "epoch": 1.9886637723106608, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.2143, "step": 8245 }, { "epoch": 1.9898697539797396, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0192, "step": 8250 }, { "epoch": 1.9910757356488182, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 3.9347, "step": 8255 }, { "epoch": 1.9922817173178968, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.017, "step": 8260 }, { "epoch": 1.9934876989869754, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1018, "step": 8265 }, { "epoch": 1.994693680656054, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 3.913, "step": 8270 }, { "epoch": 1.9958996623251326, "grad_norm": 3.09375, "learning_rate": 3e-05, "loss": 3.9968, "step": 8275 }, { "epoch": 1.9971056439942112, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0281, "step": 8280 }, { "epoch": 1.9983116256632898, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 3.9992, "step": 8285 }, { "epoch": 1.9995176073323684, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.1548, "step": 8290 }, { "epoch": 2.000723589001447, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 3.9162, "step": 8295 }, { "epoch": 2.001929570670526, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 3.9886, "step": 8300 }, { "epoch": 2.0031355523396046, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.1288, "step": 8305 }, { "epoch": 2.004341534008683, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.071, "step": 8310 }, { "epoch": 2.005547515677762, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.062, "step": 8315 }, { "epoch": 2.0067534973468404, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.009, "step": 8320 }, { "epoch": 2.007959479015919, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 3.954, "step": 8325 }, { "epoch": 2.0091654606849976, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.0571, "step": 8330 }, { "epoch": 2.010371442354076, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 4.19, "step": 8335 }, { "epoch": 2.0115774240231548, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 3.9523, "step": 8340 }, { "epoch": 2.0127834056922334, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 3.9799, "step": 8345 }, { "epoch": 2.013989387361312, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 3.9742, "step": 8350 }, { "epoch": 2.0151953690303905, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0484, "step": 8355 }, { "epoch": 2.0164013506994696, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0885, "step": 8360 }, { "epoch": 2.017607332368548, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.0423, "step": 8365 }, { "epoch": 2.0188133140376268, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 3.9638, "step": 8370 }, { "epoch": 2.0200192957067054, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 3.9711, "step": 8375 }, { "epoch": 2.021225277375784, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 4.0327, "step": 8380 }, { "epoch": 2.0224312590448625, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.0118, "step": 8385 }, { "epoch": 2.023637240713941, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 3.9142, "step": 8390 }, { "epoch": 2.0248432223830197, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.0863, "step": 8395 }, { "epoch": 2.0260492040520983, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.0289, "step": 8400 }, { "epoch": 2.027255185721177, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.0391, "step": 8405 }, { "epoch": 2.0284611673902555, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.0212, "step": 8410 }, { "epoch": 2.029667149059334, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.053, "step": 8415 }, { "epoch": 2.030873130728413, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 3.9551, "step": 8420 }, { "epoch": 2.0320791123974917, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.0325, "step": 8425 }, { "epoch": 2.0332850940665703, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 3.9974, "step": 8430 }, { "epoch": 2.034491075735649, "grad_norm": 5.84375, "learning_rate": 3e-05, "loss": 4.0241, "step": 8435 }, { "epoch": 2.0356970574047275, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 3.9208, "step": 8440 }, { "epoch": 2.036903039073806, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0834, "step": 8445 }, { "epoch": 2.0381090207428847, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.0714, "step": 8450 }, { "epoch": 2.0393150024119633, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 3.9686, "step": 8455 }, { "epoch": 2.040520984081042, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.1466, "step": 8460 }, { "epoch": 2.0417269657501205, "grad_norm": 2.625, "learning_rate": 3e-05, "loss": 4.03, "step": 8465 }, { "epoch": 2.042932947419199, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.1465, "step": 8470 }, { "epoch": 2.0441389290882777, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.0427, "step": 8475 }, { "epoch": 2.0453449107573567, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 3.9399, "step": 8480 }, { "epoch": 2.0465508924264353, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.1075, "step": 8485 }, { "epoch": 2.047756874095514, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.1395, "step": 8490 }, { "epoch": 2.0489628557645925, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 3.916, "step": 8495 }, { "epoch": 2.050168837433671, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.9465, "step": 8500 }, { "epoch": 2.0513748191027497, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 4.0967, "step": 8505 }, { "epoch": 2.0525808007718283, "grad_norm": 2.578125, "learning_rate": 3e-05, "loss": 4.0344, "step": 8510 }, { "epoch": 2.053786782440907, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 4.2038, "step": 8515 }, { "epoch": 2.0549927641099854, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 3.9879, "step": 8520 }, { "epoch": 2.056198745779064, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 3.9827, "step": 8525 }, { "epoch": 2.0574047274481426, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.0989, "step": 8530 }, { "epoch": 2.058610709117221, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 3.9475, "step": 8535 }, { "epoch": 2.0598166907863003, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 3.8495, "step": 8540 }, { "epoch": 2.061022672455379, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 3.876, "step": 8545 }, { "epoch": 2.0622286541244574, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 3.9831, "step": 8550 }, { "epoch": 2.063434635793536, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.0698, "step": 8555 }, { "epoch": 2.0646406174626146, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.028, "step": 8560 }, { "epoch": 2.065846599131693, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 3.893, "step": 8565 }, { "epoch": 2.067052580800772, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0074, "step": 8570 }, { "epoch": 2.0682585624698504, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0383, "step": 8575 }, { "epoch": 2.069464544138929, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.0575, "step": 8580 }, { "epoch": 2.0706705258080076, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 3.9666, "step": 8585 }, { "epoch": 2.071876507477086, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 3.9089, "step": 8590 }, { "epoch": 2.0730824891461648, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.1263, "step": 8595 }, { "epoch": 2.074288470815244, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.0063, "step": 8600 }, { "epoch": 2.0754944524843224, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0378, "step": 8605 }, { "epoch": 2.076700434153401, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0002, "step": 8610 }, { "epoch": 2.0779064158224796, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 3.9879, "step": 8615 }, { "epoch": 2.079112397491558, "grad_norm": 3.953125, "learning_rate": 3e-05, "loss": 3.8187, "step": 8620 }, { "epoch": 2.0803183791606368, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 3.9555, "step": 8625 }, { "epoch": 2.0815243608297154, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0225, "step": 8630 }, { "epoch": 2.082730342498794, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 4.0721, "step": 8635 }, { "epoch": 2.0839363241678726, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.1471, "step": 8640 }, { "epoch": 2.085142305836951, "grad_norm": 8.0625, "learning_rate": 3e-05, "loss": 4.092, "step": 8645 }, { "epoch": 2.0863482875060297, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.009, "step": 8650 }, { "epoch": 2.0875542691751083, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.1142, "step": 8655 }, { "epoch": 2.0887602508441874, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.0324, "step": 8660 }, { "epoch": 2.089966232513266, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 3.9033, "step": 8665 }, { "epoch": 2.0911722141823446, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.0027, "step": 8670 }, { "epoch": 2.092378195851423, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 3.9593, "step": 8675 }, { "epoch": 2.0935841775205017, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 3.9982, "step": 8680 }, { "epoch": 2.0947901591895803, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 3.9786, "step": 8685 }, { "epoch": 2.095996140858659, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.1296, "step": 8690 }, { "epoch": 2.0972021225277375, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 3.9511, "step": 8695 }, { "epoch": 2.098408104196816, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.1093, "step": 8700 }, { "epoch": 2.0996140858658947, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.226, "step": 8705 }, { "epoch": 2.1008200675349733, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 3.929, "step": 8710 }, { "epoch": 2.102026049204052, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.9087, "step": 8715 }, { "epoch": 2.103232030873131, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.1009, "step": 8720 }, { "epoch": 2.1044380125422095, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 4.1078, "step": 8725 }, { "epoch": 2.105643994211288, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 3.9879, "step": 8730 }, { "epoch": 2.1068499758803667, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.019, "step": 8735 }, { "epoch": 2.1080559575494453, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.9734, "step": 8740 }, { "epoch": 2.109261939218524, "grad_norm": 6.6875, "learning_rate": 3e-05, "loss": 4.0955, "step": 8745 }, { "epoch": 2.1104679208876025, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.0287, "step": 8750 }, { "epoch": 2.111673902556681, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.0448, "step": 8755 }, { "epoch": 2.1128798842257597, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.1271, "step": 8760 }, { "epoch": 2.1140858658948383, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 3.9896, "step": 8765 }, { "epoch": 2.115291847563917, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0652, "step": 8770 }, { "epoch": 2.1164978292329955, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1651, "step": 8775 }, { "epoch": 2.1177038109020745, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 4.0744, "step": 8780 }, { "epoch": 2.118909792571153, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.0056, "step": 8785 }, { "epoch": 2.1201157742402317, "grad_norm": 5.0, "learning_rate": 3e-05, "loss": 3.9383, "step": 8790 }, { "epoch": 2.1213217559093103, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.0101, "step": 8795 }, { "epoch": 2.122527737578389, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9052, "step": 8800 }, { "epoch": 2.1237337192474675, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9625, "step": 8805 }, { "epoch": 2.124939700916546, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 3.9514, "step": 8810 }, { "epoch": 2.1261456825856246, "grad_norm": 2.765625, "learning_rate": 3e-05, "loss": 4.0969, "step": 8815 }, { "epoch": 2.1273516642547032, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0527, "step": 8820 }, { "epoch": 2.128557645923782, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0542, "step": 8825 }, { "epoch": 2.1297636275928604, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.0179, "step": 8830 }, { "epoch": 2.130969609261939, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 3.9508, "step": 8835 }, { "epoch": 2.132175590931018, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.059, "step": 8840 }, { "epoch": 2.1333815726000966, "grad_norm": 2.9375, "learning_rate": 3e-05, "loss": 4.1466, "step": 8845 }, { "epoch": 2.1345875542691752, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 3.9752, "step": 8850 }, { "epoch": 2.135793535938254, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 4.1284, "step": 8855 }, { "epoch": 2.1369995176073324, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 3.9328, "step": 8860 }, { "epoch": 2.138205499276411, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 3.9389, "step": 8865 }, { "epoch": 2.1394114809454896, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.0822, "step": 8870 }, { "epoch": 2.140617462614568, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.0629, "step": 8875 }, { "epoch": 2.141823444283647, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0269, "step": 8880 }, { "epoch": 2.1430294259527254, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 3.9227, "step": 8885 }, { "epoch": 2.144235407621804, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0259, "step": 8890 }, { "epoch": 2.1454413892908826, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 3.9517, "step": 8895 }, { "epoch": 2.1466473709599616, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 4.0769, "step": 8900 }, { "epoch": 2.14785335262904, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0197, "step": 8905 }, { "epoch": 2.149059334298119, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 3.9141, "step": 8910 }, { "epoch": 2.1502653159671974, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.0124, "step": 8915 }, { "epoch": 2.151471297636276, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 3.8614, "step": 8920 }, { "epoch": 2.1526772793053546, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 3.9036, "step": 8925 }, { "epoch": 2.153883260974433, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.0415, "step": 8930 }, { "epoch": 2.1550892426435118, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 4.1124, "step": 8935 }, { "epoch": 2.1562952243125904, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0552, "step": 8940 }, { "epoch": 2.157501205981669, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.1108, "step": 8945 }, { "epoch": 2.1587071876507475, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.2137, "step": 8950 }, { "epoch": 2.159913169319826, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 3.9945, "step": 8955 }, { "epoch": 2.161119150988905, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 3.9038, "step": 8960 }, { "epoch": 2.1623251326579838, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 3.8394, "step": 8965 }, { "epoch": 2.1635311143270624, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0667, "step": 8970 }, { "epoch": 2.164737095996141, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.0579, "step": 8975 }, { "epoch": 2.1659430776652195, "grad_norm": 6.15625, "learning_rate": 3e-05, "loss": 4.0854, "step": 8980 }, { "epoch": 2.167149059334298, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0286, "step": 8985 }, { "epoch": 2.1683550410033767, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 3.8164, "step": 8990 }, { "epoch": 2.1695610226724553, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.0588, "step": 8995 }, { "epoch": 2.170767004341534, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 3.9986, "step": 9000 }, { "epoch": 2.1719729860106125, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 3.9737, "step": 9005 }, { "epoch": 2.173178967679691, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 4.0542, "step": 9010 }, { "epoch": 2.1743849493487697, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.1171, "step": 9015 }, { "epoch": 2.1755909310178487, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 3.9761, "step": 9020 }, { "epoch": 2.1767969126869273, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.0553, "step": 9025 }, { "epoch": 2.178002894356006, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.9732, "step": 9030 }, { "epoch": 2.1792088760250845, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 3.947, "step": 9035 }, { "epoch": 2.180414857694163, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.1013, "step": 9040 }, { "epoch": 2.1816208393632417, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 4.0339, "step": 9045 }, { "epoch": 2.1828268210323203, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.0489, "step": 9050 }, { "epoch": 2.184032802701399, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 3.9915, "step": 9055 }, { "epoch": 2.1852387843704775, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 3.9375, "step": 9060 }, { "epoch": 2.186444766039556, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.0103, "step": 9065 }, { "epoch": 2.1876507477086347, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 4.0311, "step": 9070 }, { "epoch": 2.1888567293777133, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 3.9954, "step": 9075 }, { "epoch": 2.1900627110467923, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.1672, "step": 9080 }, { "epoch": 2.191268692715871, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.0064, "step": 9085 }, { "epoch": 2.1924746743849495, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.1734, "step": 9090 }, { "epoch": 2.193680656054028, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0653, "step": 9095 }, { "epoch": 2.1948866377231067, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 3.9216, "step": 9100 }, { "epoch": 2.1960926193921853, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 3.9937, "step": 9105 }, { "epoch": 2.197298601061264, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 3.925, "step": 9110 }, { "epoch": 2.1985045827303424, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 3.9182, "step": 9115 }, { "epoch": 2.199710564399421, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 4.0848, "step": 9120 }, { "epoch": 2.2009165460684996, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 3.9821, "step": 9125 }, { "epoch": 2.202122527737578, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.9549, "step": 9130 }, { "epoch": 2.2033285094066573, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 4.2098, "step": 9135 }, { "epoch": 2.204534491075736, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.1277, "step": 9140 }, { "epoch": 2.2057404727448144, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.0673, "step": 9145 }, { "epoch": 2.206946454413893, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 3.8499, "step": 9150 }, { "epoch": 2.2081524360829716, "grad_norm": 2.78125, "learning_rate": 3e-05, "loss": 4.1384, "step": 9155 }, { "epoch": 2.20935841775205, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 3.9784, "step": 9160 }, { "epoch": 2.210564399421129, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.0929, "step": 9165 }, { "epoch": 2.2117703810902074, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 3.9642, "step": 9170 }, { "epoch": 2.212976362759286, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 4.0104, "step": 9175 }, { "epoch": 2.2141823444283646, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 3.9491, "step": 9180 }, { "epoch": 2.215388326097443, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.063, "step": 9185 }, { "epoch": 2.2165943077665218, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.0995, "step": 9190 }, { "epoch": 2.2178002894356004, "grad_norm": 3.09375, "learning_rate": 3e-05, "loss": 3.9858, "step": 9195 }, { "epoch": 2.2190062711046794, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 3.9214, "step": 9200 }, { "epoch": 2.220212252773758, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.0387, "step": 9205 }, { "epoch": 2.2214182344428366, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.0368, "step": 9210 }, { "epoch": 2.222624216111915, "grad_norm": 2.71875, "learning_rate": 3e-05, "loss": 3.8422, "step": 9215 }, { "epoch": 2.223830197780994, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.0637, "step": 9220 }, { "epoch": 2.2250361794500724, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 3.99, "step": 9225 }, { "epoch": 2.226242161119151, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 3.9232, "step": 9230 }, { "epoch": 2.2274481427882296, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 4.019, "step": 9235 }, { "epoch": 2.228654124457308, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 3.9174, "step": 9240 }, { "epoch": 2.2298601061263867, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 3.8714, "step": 9245 }, { "epoch": 2.2310660877954653, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0004, "step": 9250 }, { "epoch": 2.2322720694645444, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 3.9824, "step": 9255 }, { "epoch": 2.233478051133623, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.0892, "step": 9260 }, { "epoch": 2.2346840328027016, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.2641, "step": 9265 }, { "epoch": 2.23589001447178, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 3.952, "step": 9270 }, { "epoch": 2.2370959961408587, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 3.9833, "step": 9275 }, { "epoch": 2.2383019778099373, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 3.9901, "step": 9280 }, { "epoch": 2.239507959479016, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 4.0145, "step": 9285 }, { "epoch": 2.2407139411480945, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 3.9282, "step": 9290 }, { "epoch": 2.241919922817173, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 3.9866, "step": 9295 }, { "epoch": 2.2431259044862517, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.1575, "step": 9300 }, { "epoch": 2.2443318861553303, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 3.9852, "step": 9305 }, { "epoch": 2.245537867824409, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 4.0929, "step": 9310 }, { "epoch": 2.2467438494934875, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0305, "step": 9315 }, { "epoch": 2.2479498311625665, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0405, "step": 9320 }, { "epoch": 2.249155812831645, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 3.8529, "step": 9325 }, { "epoch": 2.2503617945007237, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0758, "step": 9330 }, { "epoch": 2.2515677761698023, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 3.9764, "step": 9335 }, { "epoch": 2.252773757838881, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 4.038, "step": 9340 }, { "epoch": 2.2539797395079595, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 3.8767, "step": 9345 }, { "epoch": 2.255185721177038, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.0714, "step": 9350 }, { "epoch": 2.2563917028461167, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 3.918, "step": 9355 }, { "epoch": 2.2575976845151953, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0112, "step": 9360 }, { "epoch": 2.258803666184274, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 3.9805, "step": 9365 }, { "epoch": 2.2600096478533525, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.0229, "step": 9370 }, { "epoch": 2.2612156295224315, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.0743, "step": 9375 }, { "epoch": 2.2624216111915096, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 3.9903, "step": 9380 }, { "epoch": 2.2636275928605887, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 3.9039, "step": 9385 }, { "epoch": 2.2648335745296673, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.0223, "step": 9390 }, { "epoch": 2.266039556198746, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 3.9866, "step": 9395 }, { "epoch": 2.2672455378678245, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 3.851, "step": 9400 }, { "epoch": 2.268451519536903, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.0267, "step": 9405 }, { "epoch": 2.2696575012059816, "grad_norm": 3.015625, "learning_rate": 3e-05, "loss": 4.0087, "step": 9410 }, { "epoch": 2.2708634828750602, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 3.9095, "step": 9415 }, { "epoch": 2.272069464544139, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.0026, "step": 9420 }, { "epoch": 2.2732754462132174, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 3.8713, "step": 9425 }, { "epoch": 2.274481427882296, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 3.8805, "step": 9430 }, { "epoch": 2.2756874095513746, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 4.0081, "step": 9435 }, { "epoch": 2.2768933912204536, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 3.9613, "step": 9440 }, { "epoch": 2.2780993728895322, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 4.0783, "step": 9445 }, { "epoch": 2.279305354558611, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.2386, "step": 9450 }, { "epoch": 2.2805113362276894, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 3.9309, "step": 9455 }, { "epoch": 2.281717317896768, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 3.9961, "step": 9460 }, { "epoch": 2.2829232995658466, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 3.9943, "step": 9465 }, { "epoch": 2.284129281234925, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 3.9495, "step": 9470 }, { "epoch": 2.285335262904004, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 3.948, "step": 9475 }, { "epoch": 2.2865412445730824, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 3.9479, "step": 9480 }, { "epoch": 2.287747226242161, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.1379, "step": 9485 }, { "epoch": 2.2889532079112396, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 3.9867, "step": 9490 }, { "epoch": 2.2901591895803186, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 4.0615, "step": 9495 }, { "epoch": 2.2913651712493968, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0377, "step": 9500 }, { "epoch": 2.292571152918476, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.0251, "step": 9505 }, { "epoch": 2.2937771345875544, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 3.8445, "step": 9510 }, { "epoch": 2.294983116256633, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.0373, "step": 9515 }, { "epoch": 2.2961890979257116, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.015, "step": 9520 }, { "epoch": 2.29739507959479, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.0039, "step": 9525 }, { "epoch": 2.2986010612638688, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 3.9252, "step": 9530 }, { "epoch": 2.2998070429329474, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0319, "step": 9535 }, { "epoch": 2.301013024602026, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 3.9747, "step": 9540 }, { "epoch": 2.3022190062711045, "grad_norm": 6.5, "learning_rate": 3e-05, "loss": 4.1151, "step": 9545 }, { "epoch": 2.303424987940183, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 3.9725, "step": 9550 }, { "epoch": 2.3046309696092617, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 3.8867, "step": 9555 }, { "epoch": 2.3058369512783408, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 3.9076, "step": 9560 }, { "epoch": 2.3070429329474194, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.0594, "step": 9565 }, { "epoch": 2.308248914616498, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0468, "step": 9570 }, { "epoch": 2.3094548962855765, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 3.9595, "step": 9575 }, { "epoch": 2.310660877954655, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 3.9125, "step": 9580 }, { "epoch": 2.3118668596237337, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 3.8365, "step": 9585 }, { "epoch": 2.3130728412928123, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.1077, "step": 9590 }, { "epoch": 2.314278822961891, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.009, "step": 9595 }, { "epoch": 2.3154848046309695, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.0516, "step": 9600 }, { "epoch": 2.316690786300048, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 4.0977, "step": 9605 }, { "epoch": 2.3178967679691267, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 3.9848, "step": 9610 }, { "epoch": 2.3191027496382057, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.0433, "step": 9615 }, { "epoch": 2.3203087313072843, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 3.896, "step": 9620 }, { "epoch": 2.321514712976363, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 3.8759, "step": 9625 }, { "epoch": 2.3227206946454415, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0136, "step": 9630 }, { "epoch": 2.32392667631452, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0674, "step": 9635 }, { "epoch": 2.3251326579835987, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 4.0105, "step": 9640 }, { "epoch": 2.3263386396526773, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 4.0674, "step": 9645 }, { "epoch": 2.327544621321756, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.1066, "step": 9650 }, { "epoch": 2.3287506029908345, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.0445, "step": 9655 }, { "epoch": 2.329956584659913, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0004, "step": 9660 }, { "epoch": 2.3311625663289917, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.102, "step": 9665 }, { "epoch": 2.3323685479980703, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 3.9679, "step": 9670 }, { "epoch": 2.333574529667149, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.96, "step": 9675 }, { "epoch": 2.334780511336228, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 3.9779, "step": 9680 }, { "epoch": 2.3359864930053065, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0317, "step": 9685 }, { "epoch": 2.337192474674385, "grad_norm": 5.59375, "learning_rate": 3e-05, "loss": 3.9805, "step": 9690 }, { "epoch": 2.3383984563434637, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.084, "step": 9695 }, { "epoch": 2.3396044380125423, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.1074, "step": 9700 }, { "epoch": 2.340810419681621, "grad_norm": 5.625, "learning_rate": 3e-05, "loss": 4.1098, "step": 9705 }, { "epoch": 2.3420164013506994, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.0131, "step": 9710 }, { "epoch": 2.343222383019778, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 3.9985, "step": 9715 }, { "epoch": 2.3444283646888566, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0221, "step": 9720 }, { "epoch": 2.345634346357935, "grad_norm": 5.90625, "learning_rate": 3e-05, "loss": 4.0871, "step": 9725 }, { "epoch": 2.346840328027014, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0396, "step": 9730 }, { "epoch": 2.348046309696093, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.145, "step": 9735 }, { "epoch": 2.3492522913651714, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1407, "step": 9740 }, { "epoch": 2.35045827303425, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 3.9048, "step": 9745 }, { "epoch": 2.3516642547033286, "grad_norm": 2.921875, "learning_rate": 3e-05, "loss": 3.9346, "step": 9750 }, { "epoch": 2.352870236372407, "grad_norm": 5.3125, "learning_rate": 3e-05, "loss": 4.1898, "step": 9755 }, { "epoch": 2.354076218041486, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.9846, "step": 9760 }, { "epoch": 2.3552821997105644, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.0598, "step": 9765 }, { "epoch": 2.356488181379643, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.025, "step": 9770 }, { "epoch": 2.3576941630487216, "grad_norm": 3.09375, "learning_rate": 3e-05, "loss": 3.8896, "step": 9775 }, { "epoch": 2.3589001447178, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.0528, "step": 9780 }, { "epoch": 2.3601061263868788, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 4.013, "step": 9785 }, { "epoch": 2.3613121080559574, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 3.9358, "step": 9790 }, { "epoch": 2.362518089725036, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 4.027, "step": 9795 }, { "epoch": 2.363724071394115, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 3.9437, "step": 9800 }, { "epoch": 2.3649300530631936, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 4.0307, "step": 9805 }, { "epoch": 2.366136034732272, "grad_norm": 5.53125, "learning_rate": 3e-05, "loss": 3.9036, "step": 9810 }, { "epoch": 2.367342016401351, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.0577, "step": 9815 }, { "epoch": 2.3685479980704294, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.1908, "step": 9820 }, { "epoch": 2.369753979739508, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.1125, "step": 9825 }, { "epoch": 2.3709599614085866, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.0062, "step": 9830 }, { "epoch": 2.372165943077665, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 3.8695, "step": 9835 }, { "epoch": 2.3733719247467437, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 4.0452, "step": 9840 }, { "epoch": 2.3745779064158223, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 3.9492, "step": 9845 }, { "epoch": 2.375783888084901, "grad_norm": 6.0, "learning_rate": 3e-05, "loss": 4.0416, "step": 9850 }, { "epoch": 2.37698986975398, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 3.9441, "step": 9855 }, { "epoch": 2.3781958514230586, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.875, "step": 9860 }, { "epoch": 2.379401833092137, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 3.9894, "step": 9865 }, { "epoch": 2.3806078147612157, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.2072, "step": 9870 }, { "epoch": 2.3818137964302943, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 3.8842, "step": 9875 }, { "epoch": 2.383019778099373, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 3.9146, "step": 9880 }, { "epoch": 2.3842257597684515, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.0154, "step": 9885 }, { "epoch": 2.38543174143753, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 3.9023, "step": 9890 }, { "epoch": 2.3866377231066087, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.0056, "step": 9895 }, { "epoch": 2.3878437047756873, "grad_norm": 5.21875, "learning_rate": 3e-05, "loss": 4.1729, "step": 9900 }, { "epoch": 2.389049686444766, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 3.9973, "step": 9905 }, { "epoch": 2.3902556681138445, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 3.9871, "step": 9910 }, { "epoch": 2.391461649782923, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.0117, "step": 9915 }, { "epoch": 2.392667631452002, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0031, "step": 9920 }, { "epoch": 2.3938736131210807, "grad_norm": 3.953125, "learning_rate": 3e-05, "loss": 4.0148, "step": 9925 }, { "epoch": 2.3950795947901593, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 3.9612, "step": 9930 }, { "epoch": 2.396285576459238, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.0401, "step": 9935 }, { "epoch": 2.3974915581283165, "grad_norm": 2.890625, "learning_rate": 3e-05, "loss": 4.0072, "step": 9940 }, { "epoch": 2.398697539797395, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 3.9752, "step": 9945 }, { "epoch": 2.3999035214664737, "grad_norm": 6.875, "learning_rate": 3e-05, "loss": 4.0966, "step": 9950 }, { "epoch": 2.4011095031355523, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.0379, "step": 9955 }, { "epoch": 2.402315484804631, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.088, "step": 9960 }, { "epoch": 2.4035214664737095, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.0655, "step": 9965 }, { "epoch": 2.404727448142788, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.0708, "step": 9970 }, { "epoch": 2.405933429811867, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.162, "step": 9975 }, { "epoch": 2.4071394114809457, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.1941, "step": 9980 }, { "epoch": 2.4083453931500243, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 3.9607, "step": 9985 }, { "epoch": 2.409551374819103, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 3.9931, "step": 9990 }, { "epoch": 2.4107573564881815, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.1206, "step": 9995 }, { "epoch": 2.41196333815726, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 3.9367, "step": 10000 }, { "epoch": 2.4131693198263386, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 3.9103, "step": 10005 }, { "epoch": 2.4143753014954172, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.9542, "step": 10010 }, { "epoch": 2.415581283164496, "grad_norm": 6.4375, "learning_rate": 3e-05, "loss": 4.1594, "step": 10015 }, { "epoch": 2.4167872648335744, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 3.9247, "step": 10020 }, { "epoch": 2.417993246502653, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 3.9016, "step": 10025 }, { "epoch": 2.4191992281717316, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0635, "step": 10030 }, { "epoch": 2.42040520984081, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.0304, "step": 10035 }, { "epoch": 2.4216111915098892, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 3.9436, "step": 10040 }, { "epoch": 2.422817173178968, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.0583, "step": 10045 }, { "epoch": 2.4240231548480464, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.0254, "step": 10050 }, { "epoch": 2.425229136517125, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 3.9717, "step": 10055 }, { "epoch": 2.4264351181862036, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 3.9237, "step": 10060 }, { "epoch": 2.427641099855282, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 3.9579, "step": 10065 }, { "epoch": 2.428847081524361, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.0746, "step": 10070 }, { "epoch": 2.4300530631934394, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 3.8609, "step": 10075 }, { "epoch": 2.431259044862518, "grad_norm": 5.5, "learning_rate": 3e-05, "loss": 4.0441, "step": 10080 }, { "epoch": 2.4324650265315966, "grad_norm": 5.40625, "learning_rate": 3e-05, "loss": 4.0175, "step": 10085 }, { "epoch": 2.433671008200675, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.1098, "step": 10090 }, { "epoch": 2.434876989869754, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0046, "step": 10095 }, { "epoch": 2.436082971538833, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 3.9926, "step": 10100 }, { "epoch": 2.4372889532079114, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 3.9632, "step": 10105 }, { "epoch": 2.43849493487699, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 3.9683, "step": 10110 }, { "epoch": 2.4397009165460686, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.0037, "step": 10115 }, { "epoch": 2.440906898215147, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.0325, "step": 10120 }, { "epoch": 2.4421128798842258, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.0459, "step": 10125 }, { "epoch": 2.4433188615533044, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.0558, "step": 10130 }, { "epoch": 2.444524843222383, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.0289, "step": 10135 }, { "epoch": 2.4457308248914615, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.0018, "step": 10140 }, { "epoch": 2.44693680656054, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0297, "step": 10145 }, { "epoch": 2.4481427882296187, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0774, "step": 10150 }, { "epoch": 2.4493487698986973, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 3.9053, "step": 10155 }, { "epoch": 2.4505547515677764, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.0873, "step": 10160 }, { "epoch": 2.451760733236855, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.0274, "step": 10165 }, { "epoch": 2.4529667149059335, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 3.9991, "step": 10170 }, { "epoch": 2.454172696575012, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.0219, "step": 10175 }, { "epoch": 2.4553786782440907, "grad_norm": 6.78125, "learning_rate": 3e-05, "loss": 3.9819, "step": 10180 }, { "epoch": 2.4565846599131693, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 3.94, "step": 10185 }, { "epoch": 2.457790641582248, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 3.954, "step": 10190 }, { "epoch": 2.4589966232513265, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 4.0822, "step": 10195 }, { "epoch": 2.460202604920405, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 3.9649, "step": 10200 }, { "epoch": 2.4614085865894837, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 4.1017, "step": 10205 }, { "epoch": 2.4626145682585623, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.0268, "step": 10210 }, { "epoch": 2.4638205499276413, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 3.9898, "step": 10215 }, { "epoch": 2.46502653159672, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.0867, "step": 10220 }, { "epoch": 2.4662325132657985, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.0871, "step": 10225 }, { "epoch": 2.467438494934877, "grad_norm": 2.515625, "learning_rate": 3e-05, "loss": 4.008, "step": 10230 }, { "epoch": 2.4686444766039557, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 4.1394, "step": 10235 }, { "epoch": 2.4698504582730343, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0622, "step": 10240 }, { "epoch": 2.471056439942113, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 4.1113, "step": 10245 }, { "epoch": 2.4722624216111915, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0513, "step": 10250 }, { "epoch": 2.47346840328027, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 4.1372, "step": 10255 }, { "epoch": 2.4746743849493487, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.0495, "step": 10260 }, { "epoch": 2.4758803666184273, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 4.0719, "step": 10265 }, { "epoch": 2.477086348287506, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.0851, "step": 10270 }, { "epoch": 2.4782923299565844, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0098, "step": 10275 }, { "epoch": 2.4794983116256635, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.0411, "step": 10280 }, { "epoch": 2.480704293294742, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.0995, "step": 10285 }, { "epoch": 2.4819102749638207, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.0129, "step": 10290 }, { "epoch": 2.4831162566328993, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 3.9698, "step": 10295 }, { "epoch": 2.484322238301978, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 3.9677, "step": 10300 }, { "epoch": 2.4855282199710564, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 3.8932, "step": 10305 }, { "epoch": 2.486734201640135, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 4.0958, "step": 10310 }, { "epoch": 2.4879401833092136, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.9532, "step": 10315 }, { "epoch": 2.489146164978292, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.0681, "step": 10320 }, { "epoch": 2.490352146647371, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 4.0187, "step": 10325 }, { "epoch": 2.4915581283164494, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 3.9658, "step": 10330 }, { "epoch": 2.4927641099855284, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0548, "step": 10335 }, { "epoch": 2.493970091654607, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 3.9599, "step": 10340 }, { "epoch": 2.4951760733236856, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 3.9463, "step": 10345 }, { "epoch": 2.496382054992764, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0041, "step": 10350 }, { "epoch": 2.497588036661843, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.002, "step": 10355 }, { "epoch": 2.4987940183309214, "grad_norm": 2.984375, "learning_rate": 3e-05, "loss": 3.9211, "step": 10360 }, { "epoch": 2.5, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.0284, "step": 10365 }, { "epoch": 2.5012059816690786, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.8589, "step": 10370 }, { "epoch": 2.502411963338157, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.0118, "step": 10375 }, { "epoch": 2.503617945007236, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 3.9367, "step": 10380 }, { "epoch": 2.5048239266763144, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.0697, "step": 10385 }, { "epoch": 2.5060299083453934, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.0698, "step": 10390 }, { "epoch": 2.5072358900144716, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.0291, "step": 10395 }, { "epoch": 2.5084418716835506, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 4.0843, "step": 10400 }, { "epoch": 2.509647853352629, "grad_norm": 4.9375, "learning_rate": 3e-05, "loss": 4.0651, "step": 10405 }, { "epoch": 2.510853835021708, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.1156, "step": 10410 }, { "epoch": 2.5120598166907864, "grad_norm": 2.828125, "learning_rate": 3e-05, "loss": 4.0365, "step": 10415 }, { "epoch": 2.513265798359865, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0958, "step": 10420 }, { "epoch": 2.5144717800289436, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 4.0666, "step": 10425 }, { "epoch": 2.515677761698022, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.0025, "step": 10430 }, { "epoch": 2.5168837433671007, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.1196, "step": 10435 }, { "epoch": 2.5180897250361793, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9603, "step": 10440 }, { "epoch": 2.519295706705258, "grad_norm": 3.15625, "learning_rate": 3e-05, "loss": 4.0522, "step": 10445 }, { "epoch": 2.5205016883743365, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 3.9144, "step": 10450 }, { "epoch": 2.5217076700434156, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 3.8898, "step": 10455 }, { "epoch": 2.5229136517124937, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 3.9678, "step": 10460 }, { "epoch": 2.5241196333815727, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0429, "step": 10465 }, { "epoch": 2.5253256150506513, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 4.0434, "step": 10470 }, { "epoch": 2.52653159671973, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 3.986, "step": 10475 }, { "epoch": 2.5277375783888085, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.9663, "step": 10480 }, { "epoch": 2.528943560057887, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.0296, "step": 10485 }, { "epoch": 2.5301495417269657, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 3.9165, "step": 10490 }, { "epoch": 2.5313555233960443, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 4.0782, "step": 10495 }, { "epoch": 2.532561505065123, "grad_norm": 3.953125, "learning_rate": 3e-05, "loss": 3.8511, "step": 10500 }, { "epoch": 2.5337674867342015, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 3.9846, "step": 10505 }, { "epoch": 2.5349734684032805, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0169, "step": 10510 }, { "epoch": 2.5361794500723587, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 3.9634, "step": 10515 }, { "epoch": 2.5373854317414377, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 3.9975, "step": 10520 }, { "epoch": 2.5385914134105163, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 3.9025, "step": 10525 }, { "epoch": 2.539797395079595, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 3.9682, "step": 10530 }, { "epoch": 2.5410033767486735, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 3.9693, "step": 10535 }, { "epoch": 2.542209358417752, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0392, "step": 10540 }, { "epoch": 2.5434153400868307, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.0166, "step": 10545 }, { "epoch": 2.5446213217559093, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 3.9113, "step": 10550 }, { "epoch": 2.545827303424988, "grad_norm": 3.109375, "learning_rate": 3e-05, "loss": 4.0657, "step": 10555 }, { "epoch": 2.5470332850940665, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.0122, "step": 10560 }, { "epoch": 2.548239266763145, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.182, "step": 10565 }, { "epoch": 2.5494452484322236, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 3.7813, "step": 10570 }, { "epoch": 2.5506512301013027, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 4.0146, "step": 10575 }, { "epoch": 2.551857211770381, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.1594, "step": 10580 }, { "epoch": 2.55306319343946, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 3.9809, "step": 10585 }, { "epoch": 2.5542691751085385, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1957, "step": 10590 }, { "epoch": 2.555475156777617, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.037, "step": 10595 }, { "epoch": 2.5566811384466956, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 3.9303, "step": 10600 }, { "epoch": 2.5578871201157742, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 3.8795, "step": 10605 }, { "epoch": 2.559093101784853, "grad_norm": 5.9375, "learning_rate": 3e-05, "loss": 4.1172, "step": 10610 }, { "epoch": 2.5602990834539314, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.0898, "step": 10615 }, { "epoch": 2.56150506512301, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.0463, "step": 10620 }, { "epoch": 2.5627110467920886, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 3.8832, "step": 10625 }, { "epoch": 2.5639170284611676, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 3.9679, "step": 10630 }, { "epoch": 2.565123010130246, "grad_norm": 5.6875, "learning_rate": 3e-05, "loss": 3.8427, "step": 10635 }, { "epoch": 2.566328991799325, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 3.9826, "step": 10640 }, { "epoch": 2.5675349734684034, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 4.0903, "step": 10645 }, { "epoch": 2.568740955137482, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1599, "step": 10650 }, { "epoch": 2.5699469368065606, "grad_norm": 5.15625, "learning_rate": 3e-05, "loss": 3.8945, "step": 10655 }, { "epoch": 2.571152918475639, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 3.9928, "step": 10660 }, { "epoch": 2.572358900144718, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.1862, "step": 10665 }, { "epoch": 2.5735648818137964, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.9465, "step": 10670 }, { "epoch": 2.574770863482875, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 3.9002, "step": 10675 }, { "epoch": 2.5759768451519536, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.0604, "step": 10680 }, { "epoch": 2.577182826821032, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 4.0071, "step": 10685 }, { "epoch": 2.5783888084901108, "grad_norm": 3.34375, "learning_rate": 3e-05, "loss": 4.1044, "step": 10690 }, { "epoch": 2.57959479015919, "grad_norm": 5.5, "learning_rate": 3e-05, "loss": 4.047, "step": 10695 }, { "epoch": 2.580800771828268, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0092, "step": 10700 }, { "epoch": 2.582006753497347, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.0396, "step": 10705 }, { "epoch": 2.5832127351664256, "grad_norm": 5.21875, "learning_rate": 3e-05, "loss": 4.0759, "step": 10710 }, { "epoch": 2.584418716835504, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 3.9551, "step": 10715 }, { "epoch": 2.5856246985045828, "grad_norm": 6.4375, "learning_rate": 3e-05, "loss": 4.0541, "step": 10720 }, { "epoch": 2.5868306801736614, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 4.0582, "step": 10725 }, { "epoch": 2.58803666184274, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 4.0026, "step": 10730 }, { "epoch": 2.5892426435118185, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.0593, "step": 10735 }, { "epoch": 2.590448625180897, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.0718, "step": 10740 }, { "epoch": 2.5916546068499757, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 3.9607, "step": 10745 }, { "epoch": 2.5928605885190548, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.1247, "step": 10750 }, { "epoch": 2.594066570188133, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 3.9824, "step": 10755 }, { "epoch": 2.595272551857212, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.0159, "step": 10760 }, { "epoch": 2.5964785335262905, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0024, "step": 10765 }, { "epoch": 2.597684515195369, "grad_norm": 6.28125, "learning_rate": 3e-05, "loss": 3.9164, "step": 10770 }, { "epoch": 2.5988904968644477, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0072, "step": 10775 }, { "epoch": 2.6000964785335263, "grad_norm": 3.203125, "learning_rate": 3e-05, "loss": 3.9832, "step": 10780 }, { "epoch": 2.601302460202605, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0237, "step": 10785 }, { "epoch": 2.6025084418716835, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0625, "step": 10790 }, { "epoch": 2.603714423540762, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0533, "step": 10795 }, { "epoch": 2.6049204052098407, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 4.0983, "step": 10800 }, { "epoch": 2.6061263868789193, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 4.0411, "step": 10805 }, { "epoch": 2.607332368547998, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.0056, "step": 10810 }, { "epoch": 2.608538350217077, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 3.9921, "step": 10815 }, { "epoch": 2.609744331886155, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1284, "step": 10820 }, { "epoch": 2.610950313555234, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.0263, "step": 10825 }, { "epoch": 2.6121562952243127, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.107, "step": 10830 }, { "epoch": 2.6133622768933913, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.159, "step": 10835 }, { "epoch": 2.61456825856247, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.1528, "step": 10840 }, { "epoch": 2.6157742402315485, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.0552, "step": 10845 }, { "epoch": 2.616980221900627, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 3.829, "step": 10850 }, { "epoch": 2.6181862035697057, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 3.9447, "step": 10855 }, { "epoch": 2.6193921852387843, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.1541, "step": 10860 }, { "epoch": 2.620598166907863, "grad_norm": 3.078125, "learning_rate": 3e-05, "loss": 3.9652, "step": 10865 }, { "epoch": 2.621804148576942, "grad_norm": 6.34375, "learning_rate": 3e-05, "loss": 4.0423, "step": 10870 }, { "epoch": 2.62301013024602, "grad_norm": 5.3125, "learning_rate": 3e-05, "loss": 4.0642, "step": 10875 }, { "epoch": 2.624216111915099, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 3.8561, "step": 10880 }, { "epoch": 2.6254220935841777, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 4.1341, "step": 10885 }, { "epoch": 2.6266280752532563, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 3.9242, "step": 10890 }, { "epoch": 2.627834056922335, "grad_norm": 3.46875, "learning_rate": 3e-05, "loss": 3.9403, "step": 10895 }, { "epoch": 2.6290400385914134, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.2382, "step": 10900 }, { "epoch": 2.630246020260492, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1419, "step": 10905 }, { "epoch": 2.6314520019295706, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 4.0163, "step": 10910 }, { "epoch": 2.632657983598649, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.035, "step": 10915 }, { "epoch": 2.633863965267728, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 3.9878, "step": 10920 }, { "epoch": 2.6350699469368064, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 3.9777, "step": 10925 }, { "epoch": 2.636275928605885, "grad_norm": 5.5, "learning_rate": 3e-05, "loss": 4.0074, "step": 10930 }, { "epoch": 2.637481910274964, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 3.9663, "step": 10935 }, { "epoch": 2.638687891944042, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 3.9226, "step": 10940 }, { "epoch": 2.6398938736131212, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 4.0992, "step": 10945 }, { "epoch": 2.6410998552822, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 4.02, "step": 10950 }, { "epoch": 2.6423058369512784, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.1398, "step": 10955 }, { "epoch": 2.643511818620357, "grad_norm": 5.4375, "learning_rate": 3e-05, "loss": 4.1957, "step": 10960 }, { "epoch": 2.6447178002894356, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 4.0482, "step": 10965 }, { "epoch": 2.645923781958514, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.8953, "step": 10970 }, { "epoch": 2.647129763627593, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 3.9089, "step": 10975 }, { "epoch": 2.6483357452966714, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.0316, "step": 10980 }, { "epoch": 2.64954172696575, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 3.8783, "step": 10985 }, { "epoch": 2.650747708634829, "grad_norm": 5.21875, "learning_rate": 3e-05, "loss": 3.9548, "step": 10990 }, { "epoch": 2.651953690303907, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.0886, "step": 10995 }, { "epoch": 2.653159671972986, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0956, "step": 11000 }, { "epoch": 2.654365653642065, "grad_norm": 3.8125, "learning_rate": 3e-05, "loss": 3.9056, "step": 11005 }, { "epoch": 2.6555716353111434, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.0862, "step": 11010 }, { "epoch": 2.656777616980222, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 3.9288, "step": 11015 }, { "epoch": 2.6579835986493006, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.044, "step": 11020 }, { "epoch": 2.659189580318379, "grad_norm": 5.46875, "learning_rate": 3e-05, "loss": 4.2246, "step": 11025 }, { "epoch": 2.6603955619874577, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 4.146, "step": 11030 }, { "epoch": 2.6616015436565363, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.0098, "step": 11035 }, { "epoch": 2.662807525325615, "grad_norm": 4.8125, "learning_rate": 3e-05, "loss": 4.1055, "step": 11040 }, { "epoch": 2.6640135069946935, "grad_norm": 3.5625, "learning_rate": 3e-05, "loss": 4.0647, "step": 11045 }, { "epoch": 2.665219488663772, "grad_norm": 3.40625, "learning_rate": 3e-05, "loss": 3.9452, "step": 11050 }, { "epoch": 2.666425470332851, "grad_norm": 3.65625, "learning_rate": 3e-05, "loss": 3.9587, "step": 11055 }, { "epoch": 2.6676314520019293, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0699, "step": 11060 }, { "epoch": 2.6688374336710083, "grad_norm": 6.15625, "learning_rate": 3e-05, "loss": 4.1673, "step": 11065 }, { "epoch": 2.670043415340087, "grad_norm": 4.875, "learning_rate": 3e-05, "loss": 4.1568, "step": 11070 }, { "epoch": 2.6712493970091655, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.0723, "step": 11075 }, { "epoch": 2.672455378678244, "grad_norm": 3.25, "learning_rate": 3e-05, "loss": 3.9567, "step": 11080 }, { "epoch": 2.6736613603473227, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0221, "step": 11085 }, { "epoch": 2.6748673420164013, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 3.9831, "step": 11090 }, { "epoch": 2.67607332368548, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 3.9891, "step": 11095 }, { "epoch": 2.6772793053545585, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 3.9638, "step": 11100 }, { "epoch": 2.678485287023637, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 3.9625, "step": 11105 }, { "epoch": 2.679691268692716, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0772, "step": 11110 }, { "epoch": 2.6808972503617943, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.0057, "step": 11115 }, { "epoch": 2.6821032320308733, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 4.0624, "step": 11120 }, { "epoch": 2.683309213699952, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 3.9544, "step": 11125 }, { "epoch": 2.6845151953690305, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 3.9472, "step": 11130 }, { "epoch": 2.685721177038109, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.1006, "step": 11135 }, { "epoch": 2.6869271587071877, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.017, "step": 11140 }, { "epoch": 2.6881331403762663, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.9537, "step": 11145 }, { "epoch": 2.689339122045345, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9811, "step": 11150 }, { "epoch": 2.6905451037144235, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 3.9674, "step": 11155 }, { "epoch": 2.691751085383502, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 3.9732, "step": 11160 }, { "epoch": 2.6929570670525806, "grad_norm": 3.796875, "learning_rate": 3e-05, "loss": 3.9712, "step": 11165 }, { "epoch": 2.6941630487216592, "grad_norm": 3.28125, "learning_rate": 3e-05, "loss": 3.9664, "step": 11170 }, { "epoch": 2.6953690303907383, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0144, "step": 11175 }, { "epoch": 2.6965750120598164, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 3.9428, "step": 11180 }, { "epoch": 2.6977809937288955, "grad_norm": 3.234375, "learning_rate": 3e-05, "loss": 3.9773, "step": 11185 }, { "epoch": 2.698986975397974, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0628, "step": 11190 }, { "epoch": 2.7001929570670526, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 3.9819, "step": 11195 }, { "epoch": 2.7013989387361312, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 3.9798, "step": 11200 }, { "epoch": 2.70260492040521, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 3.973, "step": 11205 }, { "epoch": 2.7038109020742884, "grad_norm": 4.75, "learning_rate": 3e-05, "loss": 3.9366, "step": 11210 }, { "epoch": 2.705016883743367, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.0918, "step": 11215 }, { "epoch": 2.7062228654124456, "grad_norm": 6.25, "learning_rate": 3e-05, "loss": 3.9567, "step": 11220 }, { "epoch": 2.707428847081524, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 4.0896, "step": 11225 }, { "epoch": 2.7086348287506032, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0101, "step": 11230 }, { "epoch": 2.7098408104196814, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 3.9648, "step": 11235 }, { "epoch": 2.7110467920887604, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 4.0414, "step": 11240 }, { "epoch": 2.712252773757839, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 3.8675, "step": 11245 }, { "epoch": 2.7134587554269176, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 3.9268, "step": 11250 }, { "epoch": 2.714664737095996, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.0896, "step": 11255 }, { "epoch": 2.715870718765075, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.0144, "step": 11260 }, { "epoch": 2.7170767004341534, "grad_norm": 5.59375, "learning_rate": 3e-05, "loss": 3.99, "step": 11265 }, { "epoch": 2.718282682103232, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 3.9511, "step": 11270 }, { "epoch": 2.7194886637723106, "grad_norm": 3.265625, "learning_rate": 3e-05, "loss": 4.2016, "step": 11275 }, { "epoch": 2.720694645441389, "grad_norm": 3.953125, "learning_rate": 3e-05, "loss": 3.9797, "step": 11280 }, { "epoch": 2.7219006271104678, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0584, "step": 11285 }, { "epoch": 2.7231066087795464, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.0383, "step": 11290 }, { "epoch": 2.7243125904486254, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 3.9276, "step": 11295 }, { "epoch": 2.7255185721177035, "grad_norm": 4.78125, "learning_rate": 3e-05, "loss": 4.0111, "step": 11300 }, { "epoch": 2.7267245537867826, "grad_norm": 2.90625, "learning_rate": 3e-05, "loss": 3.9469, "step": 11305 }, { "epoch": 2.727930535455861, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 3.833, "step": 11310 }, { "epoch": 2.7291365171249398, "grad_norm": 2.625, "learning_rate": 3e-05, "loss": 3.9968, "step": 11315 }, { "epoch": 2.7303424987940184, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 3.9937, "step": 11320 }, { "epoch": 2.731548480463097, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 4.0772, "step": 11325 }, { "epoch": 2.7327544621321755, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 3.9753, "step": 11330 }, { "epoch": 2.733960443801254, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 3.97, "step": 11335 }, { "epoch": 2.7351664254703327, "grad_norm": 5.78125, "learning_rate": 3e-05, "loss": 4.0218, "step": 11340 }, { "epoch": 2.7363724071394113, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 3.9912, "step": 11345 }, { "epoch": 2.7375783888084904, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.0108, "step": 11350 }, { "epoch": 2.7387843704775685, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 3.9697, "step": 11355 }, { "epoch": 2.7399903521466475, "grad_norm": 3.75, "learning_rate": 3e-05, "loss": 3.9146, "step": 11360 }, { "epoch": 2.741196333815726, "grad_norm": 4.59375, "learning_rate": 3e-05, "loss": 4.0337, "step": 11365 }, { "epoch": 2.7424023154848047, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.0019, "step": 11370 }, { "epoch": 2.7436082971538833, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.1151, "step": 11375 }, { "epoch": 2.744814278822962, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 4.0218, "step": 11380 }, { "epoch": 2.7460202604920405, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 4.027, "step": 11385 }, { "epoch": 2.747226242161119, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 3.8912, "step": 11390 }, { "epoch": 2.7484322238301977, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 3.8504, "step": 11395 }, { "epoch": 2.7496382054992763, "grad_norm": 5.125, "learning_rate": 3e-05, "loss": 3.9192, "step": 11400 }, { "epoch": 2.750844187168355, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.2261, "step": 11405 }, { "epoch": 2.7520501688374335, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0828, "step": 11410 }, { "epoch": 2.7532561505065125, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0885, "step": 11415 }, { "epoch": 2.7544621321755907, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 4.0034, "step": 11420 }, { "epoch": 2.7556681138446697, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.095, "step": 11425 }, { "epoch": 2.7568740955137483, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.0151, "step": 11430 }, { "epoch": 2.758080077182827, "grad_norm": 4.1875, "learning_rate": 3e-05, "loss": 4.1308, "step": 11435 }, { "epoch": 2.7592860588519055, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.0233, "step": 11440 }, { "epoch": 2.760492040520984, "grad_norm": 4.125, "learning_rate": 3e-05, "loss": 4.0137, "step": 11445 }, { "epoch": 2.7616980221900627, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 4.0372, "step": 11450 }, { "epoch": 2.7629040038591413, "grad_norm": 5.625, "learning_rate": 3e-05, "loss": 4.0119, "step": 11455 }, { "epoch": 2.76410998552822, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 3.9203, "step": 11460 }, { "epoch": 2.7653159671972984, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 3.9422, "step": 11465 }, { "epoch": 2.7665219488663775, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 3.9352, "step": 11470 }, { "epoch": 2.7677279305354556, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0518, "step": 11475 }, { "epoch": 2.7689339122045347, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.0515, "step": 11480 }, { "epoch": 2.7701398938736133, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 4.0527, "step": 11485 }, { "epoch": 2.771345875542692, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 3.9145, "step": 11490 }, { "epoch": 2.7725518572117704, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 3.9191, "step": 11495 }, { "epoch": 2.773757838880849, "grad_norm": 3.484375, "learning_rate": 3e-05, "loss": 3.7375, "step": 11500 }, { "epoch": 2.7749638205499276, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.0839, "step": 11505 }, { "epoch": 2.776169802219006, "grad_norm": 3.515625, "learning_rate": 3e-05, "loss": 4.2001, "step": 11510 }, { "epoch": 2.777375783888085, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.1705, "step": 11515 }, { "epoch": 2.7785817655571634, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.0741, "step": 11520 }, { "epoch": 2.779787747226242, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 3.9043, "step": 11525 }, { "epoch": 2.7809937288953206, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 4.116, "step": 11530 }, { "epoch": 2.7821997105643996, "grad_norm": 2.828125, "learning_rate": 3e-05, "loss": 3.9451, "step": 11535 }, { "epoch": 2.783405692233478, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.0687, "step": 11540 }, { "epoch": 2.784611673902557, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 3.9948, "step": 11545 }, { "epoch": 2.7858176555716354, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0573, "step": 11550 }, { "epoch": 2.787023637240714, "grad_norm": 4.71875, "learning_rate": 3e-05, "loss": 4.1238, "step": 11555 }, { "epoch": 2.7882296189097926, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 4.166, "step": 11560 }, { "epoch": 2.789435600578871, "grad_norm": 3.96875, "learning_rate": 3e-05, "loss": 4.0497, "step": 11565 }, { "epoch": 2.79064158224795, "grad_norm": 5.84375, "learning_rate": 3e-05, "loss": 4.0272, "step": 11570 }, { "epoch": 2.7918475639170284, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.0142, "step": 11575 }, { "epoch": 2.793053545586107, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.1065, "step": 11580 }, { "epoch": 2.7942595272551856, "grad_norm": 5.3125, "learning_rate": 3e-05, "loss": 4.0549, "step": 11585 }, { "epoch": 2.7954655089242646, "grad_norm": 5.0, "learning_rate": 3e-05, "loss": 4.0616, "step": 11590 }, { "epoch": 2.7966714905933427, "grad_norm": 3.734375, "learning_rate": 3e-05, "loss": 3.9635, "step": 11595 }, { "epoch": 2.797877472262422, "grad_norm": 3.984375, "learning_rate": 3e-05, "loss": 3.9828, "step": 11600 }, { "epoch": 2.7990834539315004, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.1285, "step": 11605 }, { "epoch": 2.800289435600579, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.1066, "step": 11610 }, { "epoch": 2.8014954172696576, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 3.9635, "step": 11615 }, { "epoch": 2.802701398938736, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 3.969, "step": 11620 }, { "epoch": 2.8039073806078147, "grad_norm": 3.375, "learning_rate": 3e-05, "loss": 4.1113, "step": 11625 }, { "epoch": 2.8051133622768933, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 4.0239, "step": 11630 }, { "epoch": 2.806319343945972, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.077, "step": 11635 }, { "epoch": 2.8075253256150505, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 4.1187, "step": 11640 }, { "epoch": 2.808731307284129, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0216, "step": 11645 }, { "epoch": 2.8099372889532077, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 3.9548, "step": 11650 }, { "epoch": 2.8111432706222867, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 3.9468, "step": 11655 }, { "epoch": 2.812349252291365, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 4.0259, "step": 11660 }, { "epoch": 2.813555233960444, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0014, "step": 11665 }, { "epoch": 2.8147612156295225, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 4.0618, "step": 11670 }, { "epoch": 2.815967197298601, "grad_norm": 5.4375, "learning_rate": 3e-05, "loss": 4.0837, "step": 11675 }, { "epoch": 2.8171731789676797, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 3.9301, "step": 11680 }, { "epoch": 2.8183791606367583, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 4.108, "step": 11685 }, { "epoch": 2.819585142305837, "grad_norm": 5.46875, "learning_rate": 3e-05, "loss": 4.0465, "step": 11690 }, { "epoch": 2.8207911239749155, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 3.8629, "step": 11695 }, { "epoch": 2.821997105643994, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 4.0588, "step": 11700 }, { "epoch": 2.8232030873130727, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.8604, "step": 11705 }, { "epoch": 2.8244090689821517, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 3.9333, "step": 11710 }, { "epoch": 2.82561505065123, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.0669, "step": 11715 }, { "epoch": 2.826821032320309, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.1213, "step": 11720 }, { "epoch": 2.8280270139893875, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 3.971, "step": 11725 }, { "epoch": 2.829232995658466, "grad_norm": 3.4375, "learning_rate": 3e-05, "loss": 4.044, "step": 11730 }, { "epoch": 2.8304389773275447, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.0308, "step": 11735 }, { "epoch": 2.8316449589966233, "grad_norm": 5.71875, "learning_rate": 3e-05, "loss": 4.0542, "step": 11740 }, { "epoch": 2.832850940665702, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 3.9498, "step": 11745 }, { "epoch": 2.8340569223347805, "grad_norm": 7.21875, "learning_rate": 3e-05, "loss": 4.1, "step": 11750 }, { "epoch": 2.835262904003859, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 3.8828, "step": 11755 }, { "epoch": 2.8364688856729376, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.0383, "step": 11760 }, { "epoch": 2.8376748673420162, "grad_norm": 3.640625, "learning_rate": 3e-05, "loss": 4.1247, "step": 11765 }, { "epoch": 2.838880849011095, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.8871, "step": 11770 }, { "epoch": 2.840086830680174, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 4.1798, "step": 11775 }, { "epoch": 2.841292812349252, "grad_norm": 2.921875, "learning_rate": 3e-05, "loss": 3.9785, "step": 11780 }, { "epoch": 2.842498794018331, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 3.9091, "step": 11785 }, { "epoch": 2.8437047756874096, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 3.9894, "step": 11790 }, { "epoch": 2.8449107573564882, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 3.8298, "step": 11795 }, { "epoch": 2.846116739025567, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 3.9988, "step": 11800 }, { "epoch": 2.8473227206946454, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 4.0786, "step": 11805 }, { "epoch": 2.848528702363724, "grad_norm": 3.5, "learning_rate": 3e-05, "loss": 4.0863, "step": 11810 }, { "epoch": 2.8497346840328026, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.0437, "step": 11815 }, { "epoch": 2.850940665701881, "grad_norm": 2.6875, "learning_rate": 3e-05, "loss": 4.2416, "step": 11820 }, { "epoch": 2.85214664737096, "grad_norm": 5.5625, "learning_rate": 3e-05, "loss": 4.0002, "step": 11825 }, { "epoch": 2.853352629040039, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 3.9915, "step": 11830 }, { "epoch": 2.854558610709117, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 3.9982, "step": 11835 }, { "epoch": 2.855764592378196, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 3.889, "step": 11840 }, { "epoch": 2.8569705740472746, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0306, "step": 11845 }, { "epoch": 2.858176555716353, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 3.9224, "step": 11850 }, { "epoch": 2.859382537385432, "grad_norm": 5.15625, "learning_rate": 3e-05, "loss": 4.035, "step": 11855 }, { "epoch": 2.8605885190545104, "grad_norm": 4.96875, "learning_rate": 3e-05, "loss": 3.9325, "step": 11860 }, { "epoch": 2.861794500723589, "grad_norm": 5.5, "learning_rate": 3e-05, "loss": 3.9124, "step": 11865 }, { "epoch": 2.8630004823926676, "grad_norm": 3.9375, "learning_rate": 3e-05, "loss": 4.1184, "step": 11870 }, { "epoch": 2.864206464061746, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 4.123, "step": 11875 }, { "epoch": 2.8654124457308248, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0789, "step": 11880 }, { "epoch": 2.8666184273999034, "grad_norm": 3.78125, "learning_rate": 3e-05, "loss": 4.0293, "step": 11885 }, { "epoch": 2.867824409068982, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 3.8634, "step": 11890 }, { "epoch": 2.869030390738061, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 3.9221, "step": 11895 }, { "epoch": 2.870236372407139, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 4.0992, "step": 11900 }, { "epoch": 2.871442354076218, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 3.9565, "step": 11905 }, { "epoch": 2.8726483357452968, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9628, "step": 11910 }, { "epoch": 2.8738543174143754, "grad_norm": 2.921875, "learning_rate": 3e-05, "loss": 4.036, "step": 11915 }, { "epoch": 2.875060299083454, "grad_norm": 5.46875, "learning_rate": 3e-05, "loss": 3.9952, "step": 11920 }, { "epoch": 2.8762662807525325, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 3.9537, "step": 11925 }, { "epoch": 2.877472262421611, "grad_norm": 5.9375, "learning_rate": 3e-05, "loss": 4.019, "step": 11930 }, { "epoch": 2.8786782440906897, "grad_norm": 4.6875, "learning_rate": 3e-05, "loss": 3.7904, "step": 11935 }, { "epoch": 2.8798842257597683, "grad_norm": 4.0, "learning_rate": 3e-05, "loss": 3.993, "step": 11940 }, { "epoch": 2.881090207428847, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 4.0835, "step": 11945 }, { "epoch": 2.882296189097926, "grad_norm": 2.84375, "learning_rate": 3e-05, "loss": 3.9718, "step": 11950 }, { "epoch": 2.883502170767004, "grad_norm": 3.1875, "learning_rate": 3e-05, "loss": 3.8826, "step": 11955 }, { "epoch": 2.884708152436083, "grad_norm": 3.90625, "learning_rate": 3e-05, "loss": 4.1541, "step": 11960 }, { "epoch": 2.8859141341051617, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 3.9952, "step": 11965 }, { "epoch": 2.8871201157742403, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.1429, "step": 11970 }, { "epoch": 2.888326097443319, "grad_norm": 6.09375, "learning_rate": 3e-05, "loss": 3.8736, "step": 11975 }, { "epoch": 2.8895320791123975, "grad_norm": 2.796875, "learning_rate": 3e-05, "loss": 4.0975, "step": 11980 }, { "epoch": 2.890738060781476, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 4.0397, "step": 11985 }, { "epoch": 2.8919440424505547, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 3.892, "step": 11990 }, { "epoch": 2.8931500241196333, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0843, "step": 11995 }, { "epoch": 2.894356005788712, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 4.1105, "step": 12000 }, { "epoch": 2.8955619874577905, "grad_norm": 4.5, "learning_rate": 3e-05, "loss": 3.9092, "step": 12005 }, { "epoch": 2.896767969126869, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 4.0691, "step": 12010 }, { "epoch": 2.897973950795948, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 3.8419, "step": 12015 }, { "epoch": 2.8991799324650263, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1008, "step": 12020 }, { "epoch": 2.9003859141341053, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 4.0445, "step": 12025 }, { "epoch": 2.901591895803184, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 3.8977, "step": 12030 }, { "epoch": 2.9027978774722625, "grad_norm": 3.53125, "learning_rate": 3e-05, "loss": 4.112, "step": 12035 }, { "epoch": 2.904003859141341, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 4.0914, "step": 12040 }, { "epoch": 2.9052098408104197, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 3.9764, "step": 12045 }, { "epoch": 2.9064158224794983, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.0986, "step": 12050 }, { "epoch": 2.907621804148577, "grad_norm": 6.0, "learning_rate": 3e-05, "loss": 4.0251, "step": 12055 }, { "epoch": 2.9088277858176554, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.0182, "step": 12060 }, { "epoch": 2.910033767486734, "grad_norm": 3.6875, "learning_rate": 3e-05, "loss": 3.9885, "step": 12065 }, { "epoch": 2.911239749155813, "grad_norm": 2.8125, "learning_rate": 3e-05, "loss": 3.9383, "step": 12070 }, { "epoch": 2.912445730824891, "grad_norm": 4.03125, "learning_rate": 3e-05, "loss": 3.8947, "step": 12075 }, { "epoch": 2.9136517124939703, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 3.9655, "step": 12080 }, { "epoch": 2.914857694163049, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 3.9334, "step": 12085 }, { "epoch": 2.9160636758321274, "grad_norm": 3.171875, "learning_rate": 3e-05, "loss": 3.9861, "step": 12090 }, { "epoch": 2.917269657501206, "grad_norm": 3.0, "learning_rate": 3e-05, "loss": 3.96, "step": 12095 }, { "epoch": 2.9184756391702846, "grad_norm": 5.15625, "learning_rate": 3e-05, "loss": 4.1069, "step": 12100 }, { "epoch": 2.9196816208393632, "grad_norm": 4.65625, "learning_rate": 3e-05, "loss": 4.0015, "step": 12105 }, { "epoch": 2.920887602508442, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 4.0245, "step": 12110 }, { "epoch": 2.9220935841775204, "grad_norm": 4.4375, "learning_rate": 3e-05, "loss": 3.9015, "step": 12115 }, { "epoch": 2.923299565846599, "grad_norm": 5.0625, "learning_rate": 3e-05, "loss": 3.94, "step": 12120 }, { "epoch": 2.924505547515678, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 3.9782, "step": 12125 }, { "epoch": 2.925711529184756, "grad_norm": 4.3125, "learning_rate": 3e-05, "loss": 4.0306, "step": 12130 }, { "epoch": 2.9269175108538352, "grad_norm": 5.03125, "learning_rate": 3e-05, "loss": 4.1057, "step": 12135 }, { "epoch": 2.928123492522914, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 3.865, "step": 12140 }, { "epoch": 2.9293294741919924, "grad_norm": 4.28125, "learning_rate": 3e-05, "loss": 3.9135, "step": 12145 }, { "epoch": 2.930535455861071, "grad_norm": 4.625, "learning_rate": 3e-05, "loss": 4.2542, "step": 12150 }, { "epoch": 2.9317414375301496, "grad_norm": 3.390625, "learning_rate": 3e-05, "loss": 3.9813, "step": 12155 }, { "epoch": 2.932947419199228, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0094, "step": 12160 }, { "epoch": 2.934153400868307, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 3.9488, "step": 12165 }, { "epoch": 2.9353593825373854, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 3.9726, "step": 12170 }, { "epoch": 2.936565364206464, "grad_norm": 5.53125, "learning_rate": 3e-05, "loss": 4.0319, "step": 12175 }, { "epoch": 2.9377713458755426, "grad_norm": 5.6875, "learning_rate": 3e-05, "loss": 3.9492, "step": 12180 }, { "epoch": 2.938977327544621, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.9593, "step": 12185 }, { "epoch": 2.9401833092137, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 4.043, "step": 12190 }, { "epoch": 2.9413892908827783, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0351, "step": 12195 }, { "epoch": 2.9425952725518574, "grad_norm": 6.53125, "learning_rate": 3e-05, "loss": 3.9099, "step": 12200 }, { "epoch": 2.943801254220936, "grad_norm": 3.359375, "learning_rate": 3e-05, "loss": 4.0663, "step": 12205 }, { "epoch": 2.9450072358900146, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 4.0477, "step": 12210 }, { "epoch": 2.946213217559093, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.9408, "step": 12215 }, { "epoch": 2.9474191992281717, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 4.1049, "step": 12220 }, { "epoch": 2.9486251808972503, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 3.9154, "step": 12225 }, { "epoch": 2.949831162566329, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 3.9762, "step": 12230 }, { "epoch": 2.9510371442354075, "grad_norm": 3.296875, "learning_rate": 3e-05, "loss": 4.1001, "step": 12235 }, { "epoch": 2.952243125904486, "grad_norm": 3.140625, "learning_rate": 3e-05, "loss": 3.8937, "step": 12240 }, { "epoch": 2.953449107573565, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 3.9493, "step": 12245 }, { "epoch": 2.9546550892426433, "grad_norm": 3.0625, "learning_rate": 3e-05, "loss": 4.0477, "step": 12250 }, { "epoch": 2.9558610709117223, "grad_norm": 3.890625, "learning_rate": 3e-05, "loss": 3.9156, "step": 12255 }, { "epoch": 2.957067052580801, "grad_norm": 4.375, "learning_rate": 3e-05, "loss": 3.8316, "step": 12260 }, { "epoch": 2.9582730342498795, "grad_norm": 3.765625, "learning_rate": 3e-05, "loss": 3.956, "step": 12265 }, { "epoch": 2.959479015918958, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 4.1299, "step": 12270 }, { "epoch": 2.9606849975880367, "grad_norm": 4.40625, "learning_rate": 3e-05, "loss": 3.8095, "step": 12275 }, { "epoch": 2.9618909792571153, "grad_norm": 3.21875, "learning_rate": 3e-05, "loss": 3.9492, "step": 12280 }, { "epoch": 2.963096960926194, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 3.9404, "step": 12285 }, { "epoch": 2.9643029425952725, "grad_norm": 2.9375, "learning_rate": 3e-05, "loss": 4.1068, "step": 12290 }, { "epoch": 2.965508924264351, "grad_norm": 2.859375, "learning_rate": 3e-05, "loss": 4.206, "step": 12295 }, { "epoch": 2.9667149059334297, "grad_norm": 2.96875, "learning_rate": 3e-05, "loss": 3.84, "step": 12300 }, { "epoch": 2.9679208876025083, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 3.9283, "step": 12305 }, { "epoch": 2.9691268692715873, "grad_norm": 3.453125, "learning_rate": 3e-05, "loss": 3.9552, "step": 12310 }, { "epoch": 2.9703328509406655, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 3.8945, "step": 12315 }, { "epoch": 2.9715388326097445, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 4.0348, "step": 12320 }, { "epoch": 2.972744814278823, "grad_norm": 6.375, "learning_rate": 3e-05, "loss": 4.0828, "step": 12325 }, { "epoch": 2.9739507959479017, "grad_norm": 3.875, "learning_rate": 3e-05, "loss": 4.0696, "step": 12330 }, { "epoch": 2.9751567776169803, "grad_norm": 4.53125, "learning_rate": 3e-05, "loss": 3.9339, "step": 12335 }, { "epoch": 2.976362759286059, "grad_norm": 3.671875, "learning_rate": 3e-05, "loss": 3.9051, "step": 12340 }, { "epoch": 2.9775687409551375, "grad_norm": 3.609375, "learning_rate": 3e-05, "loss": 3.9078, "step": 12345 }, { "epoch": 2.978774722624216, "grad_norm": 4.09375, "learning_rate": 3e-05, "loss": 3.989, "step": 12350 }, { "epoch": 2.9799807042932946, "grad_norm": 5.5, "learning_rate": 3e-05, "loss": 4.1028, "step": 12355 }, { "epoch": 2.9811866859623732, "grad_norm": 4.84375, "learning_rate": 3e-05, "loss": 4.1263, "step": 12360 }, { "epoch": 2.9823926676314523, "grad_norm": 3.71875, "learning_rate": 3e-05, "loss": 3.9588, "step": 12365 }, { "epoch": 2.9835986493005304, "grad_norm": 3.625, "learning_rate": 3e-05, "loss": 3.971, "step": 12370 }, { "epoch": 2.9848046309696095, "grad_norm": 3.828125, "learning_rate": 3e-05, "loss": 3.9227, "step": 12375 }, { "epoch": 2.986010612638688, "grad_norm": 3.125, "learning_rate": 3e-05, "loss": 3.8939, "step": 12380 }, { "epoch": 2.9872165943077666, "grad_norm": 3.921875, "learning_rate": 3e-05, "loss": 3.8457, "step": 12385 }, { "epoch": 2.9884225759768452, "grad_norm": 4.90625, "learning_rate": 3e-05, "loss": 4.1444, "step": 12390 }, { "epoch": 2.989628557645924, "grad_norm": 4.25, "learning_rate": 3e-05, "loss": 4.0195, "step": 12395 }, { "epoch": 2.9908345393150024, "grad_norm": 3.328125, "learning_rate": 3e-05, "loss": 3.9538, "step": 12400 }, { "epoch": 2.992040520984081, "grad_norm": 3.046875, "learning_rate": 3e-05, "loss": 3.954, "step": 12405 }, { "epoch": 2.9932465026531596, "grad_norm": 3.546875, "learning_rate": 3e-05, "loss": 4.0707, "step": 12410 }, { "epoch": 2.994452484322238, "grad_norm": 3.3125, "learning_rate": 3e-05, "loss": 3.8717, "step": 12415 }, { "epoch": 2.995658465991317, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9522, "step": 12420 }, { "epoch": 2.9968644476603954, "grad_norm": 3.59375, "learning_rate": 3e-05, "loss": 3.8757, "step": 12425 }, { "epoch": 2.9980704293294744, "grad_norm": 6.03125, "learning_rate": 3e-05, "loss": 4.0314, "step": 12430 }, { "epoch": 2.9992764109985526, "grad_norm": 4.21875, "learning_rate": 3e-05, "loss": 4.1354, "step": 12435 }, { "epoch": 3.0004823926676316, "grad_norm": 3.421875, "learning_rate": 3e-05, "loss": 3.8698, "step": 12440 }, { "epoch": 3.00168837433671, "grad_norm": 4.5625, "learning_rate": 3e-05, "loss": 4.0127, "step": 12445 }, { "epoch": 3.002894356005789, "grad_norm": 5.09375, "learning_rate": 3e-05, "loss": 4.019, "step": 12450 }, { "epoch": 3.0041003376748674, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 4.0906, "step": 12455 }, { "epoch": 3.005306319343946, "grad_norm": 4.34375, "learning_rate": 3e-05, "loss": 3.9361, "step": 12460 }, { "epoch": 3.0065123010130246, "grad_norm": 3.03125, "learning_rate": 3e-05, "loss": 3.9676, "step": 12465 }, { "epoch": 3.007718282682103, "grad_norm": 4.0625, "learning_rate": 3e-05, "loss": 3.9415, "step": 12470 }, { "epoch": 3.0089242643511818, "grad_norm": 3.859375, "learning_rate": 3e-05, "loss": 4.0018, "step": 12475 }, { "epoch": 3.0101302460202604, "grad_norm": 5.1875, "learning_rate": 3e-05, "loss": 4.1846, "step": 12480 }, { "epoch": 3.011336227689339, "grad_norm": 4.46875, "learning_rate": 3e-05, "loss": 3.9359, "step": 12485 }, { "epoch": 3.0125422093584175, "grad_norm": 4.15625, "learning_rate": 3e-05, "loss": 3.9211, "step": 12490 }, { "epoch": 3.0137481910274966, "grad_norm": 3.578125, "learning_rate": 3e-05, "loss": 3.9263, "step": 12495 }, { "epoch": 3.014954172696575, "grad_norm": 3.703125, "learning_rate": 3e-05, "loss": 4.0466, "step": 12500 } ], "logging_steps": 5, "max_steps": 41460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.668857210492682e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }