diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,35033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.859141341051616, + "eval_steps": 500, + "global_step": 25000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00120598166907863, + "grad_norm": 17.625, + "learning_rate": 3e-06, + "loss": 10.4116, + "step": 5 + }, + { + "epoch": 0.00241196333815726, + "grad_norm": 15.625, + "learning_rate": 6e-06, + "loss": 10.3625, + "step": 10 + }, + { + "epoch": 0.00361794500723589, + "grad_norm": 16.875, + "learning_rate": 9e-06, + "loss": 10.2274, + "step": 15 + }, + { + "epoch": 0.00482392667631452, + "grad_norm": 14.625, + "learning_rate": 1.2e-05, + "loss": 9.9835, + "step": 20 + }, + { + "epoch": 0.00602990834539315, + "grad_norm": 14.0, + "learning_rate": 1.5e-05, + "loss": 9.5794, + "step": 25 + }, + { + "epoch": 0.00723589001447178, + "grad_norm": 10.6875, + "learning_rate": 1.8e-05, + "loss": 9.0401, + "step": 30 + }, + { + "epoch": 0.00844187168355041, + "grad_norm": 4.0625, + "learning_rate": 2.1e-05, + "loss": 8.2806, + "step": 35 + }, + { + "epoch": 0.00964785335262904, + "grad_norm": 7.46875, + "learning_rate": 2.4e-05, + "loss": 8.2083, + "step": 40 + }, + { + "epoch": 0.01085383502170767, + "grad_norm": 7.09375, + "learning_rate": 2.7000000000000002e-05, + "loss": 8.0693, + "step": 45 + }, + { + "epoch": 0.0120598166907863, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 7.8499, + "step": 50 + }, + { + "epoch": 0.01326579835986493, + "grad_norm": 4.3125, + "learning_rate": 3e-05, + "loss": 7.771, + "step": 55 + }, + { + "epoch": 0.01447178002894356, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 7.7109, + "step": 60 + }, + { + "epoch": 0.01567776169802219, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 7.6555, + "step": 65 + }, + { + "epoch": 0.01688374336710082, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 7.6062, + "step": 70 + }, + { + "epoch": 0.01808972503617945, + "grad_norm": 1.828125, + "learning_rate": 3e-05, + "loss": 7.5032, + "step": 75 + }, + { + "epoch": 0.01929570670525808, + "grad_norm": 1.78125, + "learning_rate": 3e-05, + "loss": 7.3975, + "step": 80 + }, + { + "epoch": 0.02050168837433671, + "grad_norm": 1.90625, + "learning_rate": 3e-05, + "loss": 7.3448, + "step": 85 + }, + { + "epoch": 0.02170767004341534, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 7.3475, + "step": 90 + }, + { + "epoch": 0.02291365171249397, + "grad_norm": 1.9140625, + "learning_rate": 3e-05, + "loss": 7.1952, + "step": 95 + }, + { + "epoch": 0.0241196333815726, + "grad_norm": 1.9140625, + "learning_rate": 3e-05, + "loss": 7.2894, + "step": 100 + }, + { + "epoch": 0.02532561505065123, + "grad_norm": 1.7265625, + "learning_rate": 3e-05, + "loss": 7.3062, + "step": 105 + }, + { + "epoch": 0.02653159671972986, + "grad_norm": 1.9140625, + "learning_rate": 3e-05, + "loss": 7.1772, + "step": 110 + }, + { + "epoch": 0.02773757838880849, + "grad_norm": 1.7734375, + "learning_rate": 3e-05, + "loss": 7.1534, + "step": 115 + }, + { + "epoch": 0.02894356005788712, + "grad_norm": 1.8984375, + "learning_rate": 3e-05, + "loss": 7.2145, + "step": 120 + }, + { + "epoch": 0.03014954172696575, + "grad_norm": 1.859375, + "learning_rate": 3e-05, + "loss": 7.0982, + "step": 125 + }, + { + "epoch": 0.03135552339604438, + "grad_norm": 1.78125, + "learning_rate": 3e-05, + "loss": 7.1474, + "step": 130 + }, + { + "epoch": 0.03256150506512301, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 7.061, + "step": 135 + }, + { + "epoch": 0.03376748673420164, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 7.0864, + "step": 140 + }, + { + "epoch": 0.03497346840328027, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 7.0517, + "step": 145 + }, + { + "epoch": 0.0361794500723589, + "grad_norm": 1.890625, + "learning_rate": 3e-05, + "loss": 6.9559, + "step": 150 + }, + { + "epoch": 0.03738543174143753, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 6.9734, + "step": 155 + }, + { + "epoch": 0.03859141341051616, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 6.9503, + "step": 160 + }, + { + "epoch": 0.03979739507959479, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 6.9415, + "step": 165 + }, + { + "epoch": 0.04100337674867342, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 7.001, + "step": 170 + }, + { + "epoch": 0.04220935841775205, + "grad_norm": 1.5078125, + "learning_rate": 3e-05, + "loss": 6.8792, + "step": 175 + }, + { + "epoch": 0.04341534008683068, + "grad_norm": 1.7265625, + "learning_rate": 3e-05, + "loss": 6.854, + "step": 180 + }, + { + "epoch": 0.04462132175590931, + "grad_norm": 1.625, + "learning_rate": 3e-05, + "loss": 6.8928, + "step": 185 + }, + { + "epoch": 0.04582730342498794, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 6.7267, + "step": 190 + }, + { + "epoch": 0.04703328509406657, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 6.8235, + "step": 195 + }, + { + "epoch": 0.0482392667631452, + "grad_norm": 1.4921875, + "learning_rate": 3e-05, + "loss": 6.7739, + "step": 200 + }, + { + "epoch": 0.04944524843222383, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 6.7484, + "step": 205 + }, + { + "epoch": 0.05065123010130246, + "grad_norm": 1.984375, + "learning_rate": 3e-05, + "loss": 6.7177, + "step": 210 + }, + { + "epoch": 0.05185721177038109, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 6.7426, + "step": 215 + }, + { + "epoch": 0.05306319343945972, + "grad_norm": 1.7421875, + "learning_rate": 3e-05, + "loss": 6.6993, + "step": 220 + }, + { + "epoch": 0.05426917510853835, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 6.7761, + "step": 225 + }, + { + "epoch": 0.05547515677761698, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 6.5162, + "step": 230 + }, + { + "epoch": 0.05668113844669561, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 6.6471, + "step": 235 + }, + { + "epoch": 0.05788712011577424, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 6.5741, + "step": 240 + }, + { + "epoch": 0.05909310178485287, + "grad_norm": 4.1875, + "learning_rate": 3e-05, + "loss": 6.4797, + "step": 245 + }, + { + "epoch": 0.0602990834539315, + "grad_norm": 1.921875, + "learning_rate": 3e-05, + "loss": 6.5313, + "step": 250 + }, + { + "epoch": 0.06150506512301013, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 6.4377, + "step": 255 + }, + { + "epoch": 0.06271104679208876, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 6.4948, + "step": 260 + }, + { + "epoch": 0.0639170284611674, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 6.4971, + "step": 265 + }, + { + "epoch": 0.06512301013024602, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 6.3971, + "step": 270 + }, + { + "epoch": 0.06632899179932465, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 6.4961, + "step": 275 + }, + { + "epoch": 0.06753497346840329, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 6.4457, + "step": 280 + }, + { + "epoch": 0.06874095513748191, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 6.4788, + "step": 285 + }, + { + "epoch": 0.06994693680656054, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 6.4388, + "step": 290 + }, + { + "epoch": 0.07115291847563918, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 6.2615, + "step": 295 + }, + { + "epoch": 0.0723589001447178, + "grad_norm": 1.859375, + "learning_rate": 3e-05, + "loss": 6.369, + "step": 300 + }, + { + "epoch": 0.07356488181379643, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 6.4014, + "step": 305 + }, + { + "epoch": 0.07477086348287507, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 6.3217, + "step": 310 + }, + { + "epoch": 0.07597684515195369, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 6.424, + "step": 315 + }, + { + "epoch": 0.07718282682103232, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 6.2718, + "step": 320 + }, + { + "epoch": 0.07838880849011096, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 6.2224, + "step": 325 + }, + { + "epoch": 0.07959479015918958, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 6.2538, + "step": 330 + }, + { + "epoch": 0.08080077182826821, + "grad_norm": 1.78125, + "learning_rate": 3e-05, + "loss": 6.2736, + "step": 335 + }, + { + "epoch": 0.08200675349734685, + "grad_norm": 1.7421875, + "learning_rate": 3e-05, + "loss": 6.248, + "step": 340 + }, + { + "epoch": 0.08321273516642547, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 6.3282, + "step": 345 + }, + { + "epoch": 0.0844187168355041, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 6.3235, + "step": 350 + }, + { + "epoch": 0.08562469850458274, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 6.37, + "step": 355 + }, + { + "epoch": 0.08683068017366136, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 6.1852, + "step": 360 + }, + { + "epoch": 0.08803666184273999, + "grad_norm": 3.5, + "learning_rate": 3e-05, + "loss": 6.2639, + "step": 365 + }, + { + "epoch": 0.08924264351181863, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 6.2367, + "step": 370 + }, + { + "epoch": 0.09044862518089725, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 6.0995, + "step": 375 + }, + { + "epoch": 0.09165460684997588, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 6.1735, + "step": 380 + }, + { + "epoch": 0.09286058851905452, + "grad_norm": 1.890625, + "learning_rate": 3e-05, + "loss": 6.1528, + "step": 385 + }, + { + "epoch": 0.09406657018813314, + "grad_norm": 1.8359375, + "learning_rate": 3e-05, + "loss": 6.1329, + "step": 390 + }, + { + "epoch": 0.09527255185721177, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 6.1236, + "step": 395 + }, + { + "epoch": 0.0964785335262904, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 6.198, + "step": 400 + }, + { + "epoch": 0.09768451519536903, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 6.1956, + "step": 405 + }, + { + "epoch": 0.09889049686444766, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 6.1735, + "step": 410 + }, + { + "epoch": 0.1000964785335263, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 6.1953, + "step": 415 + }, + { + "epoch": 0.10130246020260492, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 6.1069, + "step": 420 + }, + { + "epoch": 0.10250844187168355, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 6.0713, + "step": 425 + }, + { + "epoch": 0.10371442354076219, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 6.3303, + "step": 430 + }, + { + "epoch": 0.1049204052098408, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 6.012, + "step": 435 + }, + { + "epoch": 0.10612638687891944, + "grad_norm": 1.78125, + "learning_rate": 3e-05, + "loss": 6.05, + "step": 440 + }, + { + "epoch": 0.10733236854799807, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 6.0952, + "step": 445 + }, + { + "epoch": 0.1085383502170767, + "grad_norm": 1.7734375, + "learning_rate": 3e-05, + "loss": 6.1188, + "step": 450 + }, + { + "epoch": 0.10974433188615533, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 6.1039, + "step": 455 + }, + { + "epoch": 0.11095031355523396, + "grad_norm": 1.984375, + "learning_rate": 3e-05, + "loss": 6.0926, + "step": 460 + }, + { + "epoch": 0.11215629522431259, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 6.0924, + "step": 465 + }, + { + "epoch": 0.11336227689339122, + "grad_norm": 1.859375, + "learning_rate": 3e-05, + "loss": 6.012, + "step": 470 + }, + { + "epoch": 0.11456825856246985, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 5.9395, + "step": 475 + }, + { + "epoch": 0.11577424023154848, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 6.071, + "step": 480 + }, + { + "epoch": 0.11698022190062711, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 6.0479, + "step": 485 + }, + { + "epoch": 0.11818620356970574, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 5.9637, + "step": 490 + }, + { + "epoch": 0.11939218523878437, + "grad_norm": 1.8828125, + "learning_rate": 3e-05, + "loss": 5.9824, + "step": 495 + }, + { + "epoch": 0.120598166907863, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 5.993, + "step": 500 + }, + { + "epoch": 0.12180414857694163, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 5.8498, + "step": 505 + }, + { + "epoch": 0.12301013024602026, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 5.8376, + "step": 510 + }, + { + "epoch": 0.12421611191509889, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 5.9112, + "step": 515 + }, + { + "epoch": 0.12542209358417752, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 5.9365, + "step": 520 + }, + { + "epoch": 0.12662807525325614, + "grad_norm": 1.9921875, + "learning_rate": 3e-05, + "loss": 6.0778, + "step": 525 + }, + { + "epoch": 0.1278340569223348, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 5.9356, + "step": 530 + }, + { + "epoch": 0.12904003859141341, + "grad_norm": 1.9140625, + "learning_rate": 3e-05, + "loss": 5.8361, + "step": 535 + }, + { + "epoch": 0.13024602026049203, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 5.89, + "step": 540 + }, + { + "epoch": 0.13145200192957068, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 5.9576, + "step": 545 + }, + { + "epoch": 0.1326579835986493, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 5.9431, + "step": 550 + }, + { + "epoch": 0.13386396526772792, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 5.8577, + "step": 555 + }, + { + "epoch": 0.13506994693680657, + "grad_norm": 3.671875, + "learning_rate": 3e-05, + "loss": 5.9617, + "step": 560 + }, + { + "epoch": 0.1362759286058852, + "grad_norm": 3.3125, + "learning_rate": 3e-05, + "loss": 5.8371, + "step": 565 + }, + { + "epoch": 0.13748191027496381, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 5.8252, + "step": 570 + }, + { + "epoch": 0.13868789194404246, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 5.808, + "step": 575 + }, + { + "epoch": 0.13989387361312108, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 5.9049, + "step": 580 + }, + { + "epoch": 0.1410998552821997, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 5.836, + "step": 585 + }, + { + "epoch": 0.14230583695127835, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 5.7272, + "step": 590 + }, + { + "epoch": 0.14351181862035697, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 5.8204, + "step": 595 + }, + { + "epoch": 0.1447178002894356, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 5.9753, + "step": 600 + }, + { + "epoch": 0.14592378195851424, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 5.7331, + "step": 605 + }, + { + "epoch": 0.14712976362759286, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 5.8421, + "step": 610 + }, + { + "epoch": 0.14833574529667148, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 5.7543, + "step": 615 + }, + { + "epoch": 0.14954172696575013, + "grad_norm": 1.890625, + "learning_rate": 3e-05, + "loss": 5.7815, + "step": 620 + }, + { + "epoch": 0.15074770863482875, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 5.6632, + "step": 625 + }, + { + "epoch": 0.15195369030390737, + "grad_norm": 1.7265625, + "learning_rate": 3e-05, + "loss": 5.7123, + "step": 630 + }, + { + "epoch": 0.15315967197298602, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 5.8582, + "step": 635 + }, + { + "epoch": 0.15436565364206464, + "grad_norm": 1.9765625, + "learning_rate": 3e-05, + "loss": 5.7628, + "step": 640 + }, + { + "epoch": 0.15557163531114326, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 5.9128, + "step": 645 + }, + { + "epoch": 0.1567776169802219, + "grad_norm": 1.984375, + "learning_rate": 3e-05, + "loss": 5.8285, + "step": 650 + }, + { + "epoch": 0.15798359864930053, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 5.7964, + "step": 655 + }, + { + "epoch": 0.15918958031837915, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 5.8595, + "step": 660 + }, + { + "epoch": 0.1603955619874578, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 5.7467, + "step": 665 + }, + { + "epoch": 0.16160154365653642, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 5.5928, + "step": 670 + }, + { + "epoch": 0.16280752532561504, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 5.738, + "step": 675 + }, + { + "epoch": 0.1640135069946937, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 5.7686, + "step": 680 + }, + { + "epoch": 0.1652194886637723, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 5.7783, + "step": 685 + }, + { + "epoch": 0.16642547033285093, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 5.7515, + "step": 690 + }, + { + "epoch": 0.16763145200192958, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 5.6172, + "step": 695 + }, + { + "epoch": 0.1688374336710082, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 5.6604, + "step": 700 + }, + { + "epoch": 0.17004341534008682, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 5.8269, + "step": 705 + }, + { + "epoch": 0.17124939700916547, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 5.7508, + "step": 710 + }, + { + "epoch": 0.1724553786782441, + "grad_norm": 1.9921875, + "learning_rate": 3e-05, + "loss": 5.6119, + "step": 715 + }, + { + "epoch": 0.1736613603473227, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 5.8154, + "step": 720 + }, + { + "epoch": 0.17486734201640136, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 5.6788, + "step": 725 + }, + { + "epoch": 0.17607332368547998, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 5.6748, + "step": 730 + }, + { + "epoch": 0.1772793053545586, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 5.6841, + "step": 735 + }, + { + "epoch": 0.17848528702363725, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 5.5893, + "step": 740 + }, + { + "epoch": 0.17969126869271587, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 5.6872, + "step": 745 + }, + { + "epoch": 0.1808972503617945, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 5.721, + "step": 750 + }, + { + "epoch": 0.18210323203087314, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 5.7009, + "step": 755 + }, + { + "epoch": 0.18330921369995176, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 5.6044, + "step": 760 + }, + { + "epoch": 0.18451519536903038, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 5.5983, + "step": 765 + }, + { + "epoch": 0.18572117703810903, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 5.6824, + "step": 770 + }, + { + "epoch": 0.18692715870718765, + "grad_norm": 6.625, + "learning_rate": 3e-05, + "loss": 5.5232, + "step": 775 + }, + { + "epoch": 0.18813314037626627, + "grad_norm": 3.96875, + "learning_rate": 3e-05, + "loss": 5.6313, + "step": 780 + }, + { + "epoch": 0.18933912204534492, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 5.7965, + "step": 785 + }, + { + "epoch": 0.19054510371442354, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 5.5971, + "step": 790 + }, + { + "epoch": 0.19175108538350216, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 5.6727, + "step": 795 + }, + { + "epoch": 0.1929570670525808, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 5.7194, + "step": 800 + }, + { + "epoch": 0.19416304872165943, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 5.6573, + "step": 805 + }, + { + "epoch": 0.19536903039073805, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 5.5429, + "step": 810 + }, + { + "epoch": 0.1965750120598167, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 5.541, + "step": 815 + }, + { + "epoch": 0.19778099372889532, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 5.5469, + "step": 820 + }, + { + "epoch": 0.19898697539797394, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 5.4902, + "step": 825 + }, + { + "epoch": 0.2001929570670526, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 5.5931, + "step": 830 + }, + { + "epoch": 0.2013989387361312, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 5.4769, + "step": 835 + }, + { + "epoch": 0.20260492040520983, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 5.6509, + "step": 840 + }, + { + "epoch": 0.20381090207428848, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 5.6149, + "step": 845 + }, + { + "epoch": 0.2050168837433671, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 5.661, + "step": 850 + }, + { + "epoch": 0.20622286541244572, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 5.4612, + "step": 855 + }, + { + "epoch": 0.20742884708152437, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 5.5261, + "step": 860 + }, + { + "epoch": 0.208634828750603, + "grad_norm": 1.859375, + "learning_rate": 3e-05, + "loss": 5.7081, + "step": 865 + }, + { + "epoch": 0.2098408104196816, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 5.6594, + "step": 870 + }, + { + "epoch": 0.21104679208876026, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 5.3843, + "step": 875 + }, + { + "epoch": 0.21225277375783888, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 5.6101, + "step": 880 + }, + { + "epoch": 0.2134587554269175, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 5.4925, + "step": 885 + }, + { + "epoch": 0.21466473709599615, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 5.4863, + "step": 890 + }, + { + "epoch": 0.21587071876507477, + "grad_norm": 1.9765625, + "learning_rate": 3e-05, + "loss": 5.5784, + "step": 895 + }, + { + "epoch": 0.2170767004341534, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 5.5706, + "step": 900 + }, + { + "epoch": 0.21828268210323204, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 5.4767, + "step": 905 + }, + { + "epoch": 0.21948866377231066, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 5.5478, + "step": 910 + }, + { + "epoch": 0.22069464544138928, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 5.4582, + "step": 915 + }, + { + "epoch": 0.22190062711046793, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 5.4833, + "step": 920 + }, + { + "epoch": 0.22310660877954655, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 5.3878, + "step": 925 + }, + { + "epoch": 0.22431259044862517, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 5.6182, + "step": 930 + }, + { + "epoch": 0.22551857211770382, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 5.3843, + "step": 935 + }, + { + "epoch": 0.22672455378678244, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 5.398, + "step": 940 + }, + { + "epoch": 0.22793053545586106, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 5.4073, + "step": 945 + }, + { + "epoch": 0.2291365171249397, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 5.4073, + "step": 950 + }, + { + "epoch": 0.23034249879401833, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 5.5549, + "step": 955 + }, + { + "epoch": 0.23154848046309695, + "grad_norm": 1.8046875, + "learning_rate": 3e-05, + "loss": 5.3167, + "step": 960 + }, + { + "epoch": 0.2327544621321756, + "grad_norm": 1.8984375, + "learning_rate": 3e-05, + "loss": 5.4991, + "step": 965 + }, + { + "epoch": 0.23396044380125422, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 5.617, + "step": 970 + }, + { + "epoch": 0.23516642547033284, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 5.5679, + "step": 975 + }, + { + "epoch": 0.2363724071394115, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 5.394, + "step": 980 + }, + { + "epoch": 0.2375783888084901, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 5.4975, + "step": 985 + }, + { + "epoch": 0.23878437047756873, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 5.3327, + "step": 990 + }, + { + "epoch": 0.23999035214664738, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 5.5471, + "step": 995 + }, + { + "epoch": 0.241196333815726, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 5.4136, + "step": 1000 + }, + { + "epoch": 0.24240231548480462, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 5.494, + "step": 1005 + }, + { + "epoch": 0.24360829715388327, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 5.5097, + "step": 1010 + }, + { + "epoch": 0.2448142788229619, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 5.4636, + "step": 1015 + }, + { + "epoch": 0.2460202604920405, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 5.3754, + "step": 1020 + }, + { + "epoch": 0.24722624216111916, + "grad_norm": 1.8046875, + "learning_rate": 3e-05, + "loss": 5.5269, + "step": 1025 + }, + { + "epoch": 0.24843222383019778, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 5.3828, + "step": 1030 + }, + { + "epoch": 0.2496382054992764, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 5.3867, + "step": 1035 + }, + { + "epoch": 0.25084418716835505, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 5.4392, + "step": 1040 + }, + { + "epoch": 0.2520501688374337, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 5.4154, + "step": 1045 + }, + { + "epoch": 0.2532561505065123, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 5.382, + "step": 1050 + }, + { + "epoch": 0.25446213217559094, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 5.3914, + "step": 1055 + }, + { + "epoch": 0.2556681138446696, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 5.2091, + "step": 1060 + }, + { + "epoch": 0.2568740955137482, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 5.402, + "step": 1065 + }, + { + "epoch": 0.25808007718282683, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 5.3478, + "step": 1070 + }, + { + "epoch": 0.2592860588519055, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 5.278, + "step": 1075 + }, + { + "epoch": 0.26049204052098407, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 5.402, + "step": 1080 + }, + { + "epoch": 0.2616980221900627, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 5.3795, + "step": 1085 + }, + { + "epoch": 0.26290400385914137, + "grad_norm": 1.9375, + "learning_rate": 3e-05, + "loss": 5.2764, + "step": 1090 + }, + { + "epoch": 0.26410998552821996, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 5.3733, + "step": 1095 + }, + { + "epoch": 0.2653159671972986, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 5.281, + "step": 1100 + }, + { + "epoch": 0.26652194886637726, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 5.2757, + "step": 1105 + }, + { + "epoch": 0.26772793053545585, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 5.2248, + "step": 1110 + }, + { + "epoch": 0.2689339122045345, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 5.4059, + "step": 1115 + }, + { + "epoch": 0.27013989387361315, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 5.2239, + "step": 1120 + }, + { + "epoch": 0.27134587554269174, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 5.1511, + "step": 1125 + }, + { + "epoch": 0.2725518572117704, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 5.2924, + "step": 1130 + }, + { + "epoch": 0.27375783888084904, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 5.2489, + "step": 1135 + }, + { + "epoch": 0.27496382054992763, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 5.329, + "step": 1140 + }, + { + "epoch": 0.2761698022190063, + "grad_norm": 1.8515625, + "learning_rate": 3e-05, + "loss": 5.1569, + "step": 1145 + }, + { + "epoch": 0.2773757838880849, + "grad_norm": 1.6796875, + "learning_rate": 3e-05, + "loss": 5.3047, + "step": 1150 + }, + { + "epoch": 0.2785817655571635, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 5.4468, + "step": 1155 + }, + { + "epoch": 0.27978774722624217, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 5.409, + "step": 1160 + }, + { + "epoch": 0.2809937288953208, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 5.2111, + "step": 1165 + }, + { + "epoch": 0.2821997105643994, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 5.3326, + "step": 1170 + }, + { + "epoch": 0.28340569223347806, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 5.1767, + "step": 1175 + }, + { + "epoch": 0.2846116739025567, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 5.3533, + "step": 1180 + }, + { + "epoch": 0.2858176555716353, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 5.2169, + "step": 1185 + }, + { + "epoch": 0.28702363724071395, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 5.351, + "step": 1190 + }, + { + "epoch": 0.2882296189097926, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 5.3343, + "step": 1195 + }, + { + "epoch": 0.2894356005788712, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 5.31, + "step": 1200 + }, + { + "epoch": 0.29064158224794984, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 5.2081, + "step": 1205 + }, + { + "epoch": 0.2918475639170285, + "grad_norm": 1.796875, + "learning_rate": 3e-05, + "loss": 5.3053, + "step": 1210 + }, + { + "epoch": 0.2930535455861071, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 5.2586, + "step": 1215 + }, + { + "epoch": 0.2942595272551857, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 5.1507, + "step": 1220 + }, + { + "epoch": 0.2954655089242644, + "grad_norm": 1.7578125, + "learning_rate": 3e-05, + "loss": 5.3245, + "step": 1225 + }, + { + "epoch": 0.29667149059334297, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 5.2317, + "step": 1230 + }, + { + "epoch": 0.2978774722624216, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 5.2238, + "step": 1235 + }, + { + "epoch": 0.29908345393150026, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 5.2438, + "step": 1240 + }, + { + "epoch": 0.30028943560057886, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 5.2629, + "step": 1245 + }, + { + "epoch": 0.3014954172696575, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 5.1487, + "step": 1250 + }, + { + "epoch": 0.30270139893873615, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 5.2241, + "step": 1255 + }, + { + "epoch": 0.30390738060781475, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 5.2636, + "step": 1260 + }, + { + "epoch": 0.3051133622768934, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 5.1245, + "step": 1265 + }, + { + "epoch": 0.30631934394597204, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 5.1451, + "step": 1270 + }, + { + "epoch": 0.30752532561505064, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 5.2969, + "step": 1275 + }, + { + "epoch": 0.3087313072841293, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 5.1689, + "step": 1280 + }, + { + "epoch": 0.30993728895320793, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 5.2066, + "step": 1285 + }, + { + "epoch": 0.3111432706222865, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.9629, + "step": 1290 + }, + { + "epoch": 0.3123492522913652, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 5.1767, + "step": 1295 + }, + { + "epoch": 0.3135552339604438, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 5.2642, + "step": 1300 + }, + { + "epoch": 0.3147612156295224, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 5.2788, + "step": 1305 + }, + { + "epoch": 0.31596719729860107, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 5.1955, + "step": 1310 + }, + { + "epoch": 0.3171731789676797, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 5.2811, + "step": 1315 + }, + { + "epoch": 0.3183791606367583, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 5.2512, + "step": 1320 + }, + { + "epoch": 0.31958514230583696, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 5.126, + "step": 1325 + }, + { + "epoch": 0.3207911239749156, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 5.0701, + "step": 1330 + }, + { + "epoch": 0.3219971056439942, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 5.1081, + "step": 1335 + }, + { + "epoch": 0.32320308731307285, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 5.2009, + "step": 1340 + }, + { + "epoch": 0.3244090689821515, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 5.25, + "step": 1345 + }, + { + "epoch": 0.3256150506512301, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 5.1474, + "step": 1350 + }, + { + "epoch": 0.32682103232030874, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 5.4087, + "step": 1355 + }, + { + "epoch": 0.3280270139893874, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 5.2222, + "step": 1360 + }, + { + "epoch": 0.329232995658466, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 5.1554, + "step": 1365 + }, + { + "epoch": 0.3304389773275446, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 5.19, + "step": 1370 + }, + { + "epoch": 0.3316449589966233, + "grad_norm": 3.78125, + "learning_rate": 3e-05, + "loss": 5.1978, + "step": 1375 + }, + { + "epoch": 0.33285094066570187, + "grad_norm": 1.875, + "learning_rate": 3e-05, + "loss": 5.1126, + "step": 1380 + }, + { + "epoch": 0.3340569223347805, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 5.1463, + "step": 1385 + }, + { + "epoch": 0.33526290400385916, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 5.0063, + "step": 1390 + }, + { + "epoch": 0.33646888567293776, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 5.199, + "step": 1395 + }, + { + "epoch": 0.3376748673420164, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 5.1133, + "step": 1400 + }, + { + "epoch": 0.33888084901109505, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 5.3344, + "step": 1405 + }, + { + "epoch": 0.34008683068017365, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 5.2786, + "step": 1410 + }, + { + "epoch": 0.3412928123492523, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 5.1066, + "step": 1415 + }, + { + "epoch": 0.34249879401833094, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 5.2149, + "step": 1420 + }, + { + "epoch": 0.34370477568740954, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 5.0942, + "step": 1425 + }, + { + "epoch": 0.3449107573564882, + "grad_norm": 1.9375, + "learning_rate": 3e-05, + "loss": 5.1109, + "step": 1430 + }, + { + "epoch": 0.34611673902556683, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 5.2741, + "step": 1435 + }, + { + "epoch": 0.3473227206946454, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 5.1884, + "step": 1440 + }, + { + "epoch": 0.3485287023637241, + "grad_norm": 1.8984375, + "learning_rate": 3e-05, + "loss": 5.184, + "step": 1445 + }, + { + "epoch": 0.3497346840328027, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 5.113, + "step": 1450 + }, + { + "epoch": 0.3509406657018813, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 5.0311, + "step": 1455 + }, + { + "epoch": 0.35214664737095996, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 5.0447, + "step": 1460 + }, + { + "epoch": 0.3533526290400386, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 5.263, + "step": 1465 + }, + { + "epoch": 0.3545586107091172, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 5.0733, + "step": 1470 + }, + { + "epoch": 0.35576459237819585, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 5.176, + "step": 1475 + }, + { + "epoch": 0.3569705740472745, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.9958, + "step": 1480 + }, + { + "epoch": 0.3581765557163531, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 5.0778, + "step": 1485 + }, + { + "epoch": 0.35938253738543174, + "grad_norm": 1.890625, + "learning_rate": 3e-05, + "loss": 5.1228, + "step": 1490 + }, + { + "epoch": 0.3605885190545104, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 5.053, + "step": 1495 + }, + { + "epoch": 0.361794500723589, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 5.0317, + "step": 1500 + }, + { + "epoch": 0.36300048239266763, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 5.0683, + "step": 1505 + }, + { + "epoch": 0.3642064640617463, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 5.0274, + "step": 1510 + }, + { + "epoch": 0.3654124457308249, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 5.0054, + "step": 1515 + }, + { + "epoch": 0.3666184273999035, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 5.0405, + "step": 1520 + }, + { + "epoch": 0.36782440906898217, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 5.1625, + "step": 1525 + }, + { + "epoch": 0.36903039073806077, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 5.281, + "step": 1530 + }, + { + "epoch": 0.3702363724071394, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 5.1182, + "step": 1535 + }, + { + "epoch": 0.37144235407621806, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 5.088, + "step": 1540 + }, + { + "epoch": 0.37264833574529665, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.9542, + "step": 1545 + }, + { + "epoch": 0.3738543174143753, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 5.0302, + "step": 1550 + }, + { + "epoch": 0.37506029908345395, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 5.0856, + "step": 1555 + }, + { + "epoch": 0.37626628075253254, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 5.018, + "step": 1560 + }, + { + "epoch": 0.3774722624216112, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.9291, + "step": 1565 + }, + { + "epoch": 0.37867824409068984, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.909, + "step": 1570 + }, + { + "epoch": 0.37988422575976843, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 5.111, + "step": 1575 + }, + { + "epoch": 0.3810902074288471, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 5.1551, + "step": 1580 + }, + { + "epoch": 0.38229618909792573, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.9441, + "step": 1585 + }, + { + "epoch": 0.3835021707670043, + "grad_norm": 1.8984375, + "learning_rate": 3e-05, + "loss": 4.9529, + "step": 1590 + }, + { + "epoch": 0.384708152436083, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.9265, + "step": 1595 + }, + { + "epoch": 0.3859141341051616, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.9057, + "step": 1600 + }, + { + "epoch": 0.3871201157742402, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 5.2044, + "step": 1605 + }, + { + "epoch": 0.38832609744331886, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 5.0659, + "step": 1610 + }, + { + "epoch": 0.3895320791123975, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 5.0833, + "step": 1615 + }, + { + "epoch": 0.3907380607814761, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.9692, + "step": 1620 + }, + { + "epoch": 0.39194404245055475, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 5.0722, + "step": 1625 + }, + { + "epoch": 0.3931500241196334, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.9036, + "step": 1630 + }, + { + "epoch": 0.394356005788712, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.9615, + "step": 1635 + }, + { + "epoch": 0.39556198745779064, + "grad_norm": 1.9921875, + "learning_rate": 3e-05, + "loss": 5.0087, + "step": 1640 + }, + { + "epoch": 0.3967679691268693, + "grad_norm": 1.9140625, + "learning_rate": 3e-05, + "loss": 5.0289, + "step": 1645 + }, + { + "epoch": 0.3979739507959479, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 5.0801, + "step": 1650 + }, + { + "epoch": 0.39917993246502653, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.8894, + "step": 1655 + }, + { + "epoch": 0.4003859141341052, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 5.0647, + "step": 1660 + }, + { + "epoch": 0.4015918958031838, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 5.0765, + "step": 1665 + }, + { + "epoch": 0.4027978774722624, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.9994, + "step": 1670 + }, + { + "epoch": 0.40400385914134107, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 5.1047, + "step": 1675 + }, + { + "epoch": 0.40520984081041966, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 5.0652, + "step": 1680 + }, + { + "epoch": 0.4064158224794983, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 5.1016, + "step": 1685 + }, + { + "epoch": 0.40762180414857696, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 5.0802, + "step": 1690 + }, + { + "epoch": 0.40882778581765555, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.938, + "step": 1695 + }, + { + "epoch": 0.4100337674867342, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 5.0301, + "step": 1700 + }, + { + "epoch": 0.41123974915581285, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 5.0248, + "step": 1705 + }, + { + "epoch": 0.41244573082489144, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.7854, + "step": 1710 + }, + { + "epoch": 0.4136517124939701, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 5.0096, + "step": 1715 + }, + { + "epoch": 0.41485769416304874, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.9715, + "step": 1720 + }, + { + "epoch": 0.41606367583212733, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.9592, + "step": 1725 + }, + { + "epoch": 0.417269657501206, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.8384, + "step": 1730 + }, + { + "epoch": 0.41847563917028463, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.8799, + "step": 1735 + }, + { + "epoch": 0.4196816208393632, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 5.0099, + "step": 1740 + }, + { + "epoch": 0.42088760250844187, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.7933, + "step": 1745 + }, + { + "epoch": 0.4220935841775205, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 5.1248, + "step": 1750 + }, + { + "epoch": 0.4232995658465991, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.9463, + "step": 1755 + }, + { + "epoch": 0.42450554751567776, + "grad_norm": 1.859375, + "learning_rate": 3e-05, + "loss": 4.9497, + "step": 1760 + }, + { + "epoch": 0.4257115291847564, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.8903, + "step": 1765 + }, + { + "epoch": 0.426917510853835, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.8761, + "step": 1770 + }, + { + "epoch": 0.42812349252291365, + "grad_norm": 1.90625, + "learning_rate": 3e-05, + "loss": 4.9089, + "step": 1775 + }, + { + "epoch": 0.4293294741919923, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.9017, + "step": 1780 + }, + { + "epoch": 0.4305354558610709, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.8968, + "step": 1785 + }, + { + "epoch": 0.43174143753014954, + "grad_norm": 1.875, + "learning_rate": 3e-05, + "loss": 4.9408, + "step": 1790 + }, + { + "epoch": 0.4329474191992282, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.9692, + "step": 1795 + }, + { + "epoch": 0.4341534008683068, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 5.0574, + "step": 1800 + }, + { + "epoch": 0.43535938253738543, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.9252, + "step": 1805 + }, + { + "epoch": 0.4365653642064641, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.91, + "step": 1810 + }, + { + "epoch": 0.4377713458755427, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 4.8551, + "step": 1815 + }, + { + "epoch": 0.4389773275446213, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.8312, + "step": 1820 + }, + { + "epoch": 0.44018330921369997, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.9764, + "step": 1825 + }, + { + "epoch": 0.44138929088277856, + "grad_norm": 1.8984375, + "learning_rate": 3e-05, + "loss": 4.8777, + "step": 1830 + }, + { + "epoch": 0.4425952725518572, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 4.9967, + "step": 1835 + }, + { + "epoch": 0.44380125422093586, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.9884, + "step": 1840 + }, + { + "epoch": 0.44500723589001445, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.9541, + "step": 1845 + }, + { + "epoch": 0.4462132175590931, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.7961, + "step": 1850 + }, + { + "epoch": 0.44741919922817175, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 5.0094, + "step": 1855 + }, + { + "epoch": 0.44862518089725034, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.9893, + "step": 1860 + }, + { + "epoch": 0.449831162566329, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.8225, + "step": 1865 + }, + { + "epoch": 0.45103714423540764, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 5.0, + "step": 1870 + }, + { + "epoch": 0.45224312590448623, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.8553, + "step": 1875 + }, + { + "epoch": 0.4534491075735649, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.9396, + "step": 1880 + }, + { + "epoch": 0.45465508924264353, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.8937, + "step": 1885 + }, + { + "epoch": 0.4558610709117221, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.6979, + "step": 1890 + }, + { + "epoch": 0.45706705258080077, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.914, + "step": 1895 + }, + { + "epoch": 0.4582730342498794, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.9191, + "step": 1900 + }, + { + "epoch": 0.459479015918958, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.8465, + "step": 1905 + }, + { + "epoch": 0.46068499758803666, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.853, + "step": 1910 + }, + { + "epoch": 0.4618909792571153, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.8957, + "step": 1915 + }, + { + "epoch": 0.4630969609261939, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.9466, + "step": 1920 + }, + { + "epoch": 0.46430294259527255, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.9664, + "step": 1925 + }, + { + "epoch": 0.4655089242643512, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.8807, + "step": 1930 + }, + { + "epoch": 0.4667149059334298, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.9079, + "step": 1935 + }, + { + "epoch": 0.46792088760250844, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.99, + "step": 1940 + }, + { + "epoch": 0.4691268692715871, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.9467, + "step": 1945 + }, + { + "epoch": 0.4703328509406657, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.8917, + "step": 1950 + }, + { + "epoch": 0.47153883260974433, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.9579, + "step": 1955 + }, + { + "epoch": 0.472744814278823, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.9121, + "step": 1960 + }, + { + "epoch": 0.47395079594790157, + "grad_norm": 1.859375, + "learning_rate": 3e-05, + "loss": 4.9916, + "step": 1965 + }, + { + "epoch": 0.4751567776169802, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.9472, + "step": 1970 + }, + { + "epoch": 0.47636275928605887, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 5.0118, + "step": 1975 + }, + { + "epoch": 0.47756874095513746, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.8575, + "step": 1980 + }, + { + "epoch": 0.4787747226242161, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.8229, + "step": 1985 + }, + { + "epoch": 0.47998070429329476, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.9579, + "step": 1990 + }, + { + "epoch": 0.48118668596237335, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.7751, + "step": 1995 + }, + { + "epoch": 0.482392667631452, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.9796, + "step": 2000 + }, + { + "epoch": 0.48359864930053065, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.7714, + "step": 2005 + }, + { + "epoch": 0.48480463096960924, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.7068, + "step": 2010 + }, + { + "epoch": 0.4860106126386879, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.8003, + "step": 2015 + }, + { + "epoch": 0.48721659430776654, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.8965, + "step": 2020 + }, + { + "epoch": 0.48842257597684513, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.8556, + "step": 2025 + }, + { + "epoch": 0.4896285576459238, + "grad_norm": 1.9765625, + "learning_rate": 3e-05, + "loss": 4.7593, + "step": 2030 + }, + { + "epoch": 0.4908345393150024, + "grad_norm": 1.875, + "learning_rate": 3e-05, + "loss": 4.8081, + "step": 2035 + }, + { + "epoch": 0.492040520984081, + "grad_norm": 1.9921875, + "learning_rate": 3e-05, + "loss": 4.9076, + "step": 2040 + }, + { + "epoch": 0.49324650265315967, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.7247, + "step": 2045 + }, + { + "epoch": 0.4944524843222383, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.8586, + "step": 2050 + }, + { + "epoch": 0.4956584659913169, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.7292, + "step": 2055 + }, + { + "epoch": 0.49686444766039556, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.7707, + "step": 2060 + }, + { + "epoch": 0.4980704293294742, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.8343, + "step": 2065 + }, + { + "epoch": 0.4992764109985528, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 4.8711, + "step": 2070 + }, + { + "epoch": 0.5004823926676315, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.7347, + "step": 2075 + }, + { + "epoch": 0.5016883743367101, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.7419, + "step": 2080 + }, + { + "epoch": 0.5028943560057887, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.7517, + "step": 2085 + }, + { + "epoch": 0.5041003376748674, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.755, + "step": 2090 + }, + { + "epoch": 0.505306319343946, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.8667, + "step": 2095 + }, + { + "epoch": 0.5065123010130246, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.7972, + "step": 2100 + }, + { + "epoch": 0.5077182826821033, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.881, + "step": 2105 + }, + { + "epoch": 0.5089242643511819, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.83, + "step": 2110 + }, + { + "epoch": 0.5101302460202605, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.8669, + "step": 2115 + }, + { + "epoch": 0.5113362276893392, + "grad_norm": 3.671875, + "learning_rate": 3e-05, + "loss": 4.8719, + "step": 2120 + }, + { + "epoch": 0.5125422093584178, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 4.8299, + "step": 2125 + }, + { + "epoch": 0.5137481910274964, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.8699, + "step": 2130 + }, + { + "epoch": 0.5149541726965751, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.8022, + "step": 2135 + }, + { + "epoch": 0.5161601543656537, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.815, + "step": 2140 + }, + { + "epoch": 0.5173661360347322, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.7154, + "step": 2145 + }, + { + "epoch": 0.518572117703811, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.7906, + "step": 2150 + }, + { + "epoch": 0.5197780993728895, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.7535, + "step": 2155 + }, + { + "epoch": 0.5209840810419681, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.6967, + "step": 2160 + }, + { + "epoch": 0.5221900627110468, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.6081, + "step": 2165 + }, + { + "epoch": 0.5233960443801254, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.7869, + "step": 2170 + }, + { + "epoch": 0.524602026049204, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.7334, + "step": 2175 + }, + { + "epoch": 0.5258080077182827, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.7554, + "step": 2180 + }, + { + "epoch": 0.5270139893873613, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.6895, + "step": 2185 + }, + { + "epoch": 0.5282199710564399, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.7401, + "step": 2190 + }, + { + "epoch": 0.5294259527255186, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.7774, + "step": 2195 + }, + { + "epoch": 0.5306319343945972, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.665, + "step": 2200 + }, + { + "epoch": 0.5318379160636758, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.7083, + "step": 2205 + }, + { + "epoch": 0.5330438977327545, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.7318, + "step": 2210 + }, + { + "epoch": 0.5342498794018331, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.7261, + "step": 2215 + }, + { + "epoch": 0.5354558610709117, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.6802, + "step": 2220 + }, + { + "epoch": 0.5366618427399904, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.7567, + "step": 2225 + }, + { + "epoch": 0.537867824409069, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.6111, + "step": 2230 + }, + { + "epoch": 0.5390738060781476, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.7502, + "step": 2235 + }, + { + "epoch": 0.5402797877472263, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.5811, + "step": 2240 + }, + { + "epoch": 0.5414857694163049, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.7408, + "step": 2245 + }, + { + "epoch": 0.5426917510853835, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 4.7826, + "step": 2250 + }, + { + "epoch": 0.5438977327544622, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.6391, + "step": 2255 + }, + { + "epoch": 0.5451037144235408, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.7174, + "step": 2260 + }, + { + "epoch": 0.5463096960926194, + "grad_norm": 1.7734375, + "learning_rate": 3e-05, + "loss": 4.6833, + "step": 2265 + }, + { + "epoch": 0.5475156777616981, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.7697, + "step": 2270 + }, + { + "epoch": 0.5487216594307767, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.7611, + "step": 2275 + }, + { + "epoch": 0.5499276410998553, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.5988, + "step": 2280 + }, + { + "epoch": 0.551133622768934, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.7913, + "step": 2285 + }, + { + "epoch": 0.5523396044380126, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.7792, + "step": 2290 + }, + { + "epoch": 0.5535455861070911, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.6986, + "step": 2295 + }, + { + "epoch": 0.5547515677761699, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.8458, + "step": 2300 + }, + { + "epoch": 0.5559575494452484, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.7553, + "step": 2305 + }, + { + "epoch": 0.557163531114327, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.5985, + "step": 2310 + }, + { + "epoch": 0.5583695127834057, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.6457, + "step": 2315 + }, + { + "epoch": 0.5595754944524843, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.8647, + "step": 2320 + }, + { + "epoch": 0.5607814761215629, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.7258, + "step": 2325 + }, + { + "epoch": 0.5619874577906416, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.7187, + "step": 2330 + }, + { + "epoch": 0.5631934394597202, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.5816, + "step": 2335 + }, + { + "epoch": 0.5643994211287988, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.6699, + "step": 2340 + }, + { + "epoch": 0.5656054027978775, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.5736, + "step": 2345 + }, + { + "epoch": 0.5668113844669561, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.7388, + "step": 2350 + }, + { + "epoch": 0.5680173661360347, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.9559, + "step": 2355 + }, + { + "epoch": 0.5692233478051134, + "grad_norm": 1.8515625, + "learning_rate": 3e-05, + "loss": 4.6631, + "step": 2360 + }, + { + "epoch": 0.570429329474192, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.4889, + "step": 2365 + }, + { + "epoch": 0.5716353111432706, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.7725, + "step": 2370 + }, + { + "epoch": 0.5728412928123493, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.8148, + "step": 2375 + }, + { + "epoch": 0.5740472744814279, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.5893, + "step": 2380 + }, + { + "epoch": 0.5752532561505065, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.7027, + "step": 2385 + }, + { + "epoch": 0.5764592378195852, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.6606, + "step": 2390 + }, + { + "epoch": 0.5776652194886638, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.7504, + "step": 2395 + }, + { + "epoch": 0.5788712011577424, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.6643, + "step": 2400 + }, + { + "epoch": 0.5800771828268211, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.7539, + "step": 2405 + }, + { + "epoch": 0.5812831644958997, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.6503, + "step": 2410 + }, + { + "epoch": 0.5824891461649783, + "grad_norm": 1.859375, + "learning_rate": 3e-05, + "loss": 4.7897, + "step": 2415 + }, + { + "epoch": 0.583695127834057, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.6341, + "step": 2420 + }, + { + "epoch": 0.5849011095031356, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 4.5938, + "step": 2425 + }, + { + "epoch": 0.5861070911722142, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.8009, + "step": 2430 + }, + { + "epoch": 0.5873130728412929, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.6057, + "step": 2435 + }, + { + "epoch": 0.5885190545103715, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.7151, + "step": 2440 + }, + { + "epoch": 0.58972503617945, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.72, + "step": 2445 + }, + { + "epoch": 0.5909310178485288, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.6675, + "step": 2450 + }, + { + "epoch": 0.5921369995176073, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.6394, + "step": 2455 + }, + { + "epoch": 0.5933429811866859, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.7655, + "step": 2460 + }, + { + "epoch": 0.5945489628557646, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.5585, + "step": 2465 + }, + { + "epoch": 0.5957549445248432, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.6956, + "step": 2470 + }, + { + "epoch": 0.5969609261939218, + "grad_norm": 1.9140625, + "learning_rate": 3e-05, + "loss": 4.5783, + "step": 2475 + }, + { + "epoch": 0.5981669078630005, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.5324, + "step": 2480 + }, + { + "epoch": 0.5993728895320791, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.6965, + "step": 2485 + }, + { + "epoch": 0.6005788712011577, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.5978, + "step": 2490 + }, + { + "epoch": 0.6017848528702364, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.7335, + "step": 2495 + }, + { + "epoch": 0.602990834539315, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.6361, + "step": 2500 + }, + { + "epoch": 0.6041968162083936, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.7529, + "step": 2505 + }, + { + "epoch": 0.6054027978774723, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.7204, + "step": 2510 + }, + { + "epoch": 0.6066087795465509, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.6322, + "step": 2515 + }, + { + "epoch": 0.6078147612156295, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.568, + "step": 2520 + }, + { + "epoch": 0.6090207428847082, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.704, + "step": 2525 + }, + { + "epoch": 0.6102267245537868, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.6954, + "step": 2530 + }, + { + "epoch": 0.6114327062228654, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.675, + "step": 2535 + }, + { + "epoch": 0.6126386878919441, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.7965, + "step": 2540 + }, + { + "epoch": 0.6138446695610227, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.7336, + "step": 2545 + }, + { + "epoch": 0.6150506512301013, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.7419, + "step": 2550 + }, + { + "epoch": 0.61625663289918, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.5528, + "step": 2555 + }, + { + "epoch": 0.6174626145682586, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 4.5106, + "step": 2560 + }, + { + "epoch": 0.6186685962373372, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.5695, + "step": 2565 + }, + { + "epoch": 0.6198745779064159, + "grad_norm": 1.90625, + "learning_rate": 3e-05, + "loss": 4.6957, + "step": 2570 + }, + { + "epoch": 0.6210805595754945, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.5537, + "step": 2575 + }, + { + "epoch": 0.622286541244573, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.6757, + "step": 2580 + }, + { + "epoch": 0.6234925229136518, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.5825, + "step": 2585 + }, + { + "epoch": 0.6246985045827304, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.5221, + "step": 2590 + }, + { + "epoch": 0.625904486251809, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.6702, + "step": 2595 + }, + { + "epoch": 0.6271104679208876, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.4973, + "step": 2600 + }, + { + "epoch": 0.6283164495899662, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.7989, + "step": 2605 + }, + { + "epoch": 0.6295224312590448, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.6821, + "step": 2610 + }, + { + "epoch": 0.6307284129281235, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.6938, + "step": 2615 + }, + { + "epoch": 0.6319343945972021, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.5866, + "step": 2620 + }, + { + "epoch": 0.6331403762662807, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.6921, + "step": 2625 + }, + { + "epoch": 0.6343463579353594, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.5261, + "step": 2630 + }, + { + "epoch": 0.635552339604438, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 4.616, + "step": 2635 + }, + { + "epoch": 0.6367583212735166, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.5105, + "step": 2640 + }, + { + "epoch": 0.6379643029425953, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.5342, + "step": 2645 + }, + { + "epoch": 0.6391702846116739, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.5575, + "step": 2650 + }, + { + "epoch": 0.6403762662807525, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.6319, + "step": 2655 + }, + { + "epoch": 0.6415822479498312, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.7018, + "step": 2660 + }, + { + "epoch": 0.6427882296189098, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.774, + "step": 2665 + }, + { + "epoch": 0.6439942112879884, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.6651, + "step": 2670 + }, + { + "epoch": 0.6452001929570671, + "grad_norm": 3.921875, + "learning_rate": 3e-05, + "loss": 4.6475, + "step": 2675 + }, + { + "epoch": 0.6464061746261457, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.4535, + "step": 2680 + }, + { + "epoch": 0.6476121562952243, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.5088, + "step": 2685 + }, + { + "epoch": 0.648818137964303, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.4722, + "step": 2690 + }, + { + "epoch": 0.6500241196333816, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 4.5021, + "step": 2695 + }, + { + "epoch": 0.6512301013024602, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.5359, + "step": 2700 + }, + { + "epoch": 0.6524360829715389, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.6285, + "step": 2705 + }, + { + "epoch": 0.6536420646406175, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.6462, + "step": 2710 + }, + { + "epoch": 0.6548480463096961, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.578, + "step": 2715 + }, + { + "epoch": 0.6560540279787748, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.5873, + "step": 2720 + }, + { + "epoch": 0.6572600096478534, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.5252, + "step": 2725 + }, + { + "epoch": 0.658465991316932, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.6603, + "step": 2730 + }, + { + "epoch": 0.6596719729860107, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.707, + "step": 2735 + }, + { + "epoch": 0.6608779546550893, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.7296, + "step": 2740 + }, + { + "epoch": 0.6620839363241678, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.554, + "step": 2745 + }, + { + "epoch": 0.6632899179932465, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 4.6896, + "step": 2750 + }, + { + "epoch": 0.6644958996623251, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.5461, + "step": 2755 + }, + { + "epoch": 0.6657018813314037, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.5021, + "step": 2760 + }, + { + "epoch": 0.6669078630004824, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.7011, + "step": 2765 + }, + { + "epoch": 0.668113844669561, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.5652, + "step": 2770 + }, + { + "epoch": 0.6693198263386396, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.6102, + "step": 2775 + }, + { + "epoch": 0.6705258080077183, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.6903, + "step": 2780 + }, + { + "epoch": 0.6717317896767969, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.7055, + "step": 2785 + }, + { + "epoch": 0.6729377713458755, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.5797, + "step": 2790 + }, + { + "epoch": 0.6741437530149542, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.4644, + "step": 2795 + }, + { + "epoch": 0.6753497346840328, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.5142, + "step": 2800 + }, + { + "epoch": 0.6765557163531114, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.6188, + "step": 2805 + }, + { + "epoch": 0.6777616980221901, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.2891, + "step": 2810 + }, + { + "epoch": 0.6789676796912687, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.6081, + "step": 2815 + }, + { + "epoch": 0.6801736613603473, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.6215, + "step": 2820 + }, + { + "epoch": 0.681379643029426, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.4295, + "step": 2825 + }, + { + "epoch": 0.6825856246985046, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.5824, + "step": 2830 + }, + { + "epoch": 0.6837916063675832, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.4736, + "step": 2835 + }, + { + "epoch": 0.6849975880366619, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.5933, + "step": 2840 + }, + { + "epoch": 0.6862035697057405, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.5604, + "step": 2845 + }, + { + "epoch": 0.6874095513748191, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.4067, + "step": 2850 + }, + { + "epoch": 0.6886155330438978, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.4835, + "step": 2855 + }, + { + "epoch": 0.6898215147129764, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.4981, + "step": 2860 + }, + { + "epoch": 0.691027496382055, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.5609, + "step": 2865 + }, + { + "epoch": 0.6922334780511337, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.4212, + "step": 2870 + }, + { + "epoch": 0.6934394597202123, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.533, + "step": 2875 + }, + { + "epoch": 0.6946454413892909, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.5442, + "step": 2880 + }, + { + "epoch": 0.6958514230583696, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.5254, + "step": 2885 + }, + { + "epoch": 0.6970574047274481, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.352, + "step": 2890 + }, + { + "epoch": 0.6982633863965267, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.5573, + "step": 2895 + }, + { + "epoch": 0.6994693680656054, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 4.5445, + "step": 2900 + }, + { + "epoch": 0.700675349734684, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 4.5631, + "step": 2905 + }, + { + "epoch": 0.7018813314037626, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.4409, + "step": 2910 + }, + { + "epoch": 0.7030873130728413, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.5282, + "step": 2915 + }, + { + "epoch": 0.7042932947419199, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.3651, + "step": 2920 + }, + { + "epoch": 0.7054992764109985, + "grad_norm": 1.765625, + "learning_rate": 3e-05, + "loss": 4.5946, + "step": 2925 + }, + { + "epoch": 0.7067052580800772, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.5357, + "step": 2930 + }, + { + "epoch": 0.7079112397491558, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.5109, + "step": 2935 + }, + { + "epoch": 0.7091172214182344, + "grad_norm": 1.9375, + "learning_rate": 3e-05, + "loss": 4.4849, + "step": 2940 + }, + { + "epoch": 0.7103232030873131, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.4213, + "step": 2945 + }, + { + "epoch": 0.7115291847563917, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.4284, + "step": 2950 + }, + { + "epoch": 0.7127351664254703, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.4445, + "step": 2955 + }, + { + "epoch": 0.713941148094549, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.5524, + "step": 2960 + }, + { + "epoch": 0.7151471297636276, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.5115, + "step": 2965 + }, + { + "epoch": 0.7163531114327062, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.5595, + "step": 2970 + }, + { + "epoch": 0.7175590931017849, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.4096, + "step": 2975 + }, + { + "epoch": 0.7187650747708635, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.6125, + "step": 2980 + }, + { + "epoch": 0.7199710564399421, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.5777, + "step": 2985 + }, + { + "epoch": 0.7211770381090208, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.411, + "step": 2990 + }, + { + "epoch": 0.7223830197780994, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.6638, + "step": 2995 + }, + { + "epoch": 0.723589001447178, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.4474, + "step": 3000 + }, + { + "epoch": 0.7247949831162567, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.4597, + "step": 3005 + }, + { + "epoch": 0.7260009647853353, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.4419, + "step": 3010 + }, + { + "epoch": 0.7272069464544139, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.3696, + "step": 3015 + }, + { + "epoch": 0.7284129281234926, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.442, + "step": 3020 + }, + { + "epoch": 0.7296189097925712, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.3994, + "step": 3025 + }, + { + "epoch": 0.7308248914616498, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.5059, + "step": 3030 + }, + { + "epoch": 0.7320308731307285, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.5434, + "step": 3035 + }, + { + "epoch": 0.733236854799807, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.4819, + "step": 3040 + }, + { + "epoch": 0.7344428364688856, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.4458, + "step": 3045 + }, + { + "epoch": 0.7356488181379643, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.4622, + "step": 3050 + }, + { + "epoch": 0.7368547998070429, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.521, + "step": 3055 + }, + { + "epoch": 0.7380607814761215, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.4673, + "step": 3060 + }, + { + "epoch": 0.7392667631452002, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.4084, + "step": 3065 + }, + { + "epoch": 0.7404727448142788, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.416, + "step": 3070 + }, + { + "epoch": 0.7416787264833574, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.4623, + "step": 3075 + }, + { + "epoch": 0.7428847081524361, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.6505, + "step": 3080 + }, + { + "epoch": 0.7440906898215147, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.4949, + "step": 3085 + }, + { + "epoch": 0.7452966714905933, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.5213, + "step": 3090 + }, + { + "epoch": 0.746502653159672, + "grad_norm": 1.8828125, + "learning_rate": 3e-05, + "loss": 4.3938, + "step": 3095 + }, + { + "epoch": 0.7477086348287506, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.3599, + "step": 3100 + }, + { + "epoch": 0.7489146164978292, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.2375, + "step": 3105 + }, + { + "epoch": 0.7501205981669079, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.566, + "step": 3110 + }, + { + "epoch": 0.7513265798359865, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.6481, + "step": 3115 + }, + { + "epoch": 0.7525325615050651, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.6129, + "step": 3120 + }, + { + "epoch": 0.7537385431741438, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.6412, + "step": 3125 + }, + { + "epoch": 0.7549445248432224, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.3374, + "step": 3130 + }, + { + "epoch": 0.756150506512301, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.5581, + "step": 3135 + }, + { + "epoch": 0.7573564881813797, + "grad_norm": 1.8046875, + "learning_rate": 3e-05, + "loss": 4.4636, + "step": 3140 + }, + { + "epoch": 0.7585624698504583, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.7031, + "step": 3145 + }, + { + "epoch": 0.7597684515195369, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.4857, + "step": 3150 + }, + { + "epoch": 0.7609744331886156, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.3713, + "step": 3155 + }, + { + "epoch": 0.7621804148576942, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.5158, + "step": 3160 + }, + { + "epoch": 0.7633863965267728, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.3696, + "step": 3165 + }, + { + "epoch": 0.7645923781958515, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.3256, + "step": 3170 + }, + { + "epoch": 0.7657983598649301, + "grad_norm": 1.8515625, + "learning_rate": 3e-05, + "loss": 4.3505, + "step": 3175 + }, + { + "epoch": 0.7670043415340086, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.4221, + "step": 3180 + }, + { + "epoch": 0.7682103232030874, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.609, + "step": 3185 + }, + { + "epoch": 0.769416304872166, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.4097, + "step": 3190 + }, + { + "epoch": 0.7706222865412445, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.3881, + "step": 3195 + }, + { + "epoch": 0.7718282682103232, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.3572, + "step": 3200 + }, + { + "epoch": 0.7730342498794018, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.3264, + "step": 3205 + }, + { + "epoch": 0.7742402315484804, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1551, + "step": 3210 + }, + { + "epoch": 0.7754462132175591, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.6972, + "step": 3215 + }, + { + "epoch": 0.7766521948866377, + "grad_norm": 3.421875, + "learning_rate": 3e-05, + "loss": 4.6838, + "step": 3220 + }, + { + "epoch": 0.7778581765557163, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.4757, + "step": 3225 + }, + { + "epoch": 0.779064158224795, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.4435, + "step": 3230 + }, + { + "epoch": 0.7802701398938736, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.3932, + "step": 3235 + }, + { + "epoch": 0.7814761215629522, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.4354, + "step": 3240 + }, + { + "epoch": 0.7826821032320309, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.4226, + "step": 3245 + }, + { + "epoch": 0.7838880849011095, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.5854, + "step": 3250 + }, + { + "epoch": 0.7850940665701881, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.417, + "step": 3255 + }, + { + "epoch": 0.7863000482392668, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.4898, + "step": 3260 + }, + { + "epoch": 0.7875060299083454, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.5403, + "step": 3265 + }, + { + "epoch": 0.788712011577424, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.6457, + "step": 3270 + }, + { + "epoch": 0.7899179932465027, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.3593, + "step": 3275 + }, + { + "epoch": 0.7911239749155813, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.3323, + "step": 3280 + }, + { + "epoch": 0.7923299565846599, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.5542, + "step": 3285 + }, + { + "epoch": 0.7935359382537386, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.521, + "step": 3290 + }, + { + "epoch": 0.7947419199228172, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.4324, + "step": 3295 + }, + { + "epoch": 0.7959479015918958, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.4935, + "step": 3300 + }, + { + "epoch": 0.7971538832609745, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.4216, + "step": 3305 + }, + { + "epoch": 0.7983598649300531, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.4761, + "step": 3310 + }, + { + "epoch": 0.7995658465991317, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.4752, + "step": 3315 + }, + { + "epoch": 0.8007718282682104, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.409, + "step": 3320 + }, + { + "epoch": 0.801977809937289, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.3812, + "step": 3325 + }, + { + "epoch": 0.8031837916063675, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.4513, + "step": 3330 + }, + { + "epoch": 0.8043897732754463, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.502, + "step": 3335 + }, + { + "epoch": 0.8055957549445248, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.3314, + "step": 3340 + }, + { + "epoch": 0.8068017366136034, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.513, + "step": 3345 + }, + { + "epoch": 0.8080077182826821, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.4607, + "step": 3350 + }, + { + "epoch": 0.8092136999517607, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.382, + "step": 3355 + }, + { + "epoch": 0.8104196816208393, + "grad_norm": 1.8671875, + "learning_rate": 3e-05, + "loss": 4.3694, + "step": 3360 + }, + { + "epoch": 0.811625663289918, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.36, + "step": 3365 + }, + { + "epoch": 0.8128316449589966, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.4718, + "step": 3370 + }, + { + "epoch": 0.8140376266280752, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.302, + "step": 3375 + }, + { + "epoch": 0.8152436082971539, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.4969, + "step": 3380 + }, + { + "epoch": 0.8164495899662325, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.3885, + "step": 3385 + }, + { + "epoch": 0.8176555716353111, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.4318, + "step": 3390 + }, + { + "epoch": 0.8188615533043898, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.4583, + "step": 3395 + }, + { + "epoch": 0.8200675349734684, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.2654, + "step": 3400 + }, + { + "epoch": 0.821273516642547, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.3499, + "step": 3405 + }, + { + "epoch": 0.8224794983116257, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.3371, + "step": 3410 + }, + { + "epoch": 0.8236854799807043, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.3177, + "step": 3415 + }, + { + "epoch": 0.8248914616497829, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.3892, + "step": 3420 + }, + { + "epoch": 0.8260974433188616, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.3503, + "step": 3425 + }, + { + "epoch": 0.8273034249879402, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 4.4576, + "step": 3430 + }, + { + "epoch": 0.8285094066570188, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.4192, + "step": 3435 + }, + { + "epoch": 0.8297153883260975, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.4293, + "step": 3440 + }, + { + "epoch": 0.8309213699951761, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.3069, + "step": 3445 + }, + { + "epoch": 0.8321273516642547, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.432, + "step": 3450 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.3897, + "step": 3455 + }, + { + "epoch": 0.834539315002412, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.4484, + "step": 3460 + }, + { + "epoch": 0.8357452966714906, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.2252, + "step": 3465 + }, + { + "epoch": 0.8369512783405693, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.3953, + "step": 3470 + }, + { + "epoch": 0.8381572600096479, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.4112, + "step": 3475 + }, + { + "epoch": 0.8393632416787264, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.4335, + "step": 3480 + }, + { + "epoch": 0.8405692233478052, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.4711, + "step": 3485 + }, + { + "epoch": 0.8417752050168837, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 4.2862, + "step": 3490 + }, + { + "epoch": 0.8429811866859623, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.2646, + "step": 3495 + }, + { + "epoch": 0.844187168355041, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.3446, + "step": 3500 + }, + { + "epoch": 0.8453931500241196, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.2989, + "step": 3505 + }, + { + "epoch": 0.8465991316931982, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.3109, + "step": 3510 + }, + { + "epoch": 0.8478051133622769, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.5348, + "step": 3515 + }, + { + "epoch": 0.8490110950313555, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.3558, + "step": 3520 + }, + { + "epoch": 0.8502170767004341, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.5995, + "step": 3525 + }, + { + "epoch": 0.8514230583695128, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.4, + "step": 3530 + }, + { + "epoch": 0.8526290400385914, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.3557, + "step": 3535 + }, + { + "epoch": 0.85383502170767, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.3707, + "step": 3540 + }, + { + "epoch": 0.8550410033767487, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.2456, + "step": 3545 + }, + { + "epoch": 0.8562469850458273, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.2662, + "step": 3550 + }, + { + "epoch": 0.8574529667149059, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.3337, + "step": 3555 + }, + { + "epoch": 0.8586589483839846, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.4318, + "step": 3560 + }, + { + "epoch": 0.8598649300530632, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.2292, + "step": 3565 + }, + { + "epoch": 0.8610709117221418, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.201, + "step": 3570 + }, + { + "epoch": 0.8622768933912205, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.4233, + "step": 3575 + }, + { + "epoch": 0.8634828750602991, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.5371, + "step": 3580 + }, + { + "epoch": 0.8646888567293777, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.2995, + "step": 3585 + }, + { + "epoch": 0.8658948383984564, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.4251, + "step": 3590 + }, + { + "epoch": 0.867100820067535, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.3139, + "step": 3595 + }, + { + "epoch": 0.8683068017366136, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.2808, + "step": 3600 + }, + { + "epoch": 0.8695127834056923, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.3172, + "step": 3605 + }, + { + "epoch": 0.8707187650747709, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.3034, + "step": 3610 + }, + { + "epoch": 0.8719247467438495, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.2873, + "step": 3615 + }, + { + "epoch": 0.8731307284129282, + "grad_norm": 3.53125, + "learning_rate": 3e-05, + "loss": 4.3963, + "step": 3620 + }, + { + "epoch": 0.8743367100820068, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 4.3578, + "step": 3625 + }, + { + "epoch": 0.8755426917510853, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.2587, + "step": 3630 + }, + { + "epoch": 0.876748673420164, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.4614, + "step": 3635 + }, + { + "epoch": 0.8779546550892426, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1304, + "step": 3640 + }, + { + "epoch": 0.8791606367583212, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.2083, + "step": 3645 + }, + { + "epoch": 0.8803666184273999, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.2815, + "step": 3650 + }, + { + "epoch": 0.8815726000964785, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.4023, + "step": 3655 + }, + { + "epoch": 0.8827785817655571, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0872, + "step": 3660 + }, + { + "epoch": 0.8839845634346358, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 4.4202, + "step": 3665 + }, + { + "epoch": 0.8851905451037144, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.4167, + "step": 3670 + }, + { + "epoch": 0.886396526772793, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 4.2922, + "step": 3675 + }, + { + "epoch": 0.8876025084418717, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.5491, + "step": 3680 + }, + { + "epoch": 0.8888084901109503, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.0671, + "step": 3685 + }, + { + "epoch": 0.8900144717800289, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.4984, + "step": 3690 + }, + { + "epoch": 0.8912204534491076, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.3098, + "step": 3695 + }, + { + "epoch": 0.8924264351181862, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.2841, + "step": 3700 + }, + { + "epoch": 0.8936324167872648, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.4347, + "step": 3705 + }, + { + "epoch": 0.8948383984563435, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.3184, + "step": 3710 + }, + { + "epoch": 0.8960443801254221, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.2526, + "step": 3715 + }, + { + "epoch": 0.8972503617945007, + "grad_norm": 3.59375, + "learning_rate": 3e-05, + "loss": 4.3653, + "step": 3720 + }, + { + "epoch": 0.8984563434635794, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.1791, + "step": 3725 + }, + { + "epoch": 0.899662325132658, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.3713, + "step": 3730 + }, + { + "epoch": 0.9008683068017366, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.278, + "step": 3735 + }, + { + "epoch": 0.9020742884708153, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.3953, + "step": 3740 + }, + { + "epoch": 0.9032802701398939, + "grad_norm": 1.921875, + "learning_rate": 3e-05, + "loss": 4.248, + "step": 3745 + }, + { + "epoch": 0.9044862518089725, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 4.4365, + "step": 3750 + }, + { + "epoch": 0.9056922334780512, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.3308, + "step": 3755 + }, + { + "epoch": 0.9068982151471298, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.404, + "step": 3760 + }, + { + "epoch": 0.9081041968162084, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.2406, + "step": 3765 + }, + { + "epoch": 0.9093101784852871, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.4286, + "step": 3770 + }, + { + "epoch": 0.9105161601543657, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 4.2325, + "step": 3775 + }, + { + "epoch": 0.9117221418234442, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.1555, + "step": 3780 + }, + { + "epoch": 0.912928123492523, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.261, + "step": 3785 + }, + { + "epoch": 0.9141341051616015, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.234, + "step": 3790 + }, + { + "epoch": 0.9153400868306801, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.2657, + "step": 3795 + }, + { + "epoch": 0.9165460684997588, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.183, + "step": 3800 + }, + { + "epoch": 0.9177520501688374, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.4021, + "step": 3805 + }, + { + "epoch": 0.918958031837916, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.3369, + "step": 3810 + }, + { + "epoch": 0.9201640135069947, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.2254, + "step": 3815 + }, + { + "epoch": 0.9213699951760733, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.364, + "step": 3820 + }, + { + "epoch": 0.9225759768451519, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 4.2053, + "step": 3825 + }, + { + "epoch": 0.9237819585142306, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.2441, + "step": 3830 + }, + { + "epoch": 0.9249879401833092, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.2054, + "step": 3835 + }, + { + "epoch": 0.9261939218523878, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.3761, + "step": 3840 + }, + { + "epoch": 0.9273999035214665, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 4.2734, + "step": 3845 + }, + { + "epoch": 0.9286058851905451, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1227, + "step": 3850 + }, + { + "epoch": 0.9298118668596237, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.4329, + "step": 3855 + }, + { + "epoch": 0.9310178485287024, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.3229, + "step": 3860 + }, + { + "epoch": 0.932223830197781, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.2651, + "step": 3865 + }, + { + "epoch": 0.9334298118668596, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.3458, + "step": 3870 + }, + { + "epoch": 0.9346357935359383, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.2634, + "step": 3875 + }, + { + "epoch": 0.9358417752050169, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.3467, + "step": 3880 + }, + { + "epoch": 0.9370477568740955, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1485, + "step": 3885 + }, + { + "epoch": 0.9382537385431742, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.368, + "step": 3890 + }, + { + "epoch": 0.9394597202122528, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.2311, + "step": 3895 + }, + { + "epoch": 0.9406657018813314, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.3066, + "step": 3900 + }, + { + "epoch": 0.9418716835504101, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.2629, + "step": 3905 + }, + { + "epoch": 0.9430776652194887, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.2645, + "step": 3910 + }, + { + "epoch": 0.9442836468885673, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.3612, + "step": 3915 + }, + { + "epoch": 0.945489628557646, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.245, + "step": 3920 + }, + { + "epoch": 0.9466956102267245, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.3571, + "step": 3925 + }, + { + "epoch": 0.9479015918958031, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1609, + "step": 3930 + }, + { + "epoch": 0.9491075735648818, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.2638, + "step": 3935 + }, + { + "epoch": 0.9503135552339604, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.3612, + "step": 3940 + }, + { + "epoch": 0.951519536903039, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.2925, + "step": 3945 + }, + { + "epoch": 0.9527255185721177, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.1713, + "step": 3950 + }, + { + "epoch": 0.9539315002411963, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1754, + "step": 3955 + }, + { + "epoch": 0.9551374819102749, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.3065, + "step": 3960 + }, + { + "epoch": 0.9563434635793536, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 4.1348, + "step": 3965 + }, + { + "epoch": 0.9575494452484322, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.2436, + "step": 3970 + }, + { + "epoch": 0.9587554269175108, + "grad_norm": 3.34375, + "learning_rate": 3e-05, + "loss": 4.3451, + "step": 3975 + }, + { + "epoch": 0.9599614085865895, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.3137, + "step": 3980 + }, + { + "epoch": 0.9611673902556681, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1208, + "step": 3985 + }, + { + "epoch": 0.9623733719247467, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0812, + "step": 3990 + }, + { + "epoch": 0.9635793535938254, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.3247, + "step": 3995 + }, + { + "epoch": 0.964785335262904, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 4.304, + "step": 4000 + }, + { + "epoch": 0.9659913169319826, + "grad_norm": 1.90625, + "learning_rate": 3e-05, + "loss": 4.3427, + "step": 4005 + }, + { + "epoch": 0.9671972986010613, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.1484, + "step": 4010 + }, + { + "epoch": 0.9684032802701399, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.1438, + "step": 4015 + }, + { + "epoch": 0.9696092619392185, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.1749, + "step": 4020 + }, + { + "epoch": 0.9708152436082972, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.2491, + "step": 4025 + }, + { + "epoch": 0.9720212252773758, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.2584, + "step": 4030 + }, + { + "epoch": 0.9732272069464544, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.296, + "step": 4035 + }, + { + "epoch": 0.9744331886155331, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.3104, + "step": 4040 + }, + { + "epoch": 0.9756391702846117, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.1227, + "step": 4045 + }, + { + "epoch": 0.9768451519536903, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0633, + "step": 4050 + }, + { + "epoch": 0.978051133622769, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.2781, + "step": 4055 + }, + { + "epoch": 0.9792571152918476, + "grad_norm": 3.921875, + "learning_rate": 3e-05, + "loss": 4.3384, + "step": 4060 + }, + { + "epoch": 0.9804630969609262, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.3135, + "step": 4065 + }, + { + "epoch": 0.9816690786300049, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.364, + "step": 4070 + }, + { + "epoch": 0.9828750602990834, + "grad_norm": 3.484375, + "learning_rate": 3e-05, + "loss": 4.1734, + "step": 4075 + }, + { + "epoch": 0.984081041968162, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 4.1808, + "step": 4080 + }, + { + "epoch": 0.9852870236372407, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1457, + "step": 4085 + }, + { + "epoch": 0.9864930053063193, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1074, + "step": 4090 + }, + { + "epoch": 0.9876989869753979, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.1239, + "step": 4095 + }, + { + "epoch": 0.9889049686444766, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.4142, + "step": 4100 + }, + { + "epoch": 0.9901109503135552, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.2597, + "step": 4105 + }, + { + "epoch": 0.9913169319826338, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.2161, + "step": 4110 + }, + { + "epoch": 0.9925229136517125, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1805, + "step": 4115 + }, + { + "epoch": 0.9937288953207911, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.342, + "step": 4120 + }, + { + "epoch": 0.9949348769898697, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0627, + "step": 4125 + }, + { + "epoch": 0.9961408586589484, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.1565, + "step": 4130 + }, + { + "epoch": 0.997346840328027, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.1849, + "step": 4135 + }, + { + "epoch": 0.9985528219971056, + "grad_norm": 1.8671875, + "learning_rate": 3e-05, + "loss": 4.2711, + "step": 4140 + }, + { + "epoch": 0.9997588036661843, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.3792, + "step": 4145 + }, + { + "epoch": 1.000964785335263, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.1339, + "step": 4150 + }, + { + "epoch": 1.0021707670043416, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0774, + "step": 4155 + }, + { + "epoch": 1.0033767486734202, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.3682, + "step": 4160 + }, + { + "epoch": 1.0045827303424988, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 4.2177, + "step": 4165 + }, + { + "epoch": 1.0057887120115774, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.2855, + "step": 4170 + }, + { + "epoch": 1.006994693680656, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1398, + "step": 4175 + }, + { + "epoch": 1.0082006753497348, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.1209, + "step": 4180 + }, + { + "epoch": 1.0094066570188134, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.3527, + "step": 4185 + }, + { + "epoch": 1.010612638687892, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.3545, + "step": 4190 + }, + { + "epoch": 1.0118186203569706, + "grad_norm": 1.8828125, + "learning_rate": 3e-05, + "loss": 4.0483, + "step": 4195 + }, + { + "epoch": 1.0130246020260492, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.2368, + "step": 4200 + }, + { + "epoch": 1.0142305836951278, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.0668, + "step": 4205 + }, + { + "epoch": 1.0154365653642066, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.2598, + "step": 4210 + }, + { + "epoch": 1.0166425470332852, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.2961, + "step": 4215 + }, + { + "epoch": 1.0178485287023638, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 4.2173, + "step": 4220 + }, + { + "epoch": 1.0190545103714423, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.1629, + "step": 4225 + }, + { + "epoch": 1.020260492040521, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.102, + "step": 4230 + }, + { + "epoch": 1.0214664737095995, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.2525, + "step": 4235 + }, + { + "epoch": 1.0226724553786783, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0938, + "step": 4240 + }, + { + "epoch": 1.023878437047757, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.171, + "step": 4245 + }, + { + "epoch": 1.0250844187168355, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.2585, + "step": 4250 + }, + { + "epoch": 1.0262904003859141, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.1278, + "step": 4255 + }, + { + "epoch": 1.0274963820549927, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.2064, + "step": 4260 + }, + { + "epoch": 1.0287023637240713, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.2195, + "step": 4265 + }, + { + "epoch": 1.0299083453931501, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.2334, + "step": 4270 + }, + { + "epoch": 1.0311143270622287, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.128, + "step": 4275 + }, + { + "epoch": 1.0323203087313073, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.232, + "step": 4280 + }, + { + "epoch": 1.033526290400386, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.1578, + "step": 4285 + }, + { + "epoch": 1.0347322720694645, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.2872, + "step": 4290 + }, + { + "epoch": 1.035938253738543, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9836, + "step": 4295 + }, + { + "epoch": 1.037144235407622, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.306, + "step": 4300 + }, + { + "epoch": 1.0383502170767005, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.2063, + "step": 4305 + }, + { + "epoch": 1.039556198745779, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.1627, + "step": 4310 + }, + { + "epoch": 1.0407621804148577, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.3906, + "step": 4315 + }, + { + "epoch": 1.0419681620839363, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.16, + "step": 4320 + }, + { + "epoch": 1.0431741437530149, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.2564, + "step": 4325 + }, + { + "epoch": 1.0443801254220937, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.1919, + "step": 4330 + }, + { + "epoch": 1.0455861070911723, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1206, + "step": 4335 + }, + { + "epoch": 1.0467920887602509, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.2585, + "step": 4340 + }, + { + "epoch": 1.0479980704293295, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.2129, + "step": 4345 + }, + { + "epoch": 1.049204052098408, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.1281, + "step": 4350 + }, + { + "epoch": 1.0504100337674867, + "grad_norm": 1.9765625, + "learning_rate": 3e-05, + "loss": 4.1529, + "step": 4355 + }, + { + "epoch": 1.0516160154365655, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.289, + "step": 4360 + }, + { + "epoch": 1.052821997105644, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.191, + "step": 4365 + }, + { + "epoch": 1.0540279787747227, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.3676, + "step": 4370 + }, + { + "epoch": 1.0552339604438012, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0414, + "step": 4375 + }, + { + "epoch": 1.0564399421128798, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1719, + "step": 4380 + }, + { + "epoch": 1.0576459237819584, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.2354, + "step": 4385 + }, + { + "epoch": 1.0588519054510372, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0022, + "step": 4390 + }, + { + "epoch": 1.0600578871201158, + "grad_norm": 1.9375, + "learning_rate": 3e-05, + "loss": 4.0969, + "step": 4395 + }, + { + "epoch": 1.0612638687891944, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9865, + "step": 4400 + }, + { + "epoch": 1.062469850458273, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.1781, + "step": 4405 + }, + { + "epoch": 1.0636758321273516, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.2007, + "step": 4410 + }, + { + "epoch": 1.0648818137964302, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1212, + "step": 4415 + }, + { + "epoch": 1.066087795465509, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.114, + "step": 4420 + }, + { + "epoch": 1.0672937771345876, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.1021, + "step": 4425 + }, + { + "epoch": 1.0684997588036662, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.2303, + "step": 4430 + }, + { + "epoch": 1.0697057404727448, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1789, + "step": 4435 + }, + { + "epoch": 1.0709117221418234, + "grad_norm": 1.9765625, + "learning_rate": 3e-05, + "loss": 4.0521, + "step": 4440 + }, + { + "epoch": 1.072117703810902, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.0793, + "step": 4445 + }, + { + "epoch": 1.0733236854799808, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.2611, + "step": 4450 + }, + { + "epoch": 1.0745296671490594, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.1296, + "step": 4455 + }, + { + "epoch": 1.075735648818138, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.1777, + "step": 4460 + }, + { + "epoch": 1.0769416304872166, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0622, + "step": 4465 + }, + { + "epoch": 1.0781476121562952, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1491, + "step": 4470 + }, + { + "epoch": 1.0793535938253738, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0251, + "step": 4475 + }, + { + "epoch": 1.0805595754944526, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1231, + "step": 4480 + }, + { + "epoch": 1.0817655571635312, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1179, + "step": 4485 + }, + { + "epoch": 1.0829715388326098, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.2324, + "step": 4490 + }, + { + "epoch": 1.0841775205016884, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.2711, + "step": 4495 + }, + { + "epoch": 1.085383502170767, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.2723, + "step": 4500 + }, + { + "epoch": 1.0865894838398455, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.1012, + "step": 4505 + }, + { + "epoch": 1.0877954655089244, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.2708, + "step": 4510 + }, + { + "epoch": 1.089001447178003, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1696, + "step": 4515 + }, + { + "epoch": 1.0902074288470815, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9826, + "step": 4520 + }, + { + "epoch": 1.0914134105161601, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.168, + "step": 4525 + }, + { + "epoch": 1.0926193921852387, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.087, + "step": 4530 + }, + { + "epoch": 1.0938253738543173, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.1323, + "step": 4535 + }, + { + "epoch": 1.0950313555233961, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1108, + "step": 4540 + }, + { + "epoch": 1.0962373371924747, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.2787, + "step": 4545 + }, + { + "epoch": 1.0974433188615533, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 4.1382, + "step": 4550 + }, + { + "epoch": 1.098649300530632, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.2285, + "step": 4555 + }, + { + "epoch": 1.0998552821997105, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.2904, + "step": 4560 + }, + { + "epoch": 1.101061263868789, + "grad_norm": 1.984375, + "learning_rate": 3e-05, + "loss": 4.0092, + "step": 4565 + }, + { + "epoch": 1.102267245537868, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0829, + "step": 4570 + }, + { + "epoch": 1.1034732272069465, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.3022, + "step": 4575 + }, + { + "epoch": 1.104679208876025, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.2022, + "step": 4580 + }, + { + "epoch": 1.1058851905451037, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.0575, + "step": 4585 + }, + { + "epoch": 1.1070911722141823, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1738, + "step": 4590 + }, + { + "epoch": 1.1082971538832609, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.1463, + "step": 4595 + }, + { + "epoch": 1.1095031355523397, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.1492, + "step": 4600 + }, + { + "epoch": 1.1107091172214183, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.173, + "step": 4605 + }, + { + "epoch": 1.111915098890497, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.115, + "step": 4610 + }, + { + "epoch": 1.1131210805595755, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.2306, + "step": 4615 + }, + { + "epoch": 1.114327062228654, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0668, + "step": 4620 + }, + { + "epoch": 1.1155330438977327, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1718, + "step": 4625 + }, + { + "epoch": 1.1167390255668115, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.3329, + "step": 4630 + }, + { + "epoch": 1.11794500723589, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1353, + "step": 4635 + }, + { + "epoch": 1.1191509889049687, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1285, + "step": 4640 + }, + { + "epoch": 1.1203569705740473, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.143, + "step": 4645 + }, + { + "epoch": 1.1215629522431259, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.086, + "step": 4650 + }, + { + "epoch": 1.1227689339122044, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0245, + "step": 4655 + }, + { + "epoch": 1.1239749155812833, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0672, + "step": 4660 + }, + { + "epoch": 1.1251808972503619, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.0632, + "step": 4665 + }, + { + "epoch": 1.1263868789194404, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.2852, + "step": 4670 + }, + { + "epoch": 1.127592860588519, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.0793, + "step": 4675 + }, + { + "epoch": 1.1287988422575976, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.1748, + "step": 4680 + }, + { + "epoch": 1.1300048239266762, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.111, + "step": 4685 + }, + { + "epoch": 1.1312108055957548, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1002, + "step": 4690 + }, + { + "epoch": 1.1324167872648336, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.1802, + "step": 4695 + }, + { + "epoch": 1.1336227689339122, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.2181, + "step": 4700 + }, + { + "epoch": 1.1348287506029908, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1413, + "step": 4705 + }, + { + "epoch": 1.1360347322720694, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.1944, + "step": 4710 + }, + { + "epoch": 1.137240713941148, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 4.0352, + "step": 4715 + }, + { + "epoch": 1.1384466956102268, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0962, + "step": 4720 + }, + { + "epoch": 1.1396526772793054, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1969, + "step": 4725 + }, + { + "epoch": 1.140858658948384, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1846, + "step": 4730 + }, + { + "epoch": 1.1420646406174626, + "grad_norm": 1.84375, + "learning_rate": 3e-05, + "loss": 3.9844, + "step": 4735 + }, + { + "epoch": 1.1432706222865412, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1057, + "step": 4740 + }, + { + "epoch": 1.1444766039556198, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.1298, + "step": 4745 + }, + { + "epoch": 1.1456825856246984, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.1007, + "step": 4750 + }, + { + "epoch": 1.1468885672937772, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.1214, + "step": 4755 + }, + { + "epoch": 1.1480945489628558, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.13, + "step": 4760 + }, + { + "epoch": 1.1493005306319344, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.0098, + "step": 4765 + }, + { + "epoch": 1.150506512301013, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0388, + "step": 4770 + }, + { + "epoch": 1.1517124939700916, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9719, + "step": 4775 + }, + { + "epoch": 1.1529184756391704, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.0633, + "step": 4780 + }, + { + "epoch": 1.154124457308249, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.152, + "step": 4785 + }, + { + "epoch": 1.1553304389773276, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.2158, + "step": 4790 + }, + { + "epoch": 1.1565364206464062, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.126, + "step": 4795 + }, + { + "epoch": 1.1577424023154848, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.2521, + "step": 4800 + }, + { + "epoch": 1.1589483839845633, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.2653, + "step": 4805 + }, + { + "epoch": 1.1601543656536422, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 4.0527, + "step": 4810 + }, + { + "epoch": 1.1613603473227208, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9317, + "step": 4815 + }, + { + "epoch": 1.1625663289917993, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.9856, + "step": 4820 + }, + { + "epoch": 1.163772310660878, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 4.1945, + "step": 4825 + }, + { + "epoch": 1.1649782923299565, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.1806, + "step": 4830 + }, + { + "epoch": 1.1661842739990351, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.1163, + "step": 4835 + }, + { + "epoch": 1.167390255668114, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.0936, + "step": 4840 + }, + { + "epoch": 1.1685962373371925, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9, + "step": 4845 + }, + { + "epoch": 1.1698022190062711, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.161, + "step": 4850 + }, + { + "epoch": 1.1710082006753497, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.1315, + "step": 4855 + }, + { + "epoch": 1.1722141823444283, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.991, + "step": 4860 + }, + { + "epoch": 1.173420164013507, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.2845, + "step": 4865 + }, + { + "epoch": 1.1746261456825857, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0925, + "step": 4870 + }, + { + "epoch": 1.1758321273516643, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0806, + "step": 4875 + }, + { + "epoch": 1.177038109020743, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0887, + "step": 4880 + }, + { + "epoch": 1.1782440906898215, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0005, + "step": 4885 + }, + { + "epoch": 1.1794500723589, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0836, + "step": 4890 + }, + { + "epoch": 1.1806560540279787, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1957, + "step": 4895 + }, + { + "epoch": 1.1818620356970575, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0634, + "step": 4900 + }, + { + "epoch": 1.183068017366136, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1855, + "step": 4905 + }, + { + "epoch": 1.1842739990352147, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.0014, + "step": 4910 + }, + { + "epoch": 1.1854799807042933, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0946, + "step": 4915 + }, + { + "epoch": 1.1866859623733719, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9955, + "step": 4920 + }, + { + "epoch": 1.1878919440424505, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1028, + "step": 4925 + }, + { + "epoch": 1.1890979257115293, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1742, + "step": 4930 + }, + { + "epoch": 1.1903039073806079, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.1621, + "step": 4935 + }, + { + "epoch": 1.1915098890496865, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0406, + "step": 4940 + }, + { + "epoch": 1.192715870718765, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.2553, + "step": 4945 + }, + { + "epoch": 1.1939218523878437, + "grad_norm": 1.859375, + "learning_rate": 3e-05, + "loss": 4.1071, + "step": 4950 + }, + { + "epoch": 1.1951278340569222, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0448, + "step": 4955 + }, + { + "epoch": 1.196333815726001, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0193, + "step": 4960 + }, + { + "epoch": 1.1975397973950797, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.0086, + "step": 4965 + }, + { + "epoch": 1.1987457790641582, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9806, + "step": 4970 + }, + { + "epoch": 1.1999517607332368, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.1022, + "step": 4975 + }, + { + "epoch": 1.2011577424023154, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.075, + "step": 4980 + }, + { + "epoch": 1.202363724071394, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0788, + "step": 4985 + }, + { + "epoch": 1.2035697057404728, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.1779, + "step": 4990 + }, + { + "epoch": 1.2047756874095514, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.2796, + "step": 4995 + }, + { + "epoch": 1.20598166907863, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0545, + "step": 5000 + }, + { + "epoch": 1.2071876507477086, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 3.9616, + "step": 5005 + }, + { + "epoch": 1.2083936324167872, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.2295, + "step": 5010 + }, + { + "epoch": 1.2095996140858658, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.0503, + "step": 5015 + }, + { + "epoch": 1.2108055957549446, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.1597, + "step": 5020 + }, + { + "epoch": 1.2120115774240232, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.0367, + "step": 5025 + }, + { + "epoch": 1.2132175590931018, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.0732, + "step": 5030 + }, + { + "epoch": 1.2144235407621804, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.0282, + "step": 5035 + }, + { + "epoch": 1.215629522431259, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.132, + "step": 5040 + }, + { + "epoch": 1.2168355041003376, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0851, + "step": 5045 + }, + { + "epoch": 1.2180414857694164, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0377, + "step": 5050 + }, + { + "epoch": 1.219247467438495, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0493, + "step": 5055 + }, + { + "epoch": 1.2204534491075736, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1075, + "step": 5060 + }, + { + "epoch": 1.2216594307766522, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.0531, + "step": 5065 + }, + { + "epoch": 1.2228654124457308, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8709, + "step": 5070 + }, + { + "epoch": 1.2240713941148094, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1531, + "step": 5075 + }, + { + "epoch": 1.2252773757838882, + "grad_norm": 1.90625, + "learning_rate": 3e-05, + "loss": 3.9813, + "step": 5080 + }, + { + "epoch": 1.2264833574529668, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.9545, + "step": 5085 + }, + { + "epoch": 1.2276893391220454, + "grad_norm": 1.8359375, + "learning_rate": 3e-05, + "loss": 4.0415, + "step": 5090 + }, + { + "epoch": 1.228895320791124, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9904, + "step": 5095 + }, + { + "epoch": 1.2301013024602026, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9211, + "step": 5100 + }, + { + "epoch": 1.2313072841292811, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.0547, + "step": 5105 + }, + { + "epoch": 1.23251326579836, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.0466, + "step": 5110 + }, + { + "epoch": 1.2337192474674386, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1817, + "step": 5115 + }, + { + "epoch": 1.2349252291365171, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.3072, + "step": 5120 + }, + { + "epoch": 1.2361312108055957, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.0433, + "step": 5125 + }, + { + "epoch": 1.2373371924746743, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0285, + "step": 5130 + }, + { + "epoch": 1.238543174143753, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0347, + "step": 5135 + }, + { + "epoch": 1.2397491558128317, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1013, + "step": 5140 + }, + { + "epoch": 1.2409551374819103, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9632, + "step": 5145 + }, + { + "epoch": 1.242161119150989, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.1824, + "step": 5150 + }, + { + "epoch": 1.2433671008200675, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1593, + "step": 5155 + }, + { + "epoch": 1.244573082489146, + "grad_norm": 1.9375, + "learning_rate": 3e-05, + "loss": 4.01, + "step": 5160 + }, + { + "epoch": 1.2457790641582247, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.075, + "step": 5165 + }, + { + "epoch": 1.2469850458273035, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.1642, + "step": 5170 + }, + { + "epoch": 1.248191027496382, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.087, + "step": 5175 + }, + { + "epoch": 1.2493970091654607, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9084, + "step": 5180 + }, + { + "epoch": 1.2506029908345393, + "grad_norm": 1.8515625, + "learning_rate": 3e-05, + "loss": 4.1324, + "step": 5185 + }, + { + "epoch": 1.251808972503618, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9775, + "step": 5190 + }, + { + "epoch": 1.2530149541726967, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0716, + "step": 5195 + }, + { + "epoch": 1.2542209358417753, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.8476, + "step": 5200 + }, + { + "epoch": 1.255426917510854, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.1585, + "step": 5205 + }, + { + "epoch": 1.2566328991799325, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9763, + "step": 5210 + }, + { + "epoch": 1.257838880849011, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.0321, + "step": 5215 + }, + { + "epoch": 1.2590448625180897, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 3.9563, + "step": 5220 + }, + { + "epoch": 1.2602508441871683, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0625, + "step": 5225 + }, + { + "epoch": 1.2614568258562469, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0721, + "step": 5230 + }, + { + "epoch": 1.2626628075253257, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.9653, + "step": 5235 + }, + { + "epoch": 1.2638687891944043, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9962, + "step": 5240 + }, + { + "epoch": 1.2650747708634829, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0589, + "step": 5245 + }, + { + "epoch": 1.2662807525325614, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0012, + "step": 5250 + }, + { + "epoch": 1.2674867342016403, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.805, + "step": 5255 + }, + { + "epoch": 1.2686927158707189, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.1071, + "step": 5260 + }, + { + "epoch": 1.2698986975397974, + "grad_norm": 1.7734375, + "learning_rate": 3e-05, + "loss": 3.9521, + "step": 5265 + }, + { + "epoch": 1.271104679208876, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 3.8906, + "step": 5270 + }, + { + "epoch": 1.2723106608779546, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.012, + "step": 5275 + }, + { + "epoch": 1.2735166425470332, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9555, + "step": 5280 + }, + { + "epoch": 1.2747226242161118, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8404, + "step": 5285 + }, + { + "epoch": 1.2759286058851904, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0432, + "step": 5290 + }, + { + "epoch": 1.2771345875542692, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 3.9693, + "step": 5295 + }, + { + "epoch": 1.2783405692233478, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1444, + "step": 5300 + }, + { + "epoch": 1.2795465508924264, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.2652, + "step": 5305 + }, + { + "epoch": 1.280752532561505, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 3.9923, + "step": 5310 + }, + { + "epoch": 1.2819585142305838, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9645, + "step": 5315 + }, + { + "epoch": 1.2831644958996624, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.9688, + "step": 5320 + }, + { + "epoch": 1.284370477568741, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0604, + "step": 5325 + }, + { + "epoch": 1.2855764592378196, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9091, + "step": 5330 + }, + { + "epoch": 1.2867824409068982, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.1342, + "step": 5335 + }, + { + "epoch": 1.2879884225759768, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0989, + "step": 5340 + }, + { + "epoch": 1.2891944042450554, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 4.002, + "step": 5345 + }, + { + "epoch": 1.290400385914134, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.0439, + "step": 5350 + }, + { + "epoch": 1.2916063675832128, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0985, + "step": 5355 + }, + { + "epoch": 1.2928123492522914, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0396, + "step": 5360 + }, + { + "epoch": 1.29401833092137, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8272, + "step": 5365 + }, + { + "epoch": 1.2952243125904486, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 4.0941, + "step": 5370 + }, + { + "epoch": 1.2964302942595274, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9093, + "step": 5375 + }, + { + "epoch": 1.297636275928606, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0413, + "step": 5380 + }, + { + "epoch": 1.2988422575976846, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9375, + "step": 5385 + }, + { + "epoch": 1.3000482392667632, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0401, + "step": 5390 + }, + { + "epoch": 1.3012542209358418, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9941, + "step": 5395 + }, + { + "epoch": 1.3024602026049203, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1502, + "step": 5400 + }, + { + "epoch": 1.303666184273999, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9638, + "step": 5405 + }, + { + "epoch": 1.3048721659430775, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.8541, + "step": 5410 + }, + { + "epoch": 1.3060781476121563, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9946, + "step": 5415 + }, + { + "epoch": 1.307284129281235, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.0078, + "step": 5420 + }, + { + "epoch": 1.3084901109503135, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9895, + "step": 5425 + }, + { + "epoch": 1.3096960926193921, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9607, + "step": 5430 + }, + { + "epoch": 1.310902074288471, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 3.8822, + "step": 5435 + }, + { + "epoch": 1.3121080559575495, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.8596, + "step": 5440 + }, + { + "epoch": 1.3133140376266281, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.1279, + "step": 5445 + }, + { + "epoch": 1.3145200192957067, + "grad_norm": 1.984375, + "learning_rate": 3e-05, + "loss": 4.0723, + "step": 5450 + }, + { + "epoch": 1.3157260009647853, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.0458, + "step": 5455 + }, + { + "epoch": 1.316931982633864, + "grad_norm": 1.9375, + "learning_rate": 3e-05, + "loss": 4.0843, + "step": 5460 + }, + { + "epoch": 1.3181379643029425, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 4.0233, + "step": 5465 + }, + { + "epoch": 1.319343945972021, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9558, + "step": 5470 + }, + { + "epoch": 1.3205499276411, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9242, + "step": 5475 + }, + { + "epoch": 1.3217559093101785, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8995, + "step": 5480 + }, + { + "epoch": 1.322961890979257, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9838, + "step": 5485 + }, + { + "epoch": 1.3241678726483357, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1444, + "step": 5490 + }, + { + "epoch": 1.3253738543174145, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.9579, + "step": 5495 + }, + { + "epoch": 1.326579835986493, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1027, + "step": 5500 + }, + { + "epoch": 1.3277858176555717, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.1139, + "step": 5505 + }, + { + "epoch": 1.3289917993246503, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0427, + "step": 5510 + }, + { + "epoch": 1.3301977809937289, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.9941, + "step": 5515 + }, + { + "epoch": 1.3314037626628075, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0884, + "step": 5520 + }, + { + "epoch": 1.332609744331886, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 3.9796, + "step": 5525 + }, + { + "epoch": 1.3338157260009647, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9593, + "step": 5530 + }, + { + "epoch": 1.3350217076700435, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9173, + "step": 5535 + }, + { + "epoch": 1.336227689339122, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.0316, + "step": 5540 + }, + { + "epoch": 1.3374336710082007, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0766, + "step": 5545 + }, + { + "epoch": 1.3386396526772792, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1248, + "step": 5550 + }, + { + "epoch": 1.339845634346358, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0593, + "step": 5555 + }, + { + "epoch": 1.3410516160154367, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0802, + "step": 5560 + }, + { + "epoch": 1.3422575976845152, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 4.0298, + "step": 5565 + }, + { + "epoch": 1.3434635793535938, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9824, + "step": 5570 + }, + { + "epoch": 1.3446695610226724, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.993, + "step": 5575 + }, + { + "epoch": 1.345875542691751, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.1292, + "step": 5580 + }, + { + "epoch": 1.3470815243608296, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.1268, + "step": 5585 + }, + { + "epoch": 1.3482875060299082, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0361, + "step": 5590 + }, + { + "epoch": 1.349493487698987, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 4.057, + "step": 5595 + }, + { + "epoch": 1.3506994693680656, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.89, + "step": 5600 + }, + { + "epoch": 1.3519054510371442, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.929, + "step": 5605 + }, + { + "epoch": 1.3531114327062228, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.1667, + "step": 5610 + }, + { + "epoch": 1.3543174143753016, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.9858, + "step": 5615 + }, + { + "epoch": 1.3555233960443802, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.1026, + "step": 5620 + }, + { + "epoch": 1.3567293777134588, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9049, + "step": 5625 + }, + { + "epoch": 1.3579353593825374, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9312, + "step": 5630 + }, + { + "epoch": 1.359141341051616, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.0745, + "step": 5635 + }, + { + "epoch": 1.3603473227206946, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0108, + "step": 5640 + }, + { + "epoch": 1.3615533043897732, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9133, + "step": 5645 + }, + { + "epoch": 1.3627592860588518, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9778, + "step": 5650 + }, + { + "epoch": 1.3639652677279306, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.0117, + "step": 5655 + }, + { + "epoch": 1.3651712493970092, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8992, + "step": 5660 + }, + { + "epoch": 1.3663772310660878, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.8815, + "step": 5665 + }, + { + "epoch": 1.3675832127351664, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0667, + "step": 5670 + }, + { + "epoch": 1.3687891944042452, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1417, + "step": 5675 + }, + { + "epoch": 1.3699951760733238, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1762, + "step": 5680 + }, + { + "epoch": 1.3712011577424024, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 3.9421, + "step": 5685 + }, + { + "epoch": 1.372407139411481, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9013, + "step": 5690 + }, + { + "epoch": 1.3736131210805596, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 3.9713, + "step": 5695 + }, + { + "epoch": 1.3748191027496381, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9916, + "step": 5700 + }, + { + "epoch": 1.3760250844187167, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9628, + "step": 5705 + }, + { + "epoch": 1.3772310660877953, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8958, + "step": 5710 + }, + { + "epoch": 1.3784370477568741, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.8121, + "step": 5715 + }, + { + "epoch": 1.3796430294259527, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9822, + "step": 5720 + }, + { + "epoch": 1.3808490110950313, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1571, + "step": 5725 + }, + { + "epoch": 1.38205499276411, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.8781, + "step": 5730 + }, + { + "epoch": 1.3832609744331887, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8821, + "step": 5735 + }, + { + "epoch": 1.3844669561022673, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9592, + "step": 5740 + }, + { + "epoch": 1.385672937771346, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.8345, + "step": 5745 + }, + { + "epoch": 1.3868789194404245, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0398, + "step": 5750 + }, + { + "epoch": 1.388084901109503, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1608, + "step": 5755 + }, + { + "epoch": 1.3892908827785817, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9649, + "step": 5760 + }, + { + "epoch": 1.3904968644476603, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9321, + "step": 5765 + }, + { + "epoch": 1.391702846116739, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.9942, + "step": 5770 + }, + { + "epoch": 1.3929088277858177, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9593, + "step": 5775 + }, + { + "epoch": 1.3941148094548963, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9289, + "step": 5780 + }, + { + "epoch": 1.395320791123975, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9472, + "step": 5785 + }, + { + "epoch": 1.3965267727930535, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0542, + "step": 5790 + }, + { + "epoch": 1.3977327544621323, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9841, + "step": 5795 + }, + { + "epoch": 1.398938736131211, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.8916, + "step": 5800 + }, + { + "epoch": 1.4001447178002895, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0977, + "step": 5805 + }, + { + "epoch": 1.401350699469368, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 4.0226, + "step": 5810 + }, + { + "epoch": 1.4025566811384467, + "grad_norm": 1.9765625, + "learning_rate": 3e-05, + "loss": 4.0059, + "step": 5815 + }, + { + "epoch": 1.4037626628075253, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1549, + "step": 5820 + }, + { + "epoch": 1.4049686444766039, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0359, + "step": 5825 + }, + { + "epoch": 1.4061746261456824, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1193, + "step": 5830 + }, + { + "epoch": 1.4073806078147613, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1605, + "step": 5835 + }, + { + "epoch": 1.4085865894838399, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8778, + "step": 5840 + }, + { + "epoch": 1.4097925711529185, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9953, + "step": 5845 + }, + { + "epoch": 1.410998552821997, + "grad_norm": 3.390625, + "learning_rate": 3e-05, + "loss": 4.0284, + "step": 5850 + }, + { + "epoch": 1.4122045344910759, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.8557, + "step": 5855 + }, + { + "epoch": 1.4134105161601545, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9324, + "step": 5860 + }, + { + "epoch": 1.414616497829233, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.9125, + "step": 5865 + }, + { + "epoch": 1.4158224794983116, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.1176, + "step": 5870 + }, + { + "epoch": 1.4170284611673902, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.7876, + "step": 5875 + }, + { + "epoch": 1.4182344428364688, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.8916, + "step": 5880 + }, + { + "epoch": 1.4194404245055474, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.0576, + "step": 5885 + }, + { + "epoch": 1.420646406174626, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.847, + "step": 5890 + }, + { + "epoch": 1.4218523878437048, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.9892, + "step": 5895 + }, + { + "epoch": 1.4230583695127834, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0812, + "step": 5900 + }, + { + "epoch": 1.424264351181862, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8944, + "step": 5905 + }, + { + "epoch": 1.4254703328509406, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9525, + "step": 5910 + }, + { + "epoch": 1.4266763145200194, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9512, + "step": 5915 + }, + { + "epoch": 1.427882296189098, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.8585, + "step": 5920 + }, + { + "epoch": 1.4290882778581766, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0108, + "step": 5925 + }, + { + "epoch": 1.4302942595272552, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8406, + "step": 5930 + }, + { + "epoch": 1.4315002411963338, + "grad_norm": 1.90625, + "learning_rate": 3e-05, + "loss": 3.9009, + "step": 5935 + }, + { + "epoch": 1.4327062228654124, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.0334, + "step": 5940 + }, + { + "epoch": 1.433912204534491, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0876, + "step": 5945 + }, + { + "epoch": 1.4351181862035696, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9512, + "step": 5950 + }, + { + "epoch": 1.4363241678726484, + "grad_norm": 1.9921875, + "learning_rate": 3e-05, + "loss": 3.8528, + "step": 5955 + }, + { + "epoch": 1.437530149541727, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9492, + "step": 5960 + }, + { + "epoch": 1.4387361312108056, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.9196, + "step": 5965 + }, + { + "epoch": 1.4399421128798842, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0004, + "step": 5970 + }, + { + "epoch": 1.441148094548963, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9415, + "step": 5975 + }, + { + "epoch": 1.4423540762180416, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9929, + "step": 5980 + }, + { + "epoch": 1.4435600578871202, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.039, + "step": 5985 + }, + { + "epoch": 1.4447660395561988, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9964, + "step": 5990 + }, + { + "epoch": 1.4459720212252773, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 3.8487, + "step": 5995 + }, + { + "epoch": 1.447178002894356, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0825, + "step": 6000 + }, + { + "epoch": 1.4483839845634345, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.0041, + "step": 6005 + }, + { + "epoch": 1.4495899662325131, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8463, + "step": 6010 + }, + { + "epoch": 1.450795947901592, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.0803, + "step": 6015 + }, + { + "epoch": 1.4520019295706705, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9207, + "step": 6020 + }, + { + "epoch": 1.4532079112397491, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9713, + "step": 6025 + }, + { + "epoch": 1.4544138929088277, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.945, + "step": 6030 + }, + { + "epoch": 1.4556198745779065, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8737, + "step": 6035 + }, + { + "epoch": 1.4568258562469851, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.964, + "step": 6040 + }, + { + "epoch": 1.4580318379160637, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8763, + "step": 6045 + }, + { + "epoch": 1.4592378195851423, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 3.9784, + "step": 6050 + }, + { + "epoch": 1.460443801254221, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8817, + "step": 6055 + }, + { + "epoch": 1.4616497829232995, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1275, + "step": 6060 + }, + { + "epoch": 1.462855764592378, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0263, + "step": 6065 + }, + { + "epoch": 1.464061746261457, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.949, + "step": 6070 + }, + { + "epoch": 1.4652677279305355, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9922, + "step": 6075 + }, + { + "epoch": 1.466473709599614, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0292, + "step": 6080 + }, + { + "epoch": 1.4676796912686927, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0366, + "step": 6085 + }, + { + "epoch": 1.4688856729377713, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.1351, + "step": 6090 + }, + { + "epoch": 1.47009165460685, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.9434, + "step": 6095 + }, + { + "epoch": 1.4712976362759287, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.0487, + "step": 6100 + }, + { + "epoch": 1.4725036179450073, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9642, + "step": 6105 + }, + { + "epoch": 1.4737095996140859, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.1484, + "step": 6110 + }, + { + "epoch": 1.4749155812831645, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.0544, + "step": 6115 + }, + { + "epoch": 1.476121562952243, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.0747, + "step": 6120 + }, + { + "epoch": 1.4773275446213217, + "grad_norm": 1.9375, + "learning_rate": 3e-05, + "loss": 3.9309, + "step": 6125 + }, + { + "epoch": 1.4785335262904005, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0152, + "step": 6130 + }, + { + "epoch": 1.479739507959479, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0743, + "step": 6135 + }, + { + "epoch": 1.4809454896285577, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9126, + "step": 6140 + }, + { + "epoch": 1.4821514712976362, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9772, + "step": 6145 + }, + { + "epoch": 1.4833574529667148, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8644, + "step": 6150 + }, + { + "epoch": 1.4845634346357937, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8759, + "step": 6155 + }, + { + "epoch": 1.4857694163048722, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8819, + "step": 6160 + }, + { + "epoch": 1.4869753979739508, + "grad_norm": 1.9140625, + "learning_rate": 3e-05, + "loss": 3.9676, + "step": 6165 + }, + { + "epoch": 1.4881813796430294, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9589, + "step": 6170 + }, + { + "epoch": 1.489387361312108, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 3.9092, + "step": 6175 + }, + { + "epoch": 1.4905933429811866, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9514, + "step": 6180 + }, + { + "epoch": 1.4917993246502652, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0319, + "step": 6185 + }, + { + "epoch": 1.493005306319344, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8165, + "step": 6190 + }, + { + "epoch": 1.4942112879884226, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0019, + "step": 6195 + }, + { + "epoch": 1.4954172696575012, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.8178, + "step": 6200 + }, + { + "epoch": 1.4966232513265798, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.878, + "step": 6205 + }, + { + "epoch": 1.4978292329956584, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9808, + "step": 6210 + }, + { + "epoch": 1.4990352146647372, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9069, + "step": 6215 + }, + { + "epoch": 1.5002411963338158, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9508, + "step": 6220 + }, + { + "epoch": 1.5014471780028944, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.7668, + "step": 6225 + }, + { + "epoch": 1.502653159671973, + "grad_norm": 1.7890625, + "learning_rate": 3e-05, + "loss": 3.9336, + "step": 6230 + }, + { + "epoch": 1.5038591413410516, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8866, + "step": 6235 + }, + { + "epoch": 1.5050651230101302, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0279, + "step": 6240 + }, + { + "epoch": 1.5062711046792088, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0068, + "step": 6245 + }, + { + "epoch": 1.5074770863482874, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9693, + "step": 6250 + }, + { + "epoch": 1.5086830680173662, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9907, + "step": 6255 + }, + { + "epoch": 1.5098890496864448, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0405, + "step": 6260 + }, + { + "epoch": 1.5110950313555234, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9773, + "step": 6265 + }, + { + "epoch": 1.5123010130246022, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 3.9862, + "step": 6270 + }, + { + "epoch": 1.5135069946936808, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.1404, + "step": 6275 + }, + { + "epoch": 1.5147129763627594, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8186, + "step": 6280 + }, + { + "epoch": 1.515918958031838, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.0414, + "step": 6285 + }, + { + "epoch": 1.5171249397009166, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9972, + "step": 6290 + }, + { + "epoch": 1.5183309213699951, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.8544, + "step": 6295 + }, + { + "epoch": 1.5195369030390737, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 3.9612, + "step": 6300 + }, + { + "epoch": 1.5207428847081523, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8324, + "step": 6305 + }, + { + "epoch": 1.521948866377231, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 3.7584, + "step": 6310 + }, + { + "epoch": 1.5231548480463097, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9257, + "step": 6315 + }, + { + "epoch": 1.5243608297153883, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9332, + "step": 6320 + }, + { + "epoch": 1.525566811384467, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8832, + "step": 6325 + }, + { + "epoch": 1.5267727930535457, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8925, + "step": 6330 + }, + { + "epoch": 1.5279787747226243, + "grad_norm": 1.9765625, + "learning_rate": 3e-05, + "loss": 3.8892, + "step": 6335 + }, + { + "epoch": 1.529184756391703, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9855, + "step": 6340 + }, + { + "epoch": 1.5303907380607815, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8522, + "step": 6345 + }, + { + "epoch": 1.5315967197298601, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9123, + "step": 6350 + }, + { + "epoch": 1.5328027013989387, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.7973, + "step": 6355 + }, + { + "epoch": 1.5340086830680173, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9352, + "step": 6360 + }, + { + "epoch": 1.535214664737096, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8727, + "step": 6365 + }, + { + "epoch": 1.5364206464061745, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.917, + "step": 6370 + }, + { + "epoch": 1.5376266280752533, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.8183, + "step": 6375 + }, + { + "epoch": 1.538832609744332, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.8579, + "step": 6380 + }, + { + "epoch": 1.5400385914134105, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.8495, + "step": 6385 + }, + { + "epoch": 1.5412445730824893, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9046, + "step": 6390 + }, + { + "epoch": 1.542450554751568, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.969, + "step": 6395 + }, + { + "epoch": 1.5436565364206465, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.8441, + "step": 6400 + }, + { + "epoch": 1.544862518089725, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9219, + "step": 6405 + }, + { + "epoch": 1.5460684997588037, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.904, + "step": 6410 + }, + { + "epoch": 1.5472744814278823, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8468, + "step": 6415 + }, + { + "epoch": 1.5484804630969609, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0883, + "step": 6420 + }, + { + "epoch": 1.5496864447660395, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.759, + "step": 6425 + }, + { + "epoch": 1.550892426435118, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9091, + "step": 6430 + }, + { + "epoch": 1.5520984081041969, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.0657, + "step": 6435 + }, + { + "epoch": 1.5533043897732755, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.8907, + "step": 6440 + }, + { + "epoch": 1.554510371442354, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0508, + "step": 6445 + }, + { + "epoch": 1.5557163531114329, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.976, + "step": 6450 + }, + { + "epoch": 1.5569223347805115, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.831, + "step": 6455 + }, + { + "epoch": 1.55812831644959, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.807, + "step": 6460 + }, + { + "epoch": 1.5593342981186686, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.012, + "step": 6465 + }, + { + "epoch": 1.5605402797877472, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9841, + "step": 6470 + }, + { + "epoch": 1.5617462614568258, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.8583, + "step": 6475 + }, + { + "epoch": 1.5629522431259044, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.802, + "step": 6480 + }, + { + "epoch": 1.564158224794983, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8552, + "step": 6485 + }, + { + "epoch": 1.5653642064640616, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.7725, + "step": 6490 + }, + { + "epoch": 1.5665701881331404, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.944, + "step": 6495 + }, + { + "epoch": 1.567776169802219, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 3.9519, + "step": 6500 + }, + { + "epoch": 1.5689821514712976, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.1496, + "step": 6505 + }, + { + "epoch": 1.5701881331403764, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.6896, + "step": 6510 + }, + { + "epoch": 1.571394114809455, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.054, + "step": 6515 + }, + { + "epoch": 1.5726000964785336, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0407, + "step": 6520 + }, + { + "epoch": 1.5738060781476122, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.7924, + "step": 6525 + }, + { + "epoch": 1.5750120598166908, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8803, + "step": 6530 + }, + { + "epoch": 1.5762180414857694, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9293, + "step": 6535 + }, + { + "epoch": 1.577424023154848, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9784, + "step": 6540 + }, + { + "epoch": 1.5786300048239266, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.957, + "step": 6545 + }, + { + "epoch": 1.5798359864930052, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9482, + "step": 6550 + }, + { + "epoch": 1.581041968162084, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8502, + "step": 6555 + }, + { + "epoch": 1.5822479498311626, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.9748, + "step": 6560 + }, + { + "epoch": 1.5834539315002412, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9425, + "step": 6565 + }, + { + "epoch": 1.58465991316932, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8511, + "step": 6570 + }, + { + "epoch": 1.5858658948383986, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9477, + "step": 6575 + }, + { + "epoch": 1.5870718765074772, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8881, + "step": 6580 + }, + { + "epoch": 1.5882778581765558, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9599, + "step": 6585 + }, + { + "epoch": 1.5894838398456343, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9477, + "step": 6590 + }, + { + "epoch": 1.590689821514713, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9076, + "step": 6595 + }, + { + "epoch": 1.5918958031837915, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9423, + "step": 6600 + }, + { + "epoch": 1.5931017848528701, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0652, + "step": 6605 + }, + { + "epoch": 1.5943077665219487, + "grad_norm": 3.3125, + "learning_rate": 3e-05, + "loss": 3.7895, + "step": 6610 + }, + { + "epoch": 1.5955137481910275, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.8779, + "step": 6615 + }, + { + "epoch": 1.5967197298601061, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8829, + "step": 6620 + }, + { + "epoch": 1.5979257115291847, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.7635, + "step": 6625 + }, + { + "epoch": 1.5991316931982635, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9663, + "step": 6630 + }, + { + "epoch": 1.6003376748673421, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8111, + "step": 6635 + }, + { + "epoch": 1.6015436565364207, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9601, + "step": 6640 + }, + { + "epoch": 1.6027496382054993, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9679, + "step": 6645 + }, + { + "epoch": 1.603955619874578, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9529, + "step": 6650 + }, + { + "epoch": 1.6051616015436565, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0332, + "step": 6655 + }, + { + "epoch": 1.606367583212735, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.8873, + "step": 6660 + }, + { + "epoch": 1.6075735648818137, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8165, + "step": 6665 + }, + { + "epoch": 1.6087795465508923, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9351, + "step": 6670 + }, + { + "epoch": 1.609985528219971, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0391, + "step": 6675 + }, + { + "epoch": 1.6111915098890497, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9085, + "step": 6680 + }, + { + "epoch": 1.6123974915581283, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.0265, + "step": 6685 + }, + { + "epoch": 1.613603473227207, + "grad_norm": 3.21875, + "learning_rate": 3e-05, + "loss": 4.0292, + "step": 6690 + }, + { + "epoch": 1.6148094548962857, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0291, + "step": 6695 + }, + { + "epoch": 1.6160154365653643, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9243, + "step": 6700 + }, + { + "epoch": 1.6172214182344429, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.7057, + "step": 6705 + }, + { + "epoch": 1.6184273999035215, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.8733, + "step": 6710 + }, + { + "epoch": 1.6196333815726, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 4.0025, + "step": 6715 + }, + { + "epoch": 1.6208393632416787, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 3.8148, + "step": 6720 + }, + { + "epoch": 1.6220453449107572, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9253, + "step": 6725 + }, + { + "epoch": 1.6232513265798358, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 3.9053, + "step": 6730 + }, + { + "epoch": 1.6244573082489147, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.7442, + "step": 6735 + }, + { + "epoch": 1.6256632899179932, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 3.9988, + "step": 6740 + }, + { + "epoch": 1.6268692715870718, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8321, + "step": 6745 + }, + { + "epoch": 1.6280752532561507, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8806, + "step": 6750 + }, + { + "epoch": 1.6292812349252292, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.0618, + "step": 6755 + }, + { + "epoch": 1.6304872165943078, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0236, + "step": 6760 + }, + { + "epoch": 1.6316931982633864, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.894, + "step": 6765 + }, + { + "epoch": 1.632899179932465, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9581, + "step": 6770 + }, + { + "epoch": 1.6341051616015436, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.7907, + "step": 6775 + }, + { + "epoch": 1.6353111432706222, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.861, + "step": 6780 + }, + { + "epoch": 1.6365171249397008, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.8668, + "step": 6785 + }, + { + "epoch": 1.6377231066087794, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8156, + "step": 6790 + }, + { + "epoch": 1.6389290882778582, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8422, + "step": 6795 + }, + { + "epoch": 1.6401350699469368, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.0078, + "step": 6800 + }, + { + "epoch": 1.6413410516160154, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9526, + "step": 6805 + }, + { + "epoch": 1.6425470332850942, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.0902, + "step": 6810 + }, + { + "epoch": 1.6437530149541728, + "grad_norm": 1.9765625, + "learning_rate": 3e-05, + "loss": 3.9587, + "step": 6815 + }, + { + "epoch": 1.6449589966232514, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.916, + "step": 6820 + }, + { + "epoch": 1.64616497829233, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.7942, + "step": 6825 + }, + { + "epoch": 1.6473709599614086, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.7968, + "step": 6830 + }, + { + "epoch": 1.6485769416304872, + "grad_norm": 1.9140625, + "learning_rate": 3e-05, + "loss": 3.8271, + "step": 6835 + }, + { + "epoch": 1.6497829232995658, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.7452, + "step": 6840 + }, + { + "epoch": 1.6509889049686444, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.788, + "step": 6845 + }, + { + "epoch": 1.652194886637723, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 3.9527, + "step": 6850 + }, + { + "epoch": 1.6534008683068018, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9491, + "step": 6855 + }, + { + "epoch": 1.6546068499758804, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.7877, + "step": 6860 + }, + { + "epoch": 1.655812831644959, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0136, + "step": 6865 + }, + { + "epoch": 1.6570188133140378, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.8773, + "step": 6870 + }, + { + "epoch": 1.6582247949831164, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9202, + "step": 6875 + }, + { + "epoch": 1.659430776652195, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.061, + "step": 6880 + }, + { + "epoch": 1.6606367583212736, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.0245, + "step": 6885 + }, + { + "epoch": 1.6618427399903521, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8493, + "step": 6890 + }, + { + "epoch": 1.6630487216594307, + "grad_norm": 1.8828125, + "learning_rate": 3e-05, + "loss": 4.0109, + "step": 6895 + }, + { + "epoch": 1.6642547033285093, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9281, + "step": 6900 + }, + { + "epoch": 1.665460684997588, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8341, + "step": 6905 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9277, + "step": 6910 + }, + { + "epoch": 1.6678726483357453, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7938, + "step": 6915 + }, + { + "epoch": 1.669078630004824, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0384, + "step": 6920 + }, + { + "epoch": 1.6702846116739025, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0376, + "step": 6925 + }, + { + "epoch": 1.6714905933429813, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9776, + "step": 6930 + }, + { + "epoch": 1.67269657501206, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.834, + "step": 6935 + }, + { + "epoch": 1.6739025566811385, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8761, + "step": 6940 + }, + { + "epoch": 1.6751085383502171, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8635, + "step": 6945 + }, + { + "epoch": 1.6763145200192957, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.8935, + "step": 6950 + }, + { + "epoch": 1.6775205016883743, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.7317, + "step": 6955 + }, + { + "epoch": 1.678726483357453, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8441, + "step": 6960 + }, + { + "epoch": 1.6799324650265315, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 3.9865, + "step": 6965 + }, + { + "epoch": 1.68113844669561, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.818, + "step": 6970 + }, + { + "epoch": 1.682344428364689, + "grad_norm": 3.8125, + "learning_rate": 3e-05, + "loss": 3.9333, + "step": 6975 + }, + { + "epoch": 1.6835504100337675, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.7855, + "step": 6980 + }, + { + "epoch": 1.684756391702846, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.8617, + "step": 6985 + }, + { + "epoch": 1.685962373371925, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9757, + "step": 6990 + }, + { + "epoch": 1.6871683550410035, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.7579, + "step": 6995 + }, + { + "epoch": 1.688374336710082, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8122, + "step": 7000 + }, + { + "epoch": 1.6895803183791607, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8582, + "step": 7005 + }, + { + "epoch": 1.6907863000482393, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.8424, + "step": 7010 + }, + { + "epoch": 1.6919922817173179, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.7794, + "step": 7015 + }, + { + "epoch": 1.6931982633863965, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.883, + "step": 7020 + }, + { + "epoch": 1.694404245055475, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.7467, + "step": 7025 + }, + { + "epoch": 1.6956102267245536, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9066, + "step": 7030 + }, + { + "epoch": 1.6968162083936325, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.7149, + "step": 7035 + }, + { + "epoch": 1.698022190062711, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.9981, + "step": 7040 + }, + { + "epoch": 1.6992281717317896, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.885, + "step": 7045 + }, + { + "epoch": 1.7004341534008685, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8583, + "step": 7050 + }, + { + "epoch": 1.701640135069947, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.798, + "step": 7055 + }, + { + "epoch": 1.7028461167390256, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8205, + "step": 7060 + }, + { + "epoch": 1.7040520984081042, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.7779, + "step": 7065 + }, + { + "epoch": 1.7052580800771828, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9621, + "step": 7070 + }, + { + "epoch": 1.7064640617462614, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.7453, + "step": 7075 + }, + { + "epoch": 1.70767004341534, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.9981, + "step": 7080 + }, + { + "epoch": 1.7088760250844186, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8412, + "step": 7085 + }, + { + "epoch": 1.7100820067534972, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.7884, + "step": 7090 + }, + { + "epoch": 1.711287988422576, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.787, + "step": 7095 + }, + { + "epoch": 1.7124939700916546, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.7944, + "step": 7100 + }, + { + "epoch": 1.7136999517607332, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.8066, + "step": 7105 + }, + { + "epoch": 1.714905933429812, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9211, + "step": 7110 + }, + { + "epoch": 1.7161119150988906, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.8854, + "step": 7115 + }, + { + "epoch": 1.7173178967679692, + "grad_norm": 1.921875, + "learning_rate": 3e-05, + "loss": 3.81, + "step": 7120 + }, + { + "epoch": 1.7185238784370478, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9024, + "step": 7125 + }, + { + "epoch": 1.7197298601061264, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.0509, + "step": 7130 + }, + { + "epoch": 1.720935841775205, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.7665, + "step": 7135 + }, + { + "epoch": 1.7221418234442836, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.0687, + "step": 7140 + }, + { + "epoch": 1.7233478051133622, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.7784, + "step": 7145 + }, + { + "epoch": 1.7245537867824408, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.8031, + "step": 7150 + }, + { + "epoch": 1.7257597684515196, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8097, + "step": 7155 + }, + { + "epoch": 1.7269657501205982, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8283, + "step": 7160 + }, + { + "epoch": 1.7281717317896768, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.7616, + "step": 7165 + }, + { + "epoch": 1.7293777134587556, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8086, + "step": 7170 + }, + { + "epoch": 1.7305836951278342, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8023, + "step": 7175 + }, + { + "epoch": 1.7317896767969128, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.939, + "step": 7180 + }, + { + "epoch": 1.7329956584659914, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.86, + "step": 7185 + }, + { + "epoch": 1.73420164013507, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.7831, + "step": 7190 + }, + { + "epoch": 1.7354076218041485, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8882, + "step": 7195 + }, + { + "epoch": 1.7366136034732271, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8688, + "step": 7200 + }, + { + "epoch": 1.7378195851423057, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8119, + "step": 7205 + }, + { + "epoch": 1.7390255668113843, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.8169, + "step": 7210 + }, + { + "epoch": 1.7402315484804631, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8267, + "step": 7215 + }, + { + "epoch": 1.7414375301495417, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8566, + "step": 7220 + }, + { + "epoch": 1.7426435118186203, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 3.947, + "step": 7225 + }, + { + "epoch": 1.7438494934876991, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9169, + "step": 7230 + }, + { + "epoch": 1.7450554751567777, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.944, + "step": 7235 + }, + { + "epoch": 1.7462614568258563, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.7584, + "step": 7240 + }, + { + "epoch": 1.747467438494935, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.7844, + "step": 7245 + }, + { + "epoch": 1.7486734201640135, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.5996, + "step": 7250 + }, + { + "epoch": 1.749879401833092, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.7866, + "step": 7255 + }, + { + "epoch": 1.7510853835021707, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1632, + "step": 7260 + }, + { + "epoch": 1.7522913651712493, + "grad_norm": 3.265625, + "learning_rate": 3e-05, + "loss": 3.9928, + "step": 7265 + }, + { + "epoch": 1.7534973468403279, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.9223, + "step": 7270 + }, + { + "epoch": 1.7547033285094067, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8657, + "step": 7275 + }, + { + "epoch": 1.7559093101784853, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.9718, + "step": 7280 + }, + { + "epoch": 1.7571152918475639, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.7933, + "step": 7285 + }, + { + "epoch": 1.7583212735166427, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0573, + "step": 7290 + }, + { + "epoch": 1.7595272551857213, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.8796, + "step": 7295 + }, + { + "epoch": 1.7607332368547999, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.7767, + "step": 7300 + }, + { + "epoch": 1.7619392185238785, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8606, + "step": 7305 + }, + { + "epoch": 1.763145200192957, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 3.8195, + "step": 7310 + }, + { + "epoch": 1.7643511818620357, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.7947, + "step": 7315 + }, + { + "epoch": 1.7655571635311142, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.7401, + "step": 7320 + }, + { + "epoch": 1.7667631452001928, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.7819, + "step": 7325 + }, + { + "epoch": 1.7679691268692714, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0019, + "step": 7330 + }, + { + "epoch": 1.7691751085383502, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8072, + "step": 7335 + }, + { + "epoch": 1.7703810902074288, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8622, + "step": 7340 + }, + { + "epoch": 1.7715870718765074, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 3.7629, + "step": 7345 + }, + { + "epoch": 1.7727930535455863, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 3.7102, + "step": 7350 + }, + { + "epoch": 1.7739990352146648, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.541, + "step": 7355 + }, + { + "epoch": 1.7752050168837434, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0336, + "step": 7360 + }, + { + "epoch": 1.776410998552822, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0071, + "step": 7365 + }, + { + "epoch": 1.7776169802219006, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9898, + "step": 7370 + }, + { + "epoch": 1.7788229618909792, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.9848, + "step": 7375 + }, + { + "epoch": 1.7800289435600578, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.6977, + "step": 7380 + }, + { + "epoch": 1.7812349252291364, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9025, + "step": 7385 + }, + { + "epoch": 1.782440906898215, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 3.7842, + "step": 7390 + }, + { + "epoch": 1.7836468885672938, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9929, + "step": 7395 + }, + { + "epoch": 1.7848528702363724, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8596, + "step": 7400 + }, + { + "epoch": 1.786058851905451, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8727, + "step": 7405 + }, + { + "epoch": 1.7872648335745298, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.9179, + "step": 7410 + }, + { + "epoch": 1.7884708152436084, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1607, + "step": 7415 + }, + { + "epoch": 1.789676796912687, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.7633, + "step": 7420 + }, + { + "epoch": 1.7908827785817656, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.7805, + "step": 7425 + }, + { + "epoch": 1.7920887602508442, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9107, + "step": 7430 + }, + { + "epoch": 1.7932947419199228, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9827, + "step": 7435 + }, + { + "epoch": 1.7945007235890014, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8112, + "step": 7440 + }, + { + "epoch": 1.79570670525808, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9435, + "step": 7445 + }, + { + "epoch": 1.7969126869271586, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.821, + "step": 7450 + }, + { + "epoch": 1.7981186685962374, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.7391, + "step": 7455 + }, + { + "epoch": 1.799324650265316, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0559, + "step": 7460 + }, + { + "epoch": 1.8005306319343946, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8256, + "step": 7465 + }, + { + "epoch": 1.8017366136034734, + "grad_norm": 1.84375, + "learning_rate": 3e-05, + "loss": 3.7895, + "step": 7470 + }, + { + "epoch": 1.802942595272552, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.8965, + "step": 7475 + }, + { + "epoch": 1.8041485769416306, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9159, + "step": 7480 + }, + { + "epoch": 1.8053545586107091, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8632, + "step": 7485 + }, + { + "epoch": 1.8065605402797877, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9129, + "step": 7490 + }, + { + "epoch": 1.8077665219488663, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9412, + "step": 7495 + }, + { + "epoch": 1.808972503617945, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8699, + "step": 7500 + }, + { + "epoch": 1.8101784852870235, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.779, + "step": 7505 + }, + { + "epoch": 1.8113844669561021, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8178, + "step": 7510 + }, + { + "epoch": 1.812590448625181, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.8654, + "step": 7515 + }, + { + "epoch": 1.8137964302942595, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.7555, + "step": 7520 + }, + { + "epoch": 1.8150024119633381, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8698, + "step": 7525 + }, + { + "epoch": 1.816208393632417, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9038, + "step": 7530 + }, + { + "epoch": 1.8174143753014955, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8027, + "step": 7535 + }, + { + "epoch": 1.8186203569705741, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9413, + "step": 7540 + }, + { + "epoch": 1.8198263386396527, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.767, + "step": 7545 + }, + { + "epoch": 1.8210323203087313, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.7247, + "step": 7550 + }, + { + "epoch": 1.82223830197781, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.7961, + "step": 7555 + }, + { + "epoch": 1.8234442836468885, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.7912, + "step": 7560 + }, + { + "epoch": 1.824650265315967, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.7878, + "step": 7565 + }, + { + "epoch": 1.8258562469850457, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.7986, + "step": 7570 + }, + { + "epoch": 1.8270622286541245, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9248, + "step": 7575 + }, + { + "epoch": 1.828268210323203, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8577, + "step": 7580 + }, + { + "epoch": 1.8294741919922817, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9105, + "step": 7585 + }, + { + "epoch": 1.8306801736613605, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.7991, + "step": 7590 + }, + { + "epoch": 1.831886155330439, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8738, + "step": 7595 + }, + { + "epoch": 1.8330921369995177, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.7792, + "step": 7600 + }, + { + "epoch": 1.8342981186685963, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.9745, + "step": 7605 + }, + { + "epoch": 1.8355041003376749, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.5487, + "step": 7610 + }, + { + "epoch": 1.8367100820067535, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8766, + "step": 7615 + }, + { + "epoch": 1.837916063675832, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9019, + "step": 7620 + }, + { + "epoch": 1.8391220453449106, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8466, + "step": 7625 + }, + { + "epoch": 1.8403280270139892, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9464, + "step": 7630 + }, + { + "epoch": 1.841534008683068, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.7597, + "step": 7635 + }, + { + "epoch": 1.8427399903521466, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.7028, + "step": 7640 + }, + { + "epoch": 1.8439459720212252, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8406, + "step": 7645 + }, + { + "epoch": 1.845151953690304, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.6806, + "step": 7650 + }, + { + "epoch": 1.8463579353593826, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.739, + "step": 7655 + }, + { + "epoch": 1.8475639170284612, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9497, + "step": 7660 + }, + { + "epoch": 1.8487698986975398, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 3.8381, + "step": 7665 + }, + { + "epoch": 1.8499758803666184, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9928, + "step": 7670 + }, + { + "epoch": 1.851181862035697, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9728, + "step": 7675 + }, + { + "epoch": 1.8523878437047756, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8359, + "step": 7680 + }, + { + "epoch": 1.8535938253738542, + "grad_norm": 4.34375, + "learning_rate": 3e-05, + "loss": 3.7726, + "step": 7685 + }, + { + "epoch": 1.8547998070429328, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.7752, + "step": 7690 + }, + { + "epoch": 1.8560057887120116, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.6874, + "step": 7695 + }, + { + "epoch": 1.8572117703810902, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.8096, + "step": 7700 + }, + { + "epoch": 1.8584177520501688, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8184, + "step": 7705 + }, + { + "epoch": 1.8596237337192476, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.7646, + "step": 7710 + }, + { + "epoch": 1.8608297153883262, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.7085, + "step": 7715 + }, + { + "epoch": 1.8620356970574048, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.7508, + "step": 7720 + }, + { + "epoch": 1.8632416787264834, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 4.0698, + "step": 7725 + }, + { + "epoch": 1.864447660395562, + "grad_norm": 1.9453125, + "learning_rate": 3e-05, + "loss": 3.7501, + "step": 7730 + }, + { + "epoch": 1.8656536420646406, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9893, + "step": 7735 + }, + { + "epoch": 1.8668596237337192, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.7567, + "step": 7740 + }, + { + "epoch": 1.8680656054027978, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.7308, + "step": 7745 + }, + { + "epoch": 1.8692715870718764, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.7238, + "step": 7750 + }, + { + "epoch": 1.8704775687409552, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8657, + "step": 7755 + }, + { + "epoch": 1.8716835504100338, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.7781, + "step": 7760 + }, + { + "epoch": 1.8728895320791124, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.7808, + "step": 7765 + }, + { + "epoch": 1.8740955137481912, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9164, + "step": 7770 + }, + { + "epoch": 1.8753014954172698, + "grad_norm": 1.9921875, + "learning_rate": 3e-05, + "loss": 3.7068, + "step": 7775 + }, + { + "epoch": 1.8765074770863484, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8565, + "step": 7780 + }, + { + "epoch": 1.877713458755427, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.7286, + "step": 7785 + }, + { + "epoch": 1.8789194404245055, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.6623, + "step": 7790 + }, + { + "epoch": 1.8801254220935841, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.7901, + "step": 7795 + }, + { + "epoch": 1.8813314037626627, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 3.883, + "step": 7800 + }, + { + "epoch": 1.8825373854317413, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.6748, + "step": 7805 + }, + { + "epoch": 1.88374336710082, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.7408, + "step": 7810 + }, + { + "epoch": 1.8849493487698987, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9042, + "step": 7815 + }, + { + "epoch": 1.8861553304389773, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8211, + "step": 7820 + }, + { + "epoch": 1.887361312108056, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0569, + "step": 7825 + }, + { + "epoch": 1.8885672937771347, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 3.5554, + "step": 7830 + }, + { + "epoch": 1.8897732754462133, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9346, + "step": 7835 + }, + { + "epoch": 1.890979257115292, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.8821, + "step": 7840 + }, + { + "epoch": 1.8921852387843705, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.683, + "step": 7845 + }, + { + "epoch": 1.893391220453449, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9123, + "step": 7850 + }, + { + "epoch": 1.8945972021225277, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.821, + "step": 7855 + }, + { + "epoch": 1.8958031837916063, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.69, + "step": 7860 + }, + { + "epoch": 1.8970091654606849, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9012, + "step": 7865 + }, + { + "epoch": 1.8982151471297635, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.6692, + "step": 7870 + }, + { + "epoch": 1.8994211287988423, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8644, + "step": 7875 + }, + { + "epoch": 1.9006271104679209, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.819, + "step": 7880 + }, + { + "epoch": 1.9018330921369995, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7935, + "step": 7885 + }, + { + "epoch": 1.9030390738060783, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8848, + "step": 7890 + }, + { + "epoch": 1.9042450554751569, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8953, + "step": 7895 + }, + { + "epoch": 1.9054510371442355, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8023, + "step": 7900 + }, + { + "epoch": 1.906657018813314, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9101, + "step": 7905 + }, + { + "epoch": 1.9078630004823927, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.7766, + "step": 7910 + }, + { + "epoch": 1.9090689821514712, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.7748, + "step": 7915 + }, + { + "epoch": 1.9102749638205498, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8324, + "step": 7920 + }, + { + "epoch": 1.9114809454896284, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7237, + "step": 7925 + }, + { + "epoch": 1.912686927158707, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.7041, + "step": 7930 + }, + { + "epoch": 1.9138929088277858, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.7389, + "step": 7935 + }, + { + "epoch": 1.9150988904968644, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.7476, + "step": 7940 + }, + { + "epoch": 1.916304872165943, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.7843, + "step": 7945 + }, + { + "epoch": 1.9175108538350218, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.8001, + "step": 7950 + }, + { + "epoch": 1.9187168355041004, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8482, + "step": 7955 + }, + { + "epoch": 1.919922817173179, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.7813, + "step": 7960 + }, + { + "epoch": 1.9211287988422576, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8595, + "step": 7965 + }, + { + "epoch": 1.9223347805113362, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7564, + "step": 7970 + }, + { + "epoch": 1.9235407621804148, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.672, + "step": 7975 + }, + { + "epoch": 1.9247467438494934, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.761, + "step": 7980 + }, + { + "epoch": 1.925952725518572, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8834, + "step": 7985 + }, + { + "epoch": 1.9271587071876506, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.7795, + "step": 7990 + }, + { + "epoch": 1.9283646888567294, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.6395, + "step": 7995 + }, + { + "epoch": 1.929570670525808, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8351, + "step": 8000 + }, + { + "epoch": 1.9307766521948866, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9706, + "step": 8005 + }, + { + "epoch": 1.9319826338639654, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 3.7406, + "step": 8010 + }, + { + "epoch": 1.933188615533044, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.7666, + "step": 8015 + }, + { + "epoch": 1.9343945972021226, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8246, + "step": 8020 + }, + { + "epoch": 1.9356005788712012, + "grad_norm": 6.125, + "learning_rate": 3e-05, + "loss": 3.761, + "step": 8025 + }, + { + "epoch": 1.9368065605402798, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8339, + "step": 8030 + }, + { + "epoch": 1.9380125422093584, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.7689, + "step": 8035 + }, + { + "epoch": 1.939218523878437, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.7775, + "step": 8040 + }, + { + "epoch": 1.9404245055475156, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8596, + "step": 8045 + }, + { + "epoch": 1.9416304872165941, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.7681, + "step": 8050 + }, + { + "epoch": 1.942836468885673, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.7345, + "step": 8055 + }, + { + "epoch": 1.9440424505547516, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8306, + "step": 8060 + }, + { + "epoch": 1.9452484322238301, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.8441, + "step": 8065 + }, + { + "epoch": 1.946454413892909, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.8137, + "step": 8070 + }, + { + "epoch": 1.9476603955619876, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.7487, + "step": 8075 + }, + { + "epoch": 1.9488663772310661, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.7596, + "step": 8080 + }, + { + "epoch": 1.9500723589001447, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7839, + "step": 8085 + }, + { + "epoch": 1.9512783405692233, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9499, + "step": 8090 + }, + { + "epoch": 1.952484322238302, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.653, + "step": 8095 + }, + { + "epoch": 1.9536903039073805, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.6592, + "step": 8100 + }, + { + "epoch": 1.9548962855764591, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8865, + "step": 8105 + }, + { + "epoch": 1.9561022672455377, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.6638, + "step": 8110 + }, + { + "epoch": 1.9573082489146165, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.6911, + "step": 8115 + }, + { + "epoch": 1.9585142305836951, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.7635, + "step": 8120 + }, + { + "epoch": 1.9597202122527737, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.952, + "step": 8125 + }, + { + "epoch": 1.9609261939218525, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.6066, + "step": 8130 + }, + { + "epoch": 1.9621321755909311, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.633, + "step": 8135 + }, + { + "epoch": 1.9633381572600097, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.7623, + "step": 8140 + }, + { + "epoch": 1.9645441389290883, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 3.8725, + "step": 8145 + }, + { + "epoch": 1.965750120598167, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9518, + "step": 8150 + }, + { + "epoch": 1.9669561022672455, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.6851, + "step": 8155 + }, + { + "epoch": 1.968162083936324, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.6479, + "step": 8160 + }, + { + "epoch": 1.9693680656054027, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.7243, + "step": 8165 + }, + { + "epoch": 1.9705740472744813, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.6661, + "step": 8170 + }, + { + "epoch": 1.97178002894356, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8267, + "step": 8175 + }, + { + "epoch": 1.9729860106126387, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.8583, + "step": 8180 + }, + { + "epoch": 1.9741919922817173, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 3.8219, + "step": 8185 + }, + { + "epoch": 1.975397973950796, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.6852, + "step": 8190 + }, + { + "epoch": 1.9766039556198747, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.666, + "step": 8195 + }, + { + "epoch": 1.9778099372889533, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.7167, + "step": 8200 + }, + { + "epoch": 1.9790159189580319, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.906, + "step": 8205 + }, + { + "epoch": 1.9802219006271105, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.8129, + "step": 8210 + }, + { + "epoch": 1.981427882296189, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9011, + "step": 8215 + }, + { + "epoch": 1.9826338639652676, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.7331, + "step": 8220 + }, + { + "epoch": 1.9838398456343462, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.7144, + "step": 8225 + }, + { + "epoch": 1.9850458273034248, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.6859, + "step": 8230 + }, + { + "epoch": 1.9862518089725036, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.62, + "step": 8235 + }, + { + "epoch": 1.9874577906415822, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.6762, + "step": 8240 + }, + { + "epoch": 1.9886637723106608, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0014, + "step": 8245 + }, + { + "epoch": 1.9898697539797396, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.7749, + "step": 8250 + }, + { + "epoch": 1.9910757356488182, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.6529, + "step": 8255 + }, + { + "epoch": 1.9922817173178968, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.7956, + "step": 8260 + }, + { + "epoch": 1.9934876989869754, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8536, + "step": 8265 + }, + { + "epoch": 1.994693680656054, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.6658, + "step": 8270 + }, + { + "epoch": 1.9958996623251326, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.7088, + "step": 8275 + }, + { + "epoch": 1.9971056439942112, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.7479, + "step": 8280 + }, + { + "epoch": 1.9983116256632898, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.7478, + "step": 8285 + }, + { + "epoch": 1.9995176073323684, + "grad_norm": 3.65625, + "learning_rate": 3e-05, + "loss": 3.9395, + "step": 8290 + }, + { + "epoch": 2.000723589001447, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.648, + "step": 8295 + }, + { + "epoch": 2.001929570670526, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.6676, + "step": 8300 + }, + { + "epoch": 2.0031355523396046, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8753, + "step": 8305 + }, + { + "epoch": 2.004341534008683, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.8172, + "step": 8310 + }, + { + "epoch": 2.005547515677762, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.836, + "step": 8315 + }, + { + "epoch": 2.0067534973468404, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.7457, + "step": 8320 + }, + { + "epoch": 2.007959479015919, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.6816, + "step": 8325 + }, + { + "epoch": 2.0091654606849976, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.7902, + "step": 8330 + }, + { + "epoch": 2.010371442354076, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9442, + "step": 8335 + }, + { + "epoch": 2.0115774240231548, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.674, + "step": 8340 + }, + { + "epoch": 2.0127834056922334, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.7012, + "step": 8345 + }, + { + "epoch": 2.013989387361312, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.6978, + "step": 8350 + }, + { + "epoch": 2.0151953690303905, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.7883, + "step": 8355 + }, + { + "epoch": 2.0164013506994696, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8538, + "step": 8360 + }, + { + "epoch": 2.017607332368548, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.7824, + "step": 8365 + }, + { + "epoch": 2.0188133140376268, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.713, + "step": 8370 + }, + { + "epoch": 2.0200192957067054, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.6853, + "step": 8375 + }, + { + "epoch": 2.021225277375784, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.752, + "step": 8380 + }, + { + "epoch": 2.0224312590448625, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.7355, + "step": 8385 + }, + { + "epoch": 2.023637240713941, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.6408, + "step": 8390 + }, + { + "epoch": 2.0248432223830197, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8283, + "step": 8395 + }, + { + "epoch": 2.0260492040520983, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.7683, + "step": 8400 + }, + { + "epoch": 2.027255185721177, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.7757, + "step": 8405 + }, + { + "epoch": 2.0284611673902555, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.7638, + "step": 8410 + }, + { + "epoch": 2.029667149059334, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.7866, + "step": 8415 + }, + { + "epoch": 2.030873130728413, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 3.6612, + "step": 8420 + }, + { + "epoch": 2.0320791123974917, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.7902, + "step": 8425 + }, + { + "epoch": 2.0332850940665703, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.7404, + "step": 8430 + }, + { + "epoch": 2.034491075735649, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.7562, + "step": 8435 + }, + { + "epoch": 2.0356970574047275, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.6321, + "step": 8440 + }, + { + "epoch": 2.036903039073806, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8358, + "step": 8445 + }, + { + "epoch": 2.0381090207428847, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.7849, + "step": 8450 + }, + { + "epoch": 2.0393150024119633, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.7036, + "step": 8455 + }, + { + "epoch": 2.040520984081042, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8943, + "step": 8460 + }, + { + "epoch": 2.0417269657501205, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.755, + "step": 8465 + }, + { + "epoch": 2.042932947419199, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8691, + "step": 8470 + }, + { + "epoch": 2.0441389290882777, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.8007, + "step": 8475 + }, + { + "epoch": 2.0453449107573567, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.6498, + "step": 8480 + }, + { + "epoch": 2.0465508924264353, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.8584, + "step": 8485 + }, + { + "epoch": 2.047756874095514, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8621, + "step": 8490 + }, + { + "epoch": 2.0489628557645925, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.6308, + "step": 8495 + }, + { + "epoch": 2.050168837433671, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.6876, + "step": 8500 + }, + { + "epoch": 2.0513748191027497, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.8568, + "step": 8505 + }, + { + "epoch": 2.0525808007718283, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.7444, + "step": 8510 + }, + { + "epoch": 2.053786782440907, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9624, + "step": 8515 + }, + { + "epoch": 2.0549927641099854, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.6974, + "step": 8520 + }, + { + "epoch": 2.056198745779064, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.7123, + "step": 8525 + }, + { + "epoch": 2.0574047274481426, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8225, + "step": 8530 + }, + { + "epoch": 2.058610709117221, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.65, + "step": 8535 + }, + { + "epoch": 2.0598166907863003, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.5742, + "step": 8540 + }, + { + "epoch": 2.061022672455379, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.5805, + "step": 8545 + }, + { + "epoch": 2.0622286541244574, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.7155, + "step": 8550 + }, + { + "epoch": 2.063434635793536, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8111, + "step": 8555 + }, + { + "epoch": 2.0646406174626146, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.7434, + "step": 8560 + }, + { + "epoch": 2.065846599131693, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.6068, + "step": 8565 + }, + { + "epoch": 2.067052580800772, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.7453, + "step": 8570 + }, + { + "epoch": 2.0682585624698504, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.7809, + "step": 8575 + }, + { + "epoch": 2.069464544138929, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.7651, + "step": 8580 + }, + { + "epoch": 2.0706705258080076, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.6758, + "step": 8585 + }, + { + "epoch": 2.071876507477086, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.5966, + "step": 8590 + }, + { + "epoch": 2.0730824891461648, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8621, + "step": 8595 + }, + { + "epoch": 2.074288470815244, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.7196, + "step": 8600 + }, + { + "epoch": 2.0754944524843224, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.7583, + "step": 8605 + }, + { + "epoch": 2.076700434153401, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.7316, + "step": 8610 + }, + { + "epoch": 2.0779064158224796, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.7126, + "step": 8615 + }, + { + "epoch": 2.079112397491558, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.5228, + "step": 8620 + }, + { + "epoch": 2.0803183791606368, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.6893, + "step": 8625 + }, + { + "epoch": 2.0815243608297154, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.7325, + "step": 8630 + }, + { + "epoch": 2.082730342498794, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.7844, + "step": 8635 + }, + { + "epoch": 2.0839363241678726, + "grad_norm": 1.9921875, + "learning_rate": 3e-05, + "loss": 3.8857, + "step": 8640 + }, + { + "epoch": 2.085142305836951, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.8558, + "step": 8645 + }, + { + "epoch": 2.0863482875060297, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.6917, + "step": 8650 + }, + { + "epoch": 2.0875542691751083, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.836, + "step": 8655 + }, + { + "epoch": 2.0887602508441874, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.7832, + "step": 8660 + }, + { + "epoch": 2.089966232513266, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.6065, + "step": 8665 + }, + { + "epoch": 2.0911722141823446, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.6953, + "step": 8670 + }, + { + "epoch": 2.092378195851423, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.6726, + "step": 8675 + }, + { + "epoch": 2.0935841775205017, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.7379, + "step": 8680 + }, + { + "epoch": 2.0947901591895803, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.7176, + "step": 8685 + }, + { + "epoch": 2.095996140858659, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8576, + "step": 8690 + }, + { + "epoch": 2.0972021225277375, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.6537, + "step": 8695 + }, + { + "epoch": 2.098408104196816, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8747, + "step": 8700 + }, + { + "epoch": 2.0996140858658947, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9617, + "step": 8705 + }, + { + "epoch": 2.1008200675349733, + "grad_norm": 3.40625, + "learning_rate": 3e-05, + "loss": 3.6493, + "step": 8710 + }, + { + "epoch": 2.102026049204052, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.5946, + "step": 8715 + }, + { + "epoch": 2.103232030873131, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8339, + "step": 8720 + }, + { + "epoch": 2.1044380125422095, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8405, + "step": 8725 + }, + { + "epoch": 2.105643994211288, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.6722, + "step": 8730 + }, + { + "epoch": 2.1068499758803667, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 3.7286, + "step": 8735 + }, + { + "epoch": 2.1080559575494453, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.6787, + "step": 8740 + }, + { + "epoch": 2.109261939218524, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 3.7956, + "step": 8745 + }, + { + "epoch": 2.1104679208876025, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.7473, + "step": 8750 + }, + { + "epoch": 2.111673902556681, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.7366, + "step": 8755 + }, + { + "epoch": 2.1128798842257597, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.8498, + "step": 8760 + }, + { + "epoch": 2.1140858658948383, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.6918, + "step": 8765 + }, + { + "epoch": 2.115291847563917, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.777, + "step": 8770 + }, + { + "epoch": 2.1164978292329955, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.8905, + "step": 8775 + }, + { + "epoch": 2.1177038109020745, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.8154, + "step": 8780 + }, + { + "epoch": 2.118909792571153, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.6955, + "step": 8785 + }, + { + "epoch": 2.1201157742402317, + "grad_norm": 3.578125, + "learning_rate": 3e-05, + "loss": 3.6849, + "step": 8790 + }, + { + "epoch": 2.1213217559093103, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.7385, + "step": 8795 + }, + { + "epoch": 2.122527737578389, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.6102, + "step": 8800 + }, + { + "epoch": 2.1237337192474675, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.6665, + "step": 8805 + }, + { + "epoch": 2.124939700916546, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.6648, + "step": 8810 + }, + { + "epoch": 2.1261456825856246, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8518, + "step": 8815 + }, + { + "epoch": 2.1273516642547032, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.739, + "step": 8820 + }, + { + "epoch": 2.128557645923782, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.7587, + "step": 8825 + }, + { + "epoch": 2.1297636275928604, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.7438, + "step": 8830 + }, + { + "epoch": 2.130969609261939, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.6728, + "step": 8835 + }, + { + "epoch": 2.132175590931018, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.7619, + "step": 8840 + }, + { + "epoch": 2.1333815726000966, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 3.864, + "step": 8845 + }, + { + "epoch": 2.1345875542691752, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 3.6923, + "step": 8850 + }, + { + "epoch": 2.135793535938254, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8623, + "step": 8855 + }, + { + "epoch": 2.1369995176073324, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 3.6683, + "step": 8860 + }, + { + "epoch": 2.138205499276411, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.649, + "step": 8865 + }, + { + "epoch": 2.1394114809454896, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.7971, + "step": 8870 + }, + { + "epoch": 2.140617462614568, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.78, + "step": 8875 + }, + { + "epoch": 2.141823444283647, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.7552, + "step": 8880 + }, + { + "epoch": 2.1430294259527254, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.6223, + "step": 8885 + }, + { + "epoch": 2.144235407621804, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7211, + "step": 8890 + }, + { + "epoch": 2.1454413892908826, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.6437, + "step": 8895 + }, + { + "epoch": 2.1466473709599616, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8143, + "step": 8900 + }, + { + "epoch": 2.14785335262904, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.7189, + "step": 8905 + }, + { + "epoch": 2.149059334298119, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.6055, + "step": 8910 + }, + { + "epoch": 2.1502653159671974, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.7114, + "step": 8915 + }, + { + "epoch": 2.151471297636276, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.5794, + "step": 8920 + }, + { + "epoch": 2.1526772793053546, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.6117, + "step": 8925 + }, + { + "epoch": 2.153883260974433, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.7645, + "step": 8930 + }, + { + "epoch": 2.1550892426435118, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8283, + "step": 8935 + }, + { + "epoch": 2.1562952243125904, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.7704, + "step": 8940 + }, + { + "epoch": 2.157501205981669, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.8142, + "step": 8945 + }, + { + "epoch": 2.1587071876507475, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9524, + "step": 8950 + }, + { + "epoch": 2.159913169319826, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.6887, + "step": 8955 + }, + { + "epoch": 2.161119150988905, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.573, + "step": 8960 + }, + { + "epoch": 2.1623251326579838, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.5456, + "step": 8965 + }, + { + "epoch": 2.1635311143270624, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.7748, + "step": 8970 + }, + { + "epoch": 2.164737095996141, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.7593, + "step": 8975 + }, + { + "epoch": 2.1659430776652195, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.7741, + "step": 8980 + }, + { + "epoch": 2.167149059334298, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.7436, + "step": 8985 + }, + { + "epoch": 2.1683550410033767, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.5113, + "step": 8990 + }, + { + "epoch": 2.1695610226724553, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.7764, + "step": 8995 + }, + { + "epoch": 2.170767004341534, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.719, + "step": 9000 + }, + { + "epoch": 2.0012059816690786, + "grad_norm": 3.4375, + "learning_rate": 3e-05, + "loss": 4.2476, + "step": 9005 + }, + { + "epoch": 2.002411963338157, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.3297, + "step": 9010 + }, + { + "epoch": 2.003617945007236, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.3612, + "step": 9015 + }, + { + "epoch": 2.0048239266763144, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.3168, + "step": 9020 + }, + { + "epoch": 2.006029908345393, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.2866, + "step": 9025 + }, + { + "epoch": 2.0072358900144716, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.2497, + "step": 9030 + }, + { + "epoch": 2.0084418716835506, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.1764, + "step": 9035 + }, + { + "epoch": 2.009647853352629, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.3846, + "step": 9040 + }, + { + "epoch": 2.010853835021708, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.3504, + "step": 9045 + }, + { + "epoch": 2.0120598166907864, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1678, + "step": 9050 + }, + { + "epoch": 2.013265798359865, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.2396, + "step": 9055 + }, + { + "epoch": 2.0144717800289436, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1989, + "step": 9060 + }, + { + "epoch": 2.015677761698022, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.2574, + "step": 9065 + }, + { + "epoch": 2.0168837433671007, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.3173, + "step": 9070 + }, + { + "epoch": 2.0180897250361793, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.2554, + "step": 9075 + }, + { + "epoch": 2.019295706705258, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.1888, + "step": 9080 + }, + { + "epoch": 2.0205016883743365, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.2018, + "step": 9085 + }, + { + "epoch": 2.0217076700434156, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 4.2681, + "step": 9090 + }, + { + "epoch": 2.022913651712494, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1149, + "step": 9095 + }, + { + "epoch": 2.0241196333815727, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.2563, + "step": 9100 + }, + { + "epoch": 2.0253256150506513, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.3224, + "step": 9105 + }, + { + "epoch": 2.02653159671973, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1351, + "step": 9110 + }, + { + "epoch": 2.0277375783888085, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.2382, + "step": 9115 + }, + { + "epoch": 2.028943560057887, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.2655, + "step": 9120 + }, + { + "epoch": 2.0301495417269657, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.215, + "step": 9125 + }, + { + "epoch": 2.0313555233960443, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.217, + "step": 9130 + }, + { + "epoch": 2.032561505065123, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 4.2117, + "step": 9135 + }, + { + "epoch": 2.0337674867342015, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.2194, + "step": 9140 + }, + { + "epoch": 2.03497346840328, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.3021, + "step": 9145 + }, + { + "epoch": 2.0361794500723587, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0593, + "step": 9150 + }, + { + "epoch": 2.0373854317414377, + "grad_norm": 3.734375, + "learning_rate": 3e-05, + "loss": 4.3386, + "step": 9155 + }, + { + "epoch": 2.0385914134105163, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.2206, + "step": 9160 + }, + { + "epoch": 2.039797395079595, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.2414, + "step": 9165 + }, + { + "epoch": 2.0410033767486735, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.3838, + "step": 9170 + }, + { + "epoch": 2.042209358417752, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 4.2496, + "step": 9175 + }, + { + "epoch": 2.0434153400868307, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.2251, + "step": 9180 + }, + { + "epoch": 2.0446213217559093, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.3328, + "step": 9185 + }, + { + "epoch": 2.045827303424988, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0846, + "step": 9190 + }, + { + "epoch": 2.0470332850940665, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.2989, + "step": 9195 + }, + { + "epoch": 2.048239266763145, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 4.2227, + "step": 9200 + }, + { + "epoch": 2.0494452484322236, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.2321, + "step": 9205 + }, + { + "epoch": 2.0506512301013027, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1743, + "step": 9210 + }, + { + "epoch": 2.0518572117703813, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 4.31, + "step": 9215 + }, + { + "epoch": 2.05306319343946, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.2312, + "step": 9220 + }, + { + "epoch": 2.0542691751085385, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.361, + "step": 9225 + }, + { + "epoch": 2.055475156777617, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0888, + "step": 9230 + }, + { + "epoch": 2.0566811384466956, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.3034, + "step": 9235 + }, + { + "epoch": 2.0578871201157742, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.1833, + "step": 9240 + }, + { + "epoch": 2.059093101784853, + "grad_norm": 3.71875, + "learning_rate": 3e-05, + "loss": 4.0506, + "step": 9245 + }, + { + "epoch": 2.0602990834539314, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1551, + "step": 9250 + }, + { + "epoch": 2.06150506512301, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.165, + "step": 9255 + }, + { + "epoch": 2.0627110467920886, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1569, + "step": 9260 + }, + { + "epoch": 2.063917028461167, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.2441, + "step": 9265 + }, + { + "epoch": 2.065123010130246, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1088, + "step": 9270 + }, + { + "epoch": 2.066328991799325, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.176, + "step": 9275 + }, + { + "epoch": 2.0675349734684034, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.1904, + "step": 9280 + }, + { + "epoch": 2.068740955137482, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.257, + "step": 9285 + }, + { + "epoch": 2.0699469368065606, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.236, + "step": 9290 + }, + { + "epoch": 2.071152918475639, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0485, + "step": 9295 + }, + { + "epoch": 2.072358900144718, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.1858, + "step": 9300 + }, + { + "epoch": 2.0735648818137964, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.2637, + "step": 9305 + }, + { + "epoch": 2.074770863482875, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.1634, + "step": 9310 + }, + { + "epoch": 2.0759768451519536, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.2879, + "step": 9315 + }, + { + "epoch": 2.077182826821032, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0801, + "step": 9320 + }, + { + "epoch": 2.0783888084901108, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.1329, + "step": 9325 + }, + { + "epoch": 2.07959479015919, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.1094, + "step": 9330 + }, + { + "epoch": 2.0808007718282684, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.1697, + "step": 9335 + }, + { + "epoch": 2.082006753497347, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.156, + "step": 9340 + }, + { + "epoch": 2.0832127351664256, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.3122, + "step": 9345 + }, + { + "epoch": 2.084418716835504, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.2945, + "step": 9350 + }, + { + "epoch": 2.0856246985045828, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.3271, + "step": 9355 + }, + { + "epoch": 2.0868306801736614, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.1325, + "step": 9360 + }, + { + "epoch": 2.08803666184274, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.2733, + "step": 9365 + }, + { + "epoch": 2.0892426435118185, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.2, + "step": 9370 + }, + { + "epoch": 2.090448625180897, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0549, + "step": 9375 + }, + { + "epoch": 2.0916546068499757, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.2768, + "step": 9380 + }, + { + "epoch": 2.0928605885190543, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1369, + "step": 9385 + }, + { + "epoch": 2.094066570188133, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.1692, + "step": 9390 + }, + { + "epoch": 2.095272551857212, + "grad_norm": 3.25, + "learning_rate": 3e-05, + "loss": 4.1711, + "step": 9395 + }, + { + "epoch": 2.0964785335262905, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.2876, + "step": 9400 + }, + { + "epoch": 2.097684515195369, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.2371, + "step": 9405 + }, + { + "epoch": 2.0988904968644477, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.2242, + "step": 9410 + }, + { + "epoch": 2.1000964785335263, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.2682, + "step": 9415 + }, + { + "epoch": 2.101302460202605, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1245, + "step": 9420 + }, + { + "epoch": 2.1025084418716835, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1288, + "step": 9425 + }, + { + "epoch": 2.103714423540762, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.4769, + "step": 9430 + }, + { + "epoch": 2.1049204052098407, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.1007, + "step": 9435 + }, + { + "epoch": 2.1061263868789193, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.09, + "step": 9440 + }, + { + "epoch": 2.107332368547998, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.1662, + "step": 9445 + }, + { + "epoch": 2.108538350217077, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.2663, + "step": 9450 + }, + { + "epoch": 2.1097443318861555, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.2089, + "step": 9455 + }, + { + "epoch": 2.110950313555234, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.2068, + "step": 9460 + }, + { + "epoch": 2.1121562952243127, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.2668, + "step": 9465 + }, + { + "epoch": 2.1133622768933913, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.2088, + "step": 9470 + }, + { + "epoch": 2.11456825856247, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.1264, + "step": 9475 + }, + { + "epoch": 2.1157742402315485, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.3307, + "step": 9480 + }, + { + "epoch": 2.116980221900627, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.3448, + "step": 9485 + }, + { + "epoch": 2.1181862035697057, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.1575, + "step": 9490 + }, + { + "epoch": 2.1193921852387843, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1604, + "step": 9495 + }, + { + "epoch": 2.120598166907863, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 4.2618, + "step": 9500 + }, + { + "epoch": 2.1218041485769414, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.0784, + "step": 9505 + }, + { + "epoch": 2.12301013024602, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0385, + "step": 9510 + }, + { + "epoch": 2.124216111915099, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.1256, + "step": 9515 + }, + { + "epoch": 2.1254220935841777, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1635, + "step": 9520 + }, + { + "epoch": 2.1266280752532563, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.3009, + "step": 9525 + }, + { + "epoch": 2.127834056922335, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.2126, + "step": 9530 + }, + { + "epoch": 2.1290400385914134, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.1022, + "step": 9535 + }, + { + "epoch": 2.130246020260492, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1408, + "step": 9540 + }, + { + "epoch": 2.1314520019295706, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.2241, + "step": 9545 + }, + { + "epoch": 2.132657983598649, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.2544, + "step": 9550 + }, + { + "epoch": 2.133863965267728, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.203, + "step": 9555 + }, + { + "epoch": 2.1350699469368064, + "grad_norm": 4.25, + "learning_rate": 3e-05, + "loss": 4.2632, + "step": 9560 + }, + { + "epoch": 2.136275928605885, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.1642, + "step": 9565 + }, + { + "epoch": 2.137481910274964, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.1256, + "step": 9570 + }, + { + "epoch": 2.1386878919440426, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.1642, + "step": 9575 + }, + { + "epoch": 2.1398938736131212, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.2296, + "step": 9580 + }, + { + "epoch": 2.1410998552822, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.2119, + "step": 9585 + }, + { + "epoch": 2.1423058369512784, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0698, + "step": 9590 + }, + { + "epoch": 2.143511818620357, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.1828, + "step": 9595 + }, + { + "epoch": 2.1447178002894356, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.2621, + "step": 9600 + }, + { + "epoch": 2.145923781958514, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.0704, + "step": 9605 + }, + { + "epoch": 2.147129763627593, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.2167, + "step": 9610 + }, + { + "epoch": 2.1483357452966714, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1329, + "step": 9615 + }, + { + "epoch": 2.14954172696575, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1253, + "step": 9620 + }, + { + "epoch": 2.1507477086348286, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.0756, + "step": 9625 + }, + { + "epoch": 2.151953690303907, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 4.0534, + "step": 9630 + }, + { + "epoch": 2.153159671972986, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.193, + "step": 9635 + }, + { + "epoch": 2.154365653642065, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.2204, + "step": 9640 + }, + { + "epoch": 2.1555716353111434, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.2818, + "step": 9645 + }, + { + "epoch": 2.156777616980222, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.1848, + "step": 9650 + }, + { + "epoch": 2.1579835986493006, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.2621, + "step": 9655 + }, + { + "epoch": 2.159189580318379, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.2945, + "step": 9660 + }, + { + "epoch": 2.1603955619874577, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1566, + "step": 9665 + }, + { + "epoch": 2.1616015436565363, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0023, + "step": 9670 + }, + { + "epoch": 2.162807525325615, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.1066, + "step": 9675 + }, + { + "epoch": 2.1640135069946935, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 4.2053, + "step": 9680 + }, + { + "epoch": 2.165219488663772, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.2369, + "step": 9685 + }, + { + "epoch": 2.166425470332851, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.2122, + "step": 9690 + }, + { + "epoch": 2.1676314520019297, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0694, + "step": 9695 + }, + { + "epoch": 2.1688374336710083, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0493, + "step": 9700 + }, + { + "epoch": 2.170043415340087, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.2667, + "step": 9705 + }, + { + "epoch": 2.1712493970091655, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.1695, + "step": 9710 + }, + { + "epoch": 2.172455378678244, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1245, + "step": 9715 + }, + { + "epoch": 2.1736613603473227, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.3299, + "step": 9720 + }, + { + "epoch": 2.1748673420164013, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.1907, + "step": 9725 + }, + { + "epoch": 2.17607332368548, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.1464, + "step": 9730 + }, + { + "epoch": 2.1772793053545585, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1861, + "step": 9735 + }, + { + "epoch": 2.178485287023637, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0978, + "step": 9740 + }, + { + "epoch": 2.1796912686927157, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1823, + "step": 9745 + }, + { + "epoch": 2.1808972503617943, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.2823, + "step": 9750 + }, + { + "epoch": 2.1821032320308733, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.2414, + "step": 9755 + }, + { + "epoch": 2.183309213699952, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.1525, + "step": 9760 + }, + { + "epoch": 2.1845151953690305, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.1164, + "step": 9765 + }, + { + "epoch": 2.185721177038109, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.1465, + "step": 9770 + }, + { + "epoch": 2.1869271587071877, + "grad_norm": 4.28125, + "learning_rate": 3e-05, + "loss": 4.1222, + "step": 9775 + }, + { + "epoch": 2.1881331403762663, + "grad_norm": 3.890625, + "learning_rate": 3e-05, + "loss": 4.1895, + "step": 9780 + }, + { + "epoch": 2.189339122045345, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.3288, + "step": 9785 + }, + { + "epoch": 2.1905451037144235, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.1525, + "step": 9790 + }, + { + "epoch": 2.191751085383502, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.2228, + "step": 9795 + }, + { + "epoch": 2.1929570670525806, + "grad_norm": 3.5, + "learning_rate": 3e-05, + "loss": 4.3341, + "step": 9800 + }, + { + "epoch": 2.1941630487216592, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.2245, + "step": 9805 + }, + { + "epoch": 2.1953690303907383, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0826, + "step": 9810 + }, + { + "epoch": 2.196575012059817, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.1116, + "step": 9815 + }, + { + "epoch": 2.1977809937288955, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.1183, + "step": 9820 + }, + { + "epoch": 2.198986975397974, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0587, + "step": 9825 + }, + { + "epoch": 2.2001929570670526, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.1993, + "step": 9830 + }, + { + "epoch": 2.2013989387361312, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.1627, + "step": 9835 + }, + { + "epoch": 2.20260492040521, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.2873, + "step": 9840 + }, + { + "epoch": 2.2038109020742884, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.2517, + "step": 9845 + }, + { + "epoch": 2.205016883743367, + "grad_norm": 2.0, + "learning_rate": 3e-05, + "loss": 4.2902, + "step": 9850 + }, + { + "epoch": 2.2062228654124456, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.1335, + "step": 9855 + }, + { + "epoch": 2.207428847081524, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.1005, + "step": 9860 + }, + { + "epoch": 2.208634828750603, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.3688, + "step": 9865 + }, + { + "epoch": 2.2098408104196814, + "grad_norm": 3.265625, + "learning_rate": 3e-05, + "loss": 4.2761, + "step": 9870 + }, + { + "epoch": 2.2110467920887604, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9508, + "step": 9875 + }, + { + "epoch": 2.212252773757839, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.2251, + "step": 9880 + }, + { + "epoch": 2.2134587554269176, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1957, + "step": 9885 + }, + { + "epoch": 2.214664737095996, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.1688, + "step": 9890 + }, + { + "epoch": 2.215870718765075, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.2139, + "step": 9895 + }, + { + "epoch": 2.2170767004341534, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.2213, + "step": 9900 + }, + { + "epoch": 2.218282682103232, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.13, + "step": 9905 + }, + { + "epoch": 2.2194886637723106, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1776, + "step": 9910 + }, + { + "epoch": 2.220694645441389, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 4.1426, + "step": 9915 + }, + { + "epoch": 2.2219006271104678, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1578, + "step": 9920 + }, + { + "epoch": 2.2231066087795464, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0207, + "step": 9925 + }, + { + "epoch": 2.2243125904486254, + "grad_norm": 3.359375, + "learning_rate": 3e-05, + "loss": 4.2973, + "step": 9930 + }, + { + "epoch": 2.225518572117704, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0343, + "step": 9935 + }, + { + "epoch": 2.2267245537867826, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 4.0924, + "step": 9940 + }, + { + "epoch": 2.227930535455861, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1201, + "step": 9945 + }, + { + "epoch": 2.2291365171249398, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0651, + "step": 9950 + }, + { + "epoch": 2.2303424987940184, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.2184, + "step": 9955 + }, + { + "epoch": 2.231548480463097, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.0086, + "step": 9960 + }, + { + "epoch": 2.2327544621321755, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.2001, + "step": 9965 + }, + { + "epoch": 2.233960443801254, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.3328, + "step": 9970 + }, + { + "epoch": 2.2351664254703327, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.2686, + "step": 9975 + }, + { + "epoch": 2.2363724071394113, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0926, + "step": 9980 + }, + { + "epoch": 2.23757838880849, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.2253, + "step": 9985 + }, + { + "epoch": 2.2387843704775685, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0817, + "step": 9990 + }, + { + "epoch": 2.2399903521466475, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.2543, + "step": 9995 + }, + { + "epoch": 2.241196333815726, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0528, + "step": 10000 + }, + { + "epoch": 2.2424023154848047, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.2123, + "step": 10005 + }, + { + "epoch": 2.2436082971538833, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.1992, + "step": 10010 + }, + { + "epoch": 2.244814278822962, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.212, + "step": 10015 + }, + { + "epoch": 2.2460202604920405, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.1248, + "step": 10020 + }, + { + "epoch": 2.247226242161119, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.2627, + "step": 10025 + }, + { + "epoch": 2.2484322238301977, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.103, + "step": 10030 + }, + { + "epoch": 2.2496382054992763, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1098, + "step": 10035 + }, + { + "epoch": 2.250844187168355, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.1861, + "step": 10040 + }, + { + "epoch": 2.2520501688374335, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1603, + "step": 10045 + }, + { + "epoch": 2.2532561505065125, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.1436, + "step": 10050 + }, + { + "epoch": 2.254462132175591, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.1656, + "step": 10055 + }, + { + "epoch": 2.2556681138446697, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9487, + "step": 10060 + }, + { + "epoch": 2.2568740955137483, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1735, + "step": 10065 + }, + { + "epoch": 2.258080077182827, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1959, + "step": 10070 + }, + { + "epoch": 2.2592860588519055, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.1209, + "step": 10075 + }, + { + "epoch": 2.260492040520984, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1955, + "step": 10080 + }, + { + "epoch": 2.2616980221900627, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1891, + "step": 10085 + }, + { + "epoch": 2.2629040038591413, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0758, + "step": 10090 + }, + { + "epoch": 2.26410998552822, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1674, + "step": 10095 + }, + { + "epoch": 2.2653159671972984, + "grad_norm": 3.3125, + "learning_rate": 3e-05, + "loss": 4.1084, + "step": 10100 + }, + { + "epoch": 2.2665219488663775, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1182, + "step": 10105 + }, + { + "epoch": 2.2677279305354556, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0088, + "step": 10110 + }, + { + "epoch": 2.2689339122045347, + "grad_norm": 3.53125, + "learning_rate": 3e-05, + "loss": 4.2343, + "step": 10115 + }, + { + "epoch": 2.2701398938736133, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0165, + "step": 10120 + }, + { + "epoch": 2.271345875542692, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0021, + "step": 10125 + }, + { + "epoch": 2.2725518572117704, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.1563, + "step": 10130 + }, + { + "epoch": 2.273757838880849, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0757, + "step": 10135 + }, + { + "epoch": 2.2749638205499276, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.1532, + "step": 10140 + }, + { + "epoch": 2.276169802219006, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.9921, + "step": 10145 + }, + { + "epoch": 2.277375783888085, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.1515, + "step": 10150 + }, + { + "epoch": 2.2785817655571634, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.2955, + "step": 10155 + }, + { + "epoch": 2.279787747226242, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.266, + "step": 10160 + }, + { + "epoch": 2.2809937288953206, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.061, + "step": 10165 + }, + { + "epoch": 2.2821997105643996, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.19, + "step": 10170 + }, + { + "epoch": 2.2834056922334782, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0745, + "step": 10175 + }, + { + "epoch": 2.284611673902557, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.1898, + "step": 10180 + }, + { + "epoch": 2.2858176555716354, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0209, + "step": 10185 + }, + { + "epoch": 2.287023637240714, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.2058, + "step": 10190 + }, + { + "epoch": 2.2882296189097926, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.183, + "step": 10195 + }, + { + "epoch": 2.289435600578871, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1823, + "step": 10200 + }, + { + "epoch": 2.29064158224795, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.1024, + "step": 10205 + }, + { + "epoch": 2.2918475639170284, + "grad_norm": 1.9375, + "learning_rate": 3e-05, + "loss": 4.1754, + "step": 10210 + }, + { + "epoch": 2.293053545586107, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.117, + "step": 10215 + }, + { + "epoch": 2.2942595272551856, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.022, + "step": 10220 + }, + { + "epoch": 2.2954655089242646, + "grad_norm": 1.9921875, + "learning_rate": 3e-05, + "loss": 4.2116, + "step": 10225 + }, + { + "epoch": 2.2966714905933427, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1104, + "step": 10230 + }, + { + "epoch": 2.297877472262422, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1166, + "step": 10235 + }, + { + "epoch": 2.2990834539315004, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.1814, + "step": 10240 + }, + { + "epoch": 2.300289435600579, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.1516, + "step": 10245 + }, + { + "epoch": 2.3014954172696576, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0829, + "step": 10250 + }, + { + "epoch": 2.302701398938736, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1533, + "step": 10255 + }, + { + "epoch": 2.3039073806078147, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.1477, + "step": 10260 + }, + { + "epoch": 2.3051133622768933, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0464, + "step": 10265 + }, + { + "epoch": 2.306319343945972, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.086, + "step": 10270 + }, + { + "epoch": 2.3075253256150505, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.2252, + "step": 10275 + }, + { + "epoch": 2.308731307284129, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0657, + "step": 10280 + }, + { + "epoch": 2.3099372889532077, + "grad_norm": 3.21875, + "learning_rate": 3e-05, + "loss": 4.1504, + "step": 10285 + }, + { + "epoch": 2.3111432706222867, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.8939, + "step": 10290 + }, + { + "epoch": 2.3123492522913653, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.1186, + "step": 10295 + }, + { + "epoch": 2.313555233960444, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.1986, + "step": 10300 + }, + { + "epoch": 2.3147612156295225, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.2401, + "step": 10305 + }, + { + "epoch": 2.315967197298601, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.1075, + "step": 10310 + }, + { + "epoch": 2.3171731789676797, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.2698, + "step": 10315 + }, + { + "epoch": 2.3183791606367583, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.2006, + "step": 10320 + }, + { + "epoch": 2.319585142305837, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0476, + "step": 10325 + }, + { + "epoch": 2.3207911239749155, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0846, + "step": 10330 + }, + { + "epoch": 2.321997105643994, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0502, + "step": 10335 + }, + { + "epoch": 2.3232030873130727, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.1355, + "step": 10340 + }, + { + "epoch": 2.3244090689821517, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.2342, + "step": 10345 + }, + { + "epoch": 2.32561505065123, + "grad_norm": 3.453125, + "learning_rate": 3e-05, + "loss": 4.1098, + "step": 10350 + }, + { + "epoch": 2.326821032320309, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.3324, + "step": 10355 + }, + { + "epoch": 2.3280270139893875, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1441, + "step": 10360 + }, + { + "epoch": 2.329232995658466, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1193, + "step": 10365 + }, + { + "epoch": 2.3304389773275447, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 4.1809, + "step": 10370 + }, + { + "epoch": 2.3316449589966233, + "grad_norm": 3.640625, + "learning_rate": 3e-05, + "loss": 4.2074, + "step": 10375 + }, + { + "epoch": 2.332850940665702, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.1207, + "step": 10380 + }, + { + "epoch": 2.3340569223347805, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1611, + "step": 10385 + }, + { + "epoch": 2.335262904003859, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9704, + "step": 10390 + }, + { + "epoch": 2.3364688856729376, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.2003, + "step": 10395 + }, + { + "epoch": 2.3376748673420162, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.1499, + "step": 10400 + }, + { + "epoch": 2.338880849011095, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.333, + "step": 10405 + }, + { + "epoch": 2.340086830680174, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.2561, + "step": 10410 + }, + { + "epoch": 2.3412928123492525, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0917, + "step": 10415 + }, + { + "epoch": 2.342498794018331, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1949, + "step": 10420 + }, + { + "epoch": 2.3437047756874096, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0743, + "step": 10425 + }, + { + "epoch": 2.3449107573564882, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1132, + "step": 10430 + }, + { + "epoch": 2.346116739025567, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.3364, + "step": 10435 + }, + { + "epoch": 2.3473227206946454, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.2251, + "step": 10440 + }, + { + "epoch": 2.348528702363724, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.1887, + "step": 10445 + }, + { + "epoch": 2.3497346840328026, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.2043, + "step": 10450 + }, + { + "epoch": 2.350940665701881, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.077, + "step": 10455 + }, + { + "epoch": 2.35214664737096, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0716, + "step": 10460 + }, + { + "epoch": 2.353352629040039, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.3089, + "step": 10465 + }, + { + "epoch": 2.354558610709117, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.1143, + "step": 10470 + }, + { + "epoch": 2.355764592378196, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.2583, + "step": 10475 + }, + { + "epoch": 2.3569705740472746, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.029, + "step": 10480 + }, + { + "epoch": 2.358176555716353, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.1102, + "step": 10485 + }, + { + "epoch": 2.359382537385432, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.1991, + "step": 10490 + }, + { + "epoch": 2.3605885190545104, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.1438, + "step": 10495 + }, + { + "epoch": 2.361794500723589, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0709, + "step": 10500 + }, + { + "epoch": 2.3630004823926676, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1308, + "step": 10505 + }, + { + "epoch": 2.364206464061746, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0969, + "step": 10510 + }, + { + "epoch": 2.3654124457308248, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0492, + "step": 10515 + }, + { + "epoch": 2.3666184273999034, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.079, + "step": 10520 + }, + { + "epoch": 2.367824409068982, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.2073, + "step": 10525 + }, + { + "epoch": 2.369030390738061, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.3452, + "step": 10530 + }, + { + "epoch": 2.3702363724071396, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.1712, + "step": 10535 + }, + { + "epoch": 2.371442354076218, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1475, + "step": 10540 + }, + { + "epoch": 2.3726483357452968, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.0714, + "step": 10545 + }, + { + "epoch": 2.3738543174143754, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0849, + "step": 10550 + }, + { + "epoch": 2.375060299083454, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1826, + "step": 10555 + }, + { + "epoch": 2.3762662807525325, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1175, + "step": 10560 + }, + { + "epoch": 2.377472262421611, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9737, + "step": 10565 + }, + { + "epoch": 2.3786782440906897, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.0004, + "step": 10570 + }, + { + "epoch": 2.3798842257597683, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.2172, + "step": 10575 + }, + { + "epoch": 2.381090207428847, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.2544, + "step": 10580 + }, + { + "epoch": 2.382296189097926, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0196, + "step": 10585 + }, + { + "epoch": 2.383502170767004, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0762, + "step": 10590 + }, + { + "epoch": 2.384708152436083, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0776, + "step": 10595 + }, + { + "epoch": 2.3859141341051617, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9896, + "step": 10600 + }, + { + "epoch": 2.3871201157742403, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.3171, + "step": 10605 + }, + { + "epoch": 2.388326097443319, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1992, + "step": 10610 + }, + { + "epoch": 2.3895320791123975, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.2065, + "step": 10615 + }, + { + "epoch": 2.390738060781476, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0532, + "step": 10620 + }, + { + "epoch": 2.3919440424505547, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1709, + "step": 10625 + }, + { + "epoch": 2.3931500241196333, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0705, + "step": 10630 + }, + { + "epoch": 2.394356005788712, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.1072, + "step": 10635 + }, + { + "epoch": 2.3955619874577905, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.1484, + "step": 10640 + }, + { + "epoch": 2.396767969126869, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.1344, + "step": 10645 + }, + { + "epoch": 2.397973950795948, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.2026, + "step": 10650 + }, + { + "epoch": 2.3991799324650267, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0286, + "step": 10655 + }, + { + "epoch": 2.4003859141341053, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 4.1975, + "step": 10660 + }, + { + "epoch": 2.401591895803184, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.2446, + "step": 10665 + }, + { + "epoch": 2.4027978774722625, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1889, + "step": 10670 + }, + { + "epoch": 2.404003859141341, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.2531, + "step": 10675 + }, + { + "epoch": 2.4052098408104197, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.2263, + "step": 10680 + }, + { + "epoch": 2.4064158224794983, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.2489, + "step": 10685 + }, + { + "epoch": 2.407621804148577, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.2175, + "step": 10690 + }, + { + "epoch": 2.4088277858176554, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0807, + "step": 10695 + }, + { + "epoch": 2.410033767486734, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1817, + "step": 10700 + }, + { + "epoch": 2.411239749155813, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.1689, + "step": 10705 + }, + { + "epoch": 2.412445730824891, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9629, + "step": 10710 + }, + { + "epoch": 2.4136517124939703, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1763, + "step": 10715 + }, + { + "epoch": 2.414857694163049, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1753, + "step": 10720 + }, + { + "epoch": 2.4160636758321274, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.1507, + "step": 10725 + }, + { + "epoch": 2.417269657501206, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0001, + "step": 10730 + }, + { + "epoch": 2.4184756391702846, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.108, + "step": 10735 + }, + { + "epoch": 2.4196816208393632, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.1973, + "step": 10740 + }, + { + "epoch": 2.420887602508442, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.973, + "step": 10745 + }, + { + "epoch": 2.4220935841775204, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.2821, + "step": 10750 + }, + { + "epoch": 2.423299565846599, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.143, + "step": 10755 + }, + { + "epoch": 2.4245055475156776, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0657, + "step": 10760 + }, + { + "epoch": 2.425711529184756, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.1022, + "step": 10765 + }, + { + "epoch": 2.4269175108538352, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0939, + "step": 10770 + }, + { + "epoch": 2.428123492522914, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.077, + "step": 10775 + }, + { + "epoch": 2.4293294741919924, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0794, + "step": 10780 + }, + { + "epoch": 2.430535455861071, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.104, + "step": 10785 + }, + { + "epoch": 2.4317414375301496, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1079, + "step": 10790 + }, + { + "epoch": 2.432947419199228, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.1413, + "step": 10795 + }, + { + "epoch": 2.434153400868307, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.2506, + "step": 10800 + }, + { + "epoch": 2.4353593825373854, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1035, + "step": 10805 + }, + { + "epoch": 2.436565364206464, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1023, + "step": 10810 + }, + { + "epoch": 2.4377713458755426, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0791, + "step": 10815 + }, + { + "epoch": 2.438977327544621, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0464, + "step": 10820 + }, + { + "epoch": 2.4401833092137, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1936, + "step": 10825 + }, + { + "epoch": 2.4413892908827783, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.1159, + "step": 10830 + }, + { + "epoch": 2.4425952725518574, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.2167, + "step": 10835 + }, + { + "epoch": 2.443801254220936, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1819, + "step": 10840 + }, + { + "epoch": 2.4450072358900146, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1397, + "step": 10845 + }, + { + "epoch": 2.446213217559093, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.0418, + "step": 10850 + }, + { + "epoch": 2.4474191992281717, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.2023, + "step": 10855 + }, + { + "epoch": 2.4486251808972503, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1893, + "step": 10860 + }, + { + "epoch": 2.449831162566329, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0535, + "step": 10865 + }, + { + "epoch": 2.4510371442354075, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.231, + "step": 10870 + }, + { + "epoch": 2.452243125904486, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.0816, + "step": 10875 + }, + { + "epoch": 2.4534491075735647, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.1265, + "step": 10880 + }, + { + "epoch": 2.4546550892426433, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.1443, + "step": 10885 + }, + { + "epoch": 2.4558610709117223, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9236, + "step": 10890 + }, + { + "epoch": 2.457067052580801, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1654, + "step": 10895 + }, + { + "epoch": 2.4582730342498795, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1724, + "step": 10900 + }, + { + "epoch": 2.459479015918958, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0959, + "step": 10905 + }, + { + "epoch": 2.4606849975880367, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.1286, + "step": 10910 + }, + { + "epoch": 2.4618909792571153, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.1728, + "step": 10915 + }, + { + "epoch": 2.463096960926194, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.2028, + "step": 10920 + }, + { + "epoch": 2.4643029425952725, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1971, + "step": 10925 + }, + { + "epoch": 2.465508924264351, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1073, + "step": 10930 + }, + { + "epoch": 2.4667149059334297, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.1725, + "step": 10935 + }, + { + "epoch": 2.4679208876025083, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.2382, + "step": 10940 + }, + { + "epoch": 2.4691268692715873, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.2438, + "step": 10945 + }, + { + "epoch": 2.4703328509406655, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.144, + "step": 10950 + }, + { + "epoch": 2.4715388326097445, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.2194, + "step": 10955 + }, + { + "epoch": 2.472744814278823, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.1659, + "step": 10960 + }, + { + "epoch": 2.4739507959479017, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.2302, + "step": 10965 + }, + { + "epoch": 2.4751567776169803, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.1926, + "step": 10970 + }, + { + "epoch": 2.476362759286059, + "grad_norm": 3.28125, + "learning_rate": 3e-05, + "loss": 4.2911, + "step": 10975 + }, + { + "epoch": 2.4775687409551375, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1058, + "step": 10980 + }, + { + "epoch": 2.478774722624216, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0897, + "step": 10985 + }, + { + "epoch": 2.4799807042932946, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.2864, + "step": 10990 + }, + { + "epoch": 2.4811866859623732, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.0392, + "step": 10995 + }, + { + "epoch": 2.482392667631452, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.2263, + "step": 11000 + }, + { + "epoch": 2.4835986493005304, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0793, + "step": 11005 + }, + { + "epoch": 2.4848046309696095, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9963, + "step": 11010 + }, + { + "epoch": 2.486010612638688, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.1093, + "step": 11015 + }, + { + "epoch": 2.4872165943077666, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.1803, + "step": 11020 + }, + { + "epoch": 2.4884225759768452, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1233, + "step": 11025 + }, + { + "epoch": 2.489628557645924, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0788, + "step": 11030 + }, + { + "epoch": 2.4908345393150024, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1316, + "step": 11035 + }, + { + "epoch": 2.492040520984081, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.1877, + "step": 11040 + }, + { + "epoch": 2.4932465026531596, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0385, + "step": 11045 + }, + { + "epoch": 2.494452484322238, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.1771, + "step": 11050 + }, + { + "epoch": 2.495658465991317, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 3.9814, + "step": 11055 + }, + { + "epoch": 2.4968644476603954, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.0832, + "step": 11060 + }, + { + "epoch": 2.4980704293294744, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.1175, + "step": 11065 + }, + { + "epoch": 2.4992764109985526, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.1414, + "step": 11070 + }, + { + "epoch": 2.5004823926676316, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0533, + "step": 11075 + }, + { + "epoch": 2.50168837433671, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0339, + "step": 11080 + }, + { + "epoch": 2.502894356005789, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0856, + "step": 11085 + }, + { + "epoch": 2.5041003376748674, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0876, + "step": 11090 + }, + { + "epoch": 2.505306319343946, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.2181, + "step": 11095 + }, + { + "epoch": 2.5065123010130246, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.153, + "step": 11100 + }, + { + "epoch": 2.507718282682103, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.2022, + "step": 11105 + }, + { + "epoch": 2.5089242643511818, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1784, + "step": 11110 + }, + { + "epoch": 2.5101302460202604, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1909, + "step": 11115 + }, + { + "epoch": 2.5113362276893394, + "grad_norm": 3.578125, + "learning_rate": 3e-05, + "loss": 4.19, + "step": 11120 + }, + { + "epoch": 2.5125422093584175, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.1537, + "step": 11125 + }, + { + "epoch": 2.5137481910274966, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.2259, + "step": 11130 + }, + { + "epoch": 2.514954172696575, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1133, + "step": 11135 + }, + { + "epoch": 2.5161601543656538, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.1808, + "step": 11140 + }, + { + "epoch": 2.5173661360347324, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.1005, + "step": 11145 + }, + { + "epoch": 2.518572117703811, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0976, + "step": 11150 + }, + { + "epoch": 2.5197780993728895, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1336, + "step": 11155 + }, + { + "epoch": 2.520984081041968, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0542, + "step": 11160 + }, + { + "epoch": 2.5221900627110467, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9754, + "step": 11165 + }, + { + "epoch": 2.5233960443801253, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1391, + "step": 11170 + }, + { + "epoch": 2.524602026049204, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.0785, + "step": 11175 + }, + { + "epoch": 2.5258080077182825, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1229, + "step": 11180 + }, + { + "epoch": 2.5270139893873615, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0562, + "step": 11185 + }, + { + "epoch": 2.5282199710564397, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1304, + "step": 11190 + }, + { + "epoch": 2.5294259527255187, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1364, + "step": 11195 + }, + { + "epoch": 2.5306319343945973, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.061, + "step": 11200 + }, + { + "epoch": 2.531837916063676, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0671, + "step": 11205 + }, + { + "epoch": 2.5330438977327545, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0776, + "step": 11210 + }, + { + "epoch": 2.534249879401833, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1167, + "step": 11215 + }, + { + "epoch": 2.5354558610709117, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0123, + "step": 11220 + }, + { + "epoch": 2.5366618427399903, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.1062, + "step": 11225 + }, + { + "epoch": 2.537867824409069, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.9938, + "step": 11230 + }, + { + "epoch": 2.5390738060781475, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1367, + "step": 11235 + }, + { + "epoch": 2.5402797877472265, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9744, + "step": 11240 + }, + { + "epoch": 2.5414857694163047, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.1372, + "step": 11245 + }, + { + "epoch": 2.5426917510853837, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 4.1641, + "step": 11250 + }, + { + "epoch": 2.5438977327544623, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0557, + "step": 11255 + }, + { + "epoch": 2.545103714423541, + "grad_norm": 3.25, + "learning_rate": 3e-05, + "loss": 4.1024, + "step": 11260 + }, + { + "epoch": 2.5463096960926195, + "grad_norm": 1.953125, + "learning_rate": 3e-05, + "loss": 4.0954, + "step": 11265 + }, + { + "epoch": 2.547515677761698, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1508, + "step": 11270 + }, + { + "epoch": 2.5487216594307767, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1624, + "step": 11275 + }, + { + "epoch": 2.5499276410998553, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.989, + "step": 11280 + }, + { + "epoch": 2.551133622768934, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1942, + "step": 11285 + }, + { + "epoch": 2.5523396044380124, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1694, + "step": 11290 + }, + { + "epoch": 2.553545586107091, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0985, + "step": 11295 + }, + { + "epoch": 2.5547515677761696, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.2778, + "step": 11300 + }, + { + "epoch": 2.5559575494452487, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1524, + "step": 11305 + }, + { + "epoch": 2.557163531114327, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0022, + "step": 11310 + }, + { + "epoch": 2.558369512783406, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.044, + "step": 11315 + }, + { + "epoch": 2.5595754944524844, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 4.2498, + "step": 11320 + }, + { + "epoch": 2.560781476121563, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.1297, + "step": 11325 + }, + { + "epoch": 2.5619874577906416, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1091, + "step": 11330 + }, + { + "epoch": 2.5631934394597202, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9753, + "step": 11335 + }, + { + "epoch": 2.564399421128799, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0582, + "step": 11340 + }, + { + "epoch": 2.5656054027978774, + "grad_norm": 3.453125, + "learning_rate": 3e-05, + "loss": 3.9721, + "step": 11345 + }, + { + "epoch": 2.566811384466956, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.142, + "step": 11350 + }, + { + "epoch": 2.5680173661360346, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.3733, + "step": 11355 + }, + { + "epoch": 2.5692233478051136, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.1027, + "step": 11360 + }, + { + "epoch": 2.570429329474192, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9416, + "step": 11365 + }, + { + "epoch": 2.571635311143271, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1964, + "step": 11370 + }, + { + "epoch": 2.5728412928123494, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.2444, + "step": 11375 + }, + { + "epoch": 2.574047274481428, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9891, + "step": 11380 + }, + { + "epoch": 2.5752532561505066, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.1265, + "step": 11385 + }, + { + "epoch": 2.576459237819585, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1045, + "step": 11390 + }, + { + "epoch": 2.577665219488664, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.181, + "step": 11395 + }, + { + "epoch": 2.5788712011577424, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1055, + "step": 11400 + }, + { + "epoch": 2.580077182826821, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1873, + "step": 11405 + }, + { + "epoch": 2.5812831644958996, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0753, + "step": 11410 + }, + { + "epoch": 2.582489146164978, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.2336, + "step": 11415 + }, + { + "epoch": 2.5836951278340567, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0895, + "step": 11420 + }, + { + "epoch": 2.584901109503136, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.0433, + "step": 11425 + }, + { + "epoch": 2.586107091172214, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 4.2776, + "step": 11430 + }, + { + "epoch": 2.587313072841293, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0463, + "step": 11435 + }, + { + "epoch": 2.5885190545103716, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1704, + "step": 11440 + }, + { + "epoch": 2.58972503617945, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.1732, + "step": 11445 + }, + { + "epoch": 2.5909310178485288, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1178, + "step": 11450 + }, + { + "epoch": 2.5921369995176073, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1073, + "step": 11455 + }, + { + "epoch": 2.593342981186686, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.2243, + "step": 11460 + }, + { + "epoch": 2.5945489628557645, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.023, + "step": 11465 + }, + { + "epoch": 2.595754944524843, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1639, + "step": 11470 + }, + { + "epoch": 2.5969609261939217, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0331, + "step": 11475 + }, + { + "epoch": 2.5981669078630008, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9988, + "step": 11480 + }, + { + "epoch": 2.599372889532079, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.176, + "step": 11485 + }, + { + "epoch": 2.600578871201158, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.067, + "step": 11490 + }, + { + "epoch": 2.6017848528702365, + "grad_norm": 3.25, + "learning_rate": 3e-05, + "loss": 4.1904, + "step": 11495 + }, + { + "epoch": 2.602990834539315, + "grad_norm": 3.4375, + "learning_rate": 3e-05, + "loss": 4.098, + "step": 11500 + }, + { + "epoch": 2.6041968162083937, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1901, + "step": 11505 + }, + { + "epoch": 2.6054027978774723, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1816, + "step": 11510 + }, + { + "epoch": 2.606608779546551, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.137, + "step": 11515 + }, + { + "epoch": 2.6078147612156295, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0361, + "step": 11520 + }, + { + "epoch": 2.609020742884708, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.2011, + "step": 11525 + }, + { + "epoch": 2.6102267245537867, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.137, + "step": 11530 + }, + { + "epoch": 2.6114327062228653, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1566, + "step": 11535 + }, + { + "epoch": 2.612638687891944, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.2693, + "step": 11540 + }, + { + "epoch": 2.613844669561023, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.2253, + "step": 11545 + }, + { + "epoch": 2.615050651230101, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.2158, + "step": 11550 + }, + { + "epoch": 2.61625663289918, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.0551, + "step": 11555 + }, + { + "epoch": 2.6174626145682587, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0063, + "step": 11560 + }, + { + "epoch": 2.6186685962373373, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.081, + "step": 11565 + }, + { + "epoch": 2.619874577906416, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 4.1738, + "step": 11570 + }, + { + "epoch": 2.6210805595754945, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0546, + "step": 11575 + }, + { + "epoch": 2.622286541244573, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1841, + "step": 11580 + }, + { + "epoch": 2.6234925229136516, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0773, + "step": 11585 + }, + { + "epoch": 2.6246985045827302, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0238, + "step": 11590 + }, + { + "epoch": 2.625904486251809, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.1272, + "step": 11595 + }, + { + "epoch": 2.627110467920888, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0367, + "step": 11600 + }, + { + "epoch": 2.628316449589966, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.2925, + "step": 11605 + }, + { + "epoch": 2.629522431259045, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.1793, + "step": 11610 + }, + { + "epoch": 2.6307284129281236, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.203, + "step": 11615 + }, + { + "epoch": 2.6319343945972022, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0658, + "step": 11620 + }, + { + "epoch": 2.633140376266281, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.1987, + "step": 11625 + }, + { + "epoch": 2.6343463579353594, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.048, + "step": 11630 + }, + { + "epoch": 2.635552339604438, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.124, + "step": 11635 + }, + { + "epoch": 2.6367583212735166, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0372, + "step": 11640 + }, + { + "epoch": 2.637964302942595, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0557, + "step": 11645 + }, + { + "epoch": 2.639170284611674, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0853, + "step": 11650 + }, + { + "epoch": 2.6403762662807524, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.1164, + "step": 11655 + }, + { + "epoch": 2.641582247949831, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.2068, + "step": 11660 + }, + { + "epoch": 2.64278822961891, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.2862, + "step": 11665 + }, + { + "epoch": 2.643994211287988, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.1415, + "step": 11670 + }, + { + "epoch": 2.645200192957067, + "grad_norm": 4.0625, + "learning_rate": 3e-05, + "loss": 4.1748, + "step": 11675 + }, + { + "epoch": 2.646406174626146, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9935, + "step": 11680 + }, + { + "epoch": 2.6476121562952244, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0582, + "step": 11685 + }, + { + "epoch": 2.648818137964303, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0076, + "step": 11690 + }, + { + "epoch": 2.6500241196333816, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9872, + "step": 11695 + }, + { + "epoch": 2.65123010130246, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0618, + "step": 11700 + }, + { + "epoch": 2.6524360829715388, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.1482, + "step": 11705 + }, + { + "epoch": 2.6536420646406174, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.1659, + "step": 11710 + }, + { + "epoch": 2.654848046309696, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0781, + "step": 11715 + }, + { + "epoch": 2.656054027978775, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1076, + "step": 11720 + }, + { + "epoch": 2.657260009647853, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0904, + "step": 11725 + }, + { + "epoch": 2.658465991316932, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.2026, + "step": 11730 + }, + { + "epoch": 2.6596719729860108, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.2472, + "step": 11735 + }, + { + "epoch": 2.6608779546550894, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.2508, + "step": 11740 + }, + { + "epoch": 2.662083936324168, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0978, + "step": 11745 + }, + { + "epoch": 2.6632899179932465, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.2511, + "step": 11750 + }, + { + "epoch": 2.664495899662325, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0748, + "step": 11755 + }, + { + "epoch": 2.6657018813314037, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0397, + "step": 11760 + }, + { + "epoch": 2.6669078630004823, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.2276, + "step": 11765 + }, + { + "epoch": 2.668113844669561, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1341, + "step": 11770 + }, + { + "epoch": 2.6693198263386395, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.1468, + "step": 11775 + }, + { + "epoch": 2.670525808007718, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.2332, + "step": 11780 + }, + { + "epoch": 2.671731789676797, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1936, + "step": 11785 + }, + { + "epoch": 2.6729377713458753, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.128, + "step": 11790 + }, + { + "epoch": 2.6741437530149543, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.989, + "step": 11795 + }, + { + "epoch": 2.675349734684033, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1033, + "step": 11800 + }, + { + "epoch": 2.6765557163531115, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.1828, + "step": 11805 + }, + { + "epoch": 2.67776169802219, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8892, + "step": 11810 + }, + { + "epoch": 2.6789676796912687, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1484, + "step": 11815 + }, + { + "epoch": 2.6801736613603473, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1791, + "step": 11820 + }, + { + "epoch": 2.681379643029426, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0164, + "step": 11825 + }, + { + "epoch": 2.6825856246985045, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 4.1231, + "step": 11830 + }, + { + "epoch": 2.683791606367583, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.0432, + "step": 11835 + }, + { + "epoch": 2.684997588036662, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.1452, + "step": 11840 + }, + { + "epoch": 2.6862035697057403, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.136, + "step": 11845 + }, + { + "epoch": 2.6874095513748193, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0017, + "step": 11850 + }, + { + "epoch": 2.688615533043898, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0565, + "step": 11855 + }, + { + "epoch": 2.6898215147129765, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0813, + "step": 11860 + }, + { + "epoch": 2.691027496382055, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.1229, + "step": 11865 + }, + { + "epoch": 2.6922334780511337, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0003, + "step": 11870 + }, + { + "epoch": 2.6934394597202123, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0572, + "step": 11875 + }, + { + "epoch": 2.694645441389291, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0974, + "step": 11880 + }, + { + "epoch": 2.6958514230583694, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.1187, + "step": 11885 + }, + { + "epoch": 2.697057404727448, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9405, + "step": 11890 + }, + { + "epoch": 2.6982633863965266, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1633, + "step": 11895 + }, + { + "epoch": 2.6994693680656052, + "grad_norm": 3.328125, + "learning_rate": 3e-05, + "loss": 4.0612, + "step": 11900 + }, + { + "epoch": 2.7006753497346843, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.1518, + "step": 11905 + }, + { + "epoch": 2.7018813314037624, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0476, + "step": 11910 + }, + { + "epoch": 2.7030873130728414, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1044, + "step": 11915 + }, + { + "epoch": 2.70429329474192, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9564, + "step": 11920 + }, + { + "epoch": 2.7054992764109986, + "grad_norm": 1.921875, + "learning_rate": 3e-05, + "loss": 4.1638, + "step": 11925 + }, + { + "epoch": 2.7067052580800772, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1089, + "step": 11930 + }, + { + "epoch": 2.707911239749156, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.1196, + "step": 11935 + }, + { + "epoch": 2.7091172214182344, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0696, + "step": 11940 + }, + { + "epoch": 2.710323203087313, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0352, + "step": 11945 + }, + { + "epoch": 2.7115291847563916, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0309, + "step": 11950 + }, + { + "epoch": 2.71273516642547, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0522, + "step": 11955 + }, + { + "epoch": 2.7139411480945492, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.1092, + "step": 11960 + }, + { + "epoch": 2.7151471297636274, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0894, + "step": 11965 + }, + { + "epoch": 2.7163531114327064, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.1613, + "step": 11970 + }, + { + "epoch": 2.717559093101785, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9797, + "step": 11975 + }, + { + "epoch": 2.7187650747708636, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.2203, + "step": 11980 + }, + { + "epoch": 2.719971056439942, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.1743, + "step": 11985 + }, + { + "epoch": 2.721177038109021, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0163, + "step": 11990 + }, + { + "epoch": 2.7223830197780994, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.2629, + "step": 11995 + }, + { + "epoch": 2.723589001447178, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.0506, + "step": 12000 + }, + { + "epoch": 2.7247949831162566, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0748, + "step": 12005 + }, + { + "epoch": 2.726000964785335, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0374, + "step": 12010 + }, + { + "epoch": 2.7272069464544137, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9765, + "step": 12015 + }, + { + "epoch": 2.7284129281234923, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0611, + "step": 12020 + }, + { + "epoch": 2.7296189097925714, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0109, + "step": 12025 + }, + { + "epoch": 2.7308248914616495, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1325, + "step": 12030 + }, + { + "epoch": 2.7320308731307286, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1419, + "step": 12035 + }, + { + "epoch": 2.733236854799807, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.0651, + "step": 12040 + }, + { + "epoch": 2.7344428364688858, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0466, + "step": 12045 + }, + { + "epoch": 2.7356488181379643, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0669, + "step": 12050 + }, + { + "epoch": 2.736854799807043, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1362, + "step": 12055 + }, + { + "epoch": 2.7380607814761215, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0632, + "step": 12060 + }, + { + "epoch": 2.7392667631452, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0498, + "step": 12065 + }, + { + "epoch": 2.7404727448142787, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0151, + "step": 12070 + }, + { + "epoch": 2.7416787264833573, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0584, + "step": 12075 + }, + { + "epoch": 2.7428847081524363, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.2482, + "step": 12080 + }, + { + "epoch": 2.7440906898215145, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1087, + "step": 12085 + }, + { + "epoch": 2.7452966714905935, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.1298, + "step": 12090 + }, + { + "epoch": 2.746502653159672, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.0261, + "step": 12095 + }, + { + "epoch": 2.7477086348287507, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9859, + "step": 12100 + }, + { + "epoch": 2.7489146164978293, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8792, + "step": 12105 + }, + { + "epoch": 2.750120598166908, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.1922, + "step": 12110 + }, + { + "epoch": 2.7513265798359865, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.2514, + "step": 12115 + }, + { + "epoch": 2.752532561505065, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.2351, + "step": 12120 + }, + { + "epoch": 2.7537385431741437, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.2309, + "step": 12125 + }, + { + "epoch": 2.7549445248432223, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9893, + "step": 12130 + }, + { + "epoch": 2.756150506512301, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.1744, + "step": 12135 + }, + { + "epoch": 2.7573564881813795, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.0635, + "step": 12140 + }, + { + "epoch": 2.7585624698504585, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.2654, + "step": 12145 + }, + { + "epoch": 2.7597684515195366, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1203, + "step": 12150 + }, + { + "epoch": 2.7609744331886157, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 4.0367, + "step": 12155 + }, + { + "epoch": 2.7621804148576943, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.1486, + "step": 12160 + }, + { + "epoch": 2.763386396526773, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0081, + "step": 12165 + }, + { + "epoch": 2.7645923781958515, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9865, + "step": 12170 + }, + { + "epoch": 2.76579835986493, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.0075, + "step": 12175 + }, + { + "epoch": 2.7670043415340086, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0424, + "step": 12180 + }, + { + "epoch": 2.7682103232030872, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.2542, + "step": 12185 + }, + { + "epoch": 2.769416304872166, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.055, + "step": 12190 + }, + { + "epoch": 2.7706222865412444, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.045, + "step": 12195 + }, + { + "epoch": 2.7718282682103235, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.0115, + "step": 12200 + }, + { + "epoch": 2.7730342498794016, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.989, + "step": 12205 + }, + { + "epoch": 2.7742402315484807, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8249, + "step": 12210 + }, + { + "epoch": 2.7754462132175592, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.3465, + "step": 12215 + }, + { + "epoch": 2.776652194886638, + "grad_norm": 3.90625, + "learning_rate": 3e-05, + "loss": 4.3276, + "step": 12220 + }, + { + "epoch": 2.7778581765557164, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.1293, + "step": 12225 + }, + { + "epoch": 2.779064158224795, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.0856, + "step": 12230 + }, + { + "epoch": 2.7802701398938736, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 4.0849, + "step": 12235 + }, + { + "epoch": 2.781476121562952, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0482, + "step": 12240 + }, + { + "epoch": 2.782682103232031, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0615, + "step": 12245 + }, + { + "epoch": 2.7838880849011094, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.2041, + "step": 12250 + }, + { + "epoch": 2.785094066570188, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0746, + "step": 12255 + }, + { + "epoch": 2.7863000482392666, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.1682, + "step": 12260 + }, + { + "epoch": 2.7875060299083456, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.1719, + "step": 12265 + }, + { + "epoch": 2.7887120115774238, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.333, + "step": 12270 + }, + { + "epoch": 2.789917993246503, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0211, + "step": 12275 + }, + { + "epoch": 2.7911239749155814, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0016, + "step": 12280 + }, + { + "epoch": 2.79232995658466, + "grad_norm": 3.3125, + "learning_rate": 3e-05, + "loss": 4.236, + "step": 12285 + }, + { + "epoch": 2.7935359382537386, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1864, + "step": 12290 + }, + { + "epoch": 2.794741919922817, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0912, + "step": 12295 + }, + { + "epoch": 2.7959479015918958, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.1669, + "step": 12300 + }, + { + "epoch": 2.7971538832609744, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0738, + "step": 12305 + }, + { + "epoch": 2.798359864930053, + "grad_norm": 3.46875, + "learning_rate": 3e-05, + "loss": 4.1521, + "step": 12310 + }, + { + "epoch": 2.7995658465991315, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.1464, + "step": 12315 + }, + { + "epoch": 2.8007718282682106, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.1016, + "step": 12320 + }, + { + "epoch": 2.8019778099372887, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.0554, + "step": 12325 + }, + { + "epoch": 2.8031837916063678, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1224, + "step": 12330 + }, + { + "epoch": 2.8043897732754464, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.1699, + "step": 12335 + }, + { + "epoch": 2.805595754944525, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0422, + "step": 12340 + }, + { + "epoch": 2.8068017366136035, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1896, + "step": 12345 + }, + { + "epoch": 2.808007718282682, + "grad_norm": 3.25, + "learning_rate": 3e-05, + "loss": 4.1451, + "step": 12350 + }, + { + "epoch": 2.8092136999517607, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0473, + "step": 12355 + }, + { + "epoch": 2.8104196816208393, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 4.0793, + "step": 12360 + }, + { + "epoch": 2.811625663289918, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0617, + "step": 12365 + }, + { + "epoch": 2.8128316449589965, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.169, + "step": 12370 + }, + { + "epoch": 2.814037626628075, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9993, + "step": 12375 + }, + { + "epoch": 2.8152436082971537, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.1945, + "step": 12380 + }, + { + "epoch": 2.8164495899662327, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0516, + "step": 12385 + }, + { + "epoch": 2.817655571635311, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.1206, + "step": 12390 + }, + { + "epoch": 2.81886155330439, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1641, + "step": 12395 + }, + { + "epoch": 2.8200675349734685, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.9757, + "step": 12400 + }, + { + "epoch": 2.821273516642547, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0516, + "step": 12405 + }, + { + "epoch": 2.8224794983116257, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.0335, + "step": 12410 + }, + { + "epoch": 2.8236854799807043, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0036, + "step": 12415 + }, + { + "epoch": 2.824891461649783, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.1023, + "step": 12420 + }, + { + "epoch": 2.8260974433188615, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0767, + "step": 12425 + }, + { + "epoch": 2.82730342498794, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.1463, + "step": 12430 + }, + { + "epoch": 2.8285094066570187, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1257, + "step": 12435 + }, + { + "epoch": 2.8297153883260977, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.1518, + "step": 12440 + }, + { + "epoch": 2.830921369995176, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0254, + "step": 12445 + }, + { + "epoch": 2.832127351664255, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.1223, + "step": 12450 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0784, + "step": 12455 + }, + { + "epoch": 2.834539315002412, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1354, + "step": 12460 + }, + { + "epoch": 2.8357452966714907, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.9273, + "step": 12465 + }, + { + "epoch": 2.8369512783405693, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.127, + "step": 12470 + }, + { + "epoch": 2.838157260009648, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.1316, + "step": 12475 + }, + { + "epoch": 2.8393632416787264, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.1644, + "step": 12480 + }, + { + "epoch": 2.840569223347805, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.1627, + "step": 12485 + }, + { + "epoch": 2.8417752050168836, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9926, + "step": 12490 + }, + { + "epoch": 2.8429811866859622, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9624, + "step": 12495 + }, + { + "epoch": 2.844187168355041, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0651, + "step": 12500 + }, + { + "epoch": 2.84539315002412, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0333, + "step": 12505 + }, + { + "epoch": 2.846599131693198, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0222, + "step": 12510 + }, + { + "epoch": 2.847805113362277, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.2556, + "step": 12515 + }, + { + "epoch": 2.8490110950313556, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.046, + "step": 12520 + }, + { + "epoch": 2.8502170767004342, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.2979, + "step": 12525 + }, + { + "epoch": 2.851423058369513, + "grad_norm": 3.21875, + "learning_rate": 3e-05, + "loss": 4.1178, + "step": 12530 + }, + { + "epoch": 2.8526290400385914, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0731, + "step": 12535 + }, + { + "epoch": 2.85383502170767, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.0941, + "step": 12540 + }, + { + "epoch": 2.8550410033767486, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9681, + "step": 12545 + }, + { + "epoch": 2.856246985045827, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9995, + "step": 12550 + }, + { + "epoch": 2.857452966714906, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0625, + "step": 12555 + }, + { + "epoch": 2.858658948383985, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.1462, + "step": 12560 + }, + { + "epoch": 2.859864930053063, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.967, + "step": 12565 + }, + { + "epoch": 2.861070911722142, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9447, + "step": 12570 + }, + { + "epoch": 2.8622768933912206, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.1469, + "step": 12575 + }, + { + "epoch": 2.863482875060299, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.2723, + "step": 12580 + }, + { + "epoch": 2.864688856729378, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0287, + "step": 12585 + }, + { + "epoch": 2.8658948383984564, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1408, + "step": 12590 + }, + { + "epoch": 2.867100820067535, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0394, + "step": 12595 + }, + { + "epoch": 2.8683068017366136, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 4.0287, + "step": 12600 + }, + { + "epoch": 2.869512783405692, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0445, + "step": 12605 + }, + { + "epoch": 2.8707187650747708, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0422, + "step": 12610 + }, + { + "epoch": 2.8719247467438493, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0228, + "step": 12615 + }, + { + "epoch": 2.873130728412928, + "grad_norm": 3.890625, + "learning_rate": 3e-05, + "loss": 4.1007, + "step": 12620 + }, + { + "epoch": 2.874336710082007, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0885, + "step": 12625 + }, + { + "epoch": 2.875542691751085, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9827, + "step": 12630 + }, + { + "epoch": 2.876748673420164, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.1984, + "step": 12635 + }, + { + "epoch": 2.8779546550892428, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9131, + "step": 12640 + }, + { + "epoch": 2.8791606367583213, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 3.9683, + "step": 12645 + }, + { + "epoch": 2.8803666184274, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0378, + "step": 12650 + }, + { + "epoch": 2.8815726000964785, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.148, + "step": 12655 + }, + { + "epoch": 2.882778581765557, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.8546, + "step": 12660 + }, + { + "epoch": 2.8839845634346357, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.193, + "step": 12665 + }, + { + "epoch": 2.8851905451037143, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1806, + "step": 12670 + }, + { + "epoch": 2.886396526772793, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0654, + "step": 12675 + }, + { + "epoch": 2.887602508441872, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.2687, + "step": 12680 + }, + { + "epoch": 2.88880849011095, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8384, + "step": 12685 + }, + { + "epoch": 2.890014471780029, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.2077, + "step": 12690 + }, + { + "epoch": 2.8912204534491077, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.0338, + "step": 12695 + }, + { + "epoch": 2.8924264351181863, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0343, + "step": 12700 + }, + { + "epoch": 2.893632416787265, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.1802, + "step": 12705 + }, + { + "epoch": 2.8948383984563435, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0843, + "step": 12710 + }, + { + "epoch": 2.896044380125422, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.014, + "step": 12715 + }, + { + "epoch": 2.8972503617945007, + "grad_norm": 4.09375, + "learning_rate": 3e-05, + "loss": 4.1161, + "step": 12720 + }, + { + "epoch": 2.8984563434635793, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9769, + "step": 12725 + }, + { + "epoch": 2.899662325132658, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.1166, + "step": 12730 + }, + { + "epoch": 2.9008683068017365, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0279, + "step": 12735 + }, + { + "epoch": 2.902074288470815, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1594, + "step": 12740 + }, + { + "epoch": 2.903280270139894, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.0105, + "step": 12745 + }, + { + "epoch": 2.9044862518089722, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.2134, + "step": 12750 + }, + { + "epoch": 2.9056922334780513, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0715, + "step": 12755 + }, + { + "epoch": 2.90689821514713, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.1808, + "step": 12760 + }, + { + "epoch": 2.9081041968162085, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0269, + "step": 12765 + }, + { + "epoch": 2.909310178485287, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.167, + "step": 12770 + }, + { + "epoch": 2.9105161601543657, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0173, + "step": 12775 + }, + { + "epoch": 2.9117221418234442, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9264, + "step": 12780 + }, + { + "epoch": 2.912928123492523, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.042, + "step": 12785 + }, + { + "epoch": 2.9141341051616014, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0058, + "step": 12790 + }, + { + "epoch": 2.91534008683068, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0471, + "step": 12795 + }, + { + "epoch": 2.916546068499759, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9633, + "step": 12800 + }, + { + "epoch": 2.917752050168837, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1622, + "step": 12805 + }, + { + "epoch": 2.9189580318379162, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.131, + "step": 12810 + }, + { + "epoch": 2.920164013506995, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0153, + "step": 12815 + }, + { + "epoch": 2.9213699951760734, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1133, + "step": 12820 + }, + { + "epoch": 2.922575976845152, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.0067, + "step": 12825 + }, + { + "epoch": 2.9237819585142306, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.013, + "step": 12830 + }, + { + "epoch": 2.924987940183309, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9936, + "step": 12835 + }, + { + "epoch": 2.926193921852388, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.1483, + "step": 12840 + }, + { + "epoch": 2.9273999035214664, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0721, + "step": 12845 + }, + { + "epoch": 2.928605885190545, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.9175, + "step": 12850 + }, + { + "epoch": 2.9298118668596236, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.2047, + "step": 12855 + }, + { + "epoch": 2.931017848528702, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.1067, + "step": 12860 + }, + { + "epoch": 2.932223830197781, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0596, + "step": 12865 + }, + { + "epoch": 2.9334298118668594, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 4.1225, + "step": 12870 + }, + { + "epoch": 2.9346357935359384, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0444, + "step": 12875 + }, + { + "epoch": 2.935841775205017, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1243, + "step": 12880 + }, + { + "epoch": 2.9370477568740956, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9408, + "step": 12885 + }, + { + "epoch": 2.938253738543174, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.1349, + "step": 12890 + }, + { + "epoch": 2.9394597202122528, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0184, + "step": 12895 + }, + { + "epoch": 2.9406657018813314, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0827, + "step": 12900 + }, + { + "epoch": 2.94187168355041, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0646, + "step": 12905 + }, + { + "epoch": 2.9430776652194885, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.043, + "step": 12910 + }, + { + "epoch": 2.944283646888567, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1694, + "step": 12915 + }, + { + "epoch": 2.945489628557646, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0645, + "step": 12920 + }, + { + "epoch": 2.9466956102267243, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1264, + "step": 12925 + }, + { + "epoch": 2.9479015918958034, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9371, + "step": 12930 + }, + { + "epoch": 2.949107573564882, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0511, + "step": 12935 + }, + { + "epoch": 2.9503135552339605, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 4.171, + "step": 12940 + }, + { + "epoch": 2.951519536903039, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0845, + "step": 12945 + }, + { + "epoch": 2.9527255185721177, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9512, + "step": 12950 + }, + { + "epoch": 2.9539315002411963, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9474, + "step": 12955 + }, + { + "epoch": 2.955137481910275, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.1066, + "step": 12960 + }, + { + "epoch": 2.9563434635793535, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9189, + "step": 12965 + }, + { + "epoch": 2.957549445248432, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0173, + "step": 12970 + }, + { + "epoch": 2.9587554269175107, + "grad_norm": 3.59375, + "learning_rate": 3e-05, + "loss": 4.139, + "step": 12975 + }, + { + "epoch": 2.9599614085865893, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0827, + "step": 12980 + }, + { + "epoch": 2.9611673902556683, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9359, + "step": 12985 + }, + { + "epoch": 2.9623733719247465, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8856, + "step": 12990 + }, + { + "epoch": 2.9635793535938255, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1431, + "step": 12995 + }, + { + "epoch": 2.964785335262904, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1186, + "step": 13000 + }, + { + "epoch": 2.9659913169319827, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.1581, + "step": 13005 + }, + { + "epoch": 2.9671972986010613, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9509, + "step": 13010 + }, + { + "epoch": 2.96840328027014, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.9609, + "step": 13015 + }, + { + "epoch": 2.9696092619392185, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9756, + "step": 13020 + }, + { + "epoch": 2.970815243608297, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 4.0841, + "step": 13025 + }, + { + "epoch": 2.9720212252773757, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.0665, + "step": 13030 + }, + { + "epoch": 2.9732272069464543, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.1547, + "step": 13035 + }, + { + "epoch": 2.9744331886155333, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 4.1022, + "step": 13040 + }, + { + "epoch": 2.9756391702846114, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.9579, + "step": 13045 + }, + { + "epoch": 2.9768451519536905, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8809, + "step": 13050 + }, + { + "epoch": 2.978051133622769, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0751, + "step": 13055 + }, + { + "epoch": 2.9792571152918477, + "grad_norm": 3.84375, + "learning_rate": 3e-05, + "loss": 4.0962, + "step": 13060 + }, + { + "epoch": 2.9804630969609263, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.1125, + "step": 13065 + }, + { + "epoch": 2.981669078630005, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1935, + "step": 13070 + }, + { + "epoch": 2.9828750602990834, + "grad_norm": 3.390625, + "learning_rate": 3e-05, + "loss": 4.0026, + "step": 13075 + }, + { + "epoch": 2.984081041968162, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.015, + "step": 13080 + }, + { + "epoch": 2.9852870236372406, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9765, + "step": 13085 + }, + { + "epoch": 2.9864930053063192, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.921, + "step": 13090 + }, + { + "epoch": 2.987698986975398, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9594, + "step": 13095 + }, + { + "epoch": 2.9889049686444764, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.213, + "step": 13100 + }, + { + "epoch": 2.9901109503135554, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.053, + "step": 13105 + }, + { + "epoch": 2.9913169319826336, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0336, + "step": 13110 + }, + { + "epoch": 2.9925229136517126, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0058, + "step": 13115 + }, + { + "epoch": 2.9937288953207912, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1504, + "step": 13120 + }, + { + "epoch": 2.99493487698987, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9082, + "step": 13125 + }, + { + "epoch": 2.9961408586589484, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0186, + "step": 13130 + }, + { + "epoch": 2.997346840328027, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0363, + "step": 13135 + }, + { + "epoch": 2.9985528219971056, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.099, + "step": 13140 + }, + { + "epoch": 2.999758803666184, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.1806, + "step": 13145 + }, + { + "epoch": 3.000964785335263, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9605, + "step": 13150 + }, + { + "epoch": 3.0021707670043414, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9325, + "step": 13155 + }, + { + "epoch": 3.00337674867342, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.1737, + "step": 13160 + }, + { + "epoch": 3.004582730342499, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.0322, + "step": 13165 + }, + { + "epoch": 3.0057887120115776, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1062, + "step": 13170 + }, + { + "epoch": 3.006994693680656, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9885, + "step": 13175 + }, + { + "epoch": 3.008200675349735, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9667, + "step": 13180 + }, + { + "epoch": 3.0094066570188134, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 4.1605, + "step": 13185 + }, + { + "epoch": 3.010612638687892, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1717, + "step": 13190 + }, + { + "epoch": 3.0118186203569706, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.8992, + "step": 13195 + }, + { + "epoch": 3.013024602026049, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0766, + "step": 13200 + }, + { + "epoch": 3.0142305836951278, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9022, + "step": 13205 + }, + { + "epoch": 3.0154365653642063, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0784, + "step": 13210 + }, + { + "epoch": 3.016642547033285, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.1089, + "step": 13215 + }, + { + "epoch": 3.0178485287023635, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0501, + "step": 13220 + }, + { + "epoch": 3.0190545103714426, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9829, + "step": 13225 + }, + { + "epoch": 3.020260492040521, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9425, + "step": 13230 + }, + { + "epoch": 3.0214664737095998, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.101, + "step": 13235 + }, + { + "epoch": 3.0226724553786783, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9544, + "step": 13240 + }, + { + "epoch": 3.023878437047757, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0155, + "step": 13245 + }, + { + "epoch": 3.0250844187168355, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0865, + "step": 13250 + }, + { + "epoch": 3.026290400385914, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9694, + "step": 13255 + }, + { + "epoch": 3.0274963820549927, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.027, + "step": 13260 + }, + { + "epoch": 3.0287023637240713, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0497, + "step": 13265 + }, + { + "epoch": 3.02990834539315, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0664, + "step": 13270 + }, + { + "epoch": 3.0311143270622285, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.9588, + "step": 13275 + }, + { + "epoch": 3.032320308731307, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.068, + "step": 13280 + }, + { + "epoch": 3.033526290400386, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9816, + "step": 13285 + }, + { + "epoch": 3.0347322720694647, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.1299, + "step": 13290 + }, + { + "epoch": 3.0359382537385433, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8364, + "step": 13295 + }, + { + "epoch": 3.037144235407622, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.1571, + "step": 13300 + }, + { + "epoch": 3.0383502170767005, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0643, + "step": 13305 + }, + { + "epoch": 3.039556198745779, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 4.0234, + "step": 13310 + }, + { + "epoch": 3.0407621804148577, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.1982, + "step": 13315 + }, + { + "epoch": 3.0419681620839363, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 4.0241, + "step": 13320 + }, + { + "epoch": 3.043174143753015, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1117, + "step": 13325 + }, + { + "epoch": 3.0443801254220935, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0489, + "step": 13330 + }, + { + "epoch": 3.045586107091172, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.974, + "step": 13335 + }, + { + "epoch": 3.0467920887602506, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1092, + "step": 13340 + }, + { + "epoch": 3.0479980704293297, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0894, + "step": 13345 + }, + { + "epoch": 3.0492040520984083, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9961, + "step": 13350 + }, + { + "epoch": 3.050410033767487, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9915, + "step": 13355 + }, + { + "epoch": 3.0516160154365655, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.1449, + "step": 13360 + }, + { + "epoch": 3.052821997105644, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0539, + "step": 13365 + }, + { + "epoch": 3.0540279787747227, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.2002, + "step": 13370 + }, + { + "epoch": 3.0552339604438012, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.934, + "step": 13375 + }, + { + "epoch": 3.05643994211288, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0279, + "step": 13380 + }, + { + "epoch": 3.0576459237819584, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.0714, + "step": 13385 + }, + { + "epoch": 3.058851905451037, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8936, + "step": 13390 + }, + { + "epoch": 3.0600578871201156, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.9457, + "step": 13395 + }, + { + "epoch": 3.061263868789194, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8864, + "step": 13400 + }, + { + "epoch": 3.0624698504582732, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0401, + "step": 13405 + }, + { + "epoch": 3.063675832127352, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0836, + "step": 13410 + }, + { + "epoch": 3.0648818137964304, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9798, + "step": 13415 + }, + { + "epoch": 3.066087795465509, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.966, + "step": 13420 + }, + { + "epoch": 3.0672937771345876, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9688, + "step": 13425 + }, + { + "epoch": 3.068499758803666, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0855, + "step": 13430 + }, + { + "epoch": 3.069705740472745, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0465, + "step": 13435 + }, + { + "epoch": 3.0709117221418234, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9355, + "step": 13440 + }, + { + "epoch": 3.072117703810902, + "grad_norm": 3.359375, + "learning_rate": 3e-05, + "loss": 3.9578, + "step": 13445 + }, + { + "epoch": 3.0733236854799806, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.1251, + "step": 13450 + }, + { + "epoch": 3.074529667149059, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.0133, + "step": 13455 + }, + { + "epoch": 3.0757356488181378, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.0401, + "step": 13460 + }, + { + "epoch": 3.076941630487217, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9156, + "step": 13465 + }, + { + "epoch": 3.0781476121562954, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0138, + "step": 13470 + }, + { + "epoch": 3.079353593825374, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8903, + "step": 13475 + }, + { + "epoch": 3.0805595754944526, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.9927, + "step": 13480 + }, + { + "epoch": 3.081765557163531, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0072, + "step": 13485 + }, + { + "epoch": 3.0829715388326098, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0956, + "step": 13490 + }, + { + "epoch": 3.0841775205016884, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1261, + "step": 13495 + }, + { + "epoch": 3.085383502170767, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.157, + "step": 13500 + }, + { + "epoch": 3.0865894838398455, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9861, + "step": 13505 + }, + { + "epoch": 3.087795465508924, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.124, + "step": 13510 + }, + { + "epoch": 3.0890014471780027, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0408, + "step": 13515 + }, + { + "epoch": 3.0902074288470813, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8824, + "step": 13520 + }, + { + "epoch": 3.0914134105161604, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.057, + "step": 13525 + }, + { + "epoch": 3.092619392185239, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9654, + "step": 13530 + }, + { + "epoch": 3.0938253738543176, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0035, + "step": 13535 + }, + { + "epoch": 3.095031355523396, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9868, + "step": 13540 + }, + { + "epoch": 3.0962373371924747, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.1533, + "step": 13545 + }, + { + "epoch": 3.0974433188615533, + "grad_norm": 3.21875, + "learning_rate": 3e-05, + "loss": 4.0024, + "step": 13550 + }, + { + "epoch": 3.098649300530632, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.1052, + "step": 13555 + }, + { + "epoch": 3.0998552821997105, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1562, + "step": 13560 + }, + { + "epoch": 3.101061263868789, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9155, + "step": 13565 + }, + { + "epoch": 3.1022672455378677, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9797, + "step": 13570 + }, + { + "epoch": 3.1034732272069463, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1746, + "step": 13575 + }, + { + "epoch": 3.104679208876025, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0701, + "step": 13580 + }, + { + "epoch": 3.105885190545104, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9489, + "step": 13585 + }, + { + "epoch": 3.1070911722141825, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0601, + "step": 13590 + }, + { + "epoch": 3.108297153883261, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0415, + "step": 13595 + }, + { + "epoch": 3.1095031355523397, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0321, + "step": 13600 + }, + { + "epoch": 3.1107091172214183, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.048, + "step": 13605 + }, + { + "epoch": 3.111915098890497, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0143, + "step": 13610 + }, + { + "epoch": 3.1131210805595755, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.118, + "step": 13615 + }, + { + "epoch": 3.114327062228654, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9826, + "step": 13620 + }, + { + "epoch": 3.1155330438977327, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0644, + "step": 13625 + }, + { + "epoch": 3.1167390255668113, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.2254, + "step": 13630 + }, + { + "epoch": 3.11794500723589, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.028, + "step": 13635 + }, + { + "epoch": 3.1191509889049684, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0133, + "step": 13640 + }, + { + "epoch": 3.1203569705740475, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0251, + "step": 13645 + }, + { + "epoch": 3.121562952243126, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9699, + "step": 13650 + }, + { + "epoch": 3.1227689339122047, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9452, + "step": 13655 + }, + { + "epoch": 3.1239749155812833, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9581, + "step": 13660 + }, + { + "epoch": 3.125180897250362, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9676, + "step": 13665 + }, + { + "epoch": 3.1263868789194404, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1417, + "step": 13670 + }, + { + "epoch": 3.127592860588519, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9902, + "step": 13675 + }, + { + "epoch": 3.1287988422575976, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0673, + "step": 13680 + }, + { + "epoch": 3.1300048239266762, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.998, + "step": 13685 + }, + { + "epoch": 3.131210805595755, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9742, + "step": 13690 + }, + { + "epoch": 3.1324167872648334, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.0822, + "step": 13695 + }, + { + "epoch": 3.133622768933912, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.1118, + "step": 13700 + }, + { + "epoch": 3.134828750602991, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.0555, + "step": 13705 + }, + { + "epoch": 3.1360347322720696, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0973, + "step": 13710 + }, + { + "epoch": 3.1372407139411482, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9268, + "step": 13715 + }, + { + "epoch": 3.138446695610227, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.015, + "step": 13720 + }, + { + "epoch": 3.1396526772793054, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.0813, + "step": 13725 + }, + { + "epoch": 3.140858658948384, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0813, + "step": 13730 + }, + { + "epoch": 3.1420646406174626, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 3.8976, + "step": 13735 + }, + { + "epoch": 3.143270622286541, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0243, + "step": 13740 + }, + { + "epoch": 3.14447660395562, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0255, + "step": 13745 + }, + { + "epoch": 3.1456825856246984, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.0229, + "step": 13750 + }, + { + "epoch": 3.146888567293777, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0125, + "step": 13755 + }, + { + "epoch": 3.1480945489628556, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0281, + "step": 13760 + }, + { + "epoch": 3.1493005306319346, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9351, + "step": 13765 + }, + { + "epoch": 3.150506512301013, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9611, + "step": 13770 + }, + { + "epoch": 3.151712493970092, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8776, + "step": 13775 + }, + { + "epoch": 3.1529184756391704, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9645, + "step": 13780 + }, + { + "epoch": 3.154124457308249, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0643, + "step": 13785 + }, + { + "epoch": 3.1553304389773276, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1082, + "step": 13790 + }, + { + "epoch": 3.156536420646406, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0164, + "step": 13795 + }, + { + "epoch": 3.1577424023154848, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.1538, + "step": 13800 + }, + { + "epoch": 3.1589483839845633, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1769, + "step": 13805 + }, + { + "epoch": 3.160154365653642, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9665, + "step": 13810 + }, + { + "epoch": 3.1613603473227205, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8724, + "step": 13815 + }, + { + "epoch": 3.162566328991799, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.8985, + "step": 13820 + }, + { + "epoch": 3.163772310660878, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1009, + "step": 13825 + }, + { + "epoch": 3.1649782923299568, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.1008, + "step": 13830 + }, + { + "epoch": 3.1661842739990353, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0372, + "step": 13835 + }, + { + "epoch": 3.167390255668114, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.014, + "step": 13840 + }, + { + "epoch": 3.1685962373371925, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.8106, + "step": 13845 + }, + { + "epoch": 3.169802219006271, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0814, + "step": 13850 + }, + { + "epoch": 3.1710082006753497, + "grad_norm": 3.28125, + "learning_rate": 3e-05, + "loss": 4.0285, + "step": 13855 + }, + { + "epoch": 3.1722141823444283, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9147, + "step": 13860 + }, + { + "epoch": 3.173420164013507, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.2018, + "step": 13865 + }, + { + "epoch": 3.1746261456825855, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.0175, + "step": 13870 + }, + { + "epoch": 3.175832127351664, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0163, + "step": 13875 + }, + { + "epoch": 3.1770381090207427, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0386, + "step": 13880 + }, + { + "epoch": 3.1782440906898217, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9444, + "step": 13885 + }, + { + "epoch": 3.1794500723589003, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.9977, + "step": 13890 + }, + { + "epoch": 3.180656054027979, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.1229, + "step": 13895 + }, + { + "epoch": 3.1818620356970575, + "grad_norm": 3.515625, + "learning_rate": 3e-05, + "loss": 4.0196, + "step": 13900 + }, + { + "epoch": 3.183068017366136, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.1247, + "step": 13905 + }, + { + "epoch": 3.1842739990352147, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9441, + "step": 13910 + }, + { + "epoch": 3.1854799807042933, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0142, + "step": 13915 + }, + { + "epoch": 3.186685962373372, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.9364, + "step": 13920 + }, + { + "epoch": 3.1878919440424505, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0615, + "step": 13925 + }, + { + "epoch": 3.189097925711529, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0958, + "step": 13930 + }, + { + "epoch": 3.1903039073806077, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0952, + "step": 13935 + }, + { + "epoch": 3.1915098890496862, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.993, + "step": 13940 + }, + { + "epoch": 3.1927158707187653, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1895, + "step": 13945 + }, + { + "epoch": 3.193921852387844, + "grad_norm": 1.921875, + "learning_rate": 3e-05, + "loss": 4.0379, + "step": 13950 + }, + { + "epoch": 3.1951278340569225, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9688, + "step": 13955 + }, + { + "epoch": 3.196333815726001, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.9644, + "step": 13960 + }, + { + "epoch": 3.1975397973950797, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9363, + "step": 13965 + }, + { + "epoch": 3.1987457790641582, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.9434, + "step": 13970 + }, + { + "epoch": 3.199951760733237, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0571, + "step": 13975 + }, + { + "epoch": 3.2011577424023154, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0266, + "step": 13980 + }, + { + "epoch": 3.202363724071394, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.034, + "step": 13985 + }, + { + "epoch": 3.2035697057404726, + "grad_norm": 3.296875, + "learning_rate": 3e-05, + "loss": 4.1168, + "step": 13990 + }, + { + "epoch": 3.204775687409551, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.2014, + "step": 13995 + }, + { + "epoch": 3.20598166907863, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.016, + "step": 14000 + }, + { + "epoch": 3.207187650747709, + "grad_norm": 2.015625, + "learning_rate": 3e-05, + "loss": 3.899, + "step": 14005 + }, + { + "epoch": 3.2083936324167874, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.1745, + "step": 14010 + }, + { + "epoch": 3.209599614085866, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9857, + "step": 14015 + }, + { + "epoch": 3.2108055957549446, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.0834, + "step": 14020 + }, + { + "epoch": 3.212011577424023, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9924, + "step": 14025 + }, + { + "epoch": 3.213217559093102, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0381, + "step": 14030 + }, + { + "epoch": 3.2144235407621804, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9991, + "step": 14035 + }, + { + "epoch": 3.215629522431259, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.0751, + "step": 14040 + }, + { + "epoch": 3.2168355041003376, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0358, + "step": 14045 + }, + { + "epoch": 3.218041485769416, + "grad_norm": 3.40625, + "learning_rate": 3e-05, + "loss": 4.0009, + "step": 14050 + }, + { + "epoch": 3.2192474674384948, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.994, + "step": 14055 + }, + { + "epoch": 3.2204534491075734, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0452, + "step": 14060 + }, + { + "epoch": 3.2216594307766524, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0047, + "step": 14065 + }, + { + "epoch": 3.222865412445731, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8072, + "step": 14070 + }, + { + "epoch": 3.2240713941148096, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.1151, + "step": 14075 + }, + { + "epoch": 3.225277375783888, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.9328, + "step": 14080 + }, + { + "epoch": 3.2264833574529668, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9196, + "step": 14085 + }, + { + "epoch": 3.2276893391220454, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 4.0239, + "step": 14090 + }, + { + "epoch": 3.228895320791124, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9504, + "step": 14095 + }, + { + "epoch": 3.2301013024602026, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8764, + "step": 14100 + }, + { + "epoch": 3.231307284129281, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9942, + "step": 14105 + }, + { + "epoch": 3.2325132657983597, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9936, + "step": 14110 + }, + { + "epoch": 3.2337192474674383, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.1299, + "step": 14115 + }, + { + "epoch": 3.234925229136517, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.2354, + "step": 14120 + }, + { + "epoch": 3.236131210805596, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0096, + "step": 14125 + }, + { + "epoch": 3.2373371924746746, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9859, + "step": 14130 + }, + { + "epoch": 3.238543174143753, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9901, + "step": 14135 + }, + { + "epoch": 3.2397491558128317, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.048, + "step": 14140 + }, + { + "epoch": 3.2409551374819103, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8809, + "step": 14145 + }, + { + "epoch": 3.242161119150989, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.1033, + "step": 14150 + }, + { + "epoch": 3.2433671008200675, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0967, + "step": 14155 + }, + { + "epoch": 3.244573082489146, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.9584, + "step": 14160 + }, + { + "epoch": 3.2457790641582247, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.0257, + "step": 14165 + }, + { + "epoch": 3.2469850458273033, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0908, + "step": 14170 + }, + { + "epoch": 3.248191027496382, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0343, + "step": 14175 + }, + { + "epoch": 3.2493970091654605, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.882, + "step": 14180 + }, + { + "epoch": 3.2506029908345395, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.0731, + "step": 14185 + }, + { + "epoch": 3.251808972503618, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9578, + "step": 14190 + }, + { + "epoch": 3.2530149541726967, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0419, + "step": 14195 + }, + { + "epoch": 3.2542209358417753, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8306, + "step": 14200 + }, + { + "epoch": 3.255426917510854, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1088, + "step": 14205 + }, + { + "epoch": 3.2566328991799325, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9569, + "step": 14210 + }, + { + "epoch": 3.257838880849011, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0281, + "step": 14215 + }, + { + "epoch": 3.2590448625180897, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9602, + "step": 14220 + }, + { + "epoch": 3.2602508441871683, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0232, + "step": 14225 + }, + { + "epoch": 3.261456825856247, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.052, + "step": 14230 + }, + { + "epoch": 3.2626628075253254, + "grad_norm": 3.265625, + "learning_rate": 3e-05, + "loss": 3.9574, + "step": 14235 + }, + { + "epoch": 3.2638687891944045, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.9721, + "step": 14240 + }, + { + "epoch": 3.2650747708634826, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0276, + "step": 14245 + }, + { + "epoch": 3.2662807525325617, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9899, + "step": 14250 + }, + { + "epoch": 3.2674867342016403, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.7672, + "step": 14255 + }, + { + "epoch": 3.268692715870719, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.1001, + "step": 14260 + }, + { + "epoch": 3.2698986975397974, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 3.9353, + "step": 14265 + }, + { + "epoch": 3.271104679208876, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.8915, + "step": 14270 + }, + { + "epoch": 3.2723106608779546, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.0175, + "step": 14275 + }, + { + "epoch": 3.2735166425470332, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9433, + "step": 14280 + }, + { + "epoch": 3.274722624216112, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8234, + "step": 14285 + }, + { + "epoch": 3.2759286058851904, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.011, + "step": 14290 + }, + { + "epoch": 3.277134587554269, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.9454, + "step": 14295 + }, + { + "epoch": 3.2783405692233476, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1123, + "step": 14300 + }, + { + "epoch": 3.2795465508924266, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.2219, + "step": 14305 + }, + { + "epoch": 3.2807525325615052, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9955, + "step": 14310 + }, + { + "epoch": 3.281958514230584, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.9424, + "step": 14315 + }, + { + "epoch": 3.2831644958996624, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9537, + "step": 14320 + }, + { + "epoch": 3.284370477568741, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0333, + "step": 14325 + }, + { + "epoch": 3.2855764592378196, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8545, + "step": 14330 + }, + { + "epoch": 3.286782440906898, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.083, + "step": 14335 + }, + { + "epoch": 3.287988422575977, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0703, + "step": 14340 + }, + { + "epoch": 3.2891944042450554, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9802, + "step": 14345 + }, + { + "epoch": 3.290400385914134, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0247, + "step": 14350 + }, + { + "epoch": 3.2916063675832126, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0461, + "step": 14355 + }, + { + "epoch": 3.2928123492522916, + "grad_norm": 3.390625, + "learning_rate": 3e-05, + "loss": 4.0102, + "step": 14360 + }, + { + "epoch": 3.2940183309213698, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8292, + "step": 14365 + }, + { + "epoch": 3.295224312590449, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.0706, + "step": 14370 + }, + { + "epoch": 3.2964302942595274, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.914, + "step": 14375 + }, + { + "epoch": 3.297636275928606, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0359, + "step": 14380 + }, + { + "epoch": 3.2988422575976846, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9523, + "step": 14385 + }, + { + "epoch": 3.300048239266763, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0287, + "step": 14390 + }, + { + "epoch": 3.3012542209358418, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9867, + "step": 14395 + }, + { + "epoch": 3.3024602026049203, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.1293, + "step": 14400 + }, + { + "epoch": 3.303666184273999, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9522, + "step": 14405 + }, + { + "epoch": 3.3048721659430775, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8667, + "step": 14410 + }, + { + "epoch": 3.306078147612156, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9863, + "step": 14415 + }, + { + "epoch": 3.3072841292812347, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.013, + "step": 14420 + }, + { + "epoch": 3.3084901109503138, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9983, + "step": 14425 + }, + { + "epoch": 3.3096960926193923, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.972, + "step": 14430 + }, + { + "epoch": 3.310902074288471, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.8923, + "step": 14435 + }, + { + "epoch": 3.3121080559575495, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8711, + "step": 14440 + }, + { + "epoch": 3.313314037626628, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1087, + "step": 14445 + }, + { + "epoch": 3.3145200192957067, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0565, + "step": 14450 + }, + { + "epoch": 3.3157260009647853, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0258, + "step": 14455 + }, + { + "epoch": 3.316931982633864, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 4.0745, + "step": 14460 + }, + { + "epoch": 3.3181379643029425, + "grad_norm": 3.46875, + "learning_rate": 3e-05, + "loss": 4.0424, + "step": 14465 + }, + { + "epoch": 3.319343945972021, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9648, + "step": 14470 + }, + { + "epoch": 3.3205499276410997, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9329, + "step": 14475 + }, + { + "epoch": 3.3217559093101787, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9133, + "step": 14480 + }, + { + "epoch": 3.322961890979257, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.9903, + "step": 14485 + }, + { + "epoch": 3.324167872648336, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.1389, + "step": 14490 + }, + { + "epoch": 3.3253738543174145, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.96, + "step": 14495 + }, + { + "epoch": 3.326579835986493, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0746, + "step": 14500 + }, + { + "epoch": 3.3277858176555717, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0971, + "step": 14505 + }, + { + "epoch": 3.3289917993246503, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 4.0437, + "step": 14510 + }, + { + "epoch": 3.330197780993729, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9961, + "step": 14515 + }, + { + "epoch": 3.3314037626628075, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0781, + "step": 14520 + }, + { + "epoch": 3.332609744331886, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.9975, + "step": 14525 + }, + { + "epoch": 3.3338157260009647, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9685, + "step": 14530 + }, + { + "epoch": 3.3350217076700432, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9129, + "step": 14535 + }, + { + "epoch": 3.336227689339122, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0438, + "step": 14540 + }, + { + "epoch": 3.337433671008201, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.067, + "step": 14545 + }, + { + "epoch": 3.3386396526772795, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0993, + "step": 14550 + }, + { + "epoch": 3.339845634346358, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0744, + "step": 14555 + }, + { + "epoch": 3.3410516160154367, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0717, + "step": 14560 + }, + { + "epoch": 3.3422575976845152, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 4.0287, + "step": 14565 + }, + { + "epoch": 3.343463579353594, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9798, + "step": 14570 + }, + { + "epoch": 3.3446695610226724, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0063, + "step": 14575 + }, + { + "epoch": 3.345875542691751, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.1417, + "step": 14580 + }, + { + "epoch": 3.3470815243608296, + "grad_norm": 3.359375, + "learning_rate": 3e-05, + "loss": 4.1565, + "step": 14585 + }, + { + "epoch": 3.348287506029908, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0453, + "step": 14590 + }, + { + "epoch": 3.349493487698987, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.0766, + "step": 14595 + }, + { + "epoch": 3.350699469368066, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9373, + "step": 14600 + }, + { + "epoch": 3.351905451037144, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9683, + "step": 14605 + }, + { + "epoch": 3.353111432706223, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1546, + "step": 14610 + }, + { + "epoch": 3.3543174143753016, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9953, + "step": 14615 + }, + { + "epoch": 3.35552339604438, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.1147, + "step": 14620 + }, + { + "epoch": 3.356729377713459, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9063, + "step": 14625 + }, + { + "epoch": 3.3579353593825374, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9578, + "step": 14630 + }, + { + "epoch": 3.359141341051616, + "grad_norm": 3.59375, + "learning_rate": 3e-05, + "loss": 4.0816, + "step": 14635 + }, + { + "epoch": 3.3603473227206946, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0369, + "step": 14640 + }, + { + "epoch": 3.361553304389773, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 3.9274, + "step": 14645 + }, + { + "epoch": 3.3627592860588518, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9866, + "step": 14650 + }, + { + "epoch": 3.3639652677279304, + "grad_norm": 3.453125, + "learning_rate": 3e-05, + "loss": 4.0215, + "step": 14655 + }, + { + "epoch": 3.365171249397009, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9415, + "step": 14660 + }, + { + "epoch": 3.366377231066088, + "grad_norm": 3.5625, + "learning_rate": 3e-05, + "loss": 3.8939, + "step": 14665 + }, + { + "epoch": 3.3675832127351666, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0955, + "step": 14670 + }, + { + "epoch": 3.368789194404245, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1219, + "step": 14675 + }, + { + "epoch": 3.3699951760733238, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1545, + "step": 14680 + }, + { + "epoch": 3.3712011577424024, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.9588, + "step": 14685 + }, + { + "epoch": 3.372407139411481, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9333, + "step": 14690 + }, + { + "epoch": 3.3736131210805596, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.9928, + "step": 14695 + }, + { + "epoch": 3.374819102749638, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0348, + "step": 14700 + }, + { + "epoch": 3.3760250844187167, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0069, + "step": 14705 + }, + { + "epoch": 3.3772310660877953, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9182, + "step": 14710 + }, + { + "epoch": 3.378437047756874, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8373, + "step": 14715 + }, + { + "epoch": 3.379643029425953, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0303, + "step": 14720 + }, + { + "epoch": 3.380849011095031, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.1722, + "step": 14725 + }, + { + "epoch": 3.38205499276411, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9155, + "step": 14730 + }, + { + "epoch": 3.3832609744331887, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9361, + "step": 14735 + }, + { + "epoch": 3.3844669561022673, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0036, + "step": 14740 + }, + { + "epoch": 3.385672937771346, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.8562, + "step": 14745 + }, + { + "epoch": 3.3868789194404245, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0685, + "step": 14750 + }, + { + "epoch": 3.388084901109503, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.1758, + "step": 14755 + }, + { + "epoch": 3.3892908827785817, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0137, + "step": 14760 + }, + { + "epoch": 3.3904968644476603, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.978, + "step": 14765 + }, + { + "epoch": 3.391702846116739, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0286, + "step": 14770 + }, + { + "epoch": 3.3929088277858175, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0036, + "step": 14775 + }, + { + "epoch": 3.394114809454896, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9598, + "step": 14780 + }, + { + "epoch": 3.395320791123975, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9941, + "step": 14785 + }, + { + "epoch": 3.3965267727930537, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0702, + "step": 14790 + }, + { + "epoch": 3.3977327544621323, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.008, + "step": 14795 + }, + { + "epoch": 3.398938736131211, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9393, + "step": 14800 + }, + { + "epoch": 3.4001447178002895, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.1036, + "step": 14805 + }, + { + "epoch": 3.401350699469368, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.054, + "step": 14810 + }, + { + "epoch": 3.4025566811384467, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.0621, + "step": 14815 + }, + { + "epoch": 3.4037626628075253, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.1669, + "step": 14820 + }, + { + "epoch": 3.404968644476604, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0668, + "step": 14825 + }, + { + "epoch": 3.4061746261456824, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.1343, + "step": 14830 + }, + { + "epoch": 3.407380607814761, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.1739, + "step": 14835 + }, + { + "epoch": 3.40858658948384, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9357, + "step": 14840 + }, + { + "epoch": 3.4097925711529182, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0191, + "step": 14845 + }, + { + "epoch": 3.4109985528219973, + "grad_norm": 3.796875, + "learning_rate": 3e-05, + "loss": 4.0613, + "step": 14850 + }, + { + "epoch": 3.412204534491076, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.9014, + "step": 14855 + }, + { + "epoch": 3.4134105161601545, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.9721, + "step": 14860 + }, + { + "epoch": 3.414616497829233, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9494, + "step": 14865 + }, + { + "epoch": 3.4158224794983116, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1432, + "step": 14870 + }, + { + "epoch": 3.4170284611673902, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8406, + "step": 14875 + }, + { + "epoch": 3.418234442836469, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9471, + "step": 14880 + }, + { + "epoch": 3.4194404245055474, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1186, + "step": 14885 + }, + { + "epoch": 3.420646406174626, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.9049, + "step": 14890 + }, + { + "epoch": 3.4218523878437046, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0118, + "step": 14895 + }, + { + "epoch": 3.423058369512783, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.1187, + "step": 14900 + }, + { + "epoch": 3.4242643511818622, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9578, + "step": 14905 + }, + { + "epoch": 3.425470332850941, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.998, + "step": 14910 + }, + { + "epoch": 3.4266763145200194, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9749, + "step": 14915 + }, + { + "epoch": 3.427882296189098, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9134, + "step": 14920 + }, + { + "epoch": 3.4290882778581766, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0398, + "step": 14925 + }, + { + "epoch": 3.430294259527255, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9159, + "step": 14930 + }, + { + "epoch": 3.431500241196334, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.9447, + "step": 14935 + }, + { + "epoch": 3.4327062228654124, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0588, + "step": 14940 + }, + { + "epoch": 3.433912204534491, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.1365, + "step": 14945 + }, + { + "epoch": 3.4351181862035696, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.988, + "step": 14950 + }, + { + "epoch": 3.436324167872648, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9131, + "step": 14955 + }, + { + "epoch": 3.437530149541727, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0076, + "step": 14960 + }, + { + "epoch": 3.4387361312108053, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9779, + "step": 14965 + }, + { + "epoch": 3.4399421128798844, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0355, + "step": 14970 + }, + { + "epoch": 3.441148094548963, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0125, + "step": 14975 + }, + { + "epoch": 3.4423540762180416, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.029, + "step": 14980 + }, + { + "epoch": 3.44356005788712, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.0693, + "step": 14985 + }, + { + "epoch": 3.4447660395561988, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.0339, + "step": 14990 + }, + { + "epoch": 3.4459720212252773, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 3.9148, + "step": 14995 + }, + { + "epoch": 3.447178002894356, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.1223, + "step": 15000 + }, + { + "epoch": 3.4483839845634345, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0484, + "step": 15005 + }, + { + "epoch": 3.449589966232513, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.905, + "step": 15010 + }, + { + "epoch": 3.4507959479015917, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.1181, + "step": 15015 + }, + { + "epoch": 3.4520019295706703, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9647, + "step": 15020 + }, + { + "epoch": 3.4532079112397494, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0063, + "step": 15025 + }, + { + "epoch": 3.454413892908828, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9917, + "step": 15030 + }, + { + "epoch": 3.4556198745779065, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9416, + "step": 15035 + }, + { + "epoch": 3.456825856246985, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.003, + "step": 15040 + }, + { + "epoch": 3.4580318379160637, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9431, + "step": 15045 + }, + { + "epoch": 3.4592378195851423, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.0297, + "step": 15050 + }, + { + "epoch": 3.460443801254221, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.9578, + "step": 15055 + }, + { + "epoch": 3.4616497829232995, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.1545, + "step": 15060 + }, + { + "epoch": 3.462855764592378, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0698, + "step": 15065 + }, + { + "epoch": 3.4640617462614567, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.0099, + "step": 15070 + }, + { + "epoch": 3.4652677279305353, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0312, + "step": 15075 + }, + { + "epoch": 3.4664737095996143, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.068, + "step": 15080 + }, + { + "epoch": 3.467679691268693, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.0546, + "step": 15085 + }, + { + "epoch": 3.4688856729377715, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1771, + "step": 15090 + }, + { + "epoch": 3.47009165460685, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.0114, + "step": 15095 + }, + { + "epoch": 3.4712976362759287, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0987, + "step": 15100 + }, + { + "epoch": 3.4725036179450073, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0167, + "step": 15105 + }, + { + "epoch": 3.473709599614086, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.1671, + "step": 15110 + }, + { + "epoch": 3.4749155812831645, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0783, + "step": 15115 + }, + { + "epoch": 3.476121562952243, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 4.112, + "step": 15120 + }, + { + "epoch": 3.4773275446213217, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.9935, + "step": 15125 + }, + { + "epoch": 3.4785335262904002, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0511, + "step": 15130 + }, + { + "epoch": 3.479739507959479, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1281, + "step": 15135 + }, + { + "epoch": 3.4809454896285574, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9762, + "step": 15140 + }, + { + "epoch": 3.4821514712976365, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0346, + "step": 15145 + }, + { + "epoch": 3.483357452966715, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9322, + "step": 15150 + }, + { + "epoch": 3.4845634346357937, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9412, + "step": 15155 + }, + { + "epoch": 3.4857694163048722, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.953, + "step": 15160 + }, + { + "epoch": 3.486975397973951, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 4.0236, + "step": 15165 + }, + { + "epoch": 3.4881813796430294, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0193, + "step": 15170 + }, + { + "epoch": 3.489387361312108, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9847, + "step": 15175 + }, + { + "epoch": 3.4905933429811866, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0215, + "step": 15180 + }, + { + "epoch": 3.491799324650265, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0822, + "step": 15185 + }, + { + "epoch": 3.493005306319344, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8903, + "step": 15190 + }, + { + "epoch": 3.4942112879884224, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0781, + "step": 15195 + }, + { + "epoch": 3.4954172696575014, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8788, + "step": 15200 + }, + { + "epoch": 3.49662325132658, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9371, + "step": 15205 + }, + { + "epoch": 3.4978292329956586, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0242, + "step": 15210 + }, + { + "epoch": 3.499035214664737, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9758, + "step": 15215 + }, + { + "epoch": 3.500241196333816, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9946, + "step": 15220 + }, + { + "epoch": 3.5014471780028944, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8467, + "step": 15225 + }, + { + "epoch": 3.502653159671973, + "grad_norm": 1.9609375, + "learning_rate": 3e-05, + "loss": 3.9953, + "step": 15230 + }, + { + "epoch": 3.5038591413410516, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 3.9511, + "step": 15235 + }, + { + "epoch": 3.50506512301013, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.082, + "step": 15240 + }, + { + "epoch": 3.5062711046792088, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0789, + "step": 15245 + }, + { + "epoch": 3.5074770863482874, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0445, + "step": 15250 + }, + { + "epoch": 3.5086830680173664, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 4.0717, + "step": 15255 + }, + { + "epoch": 3.5098890496864446, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.1007, + "step": 15260 + }, + { + "epoch": 3.5110950313555236, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0185, + "step": 15265 + }, + { + "epoch": 3.512301013024602, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.0267, + "step": 15270 + }, + { + "epoch": 3.5135069946936808, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.2083, + "step": 15275 + }, + { + "epoch": 3.5147129763627594, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8944, + "step": 15280 + }, + { + "epoch": 3.515918958031838, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0898, + "step": 15285 + }, + { + "epoch": 3.5171249397009166, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0848, + "step": 15290 + }, + { + "epoch": 3.518330921369995, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9434, + "step": 15295 + }, + { + "epoch": 3.5195369030390737, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 4.0369, + "step": 15300 + }, + { + "epoch": 3.5207428847081523, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.918, + "step": 15305 + }, + { + "epoch": 3.521948866377231, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.854, + "step": 15310 + }, + { + "epoch": 3.5231548480463095, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0014, + "step": 15315 + }, + { + "epoch": 3.5243608297153886, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0185, + "step": 15320 + }, + { + "epoch": 3.5255668113844667, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9638, + "step": 15325 + }, + { + "epoch": 3.5267727930535457, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.993, + "step": 15330 + }, + { + "epoch": 3.5279787747226243, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.9725, + "step": 15335 + }, + { + "epoch": 3.529184756391703, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.0577, + "step": 15340 + }, + { + "epoch": 3.5303907380607815, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.9415, + "step": 15345 + }, + { + "epoch": 3.53159671972986, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0072, + "step": 15350 + }, + { + "epoch": 3.5328027013989387, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8658, + "step": 15355 + }, + { + "epoch": 3.5340086830680173, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0158, + "step": 15360 + }, + { + "epoch": 3.535214664737096, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.933, + "step": 15365 + }, + { + "epoch": 3.5364206464061745, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9914, + "step": 15370 + }, + { + "epoch": 3.5376266280752535, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9102, + "step": 15375 + }, + { + "epoch": 3.5388326097443317, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9443, + "step": 15380 + }, + { + "epoch": 3.5400385914134107, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9349, + "step": 15385 + }, + { + "epoch": 3.5412445730824893, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9924, + "step": 15390 + }, + { + "epoch": 3.542450554751568, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 4.0462, + "step": 15395 + }, + { + "epoch": 3.5436565364206465, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9478, + "step": 15400 + }, + { + "epoch": 3.544862518089725, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9993, + "step": 15405 + }, + { + "epoch": 3.5460684997588037, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0022, + "step": 15410 + }, + { + "epoch": 3.5472744814278823, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9451, + "step": 15415 + }, + { + "epoch": 3.548480463096961, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1631, + "step": 15420 + }, + { + "epoch": 3.5496864447660395, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8508, + "step": 15425 + }, + { + "epoch": 3.550892426435118, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.007, + "step": 15430 + }, + { + "epoch": 3.5520984081041966, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1412, + "step": 15435 + }, + { + "epoch": 3.5533043897732757, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9812, + "step": 15440 + }, + { + "epoch": 3.554510371442354, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.1371, + "step": 15445 + }, + { + "epoch": 3.555716353111433, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 4.0788, + "step": 15450 + }, + { + "epoch": 3.5569223347805115, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9216, + "step": 15455 + }, + { + "epoch": 3.55812831644959, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.8934, + "step": 15460 + }, + { + "epoch": 3.5593342981186686, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0998, + "step": 15465 + }, + { + "epoch": 3.5605402797877472, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0685, + "step": 15470 + }, + { + "epoch": 3.561746261456826, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9627, + "step": 15475 + }, + { + "epoch": 3.5629522431259044, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9114, + "step": 15480 + }, + { + "epoch": 3.564158224794983, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9346, + "step": 15485 + }, + { + "epoch": 3.5653642064640616, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.8659, + "step": 15490 + }, + { + "epoch": 3.5665701881331406, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0312, + "step": 15495 + }, + { + "epoch": 3.567776169802219, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0441, + "step": 15500 + }, + { + "epoch": 3.568982151471298, + "grad_norm": 3.296875, + "learning_rate": 3e-05, + "loss": 4.2124, + "step": 15505 + }, + { + "epoch": 3.5701881331403764, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8043, + "step": 15510 + }, + { + "epoch": 3.571394114809455, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1218, + "step": 15515 + }, + { + "epoch": 3.5726000964785336, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.1294, + "step": 15520 + }, + { + "epoch": 3.573806078147612, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8942, + "step": 15525 + }, + { + "epoch": 3.575012059816691, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9531, + "step": 15530 + }, + { + "epoch": 3.5762180414857694, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0333, + "step": 15535 + }, + { + "epoch": 3.577424023154848, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0552, + "step": 15540 + }, + { + "epoch": 3.5786300048239266, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0429, + "step": 15545 + }, + { + "epoch": 3.579835986493005, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0418, + "step": 15550 + }, + { + "epoch": 3.5810419681620838, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9592, + "step": 15555 + }, + { + "epoch": 3.582247949831163, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0796, + "step": 15560 + }, + { + "epoch": 3.583453931500241, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.0507, + "step": 15565 + }, + { + "epoch": 3.58465991316932, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9588, + "step": 15570 + }, + { + "epoch": 3.5858658948383986, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.052, + "step": 15575 + }, + { + "epoch": 3.587071876507477, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9889, + "step": 15580 + }, + { + "epoch": 3.5882778581765558, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0662, + "step": 15585 + }, + { + "epoch": 3.5894838398456343, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0463, + "step": 15590 + }, + { + "epoch": 3.590689821514713, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.998, + "step": 15595 + }, + { + "epoch": 3.5918958031837915, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0381, + "step": 15600 + }, + { + "epoch": 3.59310178485287, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.155, + "step": 15605 + }, + { + "epoch": 3.5943077665219487, + "grad_norm": 3.625, + "learning_rate": 3e-05, + "loss": 3.8984, + "step": 15610 + }, + { + "epoch": 3.5955137481910278, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9862, + "step": 15615 + }, + { + "epoch": 3.596719729860106, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.988, + "step": 15620 + }, + { + "epoch": 3.597925711529185, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8587, + "step": 15625 + }, + { + "epoch": 3.5991316931982635, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.094, + "step": 15630 + }, + { + "epoch": 3.600337674867342, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9377, + "step": 15635 + }, + { + "epoch": 3.6015436565364207, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0605, + "step": 15640 + }, + { + "epoch": 3.6027496382054993, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0728, + "step": 15645 + }, + { + "epoch": 3.603955619874578, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0383, + "step": 15650 + }, + { + "epoch": 3.6051616015436565, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 4.1243, + "step": 15655 + }, + { + "epoch": 3.606367583212735, + "grad_norm": 3.359375, + "learning_rate": 3e-05, + "loss": 3.9975, + "step": 15660 + }, + { + "epoch": 3.6075735648818137, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.918, + "step": 15665 + }, + { + "epoch": 3.6087795465508923, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0236, + "step": 15670 + }, + { + "epoch": 3.609985528219971, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1219, + "step": 15675 + }, + { + "epoch": 3.61119150988905, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0131, + "step": 15680 + }, + { + "epoch": 3.612397491558128, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.1084, + "step": 15685 + }, + { + "epoch": 3.613603473227207, + "grad_norm": 3.796875, + "learning_rate": 3e-05, + "loss": 4.1355, + "step": 15690 + }, + { + "epoch": 3.6148094548962857, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.1247, + "step": 15695 + }, + { + "epoch": 3.6160154365653643, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.0449, + "step": 15700 + }, + { + "epoch": 3.617221418234443, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.8251, + "step": 15705 + }, + { + "epoch": 3.6184273999035215, + "grad_norm": 3.671875, + "learning_rate": 3e-05, + "loss": 4.0023, + "step": 15710 + }, + { + "epoch": 3.6196333815726, + "grad_norm": 3.359375, + "learning_rate": 3e-05, + "loss": 4.1111, + "step": 15715 + }, + { + "epoch": 3.6208393632416787, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.929, + "step": 15720 + }, + { + "epoch": 3.6220453449107572, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0471, + "step": 15725 + }, + { + "epoch": 3.623251326579836, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.0213, + "step": 15730 + }, + { + "epoch": 3.624457308248915, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.8792, + "step": 15735 + }, + { + "epoch": 3.625663289917993, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0717, + "step": 15740 + }, + { + "epoch": 3.626869271587072, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.9672, + "step": 15745 + }, + { + "epoch": 3.6280752532561507, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9882, + "step": 15750 + }, + { + "epoch": 3.6292812349252292, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.1598, + "step": 15755 + }, + { + "epoch": 3.630487216594308, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.1315, + "step": 15760 + }, + { + "epoch": 3.6316931982633864, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0041, + "step": 15765 + }, + { + "epoch": 3.632899179932465, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0705, + "step": 15770 + }, + { + "epoch": 3.6341051616015436, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9164, + "step": 15775 + }, + { + "epoch": 3.635311143270622, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9844, + "step": 15780 + }, + { + "epoch": 3.636517124939701, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9853, + "step": 15785 + }, + { + "epoch": 3.6377231066087794, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9188, + "step": 15790 + }, + { + "epoch": 3.638929088277858, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9599, + "step": 15795 + }, + { + "epoch": 3.640135069946937, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1056, + "step": 15800 + }, + { + "epoch": 3.641341051616015, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.0439, + "step": 15805 + }, + { + "epoch": 3.642547033285094, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1838, + "step": 15810 + }, + { + "epoch": 3.643753014954173, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0717, + "step": 15815 + }, + { + "epoch": 3.6449589966232514, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0409, + "step": 15820 + }, + { + "epoch": 3.64616497829233, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.921, + "step": 15825 + }, + { + "epoch": 3.6473709599614086, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9315, + "step": 15830 + }, + { + "epoch": 3.648576941630487, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.9507, + "step": 15835 + }, + { + "epoch": 3.6497829232995658, + "grad_norm": 3.46875, + "learning_rate": 3e-05, + "loss": 3.8639, + "step": 15840 + }, + { + "epoch": 3.6509889049686444, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9103, + "step": 15845 + }, + { + "epoch": 3.652194886637723, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0831, + "step": 15850 + }, + { + "epoch": 3.653400868306802, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0593, + "step": 15855 + }, + { + "epoch": 3.65460684997588, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.896, + "step": 15860 + }, + { + "epoch": 3.655812831644959, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1176, + "step": 15865 + }, + { + "epoch": 3.6570188133140378, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.9909, + "step": 15870 + }, + { + "epoch": 3.6582247949831164, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0337, + "step": 15875 + }, + { + "epoch": 3.659430776652195, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.1702, + "step": 15880 + }, + { + "epoch": 3.6606367583212736, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1194, + "step": 15885 + }, + { + "epoch": 3.661842739990352, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9726, + "step": 15890 + }, + { + "epoch": 3.6630487216594307, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 4.1218, + "step": 15895 + }, + { + "epoch": 3.6642547033285093, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.037, + "step": 15900 + }, + { + "epoch": 3.665460684997588, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9541, + "step": 15905 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.038, + "step": 15910 + }, + { + "epoch": 3.667872648335745, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9506, + "step": 15915 + }, + { + "epoch": 3.669078630004824, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1574, + "step": 15920 + }, + { + "epoch": 3.6702846116739023, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1581, + "step": 15925 + }, + { + "epoch": 3.6714905933429813, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0647, + "step": 15930 + }, + { + "epoch": 3.67269657501206, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9469, + "step": 15935 + }, + { + "epoch": 3.6739025566811385, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9963, + "step": 15940 + }, + { + "epoch": 3.675108538350217, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.996, + "step": 15945 + }, + { + "epoch": 3.6763145200192957, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0304, + "step": 15950 + }, + { + "epoch": 3.6775205016883743, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8884, + "step": 15955 + }, + { + "epoch": 3.678726483357453, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9658, + "step": 15960 + }, + { + "epoch": 3.6799324650265315, + "grad_norm": 3.65625, + "learning_rate": 3e-05, + "loss": 4.0913, + "step": 15965 + }, + { + "epoch": 3.68113844669561, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9585, + "step": 15970 + }, + { + "epoch": 3.682344428364689, + "grad_norm": 3.5625, + "learning_rate": 3e-05, + "loss": 4.0505, + "step": 15975 + }, + { + "epoch": 3.6835504100337673, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9165, + "step": 15980 + }, + { + "epoch": 3.6847563917028463, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.976, + "step": 15985 + }, + { + "epoch": 3.685962373371925, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.1032, + "step": 15990 + }, + { + "epoch": 3.6871683550410035, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9082, + "step": 15995 + }, + { + "epoch": 3.688374336710082, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9664, + "step": 16000 + }, + { + "epoch": 3.6895803183791607, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9856, + "step": 16005 + }, + { + "epoch": 3.6907863000482393, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9754, + "step": 16010 + }, + { + "epoch": 3.691992281717318, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9251, + "step": 16015 + }, + { + "epoch": 3.6931982633863965, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.9983, + "step": 16020 + }, + { + "epoch": 3.694404245055475, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.878, + "step": 16025 + }, + { + "epoch": 3.6956102267245536, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0377, + "step": 16030 + }, + { + "epoch": 3.6968162083936322, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8615, + "step": 16035 + }, + { + "epoch": 3.6980221900627113, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 4.1154, + "step": 16040 + }, + { + "epoch": 3.6992281717317894, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9776, + "step": 16045 + }, + { + "epoch": 3.7004341534008685, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9946, + "step": 16050 + }, + { + "epoch": 3.701640135069947, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9427, + "step": 16055 + }, + { + "epoch": 3.7028461167390256, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9502, + "step": 16060 + }, + { + "epoch": 3.7040520984081042, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9252, + "step": 16065 + }, + { + "epoch": 3.705258080077183, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0952, + "step": 16070 + }, + { + "epoch": 3.7064640617462614, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.887, + "step": 16075 + }, + { + "epoch": 3.70767004341534, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.122, + "step": 16080 + }, + { + "epoch": 3.7088760250844186, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9845, + "step": 16085 + }, + { + "epoch": 3.710082006753497, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9335, + "step": 16090 + }, + { + "epoch": 3.7112879884225762, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.9416, + "step": 16095 + }, + { + "epoch": 3.7124939700916544, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9335, + "step": 16100 + }, + { + "epoch": 3.7136999517607334, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9316, + "step": 16105 + }, + { + "epoch": 3.714905933429812, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0515, + "step": 16110 + }, + { + "epoch": 3.7161119150988906, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.0138, + "step": 16115 + }, + { + "epoch": 3.717317896767969, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.9327, + "step": 16120 + }, + { + "epoch": 3.718523878437048, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.0349, + "step": 16125 + }, + { + "epoch": 3.7197298601061264, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 4.1848, + "step": 16130 + }, + { + "epoch": 3.720935841775205, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.898, + "step": 16135 + }, + { + "epoch": 3.7221418234442836, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1881, + "step": 16140 + }, + { + "epoch": 3.723347805113362, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 3.9333, + "step": 16145 + }, + { + "epoch": 3.7245537867824408, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9446, + "step": 16150 + }, + { + "epoch": 3.7257597684515193, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9377, + "step": 16155 + }, + { + "epoch": 3.7269657501205984, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9663, + "step": 16160 + }, + { + "epoch": 3.7281717317896765, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8904, + "step": 16165 + }, + { + "epoch": 3.7293777134587556, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9516, + "step": 16170 + }, + { + "epoch": 3.730583695127834, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9578, + "step": 16175 + }, + { + "epoch": 3.7317896767969128, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0739, + "step": 16180 + }, + { + "epoch": 3.7329956584659914, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9925, + "step": 16185 + }, + { + "epoch": 3.73420164013507, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9177, + "step": 16190 + }, + { + "epoch": 3.7354076218041485, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0116, + "step": 16195 + }, + { + "epoch": 3.736613603473227, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0218, + "step": 16200 + }, + { + "epoch": 3.7378195851423057, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.9417, + "step": 16205 + }, + { + "epoch": 3.7390255668113843, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9539, + "step": 16210 + }, + { + "epoch": 3.7402315484804634, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9554, + "step": 16215 + }, + { + "epoch": 3.7414375301495415, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.9798, + "step": 16220 + }, + { + "epoch": 3.7426435118186205, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.0595, + "step": 16225 + }, + { + "epoch": 3.743849493487699, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0568, + "step": 16230 + }, + { + "epoch": 3.7450554751567777, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.0669, + "step": 16235 + }, + { + "epoch": 3.7462614568258563, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9098, + "step": 16240 + }, + { + "epoch": 3.747467438494935, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9163, + "step": 16245 + }, + { + "epoch": 3.7486734201640135, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.7712, + "step": 16250 + }, + { + "epoch": 3.749879401833092, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9362, + "step": 16255 + }, + { + "epoch": 3.7510853835021707, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.2692, + "step": 16260 + }, + { + "epoch": 3.7522913651712493, + "grad_norm": 3.28125, + "learning_rate": 3e-05, + "loss": 4.1035, + "step": 16265 + }, + { + "epoch": 3.753497346840328, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0493, + "step": 16270 + }, + { + "epoch": 3.7547033285094065, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0008, + "step": 16275 + }, + { + "epoch": 3.7559093101784855, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1084, + "step": 16280 + }, + { + "epoch": 3.7571152918475637, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9278, + "step": 16285 + }, + { + "epoch": 3.7583212735166427, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.1495, + "step": 16290 + }, + { + "epoch": 3.7595272551857213, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0115, + "step": 16295 + }, + { + "epoch": 3.7607332368548, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9386, + "step": 16300 + }, + { + "epoch": 3.7619392185238785, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0107, + "step": 16305 + }, + { + "epoch": 3.763145200192957, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9651, + "step": 16310 + }, + { + "epoch": 3.7643511818620357, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9544, + "step": 16315 + }, + { + "epoch": 3.7655571635311142, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8844, + "step": 16320 + }, + { + "epoch": 3.766763145200193, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9236, + "step": 16325 + }, + { + "epoch": 3.7679691268692714, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.1234, + "step": 16330 + }, + { + "epoch": 3.7691751085383505, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9655, + "step": 16335 + }, + { + "epoch": 3.7703810902074286, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0073, + "step": 16340 + }, + { + "epoch": 3.7715870718765077, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9181, + "step": 16345 + }, + { + "epoch": 3.7727930535455863, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.8662, + "step": 16350 + }, + { + "epoch": 3.773999035214665, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7226, + "step": 16355 + }, + { + "epoch": 3.7752050168837434, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.1805, + "step": 16360 + }, + { + "epoch": 3.776410998552822, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.1371, + "step": 16365 + }, + { + "epoch": 3.7776169802219006, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.129, + "step": 16370 + }, + { + "epoch": 3.778822961890979, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.1049, + "step": 16375 + }, + { + "epoch": 3.780028943560058, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8706, + "step": 16380 + }, + { + "epoch": 3.7812349252291364, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0336, + "step": 16385 + }, + { + "epoch": 3.782440906898215, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.9422, + "step": 16390 + }, + { + "epoch": 3.7836468885672936, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0953, + "step": 16395 + }, + { + "epoch": 3.7848528702363726, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0049, + "step": 16400 + }, + { + "epoch": 3.7860588519054508, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0195, + "step": 16405 + }, + { + "epoch": 3.78726483357453, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0637, + "step": 16410 + }, + { + "epoch": 3.7884708152436084, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.3091, + "step": 16415 + }, + { + "epoch": 3.789676796912687, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.933, + "step": 16420 + }, + { + "epoch": 3.7908827785817656, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9491, + "step": 16425 + }, + { + "epoch": 3.792088760250844, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.0612, + "step": 16430 + }, + { + "epoch": 3.7932947419199228, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.128, + "step": 16435 + }, + { + "epoch": 3.7945007235890014, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9772, + "step": 16440 + }, + { + "epoch": 3.79570670525808, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0806, + "step": 16445 + }, + { + "epoch": 3.7969126869271586, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.9721, + "step": 16450 + }, + { + "epoch": 3.7981186685962376, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9053, + "step": 16455 + }, + { + "epoch": 3.7993246502653157, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.2064, + "step": 16460 + }, + { + "epoch": 3.8005306319343948, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9998, + "step": 16465 + }, + { + "epoch": 3.8017366136034734, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.93, + "step": 16470 + }, + { + "epoch": 3.802942595272552, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0474, + "step": 16475 + }, + { + "epoch": 3.8041485769416306, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0622, + "step": 16480 + }, + { + "epoch": 3.805354558610709, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0207, + "step": 16485 + }, + { + "epoch": 3.8065605402797877, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0581, + "step": 16490 + }, + { + "epoch": 3.8077665219488663, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0917, + "step": 16495 + }, + { + "epoch": 3.808972503617945, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0036, + "step": 16500 + }, + { + "epoch": 3.8101784852870235, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9413, + "step": 16505 + }, + { + "epoch": 3.811384466956102, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9802, + "step": 16510 + }, + { + "epoch": 3.8125904486251807, + "grad_norm": 3.25, + "learning_rate": 3e-05, + "loss": 4.0398, + "step": 16515 + }, + { + "epoch": 3.8137964302942597, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9177, + "step": 16520 + }, + { + "epoch": 3.815002411963338, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0421, + "step": 16525 + }, + { + "epoch": 3.816208393632417, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0367, + "step": 16530 + }, + { + "epoch": 3.8174143753014955, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9691, + "step": 16535 + }, + { + "epoch": 3.818620356970574, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0991, + "step": 16540 + }, + { + "epoch": 3.8198263386396527, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9303, + "step": 16545 + }, + { + "epoch": 3.8210323203087313, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8994, + "step": 16550 + }, + { + "epoch": 3.82223830197781, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9727, + "step": 16555 + }, + { + "epoch": 3.8234442836468885, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.9319, + "step": 16560 + }, + { + "epoch": 3.824650265315967, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9565, + "step": 16565 + }, + { + "epoch": 3.8258562469850457, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9581, + "step": 16570 + }, + { + "epoch": 3.8270622286541247, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0812, + "step": 16575 + }, + { + "epoch": 3.828268210323203, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0111, + "step": 16580 + }, + { + "epoch": 3.829474191992282, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.0788, + "step": 16585 + }, + { + "epoch": 3.8306801736613605, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9713, + "step": 16590 + }, + { + "epoch": 3.831886155330439, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0475, + "step": 16595 + }, + { + "epoch": 3.8330921369995177, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.9336, + "step": 16600 + }, + { + "epoch": 3.8342981186685963, + "grad_norm": 3.484375, + "learning_rate": 3e-05, + "loss": 4.1285, + "step": 16605 + }, + { + "epoch": 3.835504100337675, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.7279, + "step": 16610 + }, + { + "epoch": 3.8367100820067535, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0651, + "step": 16615 + }, + { + "epoch": 3.837916063675832, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.066, + "step": 16620 + }, + { + "epoch": 3.8391220453449106, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0235, + "step": 16625 + }, + { + "epoch": 3.8403280270139892, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.1096, + "step": 16630 + }, + { + "epoch": 3.841534008683068, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9193, + "step": 16635 + }, + { + "epoch": 3.842739990352147, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8714, + "step": 16640 + }, + { + "epoch": 3.843945972021225, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0109, + "step": 16645 + }, + { + "epoch": 3.845151953690304, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8672, + "step": 16650 + }, + { + "epoch": 3.8463579353593826, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.923, + "step": 16655 + }, + { + "epoch": 3.8475639170284612, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 4.1278, + "step": 16660 + }, + { + "epoch": 3.84876989869754, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9864, + "step": 16665 + }, + { + "epoch": 3.8499758803666184, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.1395, + "step": 16670 + }, + { + "epoch": 3.851181862035697, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.1347, + "step": 16675 + }, + { + "epoch": 3.8523878437047756, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0089, + "step": 16680 + }, + { + "epoch": 3.853593825373854, + "grad_norm": 4.6875, + "learning_rate": 3e-05, + "loss": 3.9326, + "step": 16685 + }, + { + "epoch": 3.854799807042933, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9616, + "step": 16690 + }, + { + "epoch": 3.856005788712012, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8703, + "step": 16695 + }, + { + "epoch": 3.85721177038109, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9703, + "step": 16700 + }, + { + "epoch": 3.858417752050169, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.9828, + "step": 16705 + }, + { + "epoch": 3.8596237337192476, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9348, + "step": 16710 + }, + { + "epoch": 3.860829715388326, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8907, + "step": 16715 + }, + { + "epoch": 3.862035697057405, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9368, + "step": 16720 + }, + { + "epoch": 3.8632416787264834, + "grad_norm": 3.421875, + "learning_rate": 3e-05, + "loss": 4.2211, + "step": 16725 + }, + { + "epoch": 3.864447660395562, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9402, + "step": 16730 + }, + { + "epoch": 3.8656536420646406, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.1418, + "step": 16735 + }, + { + "epoch": 3.866859623733719, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9347, + "step": 16740 + }, + { + "epoch": 3.8680656054027978, + "grad_norm": 3.546875, + "learning_rate": 3e-05, + "loss": 3.917, + "step": 16745 + }, + { + "epoch": 3.8692715870718764, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.8988, + "step": 16750 + }, + { + "epoch": 3.870477568740955, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0261, + "step": 16755 + }, + { + "epoch": 3.871683550410034, + "grad_norm": 3.375, + "learning_rate": 3e-05, + "loss": 3.9484, + "step": 16760 + }, + { + "epoch": 3.872889532079112, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9292, + "step": 16765 + }, + { + "epoch": 3.874095513748191, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0743, + "step": 16770 + }, + { + "epoch": 3.8753014954172698, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.8979, + "step": 16775 + }, + { + "epoch": 3.8765074770863484, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0227, + "step": 16780 + }, + { + "epoch": 3.877713458755427, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9069, + "step": 16785 + }, + { + "epoch": 3.8789194404245055, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.8504, + "step": 16790 + }, + { + "epoch": 3.880125422093584, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9777, + "step": 16795 + }, + { + "epoch": 3.8813314037626627, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 4.0615, + "step": 16800 + }, + { + "epoch": 3.8825373854317413, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8637, + "step": 16805 + }, + { + "epoch": 3.88374336710082, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9371, + "step": 16810 + }, + { + "epoch": 3.884949348769899, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0929, + "step": 16815 + }, + { + "epoch": 3.886155330438977, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0039, + "step": 16820 + }, + { + "epoch": 3.887361312108056, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.2203, + "step": 16825 + }, + { + "epoch": 3.8885672937771347, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.7446, + "step": 16830 + }, + { + "epoch": 3.8897732754462133, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0843, + "step": 16835 + }, + { + "epoch": 3.890979257115292, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0489, + "step": 16840 + }, + { + "epoch": 3.8921852387843705, + "grad_norm": 3.296875, + "learning_rate": 3e-05, + "loss": 3.8601, + "step": 16845 + }, + { + "epoch": 3.893391220453449, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0869, + "step": 16850 + }, + { + "epoch": 3.8945972021225277, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.015, + "step": 16855 + }, + { + "epoch": 3.8958031837916063, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8737, + "step": 16860 + }, + { + "epoch": 3.897009165460685, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0627, + "step": 16865 + }, + { + "epoch": 3.8982151471297635, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8719, + "step": 16870 + }, + { + "epoch": 3.899421128798842, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0355, + "step": 16875 + }, + { + "epoch": 3.900627110467921, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9983, + "step": 16880 + }, + { + "epoch": 3.9018330921369992, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.968, + "step": 16885 + }, + { + "epoch": 3.9030390738060783, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.055, + "step": 16890 + }, + { + "epoch": 3.904245055475157, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0793, + "step": 16895 + }, + { + "epoch": 3.9054510371442355, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 3.9698, + "step": 16900 + }, + { + "epoch": 3.906657018813314, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0943, + "step": 16905 + }, + { + "epoch": 3.9078630004823927, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9656, + "step": 16910 + }, + { + "epoch": 3.9090689821514712, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.9591, + "step": 16915 + }, + { + "epoch": 3.91027496382055, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0096, + "step": 16920 + }, + { + "epoch": 3.9114809454896284, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9084, + "step": 16925 + }, + { + "epoch": 3.912686927158707, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8909, + "step": 16930 + }, + { + "epoch": 3.913892908827786, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9246, + "step": 16935 + }, + { + "epoch": 3.915098890496864, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9368, + "step": 16940 + }, + { + "epoch": 3.9163048721659433, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9595, + "step": 16945 + }, + { + "epoch": 3.917510853835022, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9792, + "step": 16950 + }, + { + "epoch": 3.9187168355041004, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0542, + "step": 16955 + }, + { + "epoch": 3.919922817173179, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9786, + "step": 16960 + }, + { + "epoch": 3.9211287988422576, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0183, + "step": 16965 + }, + { + "epoch": 3.922334780511336, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9496, + "step": 16970 + }, + { + "epoch": 3.923540762180415, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8776, + "step": 16975 + }, + { + "epoch": 3.9247467438494934, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.9536, + "step": 16980 + }, + { + "epoch": 3.925952725518572, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0673, + "step": 16985 + }, + { + "epoch": 3.9271587071876506, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9777, + "step": 16990 + }, + { + "epoch": 3.928364688856729, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8375, + "step": 16995 + }, + { + "epoch": 3.929570670525808, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0203, + "step": 17000 + }, + { + "epoch": 3.9307766521948864, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.1498, + "step": 17005 + }, + { + "epoch": 3.9319826338639654, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.9497, + "step": 17010 + }, + { + "epoch": 3.933188615533044, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9592, + "step": 17015 + }, + { + "epoch": 3.9343945972021226, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0012, + "step": 17020 + }, + { + "epoch": 3.935600578871201, + "grad_norm": 5.46875, + "learning_rate": 3e-05, + "loss": 3.9361, + "step": 17025 + }, + { + "epoch": 3.9368065605402798, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.0225, + "step": 17030 + }, + { + "epoch": 3.9380125422093584, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9514, + "step": 17035 + }, + { + "epoch": 3.939218523878437, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 3.9751, + "step": 17040 + }, + { + "epoch": 3.9404245055475156, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0302, + "step": 17045 + }, + { + "epoch": 3.941630487216594, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9687, + "step": 17050 + }, + { + "epoch": 3.942836468885673, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9369, + "step": 17055 + }, + { + "epoch": 3.9440424505547513, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0217, + "step": 17060 + }, + { + "epoch": 3.9452484322238304, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0357, + "step": 17065 + }, + { + "epoch": 3.946454413892909, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.9942, + "step": 17070 + }, + { + "epoch": 3.9476603955619876, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9284, + "step": 17075 + }, + { + "epoch": 3.948866377231066, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.9487, + "step": 17080 + }, + { + "epoch": 3.9500723589001447, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9801, + "step": 17085 + }, + { + "epoch": 3.9512783405692233, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1226, + "step": 17090 + }, + { + "epoch": 3.952484322238302, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.844, + "step": 17095 + }, + { + "epoch": 3.9536903039073805, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8442, + "step": 17100 + }, + { + "epoch": 3.954896285576459, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.073, + "step": 17105 + }, + { + "epoch": 3.9561022672455377, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.8494, + "step": 17110 + }, + { + "epoch": 3.9573082489146163, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8586, + "step": 17115 + }, + { + "epoch": 3.9585142305836953, + "grad_norm": 3.296875, + "learning_rate": 3e-05, + "loss": 3.9582, + "step": 17120 + }, + { + "epoch": 3.9597202122527735, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.1137, + "step": 17125 + }, + { + "epoch": 3.9609261939218525, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.7992, + "step": 17130 + }, + { + "epoch": 3.962132175590931, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8212, + "step": 17135 + }, + { + "epoch": 3.9633381572600097, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.961, + "step": 17140 + }, + { + "epoch": 3.9645441389290883, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.0676, + "step": 17145 + }, + { + "epoch": 3.965750120598167, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 4.1459, + "step": 17150 + }, + { + "epoch": 3.9669561022672455, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8767, + "step": 17155 + }, + { + "epoch": 3.968162083936324, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8575, + "step": 17160 + }, + { + "epoch": 3.9693680656054027, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9358, + "step": 17165 + }, + { + "epoch": 3.9705740472744813, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8776, + "step": 17170 + }, + { + "epoch": 3.9717800289435603, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0366, + "step": 17175 + }, + { + "epoch": 3.9729860106126385, + "grad_norm": 3.359375, + "learning_rate": 3e-05, + "loss": 4.0652, + "step": 17180 + }, + { + "epoch": 3.9741919922817175, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.0154, + "step": 17185 + }, + { + "epoch": 3.975397973950796, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8953, + "step": 17190 + }, + { + "epoch": 3.9766039556198747, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8697, + "step": 17195 + }, + { + "epoch": 3.9778099372889533, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9085, + "step": 17200 + }, + { + "epoch": 3.979015918958032, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0601, + "step": 17205 + }, + { + "epoch": 3.9802219006271105, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0051, + "step": 17210 + }, + { + "epoch": 3.981427882296189, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.096, + "step": 17215 + }, + { + "epoch": 3.9826338639652676, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.952, + "step": 17220 + }, + { + "epoch": 3.9838398456343462, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9228, + "step": 17225 + }, + { + "epoch": 3.985045827303425, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.893, + "step": 17230 + }, + { + "epoch": 3.9862518089725034, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.814, + "step": 17235 + }, + { + "epoch": 3.9874577906415825, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8744, + "step": 17240 + }, + { + "epoch": 3.9886637723106606, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.1822, + "step": 17245 + }, + { + "epoch": 3.9898697539797396, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 3.9633, + "step": 17250 + }, + { + "epoch": 3.9910757356488182, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8521, + "step": 17255 + }, + { + "epoch": 3.992281717317897, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9787, + "step": 17260 + }, + { + "epoch": 3.9934876989869754, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.048, + "step": 17265 + }, + { + "epoch": 3.994693680656054, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8688, + "step": 17270 + }, + { + "epoch": 3.9958996623251326, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9372, + "step": 17275 + }, + { + "epoch": 3.997105643994211, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 3.9689, + "step": 17280 + }, + { + "epoch": 3.99831162566329, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9447, + "step": 17285 + }, + { + "epoch": 3.9995176073323684, + "grad_norm": 4.0625, + "learning_rate": 3e-05, + "loss": 4.1225, + "step": 17290 + }, + { + "epoch": 4.000723589001447, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.8388, + "step": 17295 + }, + { + "epoch": 4.001929570670526, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.8877, + "step": 17300 + }, + { + "epoch": 4.003135552339605, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0602, + "step": 17305 + }, + { + "epoch": 4.004341534008683, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0037, + "step": 17310 + }, + { + "epoch": 4.005547515677762, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.0028, + "step": 17315 + }, + { + "epoch": 4.00675349734684, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.9476, + "step": 17320 + }, + { + "epoch": 4.007959479015919, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.8961, + "step": 17325 + }, + { + "epoch": 4.009165460684998, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9938, + "step": 17330 + }, + { + "epoch": 4.010371442354076, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1277, + "step": 17335 + }, + { + "epoch": 4.011577424023155, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.885, + "step": 17340 + }, + { + "epoch": 4.012783405692233, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9108, + "step": 17345 + }, + { + "epoch": 4.013989387361312, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8904, + "step": 17350 + }, + { + "epoch": 4.0151953690303905, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9804, + "step": 17355 + }, + { + "epoch": 4.01640135069947, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.017, + "step": 17360 + }, + { + "epoch": 4.017607332368548, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9892, + "step": 17365 + }, + { + "epoch": 4.018813314037627, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8946, + "step": 17370 + }, + { + "epoch": 4.020019295706705, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8947, + "step": 17375 + }, + { + "epoch": 4.021225277375784, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.957, + "step": 17380 + }, + { + "epoch": 4.022431259044862, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9355, + "step": 17385 + }, + { + "epoch": 4.023637240713941, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8422, + "step": 17390 + }, + { + "epoch": 4.02484322238302, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0214, + "step": 17395 + }, + { + "epoch": 4.026049204052098, + "grad_norm": 3.375, + "learning_rate": 3e-05, + "loss": 3.9652, + "step": 17400 + }, + { + "epoch": 4.027255185721177, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9663, + "step": 17405 + }, + { + "epoch": 4.0284611673902555, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.96, + "step": 17410 + }, + { + "epoch": 4.0296671490593345, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9893, + "step": 17415 + }, + { + "epoch": 4.030873130728413, + "grad_norm": 3.5625, + "learning_rate": 3e-05, + "loss": 3.8778, + "step": 17420 + }, + { + "epoch": 4.032079112397492, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.9751, + "step": 17425 + }, + { + "epoch": 4.03328509406657, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.934, + "step": 17430 + }, + { + "epoch": 4.034491075735649, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.9501, + "step": 17435 + }, + { + "epoch": 4.035697057404727, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8527, + "step": 17440 + }, + { + "epoch": 4.036903039073806, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0261, + "step": 17445 + }, + { + "epoch": 4.038109020742885, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0066, + "step": 17450 + }, + { + "epoch": 4.039315002411963, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.9134, + "step": 17455 + }, + { + "epoch": 4.040520984081042, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0793, + "step": 17460 + }, + { + "epoch": 4.0417269657501205, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9565, + "step": 17465 + }, + { + "epoch": 4.0429329474191995, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0828, + "step": 17470 + }, + { + "epoch": 4.044138929088278, + "grad_norm": 3.5625, + "learning_rate": 3e-05, + "loss": 3.9935, + "step": 17475 + }, + { + "epoch": 4.045344910757357, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8752, + "step": 17480 + }, + { + "epoch": 4.046550892426435, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0542, + "step": 17485 + }, + { + "epoch": 4.047756874095514, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0809, + "step": 17490 + }, + { + "epoch": 4.048962855764592, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8445, + "step": 17495 + }, + { + "epoch": 4.050168837433671, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8905, + "step": 17500 + }, + { + "epoch": 4.051374819102749, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 4.0455, + "step": 17505 + }, + { + "epoch": 4.052580800771828, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9642, + "step": 17510 + }, + { + "epoch": 4.053786782440907, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1512, + "step": 17515 + }, + { + "epoch": 4.054992764109985, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9247, + "step": 17520 + }, + { + "epoch": 4.0561987457790645, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9222, + "step": 17525 + }, + { + "epoch": 4.057404727448143, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0195, + "step": 17530 + }, + { + "epoch": 4.058610709117222, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8717, + "step": 17535 + }, + { + "epoch": 4.0598166907863, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.7809, + "step": 17540 + }, + { + "epoch": 4.061022672455379, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.807, + "step": 17545 + }, + { + "epoch": 4.062228654124457, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9341, + "step": 17550 + }, + { + "epoch": 4.063434635793536, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.016, + "step": 17555 + }, + { + "epoch": 4.064640617462614, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.952, + "step": 17560 + }, + { + "epoch": 4.065846599131693, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8231, + "step": 17565 + }, + { + "epoch": 4.067052580800772, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9479, + "step": 17570 + }, + { + "epoch": 4.06825856246985, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 3.9743, + "step": 17575 + }, + { + "epoch": 4.069464544138929, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9869, + "step": 17580 + }, + { + "epoch": 4.070670525808008, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8977, + "step": 17585 + }, + { + "epoch": 4.071876507477087, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8267, + "step": 17590 + }, + { + "epoch": 4.073082489146165, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0622, + "step": 17595 + }, + { + "epoch": 4.074288470815244, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.9417, + "step": 17600 + }, + { + "epoch": 4.075494452484322, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9521, + "step": 17605 + }, + { + "epoch": 4.076700434153401, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.934, + "step": 17610 + }, + { + "epoch": 4.077906415822479, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9232, + "step": 17615 + }, + { + "epoch": 4.079112397491558, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.7429, + "step": 17620 + }, + { + "epoch": 4.080318379160636, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.8876, + "step": 17625 + }, + { + "epoch": 4.081524360829715, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9486, + "step": 17630 + }, + { + "epoch": 4.082730342498794, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0074, + "step": 17635 + }, + { + "epoch": 4.0839363241678726, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 4.0851, + "step": 17640 + }, + { + "epoch": 4.085142305836952, + "grad_norm": 3.296875, + "learning_rate": 3e-05, + "loss": 4.0495, + "step": 17645 + }, + { + "epoch": 4.08634828750603, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9278, + "step": 17650 + }, + { + "epoch": 4.087554269175109, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0431, + "step": 17655 + }, + { + "epoch": 4.088760250844187, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9829, + "step": 17660 + }, + { + "epoch": 4.089966232513266, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8358, + "step": 17665 + }, + { + "epoch": 4.091172214182344, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9264, + "step": 17670 + }, + { + "epoch": 4.092378195851423, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8923, + "step": 17675 + }, + { + "epoch": 4.093584177520501, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9424, + "step": 17680 + }, + { + "epoch": 4.09479015918958, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.927, + "step": 17685 + }, + { + "epoch": 4.095996140858659, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0665, + "step": 17690 + }, + { + "epoch": 4.0972021225277375, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8743, + "step": 17695 + }, + { + "epoch": 4.098408104196817, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.0551, + "step": 17700 + }, + { + "epoch": 4.099614085865895, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1643, + "step": 17705 + }, + { + "epoch": 4.100820067534974, + "grad_norm": 3.65625, + "learning_rate": 3e-05, + "loss": 3.8589, + "step": 17710 + }, + { + "epoch": 4.102026049204052, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8367, + "step": 17715 + }, + { + "epoch": 4.103232030873131, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0369, + "step": 17720 + }, + { + "epoch": 4.104438012542209, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0402, + "step": 17725 + }, + { + "epoch": 4.105643994211288, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9079, + "step": 17730 + }, + { + "epoch": 4.106849975880366, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9417, + "step": 17735 + }, + { + "epoch": 4.108055957549445, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.9048, + "step": 17740 + }, + { + "epoch": 4.1092619392185235, + "grad_norm": 3.59375, + "learning_rate": 3e-05, + "loss": 4.0145, + "step": 17745 + }, + { + "epoch": 4.1104679208876025, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9552, + "step": 17750 + }, + { + "epoch": 4.1116739025566815, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9672, + "step": 17755 + }, + { + "epoch": 4.11287988422576, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0467, + "step": 17760 + }, + { + "epoch": 4.114085865894839, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9248, + "step": 17765 + }, + { + "epoch": 4.115291847563917, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0002, + "step": 17770 + }, + { + "epoch": 4.116497829232996, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 4.0882, + "step": 17775 + }, + { + "epoch": 4.117703810902074, + "grad_norm": 3.3125, + "learning_rate": 3e-05, + "loss": 4.0125, + "step": 17780 + }, + { + "epoch": 4.118909792571153, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9268, + "step": 17785 + }, + { + "epoch": 4.120115774240231, + "grad_norm": 3.765625, + "learning_rate": 3e-05, + "loss": 3.8774, + "step": 17790 + }, + { + "epoch": 4.12132175590931, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9526, + "step": 17795 + }, + { + "epoch": 4.122527737578388, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8421, + "step": 17800 + }, + { + "epoch": 4.1237337192474675, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9007, + "step": 17805 + }, + { + "epoch": 4.1249397009165465, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8851, + "step": 17810 + }, + { + "epoch": 4.126145682585625, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0274, + "step": 17815 + }, + { + "epoch": 4.127351664254704, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9692, + "step": 17820 + }, + { + "epoch": 4.128557645923782, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9722, + "step": 17825 + }, + { + "epoch": 4.129763627592861, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9472, + "step": 17830 + }, + { + "epoch": 4.130969609261939, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.8693, + "step": 17835 + }, + { + "epoch": 4.132175590931018, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.992, + "step": 17840 + }, + { + "epoch": 4.133381572600096, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0836, + "step": 17845 + }, + { + "epoch": 4.134587554269175, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9054, + "step": 17850 + }, + { + "epoch": 4.135793535938253, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0747, + "step": 17855 + }, + { + "epoch": 4.136999517607332, + "grad_norm": 2.046875, + "learning_rate": 3e-05, + "loss": 3.877, + "step": 17860 + }, + { + "epoch": 4.138205499276411, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8684, + "step": 17865 + }, + { + "epoch": 4.13941148094549, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0057, + "step": 17870 + }, + { + "epoch": 4.140617462614569, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9937, + "step": 17875 + }, + { + "epoch": 4.141823444283647, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9655, + "step": 17880 + }, + { + "epoch": 4.143029425952726, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8509, + "step": 17885 + }, + { + "epoch": 4.144235407621804, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.945, + "step": 17890 + }, + { + "epoch": 4.145441389290883, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.882, + "step": 17895 + }, + { + "epoch": 4.146647370959961, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0138, + "step": 17900 + }, + { + "epoch": 4.14785335262904, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.9427, + "step": 17905 + }, + { + "epoch": 4.149059334298118, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8479, + "step": 17910 + }, + { + "epoch": 4.150265315967197, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9487, + "step": 17915 + }, + { + "epoch": 4.1514712976362755, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.7906, + "step": 17920 + }, + { + "epoch": 4.152677279305355, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8365, + "step": 17925 + }, + { + "epoch": 4.153883260974434, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9746, + "step": 17930 + }, + { + "epoch": 4.155089242643512, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0496, + "step": 17935 + }, + { + "epoch": 4.156295224312591, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9872, + "step": 17940 + }, + { + "epoch": 4.157501205981669, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0278, + "step": 17945 + }, + { + "epoch": 4.158707187650748, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.1542, + "step": 17950 + }, + { + "epoch": 4.159913169319826, + "grad_norm": 3.21875, + "learning_rate": 3e-05, + "loss": 3.9273, + "step": 17955 + }, + { + "epoch": 4.161119150988905, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.8292, + "step": 17960 + }, + { + "epoch": 4.162325132657983, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.7716, + "step": 17965 + }, + { + "epoch": 4.163531114327062, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0024, + "step": 17970 + }, + { + "epoch": 4.1647370959961405, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.9923, + "step": 17975 + }, + { + "epoch": 4.1659430776652195, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 4.0109, + "step": 17980 + }, + { + "epoch": 4.167149059334298, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9655, + "step": 17985 + }, + { + "epoch": 4.168355041003377, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.7403, + "step": 17990 + }, + { + "epoch": 4.169561022672456, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9884, + "step": 17995 + }, + { + "epoch": 4.170767004341534, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9317, + "step": 18000 + }, + { + "epoch": 4.171972986010613, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.8992, + "step": 18005 + }, + { + "epoch": 4.173178967679691, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9805, + "step": 18010 + }, + { + "epoch": 4.17438494934877, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0573, + "step": 18015 + }, + { + "epoch": 4.175590931017848, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8974, + "step": 18020 + }, + { + "epoch": 4.176796912686927, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0002, + "step": 18025 + }, + { + "epoch": 4.1780028943560055, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8932, + "step": 18030 + }, + { + "epoch": 4.1792088760250845, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.8835, + "step": 18035 + }, + { + "epoch": 4.180414857694163, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0431, + "step": 18040 + }, + { + "epoch": 4.181620839363242, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9927, + "step": 18045 + }, + { + "epoch": 4.182826821032321, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.9982, + "step": 18050 + }, + { + "epoch": 4.184032802701399, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9321, + "step": 18055 + }, + { + "epoch": 4.185238784370478, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.8708, + "step": 18060 + }, + { + "epoch": 4.186444766039556, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9317, + "step": 18065 + }, + { + "epoch": 4.187650747708635, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9679, + "step": 18070 + }, + { + "epoch": 4.188856729377713, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9209, + "step": 18075 + }, + { + "epoch": 4.190062711046792, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.1022, + "step": 18080 + }, + { + "epoch": 4.19126869271587, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9368, + "step": 18085 + }, + { + "epoch": 4.1924746743849495, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.1196, + "step": 18090 + }, + { + "epoch": 4.193680656054028, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0144, + "step": 18095 + }, + { + "epoch": 4.194886637723107, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8412, + "step": 18100 + }, + { + "epoch": 4.196092619392186, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9223, + "step": 18105 + }, + { + "epoch": 4.197298601061264, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8457, + "step": 18110 + }, + { + "epoch": 4.198504582730343, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8579, + "step": 18115 + }, + { + "epoch": 4.199710564399421, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.0089, + "step": 18120 + }, + { + "epoch": 4.2009165460685, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9215, + "step": 18125 + }, + { + "epoch": 4.202122527737578, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.8947, + "step": 18130 + }, + { + "epoch": 4.203328509406657, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.132, + "step": 18135 + }, + { + "epoch": 4.204534491075735, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0845, + "step": 18140 + }, + { + "epoch": 4.205740472744814, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0138, + "step": 18145 + }, + { + "epoch": 4.206946454413893, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.7772, + "step": 18150 + }, + { + "epoch": 4.208152436082972, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0942, + "step": 18155 + }, + { + "epoch": 4.20935841775205, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9259, + "step": 18160 + }, + { + "epoch": 4.210564399421129, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.0173, + "step": 18165 + }, + { + "epoch": 4.211770381090208, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9079, + "step": 18170 + }, + { + "epoch": 4.212976362759286, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9434, + "step": 18175 + }, + { + "epoch": 4.214182344428365, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.8833, + "step": 18180 + }, + { + "epoch": 4.215388326097443, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.997, + "step": 18185 + }, + { + "epoch": 4.216594307766522, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0224, + "step": 18190 + }, + { + "epoch": 4.2178002894356, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9269, + "step": 18195 + }, + { + "epoch": 4.219006271104679, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8506, + "step": 18200 + }, + { + "epoch": 4.2202122527737576, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9768, + "step": 18205 + }, + { + "epoch": 4.221418234442837, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9676, + "step": 18210 + }, + { + "epoch": 4.222624216111915, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.7846, + "step": 18215 + }, + { + "epoch": 4.223830197780994, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.979, + "step": 18220 + }, + { + "epoch": 4.225036179450072, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9122, + "step": 18225 + }, + { + "epoch": 4.226242161119151, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8589, + "step": 18230 + }, + { + "epoch": 4.22744814278823, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9471, + "step": 18235 + }, + { + "epoch": 4.228654124457308, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8577, + "step": 18240 + }, + { + "epoch": 4.229860106126387, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.794, + "step": 18245 + }, + { + "epoch": 4.231066087795465, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9315, + "step": 18250 + }, + { + "epoch": 4.232272069464544, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9148, + "step": 18255 + }, + { + "epoch": 4.2334780511336225, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0313, + "step": 18260 + }, + { + "epoch": 4.234684032802702, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.2064, + "step": 18265 + }, + { + "epoch": 4.23589001447178, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8789, + "step": 18270 + }, + { + "epoch": 4.237095996140859, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.9149, + "step": 18275 + }, + { + "epoch": 4.238301977809937, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9243, + "step": 18280 + }, + { + "epoch": 4.239507959479016, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9675, + "step": 18285 + }, + { + "epoch": 4.240713941148095, + "grad_norm": 5.375, + "learning_rate": 3e-05, + "loss": 3.8315, + "step": 18290 + }, + { + "epoch": 4.241919922817173, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.919, + "step": 18295 + }, + { + "epoch": 4.243125904486252, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1006, + "step": 18300 + }, + { + "epoch": 4.24433188615533, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.896, + "step": 18305 + }, + { + "epoch": 4.245537867824409, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0299, + "step": 18310 + }, + { + "epoch": 4.2467438494934875, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9486, + "step": 18315 + }, + { + "epoch": 4.2479498311625665, + "grad_norm": 3.53125, + "learning_rate": 3e-05, + "loss": 3.9588, + "step": 18320 + }, + { + "epoch": 4.249155812831645, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.7608, + "step": 18325 + }, + { + "epoch": 4.250361794500724, + "grad_norm": 3.28125, + "learning_rate": 3e-05, + "loss": 4.0113, + "step": 18330 + }, + { + "epoch": 4.251567776169802, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.8923, + "step": 18335 + }, + { + "epoch": 4.252773757838881, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9763, + "step": 18340 + }, + { + "epoch": 4.25397973950796, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.809, + "step": 18345 + }, + { + "epoch": 4.255185721177038, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0002, + "step": 18350 + }, + { + "epoch": 4.256391702846117, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.8559, + "step": 18355 + }, + { + "epoch": 4.257597684515195, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9349, + "step": 18360 + }, + { + "epoch": 4.258803666184274, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.9087, + "step": 18365 + }, + { + "epoch": 4.2600096478533525, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9506, + "step": 18370 + }, + { + "epoch": 4.2612156295224315, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.9924, + "step": 18375 + }, + { + "epoch": 4.26242161119151, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9285, + "step": 18380 + }, + { + "epoch": 4.263627592860589, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8252, + "step": 18385 + }, + { + "epoch": 4.264833574529667, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9538, + "step": 18390 + }, + { + "epoch": 4.266039556198746, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9166, + "step": 18395 + }, + { + "epoch": 4.267245537867824, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.7875, + "step": 18400 + }, + { + "epoch": 4.268451519536903, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.947, + "step": 18405 + }, + { + "epoch": 4.269657501205982, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9321, + "step": 18410 + }, + { + "epoch": 4.27086348287506, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.8441, + "step": 18415 + }, + { + "epoch": 4.272069464544139, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9337, + "step": 18420 + }, + { + "epoch": 4.273275446213217, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.802, + "step": 18425 + }, + { + "epoch": 4.2744814278822965, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8029, + "step": 18430 + }, + { + "epoch": 4.275687409551375, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9194, + "step": 18435 + }, + { + "epoch": 4.276893391220454, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8878, + "step": 18440 + }, + { + "epoch": 4.278099372889532, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0155, + "step": 18445 + }, + { + "epoch": 4.279305354558611, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.1717, + "step": 18450 + }, + { + "epoch": 4.280511336227689, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.8489, + "step": 18455 + }, + { + "epoch": 4.281717317896768, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.9181, + "step": 18460 + }, + { + "epoch": 4.282923299565846, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9272, + "step": 18465 + }, + { + "epoch": 4.284129281234925, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8896, + "step": 18470 + }, + { + "epoch": 4.285335262904004, + "grad_norm": 5.03125, + "learning_rate": 3e-05, + "loss": 3.8618, + "step": 18475 + }, + { + "epoch": 4.286541244573082, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8792, + "step": 18480 + }, + { + "epoch": 4.287747226242161, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0755, + "step": 18485 + }, + { + "epoch": 4.28895320791124, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.8929, + "step": 18490 + }, + { + "epoch": 4.290159189580319, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.9933, + "step": 18495 + }, + { + "epoch": 4.291365171249397, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9579, + "step": 18500 + }, + { + "epoch": 4.292571152918476, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.937, + "step": 18505 + }, + { + "epoch": 4.293777134587554, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.7516, + "step": 18510 + }, + { + "epoch": 4.294983116256633, + "grad_norm": 3.390625, + "learning_rate": 3e-05, + "loss": 3.9543, + "step": 18515 + }, + { + "epoch": 4.296189097925711, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.931, + "step": 18520 + }, + { + "epoch": 4.29739507959479, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.941, + "step": 18525 + }, + { + "epoch": 4.298601061263869, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.848, + "step": 18530 + }, + { + "epoch": 4.299807042932947, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.9776, + "step": 18535 + }, + { + "epoch": 4.301013024602026, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9061, + "step": 18540 + }, + { + "epoch": 4.3022190062711045, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 4.062, + "step": 18545 + }, + { + "epoch": 4.303424987940184, + "grad_norm": 3.4375, + "learning_rate": 3e-05, + "loss": 3.907, + "step": 18550 + }, + { + "epoch": 4.304630969609262, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.8085, + "step": 18555 + }, + { + "epoch": 4.305836951278341, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.83, + "step": 18560 + }, + { + "epoch": 4.307042932947419, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.9847, + "step": 18565 + }, + { + "epoch": 4.308248914616498, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9728, + "step": 18570 + }, + { + "epoch": 4.309454896285576, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.886, + "step": 18575 + }, + { + "epoch": 4.310660877954655, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.8431, + "step": 18580 + }, + { + "epoch": 4.311866859623734, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.7705, + "step": 18585 + }, + { + "epoch": 4.313072841292812, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0417, + "step": 18590 + }, + { + "epoch": 4.314278822961891, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9488, + "step": 18595 + }, + { + "epoch": 4.3154848046309695, + "grad_norm": 3.5, + "learning_rate": 3e-05, + "loss": 3.9963, + "step": 18600 + }, + { + "epoch": 4.3166907863000485, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0354, + "step": 18605 + }, + { + "epoch": 4.317896767969127, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9202, + "step": 18610 + }, + { + "epoch": 4.319102749638206, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9808, + "step": 18615 + }, + { + "epoch": 4.320308731307284, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8379, + "step": 18620 + }, + { + "epoch": 4.321514712976363, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8134, + "step": 18625 + }, + { + "epoch": 4.322720694645441, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9399, + "step": 18630 + }, + { + "epoch": 4.32392667631452, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9967, + "step": 18635 + }, + { + "epoch": 4.325132657983598, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9444, + "step": 18640 + }, + { + "epoch": 4.326338639652677, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9807, + "step": 18645 + }, + { + "epoch": 4.327544621321756, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0261, + "step": 18650 + }, + { + "epoch": 4.3287506029908345, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9628, + "step": 18655 + }, + { + "epoch": 4.3299565846599135, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.936, + "step": 18660 + }, + { + "epoch": 4.331162566328992, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0522, + "step": 18665 + }, + { + "epoch": 4.332368547998071, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.9081, + "step": 18670 + }, + { + "epoch": 4.333574529667149, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8924, + "step": 18675 + }, + { + "epoch": 4.334780511336228, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9114, + "step": 18680 + }, + { + "epoch": 4.335986493005306, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9564, + "step": 18685 + }, + { + "epoch": 4.337192474674385, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.9293, + "step": 18690 + }, + { + "epoch": 4.338398456343463, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0191, + "step": 18695 + }, + { + "epoch": 4.339604438012542, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0384, + "step": 18700 + }, + { + "epoch": 4.34081041968162, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.0502, + "step": 18705 + }, + { + "epoch": 4.342016401350699, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.926, + "step": 18710 + }, + { + "epoch": 4.3432223830197785, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9212, + "step": 18715 + }, + { + "epoch": 4.344428364688857, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 3.9521, + "step": 18720 + }, + { + "epoch": 4.345634346357936, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 4.0382, + "step": 18725 + }, + { + "epoch": 4.346840328027014, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9874, + "step": 18730 + }, + { + "epoch": 4.348046309696093, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 4.0693, + "step": 18735 + }, + { + "epoch": 4.349252291365171, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0806, + "step": 18740 + }, + { + "epoch": 4.35045827303425, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8479, + "step": 18745 + }, + { + "epoch": 4.351664254703328, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8612, + "step": 18750 + }, + { + "epoch": 4.352870236372407, + "grad_norm": 3.390625, + "learning_rate": 3e-05, + "loss": 4.1342, + "step": 18755 + }, + { + "epoch": 4.354076218041485, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9161, + "step": 18760 + }, + { + "epoch": 4.355282199710564, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0085, + "step": 18765 + }, + { + "epoch": 4.356488181379643, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9504, + "step": 18770 + }, + { + "epoch": 4.357694163048722, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8287, + "step": 18775 + }, + { + "epoch": 4.358900144717801, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.9961, + "step": 18780 + }, + { + "epoch": 4.360106126386879, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9507, + "step": 18785 + }, + { + "epoch": 4.361312108055958, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.8648, + "step": 18790 + }, + { + "epoch": 4.362518089725036, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.9403, + "step": 18795 + }, + { + "epoch": 4.363724071394115, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8749, + "step": 18800 + }, + { + "epoch": 4.364930053063193, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9702, + "step": 18805 + }, + { + "epoch": 4.366136034732272, + "grad_norm": 3.3125, + "learning_rate": 3e-05, + "loss": 3.8223, + "step": 18810 + }, + { + "epoch": 4.36734201640135, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.966, + "step": 18815 + }, + { + "epoch": 4.368547998070429, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.1193, + "step": 18820 + }, + { + "epoch": 4.369753979739508, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0578, + "step": 18825 + }, + { + "epoch": 4.370959961408587, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9657, + "step": 18830 + }, + { + "epoch": 4.372165943077666, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8123, + "step": 18835 + }, + { + "epoch": 4.373371924746744, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9681, + "step": 18840 + }, + { + "epoch": 4.374577906415823, + "grad_norm": 3.28125, + "learning_rate": 3e-05, + "loss": 3.8817, + "step": 18845 + }, + { + "epoch": 4.375783888084901, + "grad_norm": 3.609375, + "learning_rate": 3e-05, + "loss": 3.9669, + "step": 18850 + }, + { + "epoch": 4.37698986975398, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.86, + "step": 18855 + }, + { + "epoch": 4.378195851423058, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.7974, + "step": 18860 + }, + { + "epoch": 4.379401833092137, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9139, + "step": 18865 + }, + { + "epoch": 4.380607814761215, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 4.1553, + "step": 18870 + }, + { + "epoch": 4.381813796430294, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8111, + "step": 18875 + }, + { + "epoch": 4.3830197780993725, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8528, + "step": 18880 + }, + { + "epoch": 4.3842257597684515, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9521, + "step": 18885 + }, + { + "epoch": 4.385431741437531, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8524, + "step": 18890 + }, + { + "epoch": 4.386637723106609, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9402, + "step": 18895 + }, + { + "epoch": 4.387843704775688, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0998, + "step": 18900 + }, + { + "epoch": 4.389049686444766, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9386, + "step": 18905 + }, + { + "epoch": 4.390255668113845, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9202, + "step": 18910 + }, + { + "epoch": 4.391461649782923, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.9179, + "step": 18915 + }, + { + "epoch": 4.392667631452002, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9382, + "step": 18920 + }, + { + "epoch": 4.39387361312108, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9612, + "step": 18925 + }, + { + "epoch": 4.395079594790159, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.891, + "step": 18930 + }, + { + "epoch": 4.3962855764592375, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9813, + "step": 18935 + }, + { + "epoch": 4.3974915581283165, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9354, + "step": 18940 + }, + { + "epoch": 4.398697539797395, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.9109, + "step": 18945 + }, + { + "epoch": 4.399903521466474, + "grad_norm": 4.4375, + "learning_rate": 3e-05, + "loss": 4.0227, + "step": 18950 + }, + { + "epoch": 4.401109503135553, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.972, + "step": 18955 + }, + { + "epoch": 4.402315484804631, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0383, + "step": 18960 + }, + { + "epoch": 4.40352146647371, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 4.0109, + "step": 18965 + }, + { + "epoch": 4.404727448142788, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0041, + "step": 18970 + }, + { + "epoch": 4.405933429811867, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0996, + "step": 18975 + }, + { + "epoch": 4.407139411480945, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1244, + "step": 18980 + }, + { + "epoch": 4.408345393150024, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.8899, + "step": 18985 + }, + { + "epoch": 4.409551374819102, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9266, + "step": 18990 + }, + { + "epoch": 4.4107573564881815, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0406, + "step": 18995 + }, + { + "epoch": 4.41196333815726, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.8808, + "step": 19000 + }, + { + "epoch": 4.413169319826339, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.8306, + "step": 19005 + }, + { + "epoch": 4.414375301495418, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.8822, + "step": 19010 + }, + { + "epoch": 4.415581283164496, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.102, + "step": 19015 + }, + { + "epoch": 4.416787264833575, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8508, + "step": 19020 + }, + { + "epoch": 4.417993246502653, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.8282, + "step": 19025 + }, + { + "epoch": 4.419199228171732, + "grad_norm": 3.25, + "learning_rate": 3e-05, + "loss": 3.98, + "step": 19030 + }, + { + "epoch": 4.42040520984081, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9701, + "step": 19035 + }, + { + "epoch": 4.421611191509889, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.89, + "step": 19040 + }, + { + "epoch": 4.422817173178967, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0001, + "step": 19045 + }, + { + "epoch": 4.424023154848046, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9654, + "step": 19050 + }, + { + "epoch": 4.425229136517125, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8927, + "step": 19055 + }, + { + "epoch": 4.426435118186204, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8706, + "step": 19060 + }, + { + "epoch": 4.427641099855283, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.8929, + "step": 19065 + }, + { + "epoch": 4.428847081524361, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0062, + "step": 19070 + }, + { + "epoch": 4.43005306319344, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7936, + "step": 19075 + }, + { + "epoch": 4.431259044862518, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9664, + "step": 19080 + }, + { + "epoch": 4.432465026531597, + "grad_norm": 4.1875, + "learning_rate": 3e-05, + "loss": 3.9573, + "step": 19085 + }, + { + "epoch": 4.433671008200675, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0506, + "step": 19090 + }, + { + "epoch": 4.434876989869754, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9315, + "step": 19095 + }, + { + "epoch": 4.436082971538832, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9096, + "step": 19100 + }, + { + "epoch": 4.437288953207911, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8916, + "step": 19105 + }, + { + "epoch": 4.4384949348769895, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9185, + "step": 19110 + }, + { + "epoch": 4.439700916546069, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9442, + "step": 19115 + }, + { + "epoch": 4.440906898215147, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.976, + "step": 19120 + }, + { + "epoch": 4.442112879884226, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9757, + "step": 19125 + }, + { + "epoch": 4.443318861553305, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9799, + "step": 19130 + }, + { + "epoch": 4.444524843222383, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.947, + "step": 19135 + }, + { + "epoch": 4.445730824891462, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9425, + "step": 19140 + }, + { + "epoch": 4.44693680656054, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.9554, + "step": 19145 + }, + { + "epoch": 4.448142788229619, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9967, + "step": 19150 + }, + { + "epoch": 4.449348769898697, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8435, + "step": 19155 + }, + { + "epoch": 4.450554751567776, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0367, + "step": 19160 + }, + { + "epoch": 4.4517607332368545, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9623, + "step": 19165 + }, + { + "epoch": 4.4529667149059335, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9176, + "step": 19170 + }, + { + "epoch": 4.454172696575012, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9261, + "step": 19175 + }, + { + "epoch": 4.455378678244091, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9105, + "step": 19180 + }, + { + "epoch": 4.456584659913169, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8613, + "step": 19185 + }, + { + "epoch": 4.457790641582248, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8935, + "step": 19190 + }, + { + "epoch": 4.458996623251327, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0224, + "step": 19195 + }, + { + "epoch": 4.460202604920405, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9101, + "step": 19200 + }, + { + "epoch": 4.461408586589484, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0449, + "step": 19205 + }, + { + "epoch": 4.462614568258562, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9605, + "step": 19210 + }, + { + "epoch": 4.463820549927641, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.9333, + "step": 19215 + }, + { + "epoch": 4.4650265315967195, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0092, + "step": 19220 + }, + { + "epoch": 4.4662325132657985, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0187, + "step": 19225 + }, + { + "epoch": 4.467438494934877, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 3.9404, + "step": 19230 + }, + { + "epoch": 4.468644476603956, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0867, + "step": 19235 + }, + { + "epoch": 4.469850458273034, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.9912, + "step": 19240 + }, + { + "epoch": 4.471056439942113, + "grad_norm": 3.609375, + "learning_rate": 3e-05, + "loss": 4.0355, + "step": 19245 + }, + { + "epoch": 4.472262421611192, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9804, + "step": 19250 + }, + { + "epoch": 4.47346840328027, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0691, + "step": 19255 + }, + { + "epoch": 4.474674384949349, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9969, + "step": 19260 + }, + { + "epoch": 4.475880366618427, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.0161, + "step": 19265 + }, + { + "epoch": 4.477086348287506, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0143, + "step": 19270 + }, + { + "epoch": 4.478292329956584, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9209, + "step": 19275 + }, + { + "epoch": 4.4794983116256635, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9923, + "step": 19280 + }, + { + "epoch": 4.480704293294742, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.037, + "step": 19285 + }, + { + "epoch": 4.481910274963821, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9443, + "step": 19290 + }, + { + "epoch": 4.483116256632899, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9038, + "step": 19295 + }, + { + "epoch": 4.484322238301978, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.912, + "step": 19300 + }, + { + "epoch": 4.485528219971057, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 3.8101, + "step": 19305 + }, + { + "epoch": 4.486734201640135, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0292, + "step": 19310 + }, + { + "epoch": 4.487940183309214, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8724, + "step": 19315 + }, + { + "epoch": 4.489146164978292, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9854, + "step": 19320 + }, + { + "epoch": 4.490352146647371, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9615, + "step": 19325 + }, + { + "epoch": 4.491558128316449, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.9114, + "step": 19330 + }, + { + "epoch": 4.492764109985528, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9932, + "step": 19335 + }, + { + "epoch": 4.493970091654607, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8948, + "step": 19340 + }, + { + "epoch": 4.495176073323686, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8645, + "step": 19345 + }, + { + "epoch": 4.496382054992764, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9229, + "step": 19350 + }, + { + "epoch": 4.497588036661843, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9432, + "step": 19355 + }, + { + "epoch": 4.498794018330921, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8388, + "step": 19360 + }, + { + "epoch": 4.5, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 3.948, + "step": 19365 + }, + { + "epoch": 4.501205981669079, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.7818, + "step": 19370 + }, + { + "epoch": 4.502411963338157, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9572, + "step": 19375 + }, + { + "epoch": 4.503617945007236, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.8603, + "step": 19380 + }, + { + "epoch": 4.504823926676314, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.9934, + "step": 19385 + }, + { + "epoch": 4.506029908345393, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0172, + "step": 19390 + }, + { + "epoch": 4.507235890014472, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9433, + "step": 19395 + }, + { + "epoch": 4.508441871683551, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0175, + "step": 19400 + }, + { + "epoch": 4.509647853352629, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9929, + "step": 19405 + }, + { + "epoch": 4.510853835021708, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.0433, + "step": 19410 + }, + { + "epoch": 4.512059816690786, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9826, + "step": 19415 + }, + { + "epoch": 4.513265798359865, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 4.0318, + "step": 19420 + }, + { + "epoch": 4.514471780028943, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.9771, + "step": 19425 + }, + { + "epoch": 4.515677761698022, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9192, + "step": 19430 + }, + { + "epoch": 4.516883743367101, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0718, + "step": 19435 + }, + { + "epoch": 4.518089725036179, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.886, + "step": 19440 + }, + { + "epoch": 4.519295706705258, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9867, + "step": 19445 + }, + { + "epoch": 4.5205016883743365, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8466, + "step": 19450 + }, + { + "epoch": 4.521707670043416, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8188, + "step": 19455 + }, + { + "epoch": 4.522913651712494, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8884, + "step": 19460 + }, + { + "epoch": 4.524119633381573, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9647, + "step": 19465 + }, + { + "epoch": 4.525325615050651, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9551, + "step": 19470 + }, + { + "epoch": 4.52653159671973, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9088, + "step": 19475 + }, + { + "epoch": 4.527737578388809, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8979, + "step": 19480 + }, + { + "epoch": 4.528943560057887, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9685, + "step": 19485 + }, + { + "epoch": 4.530149541726965, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8511, + "step": 19490 + }, + { + "epoch": 4.531355523396044, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0053, + "step": 19495 + }, + { + "epoch": 4.532561505065123, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.76, + "step": 19500 + }, + { + "epoch": 4.5337674867342015, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8996, + "step": 19505 + }, + { + "epoch": 4.5349734684032805, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9456, + "step": 19510 + }, + { + "epoch": 4.536179450072359, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8806, + "step": 19515 + }, + { + "epoch": 4.537385431741438, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9195, + "step": 19520 + }, + { + "epoch": 4.538591413410516, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.8215, + "step": 19525 + }, + { + "epoch": 4.539797395079595, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9012, + "step": 19530 + }, + { + "epoch": 4.541003376748673, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8991, + "step": 19535 + }, + { + "epoch": 4.542209358417752, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9642, + "step": 19540 + }, + { + "epoch": 4.543415340086831, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.9493, + "step": 19545 + }, + { + "epoch": 4.544621321755909, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.839, + "step": 19550 + }, + { + "epoch": 4.545827303424988, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9932, + "step": 19555 + }, + { + "epoch": 4.5470332850940665, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.932, + "step": 19560 + }, + { + "epoch": 4.5482392667631455, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.1326, + "step": 19565 + }, + { + "epoch": 4.549445248432224, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 3.7192, + "step": 19570 + }, + { + "epoch": 4.550651230101303, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.9293, + "step": 19575 + }, + { + "epoch": 4.551857211770381, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1116, + "step": 19580 + }, + { + "epoch": 4.55306319343946, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.917, + "step": 19585 + }, + { + "epoch": 4.554269175108538, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.1351, + "step": 19590 + }, + { + "epoch": 4.555475156777617, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9761, + "step": 19595 + }, + { + "epoch": 4.556681138446695, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.849, + "step": 19600 + }, + { + "epoch": 4.557887120115774, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8289, + "step": 19605 + }, + { + "epoch": 4.559093101784853, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0511, + "step": 19610 + }, + { + "epoch": 4.560299083453931, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0382, + "step": 19615 + }, + { + "epoch": 4.5615050651230105, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9575, + "step": 19620 + }, + { + "epoch": 4.562711046792089, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8191, + "step": 19625 + }, + { + "epoch": 4.563917028461168, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.901, + "step": 19630 + }, + { + "epoch": 4.565123010130246, + "grad_norm": 3.484375, + "learning_rate": 3e-05, + "loss": 3.766, + "step": 19635 + }, + { + "epoch": 4.566328991799325, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.9071, + "step": 19640 + }, + { + "epoch": 4.567534973468403, + "grad_norm": 3.390625, + "learning_rate": 3e-05, + "loss": 4.0219, + "step": 19645 + }, + { + "epoch": 4.568740955137482, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.1069, + "step": 19650 + }, + { + "epoch": 4.56994693680656, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.8216, + "step": 19655 + }, + { + "epoch": 4.571152918475639, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9227, + "step": 19660 + }, + { + "epoch": 4.572358900144717, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1128, + "step": 19665 + }, + { + "epoch": 4.573564881813796, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 3.8642, + "step": 19670 + }, + { + "epoch": 4.574770863482875, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.8425, + "step": 19675 + }, + { + "epoch": 4.575976845151954, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0141, + "step": 19680 + }, + { + "epoch": 4.577182826821033, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9448, + "step": 19685 + }, + { + "epoch": 4.578388808490111, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.0479, + "step": 19690 + }, + { + "epoch": 4.57959479015919, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.9816, + "step": 19695 + }, + { + "epoch": 4.580800771828268, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9442, + "step": 19700 + }, + { + "epoch": 4.582006753497347, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.9765, + "step": 19705 + }, + { + "epoch": 4.583212735166425, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0233, + "step": 19710 + }, + { + "epoch": 4.584418716835504, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8763, + "step": 19715 + }, + { + "epoch": 4.585624698504583, + "grad_norm": 4.0, + "learning_rate": 3e-05, + "loss": 3.9905, + "step": 19720 + }, + { + "epoch": 4.586830680173661, + "grad_norm": 3.453125, + "learning_rate": 3e-05, + "loss": 3.9975, + "step": 19725 + }, + { + "epoch": 4.5880366618427395, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 3.9168, + "step": 19730 + }, + { + "epoch": 4.5892426435118185, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9945, + "step": 19735 + }, + { + "epoch": 4.590448625180898, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0105, + "step": 19740 + }, + { + "epoch": 4.591654606849976, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.8938, + "step": 19745 + }, + { + "epoch": 4.592860588519055, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.0372, + "step": 19750 + }, + { + "epoch": 4.594066570188133, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9091, + "step": 19755 + }, + { + "epoch": 4.595272551857212, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9619, + "step": 19760 + }, + { + "epoch": 4.59647853352629, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9201, + "step": 19765 + }, + { + "epoch": 4.597684515195369, + "grad_norm": 3.375, + "learning_rate": 3e-05, + "loss": 3.8557, + "step": 19770 + }, + { + "epoch": 4.598890496864447, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9338, + "step": 19775 + }, + { + "epoch": 4.600096478533526, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9232, + "step": 19780 + }, + { + "epoch": 4.601302460202605, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9607, + "step": 19785 + }, + { + "epoch": 4.6025084418716835, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9891, + "step": 19790 + }, + { + "epoch": 4.6037144235407625, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9689, + "step": 19795 + }, + { + "epoch": 4.604920405209841, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0237, + "step": 19800 + }, + { + "epoch": 4.60612638687892, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9795, + "step": 19805 + }, + { + "epoch": 4.607332368547998, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9382, + "step": 19810 + }, + { + "epoch": 4.608538350217077, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.924, + "step": 19815 + }, + { + "epoch": 4.609744331886155, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0634, + "step": 19820 + }, + { + "epoch": 4.610950313555234, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.9434, + "step": 19825 + }, + { + "epoch": 4.612156295224312, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0461, + "step": 19830 + }, + { + "epoch": 4.613362276893391, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0901, + "step": 19835 + }, + { + "epoch": 4.614568258562469, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.075, + "step": 19840 + }, + { + "epoch": 4.6157742402315485, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9903, + "step": 19845 + }, + { + "epoch": 4.6169802219006275, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.7537, + "step": 19850 + }, + { + "epoch": 4.618186203569706, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.8785, + "step": 19855 + }, + { + "epoch": 4.619392185238785, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 4.0932, + "step": 19860 + }, + { + "epoch": 4.620598166907863, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9081, + "step": 19865 + }, + { + "epoch": 4.621804148576942, + "grad_norm": 4.0625, + "learning_rate": 3e-05, + "loss": 3.9645, + "step": 19870 + }, + { + "epoch": 4.62301013024602, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.002, + "step": 19875 + }, + { + "epoch": 4.624216111915099, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 3.7916, + "step": 19880 + }, + { + "epoch": 4.625422093584177, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0679, + "step": 19885 + }, + { + "epoch": 4.626628075253256, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8554, + "step": 19890 + }, + { + "epoch": 4.627834056922334, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.8759, + "step": 19895 + }, + { + "epoch": 4.629040038591413, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1692, + "step": 19900 + }, + { + "epoch": 4.630246020260492, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0769, + "step": 19905 + }, + { + "epoch": 4.631452001929571, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9471, + "step": 19910 + }, + { + "epoch": 4.63265798359865, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9627, + "step": 19915 + }, + { + "epoch": 4.633863965267728, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.9123, + "step": 19920 + }, + { + "epoch": 4.635069946936807, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9039, + "step": 19925 + }, + { + "epoch": 4.636275928605885, + "grad_norm": 3.34375, + "learning_rate": 3e-05, + "loss": 3.9443, + "step": 19930 + }, + { + "epoch": 4.637481910274964, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8838, + "step": 19935 + }, + { + "epoch": 4.638687891944042, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8633, + "step": 19940 + }, + { + "epoch": 4.639893873613121, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0481, + "step": 19945 + }, + { + "epoch": 4.641099855282199, + "grad_norm": 3.40625, + "learning_rate": 3e-05, + "loss": 3.9399, + "step": 19950 + }, + { + "epoch": 4.642305836951278, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.077, + "step": 19955 + }, + { + "epoch": 4.6435118186203574, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.1191, + "step": 19960 + }, + { + "epoch": 4.644717800289436, + "grad_norm": 3.71875, + "learning_rate": 3e-05, + "loss": 3.9741, + "step": 19965 + }, + { + "epoch": 4.645923781958514, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.8167, + "step": 19970 + }, + { + "epoch": 4.647129763627593, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8412, + "step": 19975 + }, + { + "epoch": 4.648335745296672, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9654, + "step": 19980 + }, + { + "epoch": 4.64954172696575, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.7964, + "step": 19985 + }, + { + "epoch": 4.650747708634829, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.8671, + "step": 19990 + }, + { + "epoch": 4.651953690303907, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.0256, + "step": 19995 + }, + { + "epoch": 4.653159671972986, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 4.0187, + "step": 20000 + }, + { + "epoch": 4.654365653642064, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8417, + "step": 20005 + }, + { + "epoch": 4.655571635311143, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0317, + "step": 20010 + }, + { + "epoch": 4.6567776169802215, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8415, + "step": 20015 + }, + { + "epoch": 4.657983598649301, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0008, + "step": 20020 + }, + { + "epoch": 4.65918958031838, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 4.171, + "step": 20025 + }, + { + "epoch": 4.660395561987458, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0814, + "step": 20030 + }, + { + "epoch": 4.661601543656537, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9164, + "step": 20035 + }, + { + "epoch": 4.662807525325615, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.043, + "step": 20040 + }, + { + "epoch": 4.664013506994694, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0084, + "step": 20045 + }, + { + "epoch": 4.665219488663772, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8783, + "step": 20050 + }, + { + "epoch": 4.666425470332851, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8936, + "step": 20055 + }, + { + "epoch": 4.667631452001929, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.009, + "step": 20060 + }, + { + "epoch": 4.668837433671008, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0824, + "step": 20065 + }, + { + "epoch": 4.6700434153400865, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0993, + "step": 20070 + }, + { + "epoch": 4.6712493970091655, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.0027, + "step": 20075 + }, + { + "epoch": 4.672455378678244, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8808, + "step": 20080 + }, + { + "epoch": 4.673661360347323, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9794, + "step": 20085 + }, + { + "epoch": 4.674867342016402, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9214, + "step": 20090 + }, + { + "epoch": 4.67607332368548, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9029, + "step": 20095 + }, + { + "epoch": 4.677279305354559, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.9026, + "step": 20100 + }, + { + "epoch": 4.678485287023637, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8744, + "step": 20105 + }, + { + "epoch": 4.679691268692716, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0173, + "step": 20110 + }, + { + "epoch": 4.680897250361794, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.9444, + "step": 20115 + }, + { + "epoch": 4.682103232030873, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9932, + "step": 20120 + }, + { + "epoch": 4.6833092136999515, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.8665, + "step": 20125 + }, + { + "epoch": 4.6845151953690305, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 3.877, + "step": 20130 + }, + { + "epoch": 4.685721177038109, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 4.0382, + "step": 20135 + }, + { + "epoch": 4.686927158707188, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9295, + "step": 20140 + }, + { + "epoch": 4.688133140376266, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8698, + "step": 20145 + }, + { + "epoch": 4.689339122045345, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9039, + "step": 20150 + }, + { + "epoch": 4.690545103714424, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9007, + "step": 20155 + }, + { + "epoch": 4.691751085383502, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.896, + "step": 20160 + }, + { + "epoch": 4.692957067052581, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8963, + "step": 20165 + }, + { + "epoch": 4.694163048721659, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.8924, + "step": 20170 + }, + { + "epoch": 4.695369030390738, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.9467, + "step": 20175 + }, + { + "epoch": 4.696575012059816, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.8768, + "step": 20180 + }, + { + "epoch": 4.6977809937288955, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9075, + "step": 20185 + }, + { + "epoch": 4.698986975397974, + "grad_norm": 3.453125, + "learning_rate": 3e-05, + "loss": 4.0124, + "step": 20190 + }, + { + "epoch": 4.700192957067053, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8964, + "step": 20195 + }, + { + "epoch": 4.701398938736132, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.9017, + "step": 20200 + }, + { + "epoch": 4.70260492040521, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9123, + "step": 20205 + }, + { + "epoch": 4.703810902074288, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8724, + "step": 20210 + }, + { + "epoch": 4.705016883743367, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.9993, + "step": 20215 + }, + { + "epoch": 4.706222865412446, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 3.8621, + "step": 20220 + }, + { + "epoch": 4.707428847081524, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0181, + "step": 20225 + }, + { + "epoch": 4.708634828750603, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9327, + "step": 20230 + }, + { + "epoch": 4.709840810419681, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8933, + "step": 20235 + }, + { + "epoch": 4.71104679208876, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9689, + "step": 20240 + }, + { + "epoch": 4.712252773757839, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.7931, + "step": 20245 + }, + { + "epoch": 4.713458755426918, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.8618, + "step": 20250 + }, + { + "epoch": 4.714664737095996, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0381, + "step": 20255 + }, + { + "epoch": 4.715870718765075, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.9483, + "step": 20260 + }, + { + "epoch": 4.717076700434154, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.933, + "step": 20265 + }, + { + "epoch": 4.718282682103232, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8767, + "step": 20270 + }, + { + "epoch": 4.719488663772311, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.1403, + "step": 20275 + }, + { + "epoch": 4.720694645441389, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9109, + "step": 20280 + }, + { + "epoch": 4.721900627110468, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0063, + "step": 20285 + }, + { + "epoch": 4.723106608779546, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9716, + "step": 20290 + }, + { + "epoch": 4.724312590448625, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.868, + "step": 20295 + }, + { + "epoch": 4.7255185721177035, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.9417, + "step": 20300 + }, + { + "epoch": 4.726724553786783, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8789, + "step": 20305 + }, + { + "epoch": 4.727930535455861, + "grad_norm": 2.109375, + "learning_rate": 3e-05, + "loss": 3.7718, + "step": 20310 + }, + { + "epoch": 4.72913651712494, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.924, + "step": 20315 + }, + { + "epoch": 4.730342498794018, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9154, + "step": 20320 + }, + { + "epoch": 4.731548480463097, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0157, + "step": 20325 + }, + { + "epoch": 4.732754462132176, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9031, + "step": 20330 + }, + { + "epoch": 4.733960443801254, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9127, + "step": 20335 + }, + { + "epoch": 4.735166425470333, + "grad_norm": 3.78125, + "learning_rate": 3e-05, + "loss": 3.935, + "step": 20340 + }, + { + "epoch": 4.736372407139411, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.9078, + "step": 20345 + }, + { + "epoch": 4.73757838880849, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9356, + "step": 20350 + }, + { + "epoch": 4.7387843704775685, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9006, + "step": 20355 + }, + { + "epoch": 4.7399903521466475, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.8435, + "step": 20360 + }, + { + "epoch": 4.741196333815726, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.9632, + "step": 20365 + }, + { + "epoch": 4.742402315484805, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.931, + "step": 20370 + }, + { + "epoch": 4.743608297153883, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0474, + "step": 20375 + }, + { + "epoch": 4.744814278822962, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.9415, + "step": 20380 + }, + { + "epoch": 4.74602026049204, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.961, + "step": 20385 + }, + { + "epoch": 4.747226242161119, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8224, + "step": 20390 + }, + { + "epoch": 4.748432223830198, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.7832, + "step": 20395 + }, + { + "epoch": 4.749638205499276, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8409, + "step": 20400 + }, + { + "epoch": 4.750844187168355, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1794, + "step": 20405 + }, + { + "epoch": 4.7520501688374335, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 4.0168, + "step": 20410 + }, + { + "epoch": 4.7532561505065125, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0308, + "step": 20415 + }, + { + "epoch": 4.754462132175591, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9314, + "step": 20420 + }, + { + "epoch": 4.75566811384467, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.038, + "step": 20425 + }, + { + "epoch": 4.756874095513748, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9364, + "step": 20430 + }, + { + "epoch": 4.758080077182827, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0435, + "step": 20435 + }, + { + "epoch": 4.759286058851906, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9514, + "step": 20440 + }, + { + "epoch": 4.760492040520984, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9252, + "step": 20445 + }, + { + "epoch": 4.761698022190062, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.9576, + "step": 20450 + }, + { + "epoch": 4.762904003859141, + "grad_norm": 3.28125, + "learning_rate": 3e-05, + "loss": 3.9271, + "step": 20455 + }, + { + "epoch": 4.76410998552822, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.8534, + "step": 20460 + }, + { + "epoch": 4.765315967197298, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.858, + "step": 20465 + }, + { + "epoch": 4.7665219488663775, + "grad_norm": 3.5625, + "learning_rate": 3e-05, + "loss": 3.8748, + "step": 20470 + }, + { + "epoch": 4.767727930535456, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9712, + "step": 20475 + }, + { + "epoch": 4.768933912204535, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9745, + "step": 20480 + }, + { + "epoch": 4.770139893873613, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.98, + "step": 20485 + }, + { + "epoch": 4.771345875542692, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8438, + "step": 20490 + }, + { + "epoch": 4.77255185721177, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8551, + "step": 20495 + }, + { + "epoch": 4.773757838880849, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.654, + "step": 20500 + }, + { + "epoch": 4.774963820549928, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 4.0144, + "step": 20505 + }, + { + "epoch": 4.776169802219006, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.1523, + "step": 20510 + }, + { + "epoch": 4.777375783888085, + "grad_norm": 2.140625, + "learning_rate": 3e-05, + "loss": 4.1056, + "step": 20515 + }, + { + "epoch": 4.778581765557163, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9953, + "step": 20520 + }, + { + "epoch": 4.7797877472262424, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.8346, + "step": 20525 + }, + { + "epoch": 4.780993728895321, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0517, + "step": 20530 + }, + { + "epoch": 4.7821997105644, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8489, + "step": 20535 + }, + { + "epoch": 4.783405692233478, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 3.9816, + "step": 20540 + }, + { + "epoch": 4.784611673902557, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9426, + "step": 20545 + }, + { + "epoch": 4.785817655571635, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.9752, + "step": 20550 + }, + { + "epoch": 4.787023637240714, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0684, + "step": 20555 + }, + { + "epoch": 4.788229618909792, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 4.1085, + "step": 20560 + }, + { + "epoch": 4.789435600578871, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9727, + "step": 20565 + }, + { + "epoch": 4.79064158224795, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9538, + "step": 20570 + }, + { + "epoch": 4.791847563917028, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.9423, + "step": 20575 + }, + { + "epoch": 4.793053545586107, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 4.0441, + "step": 20580 + }, + { + "epoch": 4.794259527255186, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9836, + "step": 20585 + }, + { + "epoch": 4.795465508924265, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9774, + "step": 20590 + }, + { + "epoch": 4.796671490593343, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.912, + "step": 20595 + }, + { + "epoch": 4.797877472262422, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9208, + "step": 20600 + }, + { + "epoch": 4.7990834539315, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0449, + "step": 20605 + }, + { + "epoch": 4.800289435600579, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.031, + "step": 20610 + }, + { + "epoch": 4.801495417269657, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8891, + "step": 20615 + }, + { + "epoch": 4.802701398938736, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9214, + "step": 20620 + }, + { + "epoch": 4.803907380607814, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0434, + "step": 20625 + }, + { + "epoch": 4.805113362276893, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9426, + "step": 20630 + }, + { + "epoch": 4.806319343945972, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0018, + "step": 20635 + }, + { + "epoch": 4.8075253256150505, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0681, + "step": 20640 + }, + { + "epoch": 4.80873130728413, + "grad_norm": 4.125, + "learning_rate": 3e-05, + "loss": 3.9522, + "step": 20645 + }, + { + "epoch": 4.809937288953208, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8802, + "step": 20650 + }, + { + "epoch": 4.811143270622287, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8852, + "step": 20655 + }, + { + "epoch": 4.812349252291365, + "grad_norm": 3.375, + "learning_rate": 3e-05, + "loss": 3.9538, + "step": 20660 + }, + { + "epoch": 4.813555233960444, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9321, + "step": 20665 + }, + { + "epoch": 4.814761215629522, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9748, + "step": 20670 + }, + { + "epoch": 4.815967197298601, + "grad_norm": 6.375, + "learning_rate": 3e-05, + "loss": 4.0384, + "step": 20675 + }, + { + "epoch": 4.81717317896768, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.8488, + "step": 20680 + }, + { + "epoch": 4.818379160636758, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 4.0321, + "step": 20685 + }, + { + "epoch": 4.8195851423058365, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.968, + "step": 20690 + }, + { + "epoch": 4.8207911239749155, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.7767, + "step": 20695 + }, + { + "epoch": 4.8219971056439945, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9967, + "step": 20700 + }, + { + "epoch": 4.823203087313073, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.7879, + "step": 20705 + }, + { + "epoch": 4.824409068982152, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.8706, + "step": 20710 + }, + { + "epoch": 4.82561505065123, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0078, + "step": 20715 + }, + { + "epoch": 4.826821032320309, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0377, + "step": 20720 + }, + { + "epoch": 4.828027013989387, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.913, + "step": 20725 + }, + { + "epoch": 4.829232995658466, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9725, + "step": 20730 + }, + { + "epoch": 4.830438977327544, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9714, + "step": 20735 + }, + { + "epoch": 4.831644958996623, + "grad_norm": 3.5625, + "learning_rate": 3e-05, + "loss": 3.9658, + "step": 20740 + }, + { + "epoch": 4.832850940665702, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8669, + "step": 20745 + }, + { + "epoch": 4.8340569223347805, + "grad_norm": 3.65625, + "learning_rate": 3e-05, + "loss": 4.0164, + "step": 20750 + }, + { + "epoch": 4.8352629040038595, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.7978, + "step": 20755 + }, + { + "epoch": 4.836468885672938, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9663, + "step": 20760 + }, + { + "epoch": 4.837674867342017, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0615, + "step": 20765 + }, + { + "epoch": 4.838880849011095, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8136, + "step": 20770 + }, + { + "epoch": 4.840086830680174, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.1187, + "step": 20775 + }, + { + "epoch": 4.841292812349252, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.9137, + "step": 20780 + }, + { + "epoch": 4.842498794018331, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8554, + "step": 20785 + }, + { + "epoch": 4.843704775687409, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9178, + "step": 20790 + }, + { + "epoch": 4.844910757356488, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.758, + "step": 20795 + }, + { + "epoch": 4.846116739025566, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9255, + "step": 20800 + }, + { + "epoch": 4.847322720694645, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0064, + "step": 20805 + }, + { + "epoch": 4.8485287023637245, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0108, + "step": 20810 + }, + { + "epoch": 4.849734684032803, + "grad_norm": 3.703125, + "learning_rate": 3e-05, + "loss": 3.969, + "step": 20815 + }, + { + "epoch": 4.850940665701882, + "grad_norm": 1.96875, + "learning_rate": 3e-05, + "loss": 4.1743, + "step": 20820 + }, + { + "epoch": 4.85214664737096, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9218, + "step": 20825 + }, + { + "epoch": 4.853352629040039, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9121, + "step": 20830 + }, + { + "epoch": 4.854558610709117, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9341, + "step": 20835 + }, + { + "epoch": 4.855764592378196, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8066, + "step": 20840 + }, + { + "epoch": 4.856970574047274, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9598, + "step": 20845 + }, + { + "epoch": 4.858176555716353, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8488, + "step": 20850 + }, + { + "epoch": 4.859382537385431, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9551, + "step": 20855 + }, + { + "epoch": 4.86058851905451, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.8587, + "step": 20860 + }, + { + "epoch": 4.8617945007235885, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8243, + "step": 20865 + }, + { + "epoch": 4.863000482392668, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0493, + "step": 20870 + }, + { + "epoch": 4.864206464061747, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0363, + "step": 20875 + }, + { + "epoch": 4.865412445730825, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 4.0081, + "step": 20880 + }, + { + "epoch": 4.866618427399904, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.958, + "step": 20885 + }, + { + "epoch": 4.867824409068982, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.7951, + "step": 20890 + }, + { + "epoch": 4.869030390738061, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8614, + "step": 20895 + }, + { + "epoch": 4.870236372407139, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 4.03, + "step": 20900 + }, + { + "epoch": 4.871442354076218, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8715, + "step": 20905 + }, + { + "epoch": 4.872648335745296, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.9043, + "step": 20910 + }, + { + "epoch": 4.873854317414375, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9583, + "step": 20915 + }, + { + "epoch": 4.875060299083454, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 3.9146, + "step": 20920 + }, + { + "epoch": 4.8762662807525325, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.8752, + "step": 20925 + }, + { + "epoch": 4.877472262421611, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9535, + "step": 20930 + }, + { + "epoch": 4.87867824409069, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.7113, + "step": 20935 + }, + { + "epoch": 4.879884225759769, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.932, + "step": 20940 + }, + { + "epoch": 4.881090207428847, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.021, + "step": 20945 + }, + { + "epoch": 4.882296189097926, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9005, + "step": 20950 + }, + { + "epoch": 4.883502170767004, + "grad_norm": 1.9296875, + "learning_rate": 3e-05, + "loss": 3.7932, + "step": 20955 + }, + { + "epoch": 4.884708152436083, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.1057, + "step": 20960 + }, + { + "epoch": 4.885914134105161, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9113, + "step": 20965 + }, + { + "epoch": 4.88712011577424, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0753, + "step": 20970 + }, + { + "epoch": 4.8883260974433185, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8068, + "step": 20975 + }, + { + "epoch": 4.8895320791123975, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 4.0359, + "step": 20980 + }, + { + "epoch": 4.8907380607814765, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9581, + "step": 20985 + }, + { + "epoch": 4.891944042450555, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8416, + "step": 20990 + }, + { + "epoch": 4.893150024119634, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 3.996, + "step": 20995 + }, + { + "epoch": 4.894356005788712, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 4.0332, + "step": 21000 + }, + { + "epoch": 4.895561987457791, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8406, + "step": 21005 + }, + { + "epoch": 4.896767969126869, + "grad_norm": 3.53125, + "learning_rate": 3e-05, + "loss": 4.005, + "step": 21010 + }, + { + "epoch": 4.897973950795948, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.7687, + "step": 21015 + }, + { + "epoch": 4.899179932465026, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0207, + "step": 21020 + }, + { + "epoch": 4.900385914134105, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9626, + "step": 21025 + }, + { + "epoch": 4.901591895803183, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 3.8243, + "step": 21030 + }, + { + "epoch": 4.9027978774722625, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 4.0623, + "step": 21035 + }, + { + "epoch": 4.904003859141341, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 4.0195, + "step": 21040 + }, + { + "epoch": 4.90520984081042, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.8954, + "step": 21045 + }, + { + "epoch": 4.906415822479499, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0211, + "step": 21050 + }, + { + "epoch": 4.907621804148577, + "grad_norm": 3.28125, + "learning_rate": 3e-05, + "loss": 3.9705, + "step": 21055 + }, + { + "epoch": 4.908827785817656, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9381, + "step": 21060 + }, + { + "epoch": 4.910033767486734, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9312, + "step": 21065 + }, + { + "epoch": 4.911239749155813, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8781, + "step": 21070 + }, + { + "epoch": 4.912445730824891, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.8277, + "step": 21075 + }, + { + "epoch": 4.91365171249397, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8868, + "step": 21080 + }, + { + "epoch": 4.914857694163048, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8644, + "step": 21085 + }, + { + "epoch": 4.9160636758321274, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.92, + "step": 21090 + }, + { + "epoch": 4.9172696575012065, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8915, + "step": 21095 + }, + { + "epoch": 4.918475639170285, + "grad_norm": 4.1875, + "learning_rate": 3e-05, + "loss": 4.0556, + "step": 21100 + }, + { + "epoch": 4.919681620839363, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9364, + "step": 21105 + }, + { + "epoch": 4.920887602508442, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9488, + "step": 21110 + }, + { + "epoch": 4.922093584177521, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8182, + "step": 21115 + }, + { + "epoch": 4.923299565846599, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8677, + "step": 21120 + }, + { + "epoch": 4.924505547515678, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9054, + "step": 21125 + }, + { + "epoch": 4.925711529184756, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9512, + "step": 21130 + }, + { + "epoch": 4.926917510853835, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.0463, + "step": 21135 + }, + { + "epoch": 4.928123492522913, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.782, + "step": 21140 + }, + { + "epoch": 4.929329474191992, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.8505, + "step": 21145 + }, + { + "epoch": 4.930535455861071, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.1929, + "step": 21150 + }, + { + "epoch": 4.93174143753015, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9182, + "step": 21155 + }, + { + "epoch": 4.932947419199229, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.9292, + "step": 21160 + }, + { + "epoch": 4.934153400868307, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.877, + "step": 21165 + }, + { + "epoch": 4.935359382537386, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.9097, + "step": 21170 + }, + { + "epoch": 4.936565364206464, + "grad_norm": 3.328125, + "learning_rate": 3e-05, + "loss": 3.9824, + "step": 21175 + }, + { + "epoch": 4.937771345875543, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.8611, + "step": 21180 + }, + { + "epoch": 4.938977327544621, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.889, + "step": 21185 + }, + { + "epoch": 4.9401833092137, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9685, + "step": 21190 + }, + { + "epoch": 4.941389290882778, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9564, + "step": 21195 + }, + { + "epoch": 4.942595272551857, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 3.8476, + "step": 21200 + }, + { + "epoch": 4.9438012542209355, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.994, + "step": 21205 + }, + { + "epoch": 4.945007235890015, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9816, + "step": 21210 + }, + { + "epoch": 4.946213217559093, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8729, + "step": 21215 + }, + { + "epoch": 4.947419199228172, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0304, + "step": 21220 + }, + { + "epoch": 4.948625180897251, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8374, + "step": 21225 + }, + { + "epoch": 4.949831162566329, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9099, + "step": 21230 + }, + { + "epoch": 4.951037144235408, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 4.0382, + "step": 21235 + }, + { + "epoch": 4.952243125904486, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8176, + "step": 21240 + }, + { + "epoch": 4.953449107573565, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8897, + "step": 21245 + }, + { + "epoch": 4.954655089242643, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9768, + "step": 21250 + }, + { + "epoch": 4.955861070911722, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8429, + "step": 21255 + }, + { + "epoch": 4.9570670525808005, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.7557, + "step": 21260 + }, + { + "epoch": 4.9582730342498795, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.8978, + "step": 21265 + }, + { + "epoch": 4.959479015918958, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 4.064, + "step": 21270 + }, + { + "epoch": 4.960684997588037, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.7338, + "step": 21275 + }, + { + "epoch": 4.961890979257115, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8576, + "step": 21280 + }, + { + "epoch": 4.963096960926194, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.8683, + "step": 21285 + }, + { + "epoch": 4.964302942595273, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 4.0564, + "step": 21290 + }, + { + "epoch": 4.965508924264351, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 4.1415, + "step": 21295 + }, + { + "epoch": 4.96671490593343, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.7619, + "step": 21300 + }, + { + "epoch": 4.967920887602508, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.8409, + "step": 21305 + }, + { + "epoch": 4.969126869271587, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8675, + "step": 21310 + }, + { + "epoch": 4.9703328509406655, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.805, + "step": 21315 + }, + { + "epoch": 4.9715388326097445, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9616, + "step": 21320 + }, + { + "epoch": 4.972744814278823, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.9983, + "step": 21325 + }, + { + "epoch": 4.973950795947902, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 4.0076, + "step": 21330 + }, + { + "epoch": 4.975156777616981, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8523, + "step": 21335 + }, + { + "epoch": 4.976362759286059, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8214, + "step": 21340 + }, + { + "epoch": 4.977568740955137, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8259, + "step": 21345 + }, + { + "epoch": 4.978774722624216, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9151, + "step": 21350 + }, + { + "epoch": 4.979980704293295, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0295, + "step": 21355 + }, + { + "epoch": 4.981186685962373, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0624, + "step": 21360 + }, + { + "epoch": 4.982392667631452, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.8917, + "step": 21365 + }, + { + "epoch": 4.98359864930053, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.89, + "step": 21370 + }, + { + "epoch": 4.9848046309696095, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.857, + "step": 21375 + }, + { + "epoch": 4.986010612638688, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.8069, + "step": 21380 + }, + { + "epoch": 4.987216594307767, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.7648, + "step": 21385 + }, + { + "epoch": 4.988422575976845, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0919, + "step": 21390 + }, + { + "epoch": 4.989628557645924, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.956, + "step": 21395 + }, + { + "epoch": 4.990834539315003, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8665, + "step": 21400 + }, + { + "epoch": 4.992040520984081, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8915, + "step": 21405 + }, + { + "epoch": 4.99324650265316, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.0173, + "step": 21410 + }, + { + "epoch": 4.994452484322238, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.7982, + "step": 21415 + }, + { + "epoch": 4.995658465991317, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8948, + "step": 21420 + }, + { + "epoch": 4.996864447660395, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.8085, + "step": 21425 + }, + { + "epoch": 4.998070429329474, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.9634, + "step": 21430 + }, + { + "epoch": 4.999276410998553, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 4.0828, + "step": 21435 + }, + { + "epoch": 5.000482392667632, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7937, + "step": 21440 + }, + { + "epoch": 5.00168837433671, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9184, + "step": 21445 + }, + { + "epoch": 5.002894356005789, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9379, + "step": 21450 + }, + { + "epoch": 5.004100337674867, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 4.0123, + "step": 21455 + }, + { + "epoch": 5.005306319343946, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.8683, + "step": 21460 + }, + { + "epoch": 5.006512301013025, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8978, + "step": 21465 + }, + { + "epoch": 5.007718282682103, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8722, + "step": 21470 + }, + { + "epoch": 5.008924264351182, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9295, + "step": 21475 + }, + { + "epoch": 5.01013024602026, + "grad_norm": 3.5625, + "learning_rate": 3e-05, + "loss": 4.1149, + "step": 21480 + }, + { + "epoch": 5.011336227689339, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8613, + "step": 21485 + }, + { + "epoch": 5.0125422093584175, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8367, + "step": 21490 + }, + { + "epoch": 5.013748191027497, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8382, + "step": 21495 + }, + { + "epoch": 5.014954172696575, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9636, + "step": 21500 + }, + { + "epoch": 5.016160154365654, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9675, + "step": 21505 + }, + { + "epoch": 5.017366136034732, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8949, + "step": 21510 + }, + { + "epoch": 5.018572117703811, + "grad_norm": 3.25, + "learning_rate": 3e-05, + "loss": 3.8907, + "step": 21515 + }, + { + "epoch": 5.019778099372889, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8416, + "step": 21520 + }, + { + "epoch": 5.020984081041968, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9238, + "step": 21525 + }, + { + "epoch": 5.022190062711047, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.959, + "step": 21530 + }, + { + "epoch": 5.023396044380125, + "grad_norm": 3.484375, + "learning_rate": 3e-05, + "loss": 3.7752, + "step": 21535 + }, + { + "epoch": 5.024602026049204, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8786, + "step": 21540 + }, + { + "epoch": 5.0258080077182825, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9937, + "step": 21545 + }, + { + "epoch": 5.0270139893873615, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8521, + "step": 21550 + }, + { + "epoch": 5.02821997105644, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9118, + "step": 21555 + }, + { + "epoch": 5.029425952725519, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9575, + "step": 21560 + }, + { + "epoch": 5.030631934394597, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.8176, + "step": 21565 + }, + { + "epoch": 5.031837916063676, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9273, + "step": 21570 + }, + { + "epoch": 5.033043897732754, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9293, + "step": 21575 + }, + { + "epoch": 5.034249879401833, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 3.836, + "step": 21580 + }, + { + "epoch": 5.035455861070912, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8771, + "step": 21585 + }, + { + "epoch": 5.03666184273999, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9086, + "step": 21590 + }, + { + "epoch": 5.037867824409069, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0582, + "step": 21595 + }, + { + "epoch": 5.0390738060781475, + "grad_norm": 2.078125, + "learning_rate": 3e-05, + "loss": 3.7782, + "step": 21600 + }, + { + "epoch": 5.0402797877472265, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.0771, + "step": 21605 + }, + { + "epoch": 5.041485769416305, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9776, + "step": 21610 + }, + { + "epoch": 5.042691751085384, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9155, + "step": 21615 + }, + { + "epoch": 5.043897732754462, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8897, + "step": 21620 + }, + { + "epoch": 5.045103714423541, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.9852, + "step": 21625 + }, + { + "epoch": 5.046309696092619, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9083, + "step": 21630 + }, + { + "epoch": 5.047515677761698, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9973, + "step": 21635 + }, + { + "epoch": 5.048721659430776, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.885, + "step": 21640 + }, + { + "epoch": 5.049927641099855, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8372, + "step": 21645 + }, + { + "epoch": 5.051133622768934, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9772, + "step": 21650 + }, + { + "epoch": 5.052339604438012, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.947, + "step": 21655 + }, + { + "epoch": 5.0535455861070915, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0524, + "step": 21660 + }, + { + "epoch": 5.05475156777617, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9114, + "step": 21665 + }, + { + "epoch": 5.055957549445249, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.845, + "step": 21670 + }, + { + "epoch": 5.057163531114327, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9679, + "step": 21675 + }, + { + "epoch": 5.058369512783406, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.894, + "step": 21680 + }, + { + "epoch": 5.059575494452484, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.7347, + "step": 21685 + }, + { + "epoch": 5.060781476121563, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.7781, + "step": 21690 + }, + { + "epoch": 5.061987457790641, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.8644, + "step": 21695 + }, + { + "epoch": 5.06319343945972, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.898, + "step": 21700 + }, + { + "epoch": 5.064399421128799, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9921, + "step": 21705 + }, + { + "epoch": 5.065605402797877, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.7472, + "step": 21710 + }, + { + "epoch": 5.0668113844669564, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9397, + "step": 21715 + }, + { + "epoch": 5.068017366136035, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8533, + "step": 21720 + }, + { + "epoch": 5.069223347805114, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9585, + "step": 21725 + }, + { + "epoch": 5.070429329474192, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.934, + "step": 21730 + }, + { + "epoch": 5.071635311143271, + "grad_norm": 3.21875, + "learning_rate": 3e-05, + "loss": 3.7228, + "step": 21735 + }, + { + "epoch": 5.072841292812349, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9274, + "step": 21740 + }, + { + "epoch": 5.074047274481428, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9427, + "step": 21745 + }, + { + "epoch": 5.075253256150506, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.865, + "step": 21750 + }, + { + "epoch": 5.076459237819585, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9054, + "step": 21755 + }, + { + "epoch": 5.077665219488663, + "grad_norm": 3.3125, + "learning_rate": 3e-05, + "loss": 3.9283, + "step": 21760 + }, + { + "epoch": 5.078871201157742, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.6981, + "step": 21765 + }, + { + "epoch": 5.080077182826821, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8309, + "step": 21770 + }, + { + "epoch": 5.0812831644959, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9549, + "step": 21775 + }, + { + "epoch": 5.082489146164979, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9689, + "step": 21780 + }, + { + "epoch": 5.083695127834057, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.957, + "step": 21785 + }, + { + "epoch": 5.084901109503136, + "grad_norm": 3.953125, + "learning_rate": 3e-05, + "loss": 3.922, + "step": 21790 + }, + { + "epoch": 5.086107091172214, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.9898, + "step": 21795 + }, + { + "epoch": 5.087313072841293, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9402, + "step": 21800 + }, + { + "epoch": 5.088519054510371, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 4.0113, + "step": 21805 + }, + { + "epoch": 5.08972503617945, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.7446, + "step": 21810 + }, + { + "epoch": 5.090931017848528, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8729, + "step": 21815 + }, + { + "epoch": 5.092136999517607, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9414, + "step": 21820 + }, + { + "epoch": 5.093342981186686, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8305, + "step": 21825 + }, + { + "epoch": 5.0945489628557645, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9127, + "step": 21830 + }, + { + "epoch": 5.095754944524844, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9617, + "step": 21835 + }, + { + "epoch": 5.096960926193922, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9096, + "step": 21840 + }, + { + "epoch": 5.098166907863001, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9059, + "step": 21845 + }, + { + "epoch": 5.099372889532079, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.1668, + "step": 21850 + }, + { + "epoch": 5.100578871201158, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.8048, + "step": 21855 + }, + { + "epoch": 5.101784852870236, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8474, + "step": 21860 + }, + { + "epoch": 5.102990834539315, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 3.8594, + "step": 21865 + }, + { + "epoch": 5.104196816208393, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.093, + "step": 21870 + }, + { + "epoch": 5.105402797877472, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9099, + "step": 21875 + }, + { + "epoch": 5.1066087795465505, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.8213, + "step": 21880 + }, + { + "epoch": 5.1078147612156295, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8325, + "step": 21885 + }, + { + "epoch": 5.1090207428847085, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9916, + "step": 21890 + }, + { + "epoch": 5.110226724553787, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9432, + "step": 21895 + }, + { + "epoch": 5.111432706222866, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.8655, + "step": 21900 + }, + { + "epoch": 5.112638687891944, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.0019, + "step": 21905 + }, + { + "epoch": 5.113844669561023, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8892, + "step": 21910 + }, + { + "epoch": 5.115050651230101, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.94, + "step": 21915 + }, + { + "epoch": 5.11625663289918, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9665, + "step": 21920 + }, + { + "epoch": 5.117462614568258, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.096, + "step": 21925 + }, + { + "epoch": 5.118668596237337, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8654, + "step": 21930 + }, + { + "epoch": 5.119874577906415, + "grad_norm": 3.671875, + "learning_rate": 3e-05, + "loss": 3.8154, + "step": 21935 + }, + { + "epoch": 5.1210805595754945, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9213, + "step": 21940 + }, + { + "epoch": 5.1222865412445735, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8733, + "step": 21945 + }, + { + "epoch": 5.123492522913652, + "grad_norm": 3.390625, + "learning_rate": 3e-05, + "loss": 3.7978, + "step": 21950 + }, + { + "epoch": 5.124698504582731, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.7867, + "step": 21955 + }, + { + "epoch": 5.125904486251809, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0244, + "step": 21960 + }, + { + "epoch": 5.127110467920888, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9427, + "step": 21965 + }, + { + "epoch": 5.128316449589966, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9321, + "step": 21970 + }, + { + "epoch": 5.129522431259045, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.8548, + "step": 21975 + }, + { + "epoch": 5.130728412928123, + "grad_norm": 3.34375, + "learning_rate": 3e-05, + "loss": 3.8351, + "step": 21980 + }, + { + "epoch": 5.131934394597202, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9124, + "step": 21985 + }, + { + "epoch": 5.13314037626628, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 4.0818, + "step": 21990 + }, + { + "epoch": 5.134346357935359, + "grad_norm": 2.171875, + "learning_rate": 3e-05, + "loss": 3.7985, + "step": 21995 + }, + { + "epoch": 5.135552339604438, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.0759, + "step": 22000 + }, + { + "epoch": 5.136758321273517, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 3.8537, + "step": 22005 + }, + { + "epoch": 5.137964302942596, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 3.7829, + "step": 22010 + }, + { + "epoch": 5.139170284611674, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.923, + "step": 22015 + }, + { + "epoch": 5.140376266280753, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9953, + "step": 22020 + }, + { + "epoch": 5.141582247949831, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9761, + "step": 22025 + }, + { + "epoch": 5.14278822961891, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.7898, + "step": 22030 + }, + { + "epoch": 5.143994211287988, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8231, + "step": 22035 + }, + { + "epoch": 5.145200192957067, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.8964, + "step": 22040 + }, + { + "epoch": 5.146406174626145, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9405, + "step": 22045 + }, + { + "epoch": 5.147612156295224, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.9334, + "step": 22050 + }, + { + "epoch": 5.1488181379643025, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8232, + "step": 22055 + }, + { + "epoch": 5.150024119633382, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8534, + "step": 22060 + }, + { + "epoch": 5.151230101302461, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.7698, + "step": 22065 + }, + { + "epoch": 5.152436082971539, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.797, + "step": 22070 + }, + { + "epoch": 5.153642064640618, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.94, + "step": 22075 + }, + { + "epoch": 5.154848046309696, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9255, + "step": 22080 + }, + { + "epoch": 5.156054027978775, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.0454, + "step": 22085 + }, + { + "epoch": 5.157260009647853, + "grad_norm": 3.8125, + "learning_rate": 3e-05, + "loss": 3.9132, + "step": 22090 + }, + { + "epoch": 5.158465991316932, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0732, + "step": 22095 + }, + { + "epoch": 5.15967197298601, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9151, + "step": 22100 + }, + { + "epoch": 5.160877954655089, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.8621, + "step": 22105 + }, + { + "epoch": 5.1620839363241675, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.701, + "step": 22110 + }, + { + "epoch": 5.1632899179932465, + "grad_norm": 2.234375, + "learning_rate": 3e-05, + "loss": 3.9158, + "step": 22115 + }, + { + "epoch": 5.164495899662326, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.9542, + "step": 22120 + }, + { + "epoch": 5.165701881331404, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9618, + "step": 22125 + }, + { + "epoch": 5.166907863000483, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8922, + "step": 22130 + }, + { + "epoch": 5.168113844669561, + "grad_norm": 3.453125, + "learning_rate": 3e-05, + "loss": 3.8107, + "step": 22135 + }, + { + "epoch": 5.16931982633864, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8175, + "step": 22140 + }, + { + "epoch": 5.170525808007718, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9216, + "step": 22145 + }, + { + "epoch": 5.171731789676797, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8877, + "step": 22150 + }, + { + "epoch": 5.172937771345875, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.8928, + "step": 22155 + }, + { + "epoch": 5.174143753014954, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 4.0834, + "step": 22160 + }, + { + "epoch": 5.1753497346840325, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.8334, + "step": 22165 + }, + { + "epoch": 5.1765557163531115, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9299, + "step": 22170 + }, + { + "epoch": 5.17776169802219, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8922, + "step": 22175 + }, + { + "epoch": 5.178967679691269, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8433, + "step": 22180 + }, + { + "epoch": 5.180173661360348, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9261, + "step": 22185 + }, + { + "epoch": 5.181379643029426, + "grad_norm": 3.421875, + "learning_rate": 3e-05, + "loss": 4.0145, + "step": 22190 + }, + { + "epoch": 5.182585624698505, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.978, + "step": 22195 + }, + { + "epoch": 5.183791606367583, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8131, + "step": 22200 + }, + { + "epoch": 5.184997588036662, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.8114, + "step": 22205 + }, + { + "epoch": 5.18620356970574, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9253, + "step": 22210 + }, + { + "epoch": 5.187409551374819, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8948, + "step": 22215 + }, + { + "epoch": 5.188615533043897, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8952, + "step": 22220 + }, + { + "epoch": 5.1898215147129765, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 4.0042, + "step": 22225 + }, + { + "epoch": 5.191027496382055, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8473, + "step": 22230 + }, + { + "epoch": 5.192233478051134, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.1594, + "step": 22235 + }, + { + "epoch": 5.193439459720212, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.927, + "step": 22240 + }, + { + "epoch": 5.194645441389291, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.859, + "step": 22245 + }, + { + "epoch": 5.19585142305837, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8284, + "step": 22250 + }, + { + "epoch": 5.197057404727448, + "grad_norm": 3.203125, + "learning_rate": 3e-05, + "loss": 3.8632, + "step": 22255 + }, + { + "epoch": 5.198263386396527, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.7852, + "step": 22260 + }, + { + "epoch": 5.199469368065605, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9096, + "step": 22265 + }, + { + "epoch": 5.200675349734684, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9676, + "step": 22270 + }, + { + "epoch": 5.201881331403762, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.7712, + "step": 22275 + }, + { + "epoch": 5.2030873130728414, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0899, + "step": 22280 + }, + { + "epoch": 5.20429329474192, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.0446, + "step": 22285 + }, + { + "epoch": 5.205499276410999, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.9552, + "step": 22290 + }, + { + "epoch": 5.206705258080077, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8262, + "step": 22295 + }, + { + "epoch": 5.207911239749156, + "grad_norm": 3.53125, + "learning_rate": 3e-05, + "loss": 3.9497, + "step": 22300 + }, + { + "epoch": 5.209117221418235, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.953, + "step": 22305 + }, + { + "epoch": 5.210323203087313, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9856, + "step": 22310 + }, + { + "epoch": 5.211529184756392, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.7899, + "step": 22315 + }, + { + "epoch": 5.21273516642547, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8281, + "step": 22320 + }, + { + "epoch": 5.213941148094549, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.9231, + "step": 22325 + }, + { + "epoch": 5.215147129763627, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9871, + "step": 22330 + }, + { + "epoch": 5.216353111432706, + "grad_norm": 3.265625, + "learning_rate": 3e-05, + "loss": 3.9694, + "step": 22335 + }, + { + "epoch": 5.217559093101785, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8839, + "step": 22340 + }, + { + "epoch": 5.218765074770864, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.8151, + "step": 22345 + }, + { + "epoch": 5.219971056439942, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8759, + "step": 22350 + }, + { + "epoch": 5.221177038109021, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9434, + "step": 22355 + }, + { + "epoch": 5.2223830197781, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.7774, + "step": 22360 + }, + { + "epoch": 5.223589001447178, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.8739, + "step": 22365 + }, + { + "epoch": 5.224794983116257, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9417, + "step": 22370 + }, + { + "epoch": 5.226000964785335, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8387, + "step": 22375 + }, + { + "epoch": 5.227206946454414, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8646, + "step": 22380 + }, + { + "epoch": 5.228412928123492, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.7746, + "step": 22385 + }, + { + "epoch": 5.229618909792571, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.8178, + "step": 22390 + }, + { + "epoch": 5.2308248914616495, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8928, + "step": 22395 + }, + { + "epoch": 5.232030873130729, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.8171, + "step": 22400 + }, + { + "epoch": 5.233236854799807, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 4.0171, + "step": 22405 + }, + { + "epoch": 5.234442836468886, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 4.0651, + "step": 22410 + }, + { + "epoch": 5.235648818137964, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8623, + "step": 22415 + }, + { + "epoch": 5.236854799807043, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8643, + "step": 22420 + }, + { + "epoch": 5.238060781476122, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8966, + "step": 22425 + }, + { + "epoch": 5.2392667631452, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8332, + "step": 22430 + }, + { + "epoch": 5.240472744814279, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8822, + "step": 22435 + }, + { + "epoch": 5.241678726483357, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8462, + "step": 22440 + }, + { + "epoch": 5.242884708152436, + "grad_norm": 3.28125, + "learning_rate": 3e-05, + "loss": 4.0783, + "step": 22445 + }, + { + "epoch": 5.2440906898215145, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8279, + "step": 22450 + }, + { + "epoch": 5.2452966714905935, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9815, + "step": 22455 + }, + { + "epoch": 5.246502653159672, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8487, + "step": 22460 + }, + { + "epoch": 5.247708634828751, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9799, + "step": 22465 + }, + { + "epoch": 5.248914616497829, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.7613, + "step": 22470 + }, + { + "epoch": 5.250120598166908, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8872, + "step": 22475 + }, + { + "epoch": 5.251326579835986, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8842, + "step": 22480 + }, + { + "epoch": 5.252532561505065, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9607, + "step": 22485 + }, + { + "epoch": 5.253738543174144, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.7689, + "step": 22490 + }, + { + "epoch": 5.254944524843222, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9501, + "step": 22495 + }, + { + "epoch": 5.256150506512301, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.7814, + "step": 22500 + }, + { + "epoch": 5.2573564881813795, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.833, + "step": 22505 + }, + { + "epoch": 5.2585624698504585, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.8897, + "step": 22510 + }, + { + "epoch": 5.259768451519537, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.0058, + "step": 22515 + }, + { + "epoch": 5.260974433188616, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8924, + "step": 22520 + }, + { + "epoch": 5.262180414857694, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9175, + "step": 22525 + }, + { + "epoch": 5.263386396526773, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.7812, + "step": 22530 + }, + { + "epoch": 5.264592378195851, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.8766, + "step": 22535 + }, + { + "epoch": 5.26579835986493, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9242, + "step": 22540 + }, + { + "epoch": 5.267004341534009, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.7476, + "step": 22545 + }, + { + "epoch": 5.268210323203087, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.8466, + "step": 22550 + }, + { + "epoch": 5.269416304872166, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9543, + "step": 22555 + }, + { + "epoch": 5.270622286541244, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.7943, + "step": 22560 + }, + { + "epoch": 5.2718282682103235, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.8315, + "step": 22565 + }, + { + "epoch": 5.273034249879402, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.7683, + "step": 22570 + }, + { + "epoch": 5.274240231548481, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.8152, + "step": 22575 + }, + { + "epoch": 5.275446213217559, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8659, + "step": 22580 + }, + { + "epoch": 5.276652194886638, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.7819, + "step": 22585 + }, + { + "epoch": 5.277858176555716, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 4.0173, + "step": 22590 + }, + { + "epoch": 5.279064158224795, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0548, + "step": 22595 + }, + { + "epoch": 5.280270139893874, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8177, + "step": 22600 + }, + { + "epoch": 5.281476121562952, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.875, + "step": 22605 + }, + { + "epoch": 5.282682103232031, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8638, + "step": 22610 + }, + { + "epoch": 5.283888084901109, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8335, + "step": 22615 + }, + { + "epoch": 5.285094066570188, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.8747, + "step": 22620 + }, + { + "epoch": 5.286300048239267, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8161, + "step": 22625 + }, + { + "epoch": 5.287506029908346, + "grad_norm": 3.328125, + "learning_rate": 3e-05, + "loss": 4.041, + "step": 22630 + }, + { + "epoch": 5.288712011577424, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8093, + "step": 22635 + }, + { + "epoch": 5.289917993246503, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9673, + "step": 22640 + }, + { + "epoch": 5.291123974915581, + "grad_norm": 2.21875, + "learning_rate": 3e-05, + "loss": 3.8403, + "step": 22645 + }, + { + "epoch": 5.29232995658466, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9492, + "step": 22650 + }, + { + "epoch": 5.293535938253738, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.7767, + "step": 22655 + }, + { + "epoch": 5.294741919922817, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8635, + "step": 22660 + }, + { + "epoch": 5.295947901591896, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8812, + "step": 22665 + }, + { + "epoch": 5.297153883260974, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9282, + "step": 22670 + }, + { + "epoch": 5.298359864930053, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.7618, + "step": 22675 + }, + { + "epoch": 5.2995658465991315, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.9495, + "step": 22680 + }, + { + "epoch": 5.300771828268211, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.915, + "step": 22685 + }, + { + "epoch": 5.301977809937289, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9178, + "step": 22690 + }, + { + "epoch": 5.303183791606368, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9037, + "step": 22695 + }, + { + "epoch": 5.304389773275446, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.7155, + "step": 22700 + }, + { + "epoch": 5.305595754944525, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8979, + "step": 22705 + }, + { + "epoch": 5.306801736613603, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.8654, + "step": 22710 + }, + { + "epoch": 5.308007718282682, + "grad_norm": 3.28125, + "learning_rate": 3e-05, + "loss": 3.9831, + "step": 22715 + }, + { + "epoch": 5.30921369995176, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.8501, + "step": 22720 + }, + { + "epoch": 5.310419681620839, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8332, + "step": 22725 + }, + { + "epoch": 5.311625663289918, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.6915, + "step": 22730 + }, + { + "epoch": 5.3128316449589965, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9509, + "step": 22735 + }, + { + "epoch": 5.3140376266280756, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.9541, + "step": 22740 + }, + { + "epoch": 5.315243608297154, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9391, + "step": 22745 + }, + { + "epoch": 5.316449589966233, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8767, + "step": 22750 + }, + { + "epoch": 5.317655571635311, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9737, + "step": 22755 + }, + { + "epoch": 5.31886155330439, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9249, + "step": 22760 + }, + { + "epoch": 5.320067534973468, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.8289, + "step": 22765 + }, + { + "epoch": 5.321273516642547, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.7654, + "step": 22770 + }, + { + "epoch": 5.322479498311625, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.8648, + "step": 22775 + }, + { + "epoch": 5.323685479980704, + "grad_norm": 3.546875, + "learning_rate": 3e-05, + "loss": 3.951, + "step": 22780 + }, + { + "epoch": 5.324891461649783, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.9552, + "step": 22785 + }, + { + "epoch": 5.3260974433188615, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.9069, + "step": 22790 + }, + { + "epoch": 5.3273034249879405, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0112, + "step": 22795 + }, + { + "epoch": 5.328509406657019, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8401, + "step": 22800 + }, + { + "epoch": 5.329715388326098, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9011, + "step": 22805 + }, + { + "epoch": 5.330921369995176, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0513, + "step": 22810 + }, + { + "epoch": 5.332127351664255, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8436, + "step": 22815 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8568, + "step": 22820 + }, + { + "epoch": 5.334539315002412, + "grad_norm": 3.671875, + "learning_rate": 3e-05, + "loss": 3.8491, + "step": 22825 + }, + { + "epoch": 5.33574529667149, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8814, + "step": 22830 + }, + { + "epoch": 5.336951278340569, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9669, + "step": 22835 + }, + { + "epoch": 5.338157260009648, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9322, + "step": 22840 + }, + { + "epoch": 5.3393632416787264, + "grad_norm": 3.5, + "learning_rate": 3e-05, + "loss": 4.0186, + "step": 22845 + }, + { + "epoch": 5.3405692233478055, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9937, + "step": 22850 + }, + { + "epoch": 5.341775205016884, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9043, + "step": 22855 + }, + { + "epoch": 5.342981186685963, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.8663, + "step": 22860 + }, + { + "epoch": 5.344187168355041, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8442, + "step": 22865 + }, + { + "epoch": 5.34539315002412, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9621, + "step": 22870 + }, + { + "epoch": 5.346599131693198, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 4.0337, + "step": 22875 + }, + { + "epoch": 5.347805113362277, + "grad_norm": 3.328125, + "learning_rate": 3e-05, + "loss": 4.0618, + "step": 22880 + }, + { + "epoch": 5.349011095031355, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.965, + "step": 22885 + }, + { + "epoch": 5.350217076700434, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.819, + "step": 22890 + }, + { + "epoch": 5.351423058369512, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.8295, + "step": 22895 + }, + { + "epoch": 5.352629040038591, + "grad_norm": 3.34375, + "learning_rate": 3e-05, + "loss": 4.0718, + "step": 22900 + }, + { + "epoch": 5.3538350217076704, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.889, + "step": 22905 + }, + { + "epoch": 5.355041003376749, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.963, + "step": 22910 + }, + { + "epoch": 5.356246985045828, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9911, + "step": 22915 + }, + { + "epoch": 5.357452966714906, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.7436, + "step": 22920 + }, + { + "epoch": 5.358658948383985, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8823, + "step": 22925 + }, + { + "epoch": 5.359864930053063, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9227, + "step": 22930 + }, + { + "epoch": 5.361070911722142, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.8426, + "step": 22935 + }, + { + "epoch": 5.36227689339122, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9322, + "step": 22940 + }, + { + "epoch": 5.363482875060299, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8069, + "step": 22945 + }, + { + "epoch": 5.364688856729377, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8967, + "step": 22950 + }, + { + "epoch": 5.365894838398456, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 3.7528, + "step": 22955 + }, + { + "epoch": 5.3671008200675345, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9744, + "step": 22960 + }, + { + "epoch": 5.368306801736614, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 4.1065, + "step": 22965 + }, + { + "epoch": 5.369512783405693, + "grad_norm": 4.15625, + "learning_rate": 3e-05, + "loss": 3.9464, + "step": 22970 + }, + { + "epoch": 5.370718765074771, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9455, + "step": 22975 + }, + { + "epoch": 5.37192474674385, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.7549, + "step": 22980 + }, + { + "epoch": 5.373130728412928, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.931, + "step": 22985 + }, + { + "epoch": 5.374336710082007, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8414, + "step": 22990 + }, + { + "epoch": 5.375542691751085, + "grad_norm": 3.265625, + "learning_rate": 3e-05, + "loss": 3.9864, + "step": 22995 + }, + { + "epoch": 5.376748673420164, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.7807, + "step": 23000 + }, + { + "epoch": 5.377954655089242, + "grad_norm": 3.265625, + "learning_rate": 3e-05, + "loss": 3.7735, + "step": 23005 + }, + { + "epoch": 5.379160636758321, + "grad_norm": 3.296875, + "learning_rate": 3e-05, + "loss": 3.8382, + "step": 23010 + }, + { + "epoch": 5.3803666184273995, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 4.0343, + "step": 23015 + }, + { + "epoch": 5.3815726000964785, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8926, + "step": 23020 + }, + { + "epoch": 5.382778581765558, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8172, + "step": 23025 + }, + { + "epoch": 5.383984563434636, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8929, + "step": 23030 + }, + { + "epoch": 5.385190545103715, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8203, + "step": 23035 + }, + { + "epoch": 5.386396526772793, + "grad_norm": 3.625, + "learning_rate": 3e-05, + "loss": 3.8249, + "step": 23040 + }, + { + "epoch": 5.387602508441872, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.0976, + "step": 23045 + }, + { + "epoch": 5.38880849011095, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9134, + "step": 23050 + }, + { + "epoch": 5.390014471780029, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.8834, + "step": 23055 + }, + { + "epoch": 5.391220453449107, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.883, + "step": 23060 + }, + { + "epoch": 5.392426435118186, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.905, + "step": 23065 + }, + { + "epoch": 5.3936324167872645, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8558, + "step": 23070 + }, + { + "epoch": 5.3948383984563435, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9013, + "step": 23075 + }, + { + "epoch": 5.3960443801254225, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9328, + "step": 23080 + }, + { + "epoch": 5.397250361794501, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8126, + "step": 23085 + }, + { + "epoch": 5.39845634346358, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0001, + "step": 23090 + }, + { + "epoch": 5.399662325132658, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.8751, + "step": 23095 + }, + { + "epoch": 5.400868306801737, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.972, + "step": 23100 + }, + { + "epoch": 5.402074288470815, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 4.0142, + "step": 23105 + }, + { + "epoch": 5.403280270139894, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0198, + "step": 23110 + }, + { + "epoch": 5.404486251808972, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9366, + "step": 23115 + }, + { + "epoch": 5.405692233478051, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0044, + "step": 23120 + }, + { + "epoch": 5.406898215147129, + "grad_norm": 3.609375, + "learning_rate": 3e-05, + "loss": 4.1258, + "step": 23125 + }, + { + "epoch": 5.4081041968162085, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.9112, + "step": 23130 + }, + { + "epoch": 5.409310178485287, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.8063, + "step": 23135 + }, + { + "epoch": 5.410516160154366, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 4.0434, + "step": 23140 + }, + { + "epoch": 5.411722141823445, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.878, + "step": 23145 + }, + { + "epoch": 5.412928123492523, + "grad_norm": 3.171875, + "learning_rate": 3e-05, + "loss": 3.6575, + "step": 23150 + }, + { + "epoch": 5.414134105161602, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.882, + "step": 23155 + }, + { + "epoch": 5.41534008683068, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.1331, + "step": 23160 + }, + { + "epoch": 5.416546068499759, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.7655, + "step": 23165 + }, + { + "epoch": 5.417752050168837, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7729, + "step": 23170 + }, + { + "epoch": 5.418958031837916, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.957, + "step": 23175 + }, + { + "epoch": 5.420164013506994, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.9156, + "step": 23180 + }, + { + "epoch": 5.421369995176073, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.856, + "step": 23185 + }, + { + "epoch": 5.422575976845152, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9478, + "step": 23190 + }, + { + "epoch": 5.423781958514231, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9211, + "step": 23195 + }, + { + "epoch": 5.424987940183309, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8786, + "step": 23200 + }, + { + "epoch": 5.426193921852388, + "grad_norm": 3.546875, + "learning_rate": 3e-05, + "loss": 3.8665, + "step": 23205 + }, + { + "epoch": 5.427399903521467, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.8091, + "step": 23210 + }, + { + "epoch": 5.428605885190545, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.9709, + "step": 23215 + }, + { + "epoch": 5.429811866859624, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.7431, + "step": 23220 + }, + { + "epoch": 5.431017848528702, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.857, + "step": 23225 + }, + { + "epoch": 5.432223830197781, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9715, + "step": 23230 + }, + { + "epoch": 5.433429811866859, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9681, + "step": 23235 + }, + { + "epoch": 5.434635793535938, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.938, + "step": 23240 + }, + { + "epoch": 5.4358417752050165, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8178, + "step": 23245 + }, + { + "epoch": 5.437047756874096, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8495, + "step": 23250 + }, + { + "epoch": 5.438253738543174, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8898, + "step": 23255 + }, + { + "epoch": 5.439459720212253, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8435, + "step": 23260 + }, + { + "epoch": 5.440665701881332, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0859, + "step": 23265 + }, + { + "epoch": 5.44187168355041, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8726, + "step": 23270 + }, + { + "epoch": 5.443077665219489, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0161, + "step": 23275 + }, + { + "epoch": 5.444283646888567, + "grad_norm": 3.65625, + "learning_rate": 3e-05, + "loss": 3.8961, + "step": 23280 + }, + { + "epoch": 5.445489628557646, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.7977, + "step": 23285 + }, + { + "epoch": 5.446695610226724, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9329, + "step": 23290 + }, + { + "epoch": 5.447901591895803, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.9534, + "step": 23295 + }, + { + "epoch": 5.4491075735648815, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.8252, + "step": 23300 + }, + { + "epoch": 5.4503135552339605, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9868, + "step": 23305 + }, + { + "epoch": 5.451519536903039, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9234, + "step": 23310 + }, + { + "epoch": 5.452725518572118, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9167, + "step": 23315 + }, + { + "epoch": 5.453931500241197, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.8283, + "step": 23320 + }, + { + "epoch": 5.455137481910275, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.895, + "step": 23325 + }, + { + "epoch": 5.456343463579354, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.7375, + "step": 23330 + }, + { + "epoch": 5.457549445248432, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9086, + "step": 23335 + }, + { + "epoch": 5.458755426917511, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9871, + "step": 23340 + }, + { + "epoch": 5.459961408586589, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8431, + "step": 23345 + }, + { + "epoch": 5.461167390255668, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9762, + "step": 23350 + }, + { + "epoch": 5.4623733719247465, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9696, + "step": 23355 + }, + { + "epoch": 5.4635793535938255, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.913, + "step": 23360 + }, + { + "epoch": 5.464785335262904, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9877, + "step": 23365 + }, + { + "epoch": 5.465991316931983, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.9073, + "step": 23370 + }, + { + "epoch": 5.467197298601061, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9377, + "step": 23375 + }, + { + "epoch": 5.46840328027014, + "grad_norm": 3.265625, + "learning_rate": 3e-05, + "loss": 4.0128, + "step": 23380 + }, + { + "epoch": 5.469609261939219, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 4.0208, + "step": 23385 + }, + { + "epoch": 5.470815243608297, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.939, + "step": 23390 + }, + { + "epoch": 5.472021225277376, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9935, + "step": 23395 + }, + { + "epoch": 5.473227206946454, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.9437, + "step": 23400 + }, + { + "epoch": 5.474433188615533, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.0159, + "step": 23405 + }, + { + "epoch": 5.4756391702846114, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.9312, + "step": 23410 + }, + { + "epoch": 5.4768451519536905, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.0047, + "step": 23415 + }, + { + "epoch": 5.478051133622769, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8874, + "step": 23420 + }, + { + "epoch": 5.479257115291848, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9742, + "step": 23425 + }, + { + "epoch": 5.480463096960926, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9367, + "step": 23430 + }, + { + "epoch": 5.481669078630005, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8547, + "step": 23435 + }, + { + "epoch": 5.482875060299083, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9652, + "step": 23440 + }, + { + "epoch": 5.484081041968162, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.838, + "step": 23445 + }, + { + "epoch": 5.485287023637241, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.7544, + "step": 23450 + }, + { + "epoch": 5.486493005306319, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9559, + "step": 23455 + }, + { + "epoch": 5.487698986975398, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9173, + "step": 23460 + }, + { + "epoch": 5.488904968644476, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8572, + "step": 23465 + }, + { + "epoch": 5.4901109503135554, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9472, + "step": 23470 + }, + { + "epoch": 5.491316931982634, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8662, + "step": 23475 + }, + { + "epoch": 5.492522913651713, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.9692, + "step": 23480 + }, + { + "epoch": 5.493728895320791, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.8551, + "step": 23485 + }, + { + "epoch": 5.49493487698987, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.791, + "step": 23490 + }, + { + "epoch": 5.496140858658948, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.847, + "step": 23495 + }, + { + "epoch": 5.497346840328027, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.956, + "step": 23500 + }, + { + "epoch": 5.498552821997106, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.7946, + "step": 23505 + }, + { + "epoch": 5.499758803666184, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8917, + "step": 23510 + }, + { + "epoch": 5.500964785335263, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.7733, + "step": 23515 + }, + { + "epoch": 5.502170767004341, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.8953, + "step": 23520 + }, + { + "epoch": 5.50337674867342, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 3.8253, + "step": 23525 + }, + { + "epoch": 5.504582730342499, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9044, + "step": 23530 + }, + { + "epoch": 5.505788712011578, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 4.0008, + "step": 23535 + }, + { + "epoch": 5.506994693680656, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9245, + "step": 23540 + }, + { + "epoch": 5.508200675349735, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0074, + "step": 23545 + }, + { + "epoch": 5.509406657018813, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8905, + "step": 23550 + }, + { + "epoch": 5.510612638687892, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0509, + "step": 23555 + }, + { + "epoch": 5.511818620356971, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9083, + "step": 23560 + }, + { + "epoch": 5.513024602026049, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.9471, + "step": 23565 + }, + { + "epoch": 5.514230583695128, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9761, + "step": 23570 + }, + { + "epoch": 5.515436565364206, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8881, + "step": 23575 + }, + { + "epoch": 5.516642547033285, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.009, + "step": 23580 + }, + { + "epoch": 5.5178485287023635, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.9264, + "step": 23585 + }, + { + "epoch": 5.519054510371443, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8979, + "step": 23590 + }, + { + "epoch": 5.520260492040521, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.8037, + "step": 23595 + }, + { + "epoch": 5.5214664737096, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.8167, + "step": 23600 + }, + { + "epoch": 5.522672455378678, + "grad_norm": 3.34375, + "learning_rate": 3e-05, + "loss": 3.8055, + "step": 23605 + }, + { + "epoch": 5.523878437047757, + "grad_norm": 2.03125, + "learning_rate": 3e-05, + "loss": 3.9042, + "step": 23610 + }, + { + "epoch": 5.525084418716835, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.8375, + "step": 23615 + }, + { + "epoch": 5.526290400385914, + "grad_norm": 2.09375, + "learning_rate": 3e-05, + "loss": 3.9302, + "step": 23620 + }, + { + "epoch": 5.527496382054993, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.883, + "step": 23625 + }, + { + "epoch": 5.528702363724071, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.9157, + "step": 23630 + }, + { + "epoch": 5.52990834539315, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.826, + "step": 23635 + }, + { + "epoch": 5.5311143270622285, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9703, + "step": 23640 + }, + { + "epoch": 5.5323203087313075, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.7943, + "step": 23645 + }, + { + "epoch": 5.533526290400386, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.8437, + "step": 23650 + }, + { + "epoch": 5.534732272069465, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.8803, + "step": 23655 + }, + { + "epoch": 5.535938253738543, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.7778, + "step": 23660 + }, + { + "epoch": 5.537144235407622, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9392, + "step": 23665 + }, + { + "epoch": 5.5383502170767, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.7376, + "step": 23670 + }, + { + "epoch": 5.539556198745779, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.9211, + "step": 23675 + }, + { + "epoch": 5.540762180414857, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8134, + "step": 23680 + }, + { + "epoch": 5.541968162083936, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9135, + "step": 23685 + }, + { + "epoch": 5.543174143753015, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9313, + "step": 23690 + }, + { + "epoch": 5.5443801254220935, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.7804, + "step": 23695 + }, + { + "epoch": 5.5455861070911725, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9341, + "step": 23700 + }, + { + "epoch": 5.546792088760251, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.9208, + "step": 23705 + }, + { + "epoch": 5.54799807042933, + "grad_norm": 3.9375, + "learning_rate": 3e-05, + "loss": 4.0624, + "step": 23710 + }, + { + "epoch": 5.549204052098408, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.7657, + "step": 23715 + }, + { + "epoch": 5.550410033767487, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.8927, + "step": 23720 + }, + { + "epoch": 5.551616015436565, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 4.0104, + "step": 23725 + }, + { + "epoch": 5.552821997105644, + "grad_norm": 3.046875, + "learning_rate": 3e-05, + "loss": 3.8402, + "step": 23730 + }, + { + "epoch": 5.554027978774723, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.0567, + "step": 23735 + }, + { + "epoch": 5.555233960443801, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.9959, + "step": 23740 + }, + { + "epoch": 5.556439942112879, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.8047, + "step": 23745 + }, + { + "epoch": 5.557645923781958, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8724, + "step": 23750 + }, + { + "epoch": 5.5588519054510375, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9087, + "step": 23755 + }, + { + "epoch": 5.560057887120116, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 4.0107, + "step": 23760 + }, + { + "epoch": 5.561263868789195, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9476, + "step": 23765 + }, + { + "epoch": 5.562469850458273, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 3.7981, + "step": 23770 + }, + { + "epoch": 5.563675832127352, + "grad_norm": 3.390625, + "learning_rate": 3e-05, + "loss": 3.8564, + "step": 23775 + }, + { + "epoch": 5.56488181379643, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.7502, + "step": 23780 + }, + { + "epoch": 5.566087795465509, + "grad_norm": 2.125, + "learning_rate": 3e-05, + "loss": 3.832, + "step": 23785 + }, + { + "epoch": 5.567293777134587, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9683, + "step": 23790 + }, + { + "epoch": 5.568499758803666, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.1233, + "step": 23795 + }, + { + "epoch": 5.569705740472745, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.7684, + "step": 23800 + }, + { + "epoch": 5.570911722141823, + "grad_norm": 2.703125, + "learning_rate": 3e-05, + "loss": 3.8199, + "step": 23805 + }, + { + "epoch": 5.572117703810902, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0923, + "step": 23810 + }, + { + "epoch": 5.573323685479981, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9091, + "step": 23815 + }, + { + "epoch": 5.57452966714906, + "grad_norm": 2.1875, + "learning_rate": 3e-05, + "loss": 3.7912, + "step": 23820 + }, + { + "epoch": 5.575735648818138, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9635, + "step": 23825 + }, + { + "epoch": 5.576941630487217, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 3.8467, + "step": 23830 + }, + { + "epoch": 5.578147612156295, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 4.0456, + "step": 23835 + }, + { + "epoch": 5.579353593825374, + "grad_norm": 2.25, + "learning_rate": 3e-05, + "loss": 3.8758, + "step": 23840 + }, + { + "epoch": 5.580559575494452, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9819, + "step": 23845 + }, + { + "epoch": 5.581765557163531, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.899, + "step": 23850 + }, + { + "epoch": 5.582971538832609, + "grad_norm": 3.25, + "learning_rate": 3e-05, + "loss": 3.9845, + "step": 23855 + }, + { + "epoch": 5.584177520501688, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.8407, + "step": 23860 + }, + { + "epoch": 5.585383502170767, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 3.9287, + "step": 23865 + }, + { + "epoch": 5.5865894838398455, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 4.0203, + "step": 23870 + }, + { + "epoch": 5.587795465508925, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.7997, + "step": 23875 + }, + { + "epoch": 5.589001447178003, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 4.0247, + "step": 23880 + }, + { + "epoch": 5.590207428847082, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.9555, + "step": 23885 + }, + { + "epoch": 5.59141341051616, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.8785, + "step": 23890 + }, + { + "epoch": 5.592619392185239, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9272, + "step": 23895 + }, + { + "epoch": 5.593825373854317, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 3.9239, + "step": 23900 + }, + { + "epoch": 5.595031355523396, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.853, + "step": 23905 + }, + { + "epoch": 5.596237337192474, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9492, + "step": 23910 + }, + { + "epoch": 5.597443318861553, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.7937, + "step": 23915 + }, + { + "epoch": 5.5986493005306315, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8552, + "step": 23920 + }, + { + "epoch": 5.5998552821997105, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9371, + "step": 23925 + }, + { + "epoch": 5.6010612638687896, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9063, + "step": 23930 + }, + { + "epoch": 5.602267245537868, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9762, + "step": 23935 + }, + { + "epoch": 5.603473227206947, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.8711, + "step": 23940 + }, + { + "epoch": 5.604679208876025, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0296, + "step": 23945 + }, + { + "epoch": 5.605885190545104, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9397, + "step": 23950 + }, + { + "epoch": 5.607091172214182, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.914, + "step": 23955 + }, + { + "epoch": 5.608297153883261, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.8729, + "step": 23960 + }, + { + "epoch": 5.609503135552339, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.9831, + "step": 23965 + }, + { + "epoch": 5.610709117221418, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9341, + "step": 23970 + }, + { + "epoch": 5.611915098890497, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9959, + "step": 23975 + }, + { + "epoch": 5.6131210805595755, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 4.0269, + "step": 23980 + }, + { + "epoch": 5.614327062228654, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.077, + "step": 23985 + }, + { + "epoch": 5.615533043897733, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.9276, + "step": 23990 + }, + { + "epoch": 5.616739025566812, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.7895, + "step": 23995 + }, + { + "epoch": 5.61794500723589, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.8373, + "step": 24000 + }, + { + "epoch": 5.619150988904969, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9947, + "step": 24005 + }, + { + "epoch": 5.620356970574047, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.8984, + "step": 24010 + }, + { + "epoch": 5.621562952243126, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9236, + "step": 24015 + }, + { + "epoch": 5.622768933912204, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9875, + "step": 24020 + }, + { + "epoch": 5.623974915581283, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.7336, + "step": 24025 + }, + { + "epoch": 5.625180897250361, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 4.0664, + "step": 24030 + }, + { + "epoch": 5.6263868789194404, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.7686, + "step": 24035 + }, + { + "epoch": 5.6275928605885195, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.8031, + "step": 24040 + }, + { + "epoch": 5.628798842257598, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 4.1759, + "step": 24045 + }, + { + "epoch": 5.630004823926677, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.9652, + "step": 24050 + }, + { + "epoch": 5.631210805595755, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9655, + "step": 24055 + }, + { + "epoch": 5.632416787264834, + "grad_norm": 2.953125, + "learning_rate": 3e-05, + "loss": 3.8859, + "step": 24060 + }, + { + "epoch": 5.633622768933912, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.8852, + "step": 24065 + }, + { + "epoch": 5.634828750602991, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9097, + "step": 24070 + }, + { + "epoch": 5.636034732272069, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.921, + "step": 24075 + }, + { + "epoch": 5.637240713941148, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8161, + "step": 24080 + }, + { + "epoch": 5.638446695610226, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.8408, + "step": 24085 + }, + { + "epoch": 5.639652677279305, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.9192, + "step": 24090 + }, + { + "epoch": 5.640858658948384, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8856, + "step": 24095 + }, + { + "epoch": 5.642064640617463, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 4.1615, + "step": 24100 + }, + { + "epoch": 5.643270622286542, + "grad_norm": 3.6875, + "learning_rate": 3e-05, + "loss": 3.9952, + "step": 24105 + }, + { + "epoch": 5.64447660395562, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.9389, + "step": 24110 + }, + { + "epoch": 5.645682585624699, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.8854, + "step": 24115 + }, + { + "epoch": 5.646888567293777, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.7662, + "step": 24120 + }, + { + "epoch": 5.648094548962856, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9437, + "step": 24125 + }, + { + "epoch": 5.649300530631934, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.7892, + "step": 24130 + }, + { + "epoch": 5.650506512301013, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.7409, + "step": 24135 + }, + { + "epoch": 5.651712493970091, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9944, + "step": 24140 + }, + { + "epoch": 5.65291847563917, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.9909, + "step": 24145 + }, + { + "epoch": 5.6541244573082485, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 3.8207, + "step": 24150 + }, + { + "epoch": 5.655330438977328, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 4.0181, + "step": 24155 + }, + { + "epoch": 5.656536420646406, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8528, + "step": 24160 + }, + { + "epoch": 5.657742402315485, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8931, + "step": 24165 + }, + { + "epoch": 5.658948383984564, + "grad_norm": 2.921875, + "learning_rate": 3e-05, + "loss": 4.0681, + "step": 24170 + }, + { + "epoch": 5.660154365653642, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0327, + "step": 24175 + }, + { + "epoch": 5.661360347322721, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8842, + "step": 24180 + }, + { + "epoch": 5.662566328991799, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 4.0374, + "step": 24185 + }, + { + "epoch": 5.663772310660878, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9792, + "step": 24190 + }, + { + "epoch": 5.664978292329956, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.9157, + "step": 24195 + }, + { + "epoch": 5.666184273999035, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8082, + "step": 24200 + }, + { + "epoch": 5.6673902556681135, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9661, + "step": 24205 + }, + { + "epoch": 5.6685962373371925, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 4.0323, + "step": 24210 + }, + { + "epoch": 5.669802219006272, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.0348, + "step": 24215 + }, + { + "epoch": 5.67100820067535, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.9035, + "step": 24220 + }, + { + "epoch": 5.672214182344428, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.9819, + "step": 24225 + }, + { + "epoch": 5.673420164013507, + "grad_norm": 2.296875, + "learning_rate": 3e-05, + "loss": 3.9195, + "step": 24230 + }, + { + "epoch": 5.674626145682586, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.8516, + "step": 24235 + }, + { + "epoch": 5.675832127351664, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.9088, + "step": 24240 + }, + { + "epoch": 5.677038109020743, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.8743, + "step": 24245 + }, + { + "epoch": 5.678244090689821, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.792, + "step": 24250 + }, + { + "epoch": 5.6794500723589, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9193, + "step": 24255 + }, + { + "epoch": 5.6806560540279785, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9117, + "step": 24260 + }, + { + "epoch": 5.6818620356970575, + "grad_norm": 2.8125, + "learning_rate": 3e-05, + "loss": 4.032, + "step": 24265 + }, + { + "epoch": 5.683068017366136, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.793, + "step": 24270 + }, + { + "epoch": 5.684273999035215, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.8266, + "step": 24275 + }, + { + "epoch": 5.685479980704294, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 4.0098, + "step": 24280 + }, + { + "epoch": 5.686685962373372, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9192, + "step": 24285 + }, + { + "epoch": 5.687891944042451, + "grad_norm": 3.890625, + "learning_rate": 3e-05, + "loss": 3.8154, + "step": 24290 + }, + { + "epoch": 5.689097925711529, + "grad_norm": 2.75, + "learning_rate": 3e-05, + "loss": 3.8686, + "step": 24295 + }, + { + "epoch": 5.690303907380608, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.91, + "step": 24300 + }, + { + "epoch": 5.691509889049686, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8052, + "step": 24305 + }, + { + "epoch": 5.692715870718765, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.8589, + "step": 24310 + }, + { + "epoch": 5.693921852387843, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9194, + "step": 24315 + }, + { + "epoch": 5.6951278340569225, + "grad_norm": 2.203125, + "learning_rate": 3e-05, + "loss": 3.86, + "step": 24320 + }, + { + "epoch": 5.696333815726001, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8757, + "step": 24325 + }, + { + "epoch": 5.69753979739508, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.8218, + "step": 24330 + }, + { + "epoch": 5.698745779064158, + "grad_norm": 3.125, + "learning_rate": 3e-05, + "loss": 3.9438, + "step": 24335 + }, + { + "epoch": 5.699951760733237, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 3.8521, + "step": 24340 + }, + { + "epoch": 5.701157742402316, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 3.9065, + "step": 24345 + }, + { + "epoch": 5.702363724071394, + "grad_norm": 3.609375, + "learning_rate": 3e-05, + "loss": 3.8656, + "step": 24350 + }, + { + "epoch": 5.703569705740473, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.879, + "step": 24355 + }, + { + "epoch": 5.704775687409551, + "grad_norm": 3.421875, + "learning_rate": 3e-05, + "loss": 3.9726, + "step": 24360 + }, + { + "epoch": 5.70598166907863, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.7641, + "step": 24365 + }, + { + "epoch": 5.707187650747708, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9876, + "step": 24370 + }, + { + "epoch": 5.708393632416787, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9063, + "step": 24375 + }, + { + "epoch": 5.709599614085866, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8726, + "step": 24380 + }, + { + "epoch": 5.710805595754945, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.9036, + "step": 24385 + }, + { + "epoch": 5.712011577424023, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.833, + "step": 24390 + }, + { + "epoch": 5.713217559093102, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.767, + "step": 24395 + }, + { + "epoch": 5.71442354076218, + "grad_norm": 2.421875, + "learning_rate": 3e-05, + "loss": 3.9863, + "step": 24400 + }, + { + "epoch": 5.715629522431259, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.8597, + "step": 24405 + }, + { + "epoch": 5.716835504100338, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9471, + "step": 24410 + }, + { + "epoch": 5.718041485769416, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8303, + "step": 24415 + }, + { + "epoch": 5.719247467438495, + "grad_norm": 3.0, + "learning_rate": 3e-05, + "loss": 4.0843, + "step": 24420 + }, + { + "epoch": 5.720453449107573, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.9113, + "step": 24425 + }, + { + "epoch": 5.721659430776652, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9092, + "step": 24430 + }, + { + "epoch": 5.7228654124457305, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.9785, + "step": 24435 + }, + { + "epoch": 5.72407139411481, + "grad_norm": 2.546875, + "learning_rate": 3e-05, + "loss": 3.8626, + "step": 24440 + }, + { + "epoch": 5.725277375783888, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.8585, + "step": 24445 + }, + { + "epoch": 5.726483357452967, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8717, + "step": 24450 + }, + { + "epoch": 5.727689339122046, + "grad_norm": 3.5625, + "learning_rate": 3e-05, + "loss": 3.7441, + "step": 24455 + }, + { + "epoch": 5.728895320791124, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9387, + "step": 24460 + }, + { + "epoch": 5.730101302460202, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.7744, + "step": 24465 + }, + { + "epoch": 5.731307284129281, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9583, + "step": 24470 + }, + { + "epoch": 5.73251326579836, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.9269, + "step": 24475 + }, + { + "epoch": 5.733719247467438, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9111, + "step": 24480 + }, + { + "epoch": 5.734925229136517, + "grad_norm": 2.578125, + "learning_rate": 3e-05, + "loss": 3.8222, + "step": 24485 + }, + { + "epoch": 5.7361312108055955, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.8979, + "step": 24490 + }, + { + "epoch": 5.7373371924746746, + "grad_norm": 2.859375, + "learning_rate": 3e-05, + "loss": 3.9525, + "step": 24495 + }, + { + "epoch": 5.738543174143753, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.8084, + "step": 24500 + }, + { + "epoch": 5.739749155812832, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8157, + "step": 24505 + }, + { + "epoch": 5.74095513748191, + "grad_norm": 2.265625, + "learning_rate": 3e-05, + "loss": 3.8661, + "step": 24510 + }, + { + "epoch": 5.742161119150989, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.853, + "step": 24515 + }, + { + "epoch": 5.743367100820068, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 4.1209, + "step": 24520 + }, + { + "epoch": 5.744573082489146, + "grad_norm": 2.609375, + "learning_rate": 3e-05, + "loss": 3.896, + "step": 24525 + }, + { + "epoch": 5.745779064158225, + "grad_norm": 2.625, + "learning_rate": 3e-05, + "loss": 3.8586, + "step": 24530 + }, + { + "epoch": 5.746985045827303, + "grad_norm": 3.359375, + "learning_rate": 3e-05, + "loss": 3.8605, + "step": 24535 + }, + { + "epoch": 5.748191027496382, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.7692, + "step": 24540 + }, + { + "epoch": 5.7493970091654605, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.6939, + "step": 24545 + }, + { + "epoch": 5.7506029908345395, + "grad_norm": 3.234375, + "learning_rate": 3e-05, + "loss": 4.1683, + "step": 24550 + }, + { + "epoch": 5.751808972503618, + "grad_norm": 3.59375, + "learning_rate": 3e-05, + "loss": 4.0264, + "step": 24555 + }, + { + "epoch": 5.753014954172697, + "grad_norm": 2.5, + "learning_rate": 3e-05, + "loss": 3.9376, + "step": 24560 + }, + { + "epoch": 5.754220935841775, + "grad_norm": 3.140625, + "learning_rate": 3e-05, + "loss": 3.9573, + "step": 24565 + }, + { + "epoch": 5.755426917510854, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9609, + "step": 24570 + }, + { + "epoch": 5.756632899179932, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.8621, + "step": 24575 + }, + { + "epoch": 5.757838880849011, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.9926, + "step": 24580 + }, + { + "epoch": 5.75904486251809, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.979, + "step": 24585 + }, + { + "epoch": 5.760250844187168, + "grad_norm": 3.21875, + "learning_rate": 3e-05, + "loss": 3.9041, + "step": 24590 + }, + { + "epoch": 5.761456825856247, + "grad_norm": 2.984375, + "learning_rate": 3e-05, + "loss": 3.8563, + "step": 24595 + }, + { + "epoch": 5.7626628075253254, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9112, + "step": 24600 + }, + { + "epoch": 5.7638687891944045, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.8316, + "step": 24605 + }, + { + "epoch": 5.765074770863483, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.8034, + "step": 24610 + }, + { + "epoch": 5.766280752532562, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.8455, + "step": 24615 + }, + { + "epoch": 5.76748673420164, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.8887, + "step": 24620 + }, + { + "epoch": 5.768692715870719, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.978, + "step": 24625 + }, + { + "epoch": 5.769898697539798, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.8693, + "step": 24630 + }, + { + "epoch": 5.771104679208876, + "grad_norm": 3.015625, + "learning_rate": 3e-05, + "loss": 3.93, + "step": 24635 + }, + { + "epoch": 5.772310660877954, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.7529, + "step": 24640 + }, + { + "epoch": 5.773516642547033, + "grad_norm": 2.3125, + "learning_rate": 3e-05, + "loss": 3.7131, + "step": 24645 + }, + { + "epoch": 5.774722624216112, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.797, + "step": 24650 + }, + { + "epoch": 5.77592860588519, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 4.1485, + "step": 24655 + }, + { + "epoch": 5.7771345875542695, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 4.1069, + "step": 24660 + }, + { + "epoch": 5.778340569223348, + "grad_norm": 2.0625, + "learning_rate": 3e-05, + "loss": 3.9295, + "step": 24665 + }, + { + "epoch": 5.779546550892427, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.8998, + "step": 24670 + }, + { + "epoch": 5.780752532561505, + "grad_norm": 2.765625, + "learning_rate": 3e-05, + "loss": 3.9761, + "step": 24675 + }, + { + "epoch": 5.781958514230584, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.845, + "step": 24680 + }, + { + "epoch": 5.783164495899662, + "grad_norm": 2.5625, + "learning_rate": 3e-05, + "loss": 3.9596, + "step": 24685 + }, + { + "epoch": 5.784370477568741, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.8565, + "step": 24690 + }, + { + "epoch": 5.78557645923782, + "grad_norm": 3.09375, + "learning_rate": 3e-05, + "loss": 3.9124, + "step": 24695 + }, + { + "epoch": 5.786782440906898, + "grad_norm": 4.59375, + "learning_rate": 3e-05, + "loss": 4.0294, + "step": 24700 + }, + { + "epoch": 5.787988422575977, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 4.0272, + "step": 24705 + }, + { + "epoch": 5.789194404245055, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9979, + "step": 24710 + }, + { + "epoch": 5.790400385914134, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9102, + "step": 24715 + }, + { + "epoch": 5.791606367583213, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.8692, + "step": 24720 + }, + { + "epoch": 5.792812349252292, + "grad_norm": 2.4375, + "learning_rate": 3e-05, + "loss": 4.0655, + "step": 24725 + }, + { + "epoch": 5.79401833092137, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.94, + "step": 24730 + }, + { + "epoch": 5.795224312590449, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.9427, + "step": 24735 + }, + { + "epoch": 5.796430294259527, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.8959, + "step": 24740 + }, + { + "epoch": 5.797636275928606, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8888, + "step": 24745 + }, + { + "epoch": 5.798842257597684, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.9183, + "step": 24750 + }, + { + "epoch": 5.800048239266763, + "grad_norm": 3.03125, + "learning_rate": 3e-05, + "loss": 4.0635, + "step": 24755 + }, + { + "epoch": 5.801254220935842, + "grad_norm": 2.65625, + "learning_rate": 3e-05, + "loss": 3.8348, + "step": 24760 + }, + { + "epoch": 5.80246020260492, + "grad_norm": 2.59375, + "learning_rate": 3e-05, + "loss": 3.8865, + "step": 24765 + }, + { + "epoch": 5.803666184273999, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.9758, + "step": 24770 + }, + { + "epoch": 5.8048721659430775, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0289, + "step": 24775 + }, + { + "epoch": 5.806078147612157, + "grad_norm": 2.375, + "learning_rate": 3e-05, + "loss": 3.8561, + "step": 24780 + }, + { + "epoch": 5.807284129281235, + "grad_norm": 3.4375, + "learning_rate": 3e-05, + "loss": 4.0404, + "step": 24785 + }, + { + "epoch": 5.808490110950314, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.9954, + "step": 24790 + }, + { + "epoch": 5.809696092619392, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.7289, + "step": 24795 + }, + { + "epoch": 5.810902074288471, + "grad_norm": 3.1875, + "learning_rate": 3e-05, + "loss": 3.9266, + "step": 24800 + }, + { + "epoch": 5.812108055957549, + "grad_norm": 2.84375, + "learning_rate": 3e-05, + "loss": 3.8585, + "step": 24805 + }, + { + "epoch": 5.813314037626628, + "grad_norm": 2.359375, + "learning_rate": 3e-05, + "loss": 3.9895, + "step": 24810 + }, + { + "epoch": 5.814520019295706, + "grad_norm": 2.328125, + "learning_rate": 3e-05, + "loss": 3.901, + "step": 24815 + }, + { + "epoch": 5.815726000964785, + "grad_norm": 3.109375, + "learning_rate": 3e-05, + "loss": 3.9541, + "step": 24820 + }, + { + "epoch": 5.816931982633864, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8988, + "step": 24825 + }, + { + "epoch": 5.8181379643029425, + "grad_norm": 2.96875, + "learning_rate": 3e-05, + "loss": 3.9047, + "step": 24830 + }, + { + "epoch": 5.8193439459720215, + "grad_norm": 2.46875, + "learning_rate": 3e-05, + "loss": 3.9864, + "step": 24835 + }, + { + "epoch": 5.8205499276411, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 3.7528, + "step": 24840 + }, + { + "epoch": 5.821755909310179, + "grad_norm": 2.390625, + "learning_rate": 3e-05, + "loss": 3.9339, + "step": 24845 + }, + { + "epoch": 5.822961890979257, + "grad_norm": 3.265625, + "learning_rate": 3e-05, + "loss": 3.8203, + "step": 24850 + }, + { + "epoch": 5.824167872648336, + "grad_norm": 2.90625, + "learning_rate": 3e-05, + "loss": 3.8625, + "step": 24855 + }, + { + "epoch": 5.825373854317414, + "grad_norm": 2.796875, + "learning_rate": 3e-05, + "loss": 3.9287, + "step": 24860 + }, + { + "epoch": 5.826579835986493, + "grad_norm": 2.34375, + "learning_rate": 3e-05, + "loss": 3.9621, + "step": 24865 + }, + { + "epoch": 5.827785817655572, + "grad_norm": 3.53125, + "learning_rate": 3e-05, + "loss": 3.9187, + "step": 24870 + }, + { + "epoch": 5.82899179932465, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.8991, + "step": 24875 + }, + { + "epoch": 5.830197780993728, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.9435, + "step": 24880 + }, + { + "epoch": 5.8314037626628075, + "grad_norm": 2.640625, + "learning_rate": 3e-05, + "loss": 3.9192, + "step": 24885 + }, + { + "epoch": 5.8326097443318865, + "grad_norm": 2.78125, + "learning_rate": 3e-05, + "loss": 3.9059, + "step": 24890 + }, + { + "epoch": 5.833815726000965, + "grad_norm": 2.671875, + "learning_rate": 3e-05, + "loss": 3.8766, + "step": 24895 + }, + { + "epoch": 5.835021707670044, + "grad_norm": 3.0625, + "learning_rate": 3e-05, + "loss": 3.8632, + "step": 24900 + }, + { + "epoch": 5.836227689339122, + "grad_norm": 3.078125, + "learning_rate": 3e-05, + "loss": 3.8248, + "step": 24905 + }, + { + "epoch": 5.837433671008201, + "grad_norm": 2.828125, + "learning_rate": 3e-05, + "loss": 4.0409, + "step": 24910 + }, + { + "epoch": 5.838639652677279, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8307, + "step": 24915 + }, + { + "epoch": 5.839845634346358, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 4.0343, + "step": 24920 + }, + { + "epoch": 5.841051616015436, + "grad_norm": 2.53125, + "learning_rate": 3e-05, + "loss": 3.9369, + "step": 24925 + }, + { + "epoch": 5.842257597684515, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.7994, + "step": 24930 + }, + { + "epoch": 5.843463579353594, + "grad_norm": 3.15625, + "learning_rate": 3e-05, + "loss": 3.8435, + "step": 24935 + }, + { + "epoch": 5.844669561022672, + "grad_norm": 2.71875, + "learning_rate": 3e-05, + "loss": 3.7457, + "step": 24940 + }, + { + "epoch": 5.8458755426917515, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8773, + "step": 24945 + }, + { + "epoch": 5.84708152436083, + "grad_norm": 2.28125, + "learning_rate": 3e-05, + "loss": 3.9961, + "step": 24950 + }, + { + "epoch": 5.848287506029909, + "grad_norm": 2.9375, + "learning_rate": 3e-05, + "loss": 3.9634, + "step": 24955 + }, + { + "epoch": 5.849493487698987, + "grad_norm": 2.484375, + "learning_rate": 3e-05, + "loss": 3.8916, + "step": 24960 + }, + { + "epoch": 5.850699469368066, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 4.1893, + "step": 24965 + }, + { + "epoch": 5.851905451037144, + "grad_norm": 2.40625, + "learning_rate": 3e-05, + "loss": 3.9133, + "step": 24970 + }, + { + "epoch": 5.853111432706223, + "grad_norm": 2.453125, + "learning_rate": 3e-05, + "loss": 3.8251, + "step": 24975 + }, + { + "epoch": 5.854317414375301, + "grad_norm": 2.734375, + "learning_rate": 3e-05, + "loss": 3.9746, + "step": 24980 + }, + { + "epoch": 5.85552339604438, + "grad_norm": 2.515625, + "learning_rate": 3e-05, + "loss": 3.6877, + "step": 24985 + }, + { + "epoch": 5.856729377713458, + "grad_norm": 2.875, + "learning_rate": 3e-05, + "loss": 3.9787, + "step": 24990 + }, + { + "epoch": 5.857935359382537, + "grad_norm": 2.6875, + "learning_rate": 3e-05, + "loss": 3.7577, + "step": 24995 + }, + { + "epoch": 5.859141341051616, + "grad_norm": 2.890625, + "learning_rate": 3e-05, + "loss": 3.9542, + "step": 25000 + } + ], + "logging_steps": 5, + "max_steps": 41460, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7338551769487114e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}