diff --git "a/checkpoint-4200/trainer_state.json" "b/checkpoint-4200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4200/trainer_state.json" @@ -0,0 +1,5908 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6236543173212562, + "eval_steps": 500, + "global_step": 4200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.75, + "learning_rate": 2.9673590504451043e-07, + "loss": 1.433, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1.578125, + "learning_rate": 1.483679525222552e-06, + "loss": 1.5006, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 1.4921875, + "learning_rate": 2.967359050445104e-06, + "loss": 1.4919, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 1.4921875, + "learning_rate": 4.451038575667656e-06, + "loss": 1.4531, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 1.0234375, + "learning_rate": 5.934718100890208e-06, + "loss": 1.4242, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.9453125, + "learning_rate": 7.418397626112759e-06, + "loss": 1.3932, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.8359375, + "learning_rate": 8.902077151335312e-06, + "loss": 1.3661, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.828125, + "learning_rate": 1.0385756676557864e-05, + "loss": 1.3677, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.59375, + "learning_rate": 1.1869436201780416e-05, + "loss": 1.3497, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.56640625, + "learning_rate": 1.3353115727002968e-05, + "loss": 1.3049, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 0.439453125, + "learning_rate": 1.4836795252225518e-05, + "loss": 1.2836, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 0.46484375, + "learning_rate": 1.6320474777448072e-05, + "loss": 1.3133, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.40625, + "learning_rate": 1.7804154302670624e-05, + "loss": 1.2596, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.408203125, + "learning_rate": 1.9287833827893176e-05, + "loss": 1.2612, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.376953125, + "learning_rate": 2.0771513353115728e-05, + "loss": 1.2584, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.40625, + "learning_rate": 2.225519287833828e-05, + "loss": 1.2386, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.353515625, + "learning_rate": 2.3738872403560832e-05, + "loss": 1.1833, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.349609375, + "learning_rate": 2.5222551928783384e-05, + "loss": 1.2411, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.36328125, + "learning_rate": 2.6706231454005936e-05, + "loss": 1.1956, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.369140625, + "learning_rate": 2.818991097922849e-05, + "loss": 1.2327, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.353515625, + "learning_rate": 2.9673590504451037e-05, + "loss": 1.1896, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 0.3984375, + "learning_rate": 3.115727002967359e-05, + "loss": 1.1979, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 0.388671875, + "learning_rate": 3.2640949554896144e-05, + "loss": 1.2004, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 0.392578125, + "learning_rate": 3.41246290801187e-05, + "loss": 1.1624, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 0.400390625, + "learning_rate": 3.560830860534125e-05, + "loss": 1.1914, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 0.41796875, + "learning_rate": 3.70919881305638e-05, + "loss": 1.1952, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 0.4140625, + "learning_rate": 3.857566765578635e-05, + "loss": 1.1538, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 0.4375, + "learning_rate": 4.005934718100891e-05, + "loss": 1.1702, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 0.44140625, + "learning_rate": 4.1543026706231456e-05, + "loss": 1.1683, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 0.443359375, + "learning_rate": 4.3026706231454005e-05, + "loss": 1.1822, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 0.482421875, + "learning_rate": 4.451038575667656e-05, + "loss": 1.1649, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 0.46484375, + "learning_rate": 4.5994065281899116e-05, + "loss": 1.1868, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 0.5, + "learning_rate": 4.7477744807121664e-05, + "loss": 1.161, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 0.451171875, + "learning_rate": 4.896142433234421e-05, + "loss": 1.1454, + "step": 165 + }, + { + "epoch": 0.03, + "grad_norm": 0.45703125, + "learning_rate": 5.044510385756677e-05, + "loss": 1.1388, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 0.490234375, + "learning_rate": 5.1928783382789324e-05, + "loss": 1.1253, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 0.51171875, + "learning_rate": 5.341246290801187e-05, + "loss": 1.1527, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 0.486328125, + "learning_rate": 5.489614243323442e-05, + "loss": 1.1234, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 0.46875, + "learning_rate": 5.637982195845698e-05, + "loss": 1.113, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 0.458984375, + "learning_rate": 5.7863501483679525e-05, + "loss": 1.1368, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 0.484375, + "learning_rate": 5.9347181008902074e-05, + "loss": 1.137, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 0.4765625, + "learning_rate": 6.0830860534124636e-05, + "loss": 1.1286, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 0.484375, + "learning_rate": 6.231454005934718e-05, + "loss": 1.1277, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 0.474609375, + "learning_rate": 6.379821958456974e-05, + "loss": 1.1268, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 0.47265625, + "learning_rate": 6.528189910979229e-05, + "loss": 1.0993, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 0.50390625, + "learning_rate": 6.676557863501484e-05, + "loss": 1.1017, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 0.4609375, + "learning_rate": 6.82492581602374e-05, + "loss": 1.1345, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 0.48828125, + "learning_rate": 6.973293768545995e-05, + "loss": 1.1086, + "step": 235 + }, + { + "epoch": 0.04, + "grad_norm": 0.45703125, + "learning_rate": 7.12166172106825e-05, + "loss": 1.0791, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 0.470703125, + "learning_rate": 7.270029673590505e-05, + "loss": 1.1158, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 0.46875, + "learning_rate": 7.41839762611276e-05, + "loss": 1.132, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 0.4609375, + "learning_rate": 7.566765578635016e-05, + "loss": 1.1438, + "step": 255 + }, + { + "epoch": 0.04, + "grad_norm": 0.470703125, + "learning_rate": 7.71513353115727e-05, + "loss": 1.1405, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 0.447265625, + "learning_rate": 7.863501483679525e-05, + "loss": 1.1124, + "step": 265 + }, + { + "epoch": 0.04, + "grad_norm": 0.486328125, + "learning_rate": 8.011869436201782e-05, + "loss": 1.0813, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 0.48046875, + "learning_rate": 8.160237388724036e-05, + "loss": 1.1194, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 0.462890625, + "learning_rate": 8.308605341246291e-05, + "loss": 1.0927, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 0.48046875, + "learning_rate": 8.456973293768546e-05, + "loss": 1.1277, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 0.4453125, + "learning_rate": 8.605341246290801e-05, + "loss": 1.1271, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 0.435546875, + "learning_rate": 8.753709198813057e-05, + "loss": 1.1162, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 0.44921875, + "learning_rate": 8.902077151335312e-05, + "loss": 1.0857, + "step": 300 + }, + { + "epoch": 0.05, + "grad_norm": 0.4375, + "learning_rate": 9.050445103857568e-05, + "loss": 1.0869, + "step": 305 + }, + { + "epoch": 0.05, + "grad_norm": 0.4453125, + "learning_rate": 9.198813056379823e-05, + "loss": 1.0655, + "step": 310 + }, + { + "epoch": 0.05, + "grad_norm": 0.4296875, + "learning_rate": 9.347181008902077e-05, + "loss": 1.0585, + "step": 315 + }, + { + "epoch": 0.05, + "grad_norm": 0.41796875, + "learning_rate": 9.495548961424333e-05, + "loss": 1.1144, + "step": 320 + }, + { + "epoch": 0.05, + "grad_norm": 0.43359375, + "learning_rate": 9.643916913946588e-05, + "loss": 1.0719, + "step": 325 + }, + { + "epoch": 0.05, + "grad_norm": 0.416015625, + "learning_rate": 9.792284866468843e-05, + "loss": 1.0919, + "step": 330 + }, + { + "epoch": 0.05, + "grad_norm": 0.423828125, + "learning_rate": 9.940652818991099e-05, + "loss": 1.1223, + "step": 335 + }, + { + "epoch": 0.05, + "grad_norm": 0.431640625, + "learning_rate": 0.00010089020771513354, + "loss": 1.0565, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 0.431640625, + "learning_rate": 0.00010237388724035609, + "loss": 1.0962, + "step": 345 + }, + { + "epoch": 0.05, + "grad_norm": 0.44921875, + "learning_rate": 0.00010385756676557865, + "loss": 1.0959, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 0.43359375, + "learning_rate": 0.0001053412462908012, + "loss": 1.0628, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 0.431640625, + "learning_rate": 0.00010682492581602374, + "loss": 1.0975, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 0.42578125, + "learning_rate": 0.0001083086053412463, + "loss": 1.0727, + "step": 365 + }, + { + "epoch": 0.05, + "grad_norm": 0.416015625, + "learning_rate": 0.00010979228486646884, + "loss": 1.0649, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 0.4296875, + "learning_rate": 0.00011127596439169139, + "loss": 1.0904, + "step": 375 + }, + { + "epoch": 0.06, + "grad_norm": 0.3984375, + "learning_rate": 0.00011275964391691397, + "loss": 1.079, + "step": 380 + }, + { + "epoch": 0.06, + "grad_norm": 0.40234375, + "learning_rate": 0.0001142433234421365, + "loss": 1.0522, + "step": 385 + }, + { + "epoch": 0.06, + "grad_norm": 0.431640625, + "learning_rate": 0.00011572700296735905, + "loss": 1.0579, + "step": 390 + }, + { + "epoch": 0.06, + "grad_norm": 0.396484375, + "learning_rate": 0.0001172106824925816, + "loss": 1.0871, + "step": 395 + }, + { + "epoch": 0.06, + "grad_norm": 0.41015625, + "learning_rate": 0.00011869436201780415, + "loss": 1.0936, + "step": 400 + }, + { + "epoch": 0.06, + "grad_norm": 0.412109375, + "learning_rate": 0.00012017804154302672, + "loss": 1.0734, + "step": 405 + }, + { + "epoch": 0.06, + "grad_norm": 0.392578125, + "learning_rate": 0.00012166172106824927, + "loss": 1.0657, + "step": 410 + }, + { + "epoch": 0.06, + "grad_norm": 0.4140625, + "learning_rate": 0.00012314540059347182, + "loss": 1.0884, + "step": 415 + }, + { + "epoch": 0.06, + "grad_norm": 0.408203125, + "learning_rate": 0.00012462908011869436, + "loss": 1.0683, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 0.3984375, + "learning_rate": 0.00012611275964391692, + "loss": 1.1073, + "step": 425 + }, + { + "epoch": 0.06, + "grad_norm": 0.412109375, + "learning_rate": 0.00012759643916913948, + "loss": 1.0849, + "step": 430 + }, + { + "epoch": 0.06, + "grad_norm": 0.40625, + "learning_rate": 0.00012908011869436204, + "loss": 1.0798, + "step": 435 + }, + { + "epoch": 0.07, + "grad_norm": 0.404296875, + "learning_rate": 0.00013056379821958458, + "loss": 1.1029, + "step": 440 + }, + { + "epoch": 0.07, + "grad_norm": 0.3828125, + "learning_rate": 0.0001320474777448071, + "loss": 1.0664, + "step": 445 + }, + { + "epoch": 0.07, + "grad_norm": 0.38671875, + "learning_rate": 0.00013353115727002967, + "loss": 1.0998, + "step": 450 + }, + { + "epoch": 0.07, + "grad_norm": 0.40234375, + "learning_rate": 0.00013501483679525224, + "loss": 1.0834, + "step": 455 + }, + { + "epoch": 0.07, + "grad_norm": 0.400390625, + "learning_rate": 0.0001364985163204748, + "loss": 1.062, + "step": 460 + }, + { + "epoch": 0.07, + "grad_norm": 0.3984375, + "learning_rate": 0.00013798219584569733, + "loss": 1.0825, + "step": 465 + }, + { + "epoch": 0.07, + "grad_norm": 0.388671875, + "learning_rate": 0.0001394658753709199, + "loss": 1.0689, + "step": 470 + }, + { + "epoch": 0.07, + "grad_norm": 0.392578125, + "learning_rate": 0.00014094955489614243, + "loss": 1.0557, + "step": 475 + }, + { + "epoch": 0.07, + "grad_norm": 0.380859375, + "learning_rate": 0.000142433234421365, + "loss": 1.0582, + "step": 480 + }, + { + "epoch": 0.07, + "grad_norm": 0.380859375, + "learning_rate": 0.00014391691394658756, + "loss": 1.0921, + "step": 485 + }, + { + "epoch": 0.07, + "grad_norm": 0.384765625, + "learning_rate": 0.0001454005934718101, + "loss": 1.0544, + "step": 490 + }, + { + "epoch": 0.07, + "grad_norm": 0.39453125, + "learning_rate": 0.00014688427299703265, + "loss": 1.0333, + "step": 495 + }, + { + "epoch": 0.07, + "grad_norm": 0.384765625, + "learning_rate": 0.0001483679525222552, + "loss": 1.0454, + "step": 500 + }, + { + "epoch": 0.07, + "grad_norm": 0.37890625, + "learning_rate": 0.00014985163204747775, + "loss": 1.0434, + "step": 505 + }, + { + "epoch": 0.08, + "grad_norm": 0.390625, + "learning_rate": 0.0001513353115727003, + "loss": 1.0768, + "step": 510 + }, + { + "epoch": 0.08, + "grad_norm": 0.376953125, + "learning_rate": 0.00015281899109792285, + "loss": 1.0774, + "step": 515 + }, + { + "epoch": 0.08, + "grad_norm": 0.365234375, + "learning_rate": 0.0001543026706231454, + "loss": 1.0438, + "step": 520 + }, + { + "epoch": 0.08, + "grad_norm": 0.373046875, + "learning_rate": 0.00015578635014836794, + "loss": 1.0573, + "step": 525 + }, + { + "epoch": 0.08, + "grad_norm": 0.380859375, + "learning_rate": 0.0001572700296735905, + "loss": 1.0501, + "step": 530 + }, + { + "epoch": 0.08, + "grad_norm": 0.392578125, + "learning_rate": 0.00015875370919881307, + "loss": 1.0413, + "step": 535 + }, + { + "epoch": 0.08, + "grad_norm": 0.37109375, + "learning_rate": 0.00016023738872403563, + "loss": 1.0234, + "step": 540 + }, + { + "epoch": 0.08, + "grad_norm": 0.3671875, + "learning_rate": 0.00016172106824925817, + "loss": 1.0538, + "step": 545 + }, + { + "epoch": 0.08, + "grad_norm": 0.41015625, + "learning_rate": 0.00016320474777448073, + "loss": 1.0493, + "step": 550 + }, + { + "epoch": 0.08, + "grad_norm": 0.35546875, + "learning_rate": 0.00016468842729970326, + "loss": 1.0686, + "step": 555 + }, + { + "epoch": 0.08, + "grad_norm": 0.36328125, + "learning_rate": 0.00016617210682492583, + "loss": 1.0794, + "step": 560 + }, + { + "epoch": 0.08, + "grad_norm": 0.380859375, + "learning_rate": 0.0001676557863501484, + "loss": 1.0575, + "step": 565 + }, + { + "epoch": 0.08, + "grad_norm": 0.37109375, + "learning_rate": 0.00016913946587537092, + "loss": 1.0682, + "step": 570 + }, + { + "epoch": 0.09, + "grad_norm": 0.361328125, + "learning_rate": 0.00017062314540059348, + "loss": 1.0237, + "step": 575 + }, + { + "epoch": 0.09, + "grad_norm": 0.37109375, + "learning_rate": 0.00017210682492581602, + "loss": 1.0493, + "step": 580 + }, + { + "epoch": 0.09, + "grad_norm": 0.361328125, + "learning_rate": 0.00017359050445103858, + "loss": 1.0613, + "step": 585 + }, + { + "epoch": 0.09, + "grad_norm": 0.380859375, + "learning_rate": 0.00017507418397626114, + "loss": 1.0346, + "step": 590 + }, + { + "epoch": 0.09, + "grad_norm": 0.375, + "learning_rate": 0.00017655786350148368, + "loss": 1.0749, + "step": 595 + }, + { + "epoch": 0.09, + "grad_norm": 0.34765625, + "learning_rate": 0.00017804154302670624, + "loss": 1.0289, + "step": 600 + }, + { + "epoch": 0.09, + "grad_norm": 0.353515625, + "learning_rate": 0.00017952522255192878, + "loss": 1.0606, + "step": 605 + }, + { + "epoch": 0.09, + "grad_norm": 0.36328125, + "learning_rate": 0.00018100890207715137, + "loss": 1.0166, + "step": 610 + }, + { + "epoch": 0.09, + "grad_norm": 0.375, + "learning_rate": 0.0001824925816023739, + "loss": 1.0347, + "step": 615 + }, + { + "epoch": 0.09, + "grad_norm": 0.3671875, + "learning_rate": 0.00018397626112759646, + "loss": 1.0388, + "step": 620 + }, + { + "epoch": 0.09, + "grad_norm": 0.365234375, + "learning_rate": 0.000185459940652819, + "loss": 1.0545, + "step": 625 + }, + { + "epoch": 0.09, + "grad_norm": 0.361328125, + "learning_rate": 0.00018694362017804153, + "loss": 1.0281, + "step": 630 + }, + { + "epoch": 0.09, + "grad_norm": 0.34765625, + "learning_rate": 0.0001884272997032641, + "loss": 1.0347, + "step": 635 + }, + { + "epoch": 0.1, + "grad_norm": 0.353515625, + "learning_rate": 0.00018991097922848666, + "loss": 1.0347, + "step": 640 + }, + { + "epoch": 0.1, + "grad_norm": 0.357421875, + "learning_rate": 0.00019139465875370922, + "loss": 1.0594, + "step": 645 + }, + { + "epoch": 0.1, + "grad_norm": 0.353515625, + "learning_rate": 0.00019287833827893175, + "loss": 1.0201, + "step": 650 + }, + { + "epoch": 0.1, + "grad_norm": 0.384765625, + "learning_rate": 0.00019436201780415432, + "loss": 1.0287, + "step": 655 + }, + { + "epoch": 0.1, + "grad_norm": 0.359375, + "learning_rate": 0.00019584569732937685, + "loss": 1.0777, + "step": 660 + }, + { + "epoch": 0.1, + "grad_norm": 0.349609375, + "learning_rate": 0.00019732937685459941, + "loss": 1.0157, + "step": 665 + }, + { + "epoch": 0.1, + "grad_norm": 0.349609375, + "learning_rate": 0.00019881305637982198, + "loss": 1.0474, + "step": 670 + }, + { + "epoch": 0.1, + "grad_norm": 0.353515625, + "learning_rate": 0.0001999999865623139, + "loss": 1.0452, + "step": 675 + }, + { + "epoch": 0.1, + "grad_norm": 0.3828125, + "learning_rate": 0.00019999951624367985, + "loss": 1.02, + "step": 680 + }, + { + "epoch": 0.1, + "grad_norm": 0.345703125, + "learning_rate": 0.0001999983740443526, + "loss": 1.041, + "step": 685 + }, + { + "epoch": 0.1, + "grad_norm": 0.353515625, + "learning_rate": 0.00019999655997200635, + "loss": 1.0305, + "step": 690 + }, + { + "epoch": 0.1, + "grad_norm": 0.359375, + "learning_rate": 0.0001999940740388296, + "loss": 1.032, + "step": 695 + }, + { + "epoch": 0.1, + "grad_norm": 0.35546875, + "learning_rate": 0.00019999091626152492, + "loss": 1.0027, + "step": 700 + }, + { + "epoch": 0.1, + "grad_norm": 0.349609375, + "learning_rate": 0.00019998708666130893, + "loss": 1.0311, + "step": 705 + }, + { + "epoch": 0.11, + "grad_norm": 0.37109375, + "learning_rate": 0.00019998258526391207, + "loss": 1.0165, + "step": 710 + }, + { + "epoch": 0.11, + "grad_norm": 0.361328125, + "learning_rate": 0.00019997741209957853, + "loss": 1.013, + "step": 715 + }, + { + "epoch": 0.11, + "grad_norm": 0.357421875, + "learning_rate": 0.00019997156720306597, + "loss": 0.9992, + "step": 720 + }, + { + "epoch": 0.11, + "grad_norm": 0.34765625, + "learning_rate": 0.00019996505061364527, + "loss": 1.044, + "step": 725 + }, + { + "epoch": 0.11, + "grad_norm": 0.55859375, + "learning_rate": 0.0001999578623751004, + "loss": 0.998, + "step": 730 + }, + { + "epoch": 0.11, + "grad_norm": 0.349609375, + "learning_rate": 0.00019995000253572798, + "loss": 1.0354, + "step": 735 + }, + { + "epoch": 0.11, + "grad_norm": 0.353515625, + "learning_rate": 0.00019994147114833698, + "loss": 1.0083, + "step": 740 + }, + { + "epoch": 0.11, + "grad_norm": 0.359375, + "learning_rate": 0.00019993226827024843, + "loss": 1.012, + "step": 745 + }, + { + "epoch": 0.11, + "grad_norm": 0.359375, + "learning_rate": 0.00019992239396329498, + "loss": 0.9953, + "step": 750 + }, + { + "epoch": 0.11, + "grad_norm": 0.3515625, + "learning_rate": 0.00019991184829382057, + "loss": 1.0095, + "step": 755 + }, + { + "epoch": 0.11, + "grad_norm": 0.349609375, + "learning_rate": 0.00019990063133267975, + "loss": 1.0518, + "step": 760 + }, + { + "epoch": 0.11, + "grad_norm": 0.369140625, + "learning_rate": 0.0001998887431552376, + "loss": 1.0357, + "step": 765 + }, + { + "epoch": 0.11, + "grad_norm": 0.359375, + "learning_rate": 0.00019987618384136879, + "loss": 1.0542, + "step": 770 + }, + { + "epoch": 0.12, + "grad_norm": 0.361328125, + "learning_rate": 0.0001998629534754574, + "loss": 1.0071, + "step": 775 + }, + { + "epoch": 0.12, + "grad_norm": 0.34375, + "learning_rate": 0.00019984905214639608, + "loss": 1.0414, + "step": 780 + }, + { + "epoch": 0.12, + "grad_norm": 0.376953125, + "learning_rate": 0.00019983447994758563, + "loss": 1.003, + "step": 785 + }, + { + "epoch": 0.12, + "grad_norm": 0.3515625, + "learning_rate": 0.00019981923697693437, + "loss": 1.0482, + "step": 790 + }, + { + "epoch": 0.12, + "grad_norm": 0.361328125, + "learning_rate": 0.00019980332333685729, + "loss": 1.0394, + "step": 795 + }, + { + "epoch": 0.12, + "grad_norm": 0.353515625, + "learning_rate": 0.00019978673913427568, + "loss": 0.985, + "step": 800 + }, + { + "epoch": 0.12, + "grad_norm": 0.33984375, + "learning_rate": 0.00019976948448061603, + "loss": 1.0149, + "step": 805 + }, + { + "epoch": 0.12, + "grad_norm": 0.345703125, + "learning_rate": 0.00019975155949180967, + "loss": 1.0253, + "step": 810 + }, + { + "epoch": 0.12, + "grad_norm": 0.34765625, + "learning_rate": 0.00019973296428829168, + "loss": 1.0067, + "step": 815 + }, + { + "epoch": 0.12, + "grad_norm": 0.3515625, + "learning_rate": 0.00019971369899500024, + "loss": 1.013, + "step": 820 + }, + { + "epoch": 0.12, + "grad_norm": 0.3515625, + "learning_rate": 0.00019969376374137578, + "loss": 1.0253, + "step": 825 + }, + { + "epoch": 0.12, + "grad_norm": 0.345703125, + "learning_rate": 0.0001996731586613601, + "loss": 1.0505, + "step": 830 + }, + { + "epoch": 0.12, + "grad_norm": 0.3671875, + "learning_rate": 0.00019965188389339537, + "loss": 1.0199, + "step": 835 + }, + { + "epoch": 0.12, + "grad_norm": 0.345703125, + "learning_rate": 0.00019962993958042336, + "loss": 1.0066, + "step": 840 + }, + { + "epoch": 0.13, + "grad_norm": 0.33203125, + "learning_rate": 0.00019960732586988438, + "loss": 1.0226, + "step": 845 + }, + { + "epoch": 0.13, + "grad_norm": 0.345703125, + "learning_rate": 0.00019958404291371635, + "loss": 0.9902, + "step": 850 + }, + { + "epoch": 0.13, + "grad_norm": 0.365234375, + "learning_rate": 0.0001995600908683537, + "loss": 1.0338, + "step": 855 + }, + { + "epoch": 0.13, + "grad_norm": 0.349609375, + "learning_rate": 0.00019953546989472633, + "loss": 1.0103, + "step": 860 + }, + { + "epoch": 0.13, + "grad_norm": 0.3515625, + "learning_rate": 0.00019951018015825866, + "loss": 1.0237, + "step": 865 + }, + { + "epoch": 0.13, + "grad_norm": 0.349609375, + "learning_rate": 0.00019948422182886833, + "loss": 0.972, + "step": 870 + }, + { + "epoch": 0.13, + "grad_norm": 0.357421875, + "learning_rate": 0.00019945759508096527, + "loss": 1.0472, + "step": 875 + }, + { + "epoch": 0.13, + "grad_norm": 0.34765625, + "learning_rate": 0.00019943030009345023, + "loss": 1.0156, + "step": 880 + }, + { + "epoch": 0.13, + "grad_norm": 0.3515625, + "learning_rate": 0.00019940233704971388, + "loss": 1.0365, + "step": 885 + }, + { + "epoch": 0.13, + "grad_norm": 0.33984375, + "learning_rate": 0.00019937370613763543, + "loss": 1.0098, + "step": 890 + }, + { + "epoch": 0.13, + "grad_norm": 0.33203125, + "learning_rate": 0.00019934440754958136, + "loss": 1.0042, + "step": 895 + }, + { + "epoch": 0.13, + "grad_norm": 0.33984375, + "learning_rate": 0.00019931444148240423, + "loss": 1.0053, + "step": 900 + }, + { + "epoch": 0.13, + "grad_norm": 0.353515625, + "learning_rate": 0.00019928380813744119, + "loss": 1.0311, + "step": 905 + }, + { + "epoch": 0.14, + "grad_norm": 0.345703125, + "learning_rate": 0.00019925250772051276, + "loss": 0.9973, + "step": 910 + }, + { + "epoch": 0.14, + "grad_norm": 0.353515625, + "learning_rate": 0.00019922054044192145, + "loss": 1.0023, + "step": 915 + }, + { + "epoch": 0.14, + "grad_norm": 0.341796875, + "learning_rate": 0.00019918790651645023, + "loss": 1.0097, + "step": 920 + }, + { + "epoch": 0.14, + "grad_norm": 0.34375, + "learning_rate": 0.00019915460616336126, + "loss": 1.013, + "step": 925 + }, + { + "epoch": 0.14, + "grad_norm": 0.34765625, + "learning_rate": 0.0001991206396063942, + "loss": 0.9989, + "step": 930 + }, + { + "epoch": 0.14, + "grad_norm": 0.34375, + "learning_rate": 0.00019908600707376495, + "loss": 0.9815, + "step": 935 + }, + { + "epoch": 0.14, + "grad_norm": 0.3515625, + "learning_rate": 0.0001990507087981639, + "loss": 0.9753, + "step": 940 + }, + { + "epoch": 0.14, + "grad_norm": 0.33984375, + "learning_rate": 0.0001990147450167545, + "loss": 1.0145, + "step": 945 + }, + { + "epoch": 0.14, + "grad_norm": 0.35546875, + "learning_rate": 0.00019897811597117168, + "loss": 1.0359, + "step": 950 + }, + { + "epoch": 0.14, + "grad_norm": 0.333984375, + "learning_rate": 0.00019894082190751998, + "loss": 0.9902, + "step": 955 + }, + { + "epoch": 0.14, + "grad_norm": 0.349609375, + "learning_rate": 0.00019890286307637237, + "loss": 0.9997, + "step": 960 + }, + { + "epoch": 0.14, + "grad_norm": 0.33984375, + "learning_rate": 0.0001988642397327681, + "loss": 1.0412, + "step": 965 + }, + { + "epoch": 0.14, + "grad_norm": 0.33984375, + "learning_rate": 0.00019882495213621116, + "loss": 0.9552, + "step": 970 + }, + { + "epoch": 0.14, + "grad_norm": 0.3515625, + "learning_rate": 0.00019878500055066866, + "loss": 0.9901, + "step": 975 + }, + { + "epoch": 0.15, + "grad_norm": 0.337890625, + "learning_rate": 0.00019874438524456888, + "loss": 1.035, + "step": 980 + }, + { + "epoch": 0.15, + "grad_norm": 0.34765625, + "learning_rate": 0.0001987031064907995, + "loss": 0.996, + "step": 985 + }, + { + "epoch": 0.15, + "grad_norm": 0.333984375, + "learning_rate": 0.0001986611645667059, + "loss": 1.0248, + "step": 990 + }, + { + "epoch": 0.15, + "grad_norm": 0.3515625, + "learning_rate": 0.0001986185597540891, + "loss": 0.9889, + "step": 995 + }, + { + "epoch": 0.15, + "grad_norm": 0.341796875, + "learning_rate": 0.00019857529233920397, + "loss": 1.0151, + "step": 1000 + }, + { + "epoch": 0.15, + "grad_norm": 0.3671875, + "learning_rate": 0.00019853136261275737, + "loss": 0.9882, + "step": 1005 + }, + { + "epoch": 0.15, + "grad_norm": 0.33984375, + "learning_rate": 0.00019848677086990605, + "loss": 1.0051, + "step": 1010 + }, + { + "epoch": 0.15, + "grad_norm": 0.34375, + "learning_rate": 0.0001984415174102548, + "loss": 0.9904, + "step": 1015 + }, + { + "epoch": 0.15, + "grad_norm": 0.34765625, + "learning_rate": 0.0001983956025378543, + "loss": 0.9899, + "step": 1020 + }, + { + "epoch": 0.15, + "grad_norm": 0.341796875, + "learning_rate": 0.00019834902656119924, + "loss": 0.999, + "step": 1025 + }, + { + "epoch": 0.15, + "grad_norm": 0.3515625, + "learning_rate": 0.00019830178979322614, + "loss": 0.9798, + "step": 1030 + }, + { + "epoch": 0.15, + "grad_norm": 0.349609375, + "learning_rate": 0.00019825389255131125, + "loss": 0.984, + "step": 1035 + }, + { + "epoch": 0.15, + "grad_norm": 0.34765625, + "learning_rate": 0.0001982053351572684, + "loss": 1.0014, + "step": 1040 + }, + { + "epoch": 0.16, + "grad_norm": 0.345703125, + "learning_rate": 0.000198156117937347, + "loss": 1.0148, + "step": 1045 + }, + { + "epoch": 0.16, + "grad_norm": 0.330078125, + "learning_rate": 0.0001981062412222296, + "loss": 1.0271, + "step": 1050 + }, + { + "epoch": 0.16, + "grad_norm": 0.341796875, + "learning_rate": 0.00019805570534702987, + "loss": 0.9902, + "step": 1055 + }, + { + "epoch": 0.16, + "grad_norm": 0.330078125, + "learning_rate": 0.00019800451065129018, + "loss": 0.9699, + "step": 1060 + }, + { + "epoch": 0.16, + "grad_norm": 0.337890625, + "learning_rate": 0.00019795265747897956, + "loss": 0.9911, + "step": 1065 + }, + { + "epoch": 0.16, + "grad_norm": 0.345703125, + "learning_rate": 0.00019790014617849106, + "loss": 0.9923, + "step": 1070 + }, + { + "epoch": 0.16, + "grad_norm": 0.341796875, + "learning_rate": 0.00019784697710263974, + "loss": 0.9976, + "step": 1075 + }, + { + "epoch": 0.16, + "grad_norm": 0.3515625, + "learning_rate": 0.00019779315060866007, + "loss": 0.9647, + "step": 1080 + }, + { + "epoch": 0.16, + "grad_norm": 0.353515625, + "learning_rate": 0.00019773866705820363, + "loss": 0.9589, + "step": 1085 + }, + { + "epoch": 0.16, + "grad_norm": 0.330078125, + "learning_rate": 0.00019768352681733662, + "loss": 1.005, + "step": 1090 + }, + { + "epoch": 0.16, + "grad_norm": 0.341796875, + "learning_rate": 0.00019762773025653747, + "loss": 1.0066, + "step": 1095 + }, + { + "epoch": 0.16, + "grad_norm": 0.345703125, + "learning_rate": 0.0001975712777506943, + "loss": 1.0118, + "step": 1100 + }, + { + "epoch": 0.16, + "grad_norm": 0.34375, + "learning_rate": 0.00019751416967910248, + "loss": 1.0445, + "step": 1105 + }, + { + "epoch": 0.16, + "grad_norm": 0.34765625, + "learning_rate": 0.00019745640642546196, + "loss": 0.9766, + "step": 1110 + }, + { + "epoch": 0.17, + "grad_norm": 0.353515625, + "learning_rate": 0.0001973979883778747, + "loss": 0.9897, + "step": 1115 + }, + { + "epoch": 0.17, + "grad_norm": 0.3515625, + "learning_rate": 0.00019733891592884227, + "loss": 0.9826, + "step": 1120 + }, + { + "epoch": 0.17, + "grad_norm": 0.34765625, + "learning_rate": 0.00019727918947526292, + "loss": 0.988, + "step": 1125 + }, + { + "epoch": 0.17, + "grad_norm": 0.349609375, + "learning_rate": 0.00019721880941842913, + "loss": 1.0101, + "step": 1130 + }, + { + "epoch": 0.17, + "grad_norm": 0.34375, + "learning_rate": 0.00019715777616402479, + "loss": 0.9744, + "step": 1135 + }, + { + "epoch": 0.17, + "grad_norm": 0.33984375, + "learning_rate": 0.0001970960901221225, + "loss": 0.9896, + "step": 1140 + }, + { + "epoch": 0.17, + "grad_norm": 0.357421875, + "learning_rate": 0.00019703375170718093, + "loss": 0.9786, + "step": 1145 + }, + { + "epoch": 0.17, + "grad_norm": 0.34765625, + "learning_rate": 0.00019697076133804185, + "loss": 0.9958, + "step": 1150 + }, + { + "epoch": 0.17, + "grad_norm": 0.341796875, + "learning_rate": 0.0001969071194379275, + "loss": 0.9803, + "step": 1155 + }, + { + "epoch": 0.17, + "grad_norm": 0.3515625, + "learning_rate": 0.00019684282643443748, + "loss": 0.9894, + "step": 1160 + }, + { + "epoch": 0.17, + "grad_norm": 0.357421875, + "learning_rate": 0.00019677788275954624, + "loss": 1.0073, + "step": 1165 + }, + { + "epoch": 0.17, + "grad_norm": 0.33984375, + "learning_rate": 0.00019671228884959987, + "loss": 1.0145, + "step": 1170 + }, + { + "epoch": 0.17, + "grad_norm": 0.357421875, + "learning_rate": 0.00019664604514531332, + "loss": 1.0035, + "step": 1175 + }, + { + "epoch": 0.18, + "grad_norm": 0.353515625, + "learning_rate": 0.0001965791520917674, + "loss": 0.9856, + "step": 1180 + }, + { + "epoch": 0.18, + "grad_norm": 0.33984375, + "learning_rate": 0.00019651161013840583, + "loss": 0.9573, + "step": 1185 + }, + { + "epoch": 0.18, + "grad_norm": 0.328125, + "learning_rate": 0.00019644341973903208, + "loss": 0.9824, + "step": 1190 + }, + { + "epoch": 0.18, + "grad_norm": 0.333984375, + "learning_rate": 0.00019637458135180657, + "loss": 0.9873, + "step": 1195 + }, + { + "epoch": 0.18, + "grad_norm": 0.3515625, + "learning_rate": 0.0001963050954392433, + "loss": 0.9743, + "step": 1200 + }, + { + "epoch": 0.18, + "grad_norm": 0.345703125, + "learning_rate": 0.00019623496246820704, + "loss": 0.9987, + "step": 1205 + }, + { + "epoch": 0.18, + "grad_norm": 0.349609375, + "learning_rate": 0.00019616418290990993, + "loss": 1.0153, + "step": 1210 + }, + { + "epoch": 0.18, + "grad_norm": 0.39453125, + "learning_rate": 0.0001960927572399084, + "loss": 0.9907, + "step": 1215 + }, + { + "epoch": 0.18, + "grad_norm": 0.337890625, + "learning_rate": 0.00019602068593810014, + "loss": 1.0148, + "step": 1220 + }, + { + "epoch": 0.18, + "grad_norm": 0.34375, + "learning_rate": 0.0001959479694887206, + "loss": 1.0024, + "step": 1225 + }, + { + "epoch": 0.18, + "grad_norm": 0.34765625, + "learning_rate": 0.00019587460838033996, + "loss": 1.0298, + "step": 1230 + }, + { + "epoch": 0.18, + "grad_norm": 0.357421875, + "learning_rate": 0.00019580060310585973, + "loss": 0.9946, + "step": 1235 + }, + { + "epoch": 0.18, + "grad_norm": 0.349609375, + "learning_rate": 0.00019572595416250942, + "loss": 0.9961, + "step": 1240 + }, + { + "epoch": 0.18, + "grad_norm": 0.333984375, + "learning_rate": 0.00019565066205184332, + "loss": 1.0016, + "step": 1245 + }, + { + "epoch": 0.19, + "grad_norm": 0.353515625, + "learning_rate": 0.00019557472727973707, + "loss": 0.9829, + "step": 1250 + }, + { + "epoch": 0.19, + "grad_norm": 0.34765625, + "learning_rate": 0.00019549815035638414, + "loss": 0.9781, + "step": 1255 + }, + { + "epoch": 0.19, + "grad_norm": 0.353515625, + "learning_rate": 0.00019542093179629268, + "loss": 1.0089, + "step": 1260 + }, + { + "epoch": 0.19, + "grad_norm": 0.34375, + "learning_rate": 0.0001953430721182817, + "loss": 0.9942, + "step": 1265 + }, + { + "epoch": 0.19, + "grad_norm": 0.357421875, + "learning_rate": 0.00019526457184547793, + "loss": 0.9818, + "step": 1270 + }, + { + "epoch": 0.19, + "grad_norm": 0.359375, + "learning_rate": 0.00019518543150531207, + "loss": 1.018, + "step": 1275 + }, + { + "epoch": 0.19, + "grad_norm": 0.345703125, + "learning_rate": 0.00019510565162951537, + "loss": 0.9697, + "step": 1280 + }, + { + "epoch": 0.19, + "grad_norm": 0.359375, + "learning_rate": 0.00019502523275411599, + "loss": 1.0331, + "step": 1285 + }, + { + "epoch": 0.19, + "grad_norm": 0.34375, + "learning_rate": 0.00019494417541943547, + "loss": 1.0071, + "step": 1290 + }, + { + "epoch": 0.19, + "grad_norm": 0.345703125, + "learning_rate": 0.00019486248017008503, + "loss": 0.9699, + "step": 1295 + }, + { + "epoch": 0.19, + "grad_norm": 0.34765625, + "learning_rate": 0.00019478014755496196, + "loss": 1.0168, + "step": 1300 + }, + { + "epoch": 0.19, + "grad_norm": 0.345703125, + "learning_rate": 0.00019469717812724575, + "loss": 0.9719, + "step": 1305 + }, + { + "epoch": 0.19, + "grad_norm": 0.345703125, + "learning_rate": 0.00019461357244439479, + "loss": 0.9974, + "step": 1310 + }, + { + "epoch": 0.2, + "grad_norm": 0.34375, + "learning_rate": 0.00019452933106814223, + "loss": 0.9897, + "step": 1315 + }, + { + "epoch": 0.2, + "grad_norm": 0.345703125, + "learning_rate": 0.0001944444545644923, + "loss": 0.9809, + "step": 1320 + }, + { + "epoch": 0.2, + "grad_norm": 0.341796875, + "learning_rate": 0.0001943589435037166, + "loss": 0.9601, + "step": 1325 + }, + { + "epoch": 0.2, + "grad_norm": 0.345703125, + "learning_rate": 0.00019427279846035025, + "loss": 0.9615, + "step": 1330 + }, + { + "epoch": 0.2, + "grad_norm": 0.337890625, + "learning_rate": 0.00019418602001318797, + "loss": 0.9888, + "step": 1335 + }, + { + "epoch": 0.2, + "grad_norm": 0.341796875, + "learning_rate": 0.00019409860874528017, + "loss": 1.0099, + "step": 1340 + }, + { + "epoch": 0.2, + "grad_norm": 0.333984375, + "learning_rate": 0.00019401056524392916, + "loss": 0.9911, + "step": 1345 + }, + { + "epoch": 0.2, + "grad_norm": 0.3515625, + "learning_rate": 0.00019392189010068508, + "loss": 0.992, + "step": 1350 + }, + { + "epoch": 0.2, + "grad_norm": 0.337890625, + "learning_rate": 0.000193832583911342, + "loss": 0.9827, + "step": 1355 + }, + { + "epoch": 0.2, + "grad_norm": 0.3515625, + "learning_rate": 0.0001937426472759338, + "loss": 0.9473, + "step": 1360 + }, + { + "epoch": 0.2, + "grad_norm": 0.3671875, + "learning_rate": 0.00019365208079873036, + "loss": 0.9863, + "step": 1365 + }, + { + "epoch": 0.2, + "grad_norm": 0.349609375, + "learning_rate": 0.0001935608850882333, + "loss": 0.9662, + "step": 1370 + }, + { + "epoch": 0.2, + "grad_norm": 0.3515625, + "learning_rate": 0.0001934690607571719, + "loss": 0.9864, + "step": 1375 + }, + { + "epoch": 0.2, + "grad_norm": 0.345703125, + "learning_rate": 0.00019337660842249914, + "loss": 1.0151, + "step": 1380 + }, + { + "epoch": 0.21, + "grad_norm": 0.337890625, + "learning_rate": 0.0001932835287053874, + "loss": 0.9638, + "step": 1385 + }, + { + "epoch": 0.21, + "grad_norm": 0.341796875, + "learning_rate": 0.00019318982223122437, + "loss": 1.0149, + "step": 1390 + }, + { + "epoch": 0.21, + "grad_norm": 0.341796875, + "learning_rate": 0.00019309548962960876, + "loss": 0.9827, + "step": 1395 + }, + { + "epoch": 0.21, + "grad_norm": 0.349609375, + "learning_rate": 0.00019300053153434622, + "loss": 0.9726, + "step": 1400 + }, + { + "epoch": 0.21, + "grad_norm": 0.33984375, + "learning_rate": 0.00019290494858344493, + "loss": 0.9742, + "step": 1405 + }, + { + "epoch": 0.21, + "grad_norm": 0.345703125, + "learning_rate": 0.00019280874141911137, + "loss": 0.9987, + "step": 1410 + }, + { + "epoch": 0.21, + "grad_norm": 0.349609375, + "learning_rate": 0.00019271191068774606, + "loss": 1.0067, + "step": 1415 + }, + { + "epoch": 0.21, + "grad_norm": 0.33984375, + "learning_rate": 0.00019261445703993912, + "loss": 0.9833, + "step": 1420 + }, + { + "epoch": 0.21, + "grad_norm": 0.35546875, + "learning_rate": 0.00019251638113046597, + "loss": 0.9599, + "step": 1425 + }, + { + "epoch": 0.21, + "grad_norm": 0.3671875, + "learning_rate": 0.0001924176836182829, + "loss": 0.9793, + "step": 1430 + }, + { + "epoch": 0.21, + "grad_norm": 0.337890625, + "learning_rate": 0.00019231836516652261, + "loss": 0.965, + "step": 1435 + }, + { + "epoch": 0.21, + "grad_norm": 0.353515625, + "learning_rate": 0.0001922184264424899, + "loss": 0.9598, + "step": 1440 + }, + { + "epoch": 0.21, + "grad_norm": 0.3359375, + "learning_rate": 0.00019211786811765692, + "loss": 0.9931, + "step": 1445 + }, + { + "epoch": 0.22, + "grad_norm": 0.345703125, + "learning_rate": 0.00019201669086765902, + "loss": 0.9571, + "step": 1450 + }, + { + "epoch": 0.22, + "grad_norm": 0.328125, + "learning_rate": 0.0001919148953722898, + "loss": 0.9448, + "step": 1455 + }, + { + "epoch": 0.22, + "grad_norm": 0.328125, + "learning_rate": 0.0001918124823154969, + "loss": 0.9523, + "step": 1460 + }, + { + "epoch": 0.22, + "grad_norm": 0.34765625, + "learning_rate": 0.00019170945238537718, + "loss": 0.9888, + "step": 1465 + }, + { + "epoch": 0.22, + "grad_norm": 0.34765625, + "learning_rate": 0.00019160580627417223, + "loss": 0.9735, + "step": 1470 + }, + { + "epoch": 0.22, + "grad_norm": 0.337890625, + "learning_rate": 0.00019150154467826357, + "loss": 0.9726, + "step": 1475 + }, + { + "epoch": 0.22, + "grad_norm": 0.34765625, + "learning_rate": 0.00019139666829816817, + "loss": 0.9679, + "step": 1480 + }, + { + "epoch": 0.22, + "grad_norm": 0.33203125, + "learning_rate": 0.0001912911778385336, + "loss": 0.9913, + "step": 1485 + }, + { + "epoch": 0.22, + "grad_norm": 0.33203125, + "learning_rate": 0.00019118507400813325, + "loss": 0.9736, + "step": 1490 + }, + { + "epoch": 0.22, + "grad_norm": 0.337890625, + "learning_rate": 0.0001910783575198618, + "loss": 0.9535, + "step": 1495 + }, + { + "epoch": 0.22, + "grad_norm": 0.341796875, + "learning_rate": 0.0001909710290907302, + "loss": 0.993, + "step": 1500 + }, + { + "epoch": 0.22, + "grad_norm": 0.3515625, + "learning_rate": 0.00019086308944186084, + "loss": 0.9645, + "step": 1505 + }, + { + "epoch": 0.22, + "grad_norm": 0.33984375, + "learning_rate": 0.000190754539298483, + "loss": 0.9694, + "step": 1510 + }, + { + "epoch": 0.22, + "grad_norm": 0.337890625, + "learning_rate": 0.00019064537938992757, + "loss": 0.9775, + "step": 1515 + }, + { + "epoch": 0.23, + "grad_norm": 0.34375, + "learning_rate": 0.0001905356104496225, + "loss": 0.9388, + "step": 1520 + }, + { + "epoch": 0.23, + "grad_norm": 0.357421875, + "learning_rate": 0.00019042523321508768, + "loss": 0.9755, + "step": 1525 + }, + { + "epoch": 0.23, + "grad_norm": 0.359375, + "learning_rate": 0.00019031424842793, + "loss": 1.0159, + "step": 1530 + }, + { + "epoch": 0.23, + "grad_norm": 0.3515625, + "learning_rate": 0.00019020265683383842, + "loss": 0.9835, + "step": 1535 + }, + { + "epoch": 0.23, + "grad_norm": 0.349609375, + "learning_rate": 0.000190090459182579, + "loss": 0.9711, + "step": 1540 + }, + { + "epoch": 0.23, + "grad_norm": 0.341796875, + "learning_rate": 0.00018997765622798967, + "loss": 0.9742, + "step": 1545 + }, + { + "epoch": 0.23, + "grad_norm": 0.33984375, + "learning_rate": 0.0001898642487279754, + "loss": 0.9669, + "step": 1550 + }, + { + "epoch": 0.23, + "grad_norm": 0.3359375, + "learning_rate": 0.000189750237444503, + "loss": 0.9516, + "step": 1555 + }, + { + "epoch": 0.23, + "grad_norm": 0.349609375, + "learning_rate": 0.00018963562314359595, + "loss": 1.0002, + "step": 1560 + }, + { + "epoch": 0.23, + "grad_norm": 0.341796875, + "learning_rate": 0.00018952040659532936, + "loss": 0.937, + "step": 1565 + }, + { + "epoch": 0.23, + "grad_norm": 0.33984375, + "learning_rate": 0.00018940458857382467, + "loss": 0.9757, + "step": 1570 + }, + { + "epoch": 0.23, + "grad_norm": 0.333984375, + "learning_rate": 0.00018928816985724458, + "loss": 0.9968, + "step": 1575 + }, + { + "epoch": 0.23, + "grad_norm": 0.3359375, + "learning_rate": 0.0001891711512277878, + "loss": 0.974, + "step": 1580 + }, + { + "epoch": 0.24, + "grad_norm": 0.34765625, + "learning_rate": 0.00018905353347168366, + "loss": 0.9641, + "step": 1585 + }, + { + "epoch": 0.24, + "grad_norm": 0.33984375, + "learning_rate": 0.00018893531737918702, + "loss": 0.9799, + "step": 1590 + }, + { + "epoch": 0.24, + "grad_norm": 0.3359375, + "learning_rate": 0.0001888165037445728, + "loss": 0.9385, + "step": 1595 + }, + { + "epoch": 0.24, + "grad_norm": 0.357421875, + "learning_rate": 0.00018869709336613073, + "loss": 1.0199, + "step": 1600 + }, + { + "epoch": 0.24, + "grad_norm": 0.33203125, + "learning_rate": 0.00018857708704615996, + "loss": 0.9743, + "step": 1605 + }, + { + "epoch": 0.24, + "grad_norm": 0.3515625, + "learning_rate": 0.00018845648559096377, + "loss": 0.9641, + "step": 1610 + }, + { + "epoch": 0.24, + "grad_norm": 0.34375, + "learning_rate": 0.00018833528981084384, + "loss": 0.9978, + "step": 1615 + }, + { + "epoch": 0.24, + "grad_norm": 0.34765625, + "learning_rate": 0.00018821350052009524, + "loss": 0.9661, + "step": 1620 + }, + { + "epoch": 0.24, + "grad_norm": 0.33984375, + "learning_rate": 0.00018809111853700068, + "loss": 0.9739, + "step": 1625 + }, + { + "epoch": 0.24, + "grad_norm": 0.33984375, + "learning_rate": 0.00018796814468382498, + "loss": 0.9467, + "step": 1630 + }, + { + "epoch": 0.24, + "grad_norm": 0.357421875, + "learning_rate": 0.00018784457978680976, + "loss": 0.9578, + "step": 1635 + }, + { + "epoch": 0.24, + "grad_norm": 0.34375, + "learning_rate": 0.0001877204246761677, + "loss": 0.9351, + "step": 1640 + }, + { + "epoch": 0.24, + "grad_norm": 0.341796875, + "learning_rate": 0.00018759568018607707, + "loss": 0.969, + "step": 1645 + }, + { + "epoch": 0.25, + "grad_norm": 0.353515625, + "learning_rate": 0.00018747034715467612, + "loss": 0.9618, + "step": 1650 + }, + { + "epoch": 0.25, + "grad_norm": 0.328125, + "learning_rate": 0.00018734442642405724, + "loss": 0.9595, + "step": 1655 + }, + { + "epoch": 0.25, + "grad_norm": 0.34375, + "learning_rate": 0.0001872179188402617, + "loss": 0.9795, + "step": 1660 + }, + { + "epoch": 0.25, + "grad_norm": 0.33984375, + "learning_rate": 0.00018709082525327362, + "loss": 0.9558, + "step": 1665 + }, + { + "epoch": 0.25, + "grad_norm": 0.3359375, + "learning_rate": 0.00018696314651701437, + "loss": 0.9677, + "step": 1670 + }, + { + "epoch": 0.25, + "grad_norm": 0.345703125, + "learning_rate": 0.00018683488348933693, + "loss": 0.9851, + "step": 1675 + }, + { + "epoch": 0.25, + "grad_norm": 0.34765625, + "learning_rate": 0.00018670603703202, + "loss": 0.9973, + "step": 1680 + }, + { + "epoch": 0.25, + "grad_norm": 0.35546875, + "learning_rate": 0.00018657660801076214, + "loss": 0.9835, + "step": 1685 + }, + { + "epoch": 0.25, + "grad_norm": 0.34765625, + "learning_rate": 0.00018644659729517623, + "loss": 0.9342, + "step": 1690 + }, + { + "epoch": 0.25, + "grad_norm": 0.349609375, + "learning_rate": 0.00018631600575878343, + "loss": 1.0092, + "step": 1695 + }, + { + "epoch": 0.25, + "grad_norm": 0.359375, + "learning_rate": 0.00018618483427900722, + "loss": 0.9792, + "step": 1700 + }, + { + "epoch": 0.25, + "grad_norm": 0.34765625, + "learning_rate": 0.00018605308373716782, + "loss": 0.9774, + "step": 1705 + }, + { + "epoch": 0.25, + "grad_norm": 0.33984375, + "learning_rate": 0.00018592075501847584, + "loss": 0.9629, + "step": 1710 + }, + { + "epoch": 0.25, + "grad_norm": 0.33984375, + "learning_rate": 0.00018578784901202675, + "loss": 0.9647, + "step": 1715 + }, + { + "epoch": 0.26, + "grad_norm": 0.34765625, + "learning_rate": 0.0001856543666107947, + "loss": 0.9881, + "step": 1720 + }, + { + "epoch": 0.26, + "grad_norm": 0.345703125, + "learning_rate": 0.0001855203087116265, + "loss": 0.9917, + "step": 1725 + }, + { + "epoch": 0.26, + "grad_norm": 0.35546875, + "learning_rate": 0.0001853856762152356, + "loss": 0.9852, + "step": 1730 + }, + { + "epoch": 0.26, + "grad_norm": 0.3359375, + "learning_rate": 0.00018525047002619616, + "loss": 0.9541, + "step": 1735 + }, + { + "epoch": 0.26, + "grad_norm": 0.34765625, + "learning_rate": 0.00018511469105293683, + "loss": 0.971, + "step": 1740 + }, + { + "epoch": 0.26, + "grad_norm": 0.341796875, + "learning_rate": 0.0001849783402077348, + "loss": 0.9864, + "step": 1745 + }, + { + "epoch": 0.26, + "grad_norm": 0.345703125, + "learning_rate": 0.0001848414184067094, + "loss": 0.9857, + "step": 1750 + }, + { + "epoch": 0.26, + "grad_norm": 0.33203125, + "learning_rate": 0.00018470392656981633, + "loss": 0.9607, + "step": 1755 + }, + { + "epoch": 0.26, + "grad_norm": 0.333984375, + "learning_rate": 0.0001845658656208411, + "loss": 0.9854, + "step": 1760 + }, + { + "epoch": 0.26, + "grad_norm": 0.345703125, + "learning_rate": 0.0001844272364873931, + "loss": 0.9878, + "step": 1765 + }, + { + "epoch": 0.26, + "grad_norm": 0.3359375, + "learning_rate": 0.0001842880401008992, + "loss": 0.9802, + "step": 1770 + }, + { + "epoch": 0.26, + "grad_norm": 0.337890625, + "learning_rate": 0.00018414827739659755, + "loss": 0.9668, + "step": 1775 + }, + { + "epoch": 0.26, + "grad_norm": 0.34375, + "learning_rate": 0.00018400794931353132, + "loss": 0.9624, + "step": 1780 + }, + { + "epoch": 0.27, + "grad_norm": 0.353515625, + "learning_rate": 0.00018386705679454242, + "loss": 1.0008, + "step": 1785 + }, + { + "epoch": 0.27, + "grad_norm": 0.341796875, + "learning_rate": 0.000183725600786265, + "loss": 0.9585, + "step": 1790 + }, + { + "epoch": 0.27, + "grad_norm": 0.345703125, + "learning_rate": 0.00018358358223911932, + "loss": 0.9947, + "step": 1795 + }, + { + "epoch": 0.27, + "grad_norm": 0.337890625, + "learning_rate": 0.00018344100210730514, + "loss": 0.9581, + "step": 1800 + }, + { + "epoch": 0.27, + "grad_norm": 0.33984375, + "learning_rate": 0.00018329786134879553, + "loss": 0.9528, + "step": 1805 + }, + { + "epoch": 0.27, + "grad_norm": 0.33984375, + "learning_rate": 0.00018315416092533023, + "loss": 0.9702, + "step": 1810 + }, + { + "epoch": 0.27, + "grad_norm": 0.3359375, + "learning_rate": 0.00018300990180240931, + "loss": 0.9341, + "step": 1815 + }, + { + "epoch": 0.27, + "grad_norm": 0.333984375, + "learning_rate": 0.00018286508494928666, + "loss": 0.9587, + "step": 1820 + }, + { + "epoch": 0.27, + "grad_norm": 0.34375, + "learning_rate": 0.00018271971133896345, + "loss": 0.9632, + "step": 1825 + }, + { + "epoch": 0.27, + "grad_norm": 0.34375, + "learning_rate": 0.0001825737819481817, + "loss": 0.949, + "step": 1830 + }, + { + "epoch": 0.27, + "grad_norm": 0.35546875, + "learning_rate": 0.00018242729775741744, + "loss": 0.9854, + "step": 1835 + }, + { + "epoch": 0.27, + "grad_norm": 0.3359375, + "learning_rate": 0.0001822802597508745, + "loss": 0.9745, + "step": 1840 + }, + { + "epoch": 0.27, + "grad_norm": 0.353515625, + "learning_rate": 0.00018213266891647765, + "loss": 0.9367, + "step": 1845 + }, + { + "epoch": 0.27, + "grad_norm": 0.349609375, + "learning_rate": 0.00018198452624586592, + "loss": 0.9518, + "step": 1850 + }, + { + "epoch": 0.28, + "grad_norm": 0.345703125, + "learning_rate": 0.00018183583273438623, + "loss": 0.9493, + "step": 1855 + }, + { + "epoch": 0.28, + "grad_norm": 0.337890625, + "learning_rate": 0.00018168658938108636, + "loss": 0.9703, + "step": 1860 + }, + { + "epoch": 0.28, + "grad_norm": 0.349609375, + "learning_rate": 0.00018153679718870843, + "loss": 1.0251, + "step": 1865 + }, + { + "epoch": 0.28, + "grad_norm": 0.337890625, + "learning_rate": 0.00018138645716368212, + "loss": 0.9541, + "step": 1870 + }, + { + "epoch": 0.28, + "grad_norm": 0.34765625, + "learning_rate": 0.00018123557031611798, + "loss": 0.9702, + "step": 1875 + }, + { + "epoch": 0.28, + "grad_norm": 0.349609375, + "learning_rate": 0.00018108413765980045, + "loss": 0.9776, + "step": 1880 + }, + { + "epoch": 0.28, + "grad_norm": 0.328125, + "learning_rate": 0.00018093216021218127, + "loss": 0.9974, + "step": 1885 + }, + { + "epoch": 0.28, + "grad_norm": 0.34375, + "learning_rate": 0.0001807796389943725, + "loss": 0.9357, + "step": 1890 + }, + { + "epoch": 0.28, + "grad_norm": 0.328125, + "learning_rate": 0.00018062657503113968, + "loss": 0.97, + "step": 1895 + }, + { + "epoch": 0.28, + "grad_norm": 0.345703125, + "learning_rate": 0.00018047296935089503, + "loss": 0.9495, + "step": 1900 + }, + { + "epoch": 0.28, + "grad_norm": 0.33984375, + "learning_rate": 0.00018031882298569048, + "loss": 0.977, + "step": 1905 + }, + { + "epoch": 0.28, + "grad_norm": 0.349609375, + "learning_rate": 0.00018016413697121065, + "loss": 0.9441, + "step": 1910 + }, + { + "epoch": 0.28, + "grad_norm": 0.353515625, + "learning_rate": 0.0001800089123467661, + "loss": 0.9695, + "step": 1915 + }, + { + "epoch": 0.29, + "grad_norm": 0.326171875, + "learning_rate": 0.00017985315015528607, + "loss": 0.9556, + "step": 1920 + }, + { + "epoch": 0.29, + "grad_norm": 0.357421875, + "learning_rate": 0.00017969685144331182, + "loss": 0.9801, + "step": 1925 + }, + { + "epoch": 0.29, + "grad_norm": 0.3515625, + "learning_rate": 0.00017954001726098917, + "loss": 1.0291, + "step": 1930 + }, + { + "epoch": 0.29, + "grad_norm": 0.34765625, + "learning_rate": 0.00017938264866206193, + "loss": 0.9283, + "step": 1935 + }, + { + "epoch": 0.29, + "grad_norm": 0.328125, + "learning_rate": 0.0001792247467038644, + "loss": 0.9569, + "step": 1940 + }, + { + "epoch": 0.29, + "grad_norm": 0.34375, + "learning_rate": 0.00017906631244731446, + "loss": 0.9667, + "step": 1945 + }, + { + "epoch": 0.29, + "grad_norm": 0.34375, + "learning_rate": 0.00017890734695690652, + "loss": 0.953, + "step": 1950 + }, + { + "epoch": 0.29, + "grad_norm": 0.3515625, + "learning_rate": 0.0001787478513007041, + "loss": 0.9536, + "step": 1955 + }, + { + "epoch": 0.29, + "grad_norm": 0.349609375, + "learning_rate": 0.00017858782655033291, + "loss": 0.9896, + "step": 1960 + }, + { + "epoch": 0.29, + "grad_norm": 0.341796875, + "learning_rate": 0.0001784272737809736, + "loss": 0.9257, + "step": 1965 + }, + { + "epoch": 0.29, + "grad_norm": 0.349609375, + "learning_rate": 0.00017826619407135445, + "loss": 0.9844, + "step": 1970 + }, + { + "epoch": 0.29, + "grad_norm": 0.349609375, + "learning_rate": 0.00017810458850374414, + "loss": 0.9824, + "step": 1975 + }, + { + "epoch": 0.29, + "grad_norm": 0.34765625, + "learning_rate": 0.0001779424581639445, + "loss": 0.9524, + "step": 1980 + }, + { + "epoch": 0.29, + "grad_norm": 0.341796875, + "learning_rate": 0.0001777798041412833, + "loss": 0.9813, + "step": 1985 + }, + { + "epoch": 0.3, + "grad_norm": 0.345703125, + "learning_rate": 0.00017761662752860678, + "loss": 0.9497, + "step": 1990 + }, + { + "epoch": 0.3, + "grad_norm": 0.3515625, + "learning_rate": 0.00017745292942227244, + "loss": 0.921, + "step": 1995 + }, + { + "epoch": 0.3, + "grad_norm": 0.34375, + "learning_rate": 0.0001772887109221415, + "loss": 0.9375, + "step": 2000 + }, + { + "epoch": 0.3, + "grad_norm": 0.34375, + "learning_rate": 0.00017712397313157175, + "loss": 0.9381, + "step": 2005 + }, + { + "epoch": 0.3, + "grad_norm": 0.349609375, + "learning_rate": 0.00017695871715740994, + "loss": 0.954, + "step": 2010 + }, + { + "epoch": 0.3, + "grad_norm": 0.341796875, + "learning_rate": 0.00017679294410998442, + "loss": 0.9281, + "step": 2015 + }, + { + "epoch": 0.3, + "grad_norm": 0.349609375, + "learning_rate": 0.00017662665510309768, + "loss": 0.9522, + "step": 2020 + }, + { + "epoch": 0.3, + "grad_norm": 0.341796875, + "learning_rate": 0.00017645985125401884, + "loss": 0.9619, + "step": 2025 + }, + { + "epoch": 0.3, + "grad_norm": 0.341796875, + "learning_rate": 0.00017629253368347622, + "loss": 0.9485, + "step": 2030 + }, + { + "epoch": 0.3, + "grad_norm": 0.337890625, + "learning_rate": 0.0001761247035156497, + "loss": 0.9487, + "step": 2035 + }, + { + "epoch": 0.3, + "grad_norm": 0.337890625, + "learning_rate": 0.00017595636187816318, + "loss": 0.9368, + "step": 2040 + }, + { + "epoch": 0.3, + "grad_norm": 0.337890625, + "learning_rate": 0.00017578750990207716, + "loss": 0.923, + "step": 2045 + }, + { + "epoch": 0.3, + "grad_norm": 0.337890625, + "learning_rate": 0.00017561814872188092, + "loss": 0.9878, + "step": 2050 + }, + { + "epoch": 0.31, + "grad_norm": 0.34375, + "learning_rate": 0.00017544827947548502, + "loss": 1.0012, + "step": 2055 + }, + { + "epoch": 0.31, + "grad_norm": 0.337890625, + "learning_rate": 0.0001752779033042137, + "loss": 1.0007, + "step": 2060 + }, + { + "epoch": 0.31, + "grad_norm": 0.341796875, + "learning_rate": 0.00017510702135279708, + "loss": 0.9638, + "step": 2065 + }, + { + "epoch": 0.31, + "grad_norm": 0.33203125, + "learning_rate": 0.00017493563476936351, + "loss": 0.968, + "step": 2070 + }, + { + "epoch": 0.31, + "grad_norm": 0.345703125, + "learning_rate": 0.00017476374470543199, + "loss": 0.9327, + "step": 2075 + }, + { + "epoch": 0.31, + "grad_norm": 0.33984375, + "learning_rate": 0.00017459135231590424, + "loss": 0.958, + "step": 2080 + }, + { + "epoch": 0.31, + "grad_norm": 0.333984375, + "learning_rate": 0.00017441845875905702, + "loss": 0.9515, + "step": 2085 + }, + { + "epoch": 0.31, + "grad_norm": 0.34765625, + "learning_rate": 0.00017424506519653438, + "loss": 0.9442, + "step": 2090 + }, + { + "epoch": 0.31, + "grad_norm": 0.337890625, + "learning_rate": 0.00017407117279333987, + "loss": 0.942, + "step": 2095 + }, + { + "epoch": 0.31, + "grad_norm": 0.34765625, + "learning_rate": 0.0001738967827178286, + "loss": 0.9547, + "step": 2100 + }, + { + "epoch": 0.31, + "grad_norm": 0.333984375, + "learning_rate": 0.00017372189614169947, + "loss": 0.966, + "step": 2105 + }, + { + "epoch": 0.31, + "grad_norm": 0.349609375, + "learning_rate": 0.00017354651423998733, + "loss": 0.9754, + "step": 2110 + }, + { + "epoch": 0.31, + "grad_norm": 0.341796875, + "learning_rate": 0.00017337063819105496, + "loss": 0.928, + "step": 2115 + }, + { + "epoch": 0.31, + "grad_norm": 0.337890625, + "learning_rate": 0.00017319426917658537, + "loss": 0.9721, + "step": 2120 + }, + { + "epoch": 0.32, + "grad_norm": 0.3515625, + "learning_rate": 0.00017301740838157362, + "loss": 0.9716, + "step": 2125 + }, + { + "epoch": 0.32, + "grad_norm": 0.34375, + "learning_rate": 0.00017284005699431896, + "loss": 0.9606, + "step": 2130 + }, + { + "epoch": 0.32, + "grad_norm": 0.341796875, + "learning_rate": 0.000172662216206417, + "loss": 0.9619, + "step": 2135 + }, + { + "epoch": 0.32, + "grad_norm": 0.3359375, + "learning_rate": 0.00017248388721275129, + "loss": 0.9236, + "step": 2140 + }, + { + "epoch": 0.32, + "grad_norm": 0.333984375, + "learning_rate": 0.00017230507121148575, + "loss": 0.9592, + "step": 2145 + }, + { + "epoch": 0.32, + "grad_norm": 0.3359375, + "learning_rate": 0.00017212576940405647, + "loss": 0.9507, + "step": 2150 + }, + { + "epoch": 0.32, + "grad_norm": 0.369140625, + "learning_rate": 0.00017194598299516338, + "loss": 0.9622, + "step": 2155 + }, + { + "epoch": 0.32, + "grad_norm": 0.349609375, + "learning_rate": 0.00017176571319276257, + "loss": 0.9527, + "step": 2160 + }, + { + "epoch": 0.32, + "grad_norm": 0.330078125, + "learning_rate": 0.00017158496120805788, + "loss": 0.9426, + "step": 2165 + }, + { + "epoch": 0.32, + "grad_norm": 0.349609375, + "learning_rate": 0.00017140372825549284, + "loss": 0.9517, + "step": 2170 + }, + { + "epoch": 0.32, + "grad_norm": 0.33984375, + "learning_rate": 0.00017122201555274261, + "loss": 0.9285, + "step": 2175 + }, + { + "epoch": 0.32, + "grad_norm": 0.33984375, + "learning_rate": 0.00017103982432070563, + "loss": 0.9671, + "step": 2180 + }, + { + "epoch": 0.32, + "grad_norm": 0.34375, + "learning_rate": 0.00017085715578349557, + "loss": 0.9375, + "step": 2185 + }, + { + "epoch": 0.33, + "grad_norm": 0.333984375, + "learning_rate": 0.00017067401116843296, + "loss": 0.954, + "step": 2190 + }, + { + "epoch": 0.33, + "grad_norm": 0.345703125, + "learning_rate": 0.0001704903917060371, + "loss": 0.9486, + "step": 2195 + }, + { + "epoch": 0.33, + "grad_norm": 0.337890625, + "learning_rate": 0.00017030629863001764, + "loss": 0.9823, + "step": 2200 + }, + { + "epoch": 0.33, + "grad_norm": 0.330078125, + "learning_rate": 0.0001701217331772664, + "loss": 0.9401, + "step": 2205 + }, + { + "epoch": 0.33, + "grad_norm": 0.34375, + "learning_rate": 0.00016993669658784904, + "loss": 0.9571, + "step": 2210 + }, + { + "epoch": 0.33, + "grad_norm": 0.328125, + "learning_rate": 0.0001697511901049967, + "loss": 0.9801, + "step": 2215 + }, + { + "epoch": 0.33, + "grad_norm": 0.3359375, + "learning_rate": 0.00016956521497509764, + "loss": 0.943, + "step": 2220 + }, + { + "epoch": 0.33, + "grad_norm": 0.34375, + "learning_rate": 0.0001693787724476889, + "loss": 0.973, + "step": 2225 + }, + { + "epoch": 0.33, + "grad_norm": 0.345703125, + "learning_rate": 0.00016919186377544788, + "loss": 0.9662, + "step": 2230 + }, + { + "epoch": 0.33, + "grad_norm": 0.353515625, + "learning_rate": 0.00016900449021418394, + "loss": 0.9618, + "step": 2235 + }, + { + "epoch": 0.33, + "grad_norm": 0.33984375, + "learning_rate": 0.00016881665302282995, + "loss": 0.9454, + "step": 2240 + }, + { + "epoch": 0.33, + "grad_norm": 0.3359375, + "learning_rate": 0.00016862835346343385, + "loss": 0.9414, + "step": 2245 + }, + { + "epoch": 0.33, + "grad_norm": 0.349609375, + "learning_rate": 0.00016843959280115015, + "loss": 0.9437, + "step": 2250 + }, + { + "epoch": 0.33, + "grad_norm": 0.353515625, + "learning_rate": 0.00016825037230423139, + "loss": 0.9761, + "step": 2255 + }, + { + "epoch": 0.34, + "grad_norm": 0.345703125, + "learning_rate": 0.00016806069324401977, + "loss": 0.9458, + "step": 2260 + }, + { + "epoch": 0.34, + "grad_norm": 0.33203125, + "learning_rate": 0.00016787055689493837, + "loss": 0.9676, + "step": 2265 + }, + { + "epoch": 0.34, + "grad_norm": 0.341796875, + "learning_rate": 0.00016767996453448283, + "loss": 0.9729, + "step": 2270 + }, + { + "epoch": 0.34, + "grad_norm": 0.33984375, + "learning_rate": 0.00016748891744321263, + "loss": 0.9613, + "step": 2275 + }, + { + "epoch": 0.34, + "grad_norm": 0.34765625, + "learning_rate": 0.0001672974169047425, + "loss": 0.9453, + "step": 2280 + }, + { + "epoch": 0.34, + "grad_norm": 0.33203125, + "learning_rate": 0.00016710546420573377, + "loss": 0.9538, + "step": 2285 + }, + { + "epoch": 0.34, + "grad_norm": 0.349609375, + "learning_rate": 0.00016691306063588583, + "loss": 0.9484, + "step": 2290 + }, + { + "epoch": 0.34, + "grad_norm": 0.3359375, + "learning_rate": 0.0001667202074879274, + "loss": 0.9501, + "step": 2295 + }, + { + "epoch": 0.34, + "grad_norm": 0.3515625, + "learning_rate": 0.00016652690605760775, + "loss": 0.9739, + "step": 2300 + }, + { + "epoch": 0.34, + "grad_norm": 0.333984375, + "learning_rate": 0.00016633315764368818, + "loss": 0.9428, + "step": 2305 + }, + { + "epoch": 0.34, + "grad_norm": 0.341796875, + "learning_rate": 0.0001661389635479332, + "loss": 0.938, + "step": 2310 + }, + { + "epoch": 0.34, + "grad_norm": 0.35546875, + "learning_rate": 0.00016594432507510175, + "loss": 0.9647, + "step": 2315 + }, + { + "epoch": 0.34, + "grad_norm": 0.333984375, + "learning_rate": 0.00016574924353293845, + "loss": 0.9413, + "step": 2320 + }, + { + "epoch": 0.35, + "grad_norm": 0.3515625, + "learning_rate": 0.0001655537202321649, + "loss": 0.9443, + "step": 2325 + }, + { + "epoch": 0.35, + "grad_norm": 0.341796875, + "learning_rate": 0.00016535775648647075, + "loss": 0.9407, + "step": 2330 + }, + { + "epoch": 0.35, + "grad_norm": 0.337890625, + "learning_rate": 0.0001651613536125049, + "loss": 0.9803, + "step": 2335 + }, + { + "epoch": 0.35, + "grad_norm": 0.34375, + "learning_rate": 0.0001649645129298668, + "loss": 0.9349, + "step": 2340 + }, + { + "epoch": 0.35, + "grad_norm": 0.328125, + "learning_rate": 0.0001647672357610973, + "loss": 0.9581, + "step": 2345 + }, + { + "epoch": 0.35, + "grad_norm": 0.33984375, + "learning_rate": 0.00016456952343167007, + "loss": 0.9243, + "step": 2350 + }, + { + "epoch": 0.35, + "grad_norm": 0.34765625, + "learning_rate": 0.00016437137726998255, + "loss": 0.9591, + "step": 2355 + }, + { + "epoch": 0.35, + "grad_norm": 0.34375, + "learning_rate": 0.00016417279860734692, + "loss": 0.9286, + "step": 2360 + }, + { + "epoch": 0.35, + "grad_norm": 0.345703125, + "learning_rate": 0.00016397378877798134, + "loss": 0.9695, + "step": 2365 + }, + { + "epoch": 0.35, + "grad_norm": 0.3515625, + "learning_rate": 0.0001637743491190009, + "loss": 0.9696, + "step": 2370 + }, + { + "epoch": 0.35, + "grad_norm": 0.3359375, + "learning_rate": 0.00016357448097040867, + "loss": 0.9465, + "step": 2375 + }, + { + "epoch": 0.35, + "grad_norm": 0.333984375, + "learning_rate": 0.00016337418567508665, + "loss": 0.9405, + "step": 2380 + }, + { + "epoch": 0.35, + "grad_norm": 0.337890625, + "learning_rate": 0.00016317346457878675, + "loss": 0.9151, + "step": 2385 + }, + { + "epoch": 0.35, + "grad_norm": 0.361328125, + "learning_rate": 0.0001629723190301218, + "loss": 0.9319, + "step": 2390 + }, + { + "epoch": 0.36, + "grad_norm": 0.345703125, + "learning_rate": 0.00016277075038055634, + "loss": 0.956, + "step": 2395 + }, + { + "epoch": 0.36, + "grad_norm": 0.3359375, + "learning_rate": 0.00016256875998439796, + "loss": 0.9475, + "step": 2400 + }, + { + "epoch": 0.36, + "grad_norm": 0.3359375, + "learning_rate": 0.00016236634919878756, + "loss": 0.9747, + "step": 2405 + }, + { + "epoch": 0.36, + "grad_norm": 0.3359375, + "learning_rate": 0.00016216351938369073, + "loss": 0.9405, + "step": 2410 + }, + { + "epoch": 0.36, + "grad_norm": 0.341796875, + "learning_rate": 0.00016196027190188848, + "loss": 0.9888, + "step": 2415 + }, + { + "epoch": 0.36, + "grad_norm": 0.34765625, + "learning_rate": 0.00016175660811896796, + "loss": 0.9063, + "step": 2420 + }, + { + "epoch": 0.36, + "grad_norm": 0.345703125, + "learning_rate": 0.00016155252940331342, + "loss": 0.9454, + "step": 2425 + }, + { + "epoch": 0.36, + "grad_norm": 0.341796875, + "learning_rate": 0.000161348037126097, + "loss": 0.9783, + "step": 2430 + }, + { + "epoch": 0.36, + "grad_norm": 0.34375, + "learning_rate": 0.0001611431326612695, + "loss": 0.9951, + "step": 2435 + }, + { + "epoch": 0.36, + "grad_norm": 0.33984375, + "learning_rate": 0.00016093781738555106, + "loss": 0.9194, + "step": 2440 + }, + { + "epoch": 0.36, + "grad_norm": 0.345703125, + "learning_rate": 0.0001607320926784221, + "loss": 0.9692, + "step": 2445 + }, + { + "epoch": 0.36, + "grad_norm": 0.349609375, + "learning_rate": 0.00016052595992211387, + "loss": 0.8929, + "step": 2450 + }, + { + "epoch": 0.36, + "grad_norm": 0.345703125, + "learning_rate": 0.0001603194205015993, + "loss": 0.9456, + "step": 2455 + }, + { + "epoch": 0.37, + "grad_norm": 0.337890625, + "learning_rate": 0.00016011247580458355, + "loss": 0.956, + "step": 2460 + }, + { + "epoch": 0.37, + "grad_norm": 0.341796875, + "learning_rate": 0.00015990512722149482, + "loss": 0.9113, + "step": 2465 + }, + { + "epoch": 0.37, + "grad_norm": 0.333984375, + "learning_rate": 0.00015969737614547494, + "loss": 0.9445, + "step": 2470 + }, + { + "epoch": 0.37, + "grad_norm": 0.357421875, + "learning_rate": 0.00015948922397237007, + "loss": 0.957, + "step": 2475 + }, + { + "epoch": 0.37, + "grad_norm": 0.349609375, + "learning_rate": 0.00015928067210072122, + "loss": 0.924, + "step": 2480 + }, + { + "epoch": 0.37, + "grad_norm": 0.35546875, + "learning_rate": 0.0001590717219317549, + "loss": 0.9558, + "step": 2485 + }, + { + "epoch": 0.37, + "grad_norm": 0.33984375, + "learning_rate": 0.00015886237486937378, + "loss": 0.9632, + "step": 2490 + }, + { + "epoch": 0.37, + "grad_norm": 0.375, + "learning_rate": 0.00015865263232014715, + "loss": 0.9477, + "step": 2495 + }, + { + "epoch": 0.37, + "grad_norm": 0.341796875, + "learning_rate": 0.0001584424956933015, + "loss": 0.9443, + "step": 2500 + }, + { + "epoch": 0.37, + "grad_norm": 0.333984375, + "learning_rate": 0.0001582319664007111, + "loss": 0.9316, + "step": 2505 + }, + { + "epoch": 0.37, + "grad_norm": 0.3359375, + "learning_rate": 0.00015802104585688851, + "loss": 0.9205, + "step": 2510 + }, + { + "epoch": 0.37, + "grad_norm": 0.333984375, + "learning_rate": 0.00015780973547897494, + "loss": 0.924, + "step": 2515 + }, + { + "epoch": 0.37, + "grad_norm": 0.333984375, + "learning_rate": 0.0001575980366867309, + "loss": 0.9498, + "step": 2520 + }, + { + "epoch": 0.37, + "grad_norm": 0.34765625, + "learning_rate": 0.00015738595090252657, + "loss": 0.9677, + "step": 2525 + }, + { + "epoch": 0.38, + "grad_norm": 0.353515625, + "learning_rate": 0.00015717347955133233, + "loss": 0.9786, + "step": 2530 + }, + { + "epoch": 0.38, + "grad_norm": 0.345703125, + "learning_rate": 0.00015696062406070902, + "loss": 0.9523, + "step": 2535 + }, + { + "epoch": 0.38, + "grad_norm": 0.3359375, + "learning_rate": 0.00015674738586079852, + "loss": 0.9496, + "step": 2540 + }, + { + "epoch": 0.38, + "grad_norm": 0.349609375, + "learning_rate": 0.0001565337663843141, + "loss": 0.9439, + "step": 2545 + }, + { + "epoch": 0.38, + "grad_norm": 0.33203125, + "learning_rate": 0.00015631976706653063, + "loss": 0.9496, + "step": 2550 + }, + { + "epoch": 0.38, + "grad_norm": 0.330078125, + "learning_rate": 0.00015610538934527526, + "loss": 0.9315, + "step": 2555 + }, + { + "epoch": 0.38, + "grad_norm": 0.345703125, + "learning_rate": 0.00015589063466091743, + "loss": 0.9595, + "step": 2560 + }, + { + "epoch": 0.38, + "grad_norm": 0.333984375, + "learning_rate": 0.0001556755044563594, + "loss": 0.9524, + "step": 2565 + }, + { + "epoch": 0.38, + "grad_norm": 0.341796875, + "learning_rate": 0.00015546000017702648, + "loss": 0.9442, + "step": 2570 + }, + { + "epoch": 0.38, + "grad_norm": 0.357421875, + "learning_rate": 0.00015524412327085725, + "loss": 0.9807, + "step": 2575 + }, + { + "epoch": 0.38, + "grad_norm": 0.318359375, + "learning_rate": 0.00015502787518829406, + "loss": 0.9385, + "step": 2580 + }, + { + "epoch": 0.38, + "grad_norm": 0.328125, + "learning_rate": 0.00015481125738227305, + "loss": 0.9618, + "step": 2585 + }, + { + "epoch": 0.38, + "grad_norm": 0.3515625, + "learning_rate": 0.00015459427130821442, + "loss": 0.9443, + "step": 2590 + }, + { + "epoch": 0.39, + "grad_norm": 0.35546875, + "learning_rate": 0.0001543769184240128, + "loss": 0.9428, + "step": 2595 + }, + { + "epoch": 0.39, + "grad_norm": 0.361328125, + "learning_rate": 0.00015415920019002736, + "loss": 0.9349, + "step": 2600 + }, + { + "epoch": 0.39, + "grad_norm": 0.3359375, + "learning_rate": 0.00015394111806907188, + "loss": 0.9325, + "step": 2605 + }, + { + "epoch": 0.39, + "grad_norm": 0.345703125, + "learning_rate": 0.00015372267352640513, + "loss": 0.9622, + "step": 2610 + }, + { + "epoch": 0.39, + "grad_norm": 0.34765625, + "learning_rate": 0.00015350386802972097, + "loss": 0.934, + "step": 2615 + }, + { + "epoch": 0.39, + "grad_norm": 0.349609375, + "learning_rate": 0.00015328470304913833, + "loss": 0.9729, + "step": 2620 + }, + { + "epoch": 0.39, + "grad_norm": 0.330078125, + "learning_rate": 0.00015306518005719157, + "loss": 0.9473, + "step": 2625 + }, + { + "epoch": 0.39, + "grad_norm": 0.3359375, + "learning_rate": 0.00015284530052882045, + "loss": 0.9268, + "step": 2630 + }, + { + "epoch": 0.39, + "grad_norm": 0.34765625, + "learning_rate": 0.00015262506594136016, + "loss": 0.9737, + "step": 2635 + }, + { + "epoch": 0.39, + "grad_norm": 0.34375, + "learning_rate": 0.00015240447777453153, + "loss": 0.9231, + "step": 2640 + }, + { + "epoch": 0.39, + "grad_norm": 0.345703125, + "learning_rate": 0.00015218353751043107, + "loss": 0.9359, + "step": 2645 + }, + { + "epoch": 0.39, + "grad_norm": 0.353515625, + "learning_rate": 0.00015196224663352093, + "loss": 0.9394, + "step": 2650 + }, + { + "epoch": 0.39, + "grad_norm": 0.3671875, + "learning_rate": 0.00015174060663061898, + "loss": 0.9556, + "step": 2655 + }, + { + "epoch": 0.39, + "grad_norm": 0.3359375, + "learning_rate": 0.00015151861899088877, + "loss": 0.9345, + "step": 2660 + }, + { + "epoch": 0.4, + "grad_norm": 0.357421875, + "learning_rate": 0.0001512962852058297, + "loss": 0.9363, + "step": 2665 + }, + { + "epoch": 0.4, + "grad_norm": 0.357421875, + "learning_rate": 0.00015107360676926666, + "loss": 0.9575, + "step": 2670 + }, + { + "epoch": 0.4, + "grad_norm": 0.33984375, + "learning_rate": 0.00015085058517734043, + "loss": 0.9824, + "step": 2675 + }, + { + "epoch": 0.4, + "grad_norm": 0.341796875, + "learning_rate": 0.0001506272219284972, + "loss": 0.95, + "step": 2680 + }, + { + "epoch": 0.4, + "grad_norm": 0.33984375, + "learning_rate": 0.00015040351852347878, + "loss": 0.9348, + "step": 2685 + }, + { + "epoch": 0.4, + "grad_norm": 0.34765625, + "learning_rate": 0.0001501794764653124, + "loss": 0.9585, + "step": 2690 + }, + { + "epoch": 0.4, + "grad_norm": 0.345703125, + "learning_rate": 0.00014995509725930078, + "loss": 0.9342, + "step": 2695 + }, + { + "epoch": 0.4, + "grad_norm": 0.34375, + "learning_rate": 0.0001497303824130117, + "loss": 0.9266, + "step": 2700 + }, + { + "epoch": 0.4, + "grad_norm": 0.33984375, + "learning_rate": 0.00014950533343626812, + "loss": 0.9552, + "step": 2705 + }, + { + "epoch": 0.4, + "grad_norm": 0.33984375, + "learning_rate": 0.000149279951841138, + "loss": 0.9514, + "step": 2710 + }, + { + "epoch": 0.4, + "grad_norm": 0.341796875, + "learning_rate": 0.00014905423914192412, + "loss": 0.9697, + "step": 2715 + }, + { + "epoch": 0.4, + "grad_norm": 0.34375, + "learning_rate": 0.0001488281968551538, + "loss": 0.9494, + "step": 2720 + }, + { + "epoch": 0.4, + "grad_norm": 0.357421875, + "learning_rate": 0.00014860182649956892, + "loss": 0.9667, + "step": 2725 + }, + { + "epoch": 0.41, + "grad_norm": 0.35546875, + "learning_rate": 0.0001483751295961156, + "loss": 0.9504, + "step": 2730 + }, + { + "epoch": 0.41, + "grad_norm": 0.3515625, + "learning_rate": 0.0001481481076679338, + "loss": 0.953, + "step": 2735 + }, + { + "epoch": 0.41, + "grad_norm": 0.34375, + "learning_rate": 0.00014792076224034753, + "loss": 0.9548, + "step": 2740 + }, + { + "epoch": 0.41, + "grad_norm": 0.34375, + "learning_rate": 0.00014769309484085412, + "loss": 0.9632, + "step": 2745 + }, + { + "epoch": 0.41, + "grad_norm": 0.33984375, + "learning_rate": 0.00014746510699911432, + "loss": 0.9619, + "step": 2750 + }, + { + "epoch": 0.41, + "grad_norm": 0.34375, + "learning_rate": 0.00014723680024694184, + "loss": 0.9155, + "step": 2755 + }, + { + "epoch": 0.41, + "grad_norm": 0.353515625, + "learning_rate": 0.00014700817611829308, + "loss": 0.9234, + "step": 2760 + }, + { + "epoch": 0.41, + "grad_norm": 0.337890625, + "learning_rate": 0.00014677923614925685, + "loss": 0.934, + "step": 2765 + }, + { + "epoch": 0.41, + "grad_norm": 0.3359375, + "learning_rate": 0.000146549981878044, + "loss": 0.9354, + "step": 2770 + }, + { + "epoch": 0.41, + "grad_norm": 0.333984375, + "learning_rate": 0.00014632041484497727, + "loss": 0.9554, + "step": 2775 + }, + { + "epoch": 0.41, + "grad_norm": 0.353515625, + "learning_rate": 0.00014609053659248058, + "loss": 0.9443, + "step": 2780 + }, + { + "epoch": 0.41, + "grad_norm": 0.341796875, + "learning_rate": 0.00014586034866506906, + "loss": 0.9584, + "step": 2785 + }, + { + "epoch": 0.41, + "grad_norm": 0.34375, + "learning_rate": 0.00014562985260933845, + "loss": 0.9035, + "step": 2790 + }, + { + "epoch": 0.42, + "grad_norm": 0.34765625, + "learning_rate": 0.00014539904997395468, + "loss": 0.9215, + "step": 2795 + }, + { + "epoch": 0.42, + "grad_norm": 0.341796875, + "learning_rate": 0.00014516794230964365, + "loss": 0.9279, + "step": 2800 + }, + { + "epoch": 0.42, + "grad_norm": 0.349609375, + "learning_rate": 0.00014493653116918066, + "loss": 0.9395, + "step": 2805 + }, + { + "epoch": 0.42, + "grad_norm": 0.341796875, + "learning_rate": 0.0001447048181073799, + "loss": 0.9314, + "step": 2810 + }, + { + "epoch": 0.42, + "grad_norm": 0.3359375, + "learning_rate": 0.00014447280468108436, + "loss": 0.9497, + "step": 2815 + }, + { + "epoch": 0.42, + "grad_norm": 0.345703125, + "learning_rate": 0.00014424049244915493, + "loss": 0.9324, + "step": 2820 + }, + { + "epoch": 0.42, + "grad_norm": 0.3515625, + "learning_rate": 0.00014400788297246024, + "loss": 0.9351, + "step": 2825 + }, + { + "epoch": 0.42, + "grad_norm": 0.345703125, + "learning_rate": 0.0001437749778138659, + "loss": 0.9482, + "step": 2830 + }, + { + "epoch": 0.42, + "grad_norm": 0.34375, + "learning_rate": 0.00014354177853822443, + "loss": 0.938, + "step": 2835 + }, + { + "epoch": 0.42, + "grad_norm": 0.326171875, + "learning_rate": 0.00014330828671236425, + "loss": 0.9478, + "step": 2840 + }, + { + "epoch": 0.42, + "grad_norm": 0.33984375, + "learning_rate": 0.0001430745039050794, + "loss": 0.9596, + "step": 2845 + }, + { + "epoch": 0.42, + "grad_norm": 0.34765625, + "learning_rate": 0.00014284043168711906, + "loss": 0.9186, + "step": 2850 + }, + { + "epoch": 0.42, + "grad_norm": 0.337890625, + "learning_rate": 0.00014260607163117694, + "loss": 0.9631, + "step": 2855 + }, + { + "epoch": 0.42, + "grad_norm": 0.341796875, + "learning_rate": 0.00014237142531188055, + "loss": 0.9411, + "step": 2860 + }, + { + "epoch": 0.43, + "grad_norm": 0.357421875, + "learning_rate": 0.00014213649430578083, + "loss": 0.9558, + "step": 2865 + }, + { + "epoch": 0.43, + "grad_norm": 0.341796875, + "learning_rate": 0.00014190128019134153, + "loss": 0.9684, + "step": 2870 + }, + { + "epoch": 0.43, + "grad_norm": 0.33984375, + "learning_rate": 0.00014166578454892853, + "loss": 0.9226, + "step": 2875 + }, + { + "epoch": 0.43, + "grad_norm": 0.341796875, + "learning_rate": 0.00014143000896079918, + "loss": 0.9103, + "step": 2880 + }, + { + "epoch": 0.43, + "grad_norm": 0.337890625, + "learning_rate": 0.00014119395501109182, + "loss": 0.9262, + "step": 2885 + }, + { + "epoch": 0.43, + "grad_norm": 0.349609375, + "learning_rate": 0.00014095762428581506, + "loss": 0.9444, + "step": 2890 + }, + { + "epoch": 0.43, + "grad_norm": 0.349609375, + "learning_rate": 0.0001407210183728371, + "loss": 0.9438, + "step": 2895 + }, + { + "epoch": 0.43, + "grad_norm": 0.357421875, + "learning_rate": 0.00014048413886187503, + "loss": 0.9699, + "step": 2900 + }, + { + "epoch": 0.43, + "grad_norm": 0.33203125, + "learning_rate": 0.00014024698734448431, + "loss": 0.9255, + "step": 2905 + }, + { + "epoch": 0.43, + "grad_norm": 0.361328125, + "learning_rate": 0.00014000956541404785, + "loss": 0.9733, + "step": 2910 + }, + { + "epoch": 0.43, + "grad_norm": 0.353515625, + "learning_rate": 0.0001397718746657655, + "loss": 0.9554, + "step": 2915 + }, + { + "epoch": 0.43, + "grad_norm": 0.34765625, + "learning_rate": 0.0001395339166966433, + "loss": 0.9273, + "step": 2920 + }, + { + "epoch": 0.43, + "grad_norm": 0.34375, + "learning_rate": 0.0001392956931054825, + "loss": 0.9364, + "step": 2925 + }, + { + "epoch": 0.44, + "grad_norm": 0.3359375, + "learning_rate": 0.00013905720549286932, + "loss": 0.9433, + "step": 2930 + }, + { + "epoch": 0.44, + "grad_norm": 0.341796875, + "learning_rate": 0.0001388184554611636, + "loss": 0.9288, + "step": 2935 + }, + { + "epoch": 0.44, + "grad_norm": 0.34375, + "learning_rate": 0.0001385794446144885, + "loss": 0.9338, + "step": 2940 + }, + { + "epoch": 0.44, + "grad_norm": 0.34765625, + "learning_rate": 0.0001383401745587196, + "loss": 1.0093, + "step": 2945 + }, + { + "epoch": 0.44, + "grad_norm": 0.337890625, + "learning_rate": 0.00013810064690147387, + "loss": 0.924, + "step": 2950 + }, + { + "epoch": 0.44, + "grad_norm": 0.345703125, + "learning_rate": 0.0001378608632520993, + "loss": 0.9731, + "step": 2955 + }, + { + "epoch": 0.44, + "grad_norm": 0.341796875, + "learning_rate": 0.00013762082522166363, + "loss": 0.9236, + "step": 2960 + }, + { + "epoch": 0.44, + "grad_norm": 0.33984375, + "learning_rate": 0.0001373805344229439, + "loss": 0.9378, + "step": 2965 + }, + { + "epoch": 0.44, + "grad_norm": 0.349609375, + "learning_rate": 0.00013713999247041533, + "loss": 0.929, + "step": 2970 + }, + { + "epoch": 0.44, + "grad_norm": 0.353515625, + "learning_rate": 0.00013689920098024078, + "loss": 0.9338, + "step": 2975 + }, + { + "epoch": 0.44, + "grad_norm": 0.34765625, + "learning_rate": 0.0001366581615702596, + "loss": 0.9437, + "step": 2980 + }, + { + "epoch": 0.44, + "grad_norm": 0.345703125, + "learning_rate": 0.00013641687585997677, + "loss": 0.9331, + "step": 2985 + }, + { + "epoch": 0.44, + "grad_norm": 0.34765625, + "learning_rate": 0.00013617534547055236, + "loss": 0.9655, + "step": 2990 + }, + { + "epoch": 0.44, + "grad_norm": 0.341796875, + "learning_rate": 0.0001359335720247902, + "loss": 0.9384, + "step": 2995 + }, + { + "epoch": 0.45, + "grad_norm": 0.3515625, + "learning_rate": 0.0001356915571471273, + "loss": 0.9497, + "step": 3000 + }, + { + "epoch": 0.45, + "grad_norm": 0.353515625, + "learning_rate": 0.0001354493024636227, + "loss": 0.934, + "step": 3005 + }, + { + "epoch": 0.45, + "grad_norm": 0.345703125, + "learning_rate": 0.0001352068096019468, + "loss": 0.9272, + "step": 3010 + }, + { + "epoch": 0.45, + "grad_norm": 0.341796875, + "learning_rate": 0.00013496408019137018, + "loss": 0.9428, + "step": 3015 + }, + { + "epoch": 0.45, + "grad_norm": 0.33203125, + "learning_rate": 0.00013472111586275274, + "loss": 0.9461, + "step": 3020 + }, + { + "epoch": 0.45, + "grad_norm": 0.341796875, + "learning_rate": 0.0001344779182485328, + "loss": 0.9403, + "step": 3025 + }, + { + "epoch": 0.45, + "grad_norm": 0.34375, + "learning_rate": 0.0001342344889827161, + "loss": 0.9324, + "step": 3030 + }, + { + "epoch": 0.45, + "grad_norm": 0.345703125, + "learning_rate": 0.0001339908297008648, + "loss": 0.9427, + "step": 3035 + }, + { + "epoch": 0.45, + "grad_norm": 0.345703125, + "learning_rate": 0.00013374694204008647, + "loss": 0.9369, + "step": 3040 + }, + { + "epoch": 0.45, + "grad_norm": 0.33984375, + "learning_rate": 0.00013350282763902315, + "loss": 0.9251, + "step": 3045 + }, + { + "epoch": 0.45, + "grad_norm": 0.33984375, + "learning_rate": 0.0001332584881378403, + "loss": 0.9365, + "step": 3050 + }, + { + "epoch": 0.45, + "grad_norm": 0.345703125, + "learning_rate": 0.00013301392517821577, + "loss": 0.9184, + "step": 3055 + }, + { + "epoch": 0.45, + "grad_norm": 0.3359375, + "learning_rate": 0.00013276914040332889, + "loss": 0.9231, + "step": 3060 + }, + { + "epoch": 0.46, + "grad_norm": 0.349609375, + "learning_rate": 0.0001325241354578492, + "loss": 0.9217, + "step": 3065 + }, + { + "epoch": 0.46, + "grad_norm": 0.341796875, + "learning_rate": 0.0001322789119879256, + "loss": 0.9712, + "step": 3070 + }, + { + "epoch": 0.46, + "grad_norm": 0.3515625, + "learning_rate": 0.00013203347164117524, + "loss": 0.9239, + "step": 3075 + }, + { + "epoch": 0.46, + "grad_norm": 0.341796875, + "learning_rate": 0.00013178781606667234, + "loss": 0.9567, + "step": 3080 + }, + { + "epoch": 0.46, + "grad_norm": 0.34375, + "learning_rate": 0.00013154194691493732, + "loss": 0.9389, + "step": 3085 + }, + { + "epoch": 0.46, + "grad_norm": 0.337890625, + "learning_rate": 0.0001312958658379255, + "loss": 0.9429, + "step": 3090 + }, + { + "epoch": 0.46, + "grad_norm": 0.341796875, + "learning_rate": 0.00013104957448901614, + "loss": 0.9226, + "step": 3095 + }, + { + "epoch": 0.46, + "grad_norm": 0.34375, + "learning_rate": 0.00013080307452300127, + "loss": 0.9155, + "step": 3100 + }, + { + "epoch": 0.46, + "grad_norm": 0.34765625, + "learning_rate": 0.00013055636759607458, + "loss": 0.9516, + "step": 3105 + }, + { + "epoch": 0.46, + "grad_norm": 0.337890625, + "learning_rate": 0.00013030945536582025, + "loss": 0.9284, + "step": 3110 + }, + { + "epoch": 0.46, + "grad_norm": 0.33984375, + "learning_rate": 0.00013006233949120199, + "loss": 0.9113, + "step": 3115 + }, + { + "epoch": 0.46, + "grad_norm": 0.341796875, + "learning_rate": 0.00012981502163255166, + "loss": 0.9471, + "step": 3120 + }, + { + "epoch": 0.46, + "grad_norm": 0.34765625, + "learning_rate": 0.0001295675034515582, + "loss": 0.9416, + "step": 3125 + }, + { + "epoch": 0.46, + "grad_norm": 0.359375, + "learning_rate": 0.00012931978661125655, + "loss": 0.9395, + "step": 3130 + }, + { + "epoch": 0.47, + "grad_norm": 0.361328125, + "learning_rate": 0.00012907187277601641, + "loss": 0.9367, + "step": 3135 + }, + { + "epoch": 0.47, + "grad_norm": 0.35546875, + "learning_rate": 0.00012882376361153102, + "loss": 0.923, + "step": 3140 + }, + { + "epoch": 0.47, + "grad_norm": 0.3515625, + "learning_rate": 0.00012857546078480598, + "loss": 0.9115, + "step": 3145 + }, + { + "epoch": 0.47, + "grad_norm": 0.345703125, + "learning_rate": 0.00012832696596414817, + "loss": 0.9182, + "step": 3150 + }, + { + "epoch": 0.47, + "grad_norm": 0.3359375, + "learning_rate": 0.00012807828081915436, + "loss": 0.913, + "step": 3155 + }, + { + "epoch": 0.47, + "grad_norm": 0.34765625, + "learning_rate": 0.0001278294070207001, + "loss": 0.9438, + "step": 3160 + }, + { + "epoch": 0.47, + "grad_norm": 0.35546875, + "learning_rate": 0.0001275803462409285, + "loss": 0.9487, + "step": 3165 + }, + { + "epoch": 0.47, + "grad_norm": 0.349609375, + "learning_rate": 0.00012733110015323898, + "loss": 0.9521, + "step": 3170 + }, + { + "epoch": 0.47, + "grad_norm": 0.333984375, + "learning_rate": 0.0001270816704322759, + "loss": 0.9146, + "step": 3175 + }, + { + "epoch": 0.47, + "grad_norm": 0.34765625, + "learning_rate": 0.00012683205875391754, + "loss": 0.9211, + "step": 3180 + }, + { + "epoch": 0.47, + "grad_norm": 0.3515625, + "learning_rate": 0.00012658226679526476, + "loss": 0.9297, + "step": 3185 + }, + { + "epoch": 0.47, + "grad_norm": 0.34375, + "learning_rate": 0.00012633229623462951, + "loss": 0.9186, + "step": 3190 + }, + { + "epoch": 0.47, + "grad_norm": 0.35546875, + "learning_rate": 0.00012608214875152392, + "loss": 0.9443, + "step": 3195 + }, + { + "epoch": 0.48, + "grad_norm": 0.341796875, + "learning_rate": 0.00012583182602664877, + "loss": 0.9399, + "step": 3200 + }, + { + "epoch": 0.48, + "grad_norm": 0.337890625, + "learning_rate": 0.00012558132974188223, + "loss": 0.9342, + "step": 3205 + }, + { + "epoch": 0.48, + "grad_norm": 0.330078125, + "learning_rate": 0.00012533066158026862, + "loss": 0.9088, + "step": 3210 + }, + { + "epoch": 0.48, + "grad_norm": 0.3515625, + "learning_rate": 0.00012507982322600703, + "loss": 0.9777, + "step": 3215 + }, + { + "epoch": 0.48, + "grad_norm": 0.33984375, + "learning_rate": 0.00012482881636444014, + "loss": 0.944, + "step": 3220 + }, + { + "epoch": 0.48, + "grad_norm": 0.33203125, + "learning_rate": 0.00012457764268204277, + "loss": 0.9307, + "step": 3225 + }, + { + "epoch": 0.48, + "grad_norm": 0.34375, + "learning_rate": 0.0001243263038664105, + "loss": 0.9396, + "step": 3230 + }, + { + "epoch": 0.48, + "grad_norm": 0.34375, + "learning_rate": 0.00012407480160624848, + "loss": 0.9138, + "step": 3235 + }, + { + "epoch": 0.48, + "grad_norm": 0.345703125, + "learning_rate": 0.0001238231375913601, + "loss": 0.9583, + "step": 3240 + }, + { + "epoch": 0.48, + "grad_norm": 0.3515625, + "learning_rate": 0.00012357131351263537, + "loss": 0.9799, + "step": 3245 + }, + { + "epoch": 0.48, + "grad_norm": 0.35546875, + "learning_rate": 0.00012331933106203986, + "loss": 0.9619, + "step": 3250 + }, + { + "epoch": 0.48, + "grad_norm": 0.349609375, + "learning_rate": 0.00012306719193260323, + "loss": 0.9188, + "step": 3255 + }, + { + "epoch": 0.48, + "grad_norm": 0.35546875, + "learning_rate": 0.00012281489781840781, + "loss": 0.9593, + "step": 3260 + }, + { + "epoch": 0.48, + "grad_norm": 0.349609375, + "learning_rate": 0.0001225624504145772, + "loss": 0.938, + "step": 3265 + }, + { + "epoch": 0.49, + "grad_norm": 0.349609375, + "learning_rate": 0.00012230985141726498, + "loss": 0.9237, + "step": 3270 + }, + { + "epoch": 0.49, + "grad_norm": 0.357421875, + "learning_rate": 0.00012205710252364329, + "loss": 0.9314, + "step": 3275 + }, + { + "epoch": 0.49, + "grad_norm": 0.34765625, + "learning_rate": 0.00012180420543189131, + "loss": 0.9093, + "step": 3280 + }, + { + "epoch": 0.49, + "grad_norm": 0.349609375, + "learning_rate": 0.00012155116184118402, + "loss": 0.9335, + "step": 3285 + }, + { + "epoch": 0.49, + "grad_norm": 0.341796875, + "learning_rate": 0.00012129797345168073, + "loss": 0.9643, + "step": 3290 + }, + { + "epoch": 0.49, + "grad_norm": 0.341796875, + "learning_rate": 0.00012104464196451353, + "loss": 0.9507, + "step": 3295 + }, + { + "epoch": 0.49, + "grad_norm": 0.34375, + "learning_rate": 0.00012079116908177593, + "loss": 0.9201, + "step": 3300 + }, + { + "epoch": 0.49, + "grad_norm": 0.33203125, + "learning_rate": 0.00012053755650651166, + "loss": 0.9375, + "step": 3305 + }, + { + "epoch": 0.49, + "grad_norm": 0.337890625, + "learning_rate": 0.00012028380594270283, + "loss": 0.9681, + "step": 3310 + }, + { + "epoch": 0.49, + "grad_norm": 0.34765625, + "learning_rate": 0.00012002991909525873, + "loss": 0.923, + "step": 3315 + }, + { + "epoch": 0.49, + "grad_norm": 0.341796875, + "learning_rate": 0.00011977589767000433, + "loss": 0.9143, + "step": 3320 + }, + { + "epoch": 0.49, + "grad_norm": 0.341796875, + "learning_rate": 0.00011952174337366881, + "loss": 0.9205, + "step": 3325 + }, + { + "epoch": 0.49, + "grad_norm": 0.349609375, + "learning_rate": 0.00011926745791387406, + "loss": 0.9113, + "step": 3330 + }, + { + "epoch": 0.5, + "grad_norm": 0.357421875, + "learning_rate": 0.00011901304299912326, + "loss": 0.9357, + "step": 3335 + }, + { + "epoch": 0.5, + "grad_norm": 0.349609375, + "learning_rate": 0.00011875850033878939, + "loss": 0.9552, + "step": 3340 + }, + { + "epoch": 0.5, + "grad_norm": 0.32421875, + "learning_rate": 0.00011850383164310371, + "loss": 0.9234, + "step": 3345 + }, + { + "epoch": 0.5, + "grad_norm": 0.33203125, + "learning_rate": 0.00011824903862314427, + "loss": 0.9085, + "step": 3350 + }, + { + "epoch": 0.5, + "grad_norm": 0.337890625, + "learning_rate": 0.00011799412299082448, + "loss": 0.9226, + "step": 3355 + }, + { + "epoch": 0.5, + "grad_norm": 0.341796875, + "learning_rate": 0.00011773908645888152, + "loss": 0.9107, + "step": 3360 + }, + { + "epoch": 0.5, + "grad_norm": 0.349609375, + "learning_rate": 0.00011748393074086497, + "loss": 0.9402, + "step": 3365 + }, + { + "epoch": 0.5, + "grad_norm": 0.341796875, + "learning_rate": 0.00011722865755112504, + "loss": 0.9419, + "step": 3370 + }, + { + "epoch": 0.5, + "grad_norm": 0.359375, + "learning_rate": 0.00011697326860480133, + "loss": 0.9146, + "step": 3375 + }, + { + "epoch": 0.5, + "grad_norm": 0.34375, + "learning_rate": 0.00011671776561781123, + "loss": 0.8996, + "step": 3380 + }, + { + "epoch": 0.5, + "grad_norm": 0.341796875, + "learning_rate": 0.00011646215030683818, + "loss": 0.9089, + "step": 3385 + }, + { + "epoch": 0.5, + "grad_norm": 0.34765625, + "learning_rate": 0.0001162064243893205, + "loss": 0.9257, + "step": 3390 + }, + { + "epoch": 0.5, + "grad_norm": 0.333984375, + "learning_rate": 0.00011595058958343952, + "loss": 0.924, + "step": 3395 + }, + { + "epoch": 0.5, + "grad_norm": 0.34375, + "learning_rate": 0.00011569464760810825, + "loss": 0.9558, + "step": 3400 + }, + { + "epoch": 0.51, + "grad_norm": 0.345703125, + "learning_rate": 0.00011543860018295966, + "loss": 0.9002, + "step": 3405 + }, + { + "epoch": 0.51, + "grad_norm": 0.34375, + "learning_rate": 0.00011518244902833537, + "loss": 0.9107, + "step": 3410 + }, + { + "epoch": 0.51, + "grad_norm": 0.34375, + "learning_rate": 0.00011492619586527385, + "loss": 0.9712, + "step": 3415 + }, + { + "epoch": 0.51, + "grad_norm": 0.357421875, + "learning_rate": 0.0001146698424154989, + "loss": 0.9171, + "step": 3420 + }, + { + "epoch": 0.51, + "grad_norm": 0.33984375, + "learning_rate": 0.00011441339040140824, + "loss": 0.9107, + "step": 3425 + }, + { + "epoch": 0.51, + "grad_norm": 0.337890625, + "learning_rate": 0.00011415684154606177, + "loss": 0.9396, + "step": 3430 + }, + { + "epoch": 0.51, + "grad_norm": 0.341796875, + "learning_rate": 0.00011390019757317003, + "loss": 0.9101, + "step": 3435 + }, + { + "epoch": 0.51, + "grad_norm": 0.345703125, + "learning_rate": 0.00011364346020708266, + "loss": 0.9324, + "step": 3440 + }, + { + "epoch": 0.51, + "grad_norm": 0.341796875, + "learning_rate": 0.00011338663117277686, + "loss": 0.933, + "step": 3445 + }, + { + "epoch": 0.51, + "grad_norm": 0.34765625, + "learning_rate": 0.00011312971219584563, + "loss": 0.9153, + "step": 3450 + }, + { + "epoch": 0.51, + "grad_norm": 0.345703125, + "learning_rate": 0.00011287270500248631, + "loss": 0.9401, + "step": 3455 + }, + { + "epoch": 0.51, + "grad_norm": 0.353515625, + "learning_rate": 0.00011261561131948897, + "loss": 0.9264, + "step": 3460 + }, + { + "epoch": 0.51, + "grad_norm": 0.34375, + "learning_rate": 0.00011235843287422482, + "loss": 0.92, + "step": 3465 + }, + { + "epoch": 0.52, + "grad_norm": 0.361328125, + "learning_rate": 0.00011210117139463452, + "loss": 0.9187, + "step": 3470 + }, + { + "epoch": 0.52, + "grad_norm": 0.33984375, + "learning_rate": 0.00011184382860921663, + "loss": 0.949, + "step": 3475 + }, + { + "epoch": 0.52, + "grad_norm": 0.341796875, + "learning_rate": 0.00011158640624701603, + "loss": 0.9201, + "step": 3480 + }, + { + "epoch": 0.52, + "grad_norm": 0.33984375, + "learning_rate": 0.00011132890603761221, + "loss": 0.9096, + "step": 3485 + }, + { + "epoch": 0.52, + "grad_norm": 0.353515625, + "learning_rate": 0.00011107132971110779, + "loss": 0.9248, + "step": 3490 + }, + { + "epoch": 0.52, + "grad_norm": 0.349609375, + "learning_rate": 0.00011081367899811668, + "loss": 0.9189, + "step": 3495 + }, + { + "epoch": 0.52, + "grad_norm": 0.33984375, + "learning_rate": 0.00011055595562975267, + "loss": 0.9182, + "step": 3500 + }, + { + "epoch": 0.52, + "grad_norm": 0.365234375, + "learning_rate": 0.00011029816133761772, + "loss": 0.9549, + "step": 3505 + }, + { + "epoch": 0.52, + "grad_norm": 0.341796875, + "learning_rate": 0.00011004029785379024, + "loss": 0.9158, + "step": 3510 + }, + { + "epoch": 0.52, + "grad_norm": 0.33203125, + "learning_rate": 0.00010978236691081365, + "loss": 0.9104, + "step": 3515 + }, + { + "epoch": 0.52, + "grad_norm": 0.3515625, + "learning_rate": 0.00010952437024168444, + "loss": 0.9305, + "step": 3520 + }, + { + "epoch": 0.52, + "grad_norm": 0.349609375, + "learning_rate": 0.00010926630957984087, + "loss": 0.9383, + "step": 3525 + }, + { + "epoch": 0.52, + "grad_norm": 0.34765625, + "learning_rate": 0.00010900818665915109, + "loss": 0.9124, + "step": 3530 + }, + { + "epoch": 0.52, + "grad_norm": 0.341796875, + "learning_rate": 0.00010875000321390154, + "loss": 0.9169, + "step": 3535 + }, + { + "epoch": 0.53, + "grad_norm": 0.34375, + "learning_rate": 0.00010849176097878535, + "loss": 0.9044, + "step": 3540 + }, + { + "epoch": 0.53, + "grad_norm": 0.359375, + "learning_rate": 0.00010823346168889062, + "loss": 0.9234, + "step": 3545 + }, + { + "epoch": 0.53, + "grad_norm": 0.34765625, + "learning_rate": 0.00010797510707968878, + "loss": 0.9431, + "step": 3550 + }, + { + "epoch": 0.53, + "grad_norm": 0.345703125, + "learning_rate": 0.00010771669888702303, + "loss": 0.9263, + "step": 3555 + }, + { + "epoch": 0.53, + "grad_norm": 0.34765625, + "learning_rate": 0.00010745823884709647, + "loss": 0.9423, + "step": 3560 + }, + { + "epoch": 0.53, + "grad_norm": 0.3515625, + "learning_rate": 0.00010719972869646062, + "loss": 0.9232, + "step": 3565 + }, + { + "epoch": 0.53, + "grad_norm": 0.35546875, + "learning_rate": 0.00010694117017200372, + "loss": 0.962, + "step": 3570 + }, + { + "epoch": 0.53, + "grad_norm": 0.33203125, + "learning_rate": 0.00010668256501093892, + "loss": 0.935, + "step": 3575 + }, + { + "epoch": 0.53, + "grad_norm": 0.35546875, + "learning_rate": 0.00010642391495079278, + "loss": 0.9212, + "step": 3580 + }, + { + "epoch": 0.53, + "grad_norm": 0.353515625, + "learning_rate": 0.00010616522172939356, + "loss": 0.9269, + "step": 3585 + }, + { + "epoch": 0.53, + "grad_norm": 0.349609375, + "learning_rate": 0.00010590648708485946, + "loss": 0.9182, + "step": 3590 + }, + { + "epoch": 0.53, + "grad_norm": 0.345703125, + "learning_rate": 0.000105647712755587, + "loss": 0.9437, + "step": 3595 + }, + { + "epoch": 0.53, + "grad_norm": 0.353515625, + "learning_rate": 0.00010538890048023937, + "loss": 0.9449, + "step": 3600 + }, + { + "epoch": 0.54, + "grad_norm": 0.353515625, + "learning_rate": 0.0001051300519977347, + "loss": 0.9023, + "step": 3605 + }, + { + "epoch": 0.54, + "grad_norm": 0.349609375, + "learning_rate": 0.00010487116904723433, + "loss": 0.9136, + "step": 3610 + }, + { + "epoch": 0.54, + "grad_norm": 0.33984375, + "learning_rate": 0.00010461225336813128, + "loss": 0.9317, + "step": 3615 + }, + { + "epoch": 0.54, + "grad_norm": 0.337890625, + "learning_rate": 0.00010435330670003842, + "loss": 0.8979, + "step": 3620 + }, + { + "epoch": 0.54, + "grad_norm": 0.34375, + "learning_rate": 0.00010409433078277684, + "loss": 0.9319, + "step": 3625 + }, + { + "epoch": 0.54, + "grad_norm": 0.34375, + "learning_rate": 0.00010383532735636411, + "loss": 0.9344, + "step": 3630 + }, + { + "epoch": 0.54, + "grad_norm": 0.34765625, + "learning_rate": 0.00010357629816100272, + "loss": 0.907, + "step": 3635 + }, + { + "epoch": 0.54, + "grad_norm": 0.345703125, + "learning_rate": 0.0001033172449370682, + "loss": 0.9222, + "step": 3640 + }, + { + "epoch": 0.54, + "grad_norm": 0.345703125, + "learning_rate": 0.00010305816942509761, + "loss": 0.9384, + "step": 3645 + }, + { + "epoch": 0.54, + "grad_norm": 0.353515625, + "learning_rate": 0.00010279907336577765, + "loss": 0.9195, + "step": 3650 + }, + { + "epoch": 0.54, + "grad_norm": 0.34765625, + "learning_rate": 0.00010253995849993321, + "loss": 0.9177, + "step": 3655 + }, + { + "epoch": 0.54, + "grad_norm": 0.353515625, + "learning_rate": 0.0001022808265685154, + "loss": 0.9058, + "step": 3660 + }, + { + "epoch": 0.54, + "grad_norm": 0.353515625, + "learning_rate": 0.0001020216793125901, + "loss": 0.9293, + "step": 3665 + }, + { + "epoch": 0.54, + "grad_norm": 0.34765625, + "learning_rate": 0.00010176251847332614, + "loss": 0.8824, + "step": 3670 + }, + { + "epoch": 0.55, + "grad_norm": 0.33984375, + "learning_rate": 0.00010150334579198353, + "loss": 0.9316, + "step": 3675 + }, + { + "epoch": 0.55, + "grad_norm": 0.3515625, + "learning_rate": 0.00010124416300990196, + "loss": 0.9351, + "step": 3680 + }, + { + "epoch": 0.55, + "grad_norm": 0.349609375, + "learning_rate": 0.00010098497186848888, + "loss": 0.9187, + "step": 3685 + }, + { + "epoch": 0.55, + "grad_norm": 0.349609375, + "learning_rate": 0.00010072577410920794, + "loss": 0.9019, + "step": 3690 + }, + { + "epoch": 0.55, + "grad_norm": 0.361328125, + "learning_rate": 0.00010046657147356733, + "loss": 0.9152, + "step": 3695 + }, + { + "epoch": 0.55, + "grad_norm": 0.34375, + "learning_rate": 0.00010020736570310789, + "loss": 0.904, + "step": 3700 + }, + { + "epoch": 0.55, + "grad_norm": 0.345703125, + "learning_rate": 9.99481585393916e-05, + "loss": 0.9462, + "step": 3705 + }, + { + "epoch": 0.55, + "grad_norm": 0.34765625, + "learning_rate": 9.968895172398974e-05, + "loss": 0.9428, + "step": 3710 + }, + { + "epoch": 0.55, + "grad_norm": 0.34375, + "learning_rate": 9.94297469984713e-05, + "loss": 0.9453, + "step": 3715 + }, + { + "epoch": 0.55, + "grad_norm": 0.34765625, + "learning_rate": 9.917054610439124e-05, + "loss": 0.9176, + "step": 3720 + }, + { + "epoch": 0.55, + "grad_norm": 0.345703125, + "learning_rate": 9.89113507832787e-05, + "loss": 0.951, + "step": 3725 + }, + { + "epoch": 0.55, + "grad_norm": 0.337890625, + "learning_rate": 9.865216277662545e-05, + "loss": 0.8904, + "step": 3730 + }, + { + "epoch": 0.55, + "grad_norm": 0.3515625, + "learning_rate": 9.83929838258741e-05, + "loss": 0.9197, + "step": 3735 + }, + { + "epoch": 0.56, + "grad_norm": 0.341796875, + "learning_rate": 9.813381567240639e-05, + "loss": 0.9342, + "step": 3740 + }, + { + "epoch": 0.56, + "grad_norm": 0.361328125, + "learning_rate": 9.787466005753152e-05, + "loss": 0.9713, + "step": 3745 + }, + { + "epoch": 0.56, + "grad_norm": 0.3359375, + "learning_rate": 9.761551872247449e-05, + "loss": 0.9556, + "step": 3750 + }, + { + "epoch": 0.56, + "grad_norm": 0.35546875, + "learning_rate": 9.735639340836428e-05, + "loss": 0.9125, + "step": 3755 + }, + { + "epoch": 0.56, + "grad_norm": 0.3515625, + "learning_rate": 9.709728585622229e-05, + "loss": 0.9716, + "step": 3760 + }, + { + "epoch": 0.56, + "grad_norm": 0.345703125, + "learning_rate": 9.68381978069506e-05, + "loss": 0.9105, + "step": 3765 + }, + { + "epoch": 0.56, + "grad_norm": 0.341796875, + "learning_rate": 9.657913100132011e-05, + "loss": 0.8839, + "step": 3770 + }, + { + "epoch": 0.56, + "grad_norm": 0.359375, + "learning_rate": 9.632008717995916e-05, + "loss": 0.9204, + "step": 3775 + }, + { + "epoch": 0.56, + "grad_norm": 0.337890625, + "learning_rate": 9.606106808334165e-05, + "loss": 0.8863, + "step": 3780 + }, + { + "epoch": 0.56, + "grad_norm": 0.328125, + "learning_rate": 9.580207545177516e-05, + "loss": 0.905, + "step": 3785 + }, + { + "epoch": 0.56, + "grad_norm": 0.32421875, + "learning_rate": 9.554311102538966e-05, + "loss": 0.9207, + "step": 3790 + }, + { + "epoch": 0.56, + "grad_norm": 0.33984375, + "learning_rate": 9.528417654412564e-05, + "loss": 0.9598, + "step": 3795 + }, + { + "epoch": 0.56, + "grad_norm": 0.359375, + "learning_rate": 9.502527374772217e-05, + "loss": 0.943, + "step": 3800 + }, + { + "epoch": 0.57, + "grad_norm": 0.35546875, + "learning_rate": 9.476640437570562e-05, + "loss": 0.9201, + "step": 3805 + }, + { + "epoch": 0.57, + "grad_norm": 0.34765625, + "learning_rate": 9.450757016737776e-05, + "loss": 0.9366, + "step": 3810 + }, + { + "epoch": 0.57, + "grad_norm": 0.345703125, + "learning_rate": 9.424877286180404e-05, + "loss": 0.9357, + "step": 3815 + }, + { + "epoch": 0.57, + "grad_norm": 0.34765625, + "learning_rate": 9.3990014197802e-05, + "loss": 0.9431, + "step": 3820 + }, + { + "epoch": 0.57, + "grad_norm": 0.34765625, + "learning_rate": 9.37312959139296e-05, + "loss": 0.9362, + "step": 3825 + }, + { + "epoch": 0.57, + "grad_norm": 0.34375, + "learning_rate": 9.347261974847341e-05, + "loss": 0.9157, + "step": 3830 + }, + { + "epoch": 0.57, + "grad_norm": 0.34375, + "learning_rate": 9.321398743943706e-05, + "loss": 0.9213, + "step": 3835 + }, + { + "epoch": 0.57, + "grad_norm": 0.3359375, + "learning_rate": 9.295540072452951e-05, + "loss": 0.9502, + "step": 3840 + }, + { + "epoch": 0.57, + "grad_norm": 0.337890625, + "learning_rate": 9.269686134115336e-05, + "loss": 0.9224, + "step": 3845 + }, + { + "epoch": 0.57, + "grad_norm": 0.33984375, + "learning_rate": 9.243837102639328e-05, + "loss": 0.8954, + "step": 3850 + }, + { + "epoch": 0.57, + "grad_norm": 0.345703125, + "learning_rate": 9.217993151700408e-05, + "loss": 0.9021, + "step": 3855 + }, + { + "epoch": 0.57, + "grad_norm": 0.345703125, + "learning_rate": 9.19215445493994e-05, + "loss": 0.9367, + "step": 3860 + }, + { + "epoch": 0.57, + "grad_norm": 0.337890625, + "learning_rate": 9.166321185963984e-05, + "loss": 0.9301, + "step": 3865 + }, + { + "epoch": 0.57, + "grad_norm": 0.349609375, + "learning_rate": 9.140493518342113e-05, + "loss": 0.9468, + "step": 3870 + }, + { + "epoch": 0.58, + "grad_norm": 0.337890625, + "learning_rate": 9.114671625606285e-05, + "loss": 0.891, + "step": 3875 + }, + { + "epoch": 0.58, + "grad_norm": 0.359375, + "learning_rate": 9.088855681249658e-05, + "loss": 0.938, + "step": 3880 + }, + { + "epoch": 0.58, + "grad_norm": 0.34765625, + "learning_rate": 9.063045858725406e-05, + "loss": 0.9287, + "step": 3885 + }, + { + "epoch": 0.58, + "grad_norm": 0.3359375, + "learning_rate": 9.037242331445588e-05, + "loss": 0.8992, + "step": 3890 + }, + { + "epoch": 0.58, + "grad_norm": 0.333984375, + "learning_rate": 9.011445272779962e-05, + "loss": 0.9444, + "step": 3895 + }, + { + "epoch": 0.58, + "grad_norm": 0.35546875, + "learning_rate": 8.985654856054818e-05, + "loss": 0.9119, + "step": 3900 + }, + { + "epoch": 0.58, + "grad_norm": 0.34375, + "learning_rate": 8.95987125455183e-05, + "loss": 0.904, + "step": 3905 + }, + { + "epoch": 0.58, + "grad_norm": 0.33984375, + "learning_rate": 8.934094641506873e-05, + "loss": 0.9223, + "step": 3910 + }, + { + "epoch": 0.58, + "grad_norm": 0.349609375, + "learning_rate": 8.908325190108873e-05, + "loss": 0.9288, + "step": 3915 + }, + { + "epoch": 0.58, + "grad_norm": 0.349609375, + "learning_rate": 8.882563073498635e-05, + "loss": 0.9177, + "step": 3920 + }, + { + "epoch": 0.58, + "grad_norm": 0.359375, + "learning_rate": 8.856808464767689e-05, + "loss": 0.888, + "step": 3925 + }, + { + "epoch": 0.58, + "grad_norm": 0.345703125, + "learning_rate": 8.831061536957107e-05, + "loss": 0.9174, + "step": 3930 + }, + { + "epoch": 0.58, + "grad_norm": 0.359375, + "learning_rate": 8.80532246305637e-05, + "loss": 0.9345, + "step": 3935 + }, + { + "epoch": 0.59, + "grad_norm": 0.333984375, + "learning_rate": 8.779591416002179e-05, + "loss": 0.926, + "step": 3940 + }, + { + "epoch": 0.59, + "grad_norm": 0.349609375, + "learning_rate": 8.753868568677311e-05, + "loss": 0.9409, + "step": 3945 + }, + { + "epoch": 0.59, + "grad_norm": 0.34375, + "learning_rate": 8.728154093909441e-05, + "loss": 0.918, + "step": 3950 + }, + { + "epoch": 0.59, + "grad_norm": 0.36328125, + "learning_rate": 8.702448164470007e-05, + "loss": 0.9695, + "step": 3955 + }, + { + "epoch": 0.59, + "grad_norm": 0.349609375, + "learning_rate": 8.676750953073011e-05, + "loss": 0.9221, + "step": 3960 + }, + { + "epoch": 0.59, + "grad_norm": 0.341796875, + "learning_rate": 8.65106263237389e-05, + "loss": 0.9461, + "step": 3965 + }, + { + "epoch": 0.59, + "grad_norm": 0.3515625, + "learning_rate": 8.625383374968357e-05, + "loss": 0.9107, + "step": 3970 + }, + { + "epoch": 0.59, + "grad_norm": 0.359375, + "learning_rate": 8.599713353391207e-05, + "loss": 0.9238, + "step": 3975 + }, + { + "epoch": 0.59, + "grad_norm": 0.34765625, + "learning_rate": 8.574052740115201e-05, + "loss": 0.9063, + "step": 3980 + }, + { + "epoch": 0.59, + "grad_norm": 0.34765625, + "learning_rate": 8.548401707549878e-05, + "loss": 0.9457, + "step": 3985 + }, + { + "epoch": 0.59, + "grad_norm": 0.353515625, + "learning_rate": 8.522760428040402e-05, + "loss": 0.9385, + "step": 3990 + }, + { + "epoch": 0.59, + "grad_norm": 0.341796875, + "learning_rate": 8.49712907386642e-05, + "loss": 0.9101, + "step": 3995 + }, + { + "epoch": 0.59, + "grad_norm": 0.34375, + "learning_rate": 8.471507817240882e-05, + "loss": 0.9426, + "step": 4000 + }, + { + "epoch": 0.59, + "grad_norm": 0.345703125, + "learning_rate": 8.445896830308898e-05, + "loss": 0.9284, + "step": 4005 + }, + { + "epoch": 0.6, + "grad_norm": 0.34375, + "learning_rate": 8.420296285146574e-05, + "loss": 0.8853, + "step": 4010 + }, + { + "epoch": 0.6, + "grad_norm": 0.33984375, + "learning_rate": 8.394706353759869e-05, + "loss": 0.9316, + "step": 4015 + }, + { + "epoch": 0.6, + "grad_norm": 0.3515625, + "learning_rate": 8.369127208083418e-05, + "loss": 0.9209, + "step": 4020 + }, + { + "epoch": 0.6, + "grad_norm": 0.3515625, + "learning_rate": 8.343559019979392e-05, + "loss": 0.903, + "step": 4025 + }, + { + "epoch": 0.6, + "grad_norm": 0.35546875, + "learning_rate": 8.318001961236349e-05, + "loss": 0.9319, + "step": 4030 + }, + { + "epoch": 0.6, + "grad_norm": 0.333984375, + "learning_rate": 8.292456203568055e-05, + "loss": 0.8953, + "step": 4035 + }, + { + "epoch": 0.6, + "grad_norm": 0.34765625, + "learning_rate": 8.266921918612359e-05, + "loss": 0.9299, + "step": 4040 + }, + { + "epoch": 0.6, + "grad_norm": 0.3515625, + "learning_rate": 8.241399277930021e-05, + "loss": 0.9355, + "step": 4045 + }, + { + "epoch": 0.6, + "grad_norm": 0.330078125, + "learning_rate": 8.215888453003562e-05, + "loss": 0.8886, + "step": 4050 + }, + { + "epoch": 0.6, + "grad_norm": 0.337890625, + "learning_rate": 8.190389615236123e-05, + "loss": 0.9026, + "step": 4055 + }, + { + "epoch": 0.6, + "grad_norm": 0.341796875, + "learning_rate": 8.164902935950303e-05, + "loss": 0.8896, + "step": 4060 + }, + { + "epoch": 0.6, + "grad_norm": 0.345703125, + "learning_rate": 8.139428586386998e-05, + "loss": 0.9258, + "step": 4065 + }, + { + "epoch": 0.6, + "grad_norm": 0.3515625, + "learning_rate": 8.11396673770428e-05, + "loss": 0.9139, + "step": 4070 + }, + { + "epoch": 0.61, + "grad_norm": 0.34375, + "learning_rate": 8.088517560976226e-05, + "loss": 0.9078, + "step": 4075 + }, + { + "epoch": 0.61, + "grad_norm": 0.353515625, + "learning_rate": 8.063081227191755e-05, + "loss": 0.9161, + "step": 4080 + }, + { + "epoch": 0.61, + "grad_norm": 0.34765625, + "learning_rate": 8.037657907253523e-05, + "loss": 0.9098, + "step": 4085 + }, + { + "epoch": 0.61, + "grad_norm": 0.337890625, + "learning_rate": 8.012247771976726e-05, + "loss": 0.9088, + "step": 4090 + }, + { + "epoch": 0.61, + "grad_norm": 0.349609375, + "learning_rate": 7.986850992087984e-05, + "loss": 0.9156, + "step": 4095 + }, + { + "epoch": 0.61, + "grad_norm": 0.3515625, + "learning_rate": 7.961467738224189e-05, + "loss": 0.9148, + "step": 4100 + }, + { + "epoch": 0.61, + "grad_norm": 0.345703125, + "learning_rate": 7.936098180931341e-05, + "loss": 0.9172, + "step": 4105 + }, + { + "epoch": 0.61, + "grad_norm": 0.361328125, + "learning_rate": 7.910742490663425e-05, + "loss": 0.9105, + "step": 4110 + }, + { + "epoch": 0.61, + "grad_norm": 0.337890625, + "learning_rate": 7.885400837781255e-05, + "loss": 0.9309, + "step": 4115 + }, + { + "epoch": 0.61, + "grad_norm": 0.337890625, + "learning_rate": 7.860073392551323e-05, + "loss": 0.8877, + "step": 4120 + }, + { + "epoch": 0.61, + "grad_norm": 0.349609375, + "learning_rate": 7.83476032514467e-05, + "loss": 0.9208, + "step": 4125 + }, + { + "epoch": 0.61, + "grad_norm": 0.34375, + "learning_rate": 7.809461805635734e-05, + "loss": 0.8915, + "step": 4130 + }, + { + "epoch": 0.61, + "grad_norm": 0.357421875, + "learning_rate": 7.784178004001197e-05, + "loss": 0.9121, + "step": 4135 + }, + { + "epoch": 0.61, + "grad_norm": 0.345703125, + "learning_rate": 7.758909090118868e-05, + "loss": 0.9053, + "step": 4140 + }, + { + "epoch": 0.62, + "grad_norm": 0.345703125, + "learning_rate": 7.733655233766528e-05, + "loss": 0.9288, + "step": 4145 + }, + { + "epoch": 0.62, + "grad_norm": 0.345703125, + "learning_rate": 7.708416604620772e-05, + "loss": 0.9522, + "step": 4150 + }, + { + "epoch": 0.62, + "grad_norm": 0.349609375, + "learning_rate": 7.683193372255898e-05, + "loss": 0.8911, + "step": 4155 + }, + { + "epoch": 0.62, + "grad_norm": 0.34375, + "learning_rate": 7.657985706142767e-05, + "loss": 0.917, + "step": 4160 + }, + { + "epoch": 0.62, + "grad_norm": 0.33984375, + "learning_rate": 7.632793775647625e-05, + "loss": 0.8925, + "step": 4165 + }, + { + "epoch": 0.62, + "grad_norm": 0.349609375, + "learning_rate": 7.607617750031014e-05, + "loss": 0.9262, + "step": 4170 + }, + { + "epoch": 0.62, + "grad_norm": 0.345703125, + "learning_rate": 7.58245779844661e-05, + "loss": 0.954, + "step": 4175 + }, + { + "epoch": 0.62, + "grad_norm": 0.35546875, + "learning_rate": 7.557314089940085e-05, + "loss": 0.9071, + "step": 4180 + }, + { + "epoch": 0.62, + "grad_norm": 0.3515625, + "learning_rate": 7.532186793447977e-05, + "loss": 0.9207, + "step": 4185 + }, + { + "epoch": 0.62, + "grad_norm": 0.361328125, + "learning_rate": 7.507076077796565e-05, + "loss": 0.9352, + "step": 4190 + }, + { + "epoch": 0.62, + "grad_norm": 0.34375, + "learning_rate": 7.481982111700705e-05, + "loss": 0.9581, + "step": 4195 + }, + { + "epoch": 0.62, + "grad_norm": 0.357421875, + "learning_rate": 7.456905063762731e-05, + "loss": 0.9343, + "step": 4200 + } + ], + "logging_steps": 5, + "max_steps": 6734, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 5.906288014140637e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}