diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,100651 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999652161814324, + "eval_steps": 500, + "global_step": 14374, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.956763713520471e-05, + "grad_norm": 16.375, + "learning_rate": 4.6296296296296296e-06, + "loss": 4.1097, + "step": 1 + }, + { + "epoch": 0.00013913527427040942, + "grad_norm": 30.125, + "learning_rate": 9.259259259259259e-06, + "loss": 4.3742, + "step": 2 + }, + { + "epoch": 0.0002087029114056141, + "grad_norm": 20.625, + "learning_rate": 1.3888888888888888e-05, + "loss": 3.9231, + "step": 3 + }, + { + "epoch": 0.00027827054854081884, + "grad_norm": 14.3125, + "learning_rate": 1.8518518518518518e-05, + "loss": 3.7877, + "step": 4 + }, + { + "epoch": 0.0003478381856760235, + "grad_norm": 20.0, + "learning_rate": 2.3148148148148147e-05, + "loss": 3.9694, + "step": 5 + }, + { + "epoch": 0.0004174058228112282, + "grad_norm": 13.75, + "learning_rate": 2.7777777777777776e-05, + "loss": 4.0547, + "step": 6 + }, + { + "epoch": 0.0004869734599464329, + "grad_norm": 17.875, + "learning_rate": 3.240740740740741e-05, + "loss": 3.9384, + "step": 7 + }, + { + "epoch": 0.0005565410970816377, + "grad_norm": 15.8125, + "learning_rate": 3.7037037037037037e-05, + "loss": 3.8156, + "step": 8 + }, + { + "epoch": 0.0006261087342168423, + "grad_norm": 11.625, + "learning_rate": 4.1666666666666665e-05, + "loss": 4.0875, + "step": 9 + }, + { + "epoch": 0.000695676371352047, + "grad_norm": 14.125, + "learning_rate": 4.6296296296296294e-05, + "loss": 3.9151, + "step": 10 + }, + { + "epoch": 0.0007652440084872517, + "grad_norm": 5.65625, + "learning_rate": 5.092592592592592e-05, + "loss": 3.8153, + "step": 11 + }, + { + "epoch": 0.0008348116456224564, + "grad_norm": 5.09375, + "learning_rate": 5.555555555555555e-05, + "loss": 3.9969, + "step": 12 + }, + { + "epoch": 0.0009043792827576611, + "grad_norm": 7.65625, + "learning_rate": 6.018518518518518e-05, + "loss": 3.6755, + "step": 13 + }, + { + "epoch": 0.0009739469198928658, + "grad_norm": 5.6875, + "learning_rate": 6.481481481481482e-05, + "loss": 3.8728, + "step": 14 + }, + { + "epoch": 0.0010435145570280705, + "grad_norm": 3.03125, + "learning_rate": 6.944444444444444e-05, + "loss": 3.8949, + "step": 15 + }, + { + "epoch": 0.0011130821941632753, + "grad_norm": 2.65625, + "learning_rate": 7.407407407407407e-05, + "loss": 4.2955, + "step": 16 + }, + { + "epoch": 0.00118264983129848, + "grad_norm": 3.546875, + "learning_rate": 7.87037037037037e-05, + "loss": 3.6512, + "step": 17 + }, + { + "epoch": 0.0012522174684336846, + "grad_norm": 2.765625, + "learning_rate": 8.333333333333333e-05, + "loss": 4.3409, + "step": 18 + }, + { + "epoch": 0.0013217851055688894, + "grad_norm": 2.546875, + "learning_rate": 8.796296296296297e-05, + "loss": 3.5914, + "step": 19 + }, + { + "epoch": 0.001391352742704094, + "grad_norm": 2.21875, + "learning_rate": 9.259259259259259e-05, + "loss": 4.1952, + "step": 20 + }, + { + "epoch": 0.0014609203798392988, + "grad_norm": 2.234375, + "learning_rate": 9.722222222222223e-05, + "loss": 3.7605, + "step": 21 + }, + { + "epoch": 0.0015304880169745034, + "grad_norm": 2.703125, + "learning_rate": 0.00010185185185185185, + "loss": 3.8877, + "step": 22 + }, + { + "epoch": 0.0016000556541097082, + "grad_norm": 1.6953125, + "learning_rate": 0.00010648148148148149, + "loss": 3.738, + "step": 23 + }, + { + "epoch": 0.0016696232912449128, + "grad_norm": 1.75, + "learning_rate": 0.0001111111111111111, + "loss": 3.7867, + "step": 24 + }, + { + "epoch": 0.0017391909283801176, + "grad_norm": 2.0, + "learning_rate": 0.00011574074074074075, + "loss": 3.6365, + "step": 25 + }, + { + "epoch": 0.0018087585655153222, + "grad_norm": 1.8984375, + "learning_rate": 0.00012037037037037036, + "loss": 4.0938, + "step": 26 + }, + { + "epoch": 0.001878326202650527, + "grad_norm": 2.25, + "learning_rate": 0.000125, + "loss": 3.5465, + "step": 27 + }, + { + "epoch": 0.0019478938397857316, + "grad_norm": 1.7578125, + "learning_rate": 0.00012962962962962963, + "loss": 4.0124, + "step": 28 + }, + { + "epoch": 0.0020174614769209362, + "grad_norm": 1.6015625, + "learning_rate": 0.0001342592592592593, + "loss": 3.7071, + "step": 29 + }, + { + "epoch": 0.002087029114056141, + "grad_norm": 2.140625, + "learning_rate": 0.0001388888888888889, + "loss": 4.0923, + "step": 30 + }, + { + "epoch": 0.002156596751191346, + "grad_norm": 1.9453125, + "learning_rate": 0.00014351851851851852, + "loss": 3.6059, + "step": 31 + }, + { + "epoch": 0.0022261643883265507, + "grad_norm": 1.9765625, + "learning_rate": 0.00014814814814814815, + "loss": 3.5063, + "step": 32 + }, + { + "epoch": 0.002295732025461755, + "grad_norm": 2.59375, + "learning_rate": 0.0001527777777777778, + "loss": 3.846, + "step": 33 + }, + { + "epoch": 0.00236529966259696, + "grad_norm": 1.9921875, + "learning_rate": 0.0001574074074074074, + "loss": 3.8888, + "step": 34 + }, + { + "epoch": 0.0024348672997321647, + "grad_norm": 3.046875, + "learning_rate": 0.00016203703703703703, + "loss": 3.7714, + "step": 35 + }, + { + "epoch": 0.002504434936867369, + "grad_norm": 2.1875, + "learning_rate": 0.00016666666666666666, + "loss": 3.7296, + "step": 36 + }, + { + "epoch": 0.002574002574002574, + "grad_norm": 2.484375, + "learning_rate": 0.00017129629629629632, + "loss": 3.9198, + "step": 37 + }, + { + "epoch": 0.0026435702111377787, + "grad_norm": 2.125, + "learning_rate": 0.00017592592592592595, + "loss": 3.6842, + "step": 38 + }, + { + "epoch": 0.0027131378482729836, + "grad_norm": 2.828125, + "learning_rate": 0.00018055555555555555, + "loss": 3.5604, + "step": 39 + }, + { + "epoch": 0.002782705485408188, + "grad_norm": 2.203125, + "learning_rate": 0.00018518518518518518, + "loss": 3.8453, + "step": 40 + }, + { + "epoch": 0.0028522731225433928, + "grad_norm": 2.515625, + "learning_rate": 0.00018981481481481483, + "loss": 3.5141, + "step": 41 + }, + { + "epoch": 0.0029218407596785976, + "grad_norm": 2.8125, + "learning_rate": 0.00019444444444444446, + "loss": 3.477, + "step": 42 + }, + { + "epoch": 0.0029914083968138024, + "grad_norm": 4.0, + "learning_rate": 0.00019907407407407406, + "loss": 3.822, + "step": 43 + }, + { + "epoch": 0.0030609760339490068, + "grad_norm": 2.09375, + "learning_rate": 0.0002037037037037037, + "loss": 3.9645, + "step": 44 + }, + { + "epoch": 0.0031305436710842116, + "grad_norm": 4.375, + "learning_rate": 0.00020833333333333335, + "loss": 3.3629, + "step": 45 + }, + { + "epoch": 0.0032001113082194164, + "grad_norm": 3.578125, + "learning_rate": 0.00021296296296296298, + "loss": 3.4721, + "step": 46 + }, + { + "epoch": 0.0032696789453546212, + "grad_norm": 4.03125, + "learning_rate": 0.0002175925925925926, + "loss": 3.2992, + "step": 47 + }, + { + "epoch": 0.0033392465824898256, + "grad_norm": 3.234375, + "learning_rate": 0.0002222222222222222, + "loss": 3.648, + "step": 48 + }, + { + "epoch": 0.0034088142196250304, + "grad_norm": 3.203125, + "learning_rate": 0.00022685185185185186, + "loss": 3.3534, + "step": 49 + }, + { + "epoch": 0.0034783818567602352, + "grad_norm": 3.203125, + "learning_rate": 0.0002314814814814815, + "loss": 3.8404, + "step": 50 + }, + { + "epoch": 0.0035479494938954396, + "grad_norm": 2.859375, + "learning_rate": 0.00023611111111111112, + "loss": 3.6755, + "step": 51 + }, + { + "epoch": 0.0036175171310306444, + "grad_norm": 3.25, + "learning_rate": 0.00024074074074074072, + "loss": 3.5372, + "step": 52 + }, + { + "epoch": 0.0036870847681658493, + "grad_norm": 2.953125, + "learning_rate": 0.0002453703703703704, + "loss": 3.6597, + "step": 53 + }, + { + "epoch": 0.003756652405301054, + "grad_norm": 3.53125, + "learning_rate": 0.00025, + "loss": 3.7115, + "step": 54 + }, + { + "epoch": 0.0038262200424362585, + "grad_norm": 3.9375, + "learning_rate": 0.00025462962962962966, + "loss": 3.8284, + "step": 55 + }, + { + "epoch": 0.0038957876795714633, + "grad_norm": 3.234375, + "learning_rate": 0.00025925925925925926, + "loss": 3.8952, + "step": 56 + }, + { + "epoch": 0.003965355316706668, + "grad_norm": 3.28125, + "learning_rate": 0.0002638888888888889, + "loss": 3.6997, + "step": 57 + }, + { + "epoch": 0.0040349229538418725, + "grad_norm": 7.03125, + "learning_rate": 0.0002685185185185186, + "loss": 3.8363, + "step": 58 + }, + { + "epoch": 0.004104490590977078, + "grad_norm": 1.9921875, + "learning_rate": 0.0002731481481481481, + "loss": 3.6792, + "step": 59 + }, + { + "epoch": 0.004174058228112282, + "grad_norm": 3.0, + "learning_rate": 0.0002777777777777778, + "loss": 3.6028, + "step": 60 + }, + { + "epoch": 0.0042436258652474865, + "grad_norm": 2.765625, + "learning_rate": 0.0002824074074074074, + "loss": 3.6588, + "step": 61 + }, + { + "epoch": 0.004313193502382692, + "grad_norm": 2.734375, + "learning_rate": 0.00028703703703703703, + "loss": 3.3777, + "step": 62 + }, + { + "epoch": 0.004382761139517896, + "grad_norm": 4.5, + "learning_rate": 0.0002916666666666667, + "loss": 3.3868, + "step": 63 + }, + { + "epoch": 0.004452328776653101, + "grad_norm": 9.5, + "learning_rate": 0.0002962962962962963, + "loss": 3.8549, + "step": 64 + }, + { + "epoch": 0.004521896413788306, + "grad_norm": 3.328125, + "learning_rate": 0.00030092592592592595, + "loss": 3.5282, + "step": 65 + }, + { + "epoch": 0.00459146405092351, + "grad_norm": 3.203125, + "learning_rate": 0.0003055555555555556, + "loss": 3.5236, + "step": 66 + }, + { + "epoch": 0.004661031688058715, + "grad_norm": 4.59375, + "learning_rate": 0.0003101851851851852, + "loss": 3.8009, + "step": 67 + }, + { + "epoch": 0.00473059932519392, + "grad_norm": 3.25, + "learning_rate": 0.0003148148148148148, + "loss": 3.4033, + "step": 68 + }, + { + "epoch": 0.004800166962329124, + "grad_norm": 4.96875, + "learning_rate": 0.0003194444444444444, + "loss": 3.2774, + "step": 69 + }, + { + "epoch": 0.004869734599464329, + "grad_norm": 3.265625, + "learning_rate": 0.00032407407407407406, + "loss": 3.7126, + "step": 70 + }, + { + "epoch": 0.004939302236599534, + "grad_norm": 3.625, + "learning_rate": 0.0003287037037037037, + "loss": 3.7453, + "step": 71 + }, + { + "epoch": 0.005008869873734738, + "grad_norm": 2.96875, + "learning_rate": 0.0003333333333333333, + "loss": 3.2229, + "step": 72 + }, + { + "epoch": 0.0050784375108699435, + "grad_norm": 4.875, + "learning_rate": 0.000337962962962963, + "loss": 3.7234, + "step": 73 + }, + { + "epoch": 0.005148005148005148, + "grad_norm": 3.71875, + "learning_rate": 0.00034259259259259263, + "loss": 3.7748, + "step": 74 + }, + { + "epoch": 0.005217572785140353, + "grad_norm": 2.25, + "learning_rate": 0.00034722222222222224, + "loss": 3.6813, + "step": 75 + }, + { + "epoch": 0.0052871404222755575, + "grad_norm": 3.3125, + "learning_rate": 0.0003518518518518519, + "loss": 3.793, + "step": 76 + }, + { + "epoch": 0.005356708059410762, + "grad_norm": 3.1875, + "learning_rate": 0.00035648148148148144, + "loss": 3.2565, + "step": 77 + }, + { + "epoch": 0.005426275696545967, + "grad_norm": 3.5, + "learning_rate": 0.0003611111111111111, + "loss": 3.6943, + "step": 78 + }, + { + "epoch": 0.0054958433336811715, + "grad_norm": 3.453125, + "learning_rate": 0.00036574074074074075, + "loss": 3.3554, + "step": 79 + }, + { + "epoch": 0.005565410970816376, + "grad_norm": 3.171875, + "learning_rate": 0.00037037037037037035, + "loss": 3.6748, + "step": 80 + }, + { + "epoch": 0.005634978607951581, + "grad_norm": 3.25, + "learning_rate": 0.000375, + "loss": 3.5064, + "step": 81 + }, + { + "epoch": 0.0057045462450867855, + "grad_norm": 2.65625, + "learning_rate": 0.00037962962962962966, + "loss": 3.9164, + "step": 82 + }, + { + "epoch": 0.005774113882221991, + "grad_norm": 3.84375, + "learning_rate": 0.00038425925925925927, + "loss": 3.4874, + "step": 83 + }, + { + "epoch": 0.005843681519357195, + "grad_norm": 3.40625, + "learning_rate": 0.0003888888888888889, + "loss": 3.6889, + "step": 84 + }, + { + "epoch": 0.0059132491564923995, + "grad_norm": 3.140625, + "learning_rate": 0.0003935185185185186, + "loss": 3.3944, + "step": 85 + }, + { + "epoch": 0.005982816793627605, + "grad_norm": 2.34375, + "learning_rate": 0.0003981481481481481, + "loss": 3.3187, + "step": 86 + }, + { + "epoch": 0.006052384430762809, + "grad_norm": 6.5, + "learning_rate": 0.0004027777777777778, + "loss": 3.6085, + "step": 87 + }, + { + "epoch": 0.0061219520678980135, + "grad_norm": 6.375, + "learning_rate": 0.0004074074074074074, + "loss": 3.3279, + "step": 88 + }, + { + "epoch": 0.006191519705033219, + "grad_norm": 4.28125, + "learning_rate": 0.00041203703703703704, + "loss": 3.6968, + "step": 89 + }, + { + "epoch": 0.006261087342168423, + "grad_norm": 5.0, + "learning_rate": 0.0004166666666666667, + "loss": 3.162, + "step": 90 + }, + { + "epoch": 0.006330654979303628, + "grad_norm": 3.296875, + "learning_rate": 0.0004212962962962963, + "loss": 3.106, + "step": 91 + }, + { + "epoch": 0.006400222616438833, + "grad_norm": 3.0625, + "learning_rate": 0.00042592592592592595, + "loss": 3.2511, + "step": 92 + }, + { + "epoch": 0.006469790253574037, + "grad_norm": 3.734375, + "learning_rate": 0.0004305555555555556, + "loss": 3.3395, + "step": 93 + }, + { + "epoch": 0.0065393578907092425, + "grad_norm": 3.5625, + "learning_rate": 0.0004351851851851852, + "loss": 3.9849, + "step": 94 + }, + { + "epoch": 0.006608925527844447, + "grad_norm": 3.34375, + "learning_rate": 0.0004398148148148148, + "loss": 3.7063, + "step": 95 + }, + { + "epoch": 0.006678493164979651, + "grad_norm": 3.671875, + "learning_rate": 0.0004444444444444444, + "loss": 3.643, + "step": 96 + }, + { + "epoch": 0.0067480608021148565, + "grad_norm": 3.671875, + "learning_rate": 0.00044907407407407407, + "loss": 3.7839, + "step": 97 + }, + { + "epoch": 0.006817628439250061, + "grad_norm": 2.9375, + "learning_rate": 0.0004537037037037037, + "loss": 3.3117, + "step": 98 + }, + { + "epoch": 0.006887196076385265, + "grad_norm": 4.0625, + "learning_rate": 0.0004583333333333333, + "loss": 3.2695, + "step": 99 + }, + { + "epoch": 0.0069567637135204705, + "grad_norm": 3.1875, + "learning_rate": 0.000462962962962963, + "loss": 4.1127, + "step": 100 + }, + { + "epoch": 0.007026331350655675, + "grad_norm": 7.0, + "learning_rate": 0.00046759259259259264, + "loss": 3.3675, + "step": 101 + }, + { + "epoch": 0.007095898987790879, + "grad_norm": 4.0625, + "learning_rate": 0.00047222222222222224, + "loss": 3.4207, + "step": 102 + }, + { + "epoch": 0.0071654666249260845, + "grad_norm": 4.125, + "learning_rate": 0.0004768518518518519, + "loss": 3.3211, + "step": 103 + }, + { + "epoch": 0.007235034262061289, + "grad_norm": 3.84375, + "learning_rate": 0.00048148148148148144, + "loss": 3.0541, + "step": 104 + }, + { + "epoch": 0.007304601899196494, + "grad_norm": 6.46875, + "learning_rate": 0.0004861111111111111, + "loss": 3.7767, + "step": 105 + }, + { + "epoch": 0.0073741695363316985, + "grad_norm": 6.21875, + "learning_rate": 0.0004907407407407408, + "loss": 3.6529, + "step": 106 + }, + { + "epoch": 0.007443737173466903, + "grad_norm": 5.03125, + "learning_rate": 0.0004953703703703704, + "loss": 3.3748, + "step": 107 + }, + { + "epoch": 0.007513304810602108, + "grad_norm": 3.3125, + "learning_rate": 0.0005, + "loss": 3.1601, + "step": 108 + }, + { + "epoch": 0.0075828724477373126, + "grad_norm": 4.53125, + "learning_rate": 0.0005046296296296296, + "loss": 3.583, + "step": 109 + }, + { + "epoch": 0.007652440084872517, + "grad_norm": 4.46875, + "learning_rate": 0.0005092592592592593, + "loss": 3.576, + "step": 110 + }, + { + "epoch": 0.007722007722007722, + "grad_norm": 3.140625, + "learning_rate": 0.0005138888888888888, + "loss": 3.2224, + "step": 111 + }, + { + "epoch": 0.007791575359142927, + "grad_norm": 3.84375, + "learning_rate": 0.0005185185185185185, + "loss": 3.5271, + "step": 112 + }, + { + "epoch": 0.007861142996278131, + "grad_norm": 5.6875, + "learning_rate": 0.0005231481481481481, + "loss": 3.5304, + "step": 113 + }, + { + "epoch": 0.007930710633413336, + "grad_norm": 3.609375, + "learning_rate": 0.0005277777777777778, + "loss": 3.4794, + "step": 114 + }, + { + "epoch": 0.008000278270548541, + "grad_norm": 2.921875, + "learning_rate": 0.0005324074074074074, + "loss": 3.7191, + "step": 115 + }, + { + "epoch": 0.008069845907683745, + "grad_norm": 3.328125, + "learning_rate": 0.0005370370370370371, + "loss": 3.5999, + "step": 116 + }, + { + "epoch": 0.00813941354481895, + "grad_norm": 3.828125, + "learning_rate": 0.0005416666666666666, + "loss": 3.5959, + "step": 117 + }, + { + "epoch": 0.008208981181954155, + "grad_norm": 2.484375, + "learning_rate": 0.0005462962962962962, + "loss": 3.5367, + "step": 118 + }, + { + "epoch": 0.008278548819089359, + "grad_norm": 2.96875, + "learning_rate": 0.000550925925925926, + "loss": 3.5518, + "step": 119 + }, + { + "epoch": 0.008348116456224564, + "grad_norm": 3.546875, + "learning_rate": 0.0005555555555555556, + "loss": 3.5799, + "step": 120 + }, + { + "epoch": 0.00841768409335977, + "grad_norm": 3.25, + "learning_rate": 0.0005601851851851853, + "loss": 3.4259, + "step": 121 + }, + { + "epoch": 0.008487251730494973, + "grad_norm": 3.078125, + "learning_rate": 0.0005648148148148148, + "loss": 3.1868, + "step": 122 + }, + { + "epoch": 0.008556819367630178, + "grad_norm": 2.609375, + "learning_rate": 0.0005694444444444445, + "loss": 3.6312, + "step": 123 + }, + { + "epoch": 0.008626387004765384, + "grad_norm": 3.421875, + "learning_rate": 0.0005740740740740741, + "loss": 3.5673, + "step": 124 + }, + { + "epoch": 0.008695954641900587, + "grad_norm": 3.5, + "learning_rate": 0.0005787037037037038, + "loss": 3.3845, + "step": 125 + }, + { + "epoch": 0.008765522279035792, + "grad_norm": 3.46875, + "learning_rate": 0.0005833333333333334, + "loss": 3.5458, + "step": 126 + }, + { + "epoch": 0.008835089916170998, + "grad_norm": 3.0, + "learning_rate": 0.0005879629629629629, + "loss": 3.3844, + "step": 127 + }, + { + "epoch": 0.008904657553306203, + "grad_norm": 2.921875, + "learning_rate": 0.0005925925925925926, + "loss": 3.4753, + "step": 128 + }, + { + "epoch": 0.008974225190441406, + "grad_norm": 2.75, + "learning_rate": 0.0005972222222222222, + "loss": 3.7892, + "step": 129 + }, + { + "epoch": 0.009043792827576612, + "grad_norm": 3.234375, + "learning_rate": 0.0006018518518518519, + "loss": 3.3437, + "step": 130 + }, + { + "epoch": 0.009113360464711817, + "grad_norm": 3.171875, + "learning_rate": 0.0006064814814814815, + "loss": 3.3362, + "step": 131 + }, + { + "epoch": 0.00918292810184702, + "grad_norm": 5.875, + "learning_rate": 0.0006111111111111112, + "loss": 3.806, + "step": 132 + }, + { + "epoch": 0.009252495738982226, + "grad_norm": 4.65625, + "learning_rate": 0.0006157407407407407, + "loss": 3.3642, + "step": 133 + }, + { + "epoch": 0.00932206337611743, + "grad_norm": 4.8125, + "learning_rate": 0.0006203703703703704, + "loss": 3.4821, + "step": 134 + }, + { + "epoch": 0.009391631013252634, + "grad_norm": 5.03125, + "learning_rate": 0.000625, + "loss": 3.5422, + "step": 135 + }, + { + "epoch": 0.00946119865038784, + "grad_norm": 3.796875, + "learning_rate": 0.0006296296296296296, + "loss": 3.7212, + "step": 136 + }, + { + "epoch": 0.009530766287523045, + "grad_norm": 3.234375, + "learning_rate": 0.0006342592592592593, + "loss": 3.7358, + "step": 137 + }, + { + "epoch": 0.009600333924658248, + "grad_norm": 3.125, + "learning_rate": 0.0006388888888888888, + "loss": 3.2124, + "step": 138 + }, + { + "epoch": 0.009669901561793454, + "grad_norm": 2.34375, + "learning_rate": 0.0006435185185185185, + "loss": 3.5355, + "step": 139 + }, + { + "epoch": 0.009739469198928659, + "grad_norm": 3.828125, + "learning_rate": 0.0006481481481481481, + "loss": 2.8285, + "step": 140 + }, + { + "epoch": 0.009809036836063862, + "grad_norm": 2.265625, + "learning_rate": 0.0006527777777777778, + "loss": 3.2107, + "step": 141 + }, + { + "epoch": 0.009878604473199068, + "grad_norm": 2.671875, + "learning_rate": 0.0006574074074074074, + "loss": 3.6405, + "step": 142 + }, + { + "epoch": 0.009948172110334273, + "grad_norm": 2.734375, + "learning_rate": 0.0006620370370370372, + "loss": 3.77, + "step": 143 + }, + { + "epoch": 0.010017739747469476, + "grad_norm": 5.5625, + "learning_rate": 0.0006666666666666666, + "loss": 3.5891, + "step": 144 + }, + { + "epoch": 0.010087307384604682, + "grad_norm": 4.1875, + "learning_rate": 0.0006712962962962962, + "loss": 3.6008, + "step": 145 + }, + { + "epoch": 0.010156875021739887, + "grad_norm": 2.4375, + "learning_rate": 0.000675925925925926, + "loss": 3.3318, + "step": 146 + }, + { + "epoch": 0.010226442658875092, + "grad_norm": 3.75, + "learning_rate": 0.0006805555555555556, + "loss": 3.635, + "step": 147 + }, + { + "epoch": 0.010296010296010296, + "grad_norm": 3.59375, + "learning_rate": 0.0006851851851851853, + "loss": 3.6206, + "step": 148 + }, + { + "epoch": 0.010365577933145501, + "grad_norm": 3.5625, + "learning_rate": 0.0006898148148148148, + "loss": 3.5804, + "step": 149 + }, + { + "epoch": 0.010435145570280706, + "grad_norm": 2.46875, + "learning_rate": 0.0006944444444444445, + "loss": 3.4559, + "step": 150 + }, + { + "epoch": 0.01050471320741591, + "grad_norm": 2.859375, + "learning_rate": 0.0006990740740740741, + "loss": 3.2063, + "step": 151 + }, + { + "epoch": 0.010574280844551115, + "grad_norm": 2.96875, + "learning_rate": 0.0007037037037037038, + "loss": 3.746, + "step": 152 + }, + { + "epoch": 0.01064384848168632, + "grad_norm": 5.75, + "learning_rate": 0.0007083333333333334, + "loss": 2.9674, + "step": 153 + }, + { + "epoch": 0.010713416118821524, + "grad_norm": 2.65625, + "learning_rate": 0.0007129629629629629, + "loss": 3.7912, + "step": 154 + }, + { + "epoch": 0.010782983755956729, + "grad_norm": 2.90625, + "learning_rate": 0.0007175925925925926, + "loss": 3.2516, + "step": 155 + }, + { + "epoch": 0.010852551393091934, + "grad_norm": 3.1875, + "learning_rate": 0.0007222222222222222, + "loss": 3.3805, + "step": 156 + }, + { + "epoch": 0.010922119030227138, + "grad_norm": 3.96875, + "learning_rate": 0.0007268518518518519, + "loss": 3.542, + "step": 157 + }, + { + "epoch": 0.010991686667362343, + "grad_norm": 3.328125, + "learning_rate": 0.0007314814814814815, + "loss": 3.4749, + "step": 158 + }, + { + "epoch": 0.011061254304497548, + "grad_norm": 2.28125, + "learning_rate": 0.0007361111111111112, + "loss": 3.5944, + "step": 159 + }, + { + "epoch": 0.011130821941632752, + "grad_norm": 3.25, + "learning_rate": 0.0007407407407407407, + "loss": 3.2803, + "step": 160 + }, + { + "epoch": 0.011200389578767957, + "grad_norm": 3.453125, + "learning_rate": 0.0007453703703703704, + "loss": 3.3337, + "step": 161 + }, + { + "epoch": 0.011269957215903162, + "grad_norm": 2.515625, + "learning_rate": 0.00075, + "loss": 3.5694, + "step": 162 + }, + { + "epoch": 0.011339524853038366, + "grad_norm": 3.921875, + "learning_rate": 0.0007546296296296296, + "loss": 3.3504, + "step": 163 + }, + { + "epoch": 0.011409092490173571, + "grad_norm": 4.65625, + "learning_rate": 0.0007592592592592593, + "loss": 3.3871, + "step": 164 + }, + { + "epoch": 0.011478660127308776, + "grad_norm": 3.703125, + "learning_rate": 0.0007638888888888888, + "loss": 3.5259, + "step": 165 + }, + { + "epoch": 0.011548227764443982, + "grad_norm": 2.796875, + "learning_rate": 0.0007685185185185185, + "loss": 3.2406, + "step": 166 + }, + { + "epoch": 0.011617795401579185, + "grad_norm": 4.03125, + "learning_rate": 0.0007731481481481481, + "loss": 3.4967, + "step": 167 + }, + { + "epoch": 0.01168736303871439, + "grad_norm": 2.90625, + "learning_rate": 0.0007777777777777778, + "loss": 3.6256, + "step": 168 + }, + { + "epoch": 0.011756930675849596, + "grad_norm": 2.21875, + "learning_rate": 0.0007824074074074074, + "loss": 3.4215, + "step": 169 + }, + { + "epoch": 0.011826498312984799, + "grad_norm": 2.796875, + "learning_rate": 0.0007870370370370372, + "loss": 3.2696, + "step": 170 + }, + { + "epoch": 0.011896065950120004, + "grad_norm": 3.109375, + "learning_rate": 0.0007916666666666666, + "loss": 3.34, + "step": 171 + }, + { + "epoch": 0.01196563358725521, + "grad_norm": 2.25, + "learning_rate": 0.0007962962962962962, + "loss": 3.4738, + "step": 172 + }, + { + "epoch": 0.012035201224390413, + "grad_norm": 2.484375, + "learning_rate": 0.000800925925925926, + "loss": 3.5333, + "step": 173 + }, + { + "epoch": 0.012104768861525618, + "grad_norm": 3.453125, + "learning_rate": 0.0008055555555555556, + "loss": 3.6269, + "step": 174 + }, + { + "epoch": 0.012174336498660824, + "grad_norm": 2.546875, + "learning_rate": 0.0008101851851851853, + "loss": 3.3796, + "step": 175 + }, + { + "epoch": 0.012243904135796027, + "grad_norm": 4.71875, + "learning_rate": 0.0008148148148148148, + "loss": 3.7743, + "step": 176 + }, + { + "epoch": 0.012313471772931232, + "grad_norm": 3.078125, + "learning_rate": 0.0008194444444444445, + "loss": 3.494, + "step": 177 + }, + { + "epoch": 0.012383039410066438, + "grad_norm": 3.328125, + "learning_rate": 0.0008240740740740741, + "loss": 3.4347, + "step": 178 + }, + { + "epoch": 0.012452607047201641, + "grad_norm": 3.015625, + "learning_rate": 0.0008287037037037038, + "loss": 3.0524, + "step": 179 + }, + { + "epoch": 0.012522174684336846, + "grad_norm": 2.046875, + "learning_rate": 0.0008333333333333334, + "loss": 3.4803, + "step": 180 + }, + { + "epoch": 0.012591742321472052, + "grad_norm": 3.234375, + "learning_rate": 0.0008379629629629629, + "loss": 3.4848, + "step": 181 + }, + { + "epoch": 0.012661309958607255, + "grad_norm": 3.28125, + "learning_rate": 0.0008425925925925926, + "loss": 3.5803, + "step": 182 + }, + { + "epoch": 0.01273087759574246, + "grad_norm": 3.015625, + "learning_rate": 0.0008472222222222222, + "loss": 3.1998, + "step": 183 + }, + { + "epoch": 0.012800445232877666, + "grad_norm": 2.78125, + "learning_rate": 0.0008518518518518519, + "loss": 4.02, + "step": 184 + }, + { + "epoch": 0.01287001287001287, + "grad_norm": 3.46875, + "learning_rate": 0.0008564814814814815, + "loss": 3.5389, + "step": 185 + }, + { + "epoch": 0.012939580507148074, + "grad_norm": 2.3125, + "learning_rate": 0.0008611111111111112, + "loss": 3.6485, + "step": 186 + }, + { + "epoch": 0.01300914814428328, + "grad_norm": 2.390625, + "learning_rate": 0.0008657407407407407, + "loss": 3.6143, + "step": 187 + }, + { + "epoch": 0.013078715781418485, + "grad_norm": 3.03125, + "learning_rate": 0.0008703703703703704, + "loss": 3.2083, + "step": 188 + }, + { + "epoch": 0.013148283418553688, + "grad_norm": 3.0625, + "learning_rate": 0.000875, + "loss": 3.7757, + "step": 189 + }, + { + "epoch": 0.013217851055688894, + "grad_norm": 3.0, + "learning_rate": 0.0008796296296296296, + "loss": 3.3781, + "step": 190 + }, + { + "epoch": 0.013287418692824099, + "grad_norm": 2.8125, + "learning_rate": 0.0008842592592592593, + "loss": 3.5871, + "step": 191 + }, + { + "epoch": 0.013356986329959302, + "grad_norm": 2.484375, + "learning_rate": 0.0008888888888888888, + "loss": 3.4771, + "step": 192 + }, + { + "epoch": 0.013426553967094508, + "grad_norm": 2.875, + "learning_rate": 0.0008935185185185185, + "loss": 3.2491, + "step": 193 + }, + { + "epoch": 0.013496121604229713, + "grad_norm": 7.21875, + "learning_rate": 0.0008981481481481481, + "loss": 3.3524, + "step": 194 + }, + { + "epoch": 0.013565689241364916, + "grad_norm": 3.15625, + "learning_rate": 0.0009027777777777778, + "loss": 3.4881, + "step": 195 + }, + { + "epoch": 0.013635256878500122, + "grad_norm": 2.921875, + "learning_rate": 0.0009074074074074074, + "loss": 3.3662, + "step": 196 + }, + { + "epoch": 0.013704824515635327, + "grad_norm": 3.015625, + "learning_rate": 0.0009120370370370372, + "loss": 3.647, + "step": 197 + }, + { + "epoch": 0.01377439215277053, + "grad_norm": 2.65625, + "learning_rate": 0.0009166666666666666, + "loss": 3.2356, + "step": 198 + }, + { + "epoch": 0.013843959789905736, + "grad_norm": 2.28125, + "learning_rate": 0.0009212962962962963, + "loss": 3.5328, + "step": 199 + }, + { + "epoch": 0.013913527427040941, + "grad_norm": 2.859375, + "learning_rate": 0.000925925925925926, + "loss": 3.576, + "step": 200 + }, + { + "epoch": 0.013983095064176145, + "grad_norm": 3.0625, + "learning_rate": 0.0009305555555555556, + "loss": 3.3183, + "step": 201 + }, + { + "epoch": 0.01405266270131135, + "grad_norm": 2.046875, + "learning_rate": 0.0009351851851851853, + "loss": 3.4818, + "step": 202 + }, + { + "epoch": 0.014122230338446555, + "grad_norm": 2.1875, + "learning_rate": 0.0009398148148148148, + "loss": 3.5219, + "step": 203 + }, + { + "epoch": 0.014191797975581759, + "grad_norm": 2.6875, + "learning_rate": 0.0009444444444444445, + "loss": 3.775, + "step": 204 + }, + { + "epoch": 0.014261365612716964, + "grad_norm": 2.296875, + "learning_rate": 0.0009490740740740741, + "loss": 3.4462, + "step": 205 + }, + { + "epoch": 0.014330933249852169, + "grad_norm": 2.359375, + "learning_rate": 0.0009537037037037038, + "loss": 3.4381, + "step": 206 + }, + { + "epoch": 0.014400500886987374, + "grad_norm": 2.875, + "learning_rate": 0.0009583333333333334, + "loss": 3.0672, + "step": 207 + }, + { + "epoch": 0.014470068524122578, + "grad_norm": 1.953125, + "learning_rate": 0.0009629629629629629, + "loss": 3.1428, + "step": 208 + }, + { + "epoch": 0.014539636161257783, + "grad_norm": 2.078125, + "learning_rate": 0.0009675925925925926, + "loss": 3.4844, + "step": 209 + }, + { + "epoch": 0.014609203798392988, + "grad_norm": 3.078125, + "learning_rate": 0.0009722222222222222, + "loss": 3.3521, + "step": 210 + }, + { + "epoch": 0.014678771435528192, + "grad_norm": 3.109375, + "learning_rate": 0.0009768518518518518, + "loss": 3.4093, + "step": 211 + }, + { + "epoch": 0.014748339072663397, + "grad_norm": 2.390625, + "learning_rate": 0.0009814814814814816, + "loss": 3.8329, + "step": 212 + }, + { + "epoch": 0.014817906709798602, + "grad_norm": 3.28125, + "learning_rate": 0.0009861111111111112, + "loss": 3.2119, + "step": 213 + }, + { + "epoch": 0.014887474346933806, + "grad_norm": 2.8125, + "learning_rate": 0.0009907407407407408, + "loss": 3.1916, + "step": 214 + }, + { + "epoch": 0.014957041984069011, + "grad_norm": 5.3125, + "learning_rate": 0.0009953703703703704, + "loss": 3.7224, + "step": 215 + }, + { + "epoch": 0.015026609621204216, + "grad_norm": 2.8125, + "learning_rate": 0.001, + "loss": 3.3947, + "step": 216 + }, + { + "epoch": 0.01509617725833942, + "grad_norm": 4.40625, + "learning_rate": 0.0010046296296296296, + "loss": 3.8063, + "step": 217 + }, + { + "epoch": 0.015165744895474625, + "grad_norm": 3.34375, + "learning_rate": 0.0010092592592592592, + "loss": 3.4035, + "step": 218 + }, + { + "epoch": 0.01523531253260983, + "grad_norm": 2.921875, + "learning_rate": 0.0010138888888888888, + "loss": 3.2933, + "step": 219 + }, + { + "epoch": 0.015304880169745034, + "grad_norm": 2.0625, + "learning_rate": 0.0010185185185185186, + "loss": 3.5955, + "step": 220 + }, + { + "epoch": 0.015374447806880239, + "grad_norm": 3.265625, + "learning_rate": 0.0010231481481481482, + "loss": 3.4307, + "step": 221 + }, + { + "epoch": 0.015444015444015444, + "grad_norm": 4.0625, + "learning_rate": 0.0010277777777777776, + "loss": 3.0483, + "step": 222 + }, + { + "epoch": 0.015513583081150648, + "grad_norm": 2.3125, + "learning_rate": 0.0010324074074074074, + "loss": 3.3634, + "step": 223 + }, + { + "epoch": 0.015583150718285853, + "grad_norm": 3.78125, + "learning_rate": 0.001037037037037037, + "loss": 3.6414, + "step": 224 + }, + { + "epoch": 0.015652718355421057, + "grad_norm": 2.6875, + "learning_rate": 0.0010416666666666669, + "loss": 3.4195, + "step": 225 + }, + { + "epoch": 0.015722285992556262, + "grad_norm": 1.75, + "learning_rate": 0.0010462962962962963, + "loss": 3.9841, + "step": 226 + }, + { + "epoch": 0.015791853629691467, + "grad_norm": 2.625, + "learning_rate": 0.0010509259259259259, + "loss": 3.6758, + "step": 227 + }, + { + "epoch": 0.015861421266826672, + "grad_norm": 2.53125, + "learning_rate": 0.0010555555555555557, + "loss": 3.6038, + "step": 228 + }, + { + "epoch": 0.015930988903961878, + "grad_norm": 2.1875, + "learning_rate": 0.001060185185185185, + "loss": 3.7388, + "step": 229 + }, + { + "epoch": 0.016000556541097083, + "grad_norm": 3.9375, + "learning_rate": 0.0010648148148148149, + "loss": 3.3876, + "step": 230 + }, + { + "epoch": 0.016070124178232285, + "grad_norm": 4.1875, + "learning_rate": 0.0010694444444444445, + "loss": 3.3911, + "step": 231 + }, + { + "epoch": 0.01613969181536749, + "grad_norm": 3.0, + "learning_rate": 0.0010740740740740743, + "loss": 3.7106, + "step": 232 + }, + { + "epoch": 0.016209259452502695, + "grad_norm": 4.28125, + "learning_rate": 0.0010787037037037037, + "loss": 3.7648, + "step": 233 + }, + { + "epoch": 0.0162788270896379, + "grad_norm": 2.28125, + "learning_rate": 0.0010833333333333333, + "loss": 3.1461, + "step": 234 + }, + { + "epoch": 0.016348394726773106, + "grad_norm": 2.359375, + "learning_rate": 0.001087962962962963, + "loss": 3.7154, + "step": 235 + }, + { + "epoch": 0.01641796236390831, + "grad_norm": 3.453125, + "learning_rate": 0.0010925925925925925, + "loss": 3.5562, + "step": 236 + }, + { + "epoch": 0.016487530001043516, + "grad_norm": 3.296875, + "learning_rate": 0.0010972222222222223, + "loss": 3.2213, + "step": 237 + }, + { + "epoch": 0.016557097638178718, + "grad_norm": 4.4375, + "learning_rate": 0.001101851851851852, + "loss": 3.7439, + "step": 238 + }, + { + "epoch": 0.016626665275313923, + "grad_norm": 2.265625, + "learning_rate": 0.0011064814814814815, + "loss": 3.4636, + "step": 239 + }, + { + "epoch": 0.01669623291244913, + "grad_norm": 2.84375, + "learning_rate": 0.0011111111111111111, + "loss": 3.4433, + "step": 240 + }, + { + "epoch": 0.016765800549584334, + "grad_norm": 3.59375, + "learning_rate": 0.0011157407407407407, + "loss": 3.5306, + "step": 241 + }, + { + "epoch": 0.01683536818671954, + "grad_norm": 4.6875, + "learning_rate": 0.0011203703703703705, + "loss": 3.5676, + "step": 242 + }, + { + "epoch": 0.016904935823854744, + "grad_norm": 3.296875, + "learning_rate": 0.0011250000000000001, + "loss": 2.8527, + "step": 243 + }, + { + "epoch": 0.016974503460989946, + "grad_norm": 1.78125, + "learning_rate": 0.0011296296296296295, + "loss": 3.7269, + "step": 244 + }, + { + "epoch": 0.01704407109812515, + "grad_norm": 2.21875, + "learning_rate": 0.0011342592592592593, + "loss": 3.1405, + "step": 245 + }, + { + "epoch": 0.017113638735260357, + "grad_norm": 2.09375, + "learning_rate": 0.001138888888888889, + "loss": 3.7365, + "step": 246 + }, + { + "epoch": 0.017183206372395562, + "grad_norm": 2.796875, + "learning_rate": 0.0011435185185185185, + "loss": 2.9634, + "step": 247 + }, + { + "epoch": 0.017252774009530767, + "grad_norm": 2.890625, + "learning_rate": 0.0011481481481481481, + "loss": 3.4298, + "step": 248 + }, + { + "epoch": 0.017322341646665972, + "grad_norm": 3.09375, + "learning_rate": 0.0011527777777777777, + "loss": 3.5184, + "step": 249 + }, + { + "epoch": 0.017391909283801174, + "grad_norm": 2.671875, + "learning_rate": 0.0011574074074074076, + "loss": 3.6976, + "step": 250 + }, + { + "epoch": 0.01746147692093638, + "grad_norm": 2.609375, + "learning_rate": 0.001162037037037037, + "loss": 3.2592, + "step": 251 + }, + { + "epoch": 0.017531044558071585, + "grad_norm": 2.328125, + "learning_rate": 0.0011666666666666668, + "loss": 3.491, + "step": 252 + }, + { + "epoch": 0.01760061219520679, + "grad_norm": 2.328125, + "learning_rate": 0.0011712962962962964, + "loss": 3.6034, + "step": 253 + }, + { + "epoch": 0.017670179832341995, + "grad_norm": 2.28125, + "learning_rate": 0.0011759259259259257, + "loss": 3.5464, + "step": 254 + }, + { + "epoch": 0.0177397474694772, + "grad_norm": 2.84375, + "learning_rate": 0.0011805555555555556, + "loss": 3.6768, + "step": 255 + }, + { + "epoch": 0.017809315106612406, + "grad_norm": 1.8359375, + "learning_rate": 0.0011851851851851852, + "loss": 3.2789, + "step": 256 + }, + { + "epoch": 0.017878882743747607, + "grad_norm": 2.703125, + "learning_rate": 0.001189814814814815, + "loss": 3.4833, + "step": 257 + }, + { + "epoch": 0.017948450380882813, + "grad_norm": 2.875, + "learning_rate": 0.0011944444444444444, + "loss": 3.3873, + "step": 258 + }, + { + "epoch": 0.018018018018018018, + "grad_norm": 3.28125, + "learning_rate": 0.0011990740740740742, + "loss": 3.7072, + "step": 259 + }, + { + "epoch": 0.018087585655153223, + "grad_norm": 2.765625, + "learning_rate": 0.0012037037037037038, + "loss": 2.9247, + "step": 260 + }, + { + "epoch": 0.01815715329228843, + "grad_norm": 2.609375, + "learning_rate": 0.0012083333333333332, + "loss": 3.7827, + "step": 261 + }, + { + "epoch": 0.018226720929423634, + "grad_norm": 2.875, + "learning_rate": 0.001212962962962963, + "loss": 3.6445, + "step": 262 + }, + { + "epoch": 0.018296288566558835, + "grad_norm": 3.1875, + "learning_rate": 0.0012175925925925926, + "loss": 3.3478, + "step": 263 + }, + { + "epoch": 0.01836585620369404, + "grad_norm": 228.0, + "learning_rate": 0.0012222222222222224, + "loss": 4.2626, + "step": 264 + }, + { + "epoch": 0.018435423840829246, + "grad_norm": 2.9375, + "learning_rate": 0.0012268518518518518, + "loss": 3.646, + "step": 265 + }, + { + "epoch": 0.01850499147796445, + "grad_norm": 2.8125, + "learning_rate": 0.0012314814814814814, + "loss": 3.4842, + "step": 266 + }, + { + "epoch": 0.018574559115099656, + "grad_norm": 2.765625, + "learning_rate": 0.0012361111111111112, + "loss": 3.6565, + "step": 267 + }, + { + "epoch": 0.01864412675223486, + "grad_norm": 3.109375, + "learning_rate": 0.0012407407407407408, + "loss": 3.2464, + "step": 268 + }, + { + "epoch": 0.018713694389370063, + "grad_norm": 2.203125, + "learning_rate": 0.0012453703703703704, + "loss": 3.9116, + "step": 269 + }, + { + "epoch": 0.01878326202650527, + "grad_norm": 4.15625, + "learning_rate": 0.00125, + "loss": 3.1034, + "step": 270 + }, + { + "epoch": 0.018852829663640474, + "grad_norm": 3.53125, + "learning_rate": 0.0012546296296296296, + "loss": 3.4661, + "step": 271 + }, + { + "epoch": 0.01892239730077568, + "grad_norm": 2.765625, + "learning_rate": 0.0012592592592592592, + "loss": 3.4787, + "step": 272 + }, + { + "epoch": 0.018991964937910884, + "grad_norm": 2.8125, + "learning_rate": 0.0012638888888888888, + "loss": 3.4316, + "step": 273 + }, + { + "epoch": 0.01906153257504609, + "grad_norm": 2.703125, + "learning_rate": 0.0012685185185185186, + "loss": 3.4963, + "step": 274 + }, + { + "epoch": 0.019131100212181295, + "grad_norm": 2.46875, + "learning_rate": 0.0012731481481481483, + "loss": 3.8052, + "step": 275 + }, + { + "epoch": 0.019200667849316497, + "grad_norm": 2.515625, + "learning_rate": 0.0012777777777777776, + "loss": 3.6617, + "step": 276 + }, + { + "epoch": 0.019270235486451702, + "grad_norm": 3.3125, + "learning_rate": 0.0012824074074074075, + "loss": 3.0172, + "step": 277 + }, + { + "epoch": 0.019339803123586907, + "grad_norm": 2.828125, + "learning_rate": 0.001287037037037037, + "loss": 3.6914, + "step": 278 + }, + { + "epoch": 0.019409370760722112, + "grad_norm": 2.484375, + "learning_rate": 0.0012916666666666669, + "loss": 3.7064, + "step": 279 + }, + { + "epoch": 0.019478938397857318, + "grad_norm": 2.53125, + "learning_rate": 0.0012962962962962963, + "loss": 3.1845, + "step": 280 + }, + { + "epoch": 0.019548506034992523, + "grad_norm": 2.375, + "learning_rate": 0.0013009259259259259, + "loss": 3.29, + "step": 281 + }, + { + "epoch": 0.019618073672127725, + "grad_norm": 2.421875, + "learning_rate": 0.0013055555555555557, + "loss": 3.5425, + "step": 282 + }, + { + "epoch": 0.01968764130926293, + "grad_norm": 2.703125, + "learning_rate": 0.001310185185185185, + "loss": 3.0504, + "step": 283 + }, + { + "epoch": 0.019757208946398135, + "grad_norm": 2.0, + "learning_rate": 0.0013148148148148149, + "loss": 3.6795, + "step": 284 + }, + { + "epoch": 0.01982677658353334, + "grad_norm": 3.3125, + "learning_rate": 0.0013194444444444445, + "loss": 3.4279, + "step": 285 + }, + { + "epoch": 0.019896344220668546, + "grad_norm": 2.53125, + "learning_rate": 0.0013240740740740743, + "loss": 3.3526, + "step": 286 + }, + { + "epoch": 0.01996591185780375, + "grad_norm": 3.21875, + "learning_rate": 0.0013287037037037037, + "loss": 3.5018, + "step": 287 + }, + { + "epoch": 0.020035479494938953, + "grad_norm": 2.453125, + "learning_rate": 0.0013333333333333333, + "loss": 3.1581, + "step": 288 + }, + { + "epoch": 0.020105047132074158, + "grad_norm": 2.84375, + "learning_rate": 0.001337962962962963, + "loss": 3.0851, + "step": 289 + }, + { + "epoch": 0.020174614769209363, + "grad_norm": 2.65625, + "learning_rate": 0.0013425925925925925, + "loss": 3.6954, + "step": 290 + }, + { + "epoch": 0.02024418240634457, + "grad_norm": 2.40625, + "learning_rate": 0.0013472222222222223, + "loss": 3.5156, + "step": 291 + }, + { + "epoch": 0.020313750043479774, + "grad_norm": 1.765625, + "learning_rate": 0.001351851851851852, + "loss": 3.3342, + "step": 292 + }, + { + "epoch": 0.02038331768061498, + "grad_norm": 2.015625, + "learning_rate": 0.0013564814814814815, + "loss": 3.4297, + "step": 293 + }, + { + "epoch": 0.020452885317750184, + "grad_norm": 2.59375, + "learning_rate": 0.0013611111111111111, + "loss": 3.461, + "step": 294 + }, + { + "epoch": 0.020522452954885386, + "grad_norm": 2.4375, + "learning_rate": 0.0013657407407407407, + "loss": 3.9659, + "step": 295 + }, + { + "epoch": 0.02059202059202059, + "grad_norm": 2.359375, + "learning_rate": 0.0013703703703703705, + "loss": 3.7741, + "step": 296 + }, + { + "epoch": 0.020661588229155797, + "grad_norm": 1.75, + "learning_rate": 0.001375, + "loss": 3.4807, + "step": 297 + }, + { + "epoch": 0.020731155866291002, + "grad_norm": 1.8125, + "learning_rate": 0.0013796296296296295, + "loss": 3.4547, + "step": 298 + }, + { + "epoch": 0.020800723503426207, + "grad_norm": 3.125, + "learning_rate": 0.0013842592592592593, + "loss": 3.5839, + "step": 299 + }, + { + "epoch": 0.020870291140561412, + "grad_norm": 2.328125, + "learning_rate": 0.001388888888888889, + "loss": 3.4602, + "step": 300 + }, + { + "epoch": 0.020939858777696614, + "grad_norm": 2.71875, + "learning_rate": 0.0013935185185185185, + "loss": 3.0079, + "step": 301 + }, + { + "epoch": 0.02100942641483182, + "grad_norm": 3.0, + "learning_rate": 0.0013981481481481481, + "loss": 3.4029, + "step": 302 + }, + { + "epoch": 0.021078994051967025, + "grad_norm": 2.890625, + "learning_rate": 0.0014027777777777777, + "loss": 3.3146, + "step": 303 + }, + { + "epoch": 0.02114856168910223, + "grad_norm": 2.46875, + "learning_rate": 0.0014074074074074076, + "loss": 3.468, + "step": 304 + }, + { + "epoch": 0.021218129326237435, + "grad_norm": 2.46875, + "learning_rate": 0.001412037037037037, + "loss": 3.3304, + "step": 305 + }, + { + "epoch": 0.02128769696337264, + "grad_norm": 1.640625, + "learning_rate": 0.0014166666666666668, + "loss": 3.6421, + "step": 306 + }, + { + "epoch": 0.021357264600507842, + "grad_norm": 3.0, + "learning_rate": 0.0014212962962962964, + "loss": 2.9303, + "step": 307 + }, + { + "epoch": 0.021426832237643047, + "grad_norm": 2.171875, + "learning_rate": 0.0014259259259259258, + "loss": 3.2054, + "step": 308 + }, + { + "epoch": 0.021496399874778253, + "grad_norm": 1.90625, + "learning_rate": 0.0014305555555555556, + "loss": 3.7044, + "step": 309 + }, + { + "epoch": 0.021565967511913458, + "grad_norm": 2.046875, + "learning_rate": 0.0014351851851851852, + "loss": 3.7959, + "step": 310 + }, + { + "epoch": 0.021635535149048663, + "grad_norm": 1.890625, + "learning_rate": 0.001439814814814815, + "loss": 3.2365, + "step": 311 + }, + { + "epoch": 0.02170510278618387, + "grad_norm": 2.328125, + "learning_rate": 0.0014444444444444444, + "loss": 3.5516, + "step": 312 + }, + { + "epoch": 0.021774670423319074, + "grad_norm": 2.421875, + "learning_rate": 0.0014490740740740742, + "loss": 3.3478, + "step": 313 + }, + { + "epoch": 0.021844238060454275, + "grad_norm": 3.625, + "learning_rate": 0.0014537037037037038, + "loss": 2.91, + "step": 314 + }, + { + "epoch": 0.02191380569758948, + "grad_norm": 2.8125, + "learning_rate": 0.0014583333333333332, + "loss": 3.3708, + "step": 315 + }, + { + "epoch": 0.021983373334724686, + "grad_norm": 3.265625, + "learning_rate": 0.001462962962962963, + "loss": 3.5365, + "step": 316 + }, + { + "epoch": 0.02205294097185989, + "grad_norm": 2.09375, + "learning_rate": 0.0014675925925925926, + "loss": 3.3751, + "step": 317 + }, + { + "epoch": 0.022122508608995096, + "grad_norm": 2.515625, + "learning_rate": 0.0014722222222222224, + "loss": 3.3161, + "step": 318 + }, + { + "epoch": 0.0221920762461303, + "grad_norm": 3.265625, + "learning_rate": 0.0014768518518518518, + "loss": 3.3045, + "step": 319 + }, + { + "epoch": 0.022261643883265504, + "grad_norm": 1.953125, + "learning_rate": 0.0014814814814814814, + "loss": 3.5489, + "step": 320 + }, + { + "epoch": 0.02233121152040071, + "grad_norm": 3.015625, + "learning_rate": 0.0014861111111111112, + "loss": 3.5659, + "step": 321 + }, + { + "epoch": 0.022400779157535914, + "grad_norm": 3.34375, + "learning_rate": 0.0014907407407407408, + "loss": 3.2205, + "step": 322 + }, + { + "epoch": 0.02247034679467112, + "grad_norm": 2.640625, + "learning_rate": 0.0014953703703703704, + "loss": 3.3073, + "step": 323 + }, + { + "epoch": 0.022539914431806325, + "grad_norm": 2.171875, + "learning_rate": 0.0015, + "loss": 3.6127, + "step": 324 + }, + { + "epoch": 0.02260948206894153, + "grad_norm": 2.703125, + "learning_rate": 0.0015046296296296296, + "loss": 3.2646, + "step": 325 + }, + { + "epoch": 0.02267904970607673, + "grad_norm": 2.203125, + "learning_rate": 0.0015092592592592592, + "loss": 3.4497, + "step": 326 + }, + { + "epoch": 0.022748617343211937, + "grad_norm": 2.796875, + "learning_rate": 0.0015138888888888888, + "loss": 3.2017, + "step": 327 + }, + { + "epoch": 0.022818184980347142, + "grad_norm": 2.5, + "learning_rate": 0.0015185185185185187, + "loss": 3.5135, + "step": 328 + }, + { + "epoch": 0.022887752617482347, + "grad_norm": 2.4375, + "learning_rate": 0.0015231481481481483, + "loss": 3.2238, + "step": 329 + }, + { + "epoch": 0.022957320254617553, + "grad_norm": 2.0, + "learning_rate": 0.0015277777777777776, + "loss": 2.8799, + "step": 330 + }, + { + "epoch": 0.023026887891752758, + "grad_norm": 2.703125, + "learning_rate": 0.0015324074074074075, + "loss": 3.4803, + "step": 331 + }, + { + "epoch": 0.023096455528887963, + "grad_norm": 2.4375, + "learning_rate": 0.001537037037037037, + "loss": 3.3901, + "step": 332 + }, + { + "epoch": 0.023166023166023165, + "grad_norm": 2.25, + "learning_rate": 0.0015416666666666669, + "loss": 3.2674, + "step": 333 + }, + { + "epoch": 0.02323559080315837, + "grad_norm": 2.09375, + "learning_rate": 0.0015462962962962963, + "loss": 3.6683, + "step": 334 + }, + { + "epoch": 0.023305158440293575, + "grad_norm": 3.0625, + "learning_rate": 0.0015509259259259259, + "loss": 3.4703, + "step": 335 + }, + { + "epoch": 0.02337472607742878, + "grad_norm": 1.65625, + "learning_rate": 0.0015555555555555557, + "loss": 3.2555, + "step": 336 + }, + { + "epoch": 0.023444293714563986, + "grad_norm": 2.21875, + "learning_rate": 0.001560185185185185, + "loss": 3.6211, + "step": 337 + }, + { + "epoch": 0.02351386135169919, + "grad_norm": 2.421875, + "learning_rate": 0.0015648148148148149, + "loss": 3.5455, + "step": 338 + }, + { + "epoch": 0.023583428988834393, + "grad_norm": 2.25, + "learning_rate": 0.0015694444444444445, + "loss": 3.6748, + "step": 339 + }, + { + "epoch": 0.023652996625969598, + "grad_norm": 1.546875, + "learning_rate": 0.0015740740740740743, + "loss": 3.453, + "step": 340 + }, + { + "epoch": 0.023722564263104803, + "grad_norm": 1.9921875, + "learning_rate": 0.0015787037037037037, + "loss": 3.3692, + "step": 341 + }, + { + "epoch": 0.02379213190024001, + "grad_norm": 2.53125, + "learning_rate": 0.0015833333333333333, + "loss": 2.9392, + "step": 342 + }, + { + "epoch": 0.023861699537375214, + "grad_norm": 1.96875, + "learning_rate": 0.0015879629629629631, + "loss": 3.5069, + "step": 343 + }, + { + "epoch": 0.02393126717451042, + "grad_norm": 1.59375, + "learning_rate": 0.0015925925925925925, + "loss": 3.5041, + "step": 344 + }, + { + "epoch": 0.02400083481164562, + "grad_norm": 2.25, + "learning_rate": 0.0015972222222222223, + "loss": 3.8914, + "step": 345 + }, + { + "epoch": 0.024070402448780826, + "grad_norm": 2.328125, + "learning_rate": 0.001601851851851852, + "loss": 3.3932, + "step": 346 + }, + { + "epoch": 0.02413997008591603, + "grad_norm": 2.609375, + "learning_rate": 0.0016064814814814815, + "loss": 3.2436, + "step": 347 + }, + { + "epoch": 0.024209537723051237, + "grad_norm": 2.5625, + "learning_rate": 0.0016111111111111111, + "loss": 3.4229, + "step": 348 + }, + { + "epoch": 0.024279105360186442, + "grad_norm": 2.109375, + "learning_rate": 0.0016157407407407407, + "loss": 3.5363, + "step": 349 + }, + { + "epoch": 0.024348672997321647, + "grad_norm": 3.71875, + "learning_rate": 0.0016203703703703705, + "loss": 3.1235, + "step": 350 + }, + { + "epoch": 0.02441824063445685, + "grad_norm": 3.140625, + "learning_rate": 0.0016250000000000001, + "loss": 3.0492, + "step": 351 + }, + { + "epoch": 0.024487808271592054, + "grad_norm": 2.09375, + "learning_rate": 0.0016296296296296295, + "loss": 3.3601, + "step": 352 + }, + { + "epoch": 0.02455737590872726, + "grad_norm": 2.03125, + "learning_rate": 0.0016342592592592593, + "loss": 4.0213, + "step": 353 + }, + { + "epoch": 0.024626943545862465, + "grad_norm": 6.1875, + "learning_rate": 0.001638888888888889, + "loss": 3.2648, + "step": 354 + }, + { + "epoch": 0.02469651118299767, + "grad_norm": 2.875, + "learning_rate": 0.0016435185185185185, + "loss": 3.5466, + "step": 355 + }, + { + "epoch": 0.024766078820132875, + "grad_norm": 2.125, + "learning_rate": 0.0016481481481481482, + "loss": 3.611, + "step": 356 + }, + { + "epoch": 0.02483564645726808, + "grad_norm": 2.515625, + "learning_rate": 0.0016527777777777778, + "loss": 3.8875, + "step": 357 + }, + { + "epoch": 0.024905214094403282, + "grad_norm": 3.03125, + "learning_rate": 0.0016574074074074076, + "loss": 3.0971, + "step": 358 + }, + { + "epoch": 0.024974781731538487, + "grad_norm": 2.078125, + "learning_rate": 0.001662037037037037, + "loss": 3.4832, + "step": 359 + }, + { + "epoch": 0.025044349368673693, + "grad_norm": 1.890625, + "learning_rate": 0.0016666666666666668, + "loss": 3.4558, + "step": 360 + }, + { + "epoch": 0.025113917005808898, + "grad_norm": 2.109375, + "learning_rate": 0.0016712962962962964, + "loss": 3.7369, + "step": 361 + }, + { + "epoch": 0.025183484642944103, + "grad_norm": 3.15625, + "learning_rate": 0.0016759259259259258, + "loss": 3.5278, + "step": 362 + }, + { + "epoch": 0.02525305228007931, + "grad_norm": 2.875, + "learning_rate": 0.0016805555555555556, + "loss": 3.575, + "step": 363 + }, + { + "epoch": 0.02532261991721451, + "grad_norm": 2.34375, + "learning_rate": 0.0016851851851851852, + "loss": 3.7479, + "step": 364 + }, + { + "epoch": 0.025392187554349716, + "grad_norm": 3.015625, + "learning_rate": 0.001689814814814815, + "loss": 3.2374, + "step": 365 + }, + { + "epoch": 0.02546175519148492, + "grad_norm": 2.765625, + "learning_rate": 0.0016944444444444444, + "loss": 3.479, + "step": 366 + }, + { + "epoch": 0.025531322828620126, + "grad_norm": 2.5625, + "learning_rate": 0.0016990740740740742, + "loss": 3.6505, + "step": 367 + }, + { + "epoch": 0.02560089046575533, + "grad_norm": 1.6953125, + "learning_rate": 0.0017037037037037038, + "loss": 3.5518, + "step": 368 + }, + { + "epoch": 0.025670458102890537, + "grad_norm": 1.8203125, + "learning_rate": 0.0017083333333333332, + "loss": 3.5959, + "step": 369 + }, + { + "epoch": 0.02574002574002574, + "grad_norm": 1.9140625, + "learning_rate": 0.001712962962962963, + "loss": 3.6952, + "step": 370 + }, + { + "epoch": 0.025809593377160944, + "grad_norm": 2.09375, + "learning_rate": 0.0017175925925925926, + "loss": 3.4196, + "step": 371 + }, + { + "epoch": 0.02587916101429615, + "grad_norm": 2.109375, + "learning_rate": 0.0017222222222222224, + "loss": 3.3174, + "step": 372 + }, + { + "epoch": 0.025948728651431354, + "grad_norm": 1.7734375, + "learning_rate": 0.0017268518518518518, + "loss": 3.5334, + "step": 373 + }, + { + "epoch": 0.02601829628856656, + "grad_norm": 2.515625, + "learning_rate": 0.0017314814814814814, + "loss": 3.6951, + "step": 374 + }, + { + "epoch": 0.026087863925701765, + "grad_norm": 2.125, + "learning_rate": 0.0017361111111111112, + "loss": 3.0378, + "step": 375 + }, + { + "epoch": 0.02615743156283697, + "grad_norm": 1.9921875, + "learning_rate": 0.0017407407407407408, + "loss": 3.1484, + "step": 376 + }, + { + "epoch": 0.02622699919997217, + "grad_norm": 2.640625, + "learning_rate": 0.0017453703703703704, + "loss": 3.3457, + "step": 377 + }, + { + "epoch": 0.026296566837107377, + "grad_norm": 1.96875, + "learning_rate": 0.00175, + "loss": 3.4347, + "step": 378 + }, + { + "epoch": 0.026366134474242582, + "grad_norm": 1.6328125, + "learning_rate": 0.0017546296296296296, + "loss": 3.3564, + "step": 379 + }, + { + "epoch": 0.026435702111377787, + "grad_norm": 3.046875, + "learning_rate": 0.0017592592592592592, + "loss": 3.223, + "step": 380 + }, + { + "epoch": 0.026505269748512993, + "grad_norm": 1.53125, + "learning_rate": 0.0017638888888888888, + "loss": 3.593, + "step": 381 + }, + { + "epoch": 0.026574837385648198, + "grad_norm": 1.5234375, + "learning_rate": 0.0017685185185185187, + "loss": 3.6302, + "step": 382 + }, + { + "epoch": 0.0266444050227834, + "grad_norm": 1.875, + "learning_rate": 0.0017731481481481483, + "loss": 3.5909, + "step": 383 + }, + { + "epoch": 0.026713972659918605, + "grad_norm": 2.03125, + "learning_rate": 0.0017777777777777776, + "loss": 3.5074, + "step": 384 + }, + { + "epoch": 0.02678354029705381, + "grad_norm": 1.390625, + "learning_rate": 0.0017824074074074075, + "loss": 3.8452, + "step": 385 + }, + { + "epoch": 0.026853107934189015, + "grad_norm": 1.640625, + "learning_rate": 0.001787037037037037, + "loss": 3.5292, + "step": 386 + }, + { + "epoch": 0.02692267557132422, + "grad_norm": 1.5546875, + "learning_rate": 0.0017916666666666669, + "loss": 3.3985, + "step": 387 + }, + { + "epoch": 0.026992243208459426, + "grad_norm": 1.453125, + "learning_rate": 0.0017962962962962963, + "loss": 3.6987, + "step": 388 + }, + { + "epoch": 0.027061810845594628, + "grad_norm": 2.9375, + "learning_rate": 0.0018009259259259259, + "loss": 3.6334, + "step": 389 + }, + { + "epoch": 0.027131378482729833, + "grad_norm": 1.6015625, + "learning_rate": 0.0018055555555555557, + "loss": 3.3575, + "step": 390 + }, + { + "epoch": 0.027200946119865038, + "grad_norm": 1.953125, + "learning_rate": 0.001810185185185185, + "loss": 3.7009, + "step": 391 + }, + { + "epoch": 0.027270513757000243, + "grad_norm": 1.5078125, + "learning_rate": 0.001814814814814815, + "loss": 3.602, + "step": 392 + }, + { + "epoch": 0.02734008139413545, + "grad_norm": 1.6953125, + "learning_rate": 0.0018194444444444445, + "loss": 3.6099, + "step": 393 + }, + { + "epoch": 0.027409649031270654, + "grad_norm": 2.0625, + "learning_rate": 0.0018240740740740743, + "loss": 3.3514, + "step": 394 + }, + { + "epoch": 0.02747921666840586, + "grad_norm": 1.5078125, + "learning_rate": 0.0018287037037037037, + "loss": 3.6098, + "step": 395 + }, + { + "epoch": 0.02754878430554106, + "grad_norm": 2.03125, + "learning_rate": 0.0018333333333333333, + "loss": 3.5332, + "step": 396 + }, + { + "epoch": 0.027618351942676266, + "grad_norm": 2.875, + "learning_rate": 0.0018379629629629631, + "loss": 3.3918, + "step": 397 + }, + { + "epoch": 0.02768791957981147, + "grad_norm": 1.828125, + "learning_rate": 0.0018425925925925925, + "loss": 3.2549, + "step": 398 + }, + { + "epoch": 0.027757487216946677, + "grad_norm": 2.921875, + "learning_rate": 0.0018472222222222223, + "loss": 3.378, + "step": 399 + }, + { + "epoch": 0.027827054854081882, + "grad_norm": 1.7421875, + "learning_rate": 0.001851851851851852, + "loss": 3.3514, + "step": 400 + }, + { + "epoch": 0.027896622491217087, + "grad_norm": 1.875, + "learning_rate": 0.0018564814814814815, + "loss": 3.163, + "step": 401 + }, + { + "epoch": 0.02796619012835229, + "grad_norm": 1.5859375, + "learning_rate": 0.0018611111111111111, + "loss": 3.3832, + "step": 402 + }, + { + "epoch": 0.028035757765487494, + "grad_norm": 1.296875, + "learning_rate": 0.0018657407407407407, + "loss": 3.3657, + "step": 403 + }, + { + "epoch": 0.0281053254026227, + "grad_norm": 2.15625, + "learning_rate": 0.0018703703703703705, + "loss": 3.7129, + "step": 404 + }, + { + "epoch": 0.028174893039757905, + "grad_norm": 1.609375, + "learning_rate": 0.001875, + "loss": 3.3413, + "step": 405 + }, + { + "epoch": 0.02824446067689311, + "grad_norm": 2.796875, + "learning_rate": 0.0018796296296296295, + "loss": 3.5818, + "step": 406 + }, + { + "epoch": 0.028314028314028315, + "grad_norm": 2.015625, + "learning_rate": 0.0018842592592592594, + "loss": 3.2641, + "step": 407 + }, + { + "epoch": 0.028383595951163517, + "grad_norm": 2.3125, + "learning_rate": 0.001888888888888889, + "loss": 3.4429, + "step": 408 + }, + { + "epoch": 0.028453163588298722, + "grad_norm": 2.0, + "learning_rate": 0.0018935185185185186, + "loss": 3.2141, + "step": 409 + }, + { + "epoch": 0.028522731225433928, + "grad_norm": 1.8515625, + "learning_rate": 0.0018981481481481482, + "loss": 3.4574, + "step": 410 + }, + { + "epoch": 0.028592298862569133, + "grad_norm": 1.734375, + "learning_rate": 0.0019027777777777778, + "loss": 3.148, + "step": 411 + }, + { + "epoch": 0.028661866499704338, + "grad_norm": 1.9609375, + "learning_rate": 0.0019074074074074076, + "loss": 3.7982, + "step": 412 + }, + { + "epoch": 0.028731434136839543, + "grad_norm": 1.6171875, + "learning_rate": 0.001912037037037037, + "loss": 3.3567, + "step": 413 + }, + { + "epoch": 0.02880100177397475, + "grad_norm": 3.046875, + "learning_rate": 0.0019166666666666668, + "loss": 3.6453, + "step": 414 + }, + { + "epoch": 0.02887056941110995, + "grad_norm": 2.1875, + "learning_rate": 0.0019212962962962964, + "loss": 3.4031, + "step": 415 + }, + { + "epoch": 0.028940137048245156, + "grad_norm": 1.6796875, + "learning_rate": 0.0019259259259259258, + "loss": 3.6185, + "step": 416 + }, + { + "epoch": 0.02900970468538036, + "grad_norm": 1.8828125, + "learning_rate": 0.0019305555555555556, + "loss": 3.0648, + "step": 417 + }, + { + "epoch": 0.029079272322515566, + "grad_norm": 1.7734375, + "learning_rate": 0.0019351851851851852, + "loss": 3.3076, + "step": 418 + }, + { + "epoch": 0.02914883995965077, + "grad_norm": 2.0, + "learning_rate": 0.001939814814814815, + "loss": 3.5246, + "step": 419 + }, + { + "epoch": 0.029218407596785977, + "grad_norm": 2.921875, + "learning_rate": 0.0019444444444444444, + "loss": 3.6498, + "step": 420 + }, + { + "epoch": 0.02928797523392118, + "grad_norm": 2.828125, + "learning_rate": 0.0019490740740740742, + "loss": 3.2355, + "step": 421 + }, + { + "epoch": 0.029357542871056384, + "grad_norm": 2.8125, + "learning_rate": 0.0019537037037037036, + "loss": 3.9044, + "step": 422 + }, + { + "epoch": 0.02942711050819159, + "grad_norm": 2.390625, + "learning_rate": 0.001958333333333333, + "loss": 3.5943, + "step": 423 + }, + { + "epoch": 0.029496678145326794, + "grad_norm": 1.6640625, + "learning_rate": 0.0019629629629629632, + "loss": 3.7883, + "step": 424 + }, + { + "epoch": 0.029566245782462, + "grad_norm": 1.8203125, + "learning_rate": 0.0019675925925925924, + "loss": 3.5543, + "step": 425 + }, + { + "epoch": 0.029635813419597205, + "grad_norm": 1.5, + "learning_rate": 0.0019722222222222224, + "loss": 3.6355, + "step": 426 + }, + { + "epoch": 0.029705381056732406, + "grad_norm": 1.6796875, + "learning_rate": 0.001976851851851852, + "loss": 3.1634, + "step": 427 + }, + { + "epoch": 0.02977494869386761, + "grad_norm": 2.0625, + "learning_rate": 0.0019814814814814816, + "loss": 3.2555, + "step": 428 + }, + { + "epoch": 0.029844516331002817, + "grad_norm": 2.421875, + "learning_rate": 0.0019861111111111112, + "loss": 3.1424, + "step": 429 + }, + { + "epoch": 0.029914083968138022, + "grad_norm": 2.15625, + "learning_rate": 0.001990740740740741, + "loss": 3.5279, + "step": 430 + }, + { + "epoch": 0.029983651605273227, + "grad_norm": 3.203125, + "learning_rate": 0.0019953703703703704, + "loss": 3.0176, + "step": 431 + }, + { + "epoch": 0.030053219242408433, + "grad_norm": 3.03125, + "learning_rate": 0.002, + "loss": 3.1193, + "step": 432 + }, + { + "epoch": 0.030122786879543638, + "grad_norm": 2.828125, + "learning_rate": 0.00199999997461252, + "loss": 3.2754, + "step": 433 + }, + { + "epoch": 0.03019235451667884, + "grad_norm": 2.125, + "learning_rate": 0.0019999998984500823, + "loss": 3.5648, + "step": 434 + }, + { + "epoch": 0.030261922153814045, + "grad_norm": 2.125, + "learning_rate": 0.0019999997715126894, + "loss": 3.3541, + "step": 435 + }, + { + "epoch": 0.03033148979094925, + "grad_norm": 2.078125, + "learning_rate": 0.001999999593800349, + "loss": 3.3309, + "step": 436 + }, + { + "epoch": 0.030401057428084455, + "grad_norm": 2.890625, + "learning_rate": 0.001999999365313069, + "loss": 3.3605, + "step": 437 + }, + { + "epoch": 0.03047062506521966, + "grad_norm": 1.921875, + "learning_rate": 0.0019999990860508623, + "loss": 3.3191, + "step": 438 + }, + { + "epoch": 0.030540192702354866, + "grad_norm": 2.25, + "learning_rate": 0.001999998756013742, + "loss": 3.1285, + "step": 439 + }, + { + "epoch": 0.030609760339490068, + "grad_norm": 2.984375, + "learning_rate": 0.001999998375201725, + "loss": 3.0919, + "step": 440 + }, + { + "epoch": 0.030679327976625273, + "grad_norm": 1.6328125, + "learning_rate": 0.001999997943614831, + "loss": 3.4453, + "step": 441 + }, + { + "epoch": 0.030748895613760478, + "grad_norm": 1.78125, + "learning_rate": 0.001999997461253082, + "loss": 3.4458, + "step": 442 + }, + { + "epoch": 0.030818463250895684, + "grad_norm": 2.3125, + "learning_rate": 0.001999996928116502, + "loss": 3.7504, + "step": 443 + }, + { + "epoch": 0.03088803088803089, + "grad_norm": 1.6171875, + "learning_rate": 0.001999996344205119, + "loss": 3.333, + "step": 444 + }, + { + "epoch": 0.030957598525166094, + "grad_norm": 1.9921875, + "learning_rate": 0.0019999957095189615, + "loss": 3.4064, + "step": 445 + }, + { + "epoch": 0.031027166162301296, + "grad_norm": 1.875, + "learning_rate": 0.001999995024058062, + "loss": 3.095, + "step": 446 + }, + { + "epoch": 0.0310967337994365, + "grad_norm": 1.9609375, + "learning_rate": 0.001999994287822456, + "loss": 3.4324, + "step": 447 + }, + { + "epoch": 0.031166301436571706, + "grad_norm": 2.34375, + "learning_rate": 0.00199999350081218, + "loss": 2.9063, + "step": 448 + }, + { + "epoch": 0.03123586907370691, + "grad_norm": 1.6875, + "learning_rate": 0.001999992663027275, + "loss": 3.5043, + "step": 449 + }, + { + "epoch": 0.03130543671084211, + "grad_norm": 1.9140625, + "learning_rate": 0.0019999917744677824, + "loss": 3.4375, + "step": 450 + }, + { + "epoch": 0.03137500434797732, + "grad_norm": 2.5625, + "learning_rate": 0.001999990835133748, + "loss": 3.3257, + "step": 451 + }, + { + "epoch": 0.031444571985112524, + "grad_norm": 1.8515625, + "learning_rate": 0.001999989845025219, + "loss": 3.6367, + "step": 452 + }, + { + "epoch": 0.03151413962224773, + "grad_norm": 1.4765625, + "learning_rate": 0.001999988804142246, + "loss": 3.3612, + "step": 453 + }, + { + "epoch": 0.031583707259382934, + "grad_norm": 1.859375, + "learning_rate": 0.0019999877124848822, + "loss": 3.1522, + "step": 454 + }, + { + "epoch": 0.03165327489651814, + "grad_norm": 2.15625, + "learning_rate": 0.0019999865700531826, + "loss": 3.3392, + "step": 455 + }, + { + "epoch": 0.031722842533653345, + "grad_norm": 1.671875, + "learning_rate": 0.001999985376847205, + "loss": 3.2132, + "step": 456 + }, + { + "epoch": 0.03179241017078855, + "grad_norm": 1.4375, + "learning_rate": 0.0019999841328670106, + "loss": 2.9862, + "step": 457 + }, + { + "epoch": 0.031861977807923755, + "grad_norm": 1.6171875, + "learning_rate": 0.001999982838112662, + "loss": 3.2235, + "step": 458 + }, + { + "epoch": 0.03193154544505896, + "grad_norm": 2.1875, + "learning_rate": 0.0019999814925842256, + "loss": 3.1465, + "step": 459 + }, + { + "epoch": 0.032001113082194166, + "grad_norm": 2.03125, + "learning_rate": 0.0019999800962817687, + "loss": 3.6032, + "step": 460 + }, + { + "epoch": 0.03207068071932937, + "grad_norm": 1.7109375, + "learning_rate": 0.0019999786492053634, + "loss": 3.2446, + "step": 461 + }, + { + "epoch": 0.03214024835646457, + "grad_norm": 2.140625, + "learning_rate": 0.001999977151355082, + "loss": 3.5569, + "step": 462 + }, + { + "epoch": 0.032209815993599775, + "grad_norm": 1.796875, + "learning_rate": 0.001999975602731001, + "loss": 3.2789, + "step": 463 + }, + { + "epoch": 0.03227938363073498, + "grad_norm": 1.7109375, + "learning_rate": 0.0019999740033332, + "loss": 3.1285, + "step": 464 + }, + { + "epoch": 0.032348951267870185, + "grad_norm": 1.453125, + "learning_rate": 0.0019999723531617586, + "loss": 3.2197, + "step": 465 + }, + { + "epoch": 0.03241851890500539, + "grad_norm": 1.9375, + "learning_rate": 0.0019999706522167617, + "loss": 3.6819, + "step": 466 + }, + { + "epoch": 0.032488086542140596, + "grad_norm": 1.9296875, + "learning_rate": 0.0019999689004982953, + "loss": 3.6977, + "step": 467 + }, + { + "epoch": 0.0325576541792758, + "grad_norm": 1.6484375, + "learning_rate": 0.001999967098006448, + "loss": 3.4413, + "step": 468 + }, + { + "epoch": 0.032627221816411006, + "grad_norm": 1.953125, + "learning_rate": 0.0019999652447413117, + "loss": 3.381, + "step": 469 + }, + { + "epoch": 0.03269678945354621, + "grad_norm": 1.6875, + "learning_rate": 0.0019999633407029806, + "loss": 3.3125, + "step": 470 + }, + { + "epoch": 0.03276635709068142, + "grad_norm": 2.125, + "learning_rate": 0.0019999613858915515, + "loss": 3.4642, + "step": 471 + }, + { + "epoch": 0.03283592472781662, + "grad_norm": 1.4140625, + "learning_rate": 0.0019999593803071234, + "loss": 3.5275, + "step": 472 + }, + { + "epoch": 0.03290549236495183, + "grad_norm": 1.6484375, + "learning_rate": 0.0019999573239497977, + "loss": 3.3881, + "step": 473 + }, + { + "epoch": 0.03297506000208703, + "grad_norm": 2.0625, + "learning_rate": 0.0019999552168196797, + "loss": 3.446, + "step": 474 + }, + { + "epoch": 0.03304462763922223, + "grad_norm": 1.8359375, + "learning_rate": 0.0019999530589168753, + "loss": 3.4311, + "step": 475 + }, + { + "epoch": 0.033114195276357436, + "grad_norm": 2.484375, + "learning_rate": 0.001999950850241495, + "loss": 3.626, + "step": 476 + }, + { + "epoch": 0.03318376291349264, + "grad_norm": 1.890625, + "learning_rate": 0.001999948590793651, + "loss": 3.486, + "step": 477 + }, + { + "epoch": 0.033253330550627846, + "grad_norm": 1.40625, + "learning_rate": 0.0019999462805734575, + "loss": 3.8674, + "step": 478 + }, + { + "epoch": 0.03332289818776305, + "grad_norm": 2.0, + "learning_rate": 0.0019999439195810317, + "loss": 3.5158, + "step": 479 + }, + { + "epoch": 0.03339246582489826, + "grad_norm": 2.015625, + "learning_rate": 0.0019999415078164945, + "loss": 3.1666, + "step": 480 + }, + { + "epoch": 0.03346203346203346, + "grad_norm": 1.46875, + "learning_rate": 0.001999939045279967, + "loss": 3.6701, + "step": 481 + }, + { + "epoch": 0.03353160109916867, + "grad_norm": 1.390625, + "learning_rate": 0.0019999365319715748, + "loss": 3.4038, + "step": 482 + }, + { + "epoch": 0.03360116873630387, + "grad_norm": 2.109375, + "learning_rate": 0.0019999339678914456, + "loss": 3.3869, + "step": 483 + }, + { + "epoch": 0.03367073637343908, + "grad_norm": 1.8671875, + "learning_rate": 0.00199993135303971, + "loss": 3.4404, + "step": 484 + }, + { + "epoch": 0.03374030401057428, + "grad_norm": 1.65625, + "learning_rate": 0.0019999286874165, + "loss": 3.2973, + "step": 485 + }, + { + "epoch": 0.03380987164770949, + "grad_norm": 2.171875, + "learning_rate": 0.0019999259710219513, + "loss": 3.4502, + "step": 486 + }, + { + "epoch": 0.03387943928484469, + "grad_norm": 1.484375, + "learning_rate": 0.0019999232038562013, + "loss": 3.3338, + "step": 487 + }, + { + "epoch": 0.03394900692197989, + "grad_norm": 1.25, + "learning_rate": 0.0019999203859193916, + "loss": 3.5657, + "step": 488 + }, + { + "epoch": 0.0340185745591151, + "grad_norm": 1.7578125, + "learning_rate": 0.0019999175172116645, + "loss": 3.6222, + "step": 489 + }, + { + "epoch": 0.0340881421962503, + "grad_norm": 2.203125, + "learning_rate": 0.0019999145977331657, + "loss": 3.5801, + "step": 490 + }, + { + "epoch": 0.03415770983338551, + "grad_norm": 2.203125, + "learning_rate": 0.001999911627484044, + "loss": 3.0592, + "step": 491 + }, + { + "epoch": 0.03422727747052071, + "grad_norm": 2.1875, + "learning_rate": 0.0019999086064644493, + "loss": 3.1636, + "step": 492 + }, + { + "epoch": 0.03429684510765592, + "grad_norm": 1.4140625, + "learning_rate": 0.0019999055346745357, + "loss": 3.2533, + "step": 493 + }, + { + "epoch": 0.034366412744791124, + "grad_norm": 1.859375, + "learning_rate": 0.0019999024121144585, + "loss": 2.8686, + "step": 494 + }, + { + "epoch": 0.03443598038192633, + "grad_norm": 2.203125, + "learning_rate": 0.001999899238784377, + "loss": 3.1926, + "step": 495 + }, + { + "epoch": 0.034505548019061534, + "grad_norm": 1.8359375, + "learning_rate": 0.0019998960146844526, + "loss": 3.454, + "step": 496 + }, + { + "epoch": 0.03457511565619674, + "grad_norm": 1.6875, + "learning_rate": 0.001999892739814848, + "loss": 3.9289, + "step": 497 + }, + { + "epoch": 0.034644683293331945, + "grad_norm": 1.46875, + "learning_rate": 0.0019998894141757297, + "loss": 3.6297, + "step": 498 + }, + { + "epoch": 0.03471425093046715, + "grad_norm": 1.640625, + "learning_rate": 0.001999886037767267, + "loss": 3.2615, + "step": 499 + }, + { + "epoch": 0.03478381856760235, + "grad_norm": 1.3515625, + "learning_rate": 0.0019998826105896306, + "loss": 3.6395, + "step": 500 + }, + { + "epoch": 0.03485338620473755, + "grad_norm": 1.6484375, + "learning_rate": 0.0019998791326429955, + "loss": 2.9141, + "step": 501 + }, + { + "epoch": 0.03492295384187276, + "grad_norm": 1.8828125, + "learning_rate": 0.001999875603927538, + "loss": 3.3799, + "step": 502 + }, + { + "epoch": 0.034992521479007964, + "grad_norm": 2.015625, + "learning_rate": 0.001999872024443437, + "loss": 3.7696, + "step": 503 + }, + { + "epoch": 0.03506208911614317, + "grad_norm": 1.7421875, + "learning_rate": 0.001999868394190874, + "loss": 3.5759, + "step": 504 + }, + { + "epoch": 0.035131656753278374, + "grad_norm": 1.8359375, + "learning_rate": 0.001999864713170034, + "loss": 3.5347, + "step": 505 + }, + { + "epoch": 0.03520122439041358, + "grad_norm": 1.3359375, + "learning_rate": 0.001999860981381103, + "loss": 3.4105, + "step": 506 + }, + { + "epoch": 0.035270792027548785, + "grad_norm": 1.6484375, + "learning_rate": 0.0019998571988242716, + "loss": 3.516, + "step": 507 + }, + { + "epoch": 0.03534035966468399, + "grad_norm": 1.34375, + "learning_rate": 0.001999853365499731, + "loss": 3.7225, + "step": 508 + }, + { + "epoch": 0.035409927301819195, + "grad_norm": 1.296875, + "learning_rate": 0.001999849481407676, + "loss": 3.3798, + "step": 509 + }, + { + "epoch": 0.0354794949389544, + "grad_norm": 1.734375, + "learning_rate": 0.0019998455465483045, + "loss": 3.2524, + "step": 510 + }, + { + "epoch": 0.035549062576089606, + "grad_norm": 1.4609375, + "learning_rate": 0.0019998415609218155, + "loss": 3.4873, + "step": 511 + }, + { + "epoch": 0.03561863021322481, + "grad_norm": 1.515625, + "learning_rate": 0.0019998375245284116, + "loss": 3.3883, + "step": 512 + }, + { + "epoch": 0.03568819785036001, + "grad_norm": 1.2734375, + "learning_rate": 0.0019998334373682977, + "loss": 3.401, + "step": 513 + }, + { + "epoch": 0.035757765487495215, + "grad_norm": 1.5703125, + "learning_rate": 0.0019998292994416814, + "loss": 2.9872, + "step": 514 + }, + { + "epoch": 0.03582733312463042, + "grad_norm": 1.8828125, + "learning_rate": 0.0019998251107487728, + "loss": 3.3022, + "step": 515 + }, + { + "epoch": 0.035896900761765625, + "grad_norm": 1.953125, + "learning_rate": 0.0019998208712897845, + "loss": 3.3426, + "step": 516 + }, + { + "epoch": 0.03596646839890083, + "grad_norm": 2.265625, + "learning_rate": 0.0019998165810649316, + "loss": 3.2612, + "step": 517 + }, + { + "epoch": 0.036036036036036036, + "grad_norm": 1.8046875, + "learning_rate": 0.0019998122400744327, + "loss": 3.2155, + "step": 518 + }, + { + "epoch": 0.03610560367317124, + "grad_norm": 1.328125, + "learning_rate": 0.001999807848318507, + "loss": 3.3988, + "step": 519 + }, + { + "epoch": 0.036175171310306446, + "grad_norm": 1.21875, + "learning_rate": 0.001999803405797379, + "loss": 3.5409, + "step": 520 + }, + { + "epoch": 0.03624473894744165, + "grad_norm": 1.609375, + "learning_rate": 0.001999798912511273, + "loss": 3.8011, + "step": 521 + }, + { + "epoch": 0.03631430658457686, + "grad_norm": 1.3671875, + "learning_rate": 0.0019997943684604176, + "loss": 3.0568, + "step": 522 + }, + { + "epoch": 0.03638387422171206, + "grad_norm": 1.6640625, + "learning_rate": 0.001999789773645043, + "loss": 3.2267, + "step": 523 + }, + { + "epoch": 0.03645344185884727, + "grad_norm": 1.5859375, + "learning_rate": 0.001999785128065384, + "loss": 3.0883, + "step": 524 + }, + { + "epoch": 0.036523009495982466, + "grad_norm": 1.546875, + "learning_rate": 0.001999780431721675, + "loss": 3.5484, + "step": 525 + }, + { + "epoch": 0.03659257713311767, + "grad_norm": 1.8203125, + "learning_rate": 0.0019997756846141545, + "loss": 3.5082, + "step": 526 + }, + { + "epoch": 0.036662144770252876, + "grad_norm": 1.359375, + "learning_rate": 0.0019997708867430645, + "loss": 3.5143, + "step": 527 + }, + { + "epoch": 0.03673171240738808, + "grad_norm": 1.4921875, + "learning_rate": 0.001999766038108648, + "loss": 3.2694, + "step": 528 + }, + { + "epoch": 0.03680128004452329, + "grad_norm": 1.9453125, + "learning_rate": 0.0019997611387111516, + "loss": 3.274, + "step": 529 + }, + { + "epoch": 0.03687084768165849, + "grad_norm": 1.53125, + "learning_rate": 0.001999756188550823, + "loss": 3.415, + "step": 530 + }, + { + "epoch": 0.0369404153187937, + "grad_norm": 2.0625, + "learning_rate": 0.001999751187627915, + "loss": 3.8245, + "step": 531 + }, + { + "epoch": 0.0370099829559289, + "grad_norm": 2.203125, + "learning_rate": 0.0019997461359426805, + "loss": 3.3096, + "step": 532 + }, + { + "epoch": 0.03707955059306411, + "grad_norm": 2.625, + "learning_rate": 0.001999741033495376, + "loss": 3.1154, + "step": 533 + }, + { + "epoch": 0.03714911823019931, + "grad_norm": 2.46875, + "learning_rate": 0.0019997358802862617, + "loss": 3.028, + "step": 534 + }, + { + "epoch": 0.03721868586733452, + "grad_norm": 2.375, + "learning_rate": 0.0019997306763155976, + "loss": 3.3044, + "step": 535 + }, + { + "epoch": 0.03728825350446972, + "grad_norm": 1.7109375, + "learning_rate": 0.001999725421583649, + "loss": 3.5187, + "step": 536 + }, + { + "epoch": 0.03735782114160493, + "grad_norm": 1.75, + "learning_rate": 0.001999720116090683, + "loss": 3.1256, + "step": 537 + }, + { + "epoch": 0.03742738877874013, + "grad_norm": 1.828125, + "learning_rate": 0.001999714759836968, + "loss": 3.4639, + "step": 538 + }, + { + "epoch": 0.03749695641587533, + "grad_norm": 1.2578125, + "learning_rate": 0.0019997093528227768, + "loss": 3.4131, + "step": 539 + }, + { + "epoch": 0.03756652405301054, + "grad_norm": 1.609375, + "learning_rate": 0.001999703895048383, + "loss": 3.2755, + "step": 540 + }, + { + "epoch": 0.03763609169014574, + "grad_norm": 1.703125, + "learning_rate": 0.0019996983865140645, + "loss": 3.0651, + "step": 541 + }, + { + "epoch": 0.03770565932728095, + "grad_norm": 1.2734375, + "learning_rate": 0.001999692827220101, + "loss": 3.7271, + "step": 542 + }, + { + "epoch": 0.03777522696441615, + "grad_norm": 1.4765625, + "learning_rate": 0.001999687217166774, + "loss": 2.8666, + "step": 543 + }, + { + "epoch": 0.03784479460155136, + "grad_norm": 2.609375, + "learning_rate": 0.0019996815563543694, + "loss": 3.3484, + "step": 544 + }, + { + "epoch": 0.037914362238686564, + "grad_norm": 1.9140625, + "learning_rate": 0.0019996758447831746, + "loss": 2.9785, + "step": 545 + }, + { + "epoch": 0.03798392987582177, + "grad_norm": 2.09375, + "learning_rate": 0.0019996700824534783, + "loss": 3.1998, + "step": 546 + }, + { + "epoch": 0.038053497512956974, + "grad_norm": 1.7734375, + "learning_rate": 0.001999664269365574, + "loss": 2.9826, + "step": 547 + }, + { + "epoch": 0.03812306515009218, + "grad_norm": 1.3515625, + "learning_rate": 0.001999658405519757, + "loss": 3.3673, + "step": 548 + }, + { + "epoch": 0.038192632787227385, + "grad_norm": 1.640625, + "learning_rate": 0.001999652490916325, + "loss": 3.7924, + "step": 549 + }, + { + "epoch": 0.03826220042436259, + "grad_norm": 1.578125, + "learning_rate": 0.001999646525555578, + "loss": 3.3943, + "step": 550 + }, + { + "epoch": 0.03833176806149779, + "grad_norm": 1.921875, + "learning_rate": 0.0019996405094378188, + "loss": 2.9449, + "step": 551 + }, + { + "epoch": 0.03840133569863299, + "grad_norm": 2.53125, + "learning_rate": 0.0019996344425633533, + "loss": 3.5443, + "step": 552 + }, + { + "epoch": 0.0384709033357682, + "grad_norm": 1.9765625, + "learning_rate": 0.0019996283249324896, + "loss": 3.2403, + "step": 553 + }, + { + "epoch": 0.038540470972903404, + "grad_norm": 1.515625, + "learning_rate": 0.0019996221565455378, + "loss": 3.493, + "step": 554 + }, + { + "epoch": 0.03861003861003861, + "grad_norm": 1.4921875, + "learning_rate": 0.0019996159374028113, + "loss": 3.4755, + "step": 555 + }, + { + "epoch": 0.038679606247173814, + "grad_norm": 1.484375, + "learning_rate": 0.0019996096675046256, + "loss": 3.5693, + "step": 556 + }, + { + "epoch": 0.03874917388430902, + "grad_norm": 1.53125, + "learning_rate": 0.0019996033468513003, + "loss": 3.2506, + "step": 557 + }, + { + "epoch": 0.038818741521444225, + "grad_norm": 1.75, + "learning_rate": 0.001999596975443155, + "loss": 3.5411, + "step": 558 + }, + { + "epoch": 0.03888830915857943, + "grad_norm": 1.7265625, + "learning_rate": 0.0019995905532805133, + "loss": 3.6029, + "step": 559 + }, + { + "epoch": 0.038957876795714635, + "grad_norm": 2.03125, + "learning_rate": 0.001999584080363702, + "loss": 3.3463, + "step": 560 + }, + { + "epoch": 0.03902744443284984, + "grad_norm": 3.0625, + "learning_rate": 0.001999577556693049, + "loss": 3.5366, + "step": 561 + }, + { + "epoch": 0.039097012069985046, + "grad_norm": 1.84375, + "learning_rate": 0.001999570982268886, + "loss": 3.3199, + "step": 562 + }, + { + "epoch": 0.039166579707120244, + "grad_norm": 1.46875, + "learning_rate": 0.001999564357091547, + "loss": 3.5079, + "step": 563 + }, + { + "epoch": 0.03923614734425545, + "grad_norm": 1.328125, + "learning_rate": 0.001999557681161368, + "loss": 3.5224, + "step": 564 + }, + { + "epoch": 0.039305714981390655, + "grad_norm": 1.59375, + "learning_rate": 0.001999550954478688, + "loss": 3.674, + "step": 565 + }, + { + "epoch": 0.03937528261852586, + "grad_norm": 1.234375, + "learning_rate": 0.0019995441770438486, + "loss": 3.3461, + "step": 566 + }, + { + "epoch": 0.039444850255661065, + "grad_norm": 1.3125, + "learning_rate": 0.001999537348857194, + "loss": 3.3256, + "step": 567 + }, + { + "epoch": 0.03951441789279627, + "grad_norm": 2.078125, + "learning_rate": 0.0019995304699190713, + "loss": 3.0955, + "step": 568 + }, + { + "epoch": 0.039583985529931476, + "grad_norm": 1.96875, + "learning_rate": 0.0019995235402298288, + "loss": 3.6332, + "step": 569 + }, + { + "epoch": 0.03965355316706668, + "grad_norm": 1.71875, + "learning_rate": 0.0019995165597898193, + "loss": 3.2571, + "step": 570 + }, + { + "epoch": 0.039723120804201886, + "grad_norm": 1.640625, + "learning_rate": 0.0019995095285993965, + "loss": 2.8634, + "step": 571 + }, + { + "epoch": 0.03979268844133709, + "grad_norm": 1.6796875, + "learning_rate": 0.001999502446658918, + "loss": 3.444, + "step": 572 + }, + { + "epoch": 0.0398622560784723, + "grad_norm": 1.265625, + "learning_rate": 0.001999495313968743, + "loss": 3.4595, + "step": 573 + }, + { + "epoch": 0.0399318237156075, + "grad_norm": 1.375, + "learning_rate": 0.0019994881305292335, + "loss": 3.588, + "step": 574 + }, + { + "epoch": 0.04000139135274271, + "grad_norm": 2.1875, + "learning_rate": 0.0019994808963407548, + "loss": 3.1029, + "step": 575 + }, + { + "epoch": 0.040070958989877906, + "grad_norm": 1.6796875, + "learning_rate": 0.001999473611403674, + "loss": 3.4616, + "step": 576 + }, + { + "epoch": 0.04014052662701311, + "grad_norm": 2.40625, + "learning_rate": 0.001999466275718361, + "loss": 3.1576, + "step": 577 + }, + { + "epoch": 0.040210094264148316, + "grad_norm": 2.453125, + "learning_rate": 0.001999458889285188, + "loss": 3.1757, + "step": 578 + }, + { + "epoch": 0.04027966190128352, + "grad_norm": 1.6796875, + "learning_rate": 0.00199945145210453, + "loss": 2.8828, + "step": 579 + }, + { + "epoch": 0.04034922953841873, + "grad_norm": 1.578125, + "learning_rate": 0.0019994439641767654, + "loss": 3.1672, + "step": 580 + }, + { + "epoch": 0.04041879717555393, + "grad_norm": 1.6796875, + "learning_rate": 0.001999436425502274, + "loss": 3.35, + "step": 581 + }, + { + "epoch": 0.04048836481268914, + "grad_norm": 1.671875, + "learning_rate": 0.0019994288360814377, + "loss": 3.3505, + "step": 582 + }, + { + "epoch": 0.04055793244982434, + "grad_norm": 1.5, + "learning_rate": 0.001999421195914643, + "loss": 3.2588, + "step": 583 + }, + { + "epoch": 0.04062750008695955, + "grad_norm": 2.0, + "learning_rate": 0.0019994135050022776, + "loss": 3.6955, + "step": 584 + }, + { + "epoch": 0.04069706772409475, + "grad_norm": 1.453125, + "learning_rate": 0.0019994057633447317, + "loss": 3.1572, + "step": 585 + }, + { + "epoch": 0.04076663536122996, + "grad_norm": 1.671875, + "learning_rate": 0.0019993979709423985, + "loss": 3.1665, + "step": 586 + }, + { + "epoch": 0.04083620299836516, + "grad_norm": 1.6796875, + "learning_rate": 0.0019993901277956735, + "loss": 3.7347, + "step": 587 + }, + { + "epoch": 0.04090577063550037, + "grad_norm": 1.1640625, + "learning_rate": 0.0019993822339049554, + "loss": 3.4667, + "step": 588 + }, + { + "epoch": 0.04097533827263557, + "grad_norm": 1.5390625, + "learning_rate": 0.0019993742892706447, + "loss": 3.3394, + "step": 589 + }, + { + "epoch": 0.04104490590977077, + "grad_norm": 1.4296875, + "learning_rate": 0.001999366293893145, + "loss": 3.34, + "step": 590 + }, + { + "epoch": 0.04111447354690598, + "grad_norm": 1.171875, + "learning_rate": 0.0019993582477728614, + "loss": 3.2674, + "step": 591 + }, + { + "epoch": 0.04118404118404118, + "grad_norm": 1.640625, + "learning_rate": 0.0019993501509102036, + "loss": 3.2031, + "step": 592 + }, + { + "epoch": 0.04125360882117639, + "grad_norm": 1.1640625, + "learning_rate": 0.001999342003305582, + "loss": 3.6727, + "step": 593 + }, + { + "epoch": 0.04132317645831159, + "grad_norm": 1.5625, + "learning_rate": 0.0019993338049594106, + "loss": 2.8559, + "step": 594 + }, + { + "epoch": 0.0413927440954468, + "grad_norm": 1.5, + "learning_rate": 0.001999325555872106, + "loss": 3.2697, + "step": 595 + }, + { + "epoch": 0.041462311732582004, + "grad_norm": 1.4375, + "learning_rate": 0.001999317256044086, + "loss": 3.6079, + "step": 596 + }, + { + "epoch": 0.04153187936971721, + "grad_norm": 1.5, + "learning_rate": 0.0019993089054757733, + "loss": 3.4023, + "step": 597 + }, + { + "epoch": 0.041601447006852414, + "grad_norm": 2.265625, + "learning_rate": 0.001999300504167591, + "loss": 3.3421, + "step": 598 + }, + { + "epoch": 0.04167101464398762, + "grad_norm": 1.6484375, + "learning_rate": 0.0019992920521199656, + "loss": 3.1932, + "step": 599 + }, + { + "epoch": 0.041740582281122825, + "grad_norm": 1.9453125, + "learning_rate": 0.001999283549333327, + "loss": 3.1098, + "step": 600 + }, + { + "epoch": 0.04181014991825802, + "grad_norm": 2.390625, + "learning_rate": 0.001999274995808106, + "loss": 3.0167, + "step": 601 + }, + { + "epoch": 0.04187971755539323, + "grad_norm": 1.40625, + "learning_rate": 0.001999266391544738, + "loss": 3.2582, + "step": 602 + }, + { + "epoch": 0.041949285192528434, + "grad_norm": 1.9453125, + "learning_rate": 0.0019992577365436593, + "loss": 3.6975, + "step": 603 + }, + { + "epoch": 0.04201885282966364, + "grad_norm": 1.5703125, + "learning_rate": 0.001999249030805309, + "loss": 3.2861, + "step": 604 + }, + { + "epoch": 0.042088420466798844, + "grad_norm": 1.6796875, + "learning_rate": 0.00199924027433013, + "loss": 3.6903, + "step": 605 + }, + { + "epoch": 0.04215798810393405, + "grad_norm": 1.4921875, + "learning_rate": 0.0019992314671185662, + "loss": 3.3531, + "step": 606 + }, + { + "epoch": 0.042227555741069255, + "grad_norm": 1.28125, + "learning_rate": 0.001999222609171065, + "loss": 3.5518, + "step": 607 + }, + { + "epoch": 0.04229712337820446, + "grad_norm": 1.0546875, + "learning_rate": 0.001999213700488076, + "loss": 3.0014, + "step": 608 + }, + { + "epoch": 0.042366691015339665, + "grad_norm": 1.6328125, + "learning_rate": 0.0019992047410700518, + "loss": 2.9034, + "step": 609 + }, + { + "epoch": 0.04243625865247487, + "grad_norm": 1.34375, + "learning_rate": 0.0019991957309174473, + "loss": 3.6726, + "step": 610 + }, + { + "epoch": 0.042505826289610076, + "grad_norm": 1.015625, + "learning_rate": 0.0019991866700307197, + "loss": 3.7199, + "step": 611 + }, + { + "epoch": 0.04257539392674528, + "grad_norm": 0.99609375, + "learning_rate": 0.0019991775584103297, + "loss": 3.639, + "step": 612 + }, + { + "epoch": 0.042644961563880486, + "grad_norm": 1.546875, + "learning_rate": 0.001999168396056739, + "loss": 3.2102, + "step": 613 + }, + { + "epoch": 0.042714529201015684, + "grad_norm": 1.796875, + "learning_rate": 0.0019991591829704135, + "loss": 3.319, + "step": 614 + }, + { + "epoch": 0.04278409683815089, + "grad_norm": 1.5546875, + "learning_rate": 0.0019991499191518206, + "loss": 3.1985, + "step": 615 + }, + { + "epoch": 0.042853664475286095, + "grad_norm": 1.5234375, + "learning_rate": 0.0019991406046014314, + "loss": 3.4867, + "step": 616 + }, + { + "epoch": 0.0429232321124213, + "grad_norm": 1.4140625, + "learning_rate": 0.001999131239319718, + "loss": 3.304, + "step": 617 + }, + { + "epoch": 0.042992799749556505, + "grad_norm": 1.609375, + "learning_rate": 0.0019991218233071564, + "loss": 3.4094, + "step": 618 + }, + { + "epoch": 0.04306236738669171, + "grad_norm": 1.2578125, + "learning_rate": 0.0019991123565642247, + "loss": 3.1638, + "step": 619 + }, + { + "epoch": 0.043131935023826916, + "grad_norm": 1.234375, + "learning_rate": 0.001999102839091403, + "loss": 3.3121, + "step": 620 + }, + { + "epoch": 0.04320150266096212, + "grad_norm": 1.1953125, + "learning_rate": 0.001999093270889175, + "loss": 3.2227, + "step": 621 + }, + { + "epoch": 0.043271070298097326, + "grad_norm": 2.0625, + "learning_rate": 0.001999083651958027, + "loss": 3.4165, + "step": 622 + }, + { + "epoch": 0.04334063793523253, + "grad_norm": 1.3515625, + "learning_rate": 0.0019990739822984467, + "loss": 3.5862, + "step": 623 + }, + { + "epoch": 0.04341020557236774, + "grad_norm": 1.390625, + "learning_rate": 0.0019990642619109253, + "loss": 3.0626, + "step": 624 + }, + { + "epoch": 0.04347977320950294, + "grad_norm": 1.3046875, + "learning_rate": 0.001999054490795956, + "loss": 3.7579, + "step": 625 + }, + { + "epoch": 0.04354934084663815, + "grad_norm": 1.265625, + "learning_rate": 0.001999044668954036, + "loss": 3.245, + "step": 626 + }, + { + "epoch": 0.043618908483773346, + "grad_norm": 1.984375, + "learning_rate": 0.0019990347963856625, + "loss": 3.6678, + "step": 627 + }, + { + "epoch": 0.04368847612090855, + "grad_norm": 1.3828125, + "learning_rate": 0.001999024873091338, + "loss": 3.2047, + "step": 628 + }, + { + "epoch": 0.043758043758043756, + "grad_norm": 1.84375, + "learning_rate": 0.0019990148990715654, + "loss": 3.3547, + "step": 629 + }, + { + "epoch": 0.04382761139517896, + "grad_norm": 1.671875, + "learning_rate": 0.001999004874326852, + "loss": 3.2467, + "step": 630 + }, + { + "epoch": 0.04389717903231417, + "grad_norm": 1.5703125, + "learning_rate": 0.001998994798857707, + "loss": 3.5803, + "step": 631 + }, + { + "epoch": 0.04396674666944937, + "grad_norm": 2.09375, + "learning_rate": 0.0019989846726646407, + "loss": 3.2881, + "step": 632 + }, + { + "epoch": 0.04403631430658458, + "grad_norm": 1.7578125, + "learning_rate": 0.001998974495748168, + "loss": 3.0706, + "step": 633 + }, + { + "epoch": 0.04410588194371978, + "grad_norm": 1.6484375, + "learning_rate": 0.0019989642681088058, + "loss": 3.4599, + "step": 634 + }, + { + "epoch": 0.04417544958085499, + "grad_norm": 1.6015625, + "learning_rate": 0.001998953989747073, + "loss": 3.3248, + "step": 635 + }, + { + "epoch": 0.04424501721799019, + "grad_norm": 1.09375, + "learning_rate": 0.001998943660663492, + "loss": 3.4659, + "step": 636 + }, + { + "epoch": 0.0443145848551254, + "grad_norm": 1.3671875, + "learning_rate": 0.001998933280858587, + "loss": 3.6053, + "step": 637 + }, + { + "epoch": 0.0443841524922606, + "grad_norm": 1.1328125, + "learning_rate": 0.0019989228503328846, + "loss": 3.4567, + "step": 638 + }, + { + "epoch": 0.0444537201293958, + "grad_norm": 1.5078125, + "learning_rate": 0.001998912369086915, + "loss": 2.8121, + "step": 639 + }, + { + "epoch": 0.04452328776653101, + "grad_norm": 1.5, + "learning_rate": 0.00199890183712121, + "loss": 3.4138, + "step": 640 + }, + { + "epoch": 0.04459285540366621, + "grad_norm": 1.3515625, + "learning_rate": 0.001998891254436305, + "loss": 3.1913, + "step": 641 + }, + { + "epoch": 0.04466242304080142, + "grad_norm": 1.15625, + "learning_rate": 0.0019988806210327367, + "loss": 3.3442, + "step": 642 + }, + { + "epoch": 0.04473199067793662, + "grad_norm": 1.3984375, + "learning_rate": 0.001998869936911045, + "loss": 3.4197, + "step": 643 + }, + { + "epoch": 0.04480155831507183, + "grad_norm": 1.765625, + "learning_rate": 0.0019988592020717725, + "loss": 3.5198, + "step": 644 + }, + { + "epoch": 0.04487112595220703, + "grad_norm": 1.3984375, + "learning_rate": 0.001998848416515465, + "loss": 3.3542, + "step": 645 + }, + { + "epoch": 0.04494069358934224, + "grad_norm": 1.59375, + "learning_rate": 0.001998837580242669, + "loss": 3.1037, + "step": 646 + }, + { + "epoch": 0.045010261226477444, + "grad_norm": 1.203125, + "learning_rate": 0.001998826693253935, + "loss": 3.4104, + "step": 647 + }, + { + "epoch": 0.04507982886361265, + "grad_norm": 1.28125, + "learning_rate": 0.0019988157555498164, + "loss": 3.5321, + "step": 648 + }, + { + "epoch": 0.045149396500747854, + "grad_norm": 1.3359375, + "learning_rate": 0.001998804767130868, + "loss": 3.0773, + "step": 649 + }, + { + "epoch": 0.04521896413788306, + "grad_norm": 1.359375, + "learning_rate": 0.0019987937279976474, + "loss": 3.6503, + "step": 650 + }, + { + "epoch": 0.045288531775018265, + "grad_norm": 1.5625, + "learning_rate": 0.001998782638150716, + "loss": 3.2508, + "step": 651 + }, + { + "epoch": 0.04535809941215346, + "grad_norm": 1.671875, + "learning_rate": 0.001998771497590637, + "loss": 3.3105, + "step": 652 + }, + { + "epoch": 0.04542766704928867, + "grad_norm": 1.2265625, + "learning_rate": 0.001998760306317975, + "loss": 3.1075, + "step": 653 + }, + { + "epoch": 0.045497234686423874, + "grad_norm": 2.40625, + "learning_rate": 0.0019987490643332984, + "loss": 3.257, + "step": 654 + }, + { + "epoch": 0.04556680232355908, + "grad_norm": 1.421875, + "learning_rate": 0.001998737771637179, + "loss": 3.4907, + "step": 655 + }, + { + "epoch": 0.045636369960694284, + "grad_norm": 1.5234375, + "learning_rate": 0.0019987264282301893, + "loss": 3.7234, + "step": 656 + }, + { + "epoch": 0.04570593759782949, + "grad_norm": 1.3359375, + "learning_rate": 0.0019987150341129055, + "loss": 2.9309, + "step": 657 + }, + { + "epoch": 0.045775505234964695, + "grad_norm": 1.359375, + "learning_rate": 0.001998703589285906, + "loss": 3.348, + "step": 658 + }, + { + "epoch": 0.0458450728720999, + "grad_norm": 1.5859375, + "learning_rate": 0.0019986920937497725, + "loss": 3.3921, + "step": 659 + }, + { + "epoch": 0.045914640509235105, + "grad_norm": 1.328125, + "learning_rate": 0.001998680547505088, + "loss": 3.0439, + "step": 660 + }, + { + "epoch": 0.04598420814637031, + "grad_norm": 1.2578125, + "learning_rate": 0.001998668950552439, + "loss": 3.1959, + "step": 661 + }, + { + "epoch": 0.046053775783505516, + "grad_norm": 2.125, + "learning_rate": 0.0019986573028924143, + "loss": 3.4075, + "step": 662 + }, + { + "epoch": 0.04612334342064072, + "grad_norm": 1.421875, + "learning_rate": 0.0019986456045256056, + "loss": 3.1542, + "step": 663 + }, + { + "epoch": 0.046192911057775926, + "grad_norm": 1.5, + "learning_rate": 0.001998633855452607, + "loss": 3.1908, + "step": 664 + }, + { + "epoch": 0.046262478694911124, + "grad_norm": 1.578125, + "learning_rate": 0.001998622055674014, + "loss": 3.5424, + "step": 665 + }, + { + "epoch": 0.04633204633204633, + "grad_norm": 1.28125, + "learning_rate": 0.0019986102051904268, + "loss": 3.1491, + "step": 666 + }, + { + "epoch": 0.046401613969181535, + "grad_norm": 1.5625, + "learning_rate": 0.0019985983040024468, + "loss": 3.4403, + "step": 667 + }, + { + "epoch": 0.04647118160631674, + "grad_norm": 1.3828125, + "learning_rate": 0.001998586352110678, + "loss": 3.3588, + "step": 668 + }, + { + "epoch": 0.046540749243451945, + "grad_norm": 1.234375, + "learning_rate": 0.0019985743495157275, + "loss": 3.7794, + "step": 669 + }, + { + "epoch": 0.04661031688058715, + "grad_norm": 1.28125, + "learning_rate": 0.0019985622962182046, + "loss": 3.3294, + "step": 670 + }, + { + "epoch": 0.046679884517722356, + "grad_norm": 1.3046875, + "learning_rate": 0.001998550192218722, + "loss": 3.1803, + "step": 671 + }, + { + "epoch": 0.04674945215485756, + "grad_norm": 1.6953125, + "learning_rate": 0.001998538037517893, + "loss": 3.0911, + "step": 672 + }, + { + "epoch": 0.046819019791992766, + "grad_norm": 1.2421875, + "learning_rate": 0.0019985258321163356, + "loss": 2.7282, + "step": 673 + }, + { + "epoch": 0.04688858742912797, + "grad_norm": 1.453125, + "learning_rate": 0.00199851357601467, + "loss": 2.9728, + "step": 674 + }, + { + "epoch": 0.04695815506626318, + "grad_norm": 1.0546875, + "learning_rate": 0.001998501269213517, + "loss": 3.3479, + "step": 675 + }, + { + "epoch": 0.04702772270339838, + "grad_norm": 1.515625, + "learning_rate": 0.001998488911713503, + "loss": 3.2668, + "step": 676 + }, + { + "epoch": 0.04709729034053358, + "grad_norm": 1.7109375, + "learning_rate": 0.0019984765035152546, + "loss": 3.5445, + "step": 677 + }, + { + "epoch": 0.047166857977668786, + "grad_norm": 1.2578125, + "learning_rate": 0.001998464044619402, + "loss": 3.2349, + "step": 678 + }, + { + "epoch": 0.04723642561480399, + "grad_norm": 2.515625, + "learning_rate": 0.0019984515350265778, + "loss": 3.1874, + "step": 679 + }, + { + "epoch": 0.047305993251939196, + "grad_norm": 1.0234375, + "learning_rate": 0.0019984389747374175, + "loss": 3.4472, + "step": 680 + }, + { + "epoch": 0.0473755608890744, + "grad_norm": 1.15625, + "learning_rate": 0.0019984263637525587, + "loss": 3.7444, + "step": 681 + }, + { + "epoch": 0.04744512852620961, + "grad_norm": 1.4296875, + "learning_rate": 0.001998413702072641, + "loss": 3.2759, + "step": 682 + }, + { + "epoch": 0.04751469616334481, + "grad_norm": 1.6015625, + "learning_rate": 0.0019984009896983086, + "loss": 3.2497, + "step": 683 + }, + { + "epoch": 0.04758426380048002, + "grad_norm": 1.21875, + "learning_rate": 0.0019983882266302057, + "loss": 3.6491, + "step": 684 + }, + { + "epoch": 0.04765383143761522, + "grad_norm": 1.25, + "learning_rate": 0.001998375412868981, + "loss": 3.5994, + "step": 685 + }, + { + "epoch": 0.04772339907475043, + "grad_norm": 1.2265625, + "learning_rate": 0.001998362548415285, + "loss": 3.3485, + "step": 686 + }, + { + "epoch": 0.04779296671188563, + "grad_norm": 1.7578125, + "learning_rate": 0.001998349633269771, + "loss": 3.5639, + "step": 687 + }, + { + "epoch": 0.04786253434902084, + "grad_norm": 1.7265625, + "learning_rate": 0.0019983366674330948, + "loss": 3.4652, + "step": 688 + }, + { + "epoch": 0.047932101986156044, + "grad_norm": 1.3828125, + "learning_rate": 0.0019983236509059144, + "loss": 3.4984, + "step": 689 + }, + { + "epoch": 0.04800166962329124, + "grad_norm": 1.3828125, + "learning_rate": 0.001998310583688891, + "loss": 3.1597, + "step": 690 + }, + { + "epoch": 0.04807123726042645, + "grad_norm": 1.1640625, + "learning_rate": 0.001998297465782688, + "loss": 3.3416, + "step": 691 + }, + { + "epoch": 0.04814080489756165, + "grad_norm": 1.40625, + "learning_rate": 0.0019982842971879716, + "loss": 3.7584, + "step": 692 + }, + { + "epoch": 0.04821037253469686, + "grad_norm": 1.328125, + "learning_rate": 0.0019982710779054102, + "loss": 3.1876, + "step": 693 + }, + { + "epoch": 0.04827994017183206, + "grad_norm": 2.140625, + "learning_rate": 0.001998257807935675, + "loss": 3.5906, + "step": 694 + }, + { + "epoch": 0.04834950780896727, + "grad_norm": 1.21875, + "learning_rate": 0.00199824448727944, + "loss": 3.512, + "step": 695 + }, + { + "epoch": 0.04841907544610247, + "grad_norm": 1.1015625, + "learning_rate": 0.0019982311159373817, + "loss": 3.5001, + "step": 696 + }, + { + "epoch": 0.04848864308323768, + "grad_norm": 1.3515625, + "learning_rate": 0.0019982176939101785, + "loss": 3.4267, + "step": 697 + }, + { + "epoch": 0.048558210720372884, + "grad_norm": 1.1171875, + "learning_rate": 0.001998204221198512, + "loss": 3.5476, + "step": 698 + }, + { + "epoch": 0.04862777835750809, + "grad_norm": 1.125, + "learning_rate": 0.001998190697803067, + "loss": 3.7046, + "step": 699 + }, + { + "epoch": 0.048697345994643294, + "grad_norm": 1.1640625, + "learning_rate": 0.0019981771237245296, + "loss": 3.4709, + "step": 700 + }, + { + "epoch": 0.0487669136317785, + "grad_norm": 1.421875, + "learning_rate": 0.0019981634989635886, + "loss": 3.6087, + "step": 701 + }, + { + "epoch": 0.0488364812689137, + "grad_norm": 1.0625, + "learning_rate": 0.0019981498235209366, + "loss": 3.3568, + "step": 702 + }, + { + "epoch": 0.0489060489060489, + "grad_norm": 2.09375, + "learning_rate": 0.0019981360973972675, + "loss": 3.1876, + "step": 703 + }, + { + "epoch": 0.04897561654318411, + "grad_norm": 1.484375, + "learning_rate": 0.0019981223205932782, + "loss": 3.2481, + "step": 704 + }, + { + "epoch": 0.049045184180319314, + "grad_norm": 1.1796875, + "learning_rate": 0.0019981084931096687, + "loss": 3.5898, + "step": 705 + }, + { + "epoch": 0.04911475181745452, + "grad_norm": 1.8828125, + "learning_rate": 0.0019980946149471403, + "loss": 3.5733, + "step": 706 + }, + { + "epoch": 0.049184319454589724, + "grad_norm": 1.4296875, + "learning_rate": 0.001998080686106399, + "loss": 3.2213, + "step": 707 + }, + { + "epoch": 0.04925388709172493, + "grad_norm": 1.7734375, + "learning_rate": 0.00199806670658815, + "loss": 2.8544, + "step": 708 + }, + { + "epoch": 0.049323454728860135, + "grad_norm": 1.421875, + "learning_rate": 0.001998052676393105, + "loss": 3.8152, + "step": 709 + }, + { + "epoch": 0.04939302236599534, + "grad_norm": 1.6796875, + "learning_rate": 0.0019980385955219756, + "loss": 3.4783, + "step": 710 + }, + { + "epoch": 0.049462590003130545, + "grad_norm": 1.921875, + "learning_rate": 0.0019980244639754767, + "loss": 3.3807, + "step": 711 + }, + { + "epoch": 0.04953215764026575, + "grad_norm": 1.296875, + "learning_rate": 0.0019980102817543258, + "loss": 3.7659, + "step": 712 + }, + { + "epoch": 0.049601725277400956, + "grad_norm": 1.6953125, + "learning_rate": 0.001997996048859243, + "loss": 3.4794, + "step": 713 + }, + { + "epoch": 0.04967129291453616, + "grad_norm": 1.6015625, + "learning_rate": 0.0019979817652909515, + "loss": 3.496, + "step": 714 + }, + { + "epoch": 0.04974086055167136, + "grad_norm": 1.0546875, + "learning_rate": 0.0019979674310501763, + "loss": 3.189, + "step": 715 + }, + { + "epoch": 0.049810428188806564, + "grad_norm": 1.171875, + "learning_rate": 0.0019979530461376447, + "loss": 3.2305, + "step": 716 + }, + { + "epoch": 0.04987999582594177, + "grad_norm": 1.2890625, + "learning_rate": 0.001997938610554087, + "loss": 3.175, + "step": 717 + }, + { + "epoch": 0.049949563463076975, + "grad_norm": 0.96875, + "learning_rate": 0.0019979241243002375, + "loss": 3.3976, + "step": 718 + }, + { + "epoch": 0.05001913110021218, + "grad_norm": 1.1953125, + "learning_rate": 0.0019979095873768307, + "loss": 3.2898, + "step": 719 + }, + { + "epoch": 0.050088698737347385, + "grad_norm": 1.53125, + "learning_rate": 0.0019978949997846046, + "loss": 3.3068, + "step": 720 + }, + { + "epoch": 0.05015826637448259, + "grad_norm": 1.6484375, + "learning_rate": 0.0019978803615243006, + "loss": 2.9773, + "step": 721 + }, + { + "epoch": 0.050227834011617796, + "grad_norm": 1.21875, + "learning_rate": 0.0019978656725966612, + "loss": 3.5996, + "step": 722 + }, + { + "epoch": 0.050297401648753, + "grad_norm": 1.6875, + "learning_rate": 0.0019978509330024325, + "loss": 2.9751, + "step": 723 + }, + { + "epoch": 0.050366969285888206, + "grad_norm": 1.6015625, + "learning_rate": 0.0019978361427423633, + "loss": 3.2896, + "step": 724 + }, + { + "epoch": 0.05043653692302341, + "grad_norm": 1.6875, + "learning_rate": 0.001997821301817204, + "loss": 3.4558, + "step": 725 + }, + { + "epoch": 0.05050610456015862, + "grad_norm": 1.6796875, + "learning_rate": 0.0019978064102277085, + "loss": 3.5634, + "step": 726 + }, + { + "epoch": 0.05057567219729382, + "grad_norm": 1.296875, + "learning_rate": 0.0019977914679746326, + "loss": 3.643, + "step": 727 + }, + { + "epoch": 0.05064523983442902, + "grad_norm": 1.265625, + "learning_rate": 0.0019977764750587356, + "loss": 3.5794, + "step": 728 + }, + { + "epoch": 0.050714807471564226, + "grad_norm": 1.6875, + "learning_rate": 0.001997761431480778, + "loss": 3.5215, + "step": 729 + }, + { + "epoch": 0.05078437510869943, + "grad_norm": 1.0234375, + "learning_rate": 0.0019977463372415237, + "loss": 3.1712, + "step": 730 + }, + { + "epoch": 0.050853942745834636, + "grad_norm": 1.125, + "learning_rate": 0.00199773119234174, + "loss": 3.6297, + "step": 731 + }, + { + "epoch": 0.05092351038296984, + "grad_norm": 1.265625, + "learning_rate": 0.001997715996782195, + "loss": 3.3124, + "step": 732 + }, + { + "epoch": 0.05099307802010505, + "grad_norm": 1.5390625, + "learning_rate": 0.0019977007505636605, + "loss": 3.1167, + "step": 733 + }, + { + "epoch": 0.05106264565724025, + "grad_norm": 1.2890625, + "learning_rate": 0.0019976854536869113, + "loss": 3.2124, + "step": 734 + }, + { + "epoch": 0.05113221329437546, + "grad_norm": 1.5625, + "learning_rate": 0.001997670106152723, + "loss": 3.2263, + "step": 735 + }, + { + "epoch": 0.05120178093151066, + "grad_norm": 1.4375, + "learning_rate": 0.001997654707961875, + "loss": 3.3927, + "step": 736 + }, + { + "epoch": 0.05127134856864587, + "grad_norm": 1.375, + "learning_rate": 0.0019976392591151497, + "loss": 3.4537, + "step": 737 + }, + { + "epoch": 0.05134091620578107, + "grad_norm": 1.1328125, + "learning_rate": 0.0019976237596133315, + "loss": 3.4768, + "step": 738 + }, + { + "epoch": 0.05141048384291628, + "grad_norm": 1.21875, + "learning_rate": 0.0019976082094572073, + "loss": 3.4063, + "step": 739 + }, + { + "epoch": 0.05148005148005148, + "grad_norm": 1.46875, + "learning_rate": 0.0019975926086475662, + "loss": 3.5262, + "step": 740 + }, + { + "epoch": 0.05154961911718668, + "grad_norm": 1.2890625, + "learning_rate": 0.001997576957185201, + "loss": 3.1927, + "step": 741 + }, + { + "epoch": 0.05161918675432189, + "grad_norm": 1.9453125, + "learning_rate": 0.0019975612550709055, + "loss": 3.4512, + "step": 742 + }, + { + "epoch": 0.05168875439145709, + "grad_norm": 1.1640625, + "learning_rate": 0.001997545502305478, + "loss": 3.3019, + "step": 743 + }, + { + "epoch": 0.0517583220285923, + "grad_norm": 1.2890625, + "learning_rate": 0.001997529698889718, + "loss": 2.9856, + "step": 744 + }, + { + "epoch": 0.0518278896657275, + "grad_norm": 1.171875, + "learning_rate": 0.0019975138448244272, + "loss": 2.9967, + "step": 745 + }, + { + "epoch": 0.05189745730286271, + "grad_norm": 1.3359375, + "learning_rate": 0.001997497940110412, + "loss": 3.3576, + "step": 746 + }, + { + "epoch": 0.05196702493999791, + "grad_norm": 0.921875, + "learning_rate": 0.0019974819847484787, + "loss": 3.4691, + "step": 747 + }, + { + "epoch": 0.05203659257713312, + "grad_norm": 1.1015625, + "learning_rate": 0.001997465978739438, + "loss": 3.7629, + "step": 748 + }, + { + "epoch": 0.052106160214268324, + "grad_norm": 1.359375, + "learning_rate": 0.0019974499220841023, + "loss": 3.1283, + "step": 749 + }, + { + "epoch": 0.05217572785140353, + "grad_norm": 1.453125, + "learning_rate": 0.0019974338147832876, + "loss": 3.7169, + "step": 750 + }, + { + "epoch": 0.052245295488538734, + "grad_norm": 1.1875, + "learning_rate": 0.0019974176568378107, + "loss": 3.3305, + "step": 751 + }, + { + "epoch": 0.05231486312567394, + "grad_norm": 1.1796875, + "learning_rate": 0.001997401448248493, + "loss": 2.918, + "step": 752 + }, + { + "epoch": 0.05238443076280914, + "grad_norm": 1.4921875, + "learning_rate": 0.0019973851890161564, + "loss": 3.1841, + "step": 753 + }, + { + "epoch": 0.05245399839994434, + "grad_norm": 1.3671875, + "learning_rate": 0.001997368879141628, + "loss": 3.4353, + "step": 754 + }, + { + "epoch": 0.05252356603707955, + "grad_norm": 0.98828125, + "learning_rate": 0.0019973525186257344, + "loss": 3.779, + "step": 755 + }, + { + "epoch": 0.052593133674214754, + "grad_norm": 1.2265625, + "learning_rate": 0.0019973361074693066, + "loss": 3.8655, + "step": 756 + }, + { + "epoch": 0.05266270131134996, + "grad_norm": 1.03125, + "learning_rate": 0.001997319645673179, + "loss": 3.0859, + "step": 757 + }, + { + "epoch": 0.052732268948485164, + "grad_norm": 1.203125, + "learning_rate": 0.001997303133238186, + "loss": 3.3803, + "step": 758 + }, + { + "epoch": 0.05280183658562037, + "grad_norm": 1.25, + "learning_rate": 0.001997286570165167, + "loss": 3.1753, + "step": 759 + }, + { + "epoch": 0.052871404222755575, + "grad_norm": 1.515625, + "learning_rate": 0.0019972699564549624, + "loss": 3.5117, + "step": 760 + }, + { + "epoch": 0.05294097185989078, + "grad_norm": 1.234375, + "learning_rate": 0.0019972532921084165, + "loss": 3.3453, + "step": 761 + }, + { + "epoch": 0.053010539497025985, + "grad_norm": 1.6953125, + "learning_rate": 0.001997236577126375, + "loss": 3.1444, + "step": 762 + }, + { + "epoch": 0.05308010713416119, + "grad_norm": 1.2578125, + "learning_rate": 0.001997219811509686, + "loss": 2.9711, + "step": 763 + }, + { + "epoch": 0.053149674771296396, + "grad_norm": 1.28125, + "learning_rate": 0.0019972029952592014, + "loss": 3.7106, + "step": 764 + }, + { + "epoch": 0.0532192424084316, + "grad_norm": 1.5234375, + "learning_rate": 0.0019971861283757755, + "loss": 3.5073, + "step": 765 + }, + { + "epoch": 0.0532888100455668, + "grad_norm": 1.34375, + "learning_rate": 0.0019971692108602637, + "loss": 3.1031, + "step": 766 + }, + { + "epoch": 0.053358377682702005, + "grad_norm": 1.4921875, + "learning_rate": 0.001997152242713526, + "loss": 3.2373, + "step": 767 + }, + { + "epoch": 0.05342794531983721, + "grad_norm": 1.203125, + "learning_rate": 0.0019971352239364225, + "loss": 3.1924, + "step": 768 + }, + { + "epoch": 0.053497512956972415, + "grad_norm": 1.125, + "learning_rate": 0.001997118154529819, + "loss": 3.5255, + "step": 769 + }, + { + "epoch": 0.05356708059410762, + "grad_norm": 1.28125, + "learning_rate": 0.001997101034494581, + "loss": 3.5165, + "step": 770 + }, + { + "epoch": 0.053636648231242826, + "grad_norm": 0.96875, + "learning_rate": 0.001997083863831579, + "loss": 3.2297, + "step": 771 + }, + { + "epoch": 0.05370621586837803, + "grad_norm": 1.4140625, + "learning_rate": 0.0019970666425416835, + "loss": 3.3447, + "step": 772 + }, + { + "epoch": 0.053775783505513236, + "grad_norm": 1.3046875, + "learning_rate": 0.0019970493706257695, + "loss": 3.6247, + "step": 773 + }, + { + "epoch": 0.05384535114264844, + "grad_norm": 1.53125, + "learning_rate": 0.001997032048084714, + "loss": 3.2659, + "step": 774 + }, + { + "epoch": 0.05391491877978365, + "grad_norm": 1.0546875, + "learning_rate": 0.0019970146749193965, + "loss": 3.3362, + "step": 775 + }, + { + "epoch": 0.05398448641691885, + "grad_norm": 1.3671875, + "learning_rate": 0.0019969972511306995, + "loss": 3.8919, + "step": 776 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 1.6640625, + "learning_rate": 0.0019969797767195067, + "loss": 3.1955, + "step": 777 + }, + { + "epoch": 0.054123621691189255, + "grad_norm": 1.53125, + "learning_rate": 0.0019969622516867063, + "loss": 3.4623, + "step": 778 + }, + { + "epoch": 0.05419318932832446, + "grad_norm": 0.93359375, + "learning_rate": 0.001996944676033188, + "loss": 3.5068, + "step": 779 + }, + { + "epoch": 0.054262756965459666, + "grad_norm": 1.1171875, + "learning_rate": 0.0019969270497598437, + "loss": 3.8145, + "step": 780 + }, + { + "epoch": 0.05433232460259487, + "grad_norm": 1.296875, + "learning_rate": 0.001996909372867569, + "loss": 3.3767, + "step": 781 + }, + { + "epoch": 0.054401892239730076, + "grad_norm": 1.21875, + "learning_rate": 0.001996891645357261, + "loss": 3.5851, + "step": 782 + }, + { + "epoch": 0.05447145987686528, + "grad_norm": 1.3046875, + "learning_rate": 0.0019968738672298198, + "loss": 3.4255, + "step": 783 + }, + { + "epoch": 0.05454102751400049, + "grad_norm": 0.9921875, + "learning_rate": 0.0019968560384861487, + "loss": 3.5207, + "step": 784 + }, + { + "epoch": 0.05461059515113569, + "grad_norm": 1.3671875, + "learning_rate": 0.001996838159127152, + "loss": 3.5369, + "step": 785 + }, + { + "epoch": 0.0546801627882709, + "grad_norm": 1.5703125, + "learning_rate": 0.0019968202291537384, + "loss": 3.1211, + "step": 786 + }, + { + "epoch": 0.0547497304254061, + "grad_norm": 1.65625, + "learning_rate": 0.0019968022485668175, + "loss": 3.6356, + "step": 787 + }, + { + "epoch": 0.05481929806254131, + "grad_norm": 2.125, + "learning_rate": 0.0019967842173673027, + "loss": 3.6714, + "step": 788 + }, + { + "epoch": 0.05488886569967651, + "grad_norm": 1.5625, + "learning_rate": 0.00199676613555611, + "loss": 3.1509, + "step": 789 + }, + { + "epoch": 0.05495843333681172, + "grad_norm": 1.421875, + "learning_rate": 0.0019967480031341566, + "loss": 3.2795, + "step": 790 + }, + { + "epoch": 0.05502800097394692, + "grad_norm": 1.171875, + "learning_rate": 0.0019967298201023637, + "loss": 3.2036, + "step": 791 + }, + { + "epoch": 0.05509756861108212, + "grad_norm": 1.0625, + "learning_rate": 0.0019967115864616544, + "loss": 3.2689, + "step": 792 + }, + { + "epoch": 0.05516713624821733, + "grad_norm": 1.0234375, + "learning_rate": 0.0019966933022129542, + "loss": 3.3734, + "step": 793 + }, + { + "epoch": 0.05523670388535253, + "grad_norm": 1.40625, + "learning_rate": 0.001996674967357192, + "loss": 2.7514, + "step": 794 + }, + { + "epoch": 0.05530627152248774, + "grad_norm": 1.3359375, + "learning_rate": 0.001996656581895299, + "loss": 3.4004, + "step": 795 + }, + { + "epoch": 0.05537583915962294, + "grad_norm": 1.2890625, + "learning_rate": 0.0019966381458282082, + "loss": 3.0403, + "step": 796 + }, + { + "epoch": 0.05544540679675815, + "grad_norm": 1.0859375, + "learning_rate": 0.0019966196591568557, + "loss": 3.2993, + "step": 797 + }, + { + "epoch": 0.05551497443389335, + "grad_norm": 1.4921875, + "learning_rate": 0.00199660112188218, + "loss": 3.3328, + "step": 798 + }, + { + "epoch": 0.05558454207102856, + "grad_norm": 0.98828125, + "learning_rate": 0.001996582534005123, + "loss": 3.5094, + "step": 799 + }, + { + "epoch": 0.055654109708163764, + "grad_norm": 1.234375, + "learning_rate": 0.0019965638955266275, + "loss": 3.4173, + "step": 800 + }, + { + "epoch": 0.05572367734529897, + "grad_norm": 1.75, + "learning_rate": 0.0019965452064476404, + "loss": 3.6744, + "step": 801 + }, + { + "epoch": 0.055793244982434174, + "grad_norm": 1.7109375, + "learning_rate": 0.0019965264667691114, + "loss": 3.2192, + "step": 802 + }, + { + "epoch": 0.05586281261956938, + "grad_norm": 1.40625, + "learning_rate": 0.0019965076764919907, + "loss": 3.1377, + "step": 803 + }, + { + "epoch": 0.05593238025670458, + "grad_norm": 1.4375, + "learning_rate": 0.001996488835617233, + "loss": 3.2694, + "step": 804 + }, + { + "epoch": 0.05600194789383978, + "grad_norm": 1.1015625, + "learning_rate": 0.0019964699441457952, + "loss": 3.5418, + "step": 805 + }, + { + "epoch": 0.05607151553097499, + "grad_norm": 1.25, + "learning_rate": 0.001996451002078636, + "loss": 3.1088, + "step": 806 + }, + { + "epoch": 0.056141083168110194, + "grad_norm": 1.109375, + "learning_rate": 0.0019964320094167176, + "loss": 3.3395, + "step": 807 + }, + { + "epoch": 0.0562106508052454, + "grad_norm": 1.9296875, + "learning_rate": 0.001996412966161004, + "loss": 3.2299, + "step": 808 + }, + { + "epoch": 0.056280218442380604, + "grad_norm": 1.3046875, + "learning_rate": 0.0019963938723124622, + "loss": 3.4256, + "step": 809 + }, + { + "epoch": 0.05634978607951581, + "grad_norm": 1.25, + "learning_rate": 0.001996374727872062, + "loss": 3.3638, + "step": 810 + }, + { + "epoch": 0.056419353716651015, + "grad_norm": 1.2109375, + "learning_rate": 0.001996355532840775, + "loss": 3.8451, + "step": 811 + }, + { + "epoch": 0.05648892135378622, + "grad_norm": 1.09375, + "learning_rate": 0.001996336287219576, + "loss": 3.2477, + "step": 812 + }, + { + "epoch": 0.056558488990921425, + "grad_norm": 1.4921875, + "learning_rate": 0.0019963169910094426, + "loss": 3.607, + "step": 813 + }, + { + "epoch": 0.05662805662805663, + "grad_norm": 1.359375, + "learning_rate": 0.0019962976442113537, + "loss": 3.2908, + "step": 814 + }, + { + "epoch": 0.056697624265191836, + "grad_norm": 1.09375, + "learning_rate": 0.0019962782468262927, + "loss": 3.3164, + "step": 815 + }, + { + "epoch": 0.056767191902327034, + "grad_norm": 1.0859375, + "learning_rate": 0.0019962587988552436, + "loss": 3.2608, + "step": 816 + }, + { + "epoch": 0.05683675953946224, + "grad_norm": 1.625, + "learning_rate": 0.0019962393002991943, + "loss": 2.8271, + "step": 817 + }, + { + "epoch": 0.056906327176597445, + "grad_norm": 1.7109375, + "learning_rate": 0.0019962197511591345, + "loss": 3.2975, + "step": 818 + }, + { + "epoch": 0.05697589481373265, + "grad_norm": 1.4921875, + "learning_rate": 0.0019962001514360573, + "loss": 3.4618, + "step": 819 + }, + { + "epoch": 0.057045462450867855, + "grad_norm": 1.3359375, + "learning_rate": 0.0019961805011309577, + "loss": 3.2017, + "step": 820 + }, + { + "epoch": 0.05711503008800306, + "grad_norm": 1.453125, + "learning_rate": 0.0019961608002448334, + "loss": 3.4054, + "step": 821 + }, + { + "epoch": 0.057184597725138266, + "grad_norm": 1.203125, + "learning_rate": 0.0019961410487786845, + "loss": 3.5605, + "step": 822 + }, + { + "epoch": 0.05725416536227347, + "grad_norm": 0.921875, + "learning_rate": 0.0019961212467335143, + "loss": 3.4814, + "step": 823 + }, + { + "epoch": 0.057323732999408676, + "grad_norm": 1.203125, + "learning_rate": 0.0019961013941103274, + "loss": 3.1413, + "step": 824 + }, + { + "epoch": 0.05739330063654388, + "grad_norm": 1.9921875, + "learning_rate": 0.001996081490910133, + "loss": 3.4585, + "step": 825 + }, + { + "epoch": 0.05746286827367909, + "grad_norm": 2.84375, + "learning_rate": 0.001996061537133941, + "loss": 3.4995, + "step": 826 + }, + { + "epoch": 0.05753243591081429, + "grad_norm": 1.5703125, + "learning_rate": 0.0019960415327827646, + "loss": 3.4805, + "step": 827 + }, + { + "epoch": 0.0576020035479495, + "grad_norm": 1.078125, + "learning_rate": 0.0019960214778576195, + "loss": 3.0566, + "step": 828 + }, + { + "epoch": 0.057671571185084695, + "grad_norm": 1.2265625, + "learning_rate": 0.001996001372359524, + "loss": 3.0971, + "step": 829 + }, + { + "epoch": 0.0577411388222199, + "grad_norm": 1.265625, + "learning_rate": 0.0019959812162894997, + "loss": 3.596, + "step": 830 + }, + { + "epoch": 0.057810706459355106, + "grad_norm": 0.9609375, + "learning_rate": 0.001995961009648569, + "loss": 3.5438, + "step": 831 + }, + { + "epoch": 0.05788027409649031, + "grad_norm": 0.75, + "learning_rate": 0.001995940752437758, + "loss": 3.7194, + "step": 832 + }, + { + "epoch": 0.057949841733625516, + "grad_norm": 1.2578125, + "learning_rate": 0.0019959204446580955, + "loss": 3.187, + "step": 833 + }, + { + "epoch": 0.05801940937076072, + "grad_norm": 1.4609375, + "learning_rate": 0.001995900086310613, + "loss": 3.2581, + "step": 834 + }, + { + "epoch": 0.05808897700789593, + "grad_norm": 1.375, + "learning_rate": 0.0019958796773963433, + "loss": 3.3424, + "step": 835 + }, + { + "epoch": 0.05815854464503113, + "grad_norm": 1.125, + "learning_rate": 0.0019958592179163234, + "loss": 3.5114, + "step": 836 + }, + { + "epoch": 0.05822811228216634, + "grad_norm": 1.1953125, + "learning_rate": 0.0019958387078715923, + "loss": 3.2554, + "step": 837 + }, + { + "epoch": 0.05829767991930154, + "grad_norm": 1.6796875, + "learning_rate": 0.0019958181472631907, + "loss": 3.3679, + "step": 838 + }, + { + "epoch": 0.05836724755643675, + "grad_norm": 1.015625, + "learning_rate": 0.001995797536092163, + "loss": 3.3486, + "step": 839 + }, + { + "epoch": 0.05843681519357195, + "grad_norm": 1.2421875, + "learning_rate": 0.001995776874359555, + "loss": 3.0417, + "step": 840 + }, + { + "epoch": 0.05850638283070716, + "grad_norm": 1.5546875, + "learning_rate": 0.001995756162066417, + "loss": 3.4033, + "step": 841 + }, + { + "epoch": 0.05857595046784236, + "grad_norm": 1.3203125, + "learning_rate": 0.0019957353992138003, + "loss": 3.1399, + "step": 842 + }, + { + "epoch": 0.05864551810497756, + "grad_norm": 1.0234375, + "learning_rate": 0.0019957145858027587, + "loss": 3.3802, + "step": 843 + }, + { + "epoch": 0.05871508574211277, + "grad_norm": 1.1328125, + "learning_rate": 0.001995693721834349, + "loss": 3.0986, + "step": 844 + }, + { + "epoch": 0.05878465337924797, + "grad_norm": 0.98828125, + "learning_rate": 0.001995672807309631, + "loss": 3.2547, + "step": 845 + }, + { + "epoch": 0.05885422101638318, + "grad_norm": 0.95703125, + "learning_rate": 0.001995651842229666, + "loss": 3.1011, + "step": 846 + }, + { + "epoch": 0.05892378865351838, + "grad_norm": 1.3046875, + "learning_rate": 0.0019956308265955194, + "loss": 3.4335, + "step": 847 + }, + { + "epoch": 0.05899335629065359, + "grad_norm": 1.3203125, + "learning_rate": 0.0019956097604082574, + "loss": 2.7745, + "step": 848 + }, + { + "epoch": 0.059062923927788794, + "grad_norm": 1.2578125, + "learning_rate": 0.00199558864366895, + "loss": 3.4019, + "step": 849 + }, + { + "epoch": 0.059132491564924, + "grad_norm": 1.375, + "learning_rate": 0.0019955674763786698, + "loss": 3.3095, + "step": 850 + }, + { + "epoch": 0.059202059202059204, + "grad_norm": 1.3203125, + "learning_rate": 0.001995546258538491, + "loss": 3.3396, + "step": 851 + }, + { + "epoch": 0.05927162683919441, + "grad_norm": 1.1875, + "learning_rate": 0.001995524990149491, + "loss": 3.4062, + "step": 852 + }, + { + "epoch": 0.059341194476329615, + "grad_norm": 1.4453125, + "learning_rate": 0.00199550367121275, + "loss": 3.2541, + "step": 853 + }, + { + "epoch": 0.05941076211346481, + "grad_norm": 2.4375, + "learning_rate": 0.00199548230172935, + "loss": 3.0319, + "step": 854 + }, + { + "epoch": 0.05948032975060002, + "grad_norm": 1.8359375, + "learning_rate": 0.001995460881700377, + "loss": 3.1415, + "step": 855 + }, + { + "epoch": 0.05954989738773522, + "grad_norm": 1.375, + "learning_rate": 0.001995439411126917, + "loss": 3.4585, + "step": 856 + }, + { + "epoch": 0.05961946502487043, + "grad_norm": 1.0703125, + "learning_rate": 0.0019954178900100615, + "loss": 3.2141, + "step": 857 + }, + { + "epoch": 0.059689032662005634, + "grad_norm": 1.1484375, + "learning_rate": 0.001995396318350903, + "loss": 3.3658, + "step": 858 + }, + { + "epoch": 0.05975860029914084, + "grad_norm": 1.1875, + "learning_rate": 0.0019953746961505364, + "loss": 3.5806, + "step": 859 + }, + { + "epoch": 0.059828167936276044, + "grad_norm": 1.296875, + "learning_rate": 0.00199535302341006, + "loss": 3.4283, + "step": 860 + }, + { + "epoch": 0.05989773557341125, + "grad_norm": 1.1953125, + "learning_rate": 0.0019953313001305735, + "loss": 3.2957, + "step": 861 + }, + { + "epoch": 0.059967303210546455, + "grad_norm": 1.1953125, + "learning_rate": 0.001995309526313181, + "loss": 3.1868, + "step": 862 + }, + { + "epoch": 0.06003687084768166, + "grad_norm": 1.296875, + "learning_rate": 0.001995287701958987, + "loss": 3.5114, + "step": 863 + }, + { + "epoch": 0.060106438484816865, + "grad_norm": 1.453125, + "learning_rate": 0.0019952658270691007, + "loss": 3.2758, + "step": 864 + }, + { + "epoch": 0.06017600612195207, + "grad_norm": 1.0625, + "learning_rate": 0.001995243901644632, + "loss": 3.0768, + "step": 865 + }, + { + "epoch": 0.060245573759087276, + "grad_norm": 2.046875, + "learning_rate": 0.0019952219256866945, + "loss": 3.4132, + "step": 866 + }, + { + "epoch": 0.060315141396222474, + "grad_norm": 1.0625, + "learning_rate": 0.0019951998991964036, + "loss": 3.3159, + "step": 867 + }, + { + "epoch": 0.06038470903335768, + "grad_norm": 1.1015625, + "learning_rate": 0.001995177822174878, + "loss": 3.1059, + "step": 868 + }, + { + "epoch": 0.060454276670492885, + "grad_norm": 1.09375, + "learning_rate": 0.0019951556946232385, + "loss": 3.3609, + "step": 869 + }, + { + "epoch": 0.06052384430762809, + "grad_norm": 1.015625, + "learning_rate": 0.001995133516542609, + "loss": 3.1245, + "step": 870 + }, + { + "epoch": 0.060593411944763295, + "grad_norm": 2.46875, + "learning_rate": 0.0019951112879341157, + "loss": 3.4607, + "step": 871 + }, + { + "epoch": 0.0606629795818985, + "grad_norm": 1.1171875, + "learning_rate": 0.0019950890087988868, + "loss": 3.9611, + "step": 872 + }, + { + "epoch": 0.060732547219033706, + "grad_norm": 1.1015625, + "learning_rate": 0.0019950666791380533, + "loss": 3.0612, + "step": 873 + }, + { + "epoch": 0.06080211485616891, + "grad_norm": 0.86328125, + "learning_rate": 0.0019950442989527493, + "loss": 3.4439, + "step": 874 + }, + { + "epoch": 0.060871682493304116, + "grad_norm": 1.1796875, + "learning_rate": 0.0019950218682441116, + "loss": 3.4221, + "step": 875 + }, + { + "epoch": 0.06094125013043932, + "grad_norm": 1.078125, + "learning_rate": 0.0019949993870132785, + "loss": 3.5011, + "step": 876 + }, + { + "epoch": 0.06101081776757453, + "grad_norm": 1.2890625, + "learning_rate": 0.001994976855261392, + "loss": 3.1799, + "step": 877 + }, + { + "epoch": 0.06108038540470973, + "grad_norm": 1.0234375, + "learning_rate": 0.0019949542729895955, + "loss": 3.4673, + "step": 878 + }, + { + "epoch": 0.06114995304184494, + "grad_norm": 1.6015625, + "learning_rate": 0.001994931640199036, + "loss": 3.3861, + "step": 879 + }, + { + "epoch": 0.061219520678980135, + "grad_norm": 1.0078125, + "learning_rate": 0.0019949089568908627, + "loss": 3.7525, + "step": 880 + }, + { + "epoch": 0.06128908831611534, + "grad_norm": 1.2421875, + "learning_rate": 0.001994886223066227, + "loss": 3.5964, + "step": 881 + }, + { + "epoch": 0.061358655953250546, + "grad_norm": 1.078125, + "learning_rate": 0.001994863438726284, + "loss": 3.6425, + "step": 882 + }, + { + "epoch": 0.06142822359038575, + "grad_norm": 1.3671875, + "learning_rate": 0.0019948406038721896, + "loss": 3.5512, + "step": 883 + }, + { + "epoch": 0.061497791227520957, + "grad_norm": 1.359375, + "learning_rate": 0.001994817718505104, + "loss": 3.445, + "step": 884 + }, + { + "epoch": 0.06156735886465616, + "grad_norm": 1.6875, + "learning_rate": 0.001994794782626189, + "loss": 2.9935, + "step": 885 + }, + { + "epoch": 0.06163692650179137, + "grad_norm": 1.1953125, + "learning_rate": 0.0019947717962366085, + "loss": 3.5828, + "step": 886 + }, + { + "epoch": 0.06170649413892657, + "grad_norm": 1.0703125, + "learning_rate": 0.001994748759337531, + "loss": 3.1945, + "step": 887 + }, + { + "epoch": 0.06177606177606178, + "grad_norm": 1.296875, + "learning_rate": 0.001994725671930125, + "loss": 3.4912, + "step": 888 + }, + { + "epoch": 0.06184562941319698, + "grad_norm": 1.3125, + "learning_rate": 0.001994702534015563, + "loss": 3.0837, + "step": 889 + }, + { + "epoch": 0.06191519705033219, + "grad_norm": 1.0703125, + "learning_rate": 0.00199467934559502, + "loss": 3.4557, + "step": 890 + }, + { + "epoch": 0.06198476468746739, + "grad_norm": 1.0703125, + "learning_rate": 0.001994656106669674, + "loss": 3.282, + "step": 891 + }, + { + "epoch": 0.06205433232460259, + "grad_norm": 1.2421875, + "learning_rate": 0.0019946328172407036, + "loss": 3.8112, + "step": 892 + }, + { + "epoch": 0.0621238999617378, + "grad_norm": 1.1796875, + "learning_rate": 0.0019946094773092924, + "loss": 3.429, + "step": 893 + }, + { + "epoch": 0.062193467598873, + "grad_norm": 1.078125, + "learning_rate": 0.001994586086876625, + "loss": 3.5757, + "step": 894 + }, + { + "epoch": 0.06226303523600821, + "grad_norm": 1.125, + "learning_rate": 0.0019945626459438896, + "loss": 3.4571, + "step": 895 + }, + { + "epoch": 0.06233260287314341, + "grad_norm": 1.1015625, + "learning_rate": 0.0019945391545122754, + "loss": 2.874, + "step": 896 + }, + { + "epoch": 0.06240217051027862, + "grad_norm": 1.1640625, + "learning_rate": 0.001994515612582976, + "loss": 2.7974, + "step": 897 + }, + { + "epoch": 0.06247173814741382, + "grad_norm": 1.3671875, + "learning_rate": 0.0019944920201571867, + "loss": 3.2491, + "step": 898 + }, + { + "epoch": 0.06254130578454903, + "grad_norm": 1.1953125, + "learning_rate": 0.0019944683772361053, + "loss": 3.3027, + "step": 899 + }, + { + "epoch": 0.06261087342168423, + "grad_norm": 1.359375, + "learning_rate": 0.001994444683820932, + "loss": 3.3587, + "step": 900 + }, + { + "epoch": 0.06268044105881944, + "grad_norm": 1.171875, + "learning_rate": 0.00199442093991287, + "loss": 3.7572, + "step": 901 + }, + { + "epoch": 0.06275000869595464, + "grad_norm": 0.97265625, + "learning_rate": 0.001994397145513125, + "loss": 3.4524, + "step": 902 + }, + { + "epoch": 0.06281957633308985, + "grad_norm": 1.3984375, + "learning_rate": 0.0019943733006229053, + "loss": 3.2251, + "step": 903 + }, + { + "epoch": 0.06288914397022505, + "grad_norm": 1.0546875, + "learning_rate": 0.001994349405243421, + "loss": 3.4437, + "step": 904 + }, + { + "epoch": 0.06295871160736026, + "grad_norm": 1.2578125, + "learning_rate": 0.001994325459375886, + "loss": 3.3404, + "step": 905 + }, + { + "epoch": 0.06302827924449546, + "grad_norm": 1.1875, + "learning_rate": 0.001994301463021516, + "loss": 3.4911, + "step": 906 + }, + { + "epoch": 0.06309784688163067, + "grad_norm": 1.0234375, + "learning_rate": 0.001994277416181529, + "loss": 3.2754, + "step": 907 + }, + { + "epoch": 0.06316741451876587, + "grad_norm": 1.5625, + "learning_rate": 0.001994253318857147, + "loss": 3.1639, + "step": 908 + }, + { + "epoch": 0.06323698215590108, + "grad_norm": 1.6015625, + "learning_rate": 0.001994229171049592, + "loss": 3.0842, + "step": 909 + }, + { + "epoch": 0.06330654979303628, + "grad_norm": 1.109375, + "learning_rate": 0.001994204972760092, + "loss": 3.7311, + "step": 910 + }, + { + "epoch": 0.06337611743017148, + "grad_norm": 1.171875, + "learning_rate": 0.001994180723989874, + "loss": 3.3897, + "step": 911 + }, + { + "epoch": 0.06344568506730669, + "grad_norm": 1.2734375, + "learning_rate": 0.00199415642474017, + "loss": 3.3411, + "step": 912 + }, + { + "epoch": 0.06351525270444189, + "grad_norm": 1.2265625, + "learning_rate": 0.0019941320750122135, + "loss": 3.3479, + "step": 913 + }, + { + "epoch": 0.0635848203415771, + "grad_norm": 0.796875, + "learning_rate": 0.0019941076748072415, + "loss": 3.6582, + "step": 914 + }, + { + "epoch": 0.0636543879787123, + "grad_norm": 1.1171875, + "learning_rate": 0.001994083224126492, + "loss": 3.0862, + "step": 915 + }, + { + "epoch": 0.06372395561584751, + "grad_norm": 1.2265625, + "learning_rate": 0.001994058722971207, + "loss": 3.3082, + "step": 916 + }, + { + "epoch": 0.06379352325298271, + "grad_norm": 0.9921875, + "learning_rate": 0.0019940341713426306, + "loss": 3.7296, + "step": 917 + }, + { + "epoch": 0.06386309089011792, + "grad_norm": 1.1640625, + "learning_rate": 0.001994009569242009, + "loss": 3.6124, + "step": 918 + }, + { + "epoch": 0.06393265852725312, + "grad_norm": 1.4375, + "learning_rate": 0.001993984916670592, + "loss": 3.1775, + "step": 919 + }, + { + "epoch": 0.06400222616438833, + "grad_norm": 1.2734375, + "learning_rate": 0.001993960213629631, + "loss": 2.9769, + "step": 920 + }, + { + "epoch": 0.06407179380152353, + "grad_norm": 1.1953125, + "learning_rate": 0.0019939354601203802, + "loss": 3.117, + "step": 921 + }, + { + "epoch": 0.06414136143865874, + "grad_norm": 0.99609375, + "learning_rate": 0.0019939106561440963, + "loss": 3.3155, + "step": 922 + }, + { + "epoch": 0.06421092907579394, + "grad_norm": 0.98828125, + "learning_rate": 0.0019938858017020393, + "loss": 3.7916, + "step": 923 + }, + { + "epoch": 0.06428049671292914, + "grad_norm": 1.140625, + "learning_rate": 0.0019938608967954704, + "loss": 3.5828, + "step": 924 + }, + { + "epoch": 0.06435006435006435, + "grad_norm": 1.3125, + "learning_rate": 0.001993835941425655, + "loss": 3.4896, + "step": 925 + }, + { + "epoch": 0.06441963198719955, + "grad_norm": 1.0234375, + "learning_rate": 0.00199381093559386, + "loss": 3.454, + "step": 926 + }, + { + "epoch": 0.06448919962433476, + "grad_norm": 0.94921875, + "learning_rate": 0.0019937858793013545, + "loss": 3.5434, + "step": 927 + }, + { + "epoch": 0.06455876726146996, + "grad_norm": 0.796875, + "learning_rate": 0.001993760772549411, + "loss": 3.5979, + "step": 928 + }, + { + "epoch": 0.06462833489860517, + "grad_norm": 1.0078125, + "learning_rate": 0.0019937356153393046, + "loss": 3.6209, + "step": 929 + }, + { + "epoch": 0.06469790253574037, + "grad_norm": 1.1328125, + "learning_rate": 0.0019937104076723127, + "loss": 3.1411, + "step": 930 + }, + { + "epoch": 0.06476747017287558, + "grad_norm": 1.296875, + "learning_rate": 0.0019936851495497144, + "loss": 3.1043, + "step": 931 + }, + { + "epoch": 0.06483703781001078, + "grad_norm": 1.0859375, + "learning_rate": 0.001993659840972793, + "loss": 3.3624, + "step": 932 + }, + { + "epoch": 0.06490660544714599, + "grad_norm": 1.0859375, + "learning_rate": 0.0019936344819428335, + "loss": 3.3843, + "step": 933 + }, + { + "epoch": 0.06497617308428119, + "grad_norm": 1.1015625, + "learning_rate": 0.001993609072461123, + "loss": 3.1549, + "step": 934 + }, + { + "epoch": 0.0650457407214164, + "grad_norm": 1.25, + "learning_rate": 0.001993583612528952, + "loss": 3.5149, + "step": 935 + }, + { + "epoch": 0.0651153083585516, + "grad_norm": 1.0078125, + "learning_rate": 0.0019935581021476136, + "loss": 3.5026, + "step": 936 + }, + { + "epoch": 0.0651848759956868, + "grad_norm": 1.171875, + "learning_rate": 0.001993532541318402, + "loss": 3.2724, + "step": 937 + }, + { + "epoch": 0.06525444363282201, + "grad_norm": 1.0859375, + "learning_rate": 0.001993506930042616, + "loss": 3.2721, + "step": 938 + }, + { + "epoch": 0.06532401126995721, + "grad_norm": 0.8984375, + "learning_rate": 0.001993481268321556, + "loss": 3.0339, + "step": 939 + }, + { + "epoch": 0.06539357890709242, + "grad_norm": 1.0625, + "learning_rate": 0.0019934555561565244, + "loss": 3.6624, + "step": 940 + }, + { + "epoch": 0.06546314654422762, + "grad_norm": 1.1015625, + "learning_rate": 0.0019934297935488275, + "loss": 3.2563, + "step": 941 + }, + { + "epoch": 0.06553271418136283, + "grad_norm": 1.1953125, + "learning_rate": 0.0019934039804997724, + "loss": 3.1389, + "step": 942 + }, + { + "epoch": 0.06560228181849803, + "grad_norm": 1.484375, + "learning_rate": 0.0019933781170106703, + "loss": 3.7684, + "step": 943 + }, + { + "epoch": 0.06567184945563324, + "grad_norm": 1.25, + "learning_rate": 0.0019933522030828347, + "loss": 3.2817, + "step": 944 + }, + { + "epoch": 0.06574141709276844, + "grad_norm": 0.82421875, + "learning_rate": 0.0019933262387175814, + "loss": 3.4684, + "step": 945 + }, + { + "epoch": 0.06581098472990365, + "grad_norm": 1.203125, + "learning_rate": 0.001993300223916228, + "loss": 3.2135, + "step": 946 + }, + { + "epoch": 0.06588055236703885, + "grad_norm": 1.125, + "learning_rate": 0.0019932741586800957, + "loss": 3.4436, + "step": 947 + }, + { + "epoch": 0.06595012000417406, + "grad_norm": 1.1875, + "learning_rate": 0.0019932480430105083, + "loss": 3.4002, + "step": 948 + }, + { + "epoch": 0.06601968764130926, + "grad_norm": 0.921875, + "learning_rate": 0.0019932218769087916, + "loss": 3.8864, + "step": 949 + }, + { + "epoch": 0.06608925527844446, + "grad_norm": 1.34375, + "learning_rate": 0.001993195660376274, + "loss": 2.9565, + "step": 950 + }, + { + "epoch": 0.06615882291557967, + "grad_norm": 1.28125, + "learning_rate": 0.001993169393414287, + "loss": 3.2403, + "step": 951 + }, + { + "epoch": 0.06622839055271487, + "grad_norm": 1.1171875, + "learning_rate": 0.001993143076024164, + "loss": 3.5741, + "step": 952 + }, + { + "epoch": 0.06629795818985008, + "grad_norm": 1.1875, + "learning_rate": 0.001993116708207242, + "loss": 3.2992, + "step": 953 + }, + { + "epoch": 0.06636752582698528, + "grad_norm": 1.3046875, + "learning_rate": 0.001993090289964859, + "loss": 3.3092, + "step": 954 + }, + { + "epoch": 0.0664370934641205, + "grad_norm": 1.859375, + "learning_rate": 0.0019930638212983564, + "loss": 3.2502, + "step": 955 + }, + { + "epoch": 0.06650666110125569, + "grad_norm": 0.91796875, + "learning_rate": 0.0019930373022090785, + "loss": 3.2321, + "step": 956 + }, + { + "epoch": 0.0665762287383909, + "grad_norm": 1.1171875, + "learning_rate": 0.0019930107326983715, + "loss": 3.2501, + "step": 957 + }, + { + "epoch": 0.0666457963755261, + "grad_norm": 0.9921875, + "learning_rate": 0.0019929841127675845, + "loss": 3.5014, + "step": 958 + }, + { + "epoch": 0.06671536401266132, + "grad_norm": 0.875, + "learning_rate": 0.0019929574424180697, + "loss": 3.1929, + "step": 959 + }, + { + "epoch": 0.06678493164979651, + "grad_norm": 1.625, + "learning_rate": 0.0019929307216511806, + "loss": 3.5224, + "step": 960 + }, + { + "epoch": 0.06685449928693173, + "grad_norm": 1.0234375, + "learning_rate": 0.0019929039504682743, + "loss": 3.7231, + "step": 961 + }, + { + "epoch": 0.06692406692406692, + "grad_norm": 1.4765625, + "learning_rate": 0.0019928771288707098, + "loss": 3.4419, + "step": 962 + }, + { + "epoch": 0.06699363456120212, + "grad_norm": 1.1796875, + "learning_rate": 0.0019928502568598494, + "loss": 3.5337, + "step": 963 + }, + { + "epoch": 0.06706320219833733, + "grad_norm": 1.6875, + "learning_rate": 0.0019928233344370574, + "loss": 3.2093, + "step": 964 + }, + { + "epoch": 0.06713276983547253, + "grad_norm": 1.09375, + "learning_rate": 0.0019927963616037003, + "loss": 3.0913, + "step": 965 + }, + { + "epoch": 0.06720233747260775, + "grad_norm": 1.265625, + "learning_rate": 0.001992769338361148, + "loss": 3.4042, + "step": 966 + }, + { + "epoch": 0.06727190510974294, + "grad_norm": 0.89453125, + "learning_rate": 0.001992742264710773, + "loss": 3.6867, + "step": 967 + }, + { + "epoch": 0.06734147274687816, + "grad_norm": 1.453125, + "learning_rate": 0.0019927151406539494, + "loss": 3.4302, + "step": 968 + }, + { + "epoch": 0.06741104038401335, + "grad_norm": 0.9609375, + "learning_rate": 0.0019926879661920547, + "loss": 3.4974, + "step": 969 + }, + { + "epoch": 0.06748060802114857, + "grad_norm": 1.0703125, + "learning_rate": 0.0019926607413264684, + "loss": 3.2897, + "step": 970 + }, + { + "epoch": 0.06755017565828376, + "grad_norm": 1.0625, + "learning_rate": 0.0019926334660585734, + "loss": 3.2806, + "step": 971 + }, + { + "epoch": 0.06761974329541898, + "grad_norm": 1.125, + "learning_rate": 0.0019926061403897537, + "loss": 3.3603, + "step": 972 + }, + { + "epoch": 0.06768931093255418, + "grad_norm": 1.125, + "learning_rate": 0.001992578764321398, + "loss": 3.0188, + "step": 973 + }, + { + "epoch": 0.06775887856968937, + "grad_norm": 0.89453125, + "learning_rate": 0.001992551337854895, + "loss": 3.3528, + "step": 974 + }, + { + "epoch": 0.06782844620682459, + "grad_norm": 1.015625, + "learning_rate": 0.0019925238609916377, + "loss": 3.1065, + "step": 975 + }, + { + "epoch": 0.06789801384395978, + "grad_norm": 1.0078125, + "learning_rate": 0.0019924963337330224, + "loss": 3.5713, + "step": 976 + }, + { + "epoch": 0.067967581481095, + "grad_norm": 1.1171875, + "learning_rate": 0.001992468756080445, + "loss": 3.1457, + "step": 977 + }, + { + "epoch": 0.0680371491182302, + "grad_norm": 0.984375, + "learning_rate": 0.001992441128035307, + "loss": 3.346, + "step": 978 + }, + { + "epoch": 0.0681067167553654, + "grad_norm": 0.95703125, + "learning_rate": 0.0019924134495990105, + "loss": 3.6706, + "step": 979 + }, + { + "epoch": 0.0681762843925006, + "grad_norm": 0.9453125, + "learning_rate": 0.0019923857207729614, + "loss": 3.5294, + "step": 980 + }, + { + "epoch": 0.06824585202963582, + "grad_norm": 1.171875, + "learning_rate": 0.0019923579415585674, + "loss": 3.4409, + "step": 981 + }, + { + "epoch": 0.06831541966677102, + "grad_norm": 1.09375, + "learning_rate": 0.0019923301119572387, + "loss": 3.3162, + "step": 982 + }, + { + "epoch": 0.06838498730390623, + "grad_norm": 1.140625, + "learning_rate": 0.0019923022319703887, + "loss": 3.1688, + "step": 983 + }, + { + "epoch": 0.06845455494104143, + "grad_norm": 0.828125, + "learning_rate": 0.001992274301599433, + "loss": 3.2179, + "step": 984 + }, + { + "epoch": 0.06852412257817664, + "grad_norm": 0.87109375, + "learning_rate": 0.0019922463208457896, + "loss": 3.6096, + "step": 985 + }, + { + "epoch": 0.06859369021531184, + "grad_norm": 1.4609375, + "learning_rate": 0.0019922182897108794, + "loss": 3.4342, + "step": 986 + }, + { + "epoch": 0.06866325785244703, + "grad_norm": 1.09375, + "learning_rate": 0.001992190208196126, + "loss": 2.9741, + "step": 987 + }, + { + "epoch": 0.06873282548958225, + "grad_norm": 1.1171875, + "learning_rate": 0.0019921620763029544, + "loss": 3.58, + "step": 988 + }, + { + "epoch": 0.06880239312671745, + "grad_norm": 0.953125, + "learning_rate": 0.0019921338940327936, + "loss": 3.4243, + "step": 989 + }, + { + "epoch": 0.06887196076385266, + "grad_norm": 0.95703125, + "learning_rate": 0.001992105661387074, + "loss": 3.3534, + "step": 990 + }, + { + "epoch": 0.06894152840098786, + "grad_norm": 1.1171875, + "learning_rate": 0.00199207737836723, + "loss": 3.1215, + "step": 991 + }, + { + "epoch": 0.06901109603812307, + "grad_norm": 1.0546875, + "learning_rate": 0.0019920490449746972, + "loss": 3.4135, + "step": 992 + }, + { + "epoch": 0.06908066367525827, + "grad_norm": 1.3671875, + "learning_rate": 0.001992020661210914, + "loss": 3.516, + "step": 993 + }, + { + "epoch": 0.06915023131239348, + "grad_norm": 1.3515625, + "learning_rate": 0.0019919922270773215, + "loss": 2.9574, + "step": 994 + }, + { + "epoch": 0.06921979894952868, + "grad_norm": 1.4453125, + "learning_rate": 0.001991963742575364, + "loss": 3.4767, + "step": 995 + }, + { + "epoch": 0.06928936658666389, + "grad_norm": 1.5625, + "learning_rate": 0.0019919352077064872, + "loss": 3.7818, + "step": 996 + }, + { + "epoch": 0.06935893422379909, + "grad_norm": 1.4765625, + "learning_rate": 0.0019919066224721406, + "loss": 3.3807, + "step": 997 + }, + { + "epoch": 0.0694285018609343, + "grad_norm": 1.1796875, + "learning_rate": 0.0019918779868737754, + "loss": 3.2762, + "step": 998 + }, + { + "epoch": 0.0694980694980695, + "grad_norm": 1.2109375, + "learning_rate": 0.0019918493009128454, + "loss": 3.0363, + "step": 999 + }, + { + "epoch": 0.0695676371352047, + "grad_norm": 0.96875, + "learning_rate": 0.0019918205645908073, + "loss": 3.3782, + "step": 1000 + }, + { + "epoch": 0.06963720477233991, + "grad_norm": 1.25, + "learning_rate": 0.0019917917779091196, + "loss": 3.4582, + "step": 1001 + }, + { + "epoch": 0.0697067724094751, + "grad_norm": 0.99609375, + "learning_rate": 0.0019917629408692447, + "loss": 3.3803, + "step": 1002 + }, + { + "epoch": 0.06977634004661032, + "grad_norm": 1.1640625, + "learning_rate": 0.0019917340534726467, + "loss": 3.3236, + "step": 1003 + }, + { + "epoch": 0.06984590768374552, + "grad_norm": 1.0546875, + "learning_rate": 0.0019917051157207918, + "loss": 3.3535, + "step": 1004 + }, + { + "epoch": 0.06991547532088073, + "grad_norm": 0.77734375, + "learning_rate": 0.00199167612761515, + "loss": 3.5928, + "step": 1005 + }, + { + "epoch": 0.06998504295801593, + "grad_norm": 1.2734375, + "learning_rate": 0.0019916470891571925, + "loss": 3.3844, + "step": 1006 + }, + { + "epoch": 0.07005461059515114, + "grad_norm": 0.94921875, + "learning_rate": 0.0019916180003483946, + "loss": 3.1268, + "step": 1007 + }, + { + "epoch": 0.07012417823228634, + "grad_norm": 1.3203125, + "learning_rate": 0.0019915888611902323, + "loss": 3.4423, + "step": 1008 + }, + { + "epoch": 0.07019374586942155, + "grad_norm": 1.03125, + "learning_rate": 0.001991559671684186, + "loss": 2.9498, + "step": 1009 + }, + { + "epoch": 0.07026331350655675, + "grad_norm": 0.8984375, + "learning_rate": 0.001991530431831737, + "loss": 3.3612, + "step": 1010 + }, + { + "epoch": 0.07033288114369196, + "grad_norm": 1.0703125, + "learning_rate": 0.0019915011416343706, + "loss": 3.0226, + "step": 1011 + }, + { + "epoch": 0.07040244878082716, + "grad_norm": 1.3984375, + "learning_rate": 0.001991471801093574, + "loss": 3.5298, + "step": 1012 + }, + { + "epoch": 0.07047201641796236, + "grad_norm": 0.9921875, + "learning_rate": 0.001991442410210836, + "loss": 3.2802, + "step": 1013 + }, + { + "epoch": 0.07054158405509757, + "grad_norm": 0.9921875, + "learning_rate": 0.0019914129689876502, + "loss": 3.4393, + "step": 1014 + }, + { + "epoch": 0.07061115169223277, + "grad_norm": 1.109375, + "learning_rate": 0.0019913834774255112, + "loss": 3.0369, + "step": 1015 + }, + { + "epoch": 0.07068071932936798, + "grad_norm": 1.28125, + "learning_rate": 0.001991353935525916, + "loss": 3.4006, + "step": 1016 + }, + { + "epoch": 0.07075028696650318, + "grad_norm": 1.234375, + "learning_rate": 0.001991324343290364, + "loss": 3.2334, + "step": 1017 + }, + { + "epoch": 0.07081985460363839, + "grad_norm": 1.0625, + "learning_rate": 0.001991294700720359, + "loss": 3.5597, + "step": 1018 + }, + { + "epoch": 0.07088942224077359, + "grad_norm": 0.99609375, + "learning_rate": 0.001991265007817406, + "loss": 3.2408, + "step": 1019 + }, + { + "epoch": 0.0709589898779088, + "grad_norm": 1.546875, + "learning_rate": 0.001991235264583012, + "loss": 3.2134, + "step": 1020 + }, + { + "epoch": 0.071028557515044, + "grad_norm": 2.0625, + "learning_rate": 0.001991205471018687, + "loss": 3.423, + "step": 1021 + }, + { + "epoch": 0.07109812515217921, + "grad_norm": 0.921875, + "learning_rate": 0.0019911756271259445, + "loss": 3.1471, + "step": 1022 + }, + { + "epoch": 0.07116769278931441, + "grad_norm": 1.15625, + "learning_rate": 0.0019911457329062996, + "loss": 3.37, + "step": 1023 + }, + { + "epoch": 0.07123726042644962, + "grad_norm": 1.2421875, + "learning_rate": 0.0019911157883612703, + "loss": 3.5813, + "step": 1024 + }, + { + "epoch": 0.07130682806358482, + "grad_norm": 1.0390625, + "learning_rate": 0.0019910857934923765, + "loss": 3.6288, + "step": 1025 + }, + { + "epoch": 0.07137639570072002, + "grad_norm": 1.703125, + "learning_rate": 0.001991055748301142, + "loss": 3.3881, + "step": 1026 + }, + { + "epoch": 0.07144596333785523, + "grad_norm": 0.87890625, + "learning_rate": 0.0019910256527890914, + "loss": 3.1126, + "step": 1027 + }, + { + "epoch": 0.07151553097499043, + "grad_norm": 1.15625, + "learning_rate": 0.0019909955069577533, + "loss": 3.278, + "step": 1028 + }, + { + "epoch": 0.07158509861212564, + "grad_norm": 0.88671875, + "learning_rate": 0.001990965310808659, + "loss": 3.2712, + "step": 1029 + }, + { + "epoch": 0.07165466624926084, + "grad_norm": 1.3203125, + "learning_rate": 0.0019909350643433402, + "loss": 3.1583, + "step": 1030 + }, + { + "epoch": 0.07172423388639605, + "grad_norm": 1.0, + "learning_rate": 0.0019909047675633344, + "loss": 3.6229, + "step": 1031 + }, + { + "epoch": 0.07179380152353125, + "grad_norm": 1.234375, + "learning_rate": 0.0019908744204701783, + "loss": 3.1285, + "step": 1032 + }, + { + "epoch": 0.07186336916066646, + "grad_norm": 0.8359375, + "learning_rate": 0.0019908440230654136, + "loss": 3.3274, + "step": 1033 + }, + { + "epoch": 0.07193293679780166, + "grad_norm": 0.9765625, + "learning_rate": 0.001990813575350584, + "loss": 3.6442, + "step": 1034 + }, + { + "epoch": 0.07200250443493687, + "grad_norm": 0.91796875, + "learning_rate": 0.0019907830773272348, + "loss": 3.8308, + "step": 1035 + }, + { + "epoch": 0.07207207207207207, + "grad_norm": 0.91015625, + "learning_rate": 0.001990752528996915, + "loss": 3.1517, + "step": 1036 + }, + { + "epoch": 0.07214163970920728, + "grad_norm": 1.015625, + "learning_rate": 0.0019907219303611757, + "loss": 3.2052, + "step": 1037 + }, + { + "epoch": 0.07221120734634248, + "grad_norm": 1.53125, + "learning_rate": 0.0019906912814215702, + "loss": 3.0487, + "step": 1038 + }, + { + "epoch": 0.07228077498347768, + "grad_norm": 1.0234375, + "learning_rate": 0.0019906605821796547, + "loss": 3.6735, + "step": 1039 + }, + { + "epoch": 0.07235034262061289, + "grad_norm": 1.0390625, + "learning_rate": 0.0019906298326369887, + "loss": 3.3629, + "step": 1040 + }, + { + "epoch": 0.07241991025774809, + "grad_norm": 1.0625, + "learning_rate": 0.001990599032795132, + "loss": 3.1491, + "step": 1041 + }, + { + "epoch": 0.0724894778948833, + "grad_norm": 0.96875, + "learning_rate": 0.0019905681826556504, + "loss": 3.3664, + "step": 1042 + }, + { + "epoch": 0.0725590455320185, + "grad_norm": 0.7421875, + "learning_rate": 0.001990537282220109, + "loss": 3.2537, + "step": 1043 + }, + { + "epoch": 0.07262861316915371, + "grad_norm": 0.984375, + "learning_rate": 0.0019905063314900767, + "loss": 3.6151, + "step": 1044 + }, + { + "epoch": 0.07269818080628891, + "grad_norm": 0.9296875, + "learning_rate": 0.0019904753304671257, + "loss": 3.6813, + "step": 1045 + }, + { + "epoch": 0.07276774844342412, + "grad_norm": 1.078125, + "learning_rate": 0.00199044427915283, + "loss": 3.1794, + "step": 1046 + }, + { + "epoch": 0.07283731608055932, + "grad_norm": 0.8359375, + "learning_rate": 0.0019904131775487655, + "loss": 3.7046, + "step": 1047 + }, + { + "epoch": 0.07290688371769453, + "grad_norm": 1.0703125, + "learning_rate": 0.0019903820256565122, + "loss": 3.285, + "step": 1048 + }, + { + "epoch": 0.07297645135482973, + "grad_norm": 1.3203125, + "learning_rate": 0.0019903508234776516, + "loss": 3.1065, + "step": 1049 + }, + { + "epoch": 0.07304601899196493, + "grad_norm": 0.94140625, + "learning_rate": 0.001990319571013768, + "loss": 3.3196, + "step": 1050 + }, + { + "epoch": 0.07311558662910014, + "grad_norm": 1.0859375, + "learning_rate": 0.001990288268266448, + "loss": 3.2582, + "step": 1051 + }, + { + "epoch": 0.07318515426623534, + "grad_norm": 1.0390625, + "learning_rate": 0.0019902569152372806, + "loss": 3.3076, + "step": 1052 + }, + { + "epoch": 0.07325472190337055, + "grad_norm": 1.09375, + "learning_rate": 0.001990225511927859, + "loss": 2.9895, + "step": 1053 + }, + { + "epoch": 0.07332428954050575, + "grad_norm": 1.0546875, + "learning_rate": 0.001990194058339777, + "loss": 2.9849, + "step": 1054 + }, + { + "epoch": 0.07339385717764096, + "grad_norm": 1.125, + "learning_rate": 0.0019901625544746313, + "loss": 3.5619, + "step": 1055 + }, + { + "epoch": 0.07346342481477616, + "grad_norm": 0.875, + "learning_rate": 0.0019901310003340223, + "loss": 3.2445, + "step": 1056 + }, + { + "epoch": 0.07353299245191137, + "grad_norm": 0.90625, + "learning_rate": 0.001990099395919552, + "loss": 3.4955, + "step": 1057 + }, + { + "epoch": 0.07360256008904657, + "grad_norm": 1.0390625, + "learning_rate": 0.0019900677412328237, + "loss": 3.0674, + "step": 1058 + }, + { + "epoch": 0.07367212772618179, + "grad_norm": 0.91796875, + "learning_rate": 0.0019900360362754468, + "loss": 3.2239, + "step": 1059 + }, + { + "epoch": 0.07374169536331698, + "grad_norm": 0.95703125, + "learning_rate": 0.0019900042810490296, + "loss": 3.0941, + "step": 1060 + }, + { + "epoch": 0.0738112630004522, + "grad_norm": 1.1015625, + "learning_rate": 0.0019899724755551855, + "loss": 3.154, + "step": 1061 + }, + { + "epoch": 0.0738808306375874, + "grad_norm": 1.1171875, + "learning_rate": 0.0019899406197955286, + "loss": 3.3571, + "step": 1062 + }, + { + "epoch": 0.07395039827472259, + "grad_norm": 1.65625, + "learning_rate": 0.0019899087137716766, + "loss": 3.3969, + "step": 1063 + }, + { + "epoch": 0.0740199659118578, + "grad_norm": 1.484375, + "learning_rate": 0.0019898767574852497, + "loss": 3.2255, + "step": 1064 + }, + { + "epoch": 0.074089533548993, + "grad_norm": 0.8828125, + "learning_rate": 0.0019898447509378706, + "loss": 3.3027, + "step": 1065 + }, + { + "epoch": 0.07415910118612822, + "grad_norm": 1.0859375, + "learning_rate": 0.001989812694131164, + "loss": 3.6268, + "step": 1066 + }, + { + "epoch": 0.07422866882326341, + "grad_norm": 0.921875, + "learning_rate": 0.001989780587066758, + "loss": 3.3741, + "step": 1067 + }, + { + "epoch": 0.07429823646039863, + "grad_norm": 0.98828125, + "learning_rate": 0.0019897484297462828, + "loss": 2.8676, + "step": 1068 + }, + { + "epoch": 0.07436780409753382, + "grad_norm": 0.9375, + "learning_rate": 0.0019897162221713706, + "loss": 3.2567, + "step": 1069 + }, + { + "epoch": 0.07443737173466904, + "grad_norm": 1.109375, + "learning_rate": 0.0019896839643436573, + "loss": 3.2005, + "step": 1070 + }, + { + "epoch": 0.07450693937180423, + "grad_norm": 1.1640625, + "learning_rate": 0.001989651656264781, + "loss": 3.1738, + "step": 1071 + }, + { + "epoch": 0.07457650700893945, + "grad_norm": 1.078125, + "learning_rate": 0.0019896192979363815, + "loss": 3.3275, + "step": 1072 + }, + { + "epoch": 0.07464607464607464, + "grad_norm": 0.9765625, + "learning_rate": 0.0019895868893601023, + "loss": 3.5421, + "step": 1073 + }, + { + "epoch": 0.07471564228320986, + "grad_norm": 1.2109375, + "learning_rate": 0.0019895544305375884, + "loss": 3.0984, + "step": 1074 + }, + { + "epoch": 0.07478520992034506, + "grad_norm": 1.15625, + "learning_rate": 0.0019895219214704886, + "loss": 3.1449, + "step": 1075 + }, + { + "epoch": 0.07485477755748025, + "grad_norm": 1.234375, + "learning_rate": 0.001989489362160453, + "loss": 3.1699, + "step": 1076 + }, + { + "epoch": 0.07492434519461547, + "grad_norm": 0.9765625, + "learning_rate": 0.0019894567526091353, + "loss": 3.2374, + "step": 1077 + }, + { + "epoch": 0.07499391283175066, + "grad_norm": 0.75390625, + "learning_rate": 0.0019894240928181907, + "loss": 3.3657, + "step": 1078 + }, + { + "epoch": 0.07506348046888588, + "grad_norm": 1.390625, + "learning_rate": 0.0019893913827892773, + "loss": 3.5596, + "step": 1079 + }, + { + "epoch": 0.07513304810602107, + "grad_norm": 0.95703125, + "learning_rate": 0.001989358622524057, + "loss": 3.4357, + "step": 1080 + }, + { + "epoch": 0.07520261574315629, + "grad_norm": 1.1484375, + "learning_rate": 0.0019893258120241924, + "loss": 3.1769, + "step": 1081 + }, + { + "epoch": 0.07527218338029149, + "grad_norm": 1.2109375, + "learning_rate": 0.0019892929512913497, + "loss": 3.4845, + "step": 1082 + }, + { + "epoch": 0.0753417510174267, + "grad_norm": 1.4296875, + "learning_rate": 0.001989260040327197, + "loss": 3.3399, + "step": 1083 + }, + { + "epoch": 0.0754113186545619, + "grad_norm": 1.2265625, + "learning_rate": 0.001989227079133406, + "loss": 3.1975, + "step": 1084 + }, + { + "epoch": 0.07548088629169711, + "grad_norm": 1.140625, + "learning_rate": 0.00198919406771165, + "loss": 2.9935, + "step": 1085 + }, + { + "epoch": 0.0755504539288323, + "grad_norm": 1.2578125, + "learning_rate": 0.001989161006063605, + "loss": 3.0267, + "step": 1086 + }, + { + "epoch": 0.07562002156596752, + "grad_norm": 1.4375, + "learning_rate": 0.0019891278941909503, + "loss": 3.2466, + "step": 1087 + }, + { + "epoch": 0.07568958920310272, + "grad_norm": 1.0, + "learning_rate": 0.001989094732095366, + "loss": 3.3007, + "step": 1088 + }, + { + "epoch": 0.07575915684023792, + "grad_norm": 1.1484375, + "learning_rate": 0.001989061519778537, + "loss": 3.4514, + "step": 1089 + }, + { + "epoch": 0.07582872447737313, + "grad_norm": 0.93359375, + "learning_rate": 0.0019890282572421493, + "loss": 3.48, + "step": 1090 + }, + { + "epoch": 0.07589829211450833, + "grad_norm": 1.09375, + "learning_rate": 0.0019889949444878915, + "loss": 3.2318, + "step": 1091 + }, + { + "epoch": 0.07596785975164354, + "grad_norm": 1.53125, + "learning_rate": 0.0019889615815174557, + "loss": 3.0235, + "step": 1092 + }, + { + "epoch": 0.07603742738877874, + "grad_norm": 1.171875, + "learning_rate": 0.001988928168332535, + "loss": 3.2355, + "step": 1093 + }, + { + "epoch": 0.07610699502591395, + "grad_norm": 0.890625, + "learning_rate": 0.0019888947049348273, + "loss": 3.3776, + "step": 1094 + }, + { + "epoch": 0.07617656266304915, + "grad_norm": 1.1875, + "learning_rate": 0.0019888611913260303, + "loss": 3.1374, + "step": 1095 + }, + { + "epoch": 0.07624613030018436, + "grad_norm": 1.3125, + "learning_rate": 0.0019888276275078463, + "loss": 3.3041, + "step": 1096 + }, + { + "epoch": 0.07631569793731956, + "grad_norm": 1.0078125, + "learning_rate": 0.0019887940134819793, + "loss": 3.3709, + "step": 1097 + }, + { + "epoch": 0.07638526557445477, + "grad_norm": 0.98828125, + "learning_rate": 0.0019887603492501366, + "loss": 3.4368, + "step": 1098 + }, + { + "epoch": 0.07645483321158997, + "grad_norm": 0.87890625, + "learning_rate": 0.0019887266348140266, + "loss": 3.4639, + "step": 1099 + }, + { + "epoch": 0.07652440084872518, + "grad_norm": 1.0625, + "learning_rate": 0.001988692870175362, + "loss": 3.2643, + "step": 1100 + }, + { + "epoch": 0.07659396848586038, + "grad_norm": 1.171875, + "learning_rate": 0.0019886590553358564, + "loss": 3.0054, + "step": 1101 + }, + { + "epoch": 0.07666353612299558, + "grad_norm": 1.3671875, + "learning_rate": 0.0019886251902972276, + "loss": 2.6731, + "step": 1102 + }, + { + "epoch": 0.07673310376013079, + "grad_norm": 1.1328125, + "learning_rate": 0.001988591275061195, + "loss": 3.034, + "step": 1103 + }, + { + "epoch": 0.07680267139726599, + "grad_norm": 1.0859375, + "learning_rate": 0.0019885573096294793, + "loss": 3.5486, + "step": 1104 + }, + { + "epoch": 0.0768722390344012, + "grad_norm": 1.9375, + "learning_rate": 0.001988523294003807, + "loss": 3.0527, + "step": 1105 + }, + { + "epoch": 0.0769418066715364, + "grad_norm": 0.92578125, + "learning_rate": 0.001988489228185904, + "loss": 3.0049, + "step": 1106 + }, + { + "epoch": 0.07701137430867161, + "grad_norm": 0.98828125, + "learning_rate": 0.0019884551121775004, + "loss": 3.3528, + "step": 1107 + }, + { + "epoch": 0.07708094194580681, + "grad_norm": 0.734375, + "learning_rate": 0.001988420945980328, + "loss": 3.3525, + "step": 1108 + }, + { + "epoch": 0.07715050958294202, + "grad_norm": 0.984375, + "learning_rate": 0.001988386729596123, + "loss": 3.2421, + "step": 1109 + }, + { + "epoch": 0.07722007722007722, + "grad_norm": 0.84765625, + "learning_rate": 0.001988352463026621, + "loss": 3.6422, + "step": 1110 + }, + { + "epoch": 0.07728964485721243, + "grad_norm": 0.90234375, + "learning_rate": 0.0019883181462735625, + "loss": 3.4884, + "step": 1111 + }, + { + "epoch": 0.07735921249434763, + "grad_norm": 0.99609375, + "learning_rate": 0.0019882837793386903, + "loss": 3.4869, + "step": 1112 + }, + { + "epoch": 0.07742878013148284, + "grad_norm": 1.2265625, + "learning_rate": 0.001988249362223749, + "loss": 3.354, + "step": 1113 + }, + { + "epoch": 0.07749834776861804, + "grad_norm": 1.0078125, + "learning_rate": 0.0019882148949304864, + "loss": 3.2511, + "step": 1114 + }, + { + "epoch": 0.07756791540575324, + "grad_norm": 0.9296875, + "learning_rate": 0.001988180377460652, + "loss": 3.2186, + "step": 1115 + }, + { + "epoch": 0.07763748304288845, + "grad_norm": 1.0546875, + "learning_rate": 0.001988145809815999, + "loss": 3.2695, + "step": 1116 + }, + { + "epoch": 0.07770705068002365, + "grad_norm": 0.9296875, + "learning_rate": 0.0019881111919982826, + "loss": 3.7394, + "step": 1117 + }, + { + "epoch": 0.07777661831715886, + "grad_norm": 1.0, + "learning_rate": 0.0019880765240092605, + "loss": 3.0813, + "step": 1118 + }, + { + "epoch": 0.07784618595429406, + "grad_norm": 1.328125, + "learning_rate": 0.0019880418058506925, + "loss": 3.2941, + "step": 1119 + }, + { + "epoch": 0.07791575359142927, + "grad_norm": 0.95703125, + "learning_rate": 0.0019880070375243417, + "loss": 3.671, + "step": 1120 + }, + { + "epoch": 0.07798532122856447, + "grad_norm": 1.1015625, + "learning_rate": 0.0019879722190319733, + "loss": 3.3528, + "step": 1121 + }, + { + "epoch": 0.07805488886569968, + "grad_norm": 1.046875, + "learning_rate": 0.0019879373503753554, + "loss": 3.386, + "step": 1122 + }, + { + "epoch": 0.07812445650283488, + "grad_norm": 0.9296875, + "learning_rate": 0.0019879024315562583, + "loss": 2.9778, + "step": 1123 + }, + { + "epoch": 0.07819402413997009, + "grad_norm": 0.90234375, + "learning_rate": 0.0019878674625764554, + "loss": 3.3305, + "step": 1124 + }, + { + "epoch": 0.07826359177710529, + "grad_norm": 1.1171875, + "learning_rate": 0.001987832443437722, + "loss": 2.9773, + "step": 1125 + }, + { + "epoch": 0.07833315941424049, + "grad_norm": 0.9140625, + "learning_rate": 0.0019877973741418364, + "loss": 3.394, + "step": 1126 + }, + { + "epoch": 0.0784027270513757, + "grad_norm": 0.984375, + "learning_rate": 0.0019877622546905786, + "loss": 3.2623, + "step": 1127 + }, + { + "epoch": 0.0784722946885109, + "grad_norm": 1.1640625, + "learning_rate": 0.001987727085085732, + "loss": 3.1564, + "step": 1128 + }, + { + "epoch": 0.07854186232564611, + "grad_norm": 1.171875, + "learning_rate": 0.001987691865329083, + "loss": 3.6401, + "step": 1129 + }, + { + "epoch": 0.07861142996278131, + "grad_norm": 1.140625, + "learning_rate": 0.0019876565954224192, + "loss": 3.0965, + "step": 1130 + }, + { + "epoch": 0.07868099759991652, + "grad_norm": 1.15625, + "learning_rate": 0.001987621275367532, + "loss": 3.0531, + "step": 1131 + }, + { + "epoch": 0.07875056523705172, + "grad_norm": 1.2578125, + "learning_rate": 0.0019875859051662137, + "loss": 2.9751, + "step": 1132 + }, + { + "epoch": 0.07882013287418693, + "grad_norm": 0.82421875, + "learning_rate": 0.0019875504848202614, + "loss": 3.5153, + "step": 1133 + }, + { + "epoch": 0.07888970051132213, + "grad_norm": 1.53125, + "learning_rate": 0.001987515014331473, + "loss": 2.8853, + "step": 1134 + }, + { + "epoch": 0.07895926814845734, + "grad_norm": 1.28125, + "learning_rate": 0.0019874794937016498, + "loss": 3.1095, + "step": 1135 + }, + { + "epoch": 0.07902883578559254, + "grad_norm": 2.21875, + "learning_rate": 0.001987443922932595, + "loss": 3.3813, + "step": 1136 + }, + { + "epoch": 0.07909840342272775, + "grad_norm": 0.98046875, + "learning_rate": 0.001987408302026115, + "loss": 3.7459, + "step": 1137 + }, + { + "epoch": 0.07916797105986295, + "grad_norm": 1.109375, + "learning_rate": 0.001987372630984018, + "loss": 3.0523, + "step": 1138 + }, + { + "epoch": 0.07923753869699815, + "grad_norm": 1.53125, + "learning_rate": 0.001987336909808116, + "loss": 2.879, + "step": 1139 + }, + { + "epoch": 0.07930710633413336, + "grad_norm": 0.7890625, + "learning_rate": 0.001987301138500222, + "loss": 3.192, + "step": 1140 + }, + { + "epoch": 0.07937667397126856, + "grad_norm": 1.0859375, + "learning_rate": 0.001987265317062153, + "loss": 2.8798, + "step": 1141 + }, + { + "epoch": 0.07944624160840377, + "grad_norm": 1.2109375, + "learning_rate": 0.0019872294454957268, + "loss": 2.8866, + "step": 1142 + }, + { + "epoch": 0.07951580924553897, + "grad_norm": 0.921875, + "learning_rate": 0.001987193523802765, + "loss": 3.1609, + "step": 1143 + }, + { + "epoch": 0.07958537688267418, + "grad_norm": 0.83984375, + "learning_rate": 0.0019871575519850924, + "loss": 3.6688, + "step": 1144 + }, + { + "epoch": 0.07965494451980938, + "grad_norm": 2.453125, + "learning_rate": 0.0019871215300445353, + "loss": 3.64, + "step": 1145 + }, + { + "epoch": 0.0797245121569446, + "grad_norm": 2.296875, + "learning_rate": 0.001987085457982922, + "loss": 3.4658, + "step": 1146 + }, + { + "epoch": 0.07979407979407979, + "grad_norm": 1.0703125, + "learning_rate": 0.0019870493358020843, + "loss": 3.5664, + "step": 1147 + }, + { + "epoch": 0.079863647431215, + "grad_norm": 0.6953125, + "learning_rate": 0.001987013163503857, + "loss": 3.882, + "step": 1148 + }, + { + "epoch": 0.0799332150683502, + "grad_norm": 1.34375, + "learning_rate": 0.0019869769410900753, + "loss": 3.1736, + "step": 1149 + }, + { + "epoch": 0.08000278270548541, + "grad_norm": 1.0546875, + "learning_rate": 0.00198694066856258, + "loss": 2.935, + "step": 1150 + }, + { + "epoch": 0.08007235034262061, + "grad_norm": 1.0625, + "learning_rate": 0.001986904345923212, + "loss": 3.3874, + "step": 1151 + }, + { + "epoch": 0.08014191797975581, + "grad_norm": 0.80859375, + "learning_rate": 0.001986867973173815, + "loss": 3.2382, + "step": 1152 + }, + { + "epoch": 0.08021148561689102, + "grad_norm": 0.96484375, + "learning_rate": 0.001986831550316237, + "loss": 3.1287, + "step": 1153 + }, + { + "epoch": 0.08028105325402622, + "grad_norm": 1.0, + "learning_rate": 0.001986795077352327, + "loss": 3.1296, + "step": 1154 + }, + { + "epoch": 0.08035062089116143, + "grad_norm": 1.15625, + "learning_rate": 0.0019867585542839373, + "loss": 3.1815, + "step": 1155 + }, + { + "epoch": 0.08042018852829663, + "grad_norm": 0.8515625, + "learning_rate": 0.001986721981112921, + "loss": 3.7003, + "step": 1156 + }, + { + "epoch": 0.08048975616543184, + "grad_norm": 0.9765625, + "learning_rate": 0.001986685357841136, + "loss": 3.2178, + "step": 1157 + }, + { + "epoch": 0.08055932380256704, + "grad_norm": 1.0, + "learning_rate": 0.001986648684470442, + "loss": 3.4522, + "step": 1158 + }, + { + "epoch": 0.08062889143970225, + "grad_norm": 0.81640625, + "learning_rate": 0.001986611961002701, + "loss": 3.4755, + "step": 1159 + }, + { + "epoch": 0.08069845907683745, + "grad_norm": 0.953125, + "learning_rate": 0.001986575187439777, + "loss": 3.3797, + "step": 1160 + }, + { + "epoch": 0.08076802671397267, + "grad_norm": 0.7890625, + "learning_rate": 0.001986538363783538, + "loss": 3.1683, + "step": 1161 + }, + { + "epoch": 0.08083759435110786, + "grad_norm": 1.0234375, + "learning_rate": 0.0019865014900358534, + "loss": 3.1062, + "step": 1162 + }, + { + "epoch": 0.08090716198824308, + "grad_norm": 0.70703125, + "learning_rate": 0.0019864645661985957, + "loss": 3.574, + "step": 1163 + }, + { + "epoch": 0.08097672962537827, + "grad_norm": 0.79296875, + "learning_rate": 0.0019864275922736397, + "loss": 3.4354, + "step": 1164 + }, + { + "epoch": 0.08104629726251347, + "grad_norm": 0.73828125, + "learning_rate": 0.001986390568262862, + "loss": 3.3539, + "step": 1165 + }, + { + "epoch": 0.08111586489964868, + "grad_norm": 1.171875, + "learning_rate": 0.0019863534941681428, + "loss": 3.5538, + "step": 1166 + }, + { + "epoch": 0.08118543253678388, + "grad_norm": 1.046875, + "learning_rate": 0.001986316369991365, + "loss": 3.5096, + "step": 1167 + }, + { + "epoch": 0.0812550001739191, + "grad_norm": 0.80078125, + "learning_rate": 0.0019862791957344136, + "loss": 2.8434, + "step": 1168 + }, + { + "epoch": 0.0813245678110543, + "grad_norm": 1.2109375, + "learning_rate": 0.0019862419713991756, + "loss": 3.2823, + "step": 1169 + }, + { + "epoch": 0.0813941354481895, + "grad_norm": 1.25, + "learning_rate": 0.0019862046969875416, + "loss": 3.4693, + "step": 1170 + }, + { + "epoch": 0.0814637030853247, + "grad_norm": 0.875, + "learning_rate": 0.0019861673725014035, + "loss": 3.1002, + "step": 1171 + }, + { + "epoch": 0.08153327072245992, + "grad_norm": 0.96875, + "learning_rate": 0.0019861299979426574, + "loss": 3.5625, + "step": 1172 + }, + { + "epoch": 0.08160283835959511, + "grad_norm": 0.80078125, + "learning_rate": 0.0019860925733132004, + "loss": 3.1363, + "step": 1173 + }, + { + "epoch": 0.08167240599673033, + "grad_norm": 0.6796875, + "learning_rate": 0.0019860550986149322, + "loss": 3.7165, + "step": 1174 + }, + { + "epoch": 0.08174197363386553, + "grad_norm": 0.921875, + "learning_rate": 0.0019860175738497564, + "loss": 3.2552, + "step": 1175 + }, + { + "epoch": 0.08181154127100074, + "grad_norm": 1.0234375, + "learning_rate": 0.0019859799990195786, + "loss": 3.5978, + "step": 1176 + }, + { + "epoch": 0.08188110890813594, + "grad_norm": 0.97265625, + "learning_rate": 0.0019859423741263055, + "loss": 3.0328, + "step": 1177 + }, + { + "epoch": 0.08195067654527113, + "grad_norm": 0.98828125, + "learning_rate": 0.0019859046991718486, + "loss": 3.2697, + "step": 1178 + }, + { + "epoch": 0.08202024418240635, + "grad_norm": 1.046875, + "learning_rate": 0.0019858669741581207, + "loss": 3.0997, + "step": 1179 + }, + { + "epoch": 0.08208981181954154, + "grad_norm": 0.98046875, + "learning_rate": 0.0019858291990870365, + "loss": 3.2616, + "step": 1180 + }, + { + "epoch": 0.08215937945667676, + "grad_norm": 1.1484375, + "learning_rate": 0.0019857913739605147, + "loss": 3.0133, + "step": 1181 + }, + { + "epoch": 0.08222894709381195, + "grad_norm": 1.0078125, + "learning_rate": 0.001985753498780475, + "loss": 3.3974, + "step": 1182 + }, + { + "epoch": 0.08229851473094717, + "grad_norm": 0.921875, + "learning_rate": 0.001985715573548842, + "loss": 3.466, + "step": 1183 + }, + { + "epoch": 0.08236808236808237, + "grad_norm": 1.2109375, + "learning_rate": 0.0019856775982675405, + "loss": 2.859, + "step": 1184 + }, + { + "epoch": 0.08243765000521758, + "grad_norm": 0.8359375, + "learning_rate": 0.0019856395729384983, + "loss": 3.4976, + "step": 1185 + }, + { + "epoch": 0.08250721764235278, + "grad_norm": 1.1015625, + "learning_rate": 0.001985601497563647, + "loss": 3.2808, + "step": 1186 + }, + { + "epoch": 0.08257678527948799, + "grad_norm": 6.3125, + "learning_rate": 0.001985563372144919, + "loss": 3.4329, + "step": 1187 + }, + { + "epoch": 0.08264635291662319, + "grad_norm": 1.0703125, + "learning_rate": 0.001985525196684251, + "loss": 3.2684, + "step": 1188 + }, + { + "epoch": 0.08271592055375838, + "grad_norm": 1.28125, + "learning_rate": 0.001985486971183581, + "loss": 3.5413, + "step": 1189 + }, + { + "epoch": 0.0827854881908936, + "grad_norm": 1.0, + "learning_rate": 0.00198544869564485, + "loss": 3.0702, + "step": 1190 + }, + { + "epoch": 0.0828550558280288, + "grad_norm": 1.1328125, + "learning_rate": 0.00198541037007, + "loss": 2.9937, + "step": 1191 + }, + { + "epoch": 0.08292462346516401, + "grad_norm": 0.9921875, + "learning_rate": 0.00198537199446098, + "loss": 3.5534, + "step": 1192 + }, + { + "epoch": 0.0829941911022992, + "grad_norm": 0.984375, + "learning_rate": 0.0019853335688197354, + "loss": 3.4087, + "step": 1193 + }, + { + "epoch": 0.08306375873943442, + "grad_norm": 1.2109375, + "learning_rate": 0.0019852950931482194, + "loss": 3.1602, + "step": 1194 + }, + { + "epoch": 0.08313332637656962, + "grad_norm": 0.83203125, + "learning_rate": 0.0019852565674483846, + "loss": 3.3389, + "step": 1195 + }, + { + "epoch": 0.08320289401370483, + "grad_norm": 0.9609375, + "learning_rate": 0.001985217991722187, + "loss": 2.9335, + "step": 1196 + }, + { + "epoch": 0.08327246165084003, + "grad_norm": 0.93359375, + "learning_rate": 0.001985179365971586, + "loss": 3.0408, + "step": 1197 + }, + { + "epoch": 0.08334202928797524, + "grad_norm": 0.82421875, + "learning_rate": 0.0019851406901985427, + "loss": 3.4378, + "step": 1198 + }, + { + "epoch": 0.08341159692511044, + "grad_norm": 0.734375, + "learning_rate": 0.0019851019644050202, + "loss": 3.5636, + "step": 1199 + }, + { + "epoch": 0.08348116456224565, + "grad_norm": 0.921875, + "learning_rate": 0.0019850631885929854, + "loss": 2.891, + "step": 1200 + }, + { + "epoch": 0.08355073219938085, + "grad_norm": 0.953125, + "learning_rate": 0.001985024362764407, + "loss": 3.2612, + "step": 1201 + }, + { + "epoch": 0.08362029983651605, + "grad_norm": 1.0546875, + "learning_rate": 0.0019849854869212562, + "loss": 3.5886, + "step": 1202 + }, + { + "epoch": 0.08368986747365126, + "grad_norm": 1.1171875, + "learning_rate": 0.0019849465610655074, + "loss": 3.4887, + "step": 1203 + }, + { + "epoch": 0.08375943511078646, + "grad_norm": 1.3125, + "learning_rate": 0.0019849075851991363, + "loss": 3.4052, + "step": 1204 + }, + { + "epoch": 0.08382900274792167, + "grad_norm": 0.75, + "learning_rate": 0.0019848685593241225, + "loss": 3.3308, + "step": 1205 + }, + { + "epoch": 0.08389857038505687, + "grad_norm": 0.92578125, + "learning_rate": 0.0019848294834424476, + "loss": 3.395, + "step": 1206 + }, + { + "epoch": 0.08396813802219208, + "grad_norm": 0.97265625, + "learning_rate": 0.001984790357556095, + "loss": 3.2642, + "step": 1207 + }, + { + "epoch": 0.08403770565932728, + "grad_norm": 0.90625, + "learning_rate": 0.001984751181667052, + "loss": 3.7784, + "step": 1208 + }, + { + "epoch": 0.08410727329646249, + "grad_norm": 1.078125, + "learning_rate": 0.0019847119557773072, + "loss": 3.5094, + "step": 1209 + }, + { + "epoch": 0.08417684093359769, + "grad_norm": 0.859375, + "learning_rate": 0.001984672679888853, + "loss": 3.0859, + "step": 1210 + }, + { + "epoch": 0.0842464085707329, + "grad_norm": 1.1796875, + "learning_rate": 0.0019846333540036835, + "loss": 3.2304, + "step": 1211 + }, + { + "epoch": 0.0843159762078681, + "grad_norm": 1.0234375, + "learning_rate": 0.001984593978123795, + "loss": 3.5656, + "step": 1212 + }, + { + "epoch": 0.08438554384500331, + "grad_norm": 1.1953125, + "learning_rate": 0.001984554552251186, + "loss": 3.2698, + "step": 1213 + }, + { + "epoch": 0.08445511148213851, + "grad_norm": 0.81640625, + "learning_rate": 0.0019845150763878605, + "loss": 3.5456, + "step": 1214 + }, + { + "epoch": 0.08452467911927371, + "grad_norm": 0.9140625, + "learning_rate": 0.0019844755505358217, + "loss": 3.2847, + "step": 1215 + }, + { + "epoch": 0.08459424675640892, + "grad_norm": 1.3125, + "learning_rate": 0.001984435974697076, + "loss": 3.2534, + "step": 1216 + }, + { + "epoch": 0.08466381439354412, + "grad_norm": 1.28125, + "learning_rate": 0.001984396348873634, + "loss": 3.2247, + "step": 1217 + }, + { + "epoch": 0.08473338203067933, + "grad_norm": 1.0859375, + "learning_rate": 0.0019843566730675067, + "loss": 3.5911, + "step": 1218 + }, + { + "epoch": 0.08480294966781453, + "grad_norm": 1.2265625, + "learning_rate": 0.0019843169472807095, + "loss": 3.4861, + "step": 1219 + }, + { + "epoch": 0.08487251730494974, + "grad_norm": 1.3046875, + "learning_rate": 0.0019842771715152586, + "loss": 3.0054, + "step": 1220 + }, + { + "epoch": 0.08494208494208494, + "grad_norm": 1.1796875, + "learning_rate": 0.0019842373457731742, + "loss": 3.276, + "step": 1221 + }, + { + "epoch": 0.08501165257922015, + "grad_norm": 1.203125, + "learning_rate": 0.0019841974700564786, + "loss": 3.45, + "step": 1222 + }, + { + "epoch": 0.08508122021635535, + "grad_norm": 0.88671875, + "learning_rate": 0.0019841575443671957, + "loss": 3.2529, + "step": 1223 + }, + { + "epoch": 0.08515078785349056, + "grad_norm": 0.9140625, + "learning_rate": 0.0019841175687073534, + "loss": 3.2271, + "step": 1224 + }, + { + "epoch": 0.08522035549062576, + "grad_norm": 0.984375, + "learning_rate": 0.0019840775430789814, + "loss": 3.2621, + "step": 1225 + }, + { + "epoch": 0.08528992312776097, + "grad_norm": 0.8125, + "learning_rate": 0.001984037467484112, + "loss": 3.589, + "step": 1226 + }, + { + "epoch": 0.08535949076489617, + "grad_norm": 1.1171875, + "learning_rate": 0.0019839973419247797, + "loss": 3.0352, + "step": 1227 + }, + { + "epoch": 0.08542905840203137, + "grad_norm": 1.203125, + "learning_rate": 0.001983957166403022, + "loss": 3.4417, + "step": 1228 + }, + { + "epoch": 0.08549862603916658, + "grad_norm": 1.109375, + "learning_rate": 0.001983916940920879, + "loss": 3.7552, + "step": 1229 + }, + { + "epoch": 0.08556819367630178, + "grad_norm": 0.92578125, + "learning_rate": 0.001983876665480393, + "loss": 3.4003, + "step": 1230 + }, + { + "epoch": 0.08563776131343699, + "grad_norm": 0.85546875, + "learning_rate": 0.0019838363400836094, + "loss": 3.5857, + "step": 1231 + }, + { + "epoch": 0.08570732895057219, + "grad_norm": 1.4921875, + "learning_rate": 0.001983795964732575, + "loss": 3.5616, + "step": 1232 + }, + { + "epoch": 0.0857768965877074, + "grad_norm": 1.125, + "learning_rate": 0.0019837555394293404, + "loss": 3.2636, + "step": 1233 + }, + { + "epoch": 0.0858464642248426, + "grad_norm": 0.97265625, + "learning_rate": 0.0019837150641759576, + "loss": 3.4016, + "step": 1234 + }, + { + "epoch": 0.08591603186197781, + "grad_norm": 1.0078125, + "learning_rate": 0.0019836745389744826, + "loss": 2.9563, + "step": 1235 + }, + { + "epoch": 0.08598559949911301, + "grad_norm": 1.1328125, + "learning_rate": 0.001983633963826972, + "loss": 3.4908, + "step": 1236 + }, + { + "epoch": 0.08605516713624822, + "grad_norm": 1.1171875, + "learning_rate": 0.0019835933387354872, + "loss": 3.0906, + "step": 1237 + }, + { + "epoch": 0.08612473477338342, + "grad_norm": 0.8125, + "learning_rate": 0.0019835526637020902, + "loss": 3.4301, + "step": 1238 + }, + { + "epoch": 0.08619430241051863, + "grad_norm": 0.87890625, + "learning_rate": 0.0019835119387288463, + "loss": 3.3323, + "step": 1239 + }, + { + "epoch": 0.08626387004765383, + "grad_norm": 1.6484375, + "learning_rate": 0.0019834711638178236, + "loss": 3.3489, + "step": 1240 + }, + { + "epoch": 0.08633343768478903, + "grad_norm": 0.80078125, + "learning_rate": 0.001983430338971092, + "loss": 3.6983, + "step": 1241 + }, + { + "epoch": 0.08640300532192424, + "grad_norm": 0.96875, + "learning_rate": 0.001983389464190725, + "loss": 3.4093, + "step": 1242 + }, + { + "epoch": 0.08647257295905944, + "grad_norm": 0.76953125, + "learning_rate": 0.0019833485394787974, + "loss": 3.1098, + "step": 1243 + }, + { + "epoch": 0.08654214059619465, + "grad_norm": 0.88671875, + "learning_rate": 0.0019833075648373875, + "loss": 3.2846, + "step": 1244 + }, + { + "epoch": 0.08661170823332985, + "grad_norm": 0.73828125, + "learning_rate": 0.0019832665402685756, + "loss": 3.5138, + "step": 1245 + }, + { + "epoch": 0.08668127587046506, + "grad_norm": 0.7578125, + "learning_rate": 0.001983225465774445, + "loss": 3.6069, + "step": 1246 + }, + { + "epoch": 0.08675084350760026, + "grad_norm": 0.828125, + "learning_rate": 0.001983184341357081, + "loss": 3.4257, + "step": 1247 + }, + { + "epoch": 0.08682041114473547, + "grad_norm": 1.0234375, + "learning_rate": 0.0019831431670185714, + "loss": 3.2438, + "step": 1248 + }, + { + "epoch": 0.08688997878187067, + "grad_norm": 0.85546875, + "learning_rate": 0.0019831019427610074, + "loss": 3.291, + "step": 1249 + }, + { + "epoch": 0.08695954641900588, + "grad_norm": 1.25, + "learning_rate": 0.001983060668586482, + "loss": 3.2301, + "step": 1250 + }, + { + "epoch": 0.08702911405614108, + "grad_norm": 1.234375, + "learning_rate": 0.001983019344497091, + "loss": 3.4895, + "step": 1251 + }, + { + "epoch": 0.0870986816932763, + "grad_norm": 1.265625, + "learning_rate": 0.0019829779704949326, + "loss": 3.1253, + "step": 1252 + }, + { + "epoch": 0.08716824933041149, + "grad_norm": 1.265625, + "learning_rate": 0.0019829365465821066, + "loss": 3.1541, + "step": 1253 + }, + { + "epoch": 0.08723781696754669, + "grad_norm": 1.0859375, + "learning_rate": 0.001982895072760718, + "loss": 3.1458, + "step": 1254 + }, + { + "epoch": 0.0873073846046819, + "grad_norm": 0.8515625, + "learning_rate": 0.0019828535490328714, + "loss": 3.3, + "step": 1255 + }, + { + "epoch": 0.0873769522418171, + "grad_norm": 0.87109375, + "learning_rate": 0.0019828119754006757, + "loss": 3.5455, + "step": 1256 + }, + { + "epoch": 0.08744651987895231, + "grad_norm": 1.046875, + "learning_rate": 0.0019827703518662415, + "loss": 3.2653, + "step": 1257 + }, + { + "epoch": 0.08751608751608751, + "grad_norm": 1.265625, + "learning_rate": 0.0019827286784316824, + "loss": 3.3314, + "step": 1258 + }, + { + "epoch": 0.08758565515322272, + "grad_norm": 0.94140625, + "learning_rate": 0.0019826869550991144, + "loss": 3.4117, + "step": 1259 + }, + { + "epoch": 0.08765522279035792, + "grad_norm": 1.03125, + "learning_rate": 0.0019826451818706556, + "loss": 3.286, + "step": 1260 + }, + { + "epoch": 0.08772479042749314, + "grad_norm": 0.94921875, + "learning_rate": 0.001982603358748428, + "loss": 3.2271, + "step": 1261 + }, + { + "epoch": 0.08779435806462833, + "grad_norm": 0.890625, + "learning_rate": 0.001982561485734554, + "loss": 3.4454, + "step": 1262 + }, + { + "epoch": 0.08786392570176355, + "grad_norm": 1.109375, + "learning_rate": 0.0019825195628311604, + "loss": 3.2706, + "step": 1263 + }, + { + "epoch": 0.08793349333889874, + "grad_norm": 1.1796875, + "learning_rate": 0.0019824775900403754, + "loss": 3.1977, + "step": 1264 + }, + { + "epoch": 0.08800306097603394, + "grad_norm": 0.89453125, + "learning_rate": 0.0019824355673643307, + "loss": 3.0425, + "step": 1265 + }, + { + "epoch": 0.08807262861316915, + "grad_norm": 0.6640625, + "learning_rate": 0.0019823934948051598, + "loss": 3.3867, + "step": 1266 + }, + { + "epoch": 0.08814219625030435, + "grad_norm": 0.828125, + "learning_rate": 0.001982351372364999, + "loss": 3.6068, + "step": 1267 + }, + { + "epoch": 0.08821176388743956, + "grad_norm": 1.015625, + "learning_rate": 0.0019823092000459865, + "loss": 3.5567, + "step": 1268 + }, + { + "epoch": 0.08828133152457476, + "grad_norm": 1.1015625, + "learning_rate": 0.001982266977850264, + "loss": 3.138, + "step": 1269 + }, + { + "epoch": 0.08835089916170998, + "grad_norm": 1.0078125, + "learning_rate": 0.001982224705779976, + "loss": 2.9568, + "step": 1270 + }, + { + "epoch": 0.08842046679884517, + "grad_norm": 1.2265625, + "learning_rate": 0.0019821823838372674, + "loss": 2.9139, + "step": 1271 + }, + { + "epoch": 0.08849003443598039, + "grad_norm": 1.2109375, + "learning_rate": 0.0019821400120242885, + "loss": 3.6534, + "step": 1272 + }, + { + "epoch": 0.08855960207311558, + "grad_norm": 1.0390625, + "learning_rate": 0.00198209759034319, + "loss": 3.1228, + "step": 1273 + }, + { + "epoch": 0.0886291697102508, + "grad_norm": 1.171875, + "learning_rate": 0.0019820551187961256, + "loss": 3.2362, + "step": 1274 + }, + { + "epoch": 0.088698737347386, + "grad_norm": 0.94921875, + "learning_rate": 0.001982012597385253, + "loss": 3.4493, + "step": 1275 + }, + { + "epoch": 0.0887683049845212, + "grad_norm": 1.0625, + "learning_rate": 0.0019819700261127296, + "loss": 3.139, + "step": 1276 + }, + { + "epoch": 0.0888378726216564, + "grad_norm": 0.95703125, + "learning_rate": 0.0019819274049807183, + "loss": 3.2712, + "step": 1277 + }, + { + "epoch": 0.0889074402587916, + "grad_norm": 0.953125, + "learning_rate": 0.001981884733991382, + "loss": 3.0725, + "step": 1278 + }, + { + "epoch": 0.08897700789592682, + "grad_norm": 1.03125, + "learning_rate": 0.0019818420131468887, + "loss": 3.3549, + "step": 1279 + }, + { + "epoch": 0.08904657553306201, + "grad_norm": 1.15625, + "learning_rate": 0.0019817992424494067, + "loss": 3.5994, + "step": 1280 + }, + { + "epoch": 0.08911614317019723, + "grad_norm": 0.8515625, + "learning_rate": 0.0019817564219011077, + "loss": 3.1747, + "step": 1281 + }, + { + "epoch": 0.08918571080733242, + "grad_norm": 3.953125, + "learning_rate": 0.001981713551504166, + "loss": 3.5693, + "step": 1282 + }, + { + "epoch": 0.08925527844446764, + "grad_norm": 0.91796875, + "learning_rate": 0.0019816706312607586, + "loss": 3.6042, + "step": 1283 + }, + { + "epoch": 0.08932484608160284, + "grad_norm": 1.25, + "learning_rate": 0.0019816276611730643, + "loss": 3.2726, + "step": 1284 + }, + { + "epoch": 0.08939441371873805, + "grad_norm": 1.1171875, + "learning_rate": 0.001981584641243265, + "loss": 3.1582, + "step": 1285 + }, + { + "epoch": 0.08946398135587325, + "grad_norm": 1.0078125, + "learning_rate": 0.001981541571473545, + "loss": 3.5227, + "step": 1286 + }, + { + "epoch": 0.08953354899300846, + "grad_norm": 1.1953125, + "learning_rate": 0.001981498451866092, + "loss": 3.4966, + "step": 1287 + }, + { + "epoch": 0.08960311663014366, + "grad_norm": 0.9375, + "learning_rate": 0.001981455282423094, + "loss": 2.9587, + "step": 1288 + }, + { + "epoch": 0.08967268426727887, + "grad_norm": 0.88671875, + "learning_rate": 0.0019814120631467444, + "loss": 3.4397, + "step": 1289 + }, + { + "epoch": 0.08974225190441407, + "grad_norm": 0.953125, + "learning_rate": 0.0019813687940392366, + "loss": 3.5678, + "step": 1290 + }, + { + "epoch": 0.08981181954154926, + "grad_norm": 0.9921875, + "learning_rate": 0.001981325475102768, + "loss": 3.3464, + "step": 1291 + }, + { + "epoch": 0.08988138717868448, + "grad_norm": 1.109375, + "learning_rate": 0.0019812821063395374, + "loss": 3.3282, + "step": 1292 + }, + { + "epoch": 0.08995095481581968, + "grad_norm": 1.1953125, + "learning_rate": 0.001981238687751748, + "loss": 3.6958, + "step": 1293 + }, + { + "epoch": 0.09002052245295489, + "grad_norm": 0.98828125, + "learning_rate": 0.0019811952193416037, + "loss": 3.4398, + "step": 1294 + }, + { + "epoch": 0.09009009009009009, + "grad_norm": 1.21875, + "learning_rate": 0.001981151701111312, + "loss": 3.1065, + "step": 1295 + }, + { + "epoch": 0.0901596577272253, + "grad_norm": 1.0625, + "learning_rate": 0.0019811081330630823, + "loss": 3.4621, + "step": 1296 + }, + { + "epoch": 0.0902292253643605, + "grad_norm": 1.1484375, + "learning_rate": 0.0019810645151991262, + "loss": 3.3886, + "step": 1297 + }, + { + "epoch": 0.09029879300149571, + "grad_norm": 0.92578125, + "learning_rate": 0.0019810208475216596, + "loss": 3.4419, + "step": 1298 + }, + { + "epoch": 0.0903683606386309, + "grad_norm": 1.0546875, + "learning_rate": 0.0019809771300328986, + "loss": 3.3693, + "step": 1299 + }, + { + "epoch": 0.09043792827576612, + "grad_norm": 1.2265625, + "learning_rate": 0.0019809333627350636, + "loss": 3.2262, + "step": 1300 + }, + { + "epoch": 0.09050749591290132, + "grad_norm": 1.1015625, + "learning_rate": 0.001980889545630377, + "loss": 3.2679, + "step": 1301 + }, + { + "epoch": 0.09057706355003653, + "grad_norm": 0.8125, + "learning_rate": 0.001980845678721063, + "loss": 3.2695, + "step": 1302 + }, + { + "epoch": 0.09064663118717173, + "grad_norm": 1.0, + "learning_rate": 0.001980801762009349, + "loss": 3.4142, + "step": 1303 + }, + { + "epoch": 0.09071619882430693, + "grad_norm": 1.0, + "learning_rate": 0.0019807577954974657, + "loss": 3.063, + "step": 1304 + }, + { + "epoch": 0.09078576646144214, + "grad_norm": 0.640625, + "learning_rate": 0.001980713779187645, + "loss": 3.5843, + "step": 1305 + }, + { + "epoch": 0.09085533409857734, + "grad_norm": 1.0703125, + "learning_rate": 0.001980669713082121, + "loss": 3.2546, + "step": 1306 + }, + { + "epoch": 0.09092490173571255, + "grad_norm": 0.84375, + "learning_rate": 0.0019806255971831326, + "loss": 3.5231, + "step": 1307 + }, + { + "epoch": 0.09099446937284775, + "grad_norm": 0.85546875, + "learning_rate": 0.0019805814314929186, + "loss": 3.2505, + "step": 1308 + }, + { + "epoch": 0.09106403700998296, + "grad_norm": 0.91796875, + "learning_rate": 0.0019805372160137226, + "loss": 3.1583, + "step": 1309 + }, + { + "epoch": 0.09113360464711816, + "grad_norm": 1.3125, + "learning_rate": 0.0019804929507477886, + "loss": 3.1289, + "step": 1310 + }, + { + "epoch": 0.09120317228425337, + "grad_norm": 0.9921875, + "learning_rate": 0.0019804486356973646, + "loss": 3.1288, + "step": 1311 + }, + { + "epoch": 0.09127273992138857, + "grad_norm": 0.74609375, + "learning_rate": 0.001980404270864701, + "loss": 3.6134, + "step": 1312 + }, + { + "epoch": 0.09134230755852378, + "grad_norm": 1.453125, + "learning_rate": 0.00198035985625205, + "loss": 3.2821, + "step": 1313 + }, + { + "epoch": 0.09141187519565898, + "grad_norm": 1.0625, + "learning_rate": 0.0019803153918616667, + "loss": 3.1245, + "step": 1314 + }, + { + "epoch": 0.09148144283279419, + "grad_norm": 1.34375, + "learning_rate": 0.001980270877695809, + "loss": 2.9684, + "step": 1315 + }, + { + "epoch": 0.09155101046992939, + "grad_norm": 0.81640625, + "learning_rate": 0.001980226313756737, + "loss": 3.5429, + "step": 1316 + }, + { + "epoch": 0.09162057810706459, + "grad_norm": 0.9140625, + "learning_rate": 0.001980181700046714, + "loss": 3.2961, + "step": 1317 + }, + { + "epoch": 0.0916901457441998, + "grad_norm": 1.09375, + "learning_rate": 0.001980137036568004, + "loss": 3.4082, + "step": 1318 + }, + { + "epoch": 0.091759713381335, + "grad_norm": 0.9453125, + "learning_rate": 0.001980092323322876, + "loss": 3.447, + "step": 1319 + }, + { + "epoch": 0.09182928101847021, + "grad_norm": 0.8828125, + "learning_rate": 0.0019800475603135997, + "loss": 3.417, + "step": 1320 + }, + { + "epoch": 0.09189884865560541, + "grad_norm": 0.81640625, + "learning_rate": 0.0019800027475424483, + "loss": 3.7718, + "step": 1321 + }, + { + "epoch": 0.09196841629274062, + "grad_norm": 1.1484375, + "learning_rate": 0.0019799578850116972, + "loss": 2.9843, + "step": 1322 + }, + { + "epoch": 0.09203798392987582, + "grad_norm": 0.79296875, + "learning_rate": 0.0019799129727236233, + "loss": 3.3471, + "step": 1323 + }, + { + "epoch": 0.09210755156701103, + "grad_norm": 0.94140625, + "learning_rate": 0.001979868010680508, + "loss": 3.2991, + "step": 1324 + }, + { + "epoch": 0.09217711920414623, + "grad_norm": 0.984375, + "learning_rate": 0.0019798229988846347, + "loss": 3.3589, + "step": 1325 + }, + { + "epoch": 0.09224668684128144, + "grad_norm": 1.1015625, + "learning_rate": 0.0019797779373382876, + "loss": 3.2222, + "step": 1326 + }, + { + "epoch": 0.09231625447841664, + "grad_norm": 0.95703125, + "learning_rate": 0.001979732826043755, + "loss": 3.3215, + "step": 1327 + }, + { + "epoch": 0.09238582211555185, + "grad_norm": 0.83203125, + "learning_rate": 0.0019796876650033284, + "loss": 3.4502, + "step": 1328 + }, + { + "epoch": 0.09245538975268705, + "grad_norm": 0.953125, + "learning_rate": 0.0019796424542192995, + "loss": 3.2011, + "step": 1329 + }, + { + "epoch": 0.09252495738982225, + "grad_norm": 1.171875, + "learning_rate": 0.001979597193693965, + "loss": 3.5101, + "step": 1330 + }, + { + "epoch": 0.09259452502695746, + "grad_norm": 0.93359375, + "learning_rate": 0.001979551883429623, + "loss": 3.1729, + "step": 1331 + }, + { + "epoch": 0.09266409266409266, + "grad_norm": 0.82421875, + "learning_rate": 0.001979506523428573, + "loss": 3.4822, + "step": 1332 + }, + { + "epoch": 0.09273366030122787, + "grad_norm": 0.6875, + "learning_rate": 0.001979461113693119, + "loss": 3.6057, + "step": 1333 + }, + { + "epoch": 0.09280322793836307, + "grad_norm": 1.171875, + "learning_rate": 0.001979415654225566, + "loss": 3.4683, + "step": 1334 + }, + { + "epoch": 0.09287279557549828, + "grad_norm": 0.890625, + "learning_rate": 0.001979370145028223, + "loss": 3.489, + "step": 1335 + }, + { + "epoch": 0.09294236321263348, + "grad_norm": 1.140625, + "learning_rate": 0.001979324586103401, + "loss": 3.2464, + "step": 1336 + }, + { + "epoch": 0.09301193084976869, + "grad_norm": 1.0625, + "learning_rate": 0.0019792789774534117, + "loss": 3.1344, + "step": 1337 + }, + { + "epoch": 0.09308149848690389, + "grad_norm": 1.1796875, + "learning_rate": 0.0019792333190805727, + "loss": 3.333, + "step": 1338 + }, + { + "epoch": 0.0931510661240391, + "grad_norm": 1.03125, + "learning_rate": 0.001979187610987201, + "loss": 3.5017, + "step": 1339 + }, + { + "epoch": 0.0932206337611743, + "grad_norm": 0.87109375, + "learning_rate": 0.0019791418531756176, + "loss": 3.2691, + "step": 1340 + }, + { + "epoch": 0.0932902013983095, + "grad_norm": 0.81640625, + "learning_rate": 0.001979096045648147, + "loss": 3.5223, + "step": 1341 + }, + { + "epoch": 0.09335976903544471, + "grad_norm": 0.83203125, + "learning_rate": 0.0019790501884071137, + "loss": 3.3475, + "step": 1342 + }, + { + "epoch": 0.09342933667257991, + "grad_norm": 0.75390625, + "learning_rate": 0.0019790042814548463, + "loss": 3.3571, + "step": 1343 + }, + { + "epoch": 0.09349890430971512, + "grad_norm": 1.078125, + "learning_rate": 0.0019789583247936766, + "loss": 3.123, + "step": 1344 + }, + { + "epoch": 0.09356847194685032, + "grad_norm": 1.078125, + "learning_rate": 0.0019789123184259373, + "loss": 3.2013, + "step": 1345 + }, + { + "epoch": 0.09363803958398553, + "grad_norm": 0.8359375, + "learning_rate": 0.001978866262353964, + "loss": 3.4691, + "step": 1346 + }, + { + "epoch": 0.09370760722112073, + "grad_norm": 0.9375, + "learning_rate": 0.0019788201565800966, + "loss": 3.2226, + "step": 1347 + }, + { + "epoch": 0.09377717485825594, + "grad_norm": 0.93359375, + "learning_rate": 0.001978774001106675, + "loss": 3.3479, + "step": 1348 + }, + { + "epoch": 0.09384674249539114, + "grad_norm": 1.2265625, + "learning_rate": 0.001978727795936043, + "loss": 3.2993, + "step": 1349 + }, + { + "epoch": 0.09391631013252635, + "grad_norm": 0.93359375, + "learning_rate": 0.0019786815410705464, + "loss": 3.4622, + "step": 1350 + }, + { + "epoch": 0.09398587776966155, + "grad_norm": 0.97265625, + "learning_rate": 0.0019786352365125347, + "loss": 3.5396, + "step": 1351 + }, + { + "epoch": 0.09405544540679676, + "grad_norm": 1.109375, + "learning_rate": 0.001978588882264358, + "loss": 3.6279, + "step": 1352 + }, + { + "epoch": 0.09412501304393196, + "grad_norm": 1.1796875, + "learning_rate": 0.00197854247832837, + "loss": 3.3464, + "step": 1353 + }, + { + "epoch": 0.09419458068106716, + "grad_norm": 1.1875, + "learning_rate": 0.0019784960247069276, + "loss": 3.1044, + "step": 1354 + }, + { + "epoch": 0.09426414831820237, + "grad_norm": 0.8515625, + "learning_rate": 0.001978449521402389, + "loss": 3.1176, + "step": 1355 + }, + { + "epoch": 0.09433371595533757, + "grad_norm": 1.0234375, + "learning_rate": 0.0019784029684171154, + "loss": 3.3721, + "step": 1356 + }, + { + "epoch": 0.09440328359247278, + "grad_norm": 1.0390625, + "learning_rate": 0.0019783563657534706, + "loss": 3.4134, + "step": 1357 + }, + { + "epoch": 0.09447285122960798, + "grad_norm": 0.95703125, + "learning_rate": 0.001978309713413821, + "loss": 3.091, + "step": 1358 + }, + { + "epoch": 0.0945424188667432, + "grad_norm": 0.9921875, + "learning_rate": 0.0019782630114005347, + "loss": 3.1234, + "step": 1359 + }, + { + "epoch": 0.09461198650387839, + "grad_norm": 1.0, + "learning_rate": 0.0019782162597159836, + "loss": 3.0553, + "step": 1360 + }, + { + "epoch": 0.0946815541410136, + "grad_norm": 1.078125, + "learning_rate": 0.0019781694583625416, + "loss": 3.4159, + "step": 1361 + }, + { + "epoch": 0.0947511217781488, + "grad_norm": 0.7421875, + "learning_rate": 0.0019781226073425848, + "loss": 3.199, + "step": 1362 + }, + { + "epoch": 0.09482068941528402, + "grad_norm": 1.0703125, + "learning_rate": 0.0019780757066584923, + "loss": 3.4261, + "step": 1363 + }, + { + "epoch": 0.09489025705241921, + "grad_norm": 0.90625, + "learning_rate": 0.001978028756312645, + "loss": 3.0732, + "step": 1364 + }, + { + "epoch": 0.09495982468955443, + "grad_norm": 0.88671875, + "learning_rate": 0.001977981756307427, + "loss": 3.3938, + "step": 1365 + }, + { + "epoch": 0.09502939232668962, + "grad_norm": 1.0546875, + "learning_rate": 0.001977934706645225, + "loss": 3.3882, + "step": 1366 + }, + { + "epoch": 0.09509895996382482, + "grad_norm": 1.015625, + "learning_rate": 0.001977887607328428, + "loss": 2.9706, + "step": 1367 + }, + { + "epoch": 0.09516852760096003, + "grad_norm": 0.98046875, + "learning_rate": 0.001977840458359427, + "loss": 2.8586, + "step": 1368 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 1.0078125, + "learning_rate": 0.001977793259740616, + "loss": 3.1019, + "step": 1369 + }, + { + "epoch": 0.09530766287523044, + "grad_norm": 1.2265625, + "learning_rate": 0.001977746011474392, + "loss": 3.264, + "step": 1370 + }, + { + "epoch": 0.09537723051236564, + "grad_norm": 1.234375, + "learning_rate": 0.001977698713563154, + "loss": 3.151, + "step": 1371 + }, + { + "epoch": 0.09544679814950086, + "grad_norm": 1.265625, + "learning_rate": 0.0019776513660093027, + "loss": 3.145, + "step": 1372 + }, + { + "epoch": 0.09551636578663605, + "grad_norm": 0.875, + "learning_rate": 0.001977603968815243, + "loss": 3.6223, + "step": 1373 + }, + { + "epoch": 0.09558593342377127, + "grad_norm": 0.8125, + "learning_rate": 0.001977556521983381, + "loss": 3.2582, + "step": 1374 + }, + { + "epoch": 0.09565550106090646, + "grad_norm": 1.546875, + "learning_rate": 0.0019775090255161262, + "loss": 3.3363, + "step": 1375 + }, + { + "epoch": 0.09572506869804168, + "grad_norm": 0.86328125, + "learning_rate": 0.0019774614794158905, + "loss": 3.116, + "step": 1376 + }, + { + "epoch": 0.09579463633517687, + "grad_norm": 0.8671875, + "learning_rate": 0.0019774138836850873, + "loss": 3.6931, + "step": 1377 + }, + { + "epoch": 0.09586420397231209, + "grad_norm": 1.015625, + "learning_rate": 0.0019773662383261335, + "loss": 3.1336, + "step": 1378 + }, + { + "epoch": 0.09593377160944729, + "grad_norm": 1.1484375, + "learning_rate": 0.0019773185433414487, + "loss": 3.2009, + "step": 1379 + }, + { + "epoch": 0.09600333924658248, + "grad_norm": 0.8984375, + "learning_rate": 0.001977270798733454, + "loss": 3.4753, + "step": 1380 + }, + { + "epoch": 0.0960729068837177, + "grad_norm": 1.0234375, + "learning_rate": 0.0019772230045045744, + "loss": 3.1052, + "step": 1381 + }, + { + "epoch": 0.0961424745208529, + "grad_norm": 0.82421875, + "learning_rate": 0.001977175160657236, + "loss": 3.0842, + "step": 1382 + }, + { + "epoch": 0.0962120421579881, + "grad_norm": 0.890625, + "learning_rate": 0.0019771272671938677, + "loss": 3.3699, + "step": 1383 + }, + { + "epoch": 0.0962816097951233, + "grad_norm": 0.87109375, + "learning_rate": 0.0019770793241169027, + "loss": 3.4106, + "step": 1384 + }, + { + "epoch": 0.09635117743225852, + "grad_norm": 0.828125, + "learning_rate": 0.001977031331428774, + "loss": 3.2359, + "step": 1385 + }, + { + "epoch": 0.09642074506939372, + "grad_norm": 1.0078125, + "learning_rate": 0.0019769832891319192, + "loss": 3.3751, + "step": 1386 + }, + { + "epoch": 0.09649031270652893, + "grad_norm": 0.87109375, + "learning_rate": 0.001976935197228777, + "loss": 3.5038, + "step": 1387 + }, + { + "epoch": 0.09655988034366413, + "grad_norm": 1.140625, + "learning_rate": 0.00197688705572179, + "loss": 3.3873, + "step": 1388 + }, + { + "epoch": 0.09662944798079934, + "grad_norm": 0.70703125, + "learning_rate": 0.0019768388646134016, + "loss": 3.3603, + "step": 1389 + }, + { + "epoch": 0.09669901561793454, + "grad_norm": 1.2890625, + "learning_rate": 0.0019767906239060596, + "loss": 3.106, + "step": 1390 + }, + { + "epoch": 0.09676858325506975, + "grad_norm": 0.890625, + "learning_rate": 0.001976742333602213, + "loss": 3.4672, + "step": 1391 + }, + { + "epoch": 0.09683815089220495, + "grad_norm": 1.328125, + "learning_rate": 0.0019766939937043144, + "loss": 3.1086, + "step": 1392 + }, + { + "epoch": 0.09690771852934014, + "grad_norm": 1.046875, + "learning_rate": 0.0019766456042148175, + "loss": 3.3279, + "step": 1393 + }, + { + "epoch": 0.09697728616647536, + "grad_norm": 0.6875, + "learning_rate": 0.001976597165136179, + "loss": 3.3536, + "step": 1394 + }, + { + "epoch": 0.09704685380361056, + "grad_norm": 1.0625, + "learning_rate": 0.001976548676470859, + "loss": 3.0432, + "step": 1395 + }, + { + "epoch": 0.09711642144074577, + "grad_norm": 0.70703125, + "learning_rate": 0.0019765001382213198, + "loss": 3.5107, + "step": 1396 + }, + { + "epoch": 0.09718598907788097, + "grad_norm": 0.87109375, + "learning_rate": 0.001976451550390025, + "loss": 2.7966, + "step": 1397 + }, + { + "epoch": 0.09725555671501618, + "grad_norm": 1.078125, + "learning_rate": 0.0019764029129794424, + "loss": 3.3384, + "step": 1398 + }, + { + "epoch": 0.09732512435215138, + "grad_norm": 0.87109375, + "learning_rate": 0.001976354225992041, + "loss": 3.1992, + "step": 1399 + }, + { + "epoch": 0.09739469198928659, + "grad_norm": 0.98828125, + "learning_rate": 0.001976305489430294, + "loss": 3.2581, + "step": 1400 + }, + { + "epoch": 0.09746425962642179, + "grad_norm": 0.90234375, + "learning_rate": 0.0019762567032966744, + "loss": 3.0971, + "step": 1401 + }, + { + "epoch": 0.097533827263557, + "grad_norm": 1.0703125, + "learning_rate": 0.0019762078675936608, + "loss": 2.7413, + "step": 1402 + }, + { + "epoch": 0.0976033949006922, + "grad_norm": 0.80859375, + "learning_rate": 0.0019761589823237315, + "loss": 3.189, + "step": 1403 + }, + { + "epoch": 0.0976729625378274, + "grad_norm": 0.76953125, + "learning_rate": 0.0019761100474893693, + "loss": 3.2528, + "step": 1404 + }, + { + "epoch": 0.09774253017496261, + "grad_norm": 1.0, + "learning_rate": 0.0019760610630930593, + "loss": 2.9884, + "step": 1405 + }, + { + "epoch": 0.0978120978120978, + "grad_norm": 0.859375, + "learning_rate": 0.0019760120291372877, + "loss": 3.2232, + "step": 1406 + }, + { + "epoch": 0.09788166544923302, + "grad_norm": 0.875, + "learning_rate": 0.001975962945624545, + "loss": 3.3636, + "step": 1407 + }, + { + "epoch": 0.09795123308636822, + "grad_norm": 1.1171875, + "learning_rate": 0.0019759138125573232, + "loss": 3.3866, + "step": 1408 + }, + { + "epoch": 0.09802080072350343, + "grad_norm": 1.1640625, + "learning_rate": 0.0019758646299381168, + "loss": 3.3013, + "step": 1409 + }, + { + "epoch": 0.09809036836063863, + "grad_norm": 0.984375, + "learning_rate": 0.0019758153977694234, + "loss": 3.3126, + "step": 1410 + }, + { + "epoch": 0.09815993599777384, + "grad_norm": 1.1171875, + "learning_rate": 0.001975766116053743, + "loss": 3.2412, + "step": 1411 + }, + { + "epoch": 0.09822950363490904, + "grad_norm": 0.61328125, + "learning_rate": 0.0019757167847935767, + "loss": 3.6484, + "step": 1412 + }, + { + "epoch": 0.09829907127204425, + "grad_norm": 0.9921875, + "learning_rate": 0.00197566740399143, + "loss": 3.5801, + "step": 1413 + }, + { + "epoch": 0.09836863890917945, + "grad_norm": 1.015625, + "learning_rate": 0.0019756179736498108, + "loss": 3.2148, + "step": 1414 + }, + { + "epoch": 0.09843820654631466, + "grad_norm": 0.8203125, + "learning_rate": 0.001975568493771228, + "loss": 3.4016, + "step": 1415 + }, + { + "epoch": 0.09850777418344986, + "grad_norm": 0.91015625, + "learning_rate": 0.0019755189643581943, + "loss": 3.5892, + "step": 1416 + }, + { + "epoch": 0.09857734182058506, + "grad_norm": 0.8203125, + "learning_rate": 0.001975469385413225, + "loss": 3.1497, + "step": 1417 + }, + { + "epoch": 0.09864690945772027, + "grad_norm": 0.921875, + "learning_rate": 0.0019754197569388367, + "loss": 3.2365, + "step": 1418 + }, + { + "epoch": 0.09871647709485547, + "grad_norm": 1.09375, + "learning_rate": 0.00197537007893755, + "loss": 3.446, + "step": 1419 + }, + { + "epoch": 0.09878604473199068, + "grad_norm": 0.7421875, + "learning_rate": 0.001975320351411886, + "loss": 3.2635, + "step": 1420 + }, + { + "epoch": 0.09885561236912588, + "grad_norm": 0.96484375, + "learning_rate": 0.0019752705743643715, + "loss": 3.2567, + "step": 1421 + }, + { + "epoch": 0.09892518000626109, + "grad_norm": 0.734375, + "learning_rate": 0.0019752207477975324, + "loss": 3.5977, + "step": 1422 + }, + { + "epoch": 0.09899474764339629, + "grad_norm": 0.875, + "learning_rate": 0.0019751708717138995, + "loss": 3.2144, + "step": 1423 + }, + { + "epoch": 0.0990643152805315, + "grad_norm": 0.84765625, + "learning_rate": 0.0019751209461160045, + "loss": 3.4039, + "step": 1424 + }, + { + "epoch": 0.0991338829176667, + "grad_norm": 0.84765625, + "learning_rate": 0.0019750709710063836, + "loss": 3.4512, + "step": 1425 + }, + { + "epoch": 0.09920345055480191, + "grad_norm": 0.92578125, + "learning_rate": 0.001975020946387573, + "loss": 3.3055, + "step": 1426 + }, + { + "epoch": 0.09927301819193711, + "grad_norm": 1.0625, + "learning_rate": 0.001974970872262113, + "loss": 3.291, + "step": 1427 + }, + { + "epoch": 0.09934258582907232, + "grad_norm": 1.2421875, + "learning_rate": 0.001974920748632547, + "loss": 3.3701, + "step": 1428 + }, + { + "epoch": 0.09941215346620752, + "grad_norm": 1.109375, + "learning_rate": 0.0019748705755014188, + "loss": 3.3214, + "step": 1429 + }, + { + "epoch": 0.09948172110334272, + "grad_norm": 0.80859375, + "learning_rate": 0.001974820352871277, + "loss": 3.5112, + "step": 1430 + }, + { + "epoch": 0.09955128874047793, + "grad_norm": 0.78125, + "learning_rate": 0.0019747700807446703, + "loss": 3.3854, + "step": 1431 + }, + { + "epoch": 0.09962085637761313, + "grad_norm": 0.9375, + "learning_rate": 0.0019747197591241526, + "loss": 3.2159, + "step": 1432 + }, + { + "epoch": 0.09969042401474834, + "grad_norm": 0.85546875, + "learning_rate": 0.0019746693880122786, + "loss": 3.06, + "step": 1433 + }, + { + "epoch": 0.09975999165188354, + "grad_norm": 0.8671875, + "learning_rate": 0.001974618967411606, + "loss": 3.1673, + "step": 1434 + }, + { + "epoch": 0.09982955928901875, + "grad_norm": 0.83984375, + "learning_rate": 0.0019745684973246943, + "loss": 3.449, + "step": 1435 + }, + { + "epoch": 0.09989912692615395, + "grad_norm": 0.8359375, + "learning_rate": 0.0019745179777541063, + "loss": 3.5406, + "step": 1436 + }, + { + "epoch": 0.09996869456328916, + "grad_norm": 1.046875, + "learning_rate": 0.0019744674087024076, + "loss": 3.5287, + "step": 1437 + }, + { + "epoch": 0.10003826220042436, + "grad_norm": 1.09375, + "learning_rate": 0.0019744167901721657, + "loss": 3.2567, + "step": 1438 + }, + { + "epoch": 0.10010782983755957, + "grad_norm": 0.89453125, + "learning_rate": 0.0019743661221659505, + "loss": 3.0954, + "step": 1439 + }, + { + "epoch": 0.10017739747469477, + "grad_norm": 1.0078125, + "learning_rate": 0.0019743154046863347, + "loss": 3.3116, + "step": 1440 + }, + { + "epoch": 0.10024696511182998, + "grad_norm": 0.77734375, + "learning_rate": 0.001974264637735894, + "loss": 3.5275, + "step": 1441 + }, + { + "epoch": 0.10031653274896518, + "grad_norm": 0.9296875, + "learning_rate": 0.0019742138213172046, + "loss": 3.3459, + "step": 1442 + }, + { + "epoch": 0.10038610038610038, + "grad_norm": 1.171875, + "learning_rate": 0.0019741629554328485, + "loss": 3.5695, + "step": 1443 + }, + { + "epoch": 0.10045566802323559, + "grad_norm": 0.70703125, + "learning_rate": 0.0019741120400854077, + "loss": 3.4048, + "step": 1444 + }, + { + "epoch": 0.10052523566037079, + "grad_norm": 1.0703125, + "learning_rate": 0.0019740610752774675, + "loss": 2.9705, + "step": 1445 + }, + { + "epoch": 0.100594803297506, + "grad_norm": 0.765625, + "learning_rate": 0.001974010061011615, + "loss": 3.5704, + "step": 1446 + }, + { + "epoch": 0.1006643709346412, + "grad_norm": 0.83203125, + "learning_rate": 0.0019739589972904417, + "loss": 3.2439, + "step": 1447 + }, + { + "epoch": 0.10073393857177641, + "grad_norm": 0.76953125, + "learning_rate": 0.001973907884116539, + "loss": 3.3124, + "step": 1448 + }, + { + "epoch": 0.10080350620891161, + "grad_norm": 0.92578125, + "learning_rate": 0.001973856721492503, + "loss": 3.5505, + "step": 1449 + }, + { + "epoch": 0.10087307384604682, + "grad_norm": 1.171875, + "learning_rate": 0.001973805509420931, + "loss": 3.4526, + "step": 1450 + }, + { + "epoch": 0.10094264148318202, + "grad_norm": 1.0390625, + "learning_rate": 0.0019737542479044243, + "loss": 3.2931, + "step": 1451 + }, + { + "epoch": 0.10101220912031723, + "grad_norm": 0.921875, + "learning_rate": 0.0019737029369455844, + "loss": 3.1317, + "step": 1452 + }, + { + "epoch": 0.10108177675745243, + "grad_norm": 1.046875, + "learning_rate": 0.0019736515765470175, + "loss": 3.0454, + "step": 1453 + }, + { + "epoch": 0.10115134439458764, + "grad_norm": 0.921875, + "learning_rate": 0.0019736001667113308, + "loss": 3.2824, + "step": 1454 + }, + { + "epoch": 0.10122091203172284, + "grad_norm": 0.7265625, + "learning_rate": 0.001973548707441135, + "loss": 3.1641, + "step": 1455 + }, + { + "epoch": 0.10129047966885804, + "grad_norm": 1.0546875, + "learning_rate": 0.0019734971987390433, + "loss": 3.6603, + "step": 1456 + }, + { + "epoch": 0.10136004730599325, + "grad_norm": 0.83984375, + "learning_rate": 0.00197344564060767, + "loss": 3.3129, + "step": 1457 + }, + { + "epoch": 0.10142961494312845, + "grad_norm": 0.87109375, + "learning_rate": 0.001973394033049634, + "loss": 3.5636, + "step": 1458 + }, + { + "epoch": 0.10149918258026366, + "grad_norm": 0.80859375, + "learning_rate": 0.001973342376067555, + "loss": 3.5552, + "step": 1459 + }, + { + "epoch": 0.10156875021739886, + "grad_norm": 0.8359375, + "learning_rate": 0.001973290669664057, + "loss": 3.288, + "step": 1460 + }, + { + "epoch": 0.10163831785453407, + "grad_norm": 0.8515625, + "learning_rate": 0.0019732389138417635, + "loss": 3.0768, + "step": 1461 + }, + { + "epoch": 0.10170788549166927, + "grad_norm": 1.09375, + "learning_rate": 0.001973187108603304, + "loss": 3.5058, + "step": 1462 + }, + { + "epoch": 0.10177745312880448, + "grad_norm": 1.1796875, + "learning_rate": 0.001973135253951308, + "loss": 3.5485, + "step": 1463 + }, + { + "epoch": 0.10184702076593968, + "grad_norm": 0.87890625, + "learning_rate": 0.001973083349888409, + "loss": 3.6063, + "step": 1464 + }, + { + "epoch": 0.1019165884030749, + "grad_norm": 1.2890625, + "learning_rate": 0.001973031396417242, + "loss": 3.6008, + "step": 1465 + }, + { + "epoch": 0.1019861560402101, + "grad_norm": 1.0625, + "learning_rate": 0.001972979393540445, + "loss": 3.136, + "step": 1466 + }, + { + "epoch": 0.1020557236773453, + "grad_norm": 1.0234375, + "learning_rate": 0.0019729273412606592, + "loss": 3.3279, + "step": 1467 + }, + { + "epoch": 0.1021252913144805, + "grad_norm": 1.0078125, + "learning_rate": 0.0019728752395805267, + "loss": 3.2092, + "step": 1468 + }, + { + "epoch": 0.1021948589516157, + "grad_norm": 0.95703125, + "learning_rate": 0.001972823088502693, + "loss": 3.56, + "step": 1469 + }, + { + "epoch": 0.10226442658875091, + "grad_norm": 0.859375, + "learning_rate": 0.0019727708880298064, + "loss": 3.3485, + "step": 1470 + }, + { + "epoch": 0.10233399422588611, + "grad_norm": 0.89453125, + "learning_rate": 0.001972718638164517, + "loss": 3.2539, + "step": 1471 + }, + { + "epoch": 0.10240356186302133, + "grad_norm": 1.6328125, + "learning_rate": 0.0019726663389094783, + "loss": 3.4803, + "step": 1472 + }, + { + "epoch": 0.10247312950015652, + "grad_norm": 0.75, + "learning_rate": 0.0019726139902673454, + "loss": 3.5746, + "step": 1473 + }, + { + "epoch": 0.10254269713729174, + "grad_norm": 0.84765625, + "learning_rate": 0.0019725615922407762, + "loss": 3.2785, + "step": 1474 + }, + { + "epoch": 0.10261226477442693, + "grad_norm": 0.87890625, + "learning_rate": 0.0019725091448324315, + "loss": 3.046, + "step": 1475 + }, + { + "epoch": 0.10268183241156215, + "grad_norm": 0.84375, + "learning_rate": 0.0019724566480449745, + "loss": 3.5367, + "step": 1476 + }, + { + "epoch": 0.10275140004869734, + "grad_norm": 0.9765625, + "learning_rate": 0.0019724041018810705, + "loss": 3.6516, + "step": 1477 + }, + { + "epoch": 0.10282096768583256, + "grad_norm": 0.765625, + "learning_rate": 0.001972351506343387, + "loss": 3.448, + "step": 1478 + }, + { + "epoch": 0.10289053532296775, + "grad_norm": 0.95703125, + "learning_rate": 0.0019722988614345955, + "loss": 2.9195, + "step": 1479 + }, + { + "epoch": 0.10296010296010295, + "grad_norm": 0.734375, + "learning_rate": 0.0019722461671573682, + "loss": 3.3375, + "step": 1480 + }, + { + "epoch": 0.10302967059723817, + "grad_norm": 0.80078125, + "learning_rate": 0.0019721934235143817, + "loss": 3.1518, + "step": 1481 + }, + { + "epoch": 0.10309923823437336, + "grad_norm": 0.91796875, + "learning_rate": 0.001972140630508313, + "loss": 3.3298, + "step": 1482 + }, + { + "epoch": 0.10316880587150858, + "grad_norm": 0.8984375, + "learning_rate": 0.0019720877881418426, + "loss": 3.4051, + "step": 1483 + }, + { + "epoch": 0.10323837350864377, + "grad_norm": 1.125, + "learning_rate": 0.001972034896417654, + "loss": 3.2962, + "step": 1484 + }, + { + "epoch": 0.10330794114577899, + "grad_norm": 0.75, + "learning_rate": 0.001971981955338433, + "loss": 3.2481, + "step": 1485 + }, + { + "epoch": 0.10337750878291418, + "grad_norm": 0.87890625, + "learning_rate": 0.001971928964906868, + "loss": 3.1045, + "step": 1486 + }, + { + "epoch": 0.1034470764200494, + "grad_norm": 0.8046875, + "learning_rate": 0.0019718759251256485, + "loss": 3.4065, + "step": 1487 + }, + { + "epoch": 0.1035166440571846, + "grad_norm": 0.95703125, + "learning_rate": 0.001971822835997468, + "loss": 3.0315, + "step": 1488 + }, + { + "epoch": 0.10358621169431981, + "grad_norm": 0.9453125, + "learning_rate": 0.001971769697525023, + "loss": 3.1924, + "step": 1489 + }, + { + "epoch": 0.103655779331455, + "grad_norm": 0.921875, + "learning_rate": 0.00197171650971101, + "loss": 3.1543, + "step": 1490 + }, + { + "epoch": 0.10372534696859022, + "grad_norm": 0.86328125, + "learning_rate": 0.001971663272558131, + "loss": 3.4982, + "step": 1491 + }, + { + "epoch": 0.10379491460572542, + "grad_norm": 0.8671875, + "learning_rate": 0.001971609986069088, + "loss": 3.4726, + "step": 1492 + }, + { + "epoch": 0.10386448224286061, + "grad_norm": 1.0078125, + "learning_rate": 0.0019715566502465877, + "loss": 3.1342, + "step": 1493 + }, + { + "epoch": 0.10393404987999583, + "grad_norm": 0.8671875, + "learning_rate": 0.0019715032650933374, + "loss": 3.8033, + "step": 1494 + }, + { + "epoch": 0.10400361751713103, + "grad_norm": 0.91015625, + "learning_rate": 0.0019714498306120484, + "loss": 3.3207, + "step": 1495 + }, + { + "epoch": 0.10407318515426624, + "grad_norm": 0.65234375, + "learning_rate": 0.001971396346805433, + "loss": 3.5685, + "step": 1496 + }, + { + "epoch": 0.10414275279140144, + "grad_norm": 0.609375, + "learning_rate": 0.0019713428136762076, + "loss": 3.7468, + "step": 1497 + }, + { + "epoch": 0.10421232042853665, + "grad_norm": 0.9609375, + "learning_rate": 0.00197128923122709, + "loss": 3.2513, + "step": 1498 + }, + { + "epoch": 0.10428188806567185, + "grad_norm": 0.921875, + "learning_rate": 0.0019712355994608013, + "loss": 3.2436, + "step": 1499 + }, + { + "epoch": 0.10435145570280706, + "grad_norm": 0.7734375, + "learning_rate": 0.0019711819183800636, + "loss": 3.3017, + "step": 1500 + }, + { + "epoch": 0.10442102333994226, + "grad_norm": 0.8671875, + "learning_rate": 0.0019711281879876037, + "loss": 2.9926, + "step": 1501 + }, + { + "epoch": 0.10449059097707747, + "grad_norm": 1.1484375, + "learning_rate": 0.0019710744082861486, + "loss": 3.0992, + "step": 1502 + }, + { + "epoch": 0.10456015861421267, + "grad_norm": 0.921875, + "learning_rate": 0.0019710205792784303, + "loss": 3.4089, + "step": 1503 + }, + { + "epoch": 0.10462972625134788, + "grad_norm": 1.0078125, + "learning_rate": 0.001970966700967181, + "loss": 2.9906, + "step": 1504 + }, + { + "epoch": 0.10469929388848308, + "grad_norm": 0.99609375, + "learning_rate": 0.0019709127733551365, + "loss": 3.3956, + "step": 1505 + }, + { + "epoch": 0.10476886152561828, + "grad_norm": 0.9453125, + "learning_rate": 0.0019708587964450356, + "loss": 3.6315, + "step": 1506 + }, + { + "epoch": 0.10483842916275349, + "grad_norm": 0.91796875, + "learning_rate": 0.0019708047702396182, + "loss": 3.4508, + "step": 1507 + }, + { + "epoch": 0.10490799679988869, + "grad_norm": 0.82421875, + "learning_rate": 0.001970750694741628, + "loss": 3.3902, + "step": 1508 + }, + { + "epoch": 0.1049775644370239, + "grad_norm": 0.86328125, + "learning_rate": 0.00197069656995381, + "loss": 3.3237, + "step": 1509 + }, + { + "epoch": 0.1050471320741591, + "grad_norm": 0.8671875, + "learning_rate": 0.001970642395878913, + "loss": 3.215, + "step": 1510 + }, + { + "epoch": 0.10511669971129431, + "grad_norm": 1.0078125, + "learning_rate": 0.001970588172519688, + "loss": 2.8556, + "step": 1511 + }, + { + "epoch": 0.10518626734842951, + "grad_norm": 1.03125, + "learning_rate": 0.001970533899878887, + "loss": 3.1504, + "step": 1512 + }, + { + "epoch": 0.10525583498556472, + "grad_norm": 0.828125, + "learning_rate": 0.0019704795779592666, + "loss": 3.3045, + "step": 1513 + }, + { + "epoch": 0.10532540262269992, + "grad_norm": 0.9765625, + "learning_rate": 0.0019704252067635855, + "loss": 3.2279, + "step": 1514 + }, + { + "epoch": 0.10539497025983513, + "grad_norm": 1.0703125, + "learning_rate": 0.001970370786294603, + "loss": 2.9838, + "step": 1515 + }, + { + "epoch": 0.10546453789697033, + "grad_norm": 0.8046875, + "learning_rate": 0.0019703163165550835, + "loss": 3.3872, + "step": 1516 + }, + { + "epoch": 0.10553410553410554, + "grad_norm": 0.6796875, + "learning_rate": 0.0019702617975477918, + "loss": 3.2918, + "step": 1517 + }, + { + "epoch": 0.10560367317124074, + "grad_norm": 1.2578125, + "learning_rate": 0.001970207229275497, + "loss": 2.9289, + "step": 1518 + }, + { + "epoch": 0.10567324080837594, + "grad_norm": 0.984375, + "learning_rate": 0.001970152611740969, + "loss": 3.4421, + "step": 1519 + }, + { + "epoch": 0.10574280844551115, + "grad_norm": 0.9609375, + "learning_rate": 0.0019700979449469806, + "loss": 3.3153, + "step": 1520 + }, + { + "epoch": 0.10581237608264635, + "grad_norm": 0.58203125, + "learning_rate": 0.001970043228896309, + "loss": 3.7921, + "step": 1521 + }, + { + "epoch": 0.10588194371978156, + "grad_norm": 0.96484375, + "learning_rate": 0.0019699884635917316, + "loss": 3.2177, + "step": 1522 + }, + { + "epoch": 0.10595151135691676, + "grad_norm": 1.0546875, + "learning_rate": 0.001969933649036029, + "loss": 3.3786, + "step": 1523 + }, + { + "epoch": 0.10602107899405197, + "grad_norm": 0.80078125, + "learning_rate": 0.0019698787852319845, + "loss": 3.679, + "step": 1524 + }, + { + "epoch": 0.10609064663118717, + "grad_norm": 0.89453125, + "learning_rate": 0.001969823872182384, + "loss": 2.9693, + "step": 1525 + }, + { + "epoch": 0.10616021426832238, + "grad_norm": 0.74609375, + "learning_rate": 0.0019697689098900155, + "loss": 3.2078, + "step": 1526 + }, + { + "epoch": 0.10622978190545758, + "grad_norm": 0.94140625, + "learning_rate": 0.0019697138983576696, + "loss": 3.2731, + "step": 1527 + }, + { + "epoch": 0.10629934954259279, + "grad_norm": 0.8515625, + "learning_rate": 0.0019696588375881395, + "loss": 3.4905, + "step": 1528 + }, + { + "epoch": 0.10636891717972799, + "grad_norm": 0.78515625, + "learning_rate": 0.0019696037275842215, + "loss": 3.7366, + "step": 1529 + }, + { + "epoch": 0.1064384848168632, + "grad_norm": 0.9296875, + "learning_rate": 0.001969548568348713, + "loss": 3.1914, + "step": 1530 + }, + { + "epoch": 0.1065080524539984, + "grad_norm": 0.79296875, + "learning_rate": 0.0019694933598844153, + "loss": 3.5288, + "step": 1531 + }, + { + "epoch": 0.1065776200911336, + "grad_norm": 0.7265625, + "learning_rate": 0.0019694381021941316, + "loss": 3.1728, + "step": 1532 + }, + { + "epoch": 0.10664718772826881, + "grad_norm": 0.7265625, + "learning_rate": 0.0019693827952806673, + "loss": 3.5763, + "step": 1533 + }, + { + "epoch": 0.10671675536540401, + "grad_norm": 1.1328125, + "learning_rate": 0.0019693274391468303, + "loss": 3.5042, + "step": 1534 + }, + { + "epoch": 0.10678632300253922, + "grad_norm": 0.8984375, + "learning_rate": 0.001969272033795432, + "loss": 2.8814, + "step": 1535 + }, + { + "epoch": 0.10685589063967442, + "grad_norm": 0.70703125, + "learning_rate": 0.0019692165792292854, + "loss": 3.2389, + "step": 1536 + }, + { + "epoch": 0.10692545827680963, + "grad_norm": 0.7109375, + "learning_rate": 0.001969161075451206, + "loss": 3.2213, + "step": 1537 + }, + { + "epoch": 0.10699502591394483, + "grad_norm": 0.80859375, + "learning_rate": 0.001969105522464012, + "loss": 3.6244, + "step": 1538 + }, + { + "epoch": 0.10706459355108004, + "grad_norm": 0.8125, + "learning_rate": 0.0019690499202705243, + "loss": 3.2784, + "step": 1539 + }, + { + "epoch": 0.10713416118821524, + "grad_norm": 1.0546875, + "learning_rate": 0.001968994268873566, + "loss": 3.08, + "step": 1540 + }, + { + "epoch": 0.10720372882535045, + "grad_norm": 0.8203125, + "learning_rate": 0.001968938568275963, + "loss": 2.9797, + "step": 1541 + }, + { + "epoch": 0.10727329646248565, + "grad_norm": 0.90234375, + "learning_rate": 0.0019688828184805432, + "loss": 3.0292, + "step": 1542 + }, + { + "epoch": 0.10734286409962086, + "grad_norm": 0.8984375, + "learning_rate": 0.0019688270194901376, + "loss": 2.9543, + "step": 1543 + }, + { + "epoch": 0.10741243173675606, + "grad_norm": 0.9375, + "learning_rate": 0.001968771171307579, + "loss": 2.8975, + "step": 1544 + }, + { + "epoch": 0.10748199937389126, + "grad_norm": 0.83984375, + "learning_rate": 0.0019687152739357033, + "loss": 3.6984, + "step": 1545 + }, + { + "epoch": 0.10755156701102647, + "grad_norm": 0.859375, + "learning_rate": 0.0019686593273773485, + "loss": 3.5366, + "step": 1546 + }, + { + "epoch": 0.10762113464816167, + "grad_norm": 0.765625, + "learning_rate": 0.0019686033316353557, + "loss": 3.4072, + "step": 1547 + }, + { + "epoch": 0.10769070228529688, + "grad_norm": 0.8359375, + "learning_rate": 0.001968547286712568, + "loss": 3.4246, + "step": 1548 + }, + { + "epoch": 0.10776026992243208, + "grad_norm": 0.87109375, + "learning_rate": 0.0019684911926118307, + "loss": 3.1723, + "step": 1549 + }, + { + "epoch": 0.1078298375595673, + "grad_norm": 0.8828125, + "learning_rate": 0.001968435049335992, + "loss": 3.4788, + "step": 1550 + }, + { + "epoch": 0.10789940519670249, + "grad_norm": 0.75390625, + "learning_rate": 0.001968378856887903, + "loss": 3.4059, + "step": 1551 + }, + { + "epoch": 0.1079689728338377, + "grad_norm": 0.96875, + "learning_rate": 0.0019683226152704164, + "loss": 3.2771, + "step": 1552 + }, + { + "epoch": 0.1080385404709729, + "grad_norm": 1.125, + "learning_rate": 0.001968266324486389, + "loss": 3.1777, + "step": 1553 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 1.0546875, + "learning_rate": 0.001968209984538677, + "loss": 3.3148, + "step": 1554 + }, + { + "epoch": 0.10817767574524331, + "grad_norm": 0.95703125, + "learning_rate": 0.0019681535954301425, + "loss": 3.3594, + "step": 1555 + }, + { + "epoch": 0.10824724338237851, + "grad_norm": 0.90625, + "learning_rate": 0.0019680971571636482, + "loss": 3.2512, + "step": 1556 + }, + { + "epoch": 0.10831681101951372, + "grad_norm": 0.62109375, + "learning_rate": 0.00196804066974206, + "loss": 3.4698, + "step": 1557 + }, + { + "epoch": 0.10838637865664892, + "grad_norm": 0.83984375, + "learning_rate": 0.001967984133168246, + "loss": 3.5944, + "step": 1558 + }, + { + "epoch": 0.10845594629378413, + "grad_norm": 0.6640625, + "learning_rate": 0.001967927547445076, + "loss": 3.4931, + "step": 1559 + }, + { + "epoch": 0.10852551393091933, + "grad_norm": 0.9375, + "learning_rate": 0.001967870912575425, + "loss": 3.1851, + "step": 1560 + }, + { + "epoch": 0.10859508156805454, + "grad_norm": 0.71875, + "learning_rate": 0.0019678142285621666, + "loss": 3.4562, + "step": 1561 + }, + { + "epoch": 0.10866464920518974, + "grad_norm": 0.90234375, + "learning_rate": 0.00196775749540818, + "loss": 3.0386, + "step": 1562 + }, + { + "epoch": 0.10873421684232495, + "grad_norm": 1.75, + "learning_rate": 0.0019677007131163457, + "loss": 3.611, + "step": 1563 + }, + { + "epoch": 0.10880378447946015, + "grad_norm": 1.0234375, + "learning_rate": 0.001967643881689547, + "loss": 3.2333, + "step": 1564 + }, + { + "epoch": 0.10887335211659536, + "grad_norm": 0.96875, + "learning_rate": 0.0019675870011306687, + "loss": 3.495, + "step": 1565 + }, + { + "epoch": 0.10894291975373056, + "grad_norm": 1.0859375, + "learning_rate": 0.0019675300714426004, + "loss": 3.0353, + "step": 1566 + }, + { + "epoch": 0.10901248739086578, + "grad_norm": 0.7421875, + "learning_rate": 0.001967473092628231, + "loss": 3.3026, + "step": 1567 + }, + { + "epoch": 0.10908205502800097, + "grad_norm": 1.0078125, + "learning_rate": 0.001967416064690455, + "loss": 3.4083, + "step": 1568 + }, + { + "epoch": 0.10915162266513617, + "grad_norm": 1.15625, + "learning_rate": 0.001967358987632167, + "loss": 3.1005, + "step": 1569 + }, + { + "epoch": 0.10922119030227138, + "grad_norm": 0.8125, + "learning_rate": 0.001967301861456265, + "loss": 3.5104, + "step": 1570 + }, + { + "epoch": 0.10929075793940658, + "grad_norm": 0.78515625, + "learning_rate": 0.0019672446861656507, + "loss": 3.2699, + "step": 1571 + }, + { + "epoch": 0.1093603255765418, + "grad_norm": 0.6328125, + "learning_rate": 0.001967187461763226, + "loss": 3.2808, + "step": 1572 + }, + { + "epoch": 0.109429893213677, + "grad_norm": 0.79296875, + "learning_rate": 0.0019671301882518977, + "loss": 3.6146, + "step": 1573 + }, + { + "epoch": 0.1094994608508122, + "grad_norm": 0.73828125, + "learning_rate": 0.0019670728656345725, + "loss": 3.3131, + "step": 1574 + }, + { + "epoch": 0.1095690284879474, + "grad_norm": 1.2734375, + "learning_rate": 0.0019670154939141616, + "loss": 3.522, + "step": 1575 + }, + { + "epoch": 0.10963859612508262, + "grad_norm": 0.84765625, + "learning_rate": 0.0019669580730935785, + "loss": 3.5155, + "step": 1576 + }, + { + "epoch": 0.10970816376221781, + "grad_norm": 0.859375, + "learning_rate": 0.001966900603175738, + "loss": 3.1026, + "step": 1577 + }, + { + "epoch": 0.10977773139935303, + "grad_norm": 0.72265625, + "learning_rate": 0.0019668430841635583, + "loss": 3.4919, + "step": 1578 + }, + { + "epoch": 0.10984729903648822, + "grad_norm": 0.94921875, + "learning_rate": 0.0019667855160599604, + "loss": 3.2533, + "step": 1579 + }, + { + "epoch": 0.10991686667362344, + "grad_norm": 0.97265625, + "learning_rate": 0.0019667278988678666, + "loss": 3.6415, + "step": 1580 + }, + { + "epoch": 0.10998643431075864, + "grad_norm": 1.0390625, + "learning_rate": 0.001966670232590203, + "loss": 3.059, + "step": 1581 + }, + { + "epoch": 0.11005600194789383, + "grad_norm": 0.7421875, + "learning_rate": 0.0019666125172298973, + "loss": 2.7372, + "step": 1582 + }, + { + "epoch": 0.11012556958502905, + "grad_norm": 0.73046875, + "learning_rate": 0.0019665547527898796, + "loss": 3.3197, + "step": 1583 + }, + { + "epoch": 0.11019513722216424, + "grad_norm": 0.6953125, + "learning_rate": 0.001966496939273084, + "loss": 2.9155, + "step": 1584 + }, + { + "epoch": 0.11026470485929946, + "grad_norm": 0.96875, + "learning_rate": 0.001966439076682445, + "loss": 2.9173, + "step": 1585 + }, + { + "epoch": 0.11033427249643465, + "grad_norm": 0.640625, + "learning_rate": 0.001966381165020901, + "loss": 3.2683, + "step": 1586 + }, + { + "epoch": 0.11040384013356987, + "grad_norm": 1.09375, + "learning_rate": 0.0019663232042913923, + "loss": 3.6652, + "step": 1587 + }, + { + "epoch": 0.11047340777070506, + "grad_norm": 0.87890625, + "learning_rate": 0.0019662651944968622, + "loss": 3.0684, + "step": 1588 + }, + { + "epoch": 0.11054297540784028, + "grad_norm": 0.73046875, + "learning_rate": 0.0019662071356402557, + "loss": 3.464, + "step": 1589 + }, + { + "epoch": 0.11061254304497548, + "grad_norm": 0.98828125, + "learning_rate": 0.0019661490277245205, + "loss": 3.1648, + "step": 1590 + }, + { + "epoch": 0.11068211068211069, + "grad_norm": 0.7578125, + "learning_rate": 0.001966090870752608, + "loss": 3.8514, + "step": 1591 + }, + { + "epoch": 0.11075167831924589, + "grad_norm": 1.0859375, + "learning_rate": 0.00196603266472747, + "loss": 3.5695, + "step": 1592 + }, + { + "epoch": 0.1108212459563811, + "grad_norm": 0.8359375, + "learning_rate": 0.0019659744096520632, + "loss": 3.2381, + "step": 1593 + }, + { + "epoch": 0.1108908135935163, + "grad_norm": 0.7265625, + "learning_rate": 0.0019659161055293442, + "loss": 3.2566, + "step": 1594 + }, + { + "epoch": 0.1109603812306515, + "grad_norm": 0.8046875, + "learning_rate": 0.001965857752362274, + "loss": 3.4792, + "step": 1595 + }, + { + "epoch": 0.1110299488677867, + "grad_norm": 0.78125, + "learning_rate": 0.0019657993501538155, + "loss": 3.3937, + "step": 1596 + }, + { + "epoch": 0.1110995165049219, + "grad_norm": 0.7421875, + "learning_rate": 0.001965740898906934, + "loss": 3.25, + "step": 1597 + }, + { + "epoch": 0.11116908414205712, + "grad_norm": 0.80078125, + "learning_rate": 0.001965682398624597, + "loss": 3.3554, + "step": 1598 + }, + { + "epoch": 0.11123865177919232, + "grad_norm": 0.765625, + "learning_rate": 0.001965623849309776, + "loss": 3.114, + "step": 1599 + }, + { + "epoch": 0.11130821941632753, + "grad_norm": 0.9375, + "learning_rate": 0.0019655652509654423, + "loss": 3.368, + "step": 1600 + }, + { + "epoch": 0.11137778705346273, + "grad_norm": 0.7578125, + "learning_rate": 0.001965506603594572, + "loss": 3.2518, + "step": 1601 + }, + { + "epoch": 0.11144735469059794, + "grad_norm": 0.84375, + "learning_rate": 0.001965447907200143, + "loss": 3.5058, + "step": 1602 + }, + { + "epoch": 0.11151692232773314, + "grad_norm": 0.8984375, + "learning_rate": 0.0019653891617851357, + "loss": 3.2876, + "step": 1603 + }, + { + "epoch": 0.11158648996486835, + "grad_norm": 1.09375, + "learning_rate": 0.001965330367352533, + "loss": 3.3141, + "step": 1604 + }, + { + "epoch": 0.11165605760200355, + "grad_norm": 0.8203125, + "learning_rate": 0.001965271523905319, + "loss": 3.6069, + "step": 1605 + }, + { + "epoch": 0.11172562523913876, + "grad_norm": 0.73828125, + "learning_rate": 0.001965212631446483, + "loss": 3.1281, + "step": 1606 + }, + { + "epoch": 0.11179519287627396, + "grad_norm": 0.97265625, + "learning_rate": 0.0019651536899790143, + "loss": 2.982, + "step": 1607 + }, + { + "epoch": 0.11186476051340916, + "grad_norm": 0.79296875, + "learning_rate": 0.001965094699505906, + "loss": 3.2783, + "step": 1608 + }, + { + "epoch": 0.11193432815054437, + "grad_norm": 0.84765625, + "learning_rate": 0.0019650356600301533, + "loss": 2.854, + "step": 1609 + }, + { + "epoch": 0.11200389578767957, + "grad_norm": 0.84375, + "learning_rate": 0.001964976571554754, + "loss": 3.1584, + "step": 1610 + }, + { + "epoch": 0.11207346342481478, + "grad_norm": 0.765625, + "learning_rate": 0.001964917434082708, + "loss": 2.9245, + "step": 1611 + }, + { + "epoch": 0.11214303106194998, + "grad_norm": 0.80078125, + "learning_rate": 0.0019648582476170184, + "loss": 3.347, + "step": 1612 + }, + { + "epoch": 0.11221259869908519, + "grad_norm": 0.71484375, + "learning_rate": 0.0019647990121606906, + "loss": 3.3861, + "step": 1613 + }, + { + "epoch": 0.11228216633622039, + "grad_norm": 0.765625, + "learning_rate": 0.0019647397277167316, + "loss": 3.1979, + "step": 1614 + }, + { + "epoch": 0.1123517339733556, + "grad_norm": 0.8671875, + "learning_rate": 0.0019646803942881515, + "loss": 3.4472, + "step": 1615 + }, + { + "epoch": 0.1124213016104908, + "grad_norm": 1.0234375, + "learning_rate": 0.0019646210118779636, + "loss": 3.1445, + "step": 1616 + }, + { + "epoch": 0.11249086924762601, + "grad_norm": 0.88671875, + "learning_rate": 0.0019645615804891833, + "loss": 3.1762, + "step": 1617 + }, + { + "epoch": 0.11256043688476121, + "grad_norm": 0.94140625, + "learning_rate": 0.001964502100124827, + "loss": 3.0012, + "step": 1618 + }, + { + "epoch": 0.1126300045218964, + "grad_norm": 0.90234375, + "learning_rate": 0.001964442570787916, + "loss": 3.1677, + "step": 1619 + }, + { + "epoch": 0.11269957215903162, + "grad_norm": 0.71875, + "learning_rate": 0.001964382992481472, + "loss": 3.3573, + "step": 1620 + }, + { + "epoch": 0.11276913979616682, + "grad_norm": 0.734375, + "learning_rate": 0.0019643233652085206, + "loss": 3.5359, + "step": 1621 + }, + { + "epoch": 0.11283870743330203, + "grad_norm": 1.015625, + "learning_rate": 0.0019642636889720894, + "loss": 2.8434, + "step": 1622 + }, + { + "epoch": 0.11290827507043723, + "grad_norm": 0.87109375, + "learning_rate": 0.001964203963775208, + "loss": 3.3141, + "step": 1623 + }, + { + "epoch": 0.11297784270757244, + "grad_norm": 0.765625, + "learning_rate": 0.00196414418962091, + "loss": 3.6836, + "step": 1624 + }, + { + "epoch": 0.11304741034470764, + "grad_norm": 1.0390625, + "learning_rate": 0.0019640843665122286, + "loss": 3.5512, + "step": 1625 + }, + { + "epoch": 0.11311697798184285, + "grad_norm": 0.90625, + "learning_rate": 0.0019640244944522036, + "loss": 2.9838, + "step": 1626 + }, + { + "epoch": 0.11318654561897805, + "grad_norm": 0.86328125, + "learning_rate": 0.001963964573443873, + "loss": 3.2729, + "step": 1627 + }, + { + "epoch": 0.11325611325611326, + "grad_norm": 0.890625, + "learning_rate": 0.00196390460349028, + "loss": 3.4352, + "step": 1628 + }, + { + "epoch": 0.11332568089324846, + "grad_norm": 0.95703125, + "learning_rate": 0.0019638445845944702, + "loss": 3.5427, + "step": 1629 + }, + { + "epoch": 0.11339524853038367, + "grad_norm": 0.96484375, + "learning_rate": 0.0019637845167594903, + "loss": 3.2987, + "step": 1630 + }, + { + "epoch": 0.11346481616751887, + "grad_norm": 0.8984375, + "learning_rate": 0.0019637243999883905, + "loss": 3.2223, + "step": 1631 + }, + { + "epoch": 0.11353438380465407, + "grad_norm": 1.09375, + "learning_rate": 0.001963664234284223, + "loss": 3.177, + "step": 1632 + }, + { + "epoch": 0.11360395144178928, + "grad_norm": 0.80859375, + "learning_rate": 0.0019636040196500436, + "loss": 3.2046, + "step": 1633 + }, + { + "epoch": 0.11367351907892448, + "grad_norm": 0.83984375, + "learning_rate": 0.0019635437560889084, + "loss": 3.163, + "step": 1634 + }, + { + "epoch": 0.11374308671605969, + "grad_norm": 1.015625, + "learning_rate": 0.001963483443603878, + "loss": 3.1626, + "step": 1635 + }, + { + "epoch": 0.11381265435319489, + "grad_norm": 1.1015625, + "learning_rate": 0.0019634230821980146, + "loss": 3.3732, + "step": 1636 + }, + { + "epoch": 0.1138822219903301, + "grad_norm": 0.91796875, + "learning_rate": 0.001963362671874383, + "loss": 3.3927, + "step": 1637 + }, + { + "epoch": 0.1139517896274653, + "grad_norm": 0.87890625, + "learning_rate": 0.0019633022126360512, + "loss": 3.0315, + "step": 1638 + }, + { + "epoch": 0.11402135726460051, + "grad_norm": 0.9375, + "learning_rate": 0.0019632417044860876, + "loss": 3.236, + "step": 1639 + }, + { + "epoch": 0.11409092490173571, + "grad_norm": 0.74609375, + "learning_rate": 0.001963181147427566, + "loss": 3.0346, + "step": 1640 + }, + { + "epoch": 0.11416049253887092, + "grad_norm": 1.015625, + "learning_rate": 0.0019631205414635602, + "loss": 3.0258, + "step": 1641 + }, + { + "epoch": 0.11423006017600612, + "grad_norm": 1.046875, + "learning_rate": 0.001963059886597148, + "loss": 3.1226, + "step": 1642 + }, + { + "epoch": 0.11429962781314133, + "grad_norm": 1.3203125, + "learning_rate": 0.001962999182831409, + "loss": 3.1423, + "step": 1643 + }, + { + "epoch": 0.11436919545027653, + "grad_norm": 0.734375, + "learning_rate": 0.0019629384301694253, + "loss": 3.4149, + "step": 1644 + }, + { + "epoch": 0.11443876308741173, + "grad_norm": 0.95703125, + "learning_rate": 0.0019628776286142813, + "loss": 3.0182, + "step": 1645 + }, + { + "epoch": 0.11450833072454694, + "grad_norm": 0.91796875, + "learning_rate": 0.001962816778169065, + "loss": 3.2926, + "step": 1646 + }, + { + "epoch": 0.11457789836168214, + "grad_norm": 0.99609375, + "learning_rate": 0.0019627558788368657, + "loss": 3.243, + "step": 1647 + }, + { + "epoch": 0.11464746599881735, + "grad_norm": 0.8359375, + "learning_rate": 0.001962694930620775, + "loss": 3.2997, + "step": 1648 + }, + { + "epoch": 0.11471703363595255, + "grad_norm": 0.90625, + "learning_rate": 0.001962633933523889, + "loss": 2.6841, + "step": 1649 + }, + { + "epoch": 0.11478660127308776, + "grad_norm": 0.5078125, + "learning_rate": 0.001962572887549303, + "loss": 3.6762, + "step": 1650 + }, + { + "epoch": 0.11485616891022296, + "grad_norm": 0.8828125, + "learning_rate": 0.001962511792700118, + "loss": 3.12, + "step": 1651 + }, + { + "epoch": 0.11492573654735817, + "grad_norm": 0.99609375, + "learning_rate": 0.001962450648979435, + "loss": 3.3223, + "step": 1652 + }, + { + "epoch": 0.11499530418449337, + "grad_norm": 0.734375, + "learning_rate": 0.0019623894563903597, + "loss": 3.5976, + "step": 1653 + }, + { + "epoch": 0.11506487182162858, + "grad_norm": 1.2578125, + "learning_rate": 0.0019623282149359984, + "loss": 2.9595, + "step": 1654 + }, + { + "epoch": 0.11513443945876378, + "grad_norm": 0.703125, + "learning_rate": 0.001962266924619461, + "loss": 3.4546, + "step": 1655 + }, + { + "epoch": 0.115204007095899, + "grad_norm": 0.98046875, + "learning_rate": 0.001962205585443859, + "loss": 3.3012, + "step": 1656 + }, + { + "epoch": 0.11527357473303419, + "grad_norm": 1.1015625, + "learning_rate": 0.0019621441974123077, + "loss": 3.1979, + "step": 1657 + }, + { + "epoch": 0.11534314237016939, + "grad_norm": 0.88671875, + "learning_rate": 0.0019620827605279236, + "loss": 3.0767, + "step": 1658 + }, + { + "epoch": 0.1154127100073046, + "grad_norm": 0.78125, + "learning_rate": 0.001962021274793826, + "loss": 3.2852, + "step": 1659 + }, + { + "epoch": 0.1154822776444398, + "grad_norm": 1.03125, + "learning_rate": 0.001961959740213137, + "loss": 3.336, + "step": 1660 + }, + { + "epoch": 0.11555184528157501, + "grad_norm": 1.0390625, + "learning_rate": 0.001961898156788981, + "loss": 2.9833, + "step": 1661 + }, + { + "epoch": 0.11562141291871021, + "grad_norm": 0.73828125, + "learning_rate": 0.001961836524524485, + "loss": 3.4473, + "step": 1662 + }, + { + "epoch": 0.11569098055584542, + "grad_norm": 0.6953125, + "learning_rate": 0.001961774843422778, + "loss": 3.2512, + "step": 1663 + }, + { + "epoch": 0.11576054819298062, + "grad_norm": 0.94140625, + "learning_rate": 0.0019617131134869927, + "loss": 3.3328, + "step": 1664 + }, + { + "epoch": 0.11583011583011583, + "grad_norm": 0.97265625, + "learning_rate": 0.0019616513347202624, + "loss": 3.4149, + "step": 1665 + }, + { + "epoch": 0.11589968346725103, + "grad_norm": 0.8359375, + "learning_rate": 0.001961589507125725, + "loss": 3.1448, + "step": 1666 + }, + { + "epoch": 0.11596925110438625, + "grad_norm": 0.765625, + "learning_rate": 0.0019615276307065185, + "loss": 3.1483, + "step": 1667 + }, + { + "epoch": 0.11603881874152144, + "grad_norm": 0.80078125, + "learning_rate": 0.0019614657054657855, + "loss": 3.2344, + "step": 1668 + }, + { + "epoch": 0.11610838637865666, + "grad_norm": 0.67578125, + "learning_rate": 0.0019614037314066705, + "loss": 3.5726, + "step": 1669 + }, + { + "epoch": 0.11617795401579185, + "grad_norm": 0.8046875, + "learning_rate": 0.0019613417085323193, + "loss": 3.5865, + "step": 1670 + }, + { + "epoch": 0.11624752165292705, + "grad_norm": 0.91796875, + "learning_rate": 0.0019612796368458827, + "loss": 3.2813, + "step": 1671 + }, + { + "epoch": 0.11631708929006226, + "grad_norm": 0.89453125, + "learning_rate": 0.0019612175163505104, + "loss": 3.1217, + "step": 1672 + }, + { + "epoch": 0.11638665692719746, + "grad_norm": 0.83203125, + "learning_rate": 0.0019611553470493576, + "loss": 2.907, + "step": 1673 + }, + { + "epoch": 0.11645622456433267, + "grad_norm": 0.8203125, + "learning_rate": 0.0019610931289455813, + "loss": 3.2636, + "step": 1674 + }, + { + "epoch": 0.11652579220146787, + "grad_norm": 0.83984375, + "learning_rate": 0.0019610308620423397, + "loss": 3.4404, + "step": 1675 + }, + { + "epoch": 0.11659535983860309, + "grad_norm": 1.015625, + "learning_rate": 0.0019609685463427952, + "loss": 3.5559, + "step": 1676 + }, + { + "epoch": 0.11666492747573828, + "grad_norm": 0.984375, + "learning_rate": 0.0019609061818501115, + "loss": 3.477, + "step": 1677 + }, + { + "epoch": 0.1167344951128735, + "grad_norm": 0.84375, + "learning_rate": 0.001960843768567455, + "loss": 3.286, + "step": 1678 + }, + { + "epoch": 0.1168040627500087, + "grad_norm": 0.70703125, + "learning_rate": 0.0019607813064979954, + "loss": 3.4537, + "step": 1679 + }, + { + "epoch": 0.1168736303871439, + "grad_norm": 1.0234375, + "learning_rate": 0.0019607187956449034, + "loss": 3.4741, + "step": 1680 + }, + { + "epoch": 0.1169431980242791, + "grad_norm": 0.8984375, + "learning_rate": 0.0019606562360113535, + "loss": 3.4188, + "step": 1681 + }, + { + "epoch": 0.11701276566141432, + "grad_norm": 0.83203125, + "learning_rate": 0.0019605936276005215, + "loss": 3.1084, + "step": 1682 + }, + { + "epoch": 0.11708233329854952, + "grad_norm": 0.60546875, + "learning_rate": 0.0019605309704155876, + "loss": 3.5886, + "step": 1683 + }, + { + "epoch": 0.11715190093568471, + "grad_norm": 0.80859375, + "learning_rate": 0.001960468264459732, + "loss": 3.1211, + "step": 1684 + }, + { + "epoch": 0.11722146857281993, + "grad_norm": 0.6796875, + "learning_rate": 0.0019604055097361393, + "loss": 3.3281, + "step": 1685 + }, + { + "epoch": 0.11729103620995512, + "grad_norm": 0.8984375, + "learning_rate": 0.0019603427062479953, + "loss": 3.4789, + "step": 1686 + }, + { + "epoch": 0.11736060384709034, + "grad_norm": 0.98046875, + "learning_rate": 0.001960279853998489, + "loss": 3.2314, + "step": 1687 + }, + { + "epoch": 0.11743017148422553, + "grad_norm": 0.6953125, + "learning_rate": 0.0019602169529908124, + "loss": 3.8185, + "step": 1688 + }, + { + "epoch": 0.11749973912136075, + "grad_norm": 0.796875, + "learning_rate": 0.001960154003228159, + "loss": 3.1382, + "step": 1689 + }, + { + "epoch": 0.11756930675849595, + "grad_norm": 0.79296875, + "learning_rate": 0.0019600910047137244, + "loss": 3.1299, + "step": 1690 + }, + { + "epoch": 0.11763887439563116, + "grad_norm": 0.71484375, + "learning_rate": 0.0019600279574507077, + "loss": 3.1588, + "step": 1691 + }, + { + "epoch": 0.11770844203276636, + "grad_norm": 0.828125, + "learning_rate": 0.00195996486144231, + "loss": 3.1371, + "step": 1692 + }, + { + "epoch": 0.11777800966990157, + "grad_norm": 0.74609375, + "learning_rate": 0.001959901716691736, + "loss": 3.4681, + "step": 1693 + }, + { + "epoch": 0.11784757730703677, + "grad_norm": 0.67578125, + "learning_rate": 0.0019598385232021905, + "loss": 3.3351, + "step": 1694 + }, + { + "epoch": 0.11791714494417196, + "grad_norm": 0.69921875, + "learning_rate": 0.0019597752809768832, + "loss": 3.4628, + "step": 1695 + }, + { + "epoch": 0.11798671258130718, + "grad_norm": 0.81640625, + "learning_rate": 0.0019597119900190245, + "loss": 3.2477, + "step": 1696 + }, + { + "epoch": 0.11805628021844237, + "grad_norm": 0.82421875, + "learning_rate": 0.001959648650331828, + "loss": 3.3238, + "step": 1697 + }, + { + "epoch": 0.11812584785557759, + "grad_norm": 0.75, + "learning_rate": 0.00195958526191851, + "loss": 3.4188, + "step": 1698 + }, + { + "epoch": 0.11819541549271279, + "grad_norm": 0.9921875, + "learning_rate": 0.00195952182478229, + "loss": 3.3663, + "step": 1699 + }, + { + "epoch": 0.118264983129848, + "grad_norm": 0.8984375, + "learning_rate": 0.0019594583389263872, + "loss": 3.1903, + "step": 1700 + }, + { + "epoch": 0.1183345507669832, + "grad_norm": 0.87109375, + "learning_rate": 0.001959394804354026, + "loss": 3.4655, + "step": 1701 + }, + { + "epoch": 0.11840411840411841, + "grad_norm": 0.84765625, + "learning_rate": 0.0019593312210684326, + "loss": 3.4838, + "step": 1702 + }, + { + "epoch": 0.1184736860412536, + "grad_norm": 1.0078125, + "learning_rate": 0.001959267589072835, + "loss": 3.4469, + "step": 1703 + }, + { + "epoch": 0.11854325367838882, + "grad_norm": 0.81640625, + "learning_rate": 0.0019592039083704644, + "loss": 3.0336, + "step": 1704 + }, + { + "epoch": 0.11861282131552402, + "grad_norm": 0.734375, + "learning_rate": 0.001959140178964554, + "loss": 3.3108, + "step": 1705 + }, + { + "epoch": 0.11868238895265923, + "grad_norm": 1.09375, + "learning_rate": 0.00195907640085834, + "loss": 3.0878, + "step": 1706 + }, + { + "epoch": 0.11875195658979443, + "grad_norm": 0.9765625, + "learning_rate": 0.00195901257405506, + "loss": 3.0738, + "step": 1707 + }, + { + "epoch": 0.11882152422692963, + "grad_norm": 0.75390625, + "learning_rate": 0.0019589486985579557, + "loss": 3.4438, + "step": 1708 + }, + { + "epoch": 0.11889109186406484, + "grad_norm": 0.56640625, + "learning_rate": 0.00195888477437027, + "loss": 3.465, + "step": 1709 + }, + { + "epoch": 0.11896065950120004, + "grad_norm": 0.5703125, + "learning_rate": 0.001958820801495248, + "loss": 3.4594, + "step": 1710 + }, + { + "epoch": 0.11903022713833525, + "grad_norm": 0.8828125, + "learning_rate": 0.001958756779936139, + "loss": 3.4865, + "step": 1711 + }, + { + "epoch": 0.11909979477547045, + "grad_norm": 0.75, + "learning_rate": 0.0019586927096961935, + "loss": 3.2511, + "step": 1712 + }, + { + "epoch": 0.11916936241260566, + "grad_norm": 0.6875, + "learning_rate": 0.001958628590778664, + "loss": 3.3198, + "step": 1713 + }, + { + "epoch": 0.11923893004974086, + "grad_norm": 1.2265625, + "learning_rate": 0.0019585644231868062, + "loss": 3.6062, + "step": 1714 + }, + { + "epoch": 0.11930849768687607, + "grad_norm": 0.7421875, + "learning_rate": 0.001958500206923879, + "loss": 3.5153, + "step": 1715 + }, + { + "epoch": 0.11937806532401127, + "grad_norm": 0.796875, + "learning_rate": 0.001958435941993142, + "loss": 3.3921, + "step": 1716 + }, + { + "epoch": 0.11944763296114648, + "grad_norm": 0.7734375, + "learning_rate": 0.0019583716283978593, + "loss": 3.3137, + "step": 1717 + }, + { + "epoch": 0.11951720059828168, + "grad_norm": 0.8203125, + "learning_rate": 0.0019583072661412955, + "loss": 3.4524, + "step": 1718 + }, + { + "epoch": 0.11958676823541689, + "grad_norm": 0.62109375, + "learning_rate": 0.001958242855226719, + "loss": 3.3938, + "step": 1719 + }, + { + "epoch": 0.11965633587255209, + "grad_norm": 0.70703125, + "learning_rate": 0.0019581783956574006, + "loss": 3.168, + "step": 1720 + }, + { + "epoch": 0.11972590350968729, + "grad_norm": 0.55859375, + "learning_rate": 0.001958113887436612, + "loss": 3.6354, + "step": 1721 + }, + { + "epoch": 0.1197954711468225, + "grad_norm": 0.734375, + "learning_rate": 0.00195804933056763, + "loss": 3.1484, + "step": 1722 + }, + { + "epoch": 0.1198650387839577, + "grad_norm": 1.015625, + "learning_rate": 0.0019579847250537318, + "loss": 2.7735, + "step": 1723 + }, + { + "epoch": 0.11993460642109291, + "grad_norm": 0.80859375, + "learning_rate": 0.001957920070898198, + "loss": 2.9981, + "step": 1724 + }, + { + "epoch": 0.12000417405822811, + "grad_norm": 0.953125, + "learning_rate": 0.0019578553681043115, + "loss": 2.9309, + "step": 1725 + }, + { + "epoch": 0.12007374169536332, + "grad_norm": 0.71875, + "learning_rate": 0.001957790616675357, + "loss": 3.4824, + "step": 1726 + }, + { + "epoch": 0.12014330933249852, + "grad_norm": 0.7890625, + "learning_rate": 0.0019577258166146227, + "loss": 3.1838, + "step": 1727 + }, + { + "epoch": 0.12021287696963373, + "grad_norm": 0.82421875, + "learning_rate": 0.0019576609679253986, + "loss": 2.8742, + "step": 1728 + }, + { + "epoch": 0.12028244460676893, + "grad_norm": 0.9375, + "learning_rate": 0.001957596070610978, + "loss": 3.5897, + "step": 1729 + }, + { + "epoch": 0.12035201224390414, + "grad_norm": 0.62109375, + "learning_rate": 0.001957531124674655, + "loss": 3.6366, + "step": 1730 + }, + { + "epoch": 0.12042157988103934, + "grad_norm": 0.74609375, + "learning_rate": 0.001957466130119728, + "loss": 3.0287, + "step": 1731 + }, + { + "epoch": 0.12049114751817455, + "grad_norm": 0.890625, + "learning_rate": 0.0019574010869494968, + "loss": 3.2798, + "step": 1732 + }, + { + "epoch": 0.12056071515530975, + "grad_norm": 0.82421875, + "learning_rate": 0.0019573359951672643, + "loss": 3.4167, + "step": 1733 + }, + { + "epoch": 0.12063028279244495, + "grad_norm": 2.75, + "learning_rate": 0.001957270854776335, + "loss": 3.3749, + "step": 1734 + }, + { + "epoch": 0.12069985042958016, + "grad_norm": 0.7265625, + "learning_rate": 0.001957205665780017, + "loss": 3.7976, + "step": 1735 + }, + { + "epoch": 0.12076941806671536, + "grad_norm": 1.015625, + "learning_rate": 0.00195714042818162, + "loss": 3.2681, + "step": 1736 + }, + { + "epoch": 0.12083898570385057, + "grad_norm": 0.828125, + "learning_rate": 0.001957075141984456, + "loss": 3.0821, + "step": 1737 + }, + { + "epoch": 0.12090855334098577, + "grad_norm": 1.0546875, + "learning_rate": 0.0019570098071918407, + "loss": 3.477, + "step": 1738 + }, + { + "epoch": 0.12097812097812098, + "grad_norm": 0.88671875, + "learning_rate": 0.001956944423807091, + "loss": 3.3121, + "step": 1739 + }, + { + "epoch": 0.12104768861525618, + "grad_norm": 0.9140625, + "learning_rate": 0.0019568789918335268, + "loss": 3.0542, + "step": 1740 + }, + { + "epoch": 0.12111725625239139, + "grad_norm": 0.95703125, + "learning_rate": 0.0019568135112744698, + "loss": 3.432, + "step": 1741 + }, + { + "epoch": 0.12118682388952659, + "grad_norm": 0.85546875, + "learning_rate": 0.0019567479821332463, + "loss": 2.9629, + "step": 1742 + }, + { + "epoch": 0.1212563915266618, + "grad_norm": 0.99609375, + "learning_rate": 0.001956682404413182, + "loss": 3.4069, + "step": 1743 + }, + { + "epoch": 0.121325959163797, + "grad_norm": 0.84375, + "learning_rate": 0.0019566167781176077, + "loss": 3.2643, + "step": 1744 + }, + { + "epoch": 0.12139552680093221, + "grad_norm": 0.84765625, + "learning_rate": 0.001956551103249855, + "loss": 3.8109, + "step": 1745 + }, + { + "epoch": 0.12146509443806741, + "grad_norm": 0.70703125, + "learning_rate": 0.0019564853798132585, + "loss": 3.5165, + "step": 1746 + }, + { + "epoch": 0.12153466207520261, + "grad_norm": 0.9765625, + "learning_rate": 0.0019564196078111556, + "loss": 3.4767, + "step": 1747 + }, + { + "epoch": 0.12160422971233782, + "grad_norm": 0.9609375, + "learning_rate": 0.0019563537872468854, + "loss": 3.2814, + "step": 1748 + }, + { + "epoch": 0.12167379734947302, + "grad_norm": 1.0390625, + "learning_rate": 0.0019562879181237907, + "loss": 3.188, + "step": 1749 + }, + { + "epoch": 0.12174336498660823, + "grad_norm": 0.86328125, + "learning_rate": 0.0019562220004452156, + "loss": 3.3306, + "step": 1750 + }, + { + "epoch": 0.12181293262374343, + "grad_norm": 1.0234375, + "learning_rate": 0.001956156034214507, + "loss": 3.3211, + "step": 1751 + }, + { + "epoch": 0.12188250026087864, + "grad_norm": 1.109375, + "learning_rate": 0.0019560900194350137, + "loss": 3.0687, + "step": 1752 + }, + { + "epoch": 0.12195206789801384, + "grad_norm": 0.671875, + "learning_rate": 0.001956023956110089, + "loss": 3.1071, + "step": 1753 + }, + { + "epoch": 0.12202163553514905, + "grad_norm": 0.796875, + "learning_rate": 0.0019559578442430864, + "loss": 3.1057, + "step": 1754 + }, + { + "epoch": 0.12209120317228425, + "grad_norm": 0.84375, + "learning_rate": 0.0019558916838373626, + "loss": 3.3788, + "step": 1755 + }, + { + "epoch": 0.12216077080941946, + "grad_norm": 0.8359375, + "learning_rate": 0.0019558254748962773, + "loss": 3.0311, + "step": 1756 + }, + { + "epoch": 0.12223033844655466, + "grad_norm": 0.94921875, + "learning_rate": 0.001955759217423192, + "loss": 3.4973, + "step": 1757 + }, + { + "epoch": 0.12229990608368987, + "grad_norm": 1.0234375, + "learning_rate": 0.001955692911421471, + "loss": 3.0044, + "step": 1758 + }, + { + "epoch": 0.12236947372082507, + "grad_norm": 0.7578125, + "learning_rate": 0.0019556265568944813, + "loss": 2.8627, + "step": 1759 + }, + { + "epoch": 0.12243904135796027, + "grad_norm": 0.70703125, + "learning_rate": 0.0019555601538455915, + "loss": 3.5029, + "step": 1760 + }, + { + "epoch": 0.12250860899509548, + "grad_norm": 0.953125, + "learning_rate": 0.0019554937022781735, + "loss": 3.1952, + "step": 1761 + }, + { + "epoch": 0.12257817663223068, + "grad_norm": 0.86328125, + "learning_rate": 0.0019554272021956014, + "loss": 3.7242, + "step": 1762 + }, + { + "epoch": 0.1226477442693659, + "grad_norm": 0.94921875, + "learning_rate": 0.001955360653601252, + "loss": 3.31, + "step": 1763 + }, + { + "epoch": 0.12271731190650109, + "grad_norm": 0.82421875, + "learning_rate": 0.0019552940564985036, + "loss": 3.3173, + "step": 1764 + }, + { + "epoch": 0.1227868795436363, + "grad_norm": 0.79296875, + "learning_rate": 0.001955227410890738, + "loss": 3.2139, + "step": 1765 + }, + { + "epoch": 0.1228564471807715, + "grad_norm": 0.796875, + "learning_rate": 0.001955160716781339, + "loss": 3.0229, + "step": 1766 + }, + { + "epoch": 0.12292601481790671, + "grad_norm": 0.71484375, + "learning_rate": 0.0019550939741736937, + "loss": 3.0964, + "step": 1767 + }, + { + "epoch": 0.12299558245504191, + "grad_norm": 0.890625, + "learning_rate": 0.00195502718307119, + "loss": 3.3713, + "step": 1768 + }, + { + "epoch": 0.12306515009217713, + "grad_norm": 1.03125, + "learning_rate": 0.0019549603434772197, + "loss": 3.3651, + "step": 1769 + }, + { + "epoch": 0.12313471772931232, + "grad_norm": 0.89453125, + "learning_rate": 0.001954893455395177, + "loss": 2.9908, + "step": 1770 + }, + { + "epoch": 0.12320428536644752, + "grad_norm": 0.921875, + "learning_rate": 0.0019548265188284574, + "loss": 2.9317, + "step": 1771 + }, + { + "epoch": 0.12327385300358273, + "grad_norm": 0.9375, + "learning_rate": 0.0019547595337804594, + "loss": 2.8074, + "step": 1772 + }, + { + "epoch": 0.12334342064071793, + "grad_norm": 0.921875, + "learning_rate": 0.001954692500254585, + "loss": 3.339, + "step": 1773 + }, + { + "epoch": 0.12341298827785314, + "grad_norm": 0.9453125, + "learning_rate": 0.0019546254182542374, + "loss": 3.0529, + "step": 1774 + }, + { + "epoch": 0.12348255591498834, + "grad_norm": 0.78125, + "learning_rate": 0.001954558287782823, + "loss": 3.3201, + "step": 1775 + }, + { + "epoch": 0.12355212355212356, + "grad_norm": 1.0234375, + "learning_rate": 0.0019544911088437496, + "loss": 3.211, + "step": 1776 + }, + { + "epoch": 0.12362169118925875, + "grad_norm": 0.84765625, + "learning_rate": 0.0019544238814404287, + "loss": 3.4603, + "step": 1777 + }, + { + "epoch": 0.12369125882639397, + "grad_norm": 0.73828125, + "learning_rate": 0.0019543566055762744, + "loss": 3.3202, + "step": 1778 + }, + { + "epoch": 0.12376082646352916, + "grad_norm": 1.09375, + "learning_rate": 0.0019542892812547015, + "loss": 3.4099, + "step": 1779 + }, + { + "epoch": 0.12383039410066438, + "grad_norm": 0.88671875, + "learning_rate": 0.0019542219084791286, + "loss": 3.6499, + "step": 1780 + }, + { + "epoch": 0.12389996173779957, + "grad_norm": 0.84765625, + "learning_rate": 0.001954154487252977, + "loss": 3.2814, + "step": 1781 + }, + { + "epoch": 0.12396952937493479, + "grad_norm": 0.796875, + "learning_rate": 0.00195408701757967, + "loss": 3.1144, + "step": 1782 + }, + { + "epoch": 0.12403909701206998, + "grad_norm": 0.90625, + "learning_rate": 0.001954019499462633, + "loss": 3.2145, + "step": 1783 + }, + { + "epoch": 0.12410866464920518, + "grad_norm": 1.1796875, + "learning_rate": 0.001953951932905295, + "loss": 2.9224, + "step": 1784 + }, + { + "epoch": 0.1241782322863404, + "grad_norm": 0.62890625, + "learning_rate": 0.0019538843179110854, + "loss": 3.5563, + "step": 1785 + }, + { + "epoch": 0.1242477999234756, + "grad_norm": 0.73828125, + "learning_rate": 0.0019538166544834385, + "loss": 3.7433, + "step": 1786 + }, + { + "epoch": 0.1243173675606108, + "grad_norm": 0.66796875, + "learning_rate": 0.0019537489426257894, + "loss": 3.5424, + "step": 1787 + }, + { + "epoch": 0.124386935197746, + "grad_norm": 0.81640625, + "learning_rate": 0.001953681182341576, + "loss": 3.1177, + "step": 1788 + }, + { + "epoch": 0.12445650283488122, + "grad_norm": 0.94140625, + "learning_rate": 0.0019536133736342393, + "loss": 3.1545, + "step": 1789 + }, + { + "epoch": 0.12452607047201641, + "grad_norm": 0.92578125, + "learning_rate": 0.001953545516507222, + "loss": 3.2966, + "step": 1790 + }, + { + "epoch": 0.12459563810915163, + "grad_norm": 0.828125, + "learning_rate": 0.00195347761096397, + "loss": 3.0389, + "step": 1791 + }, + { + "epoch": 0.12466520574628683, + "grad_norm": 0.8984375, + "learning_rate": 0.0019534096570079304, + "loss": 3.2605, + "step": 1792 + }, + { + "epoch": 0.12473477338342204, + "grad_norm": 0.734375, + "learning_rate": 0.001953341654642554, + "loss": 3.5102, + "step": 1793 + }, + { + "epoch": 0.12480434102055724, + "grad_norm": 0.6875, + "learning_rate": 0.0019532736038712934, + "loss": 3.6672, + "step": 1794 + }, + { + "epoch": 0.12487390865769245, + "grad_norm": 0.953125, + "learning_rate": 0.0019532055046976044, + "loss": 3.2388, + "step": 1795 + }, + { + "epoch": 0.12494347629482765, + "grad_norm": 1.0703125, + "learning_rate": 0.001953137357124944, + "loss": 3.2163, + "step": 1796 + }, + { + "epoch": 0.12501304393196286, + "grad_norm": 0.78515625, + "learning_rate": 0.001953069161156773, + "loss": 3.3399, + "step": 1797 + }, + { + "epoch": 0.12508261156909806, + "grad_norm": 1.0078125, + "learning_rate": 0.0019530009167965537, + "loss": 3.4525, + "step": 1798 + }, + { + "epoch": 0.12515217920623325, + "grad_norm": 0.8515625, + "learning_rate": 0.0019529326240477513, + "loss": 2.9544, + "step": 1799 + }, + { + "epoch": 0.12522174684336845, + "grad_norm": 1.125, + "learning_rate": 0.0019528642829138338, + "loss": 3.4445, + "step": 1800 + }, + { + "epoch": 0.12529131448050368, + "grad_norm": 0.74609375, + "learning_rate": 0.0019527958933982703, + "loss": 3.1022, + "step": 1801 + }, + { + "epoch": 0.12536088211763888, + "grad_norm": 0.8828125, + "learning_rate": 0.001952727455504534, + "loss": 3.1783, + "step": 1802 + }, + { + "epoch": 0.12543044975477408, + "grad_norm": 0.8671875, + "learning_rate": 0.0019526589692360997, + "loss": 3.3324, + "step": 1803 + }, + { + "epoch": 0.12550001739190927, + "grad_norm": 1.0078125, + "learning_rate": 0.0019525904345964445, + "loss": 3.2979, + "step": 1804 + }, + { + "epoch": 0.1255695850290445, + "grad_norm": 0.84375, + "learning_rate": 0.0019525218515890487, + "loss": 3.2694, + "step": 1805 + }, + { + "epoch": 0.1256391526661797, + "grad_norm": 0.703125, + "learning_rate": 0.0019524532202173938, + "loss": 3.6337, + "step": 1806 + }, + { + "epoch": 0.1257087203033149, + "grad_norm": 1.03125, + "learning_rate": 0.0019523845404849655, + "loss": 3.0394, + "step": 1807 + }, + { + "epoch": 0.1257782879404501, + "grad_norm": 0.80859375, + "learning_rate": 0.0019523158123952507, + "loss": 3.2475, + "step": 1808 + }, + { + "epoch": 0.1258478555775853, + "grad_norm": 0.97265625, + "learning_rate": 0.001952247035951739, + "loss": 3.5361, + "step": 1809 + }, + { + "epoch": 0.12591742321472052, + "grad_norm": 0.75, + "learning_rate": 0.0019521782111579223, + "loss": 3.2208, + "step": 1810 + }, + { + "epoch": 0.12598699085185572, + "grad_norm": 0.74609375, + "learning_rate": 0.0019521093380172954, + "loss": 3.0137, + "step": 1811 + }, + { + "epoch": 0.12605655848899092, + "grad_norm": 0.92578125, + "learning_rate": 0.0019520404165333555, + "loss": 3.4475, + "step": 1812 + }, + { + "epoch": 0.12612612612612611, + "grad_norm": 0.85546875, + "learning_rate": 0.0019519714467096016, + "loss": 3.3839, + "step": 1813 + }, + { + "epoch": 0.12619569376326134, + "grad_norm": 0.65625, + "learning_rate": 0.0019519024285495359, + "loss": 3.4224, + "step": 1814 + }, + { + "epoch": 0.12626526140039654, + "grad_norm": 0.796875, + "learning_rate": 0.0019518333620566631, + "loss": 3.4469, + "step": 1815 + }, + { + "epoch": 0.12633482903753174, + "grad_norm": 0.91796875, + "learning_rate": 0.0019517642472344895, + "loss": 3.385, + "step": 1816 + }, + { + "epoch": 0.12640439667466694, + "grad_norm": 0.77734375, + "learning_rate": 0.0019516950840865249, + "loss": 3.0788, + "step": 1817 + }, + { + "epoch": 0.12647396431180216, + "grad_norm": 1.109375, + "learning_rate": 0.0019516258726162807, + "loss": 2.922, + "step": 1818 + }, + { + "epoch": 0.12654353194893736, + "grad_norm": 0.94140625, + "learning_rate": 0.0019515566128272713, + "loss": 3.7069, + "step": 1819 + }, + { + "epoch": 0.12661309958607256, + "grad_norm": 0.87890625, + "learning_rate": 0.0019514873047230133, + "loss": 3.5041, + "step": 1820 + }, + { + "epoch": 0.12668266722320776, + "grad_norm": 0.98828125, + "learning_rate": 0.0019514179483070258, + "loss": 3.3119, + "step": 1821 + }, + { + "epoch": 0.12675223486034295, + "grad_norm": 0.76171875, + "learning_rate": 0.0019513485435828303, + "loss": 3.5025, + "step": 1822 + }, + { + "epoch": 0.12682180249747818, + "grad_norm": 0.765625, + "learning_rate": 0.001951279090553951, + "loss": 3.1403, + "step": 1823 + }, + { + "epoch": 0.12689137013461338, + "grad_norm": 0.796875, + "learning_rate": 0.0019512095892239144, + "loss": 3.0786, + "step": 1824 + }, + { + "epoch": 0.12696093777174858, + "grad_norm": 0.7421875, + "learning_rate": 0.001951140039596249, + "loss": 3.2466, + "step": 1825 + }, + { + "epoch": 0.12703050540888378, + "grad_norm": 0.86328125, + "learning_rate": 0.001951070441674487, + "loss": 3.1296, + "step": 1826 + }, + { + "epoch": 0.127100073046019, + "grad_norm": 0.89453125, + "learning_rate": 0.0019510007954621612, + "loss": 3.5657, + "step": 1827 + }, + { + "epoch": 0.1271696406831542, + "grad_norm": 0.77734375, + "learning_rate": 0.0019509311009628086, + "loss": 3.0529, + "step": 1828 + }, + { + "epoch": 0.1272392083202894, + "grad_norm": 0.82421875, + "learning_rate": 0.0019508613581799676, + "loss": 3.0305, + "step": 1829 + }, + { + "epoch": 0.1273087759574246, + "grad_norm": 0.9609375, + "learning_rate": 0.0019507915671171797, + "loss": 3.1983, + "step": 1830 + }, + { + "epoch": 0.12737834359455982, + "grad_norm": 0.78515625, + "learning_rate": 0.0019507217277779884, + "loss": 3.1657, + "step": 1831 + }, + { + "epoch": 0.12744791123169502, + "grad_norm": 0.66015625, + "learning_rate": 0.0019506518401659397, + "loss": 3.5715, + "step": 1832 + }, + { + "epoch": 0.12751747886883022, + "grad_norm": 1.015625, + "learning_rate": 0.0019505819042845822, + "loss": 3.2889, + "step": 1833 + }, + { + "epoch": 0.12758704650596542, + "grad_norm": 0.9140625, + "learning_rate": 0.001950511920137467, + "loss": 3.3527, + "step": 1834 + }, + { + "epoch": 0.12765661414310062, + "grad_norm": 0.859375, + "learning_rate": 0.001950441887728147, + "loss": 3.7539, + "step": 1835 + }, + { + "epoch": 0.12772618178023584, + "grad_norm": 0.78125, + "learning_rate": 0.0019503718070601791, + "loss": 3.2217, + "step": 1836 + }, + { + "epoch": 0.12779574941737104, + "grad_norm": 0.78125, + "learning_rate": 0.0019503016781371209, + "loss": 3.275, + "step": 1837 + }, + { + "epoch": 0.12786531705450624, + "grad_norm": 0.85546875, + "learning_rate": 0.0019502315009625331, + "loss": 3.4357, + "step": 1838 + }, + { + "epoch": 0.12793488469164144, + "grad_norm": 0.82421875, + "learning_rate": 0.0019501612755399795, + "loss": 3.3004, + "step": 1839 + }, + { + "epoch": 0.12800445232877666, + "grad_norm": 1.0546875, + "learning_rate": 0.0019500910018730253, + "loss": 3.1225, + "step": 1840 + }, + { + "epoch": 0.12807401996591186, + "grad_norm": 0.86328125, + "learning_rate": 0.0019500206799652386, + "loss": 3.3034, + "step": 1841 + }, + { + "epoch": 0.12814358760304706, + "grad_norm": 1.0703125, + "learning_rate": 0.0019499503098201908, + "loss": 3.1774, + "step": 1842 + }, + { + "epoch": 0.12821315524018226, + "grad_norm": 0.79296875, + "learning_rate": 0.001949879891441454, + "loss": 2.9553, + "step": 1843 + }, + { + "epoch": 0.12828272287731748, + "grad_norm": 0.73046875, + "learning_rate": 0.0019498094248326043, + "loss": 3.0137, + "step": 1844 + }, + { + "epoch": 0.12835229051445268, + "grad_norm": 0.87109375, + "learning_rate": 0.0019497389099972192, + "loss": 3.063, + "step": 1845 + }, + { + "epoch": 0.12842185815158788, + "grad_norm": 1.078125, + "learning_rate": 0.0019496683469388794, + "loss": 3.5581, + "step": 1846 + }, + { + "epoch": 0.12849142578872308, + "grad_norm": 0.92578125, + "learning_rate": 0.0019495977356611674, + "loss": 3.3502, + "step": 1847 + }, + { + "epoch": 0.12856099342585828, + "grad_norm": 0.80859375, + "learning_rate": 0.001949527076167669, + "loss": 3.3218, + "step": 1848 + }, + { + "epoch": 0.1286305610629935, + "grad_norm": 0.8515625, + "learning_rate": 0.0019494563684619715, + "loss": 3.4957, + "step": 1849 + }, + { + "epoch": 0.1287001287001287, + "grad_norm": 1.09375, + "learning_rate": 0.0019493856125476652, + "loss": 3.7629, + "step": 1850 + }, + { + "epoch": 0.1287696963372639, + "grad_norm": 1.3828125, + "learning_rate": 0.0019493148084283427, + "loss": 3.0125, + "step": 1851 + }, + { + "epoch": 0.1288392639743991, + "grad_norm": 1.09375, + "learning_rate": 0.0019492439561075994, + "loss": 3.1882, + "step": 1852 + }, + { + "epoch": 0.12890883161153432, + "grad_norm": 0.74609375, + "learning_rate": 0.0019491730555890323, + "loss": 3.1547, + "step": 1853 + }, + { + "epoch": 0.12897839924866952, + "grad_norm": 0.8125, + "learning_rate": 0.0019491021068762417, + "loss": 3.2827, + "step": 1854 + }, + { + "epoch": 0.12904796688580472, + "grad_norm": 1.0703125, + "learning_rate": 0.00194903110997283, + "loss": 3.4886, + "step": 1855 + }, + { + "epoch": 0.12911753452293992, + "grad_norm": 0.73828125, + "learning_rate": 0.001948960064882402, + "loss": 3.4038, + "step": 1856 + }, + { + "epoch": 0.12918710216007515, + "grad_norm": 0.76953125, + "learning_rate": 0.0019488889716085648, + "loss": 3.119, + "step": 1857 + }, + { + "epoch": 0.12925666979721034, + "grad_norm": 0.83984375, + "learning_rate": 0.0019488178301549286, + "loss": 3.3197, + "step": 1858 + }, + { + "epoch": 0.12932623743434554, + "grad_norm": 0.73828125, + "learning_rate": 0.0019487466405251053, + "loss": 3.5417, + "step": 1859 + }, + { + "epoch": 0.12939580507148074, + "grad_norm": 1.1796875, + "learning_rate": 0.0019486754027227098, + "loss": 3.174, + "step": 1860 + }, + { + "epoch": 0.12946537270861594, + "grad_norm": 0.76953125, + "learning_rate": 0.0019486041167513588, + "loss": 3.2131, + "step": 1861 + }, + { + "epoch": 0.12953494034575117, + "grad_norm": 0.72265625, + "learning_rate": 0.0019485327826146723, + "loss": 3.4978, + "step": 1862 + }, + { + "epoch": 0.12960450798288636, + "grad_norm": 0.67578125, + "learning_rate": 0.0019484614003162717, + "loss": 3.408, + "step": 1863 + }, + { + "epoch": 0.12967407562002156, + "grad_norm": 0.6953125, + "learning_rate": 0.0019483899698597821, + "loss": 3.3175, + "step": 1864 + }, + { + "epoch": 0.12974364325715676, + "grad_norm": 0.9296875, + "learning_rate": 0.0019483184912488301, + "loss": 2.8108, + "step": 1865 + }, + { + "epoch": 0.12981321089429199, + "grad_norm": 0.6875, + "learning_rate": 0.001948246964487045, + "loss": 3.226, + "step": 1866 + }, + { + "epoch": 0.12988277853142718, + "grad_norm": 0.796875, + "learning_rate": 0.0019481753895780583, + "loss": 3.2306, + "step": 1867 + }, + { + "epoch": 0.12995234616856238, + "grad_norm": 0.7265625, + "learning_rate": 0.0019481037665255046, + "loss": 3.292, + "step": 1868 + }, + { + "epoch": 0.13002191380569758, + "grad_norm": 0.7265625, + "learning_rate": 0.0019480320953330205, + "loss": 3.6482, + "step": 1869 + }, + { + "epoch": 0.1300914814428328, + "grad_norm": 0.8515625, + "learning_rate": 0.0019479603760042448, + "loss": 3.6202, + "step": 1870 + }, + { + "epoch": 0.130161049079968, + "grad_norm": 0.9296875, + "learning_rate": 0.0019478886085428195, + "loss": 2.8485, + "step": 1871 + }, + { + "epoch": 0.1302306167171032, + "grad_norm": 0.95703125, + "learning_rate": 0.0019478167929523884, + "loss": 3.5329, + "step": 1872 + }, + { + "epoch": 0.1303001843542384, + "grad_norm": 0.83984375, + "learning_rate": 0.0019477449292365978, + "loss": 3.2885, + "step": 1873 + }, + { + "epoch": 0.1303697519913736, + "grad_norm": 1.1796875, + "learning_rate": 0.001947673017399097, + "loss": 3.0415, + "step": 1874 + }, + { + "epoch": 0.13043931962850883, + "grad_norm": 0.890625, + "learning_rate": 0.0019476010574435364, + "loss": 3.4042, + "step": 1875 + }, + { + "epoch": 0.13050888726564402, + "grad_norm": 0.66796875, + "learning_rate": 0.001947529049373571, + "loss": 3.4087, + "step": 1876 + }, + { + "epoch": 0.13057845490277922, + "grad_norm": 0.7265625, + "learning_rate": 0.0019474569931928558, + "loss": 3.4739, + "step": 1877 + }, + { + "epoch": 0.13064802253991442, + "grad_norm": 0.859375, + "learning_rate": 0.0019473848889050504, + "loss": 3.1786, + "step": 1878 + }, + { + "epoch": 0.13071759017704965, + "grad_norm": 0.8515625, + "learning_rate": 0.0019473127365138155, + "loss": 3.2386, + "step": 1879 + }, + { + "epoch": 0.13078715781418485, + "grad_norm": 0.890625, + "learning_rate": 0.0019472405360228145, + "loss": 3.5112, + "step": 1880 + }, + { + "epoch": 0.13085672545132004, + "grad_norm": 0.98828125, + "learning_rate": 0.0019471682874357135, + "loss": 3.318, + "step": 1881 + }, + { + "epoch": 0.13092629308845524, + "grad_norm": 0.80078125, + "learning_rate": 0.0019470959907561811, + "loss": 3.6096, + "step": 1882 + }, + { + "epoch": 0.13099586072559047, + "grad_norm": 0.875, + "learning_rate": 0.0019470236459878877, + "loss": 2.9816, + "step": 1883 + }, + { + "epoch": 0.13106542836272567, + "grad_norm": 0.78125, + "learning_rate": 0.0019469512531345072, + "loss": 2.9925, + "step": 1884 + }, + { + "epoch": 0.13113499599986086, + "grad_norm": 0.9765625, + "learning_rate": 0.001946878812199715, + "loss": 3.3357, + "step": 1885 + }, + { + "epoch": 0.13120456363699606, + "grad_norm": 0.82421875, + "learning_rate": 0.0019468063231871896, + "loss": 3.3221, + "step": 1886 + }, + { + "epoch": 0.13127413127413126, + "grad_norm": 0.9375, + "learning_rate": 0.001946733786100611, + "loss": 3.0446, + "step": 1887 + }, + { + "epoch": 0.1313436989112665, + "grad_norm": 0.94140625, + "learning_rate": 0.0019466612009436627, + "loss": 3.4744, + "step": 1888 + }, + { + "epoch": 0.13141326654840169, + "grad_norm": 0.7890625, + "learning_rate": 0.00194658856772003, + "loss": 3.3944, + "step": 1889 + }, + { + "epoch": 0.13148283418553688, + "grad_norm": 1.0078125, + "learning_rate": 0.001946515886433401, + "loss": 3.1389, + "step": 1890 + }, + { + "epoch": 0.13155240182267208, + "grad_norm": 1.0390625, + "learning_rate": 0.0019464431570874665, + "loss": 3.3595, + "step": 1891 + }, + { + "epoch": 0.1316219694598073, + "grad_norm": 0.76171875, + "learning_rate": 0.0019463703796859188, + "loss": 3.3208, + "step": 1892 + }, + { + "epoch": 0.1316915370969425, + "grad_norm": 0.69921875, + "learning_rate": 0.001946297554232453, + "loss": 3.5098, + "step": 1893 + }, + { + "epoch": 0.1317611047340777, + "grad_norm": 0.8125, + "learning_rate": 0.0019462246807307672, + "loss": 3.2446, + "step": 1894 + }, + { + "epoch": 0.1318306723712129, + "grad_norm": 0.6640625, + "learning_rate": 0.0019461517591845615, + "loss": 3.3563, + "step": 1895 + }, + { + "epoch": 0.13190024000834813, + "grad_norm": 0.87109375, + "learning_rate": 0.0019460787895975386, + "loss": 3.5702, + "step": 1896 + }, + { + "epoch": 0.13196980764548333, + "grad_norm": 0.83984375, + "learning_rate": 0.001946005771973403, + "loss": 3.3963, + "step": 1897 + }, + { + "epoch": 0.13203937528261853, + "grad_norm": 0.9140625, + "learning_rate": 0.0019459327063158628, + "loss": 3.2952, + "step": 1898 + }, + { + "epoch": 0.13210894291975372, + "grad_norm": 0.95703125, + "learning_rate": 0.0019458595926286272, + "loss": 3.0189, + "step": 1899 + }, + { + "epoch": 0.13217851055688892, + "grad_norm": 0.59375, + "learning_rate": 0.0019457864309154094, + "loss": 3.6055, + "step": 1900 + }, + { + "epoch": 0.13224807819402415, + "grad_norm": 0.83984375, + "learning_rate": 0.0019457132211799235, + "loss": 2.9506, + "step": 1901 + }, + { + "epoch": 0.13231764583115935, + "grad_norm": 0.9453125, + "learning_rate": 0.0019456399634258871, + "loss": 3.2754, + "step": 1902 + }, + { + "epoch": 0.13238721346829455, + "grad_norm": 0.8125, + "learning_rate": 0.00194556665765702, + "loss": 3.2409, + "step": 1903 + }, + { + "epoch": 0.13245678110542974, + "grad_norm": 0.94921875, + "learning_rate": 0.0019454933038770435, + "loss": 3.1072, + "step": 1904 + }, + { + "epoch": 0.13252634874256497, + "grad_norm": 0.9296875, + "learning_rate": 0.001945419902089683, + "loss": 2.9871, + "step": 1905 + }, + { + "epoch": 0.13259591637970017, + "grad_norm": 0.72265625, + "learning_rate": 0.001945346452298665, + "loss": 3.0467, + "step": 1906 + }, + { + "epoch": 0.13266548401683537, + "grad_norm": 0.80859375, + "learning_rate": 0.0019452729545077192, + "loss": 3.4325, + "step": 1907 + }, + { + "epoch": 0.13273505165397056, + "grad_norm": 0.83984375, + "learning_rate": 0.001945199408720577, + "loss": 3.1134, + "step": 1908 + }, + { + "epoch": 0.1328046192911058, + "grad_norm": 1.0546875, + "learning_rate": 0.0019451258149409735, + "loss": 3.4843, + "step": 1909 + }, + { + "epoch": 0.132874186928241, + "grad_norm": 0.78515625, + "learning_rate": 0.0019450521731726447, + "loss": 3.384, + "step": 1910 + }, + { + "epoch": 0.1329437545653762, + "grad_norm": 0.96875, + "learning_rate": 0.0019449784834193297, + "loss": 3.2625, + "step": 1911 + }, + { + "epoch": 0.13301332220251139, + "grad_norm": 0.796875, + "learning_rate": 0.0019449047456847706, + "loss": 3.519, + "step": 1912 + }, + { + "epoch": 0.13308288983964658, + "grad_norm": 0.94921875, + "learning_rate": 0.0019448309599727112, + "loss": 3.3985, + "step": 1913 + }, + { + "epoch": 0.1331524574767818, + "grad_norm": 0.72265625, + "learning_rate": 0.001944757126286898, + "loss": 3.4797, + "step": 1914 + }, + { + "epoch": 0.133222025113917, + "grad_norm": 0.9375, + "learning_rate": 0.0019446832446310793, + "loss": 3.2945, + "step": 1915 + }, + { + "epoch": 0.1332915927510522, + "grad_norm": 0.859375, + "learning_rate": 0.0019446093150090075, + "loss": 3.3423, + "step": 1916 + }, + { + "epoch": 0.1333611603881874, + "grad_norm": 0.828125, + "learning_rate": 0.0019445353374244359, + "loss": 3.6269, + "step": 1917 + }, + { + "epoch": 0.13343072802532263, + "grad_norm": 0.7421875, + "learning_rate": 0.0019444613118811205, + "loss": 3.362, + "step": 1918 + }, + { + "epoch": 0.13350029566245783, + "grad_norm": 0.9453125, + "learning_rate": 0.0019443872383828203, + "loss": 3.3338, + "step": 1919 + }, + { + "epoch": 0.13356986329959303, + "grad_norm": 0.89453125, + "learning_rate": 0.0019443131169332962, + "loss": 3.3511, + "step": 1920 + }, + { + "epoch": 0.13363943093672823, + "grad_norm": 0.953125, + "learning_rate": 0.0019442389475363116, + "loss": 3.5065, + "step": 1921 + }, + { + "epoch": 0.13370899857386345, + "grad_norm": 0.9453125, + "learning_rate": 0.0019441647301956324, + "loss": 3.3673, + "step": 1922 + }, + { + "epoch": 0.13377856621099865, + "grad_norm": 0.90234375, + "learning_rate": 0.0019440904649150276, + "loss": 3.0948, + "step": 1923 + }, + { + "epoch": 0.13384813384813385, + "grad_norm": 0.8984375, + "learning_rate": 0.0019440161516982668, + "loss": 3.6924, + "step": 1924 + }, + { + "epoch": 0.13391770148526905, + "grad_norm": 0.78515625, + "learning_rate": 0.0019439417905491247, + "loss": 3.0385, + "step": 1925 + }, + { + "epoch": 0.13398726912240425, + "grad_norm": 0.9140625, + "learning_rate": 0.0019438673814713761, + "loss": 3.1771, + "step": 1926 + }, + { + "epoch": 0.13405683675953947, + "grad_norm": 1.109375, + "learning_rate": 0.001943792924468799, + "loss": 3.7504, + "step": 1927 + }, + { + "epoch": 0.13412640439667467, + "grad_norm": 0.875, + "learning_rate": 0.0019437184195451747, + "loss": 3.4519, + "step": 1928 + }, + { + "epoch": 0.13419597203380987, + "grad_norm": 0.78125, + "learning_rate": 0.0019436438667042855, + "loss": 3.4553, + "step": 1929 + }, + { + "epoch": 0.13426553967094507, + "grad_norm": 0.7734375, + "learning_rate": 0.0019435692659499173, + "loss": 3.4122, + "step": 1930 + }, + { + "epoch": 0.1343351073080803, + "grad_norm": 0.8046875, + "learning_rate": 0.0019434946172858577, + "loss": 3.4642, + "step": 1931 + }, + { + "epoch": 0.1344046749452155, + "grad_norm": 0.69140625, + "learning_rate": 0.0019434199207158968, + "loss": 3.3712, + "step": 1932 + }, + { + "epoch": 0.1344742425823507, + "grad_norm": 0.8046875, + "learning_rate": 0.0019433451762438274, + "loss": 3.2983, + "step": 1933 + }, + { + "epoch": 0.1345438102194859, + "grad_norm": 0.77734375, + "learning_rate": 0.0019432703838734452, + "loss": 3.5689, + "step": 1934 + }, + { + "epoch": 0.1346133778566211, + "grad_norm": 0.6875, + "learning_rate": 0.0019431955436085468, + "loss": 3.0071, + "step": 1935 + }, + { + "epoch": 0.1346829454937563, + "grad_norm": 0.78515625, + "learning_rate": 0.0019431206554529333, + "loss": 3.5376, + "step": 1936 + }, + { + "epoch": 0.1347525131308915, + "grad_norm": 0.7265625, + "learning_rate": 0.0019430457194104063, + "loss": 3.5035, + "step": 1937 + }, + { + "epoch": 0.1348220807680267, + "grad_norm": 0.64453125, + "learning_rate": 0.0019429707354847712, + "loss": 3.1914, + "step": 1938 + }, + { + "epoch": 0.1348916484051619, + "grad_norm": 0.6484375, + "learning_rate": 0.0019428957036798347, + "loss": 3.9197, + "step": 1939 + }, + { + "epoch": 0.13496121604229713, + "grad_norm": 0.84375, + "learning_rate": 0.001942820623999407, + "loss": 3.2672, + "step": 1940 + }, + { + "epoch": 0.13503078367943233, + "grad_norm": 0.8125, + "learning_rate": 0.0019427454964473006, + "loss": 3.0891, + "step": 1941 + }, + { + "epoch": 0.13510035131656753, + "grad_norm": 1.171875, + "learning_rate": 0.0019426703210273294, + "loss": 2.8527, + "step": 1942 + }, + { + "epoch": 0.13516991895370273, + "grad_norm": 0.78125, + "learning_rate": 0.0019425950977433105, + "loss": 2.8761, + "step": 1943 + }, + { + "epoch": 0.13523948659083795, + "grad_norm": 0.73828125, + "learning_rate": 0.0019425198265990637, + "loss": 3.564, + "step": 1944 + }, + { + "epoch": 0.13530905422797315, + "grad_norm": 0.80859375, + "learning_rate": 0.001942444507598411, + "loss": 3.5785, + "step": 1945 + }, + { + "epoch": 0.13537862186510835, + "grad_norm": 0.890625, + "learning_rate": 0.0019423691407451761, + "loss": 3.4398, + "step": 1946 + }, + { + "epoch": 0.13544818950224355, + "grad_norm": 0.8125, + "learning_rate": 0.0019422937260431864, + "loss": 3.2611, + "step": 1947 + }, + { + "epoch": 0.13551775713937875, + "grad_norm": 0.65234375, + "learning_rate": 0.0019422182634962707, + "loss": 3.4193, + "step": 1948 + }, + { + "epoch": 0.13558732477651397, + "grad_norm": 0.76171875, + "learning_rate": 0.0019421427531082606, + "loss": 3.2257, + "step": 1949 + }, + { + "epoch": 0.13565689241364917, + "grad_norm": 0.71484375, + "learning_rate": 0.0019420671948829904, + "loss": 3.2713, + "step": 1950 + }, + { + "epoch": 0.13572646005078437, + "grad_norm": 0.7421875, + "learning_rate": 0.0019419915888242963, + "loss": 3.2238, + "step": 1951 + }, + { + "epoch": 0.13579602768791957, + "grad_norm": 1.0234375, + "learning_rate": 0.0019419159349360173, + "loss": 3.5063, + "step": 1952 + }, + { + "epoch": 0.1358655953250548, + "grad_norm": 0.62109375, + "learning_rate": 0.0019418402332219951, + "loss": 3.765, + "step": 1953 + }, + { + "epoch": 0.13593516296219, + "grad_norm": 0.64453125, + "learning_rate": 0.0019417644836860727, + "loss": 3.1648, + "step": 1954 + }, + { + "epoch": 0.1360047305993252, + "grad_norm": 0.89453125, + "learning_rate": 0.0019416886863320968, + "loss": 3.2006, + "step": 1955 + }, + { + "epoch": 0.1360742982364604, + "grad_norm": 0.703125, + "learning_rate": 0.001941612841163916, + "loss": 3.1752, + "step": 1956 + }, + { + "epoch": 0.13614386587359562, + "grad_norm": 0.92578125, + "learning_rate": 0.0019415369481853811, + "loss": 3.1891, + "step": 1957 + }, + { + "epoch": 0.1362134335107308, + "grad_norm": 0.85546875, + "learning_rate": 0.0019414610074003455, + "loss": 3.3473, + "step": 1958 + }, + { + "epoch": 0.136283001147866, + "grad_norm": 0.8515625, + "learning_rate": 0.001941385018812665, + "loss": 3.1638, + "step": 1959 + }, + { + "epoch": 0.1363525687850012, + "grad_norm": 1.046875, + "learning_rate": 0.0019413089824261989, + "loss": 3.2753, + "step": 1960 + }, + { + "epoch": 0.1364221364221364, + "grad_norm": 0.94921875, + "learning_rate": 0.0019412328982448069, + "loss": 3.0721, + "step": 1961 + }, + { + "epoch": 0.13649170405927163, + "grad_norm": 0.8984375, + "learning_rate": 0.0019411567662723523, + "loss": 3.5696, + "step": 1962 + }, + { + "epoch": 0.13656127169640683, + "grad_norm": 0.83984375, + "learning_rate": 0.001941080586512701, + "loss": 3.1816, + "step": 1963 + }, + { + "epoch": 0.13663083933354203, + "grad_norm": 0.83203125, + "learning_rate": 0.001941004358969721, + "loss": 3.2473, + "step": 1964 + }, + { + "epoch": 0.13670040697067723, + "grad_norm": 0.77734375, + "learning_rate": 0.0019409280836472829, + "loss": 3.4571, + "step": 1965 + }, + { + "epoch": 0.13676997460781246, + "grad_norm": 0.828125, + "learning_rate": 0.0019408517605492592, + "loss": 3.5023, + "step": 1966 + }, + { + "epoch": 0.13683954224494765, + "grad_norm": 0.8515625, + "learning_rate": 0.001940775389679525, + "loss": 3.7084, + "step": 1967 + }, + { + "epoch": 0.13690910988208285, + "grad_norm": 0.90234375, + "learning_rate": 0.0019406989710419587, + "loss": 3.5293, + "step": 1968 + }, + { + "epoch": 0.13697867751921805, + "grad_norm": 0.89453125, + "learning_rate": 0.00194062250464044, + "loss": 3.2683, + "step": 1969 + }, + { + "epoch": 0.13704824515635328, + "grad_norm": 0.84375, + "learning_rate": 0.0019405459904788516, + "loss": 3.4587, + "step": 1970 + }, + { + "epoch": 0.13711781279348847, + "grad_norm": 1.0703125, + "learning_rate": 0.0019404694285610783, + "loss": 2.7896, + "step": 1971 + }, + { + "epoch": 0.13718738043062367, + "grad_norm": 0.82421875, + "learning_rate": 0.0019403928188910082, + "loss": 3.2769, + "step": 1972 + }, + { + "epoch": 0.13725694806775887, + "grad_norm": 1.2890625, + "learning_rate": 0.00194031616147253, + "loss": 3.0715, + "step": 1973 + }, + { + "epoch": 0.13732651570489407, + "grad_norm": 1.0546875, + "learning_rate": 0.0019402394563095373, + "loss": 3.2322, + "step": 1974 + }, + { + "epoch": 0.1373960833420293, + "grad_norm": 0.8828125, + "learning_rate": 0.001940162703405924, + "loss": 3.543, + "step": 1975 + }, + { + "epoch": 0.1374656509791645, + "grad_norm": 0.79296875, + "learning_rate": 0.0019400859027655876, + "loss": 3.3874, + "step": 1976 + }, + { + "epoch": 0.1375352186162997, + "grad_norm": 0.73046875, + "learning_rate": 0.0019400090543924271, + "loss": 3.2165, + "step": 1977 + }, + { + "epoch": 0.1376047862534349, + "grad_norm": 0.80078125, + "learning_rate": 0.0019399321582903451, + "loss": 2.9563, + "step": 1978 + }, + { + "epoch": 0.13767435389057012, + "grad_norm": 0.71484375, + "learning_rate": 0.0019398552144632454, + "loss": 3.2973, + "step": 1979 + }, + { + "epoch": 0.13774392152770532, + "grad_norm": 0.7265625, + "learning_rate": 0.0019397782229150355, + "loss": 3.3801, + "step": 1980 + }, + { + "epoch": 0.1378134891648405, + "grad_norm": 0.94140625, + "learning_rate": 0.001939701183649624, + "loss": 3.2854, + "step": 1981 + }, + { + "epoch": 0.1378830568019757, + "grad_norm": 0.75390625, + "learning_rate": 0.0019396240966709226, + "loss": 3.5824, + "step": 1982 + }, + { + "epoch": 0.13795262443911094, + "grad_norm": 0.84375, + "learning_rate": 0.001939546961982846, + "loss": 3.2963, + "step": 1983 + }, + { + "epoch": 0.13802219207624614, + "grad_norm": 0.828125, + "learning_rate": 0.0019394697795893103, + "loss": 3.2933, + "step": 1984 + }, + { + "epoch": 0.13809175971338133, + "grad_norm": 0.9765625, + "learning_rate": 0.0019393925494942345, + "loss": 3.187, + "step": 1985 + }, + { + "epoch": 0.13816132735051653, + "grad_norm": 0.68359375, + "learning_rate": 0.0019393152717015396, + "loss": 3.6, + "step": 1986 + }, + { + "epoch": 0.13823089498765173, + "grad_norm": 0.76171875, + "learning_rate": 0.0019392379462151502, + "loss": 3.9195, + "step": 1987 + }, + { + "epoch": 0.13830046262478696, + "grad_norm": 0.8046875, + "learning_rate": 0.0019391605730389916, + "loss": 3.3988, + "step": 1988 + }, + { + "epoch": 0.13837003026192216, + "grad_norm": 0.859375, + "learning_rate": 0.0019390831521769929, + "loss": 3.3932, + "step": 1989 + }, + { + "epoch": 0.13843959789905735, + "grad_norm": 0.7734375, + "learning_rate": 0.0019390056836330852, + "loss": 3.1819, + "step": 1990 + }, + { + "epoch": 0.13850916553619255, + "grad_norm": 0.77734375, + "learning_rate": 0.0019389281674112018, + "loss": 3.409, + "step": 1991 + }, + { + "epoch": 0.13857873317332778, + "grad_norm": 0.75390625, + "learning_rate": 0.0019388506035152785, + "loss": 3.4284, + "step": 1992 + }, + { + "epoch": 0.13864830081046298, + "grad_norm": 0.91796875, + "learning_rate": 0.0019387729919492541, + "loss": 3.3189, + "step": 1993 + }, + { + "epoch": 0.13871786844759817, + "grad_norm": 0.99609375, + "learning_rate": 0.0019386953327170684, + "loss": 3.356, + "step": 1994 + }, + { + "epoch": 0.13878743608473337, + "grad_norm": 0.765625, + "learning_rate": 0.0019386176258226653, + "loss": 3.1132, + "step": 1995 + }, + { + "epoch": 0.1388570037218686, + "grad_norm": 0.93359375, + "learning_rate": 0.00193853987126999, + "loss": 3.3558, + "step": 1996 + }, + { + "epoch": 0.1389265713590038, + "grad_norm": 0.7578125, + "learning_rate": 0.0019384620690629907, + "loss": 3.2451, + "step": 1997 + }, + { + "epoch": 0.138996138996139, + "grad_norm": 0.60546875, + "learning_rate": 0.001938384219205618, + "loss": 3.4747, + "step": 1998 + }, + { + "epoch": 0.1390657066332742, + "grad_norm": 0.80078125, + "learning_rate": 0.0019383063217018241, + "loss": 2.8982, + "step": 1999 + }, + { + "epoch": 0.1391352742704094, + "grad_norm": 0.74609375, + "learning_rate": 0.0019382283765555651, + "loss": 2.9246, + "step": 2000 + }, + { + "epoch": 0.13920484190754462, + "grad_norm": 0.79296875, + "learning_rate": 0.0019381503837707977, + "loss": 3.1143, + "step": 2001 + }, + { + "epoch": 0.13927440954467982, + "grad_norm": 0.77734375, + "learning_rate": 0.0019380723433514823, + "loss": 3.2632, + "step": 2002 + }, + { + "epoch": 0.13934397718181502, + "grad_norm": 0.87109375, + "learning_rate": 0.001937994255301582, + "loss": 3.4156, + "step": 2003 + }, + { + "epoch": 0.1394135448189502, + "grad_norm": 0.59375, + "learning_rate": 0.001937916119625061, + "loss": 3.0436, + "step": 2004 + }, + { + "epoch": 0.13948311245608544, + "grad_norm": 0.953125, + "learning_rate": 0.001937837936325887, + "loss": 3.1214, + "step": 2005 + }, + { + "epoch": 0.13955268009322064, + "grad_norm": 0.8515625, + "learning_rate": 0.0019377597054080296, + "loss": 3.0121, + "step": 2006 + }, + { + "epoch": 0.13962224773035584, + "grad_norm": 0.85546875, + "learning_rate": 0.0019376814268754609, + "loss": 3.0581, + "step": 2007 + }, + { + "epoch": 0.13969181536749103, + "grad_norm": 0.94140625, + "learning_rate": 0.0019376031007321557, + "loss": 3.3915, + "step": 2008 + }, + { + "epoch": 0.13976138300462626, + "grad_norm": 0.77734375, + "learning_rate": 0.001937524726982091, + "loss": 3.26, + "step": 2009 + }, + { + "epoch": 0.13983095064176146, + "grad_norm": 1.140625, + "learning_rate": 0.0019374463056292459, + "loss": 3.6236, + "step": 2010 + }, + { + "epoch": 0.13990051827889666, + "grad_norm": 0.71875, + "learning_rate": 0.0019373678366776028, + "loss": 3.1375, + "step": 2011 + }, + { + "epoch": 0.13997008591603186, + "grad_norm": 0.70703125, + "learning_rate": 0.0019372893201311454, + "loss": 3.4221, + "step": 2012 + }, + { + "epoch": 0.14003965355316705, + "grad_norm": 1.0859375, + "learning_rate": 0.0019372107559938608, + "loss": 3.3759, + "step": 2013 + }, + { + "epoch": 0.14010922119030228, + "grad_norm": 0.875, + "learning_rate": 0.001937132144269738, + "loss": 3.4701, + "step": 2014 + }, + { + "epoch": 0.14017878882743748, + "grad_norm": 1.1484375, + "learning_rate": 0.0019370534849627679, + "loss": 3.1729, + "step": 2015 + }, + { + "epoch": 0.14024835646457268, + "grad_norm": 0.84765625, + "learning_rate": 0.0019369747780769453, + "loss": 2.7963, + "step": 2016 + }, + { + "epoch": 0.14031792410170787, + "grad_norm": 0.80078125, + "learning_rate": 0.0019368960236162663, + "loss": 3.4503, + "step": 2017 + }, + { + "epoch": 0.1403874917388431, + "grad_norm": 0.9765625, + "learning_rate": 0.0019368172215847293, + "loss": 3.3126, + "step": 2018 + }, + { + "epoch": 0.1404570593759783, + "grad_norm": 0.80859375, + "learning_rate": 0.0019367383719863355, + "loss": 3.8492, + "step": 2019 + }, + { + "epoch": 0.1405266270131135, + "grad_norm": 0.69921875, + "learning_rate": 0.0019366594748250893, + "loss": 3.3776, + "step": 2020 + }, + { + "epoch": 0.1405961946502487, + "grad_norm": 0.69921875, + "learning_rate": 0.0019365805301049955, + "loss": 3.2232, + "step": 2021 + }, + { + "epoch": 0.14066576228738392, + "grad_norm": 0.73046875, + "learning_rate": 0.0019365015378300632, + "loss": 3.2879, + "step": 2022 + }, + { + "epoch": 0.14073532992451912, + "grad_norm": 0.85546875, + "learning_rate": 0.0019364224980043033, + "loss": 3.2998, + "step": 2023 + }, + { + "epoch": 0.14080489756165432, + "grad_norm": 0.98828125, + "learning_rate": 0.0019363434106317288, + "loss": 3.2886, + "step": 2024 + }, + { + "epoch": 0.14087446519878952, + "grad_norm": 0.9921875, + "learning_rate": 0.0019362642757163556, + "loss": 3.2717, + "step": 2025 + }, + { + "epoch": 0.14094403283592472, + "grad_norm": 1.0390625, + "learning_rate": 0.0019361850932622011, + "loss": 3.119, + "step": 2026 + }, + { + "epoch": 0.14101360047305994, + "grad_norm": 0.7421875, + "learning_rate": 0.0019361058632732867, + "loss": 3.2256, + "step": 2027 + }, + { + "epoch": 0.14108316811019514, + "grad_norm": 1.0234375, + "learning_rate": 0.001936026585753635, + "loss": 2.9657, + "step": 2028 + }, + { + "epoch": 0.14115273574733034, + "grad_norm": 1.1015625, + "learning_rate": 0.001935947260707271, + "loss": 2.9658, + "step": 2029 + }, + { + "epoch": 0.14122230338446554, + "grad_norm": 0.82421875, + "learning_rate": 0.0019358678881382227, + "loss": 3.5397, + "step": 2030 + }, + { + "epoch": 0.14129187102160076, + "grad_norm": 1.03125, + "learning_rate": 0.0019357884680505197, + "loss": 3.3371, + "step": 2031 + }, + { + "epoch": 0.14136143865873596, + "grad_norm": 0.7109375, + "learning_rate": 0.0019357090004481954, + "loss": 3.7608, + "step": 2032 + }, + { + "epoch": 0.14143100629587116, + "grad_norm": 0.87109375, + "learning_rate": 0.0019356294853352845, + "loss": 3.2105, + "step": 2033 + }, + { + "epoch": 0.14150057393300636, + "grad_norm": 0.73046875, + "learning_rate": 0.0019355499227158243, + "loss": 3.3079, + "step": 2034 + }, + { + "epoch": 0.14157014157014158, + "grad_norm": 0.66015625, + "learning_rate": 0.0019354703125938543, + "loss": 3.2313, + "step": 2035 + }, + { + "epoch": 0.14163970920727678, + "grad_norm": 0.7578125, + "learning_rate": 0.001935390654973417, + "loss": 3.4517, + "step": 2036 + }, + { + "epoch": 0.14170927684441198, + "grad_norm": 0.7578125, + "learning_rate": 0.001935310949858557, + "loss": 2.969, + "step": 2037 + }, + { + "epoch": 0.14177884448154718, + "grad_norm": 0.72265625, + "learning_rate": 0.0019352311972533212, + "loss": 3.5515, + "step": 2038 + }, + { + "epoch": 0.14184841211868238, + "grad_norm": 0.81640625, + "learning_rate": 0.0019351513971617594, + "loss": 3.2527, + "step": 2039 + }, + { + "epoch": 0.1419179797558176, + "grad_norm": 0.94140625, + "learning_rate": 0.001935071549587923, + "loss": 3.3396, + "step": 2040 + }, + { + "epoch": 0.1419875473929528, + "grad_norm": 0.87890625, + "learning_rate": 0.001934991654535866, + "loss": 3.0555, + "step": 2041 + }, + { + "epoch": 0.142057115030088, + "grad_norm": 0.76171875, + "learning_rate": 0.001934911712009646, + "loss": 3.5917, + "step": 2042 + }, + { + "epoch": 0.1421266826672232, + "grad_norm": 0.859375, + "learning_rate": 0.0019348317220133217, + "loss": 3.4887, + "step": 2043 + }, + { + "epoch": 0.14219625030435842, + "grad_norm": 0.93359375, + "learning_rate": 0.001934751684550954, + "loss": 3.6032, + "step": 2044 + }, + { + "epoch": 0.14226581794149362, + "grad_norm": 0.84765625, + "learning_rate": 0.0019346715996266073, + "loss": 3.1755, + "step": 2045 + }, + { + "epoch": 0.14233538557862882, + "grad_norm": 0.90234375, + "learning_rate": 0.0019345914672443483, + "loss": 3.165, + "step": 2046 + }, + { + "epoch": 0.14240495321576402, + "grad_norm": 0.93359375, + "learning_rate": 0.0019345112874082449, + "loss": 2.8039, + "step": 2047 + }, + { + "epoch": 0.14247452085289924, + "grad_norm": 0.828125, + "learning_rate": 0.0019344310601223686, + "loss": 2.9524, + "step": 2048 + }, + { + "epoch": 0.14254408849003444, + "grad_norm": 0.69921875, + "learning_rate": 0.001934350785390793, + "loss": 3.5818, + "step": 2049 + }, + { + "epoch": 0.14261365612716964, + "grad_norm": 0.91015625, + "learning_rate": 0.0019342704632175944, + "loss": 3.5404, + "step": 2050 + }, + { + "epoch": 0.14268322376430484, + "grad_norm": 0.76171875, + "learning_rate": 0.0019341900936068503, + "loss": 3.0195, + "step": 2051 + }, + { + "epoch": 0.14275279140144004, + "grad_norm": 0.78515625, + "learning_rate": 0.001934109676562642, + "loss": 3.166, + "step": 2052 + }, + { + "epoch": 0.14282235903857526, + "grad_norm": 0.99609375, + "learning_rate": 0.0019340292120890524, + "loss": 3.1962, + "step": 2053 + }, + { + "epoch": 0.14289192667571046, + "grad_norm": 1.359375, + "learning_rate": 0.0019339487001901676, + "loss": 3.3675, + "step": 2054 + }, + { + "epoch": 0.14296149431284566, + "grad_norm": 0.8359375, + "learning_rate": 0.0019338681408700752, + "loss": 3.3433, + "step": 2055 + }, + { + "epoch": 0.14303106194998086, + "grad_norm": 0.953125, + "learning_rate": 0.0019337875341328655, + "loss": 3.031, + "step": 2056 + }, + { + "epoch": 0.14310062958711608, + "grad_norm": 0.98046875, + "learning_rate": 0.0019337068799826316, + "loss": 3.0238, + "step": 2057 + }, + { + "epoch": 0.14317019722425128, + "grad_norm": 0.7421875, + "learning_rate": 0.0019336261784234684, + "loss": 3.3712, + "step": 2058 + }, + { + "epoch": 0.14323976486138648, + "grad_norm": 0.765625, + "learning_rate": 0.001933545429459474, + "loss": 3.4989, + "step": 2059 + }, + { + "epoch": 0.14330933249852168, + "grad_norm": 0.74609375, + "learning_rate": 0.0019334646330947476, + "loss": 3.2556, + "step": 2060 + }, + { + "epoch": 0.1433789001356569, + "grad_norm": 0.875, + "learning_rate": 0.0019333837893333926, + "loss": 3.0214, + "step": 2061 + }, + { + "epoch": 0.1434484677727921, + "grad_norm": 1.1015625, + "learning_rate": 0.0019333028981795132, + "loss": 3.1768, + "step": 2062 + }, + { + "epoch": 0.1435180354099273, + "grad_norm": 0.765625, + "learning_rate": 0.0019332219596372166, + "loss": 3.4896, + "step": 2063 + }, + { + "epoch": 0.1435876030470625, + "grad_norm": 0.86328125, + "learning_rate": 0.0019331409737106129, + "loss": 2.6882, + "step": 2064 + }, + { + "epoch": 0.1436571706841977, + "grad_norm": 0.89453125, + "learning_rate": 0.001933059940403814, + "loss": 3.2502, + "step": 2065 + }, + { + "epoch": 0.14372673832133293, + "grad_norm": 1.125, + "learning_rate": 0.0019329788597209343, + "loss": 3.2372, + "step": 2066 + }, + { + "epoch": 0.14379630595846812, + "grad_norm": 0.8125, + "learning_rate": 0.0019328977316660906, + "loss": 3.2938, + "step": 2067 + }, + { + "epoch": 0.14386587359560332, + "grad_norm": 0.7734375, + "learning_rate": 0.0019328165562434024, + "loss": 3.1811, + "step": 2068 + }, + { + "epoch": 0.14393544123273852, + "grad_norm": 0.84375, + "learning_rate": 0.001932735333456991, + "loss": 3.4409, + "step": 2069 + }, + { + "epoch": 0.14400500886987375, + "grad_norm": 0.859375, + "learning_rate": 0.0019326540633109808, + "loss": 3.1112, + "step": 2070 + }, + { + "epoch": 0.14407457650700894, + "grad_norm": 1.1640625, + "learning_rate": 0.0019325727458094982, + "loss": 3.1347, + "step": 2071 + }, + { + "epoch": 0.14414414414414414, + "grad_norm": 0.89453125, + "learning_rate": 0.0019324913809566717, + "loss": 3.1182, + "step": 2072 + }, + { + "epoch": 0.14421371178127934, + "grad_norm": 0.96875, + "learning_rate": 0.0019324099687566335, + "loss": 3.2554, + "step": 2073 + }, + { + "epoch": 0.14428327941841457, + "grad_norm": 1.078125, + "learning_rate": 0.0019323285092135167, + "loss": 2.8964, + "step": 2074 + }, + { + "epoch": 0.14435284705554977, + "grad_norm": 0.98828125, + "learning_rate": 0.0019322470023314573, + "loss": 3.049, + "step": 2075 + }, + { + "epoch": 0.14442241469268496, + "grad_norm": 0.74609375, + "learning_rate": 0.001932165448114594, + "loss": 2.8512, + "step": 2076 + }, + { + "epoch": 0.14449198232982016, + "grad_norm": 0.828125, + "learning_rate": 0.0019320838465670678, + "loss": 3.2504, + "step": 2077 + }, + { + "epoch": 0.14456154996695536, + "grad_norm": 0.67578125, + "learning_rate": 0.001932002197693022, + "loss": 3.0084, + "step": 2078 + }, + { + "epoch": 0.1446311176040906, + "grad_norm": 0.9296875, + "learning_rate": 0.0019319205014966022, + "loss": 3.415, + "step": 2079 + }, + { + "epoch": 0.14470068524122578, + "grad_norm": 0.66015625, + "learning_rate": 0.0019318387579819562, + "loss": 3.2434, + "step": 2080 + }, + { + "epoch": 0.14477025287836098, + "grad_norm": 0.97265625, + "learning_rate": 0.0019317569671532353, + "loss": 3.4969, + "step": 2081 + }, + { + "epoch": 0.14483982051549618, + "grad_norm": 0.9140625, + "learning_rate": 0.0019316751290145923, + "loss": 2.9212, + "step": 2082 + }, + { + "epoch": 0.1449093881526314, + "grad_norm": 0.8828125, + "learning_rate": 0.0019315932435701817, + "loss": 3.4919, + "step": 2083 + }, + { + "epoch": 0.1449789557897666, + "grad_norm": 0.8046875, + "learning_rate": 0.0019315113108241617, + "loss": 3.3674, + "step": 2084 + }, + { + "epoch": 0.1450485234269018, + "grad_norm": 0.7421875, + "learning_rate": 0.0019314293307806927, + "loss": 3.6306, + "step": 2085 + }, + { + "epoch": 0.145118091064037, + "grad_norm": 0.73828125, + "learning_rate": 0.0019313473034439372, + "loss": 3.1649, + "step": 2086 + }, + { + "epoch": 0.1451876587011722, + "grad_norm": 1.0234375, + "learning_rate": 0.00193126522881806, + "loss": 3.0952, + "step": 2087 + }, + { + "epoch": 0.14525722633830743, + "grad_norm": 0.63671875, + "learning_rate": 0.0019311831069072278, + "loss": 3.3009, + "step": 2088 + }, + { + "epoch": 0.14532679397544263, + "grad_norm": 0.65625, + "learning_rate": 0.0019311009377156116, + "loss": 2.86, + "step": 2089 + }, + { + "epoch": 0.14539636161257782, + "grad_norm": 0.57421875, + "learning_rate": 0.0019310187212473826, + "loss": 3.2769, + "step": 2090 + }, + { + "epoch": 0.14546592924971302, + "grad_norm": 0.68359375, + "learning_rate": 0.0019309364575067157, + "loss": 3.2831, + "step": 2091 + }, + { + "epoch": 0.14553549688684825, + "grad_norm": 0.7109375, + "learning_rate": 0.0019308541464977877, + "loss": 3.105, + "step": 2092 + }, + { + "epoch": 0.14560506452398345, + "grad_norm": 0.74609375, + "learning_rate": 0.001930771788224778, + "loss": 3.2699, + "step": 2093 + }, + { + "epoch": 0.14567463216111864, + "grad_norm": 0.83984375, + "learning_rate": 0.0019306893826918684, + "loss": 3.2145, + "step": 2094 + }, + { + "epoch": 0.14574419979825384, + "grad_norm": 0.56640625, + "learning_rate": 0.001930606929903243, + "loss": 3.4146, + "step": 2095 + }, + { + "epoch": 0.14581376743538907, + "grad_norm": 0.75, + "learning_rate": 0.001930524429863088, + "loss": 3.2052, + "step": 2096 + }, + { + "epoch": 0.14588333507252427, + "grad_norm": 0.65234375, + "learning_rate": 0.0019304418825755929, + "loss": 3.7529, + "step": 2097 + }, + { + "epoch": 0.14595290270965947, + "grad_norm": 0.73046875, + "learning_rate": 0.0019303592880449488, + "loss": 3.1949, + "step": 2098 + }, + { + "epoch": 0.14602247034679466, + "grad_norm": 0.9140625, + "learning_rate": 0.0019302766462753493, + "loss": 2.8965, + "step": 2099 + }, + { + "epoch": 0.14609203798392986, + "grad_norm": 0.796875, + "learning_rate": 0.0019301939572709907, + "loss": 3.0809, + "step": 2100 + }, + { + "epoch": 0.1461616056210651, + "grad_norm": 0.81640625, + "learning_rate": 0.0019301112210360714, + "loss": 3.1178, + "step": 2101 + }, + { + "epoch": 0.1462311732582003, + "grad_norm": 0.8203125, + "learning_rate": 0.0019300284375747925, + "loss": 3.6246, + "step": 2102 + }, + { + "epoch": 0.14630074089533548, + "grad_norm": 0.8671875, + "learning_rate": 0.0019299456068913572, + "loss": 3.0863, + "step": 2103 + }, + { + "epoch": 0.14637030853247068, + "grad_norm": 0.77734375, + "learning_rate": 0.0019298627289899715, + "loss": 3.1766, + "step": 2104 + }, + { + "epoch": 0.1464398761696059, + "grad_norm": 0.96875, + "learning_rate": 0.001929779803874843, + "loss": 3.4152, + "step": 2105 + }, + { + "epoch": 0.1465094438067411, + "grad_norm": 0.75, + "learning_rate": 0.0019296968315501823, + "loss": 3.3496, + "step": 2106 + }, + { + "epoch": 0.1465790114438763, + "grad_norm": 0.7734375, + "learning_rate": 0.001929613812020203, + "loss": 3.267, + "step": 2107 + }, + { + "epoch": 0.1466485790810115, + "grad_norm": 0.8203125, + "learning_rate": 0.0019295307452891195, + "loss": 3.1405, + "step": 2108 + }, + { + "epoch": 0.14671814671814673, + "grad_norm": 0.73828125, + "learning_rate": 0.0019294476313611501, + "loss": 3.1015, + "step": 2109 + }, + { + "epoch": 0.14678771435528193, + "grad_norm": 0.734375, + "learning_rate": 0.0019293644702405147, + "loss": 2.8852, + "step": 2110 + }, + { + "epoch": 0.14685728199241713, + "grad_norm": 0.80859375, + "learning_rate": 0.001929281261931436, + "loss": 3.3098, + "step": 2111 + }, + { + "epoch": 0.14692684962955233, + "grad_norm": 0.72265625, + "learning_rate": 0.0019291980064381385, + "loss": 3.1474, + "step": 2112 + }, + { + "epoch": 0.14699641726668752, + "grad_norm": 0.72265625, + "learning_rate": 0.00192911470376485, + "loss": 3.6916, + "step": 2113 + }, + { + "epoch": 0.14706598490382275, + "grad_norm": 0.8046875, + "learning_rate": 0.0019290313539158, + "loss": 3.4483, + "step": 2114 + }, + { + "epoch": 0.14713555254095795, + "grad_norm": 1.09375, + "learning_rate": 0.0019289479568952203, + "loss": 3.3741, + "step": 2115 + }, + { + "epoch": 0.14720512017809315, + "grad_norm": 0.75390625, + "learning_rate": 0.0019288645127073455, + "loss": 2.9652, + "step": 2116 + }, + { + "epoch": 0.14727468781522834, + "grad_norm": 0.8828125, + "learning_rate": 0.0019287810213564126, + "loss": 2.9294, + "step": 2117 + }, + { + "epoch": 0.14734425545236357, + "grad_norm": 0.70703125, + "learning_rate": 0.001928697482846661, + "loss": 3.3925, + "step": 2118 + }, + { + "epoch": 0.14741382308949877, + "grad_norm": 0.8046875, + "learning_rate": 0.001928613897182332, + "loss": 3.0946, + "step": 2119 + }, + { + "epoch": 0.14748339072663397, + "grad_norm": 0.59765625, + "learning_rate": 0.00192853026436767, + "loss": 3.5871, + "step": 2120 + }, + { + "epoch": 0.14755295836376917, + "grad_norm": 0.64453125, + "learning_rate": 0.0019284465844069212, + "loss": 3.4433, + "step": 2121 + }, + { + "epoch": 0.1476225260009044, + "grad_norm": 0.78125, + "learning_rate": 0.0019283628573043348, + "loss": 3.4745, + "step": 2122 + }, + { + "epoch": 0.1476920936380396, + "grad_norm": 0.6484375, + "learning_rate": 0.0019282790830641616, + "loss": 3.5395, + "step": 2123 + }, + { + "epoch": 0.1477616612751748, + "grad_norm": 0.68359375, + "learning_rate": 0.0019281952616906554, + "loss": 2.7807, + "step": 2124 + }, + { + "epoch": 0.14783122891231, + "grad_norm": 0.6015625, + "learning_rate": 0.0019281113931880727, + "loss": 3.6789, + "step": 2125 + }, + { + "epoch": 0.14790079654944518, + "grad_norm": 0.64453125, + "learning_rate": 0.001928027477560671, + "loss": 3.056, + "step": 2126 + }, + { + "epoch": 0.1479703641865804, + "grad_norm": 0.796875, + "learning_rate": 0.0019279435148127117, + "loss": 3.503, + "step": 2127 + }, + { + "epoch": 0.1480399318237156, + "grad_norm": 0.703125, + "learning_rate": 0.001927859504948458, + "loss": 3.5267, + "step": 2128 + }, + { + "epoch": 0.1481094994608508, + "grad_norm": 0.8046875, + "learning_rate": 0.0019277754479721755, + "loss": 3.436, + "step": 2129 + }, + { + "epoch": 0.148179067097986, + "grad_norm": 0.78515625, + "learning_rate": 0.0019276913438881316, + "loss": 3.2678, + "step": 2130 + }, + { + "epoch": 0.14824863473512123, + "grad_norm": 0.71484375, + "learning_rate": 0.0019276071927005977, + "loss": 3.2497, + "step": 2131 + }, + { + "epoch": 0.14831820237225643, + "grad_norm": 0.765625, + "learning_rate": 0.0019275229944138456, + "loss": 3.058, + "step": 2132 + }, + { + "epoch": 0.14838777000939163, + "grad_norm": 1.015625, + "learning_rate": 0.0019274387490321515, + "loss": 3.205, + "step": 2133 + }, + { + "epoch": 0.14845733764652683, + "grad_norm": 0.81640625, + "learning_rate": 0.0019273544565597918, + "loss": 2.9491, + "step": 2134 + }, + { + "epoch": 0.14852690528366205, + "grad_norm": 1.1484375, + "learning_rate": 0.0019272701170010471, + "loss": 3.448, + "step": 2135 + }, + { + "epoch": 0.14859647292079725, + "grad_norm": 0.7578125, + "learning_rate": 0.0019271857303602, + "loss": 3.424, + "step": 2136 + }, + { + "epoch": 0.14866604055793245, + "grad_norm": 0.875, + "learning_rate": 0.0019271012966415345, + "loss": 3.2642, + "step": 2137 + }, + { + "epoch": 0.14873560819506765, + "grad_norm": 0.921875, + "learning_rate": 0.001927016815849338, + "loss": 3.8212, + "step": 2138 + }, + { + "epoch": 0.14880517583220285, + "grad_norm": 0.875, + "learning_rate": 0.0019269322879879006, + "loss": 3.1188, + "step": 2139 + }, + { + "epoch": 0.14887474346933807, + "grad_norm": 0.98046875, + "learning_rate": 0.0019268477130615135, + "loss": 3.5071, + "step": 2140 + }, + { + "epoch": 0.14894431110647327, + "grad_norm": 0.74609375, + "learning_rate": 0.0019267630910744708, + "loss": 3.1713, + "step": 2141 + }, + { + "epoch": 0.14901387874360847, + "grad_norm": 0.7109375, + "learning_rate": 0.00192667842203107, + "loss": 3.2658, + "step": 2142 + }, + { + "epoch": 0.14908344638074367, + "grad_norm": 0.625, + "learning_rate": 0.0019265937059356095, + "loss": 3.1708, + "step": 2143 + }, + { + "epoch": 0.1491530140178789, + "grad_norm": 0.6484375, + "learning_rate": 0.0019265089427923914, + "loss": 3.24, + "step": 2144 + }, + { + "epoch": 0.1492225816550141, + "grad_norm": 0.67578125, + "learning_rate": 0.0019264241326057189, + "loss": 3.2901, + "step": 2145 + }, + { + "epoch": 0.1492921492921493, + "grad_norm": 0.74609375, + "learning_rate": 0.0019263392753798981, + "loss": 3.1596, + "step": 2146 + }, + { + "epoch": 0.1493617169292845, + "grad_norm": 1.1171875, + "learning_rate": 0.0019262543711192385, + "loss": 2.9588, + "step": 2147 + }, + { + "epoch": 0.14943128456641971, + "grad_norm": 0.796875, + "learning_rate": 0.0019261694198280503, + "loss": 2.9698, + "step": 2148 + }, + { + "epoch": 0.1495008522035549, + "grad_norm": 0.7890625, + "learning_rate": 0.0019260844215106471, + "loss": 3.4912, + "step": 2149 + }, + { + "epoch": 0.1495704198406901, + "grad_norm": 0.67578125, + "learning_rate": 0.0019259993761713452, + "loss": 3.1958, + "step": 2150 + }, + { + "epoch": 0.1496399874778253, + "grad_norm": 0.8515625, + "learning_rate": 0.0019259142838144623, + "loss": 2.9556, + "step": 2151 + }, + { + "epoch": 0.1497095551149605, + "grad_norm": 0.7890625, + "learning_rate": 0.0019258291444443187, + "loss": 3.0389, + "step": 2152 + }, + { + "epoch": 0.14977912275209573, + "grad_norm": 0.9296875, + "learning_rate": 0.0019257439580652378, + "loss": 3.3051, + "step": 2153 + }, + { + "epoch": 0.14984869038923093, + "grad_norm": 0.78515625, + "learning_rate": 0.0019256587246815448, + "loss": 3.0117, + "step": 2154 + }, + { + "epoch": 0.14991825802636613, + "grad_norm": 0.6328125, + "learning_rate": 0.0019255734442975676, + "loss": 3.5847, + "step": 2155 + }, + { + "epoch": 0.14998782566350133, + "grad_norm": 0.84765625, + "learning_rate": 0.001925488116917636, + "loss": 3.4234, + "step": 2156 + }, + { + "epoch": 0.15005739330063655, + "grad_norm": 0.80078125, + "learning_rate": 0.0019254027425460827, + "loss": 3.4137, + "step": 2157 + }, + { + "epoch": 0.15012696093777175, + "grad_norm": 0.78515625, + "learning_rate": 0.0019253173211872423, + "loss": 3.4854, + "step": 2158 + }, + { + "epoch": 0.15019652857490695, + "grad_norm": 0.65625, + "learning_rate": 0.0019252318528454526, + "loss": 3.1512, + "step": 2159 + }, + { + "epoch": 0.15026609621204215, + "grad_norm": 0.72265625, + "learning_rate": 0.0019251463375250526, + "loss": 3.7507, + "step": 2160 + }, + { + "epoch": 0.15033566384917738, + "grad_norm": 0.71875, + "learning_rate": 0.001925060775230385, + "loss": 3.488, + "step": 2161 + }, + { + "epoch": 0.15040523148631257, + "grad_norm": 0.6953125, + "learning_rate": 0.0019249751659657934, + "loss": 3.3587, + "step": 2162 + }, + { + "epoch": 0.15047479912344777, + "grad_norm": 0.8203125, + "learning_rate": 0.0019248895097356256, + "loss": 3.428, + "step": 2163 + }, + { + "epoch": 0.15054436676058297, + "grad_norm": 0.76953125, + "learning_rate": 0.00192480380654423, + "loss": 3.1657, + "step": 2164 + }, + { + "epoch": 0.15061393439771817, + "grad_norm": 0.80078125, + "learning_rate": 0.0019247180563959586, + "loss": 3.5087, + "step": 2165 + }, + { + "epoch": 0.1506835020348534, + "grad_norm": 0.7734375, + "learning_rate": 0.0019246322592951653, + "loss": 3.457, + "step": 2166 + }, + { + "epoch": 0.1507530696719886, + "grad_norm": 0.75390625, + "learning_rate": 0.0019245464152462062, + "loss": 3.4768, + "step": 2167 + }, + { + "epoch": 0.1508226373091238, + "grad_norm": 0.90234375, + "learning_rate": 0.0019244605242534402, + "loss": 3.0841, + "step": 2168 + }, + { + "epoch": 0.150892204946259, + "grad_norm": 0.76171875, + "learning_rate": 0.0019243745863212283, + "loss": 3.1466, + "step": 2169 + }, + { + "epoch": 0.15096177258339422, + "grad_norm": 0.8828125, + "learning_rate": 0.0019242886014539343, + "loss": 2.7856, + "step": 2170 + }, + { + "epoch": 0.15103134022052941, + "grad_norm": 0.78515625, + "learning_rate": 0.0019242025696559239, + "loss": 3.379, + "step": 2171 + }, + { + "epoch": 0.1511009078576646, + "grad_norm": 0.68359375, + "learning_rate": 0.0019241164909315652, + "loss": 3.4753, + "step": 2172 + }, + { + "epoch": 0.1511704754947998, + "grad_norm": 0.8359375, + "learning_rate": 0.001924030365285229, + "loss": 2.9882, + "step": 2173 + }, + { + "epoch": 0.15124004313193504, + "grad_norm": 0.71875, + "learning_rate": 0.0019239441927212885, + "loss": 3.2017, + "step": 2174 + }, + { + "epoch": 0.15130961076907024, + "grad_norm": 0.8125, + "learning_rate": 0.0019238579732441185, + "loss": 2.9532, + "step": 2175 + }, + { + "epoch": 0.15137917840620543, + "grad_norm": 0.75, + "learning_rate": 0.0019237717068580973, + "loss": 3.4523, + "step": 2176 + }, + { + "epoch": 0.15144874604334063, + "grad_norm": 0.6171875, + "learning_rate": 0.0019236853935676052, + "loss": 3.2862, + "step": 2177 + }, + { + "epoch": 0.15151831368047583, + "grad_norm": 0.62890625, + "learning_rate": 0.0019235990333770247, + "loss": 3.4693, + "step": 2178 + }, + { + "epoch": 0.15158788131761106, + "grad_norm": 0.8515625, + "learning_rate": 0.0019235126262907402, + "loss": 3.1924, + "step": 2179 + }, + { + "epoch": 0.15165744895474625, + "grad_norm": 0.86328125, + "learning_rate": 0.0019234261723131395, + "loss": 3.4629, + "step": 2180 + }, + { + "epoch": 0.15172701659188145, + "grad_norm": 1.0546875, + "learning_rate": 0.0019233396714486122, + "loss": 3.4659, + "step": 2181 + }, + { + "epoch": 0.15179658422901665, + "grad_norm": 0.71484375, + "learning_rate": 0.0019232531237015503, + "loss": 3.5634, + "step": 2182 + }, + { + "epoch": 0.15186615186615188, + "grad_norm": 0.8203125, + "learning_rate": 0.0019231665290763485, + "loss": 3.2782, + "step": 2183 + }, + { + "epoch": 0.15193571950328708, + "grad_norm": 0.91015625, + "learning_rate": 0.0019230798875774031, + "loss": 2.896, + "step": 2184 + }, + { + "epoch": 0.15200528714042227, + "grad_norm": 1.078125, + "learning_rate": 0.001922993199209114, + "loss": 3.9471, + "step": 2185 + }, + { + "epoch": 0.15207485477755747, + "grad_norm": 0.765625, + "learning_rate": 0.0019229064639758825, + "loss": 3.2744, + "step": 2186 + }, + { + "epoch": 0.1521444224146927, + "grad_norm": 0.90625, + "learning_rate": 0.0019228196818821127, + "loss": 3.12, + "step": 2187 + }, + { + "epoch": 0.1522139900518279, + "grad_norm": 0.80078125, + "learning_rate": 0.0019227328529322102, + "loss": 3.2227, + "step": 2188 + }, + { + "epoch": 0.1522835576889631, + "grad_norm": 0.734375, + "learning_rate": 0.001922645977130585, + "loss": 3.2465, + "step": 2189 + }, + { + "epoch": 0.1523531253260983, + "grad_norm": 0.78125, + "learning_rate": 0.0019225590544816472, + "loss": 2.951, + "step": 2190 + }, + { + "epoch": 0.1524226929632335, + "grad_norm": 0.796875, + "learning_rate": 0.0019224720849898107, + "loss": 3.3373, + "step": 2191 + }, + { + "epoch": 0.15249226060036872, + "grad_norm": 0.72265625, + "learning_rate": 0.0019223850686594913, + "loss": 3.667, + "step": 2192 + }, + { + "epoch": 0.15256182823750392, + "grad_norm": 1.0078125, + "learning_rate": 0.0019222980054951072, + "loss": 3.4597, + "step": 2193 + }, + { + "epoch": 0.15263139587463911, + "grad_norm": 0.7421875, + "learning_rate": 0.0019222108955010793, + "loss": 3.0378, + "step": 2194 + }, + { + "epoch": 0.1527009635117743, + "grad_norm": 0.828125, + "learning_rate": 0.0019221237386818305, + "loss": 3.2428, + "step": 2195 + }, + { + "epoch": 0.15277053114890954, + "grad_norm": 0.72265625, + "learning_rate": 0.0019220365350417858, + "loss": 3.5437, + "step": 2196 + }, + { + "epoch": 0.15284009878604474, + "grad_norm": 0.93359375, + "learning_rate": 0.0019219492845853733, + "loss": 3.1585, + "step": 2197 + }, + { + "epoch": 0.15290966642317994, + "grad_norm": 0.80859375, + "learning_rate": 0.0019218619873170232, + "loss": 3.2917, + "step": 2198 + }, + { + "epoch": 0.15297923406031513, + "grad_norm": 0.73046875, + "learning_rate": 0.001921774643241168, + "loss": 3.4092, + "step": 2199 + }, + { + "epoch": 0.15304880169745036, + "grad_norm": 0.83203125, + "learning_rate": 0.0019216872523622427, + "loss": 3.2245, + "step": 2200 + }, + { + "epoch": 0.15311836933458556, + "grad_norm": 0.88671875, + "learning_rate": 0.0019215998146846838, + "loss": 2.9539, + "step": 2201 + }, + { + "epoch": 0.15318793697172076, + "grad_norm": 0.77734375, + "learning_rate": 0.001921512330212932, + "loss": 3.0401, + "step": 2202 + }, + { + "epoch": 0.15325750460885595, + "grad_norm": 0.9296875, + "learning_rate": 0.0019214247989514286, + "loss": 3.1097, + "step": 2203 + }, + { + "epoch": 0.15332707224599115, + "grad_norm": 0.91015625, + "learning_rate": 0.0019213372209046183, + "loss": 3.3438, + "step": 2204 + }, + { + "epoch": 0.15339663988312638, + "grad_norm": 0.7109375, + "learning_rate": 0.0019212495960769479, + "loss": 3.4192, + "step": 2205 + }, + { + "epoch": 0.15346620752026158, + "grad_norm": 0.7734375, + "learning_rate": 0.001921161924472866, + "loss": 3.0318, + "step": 2206 + }, + { + "epoch": 0.15353577515739678, + "grad_norm": 0.74609375, + "learning_rate": 0.001921074206096825, + "loss": 3.2601, + "step": 2207 + }, + { + "epoch": 0.15360534279453197, + "grad_norm": 0.85546875, + "learning_rate": 0.0019209864409532784, + "loss": 3.0052, + "step": 2208 + }, + { + "epoch": 0.1536749104316672, + "grad_norm": 0.6953125, + "learning_rate": 0.0019208986290466822, + "loss": 3.2566, + "step": 2209 + }, + { + "epoch": 0.1537444780688024, + "grad_norm": 0.81640625, + "learning_rate": 0.0019208107703814954, + "loss": 3.2904, + "step": 2210 + }, + { + "epoch": 0.1538140457059376, + "grad_norm": 0.859375, + "learning_rate": 0.001920722864962179, + "loss": 3.2598, + "step": 2211 + }, + { + "epoch": 0.1538836133430728, + "grad_norm": 0.88671875, + "learning_rate": 0.0019206349127931963, + "loss": 2.8552, + "step": 2212 + }, + { + "epoch": 0.15395318098020802, + "grad_norm": 0.734375, + "learning_rate": 0.001920546913879013, + "loss": 3.4522, + "step": 2213 + }, + { + "epoch": 0.15402274861734322, + "grad_norm": 0.86328125, + "learning_rate": 0.0019204588682240973, + "loss": 3.4462, + "step": 2214 + }, + { + "epoch": 0.15409231625447842, + "grad_norm": 0.80859375, + "learning_rate": 0.0019203707758329198, + "loss": 3.5985, + "step": 2215 + }, + { + "epoch": 0.15416188389161362, + "grad_norm": 0.7734375, + "learning_rate": 0.0019202826367099534, + "loss": 3.1834, + "step": 2216 + }, + { + "epoch": 0.15423145152874881, + "grad_norm": 0.83984375, + "learning_rate": 0.0019201944508596732, + "loss": 3.5128, + "step": 2217 + }, + { + "epoch": 0.15430101916588404, + "grad_norm": 0.87109375, + "learning_rate": 0.0019201062182865566, + "loss": 3.4152, + "step": 2218 + }, + { + "epoch": 0.15437058680301924, + "grad_norm": 0.7421875, + "learning_rate": 0.0019200179389950842, + "loss": 3.2574, + "step": 2219 + }, + { + "epoch": 0.15444015444015444, + "grad_norm": 0.62109375, + "learning_rate": 0.001919929612989738, + "loss": 3.3873, + "step": 2220 + }, + { + "epoch": 0.15450972207728964, + "grad_norm": 0.81640625, + "learning_rate": 0.001919841240275003, + "loss": 3.3195, + "step": 2221 + }, + { + "epoch": 0.15457928971442486, + "grad_norm": 0.7421875, + "learning_rate": 0.0019197528208553661, + "loss": 3.0571, + "step": 2222 + }, + { + "epoch": 0.15464885735156006, + "grad_norm": 0.7109375, + "learning_rate": 0.0019196643547353168, + "loss": 3.526, + "step": 2223 + }, + { + "epoch": 0.15471842498869526, + "grad_norm": 0.73828125, + "learning_rate": 0.001919575841919347, + "loss": 3.4711, + "step": 2224 + }, + { + "epoch": 0.15478799262583046, + "grad_norm": 0.8359375, + "learning_rate": 0.001919487282411951, + "loss": 3.3619, + "step": 2225 + }, + { + "epoch": 0.15485756026296568, + "grad_norm": 0.640625, + "learning_rate": 0.0019193986762176252, + "loss": 3.0812, + "step": 2226 + }, + { + "epoch": 0.15492712790010088, + "grad_norm": 0.6953125, + "learning_rate": 0.0019193100233408692, + "loss": 3.1839, + "step": 2227 + }, + { + "epoch": 0.15499669553723608, + "grad_norm": 0.76171875, + "learning_rate": 0.0019192213237861834, + "loss": 3.0777, + "step": 2228 + }, + { + "epoch": 0.15506626317437128, + "grad_norm": 0.87890625, + "learning_rate": 0.0019191325775580722, + "loss": 2.9409, + "step": 2229 + }, + { + "epoch": 0.15513583081150648, + "grad_norm": 0.90625, + "learning_rate": 0.0019190437846610413, + "loss": 3.1332, + "step": 2230 + }, + { + "epoch": 0.1552053984486417, + "grad_norm": 1.0390625, + "learning_rate": 0.0019189549450995996, + "loss": 3.3082, + "step": 2231 + }, + { + "epoch": 0.1552749660857769, + "grad_norm": 0.86328125, + "learning_rate": 0.0019188660588782573, + "loss": 3.2494, + "step": 2232 + }, + { + "epoch": 0.1553445337229121, + "grad_norm": 1.078125, + "learning_rate": 0.0019187771260015284, + "loss": 3.215, + "step": 2233 + }, + { + "epoch": 0.1554141013600473, + "grad_norm": 0.80859375, + "learning_rate": 0.0019186881464739278, + "loss": 3.6842, + "step": 2234 + }, + { + "epoch": 0.15548366899718252, + "grad_norm": 0.72265625, + "learning_rate": 0.0019185991202999738, + "loss": 3.232, + "step": 2235 + }, + { + "epoch": 0.15555323663431772, + "grad_norm": 0.91796875, + "learning_rate": 0.0019185100474841863, + "loss": 3.0475, + "step": 2236 + }, + { + "epoch": 0.15562280427145292, + "grad_norm": 0.734375, + "learning_rate": 0.0019184209280310883, + "loss": 3.2183, + "step": 2237 + }, + { + "epoch": 0.15569237190858812, + "grad_norm": 0.91796875, + "learning_rate": 0.001918331761945205, + "loss": 3.4003, + "step": 2238 + }, + { + "epoch": 0.15576193954572332, + "grad_norm": 0.83203125, + "learning_rate": 0.0019182425492310633, + "loss": 3.3352, + "step": 2239 + }, + { + "epoch": 0.15583150718285854, + "grad_norm": 0.9375, + "learning_rate": 0.0019181532898931934, + "loss": 3.2472, + "step": 2240 + }, + { + "epoch": 0.15590107481999374, + "grad_norm": 0.9140625, + "learning_rate": 0.001918063983936127, + "loss": 3.2951, + "step": 2241 + }, + { + "epoch": 0.15597064245712894, + "grad_norm": 1.1015625, + "learning_rate": 0.0019179746313643992, + "loss": 3.3719, + "step": 2242 + }, + { + "epoch": 0.15604021009426414, + "grad_norm": 0.78125, + "learning_rate": 0.0019178852321825464, + "loss": 3.3772, + "step": 2243 + }, + { + "epoch": 0.15610977773139936, + "grad_norm": 0.7578125, + "learning_rate": 0.001917795786395108, + "loss": 3.1354, + "step": 2244 + }, + { + "epoch": 0.15617934536853456, + "grad_norm": 0.6953125, + "learning_rate": 0.0019177062940066256, + "loss": 3.3926, + "step": 2245 + }, + { + "epoch": 0.15624891300566976, + "grad_norm": 0.87890625, + "learning_rate": 0.0019176167550216433, + "loss": 3.2926, + "step": 2246 + }, + { + "epoch": 0.15631848064280496, + "grad_norm": 0.82421875, + "learning_rate": 0.0019175271694447072, + "loss": 3.3812, + "step": 2247 + }, + { + "epoch": 0.15638804827994018, + "grad_norm": 0.9296875, + "learning_rate": 0.0019174375372803662, + "loss": 3.2116, + "step": 2248 + }, + { + "epoch": 0.15645761591707538, + "grad_norm": 0.82421875, + "learning_rate": 0.0019173478585331712, + "loss": 2.9926, + "step": 2249 + }, + { + "epoch": 0.15652718355421058, + "grad_norm": 0.9921875, + "learning_rate": 0.0019172581332076756, + "loss": 3.1369, + "step": 2250 + }, + { + "epoch": 0.15659675119134578, + "grad_norm": 0.79296875, + "learning_rate": 0.0019171683613084353, + "loss": 3.4026, + "step": 2251 + }, + { + "epoch": 0.15666631882848098, + "grad_norm": 0.73046875, + "learning_rate": 0.0019170785428400086, + "loss": 3.0608, + "step": 2252 + }, + { + "epoch": 0.1567358864656162, + "grad_norm": 1.03125, + "learning_rate": 0.001916988677806956, + "loss": 3.22, + "step": 2253 + }, + { + "epoch": 0.1568054541027514, + "grad_norm": 0.65625, + "learning_rate": 0.0019168987662138402, + "loss": 3.0833, + "step": 2254 + }, + { + "epoch": 0.1568750217398866, + "grad_norm": 0.83984375, + "learning_rate": 0.0019168088080652268, + "loss": 2.9146, + "step": 2255 + }, + { + "epoch": 0.1569445893770218, + "grad_norm": 0.86328125, + "learning_rate": 0.0019167188033656828, + "loss": 3.3521, + "step": 2256 + }, + { + "epoch": 0.15701415701415702, + "grad_norm": 0.84375, + "learning_rate": 0.0019166287521197786, + "loss": 3.179, + "step": 2257 + }, + { + "epoch": 0.15708372465129222, + "grad_norm": 0.6328125, + "learning_rate": 0.0019165386543320867, + "loss": 3.3823, + "step": 2258 + }, + { + "epoch": 0.15715329228842742, + "grad_norm": 0.83984375, + "learning_rate": 0.0019164485100071817, + "loss": 3.5961, + "step": 2259 + }, + { + "epoch": 0.15722285992556262, + "grad_norm": 0.828125, + "learning_rate": 0.0019163583191496407, + "loss": 3.0851, + "step": 2260 + }, + { + "epoch": 0.15729242756269785, + "grad_norm": 0.77734375, + "learning_rate": 0.0019162680817640429, + "loss": 3.232, + "step": 2261 + }, + { + "epoch": 0.15736199519983304, + "grad_norm": 0.796875, + "learning_rate": 0.00191617779785497, + "loss": 3.3365, + "step": 2262 + }, + { + "epoch": 0.15743156283696824, + "grad_norm": 0.6640625, + "learning_rate": 0.0019160874674270067, + "loss": 3.7762, + "step": 2263 + }, + { + "epoch": 0.15750113047410344, + "grad_norm": 0.80859375, + "learning_rate": 0.0019159970904847393, + "loss": 3.0307, + "step": 2264 + }, + { + "epoch": 0.15757069811123864, + "grad_norm": 0.76171875, + "learning_rate": 0.0019159066670327563, + "loss": 3.2305, + "step": 2265 + }, + { + "epoch": 0.15764026574837386, + "grad_norm": 0.7109375, + "learning_rate": 0.0019158161970756493, + "loss": 3.0286, + "step": 2266 + }, + { + "epoch": 0.15770983338550906, + "grad_norm": 0.94921875, + "learning_rate": 0.001915725680618012, + "loss": 3.1607, + "step": 2267 + }, + { + "epoch": 0.15777940102264426, + "grad_norm": 0.859375, + "learning_rate": 0.0019156351176644404, + "loss": 3.2419, + "step": 2268 + }, + { + "epoch": 0.15784896865977946, + "grad_norm": 0.65625, + "learning_rate": 0.0019155445082195324, + "loss": 3.6515, + "step": 2269 + }, + { + "epoch": 0.15791853629691469, + "grad_norm": 0.87109375, + "learning_rate": 0.001915453852287889, + "loss": 3.5183, + "step": 2270 + }, + { + "epoch": 0.15798810393404988, + "grad_norm": 0.7421875, + "learning_rate": 0.0019153631498741133, + "loss": 3.5219, + "step": 2271 + }, + { + "epoch": 0.15805767157118508, + "grad_norm": 0.78125, + "learning_rate": 0.0019152724009828105, + "loss": 3.2711, + "step": 2272 + }, + { + "epoch": 0.15812723920832028, + "grad_norm": 0.73046875, + "learning_rate": 0.0019151816056185887, + "loss": 3.2095, + "step": 2273 + }, + { + "epoch": 0.1581968068454555, + "grad_norm": 0.734375, + "learning_rate": 0.0019150907637860576, + "loss": 3.4705, + "step": 2274 + }, + { + "epoch": 0.1582663744825907, + "grad_norm": 0.75, + "learning_rate": 0.0019149998754898298, + "loss": 3.5706, + "step": 2275 + }, + { + "epoch": 0.1583359421197259, + "grad_norm": 0.92578125, + "learning_rate": 0.0019149089407345206, + "loss": 3.1592, + "step": 2276 + }, + { + "epoch": 0.1584055097568611, + "grad_norm": 0.8203125, + "learning_rate": 0.0019148179595247468, + "loss": 3.2951, + "step": 2277 + }, + { + "epoch": 0.1584750773939963, + "grad_norm": 0.9140625, + "learning_rate": 0.0019147269318651279, + "loss": 3.4067, + "step": 2278 + }, + { + "epoch": 0.15854464503113153, + "grad_norm": 0.87109375, + "learning_rate": 0.001914635857760286, + "loss": 3.2653, + "step": 2279 + }, + { + "epoch": 0.15861421266826672, + "grad_norm": 0.80859375, + "learning_rate": 0.0019145447372148454, + "loss": 3.1743, + "step": 2280 + }, + { + "epoch": 0.15868378030540192, + "grad_norm": 0.671875, + "learning_rate": 0.0019144535702334327, + "loss": 3.3068, + "step": 2281 + }, + { + "epoch": 0.15875334794253712, + "grad_norm": 0.83984375, + "learning_rate": 0.001914362356820677, + "loss": 3.6477, + "step": 2282 + }, + { + "epoch": 0.15882291557967235, + "grad_norm": 0.87109375, + "learning_rate": 0.0019142710969812092, + "loss": 3.1305, + "step": 2283 + }, + { + "epoch": 0.15889248321680755, + "grad_norm": 0.88671875, + "learning_rate": 0.0019141797907196638, + "loss": 2.9882, + "step": 2284 + }, + { + "epoch": 0.15896205085394274, + "grad_norm": 0.6796875, + "learning_rate": 0.0019140884380406762, + "loss": 3.4047, + "step": 2285 + }, + { + "epoch": 0.15903161849107794, + "grad_norm": 0.71484375, + "learning_rate": 0.001913997038948885, + "loss": 3.4563, + "step": 2286 + }, + { + "epoch": 0.15910118612821317, + "grad_norm": 0.71484375, + "learning_rate": 0.001913905593448931, + "loss": 3.0906, + "step": 2287 + }, + { + "epoch": 0.15917075376534837, + "grad_norm": 0.69140625, + "learning_rate": 0.0019138141015454578, + "loss": 3.483, + "step": 2288 + }, + { + "epoch": 0.15924032140248356, + "grad_norm": 0.8125, + "learning_rate": 0.00191372256324311, + "loss": 3.0954, + "step": 2289 + }, + { + "epoch": 0.15930988903961876, + "grad_norm": 0.92578125, + "learning_rate": 0.0019136309785465363, + "loss": 3.2783, + "step": 2290 + }, + { + "epoch": 0.15937945667675396, + "grad_norm": 0.671875, + "learning_rate": 0.0019135393474603863, + "loss": 3.4251, + "step": 2291 + }, + { + "epoch": 0.1594490243138892, + "grad_norm": 1.15625, + "learning_rate": 0.0019134476699893131, + "loss": 3.1533, + "step": 2292 + }, + { + "epoch": 0.15951859195102439, + "grad_norm": 0.76171875, + "learning_rate": 0.0019133559461379708, + "loss": 3.1875, + "step": 2293 + }, + { + "epoch": 0.15958815958815958, + "grad_norm": 0.7578125, + "learning_rate": 0.0019132641759110175, + "loss": 3.1194, + "step": 2294 + }, + { + "epoch": 0.15965772722529478, + "grad_norm": 0.78515625, + "learning_rate": 0.001913172359313113, + "loss": 3.3485, + "step": 2295 + }, + { + "epoch": 0.15972729486243, + "grad_norm": 0.91796875, + "learning_rate": 0.0019130804963489183, + "loss": 3.5899, + "step": 2296 + }, + { + "epoch": 0.1597968624995652, + "grad_norm": 0.96484375, + "learning_rate": 0.0019129885870230983, + "loss": 2.8317, + "step": 2297 + }, + { + "epoch": 0.1598664301367004, + "grad_norm": 0.6953125, + "learning_rate": 0.0019128966313403197, + "loss": 3.3308, + "step": 2298 + }, + { + "epoch": 0.1599359977738356, + "grad_norm": 0.828125, + "learning_rate": 0.0019128046293052515, + "loss": 3.3317, + "step": 2299 + }, + { + "epoch": 0.16000556541097083, + "grad_norm": 0.8515625, + "learning_rate": 0.0019127125809225653, + "loss": 3.7295, + "step": 2300 + }, + { + "epoch": 0.16007513304810603, + "grad_norm": 0.84765625, + "learning_rate": 0.0019126204861969344, + "loss": 3.281, + "step": 2301 + }, + { + "epoch": 0.16014470068524123, + "grad_norm": 0.8828125, + "learning_rate": 0.0019125283451330354, + "loss": 3.1455, + "step": 2302 + }, + { + "epoch": 0.16021426832237642, + "grad_norm": 0.953125, + "learning_rate": 0.0019124361577355462, + "loss": 2.7583, + "step": 2303 + }, + { + "epoch": 0.16028383595951162, + "grad_norm": 0.9609375, + "learning_rate": 0.0019123439240091482, + "loss": 3.3957, + "step": 2304 + }, + { + "epoch": 0.16035340359664685, + "grad_norm": 0.83203125, + "learning_rate": 0.0019122516439585243, + "loss": 3.1651, + "step": 2305 + }, + { + "epoch": 0.16042297123378205, + "grad_norm": 0.91796875, + "learning_rate": 0.0019121593175883596, + "loss": 3.4217, + "step": 2306 + }, + { + "epoch": 0.16049253887091725, + "grad_norm": 0.77734375, + "learning_rate": 0.0019120669449033429, + "loss": 3.1833, + "step": 2307 + }, + { + "epoch": 0.16056210650805244, + "grad_norm": 0.77734375, + "learning_rate": 0.0019119745259081635, + "loss": 3.5411, + "step": 2308 + }, + { + "epoch": 0.16063167414518767, + "grad_norm": 0.7265625, + "learning_rate": 0.0019118820606075146, + "loss": 3.6817, + "step": 2309 + }, + { + "epoch": 0.16070124178232287, + "grad_norm": 0.70703125, + "learning_rate": 0.001911789549006091, + "loss": 3.2903, + "step": 2310 + }, + { + "epoch": 0.16077080941945807, + "grad_norm": 0.82421875, + "learning_rate": 0.0019116969911085896, + "loss": 2.9979, + "step": 2311 + }, + { + "epoch": 0.16084037705659326, + "grad_norm": 0.921875, + "learning_rate": 0.0019116043869197102, + "loss": 3.6423, + "step": 2312 + }, + { + "epoch": 0.1609099446937285, + "grad_norm": 0.78515625, + "learning_rate": 0.0019115117364441553, + "loss": 3.4296, + "step": 2313 + }, + { + "epoch": 0.1609795123308637, + "grad_norm": 0.70703125, + "learning_rate": 0.0019114190396866283, + "loss": 3.6313, + "step": 2314 + }, + { + "epoch": 0.1610490799679989, + "grad_norm": 0.8671875, + "learning_rate": 0.0019113262966518369, + "loss": 3.1692, + "step": 2315 + }, + { + "epoch": 0.16111864760513409, + "grad_norm": 0.875, + "learning_rate": 0.0019112335073444891, + "loss": 3.4536, + "step": 2316 + }, + { + "epoch": 0.16118821524226928, + "grad_norm": 1.0, + "learning_rate": 0.0019111406717692966, + "loss": 2.7128, + "step": 2317 + }, + { + "epoch": 0.1612577828794045, + "grad_norm": 0.859375, + "learning_rate": 0.0019110477899309739, + "loss": 3.1569, + "step": 2318 + }, + { + "epoch": 0.1613273505165397, + "grad_norm": 0.84765625, + "learning_rate": 0.001910954861834236, + "loss": 3.2842, + "step": 2319 + }, + { + "epoch": 0.1613969181536749, + "grad_norm": 0.8046875, + "learning_rate": 0.001910861887483802, + "loss": 3.1674, + "step": 2320 + }, + { + "epoch": 0.1614664857908101, + "grad_norm": 0.6953125, + "learning_rate": 0.0019107688668843924, + "loss": 3.3832, + "step": 2321 + }, + { + "epoch": 0.16153605342794533, + "grad_norm": 0.97265625, + "learning_rate": 0.00191067580004073, + "loss": 2.9273, + "step": 2322 + }, + { + "epoch": 0.16160562106508053, + "grad_norm": 0.81640625, + "learning_rate": 0.001910582686957541, + "loss": 3.4529, + "step": 2323 + }, + { + "epoch": 0.16167518870221573, + "grad_norm": 1.0078125, + "learning_rate": 0.001910489527639553, + "loss": 3.2073, + "step": 2324 + }, + { + "epoch": 0.16174475633935093, + "grad_norm": 0.75390625, + "learning_rate": 0.0019103963220914958, + "loss": 3.587, + "step": 2325 + }, + { + "epoch": 0.16181432397648615, + "grad_norm": 0.8203125, + "learning_rate": 0.001910303070318102, + "loss": 2.9228, + "step": 2326 + }, + { + "epoch": 0.16188389161362135, + "grad_norm": 1.0546875, + "learning_rate": 0.0019102097723241065, + "loss": 3.3639, + "step": 2327 + }, + { + "epoch": 0.16195345925075655, + "grad_norm": 0.89453125, + "learning_rate": 0.0019101164281142466, + "loss": 3.4723, + "step": 2328 + }, + { + "epoch": 0.16202302688789175, + "grad_norm": 0.87109375, + "learning_rate": 0.0019100230376932618, + "loss": 3.236, + "step": 2329 + }, + { + "epoch": 0.16209259452502695, + "grad_norm": 1.0546875, + "learning_rate": 0.001909929601065894, + "loss": 3.2439, + "step": 2330 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 0.8515625, + "learning_rate": 0.0019098361182368878, + "loss": 3.2361, + "step": 2331 + }, + { + "epoch": 0.16223172979929737, + "grad_norm": 0.859375, + "learning_rate": 0.0019097425892109889, + "loss": 3.3246, + "step": 2332 + }, + { + "epoch": 0.16230129743643257, + "grad_norm": 0.84375, + "learning_rate": 0.0019096490139929472, + "loss": 2.9247, + "step": 2333 + }, + { + "epoch": 0.16237086507356777, + "grad_norm": 1.0234375, + "learning_rate": 0.0019095553925875133, + "loss": 3.1494, + "step": 2334 + }, + { + "epoch": 0.162440432710703, + "grad_norm": 0.98046875, + "learning_rate": 0.001909461724999441, + "loss": 3.2959, + "step": 2335 + }, + { + "epoch": 0.1625100003478382, + "grad_norm": 0.97265625, + "learning_rate": 0.0019093680112334864, + "loss": 3.0625, + "step": 2336 + }, + { + "epoch": 0.1625795679849734, + "grad_norm": 1.0546875, + "learning_rate": 0.001909274251294408, + "loss": 3.179, + "step": 2337 + }, + { + "epoch": 0.1626491356221086, + "grad_norm": 0.85546875, + "learning_rate": 0.001909180445186966, + "loss": 3.1252, + "step": 2338 + }, + { + "epoch": 0.1627187032592438, + "grad_norm": 0.71484375, + "learning_rate": 0.0019090865929159233, + "loss": 3.6039, + "step": 2339 + }, + { + "epoch": 0.162788270896379, + "grad_norm": 0.77734375, + "learning_rate": 0.0019089926944860461, + "loss": 3.4938, + "step": 2340 + }, + { + "epoch": 0.1628578385335142, + "grad_norm": 0.59375, + "learning_rate": 0.0019088987499021012, + "loss": 3.6166, + "step": 2341 + }, + { + "epoch": 0.1629274061706494, + "grad_norm": 1.0234375, + "learning_rate": 0.001908804759168859, + "loss": 3.4761, + "step": 2342 + }, + { + "epoch": 0.1629969738077846, + "grad_norm": 0.90234375, + "learning_rate": 0.001908710722291092, + "loss": 3.2296, + "step": 2343 + }, + { + "epoch": 0.16306654144491983, + "grad_norm": 0.81640625, + "learning_rate": 0.0019086166392735745, + "loss": 3.1014, + "step": 2344 + }, + { + "epoch": 0.16313610908205503, + "grad_norm": 0.68359375, + "learning_rate": 0.001908522510121084, + "loss": 3.6301, + "step": 2345 + }, + { + "epoch": 0.16320567671919023, + "grad_norm": 0.87109375, + "learning_rate": 0.0019084283348383994, + "loss": 2.8503, + "step": 2346 + }, + { + "epoch": 0.16327524435632543, + "grad_norm": 0.796875, + "learning_rate": 0.0019083341134303034, + "loss": 3.1469, + "step": 2347 + }, + { + "epoch": 0.16334481199346065, + "grad_norm": 0.890625, + "learning_rate": 0.001908239845901579, + "loss": 3.2092, + "step": 2348 + }, + { + "epoch": 0.16341437963059585, + "grad_norm": 0.6953125, + "learning_rate": 0.0019081455322570134, + "loss": 3.4614, + "step": 2349 + }, + { + "epoch": 0.16348394726773105, + "grad_norm": 0.7109375, + "learning_rate": 0.001908051172501395, + "loss": 2.7877, + "step": 2350 + }, + { + "epoch": 0.16355351490486625, + "grad_norm": 0.703125, + "learning_rate": 0.0019079567666395146, + "loss": 3.1249, + "step": 2351 + }, + { + "epoch": 0.16362308254200147, + "grad_norm": 0.80078125, + "learning_rate": 0.0019078623146761662, + "loss": 3.3158, + "step": 2352 + }, + { + "epoch": 0.16369265017913667, + "grad_norm": 0.828125, + "learning_rate": 0.0019077678166161457, + "loss": 3.1093, + "step": 2353 + }, + { + "epoch": 0.16376221781627187, + "grad_norm": 0.84765625, + "learning_rate": 0.0019076732724642507, + "loss": 3.0221, + "step": 2354 + }, + { + "epoch": 0.16383178545340707, + "grad_norm": 0.90625, + "learning_rate": 0.0019075786822252822, + "loss": 3.3588, + "step": 2355 + }, + { + "epoch": 0.16390135309054227, + "grad_norm": 0.5546875, + "learning_rate": 0.0019074840459040426, + "loss": 3.5697, + "step": 2356 + }, + { + "epoch": 0.1639709207276775, + "grad_norm": 0.84765625, + "learning_rate": 0.0019073893635053372, + "loss": 3.0052, + "step": 2357 + }, + { + "epoch": 0.1640404883648127, + "grad_norm": 1.1015625, + "learning_rate": 0.0019072946350339732, + "loss": 2.9758, + "step": 2358 + }, + { + "epoch": 0.1641100560019479, + "grad_norm": 0.88671875, + "learning_rate": 0.0019071998604947612, + "loss": 2.8526, + "step": 2359 + }, + { + "epoch": 0.1641796236390831, + "grad_norm": 0.76953125, + "learning_rate": 0.0019071050398925128, + "loss": 3.6262, + "step": 2360 + }, + { + "epoch": 0.16424919127621831, + "grad_norm": 0.85546875, + "learning_rate": 0.0019070101732320426, + "loss": 3.246, + "step": 2361 + }, + { + "epoch": 0.1643187589133535, + "grad_norm": 0.6484375, + "learning_rate": 0.0019069152605181673, + "loss": 3.2814, + "step": 2362 + }, + { + "epoch": 0.1643883265504887, + "grad_norm": 0.8046875, + "learning_rate": 0.0019068203017557064, + "loss": 3.2555, + "step": 2363 + }, + { + "epoch": 0.1644578941876239, + "grad_norm": 0.73046875, + "learning_rate": 0.0019067252969494812, + "loss": 2.951, + "step": 2364 + }, + { + "epoch": 0.16452746182475914, + "grad_norm": 0.81640625, + "learning_rate": 0.0019066302461043158, + "loss": 3.3416, + "step": 2365 + }, + { + "epoch": 0.16459702946189433, + "grad_norm": 0.7265625, + "learning_rate": 0.0019065351492250362, + "loss": 3.3133, + "step": 2366 + }, + { + "epoch": 0.16466659709902953, + "grad_norm": 0.6796875, + "learning_rate": 0.0019064400063164711, + "loss": 3.5457, + "step": 2367 + }, + { + "epoch": 0.16473616473616473, + "grad_norm": 0.8046875, + "learning_rate": 0.001906344817383451, + "loss": 3.1784, + "step": 2368 + }, + { + "epoch": 0.16480573237329993, + "grad_norm": 0.8046875, + "learning_rate": 0.0019062495824308098, + "loss": 3.2771, + "step": 2369 + }, + { + "epoch": 0.16487530001043516, + "grad_norm": 0.92578125, + "learning_rate": 0.0019061543014633822, + "loss": 2.851, + "step": 2370 + }, + { + "epoch": 0.16494486764757035, + "grad_norm": 1.25, + "learning_rate": 0.0019060589744860068, + "loss": 3.1726, + "step": 2371 + }, + { + "epoch": 0.16501443528470555, + "grad_norm": 0.77734375, + "learning_rate": 0.0019059636015035235, + "loss": 3.5959, + "step": 2372 + }, + { + "epoch": 0.16508400292184075, + "grad_norm": 0.93359375, + "learning_rate": 0.0019058681825207748, + "loss": 3.3155, + "step": 2373 + }, + { + "epoch": 0.16515357055897598, + "grad_norm": 0.890625, + "learning_rate": 0.001905772717542606, + "loss": 3.2452, + "step": 2374 + }, + { + "epoch": 0.16522313819611117, + "grad_norm": 0.77734375, + "learning_rate": 0.0019056772065738636, + "loss": 3.0752, + "step": 2375 + }, + { + "epoch": 0.16529270583324637, + "grad_norm": 0.70703125, + "learning_rate": 0.0019055816496193981, + "loss": 3.1546, + "step": 2376 + }, + { + "epoch": 0.16536227347038157, + "grad_norm": 0.8984375, + "learning_rate": 0.0019054860466840606, + "loss": 3.1118, + "step": 2377 + }, + { + "epoch": 0.16543184110751677, + "grad_norm": 0.859375, + "learning_rate": 0.0019053903977727057, + "loss": 3.4787, + "step": 2378 + }, + { + "epoch": 0.165501408744652, + "grad_norm": 0.72265625, + "learning_rate": 0.0019052947028901897, + "loss": 3.3528, + "step": 2379 + }, + { + "epoch": 0.1655709763817872, + "grad_norm": 0.83203125, + "learning_rate": 0.0019051989620413718, + "loss": 3.4341, + "step": 2380 + }, + { + "epoch": 0.1656405440189224, + "grad_norm": 0.87109375, + "learning_rate": 0.0019051031752311135, + "loss": 3.4025, + "step": 2381 + }, + { + "epoch": 0.1657101116560576, + "grad_norm": 0.85546875, + "learning_rate": 0.0019050073424642779, + "loss": 3.6356, + "step": 2382 + }, + { + "epoch": 0.16577967929319282, + "grad_norm": 0.82421875, + "learning_rate": 0.0019049114637457306, + "loss": 3.1271, + "step": 2383 + }, + { + "epoch": 0.16584924693032801, + "grad_norm": 0.75390625, + "learning_rate": 0.0019048155390803405, + "loss": 3.3048, + "step": 2384 + }, + { + "epoch": 0.1659188145674632, + "grad_norm": 0.91796875, + "learning_rate": 0.0019047195684729781, + "loss": 3.3535, + "step": 2385 + }, + { + "epoch": 0.1659883822045984, + "grad_norm": 0.83984375, + "learning_rate": 0.001904623551928516, + "loss": 3.0647, + "step": 2386 + }, + { + "epoch": 0.16605794984173364, + "grad_norm": 0.85546875, + "learning_rate": 0.0019045274894518296, + "loss": 3.1102, + "step": 2387 + }, + { + "epoch": 0.16612751747886884, + "grad_norm": 0.71484375, + "learning_rate": 0.0019044313810477964, + "loss": 3.2833, + "step": 2388 + }, + { + "epoch": 0.16619708511600403, + "grad_norm": 0.6875, + "learning_rate": 0.0019043352267212965, + "loss": 3.1556, + "step": 2389 + }, + { + "epoch": 0.16626665275313923, + "grad_norm": 0.6953125, + "learning_rate": 0.0019042390264772118, + "loss": 3.3388, + "step": 2390 + }, + { + "epoch": 0.16633622039027443, + "grad_norm": 0.9609375, + "learning_rate": 0.001904142780320427, + "loss": 2.995, + "step": 2391 + }, + { + "epoch": 0.16640578802740966, + "grad_norm": 0.63671875, + "learning_rate": 0.0019040464882558292, + "loss": 3.6169, + "step": 2392 + }, + { + "epoch": 0.16647535566454486, + "grad_norm": 0.77734375, + "learning_rate": 0.0019039501502883071, + "loss": 2.7984, + "step": 2393 + }, + { + "epoch": 0.16654492330168005, + "grad_norm": 0.80859375, + "learning_rate": 0.001903853766422753, + "loss": 3.3614, + "step": 2394 + }, + { + "epoch": 0.16661449093881525, + "grad_norm": 0.90625, + "learning_rate": 0.0019037573366640604, + "loss": 3.0099, + "step": 2395 + }, + { + "epoch": 0.16668405857595048, + "grad_norm": 0.65234375, + "learning_rate": 0.0019036608610171256, + "loss": 3.6974, + "step": 2396 + }, + { + "epoch": 0.16675362621308568, + "grad_norm": 0.72265625, + "learning_rate": 0.0019035643394868468, + "loss": 3.5691, + "step": 2397 + }, + { + "epoch": 0.16682319385022087, + "grad_norm": 0.91796875, + "learning_rate": 0.001903467772078125, + "loss": 3.4569, + "step": 2398 + }, + { + "epoch": 0.16689276148735607, + "grad_norm": 0.80078125, + "learning_rate": 0.0019033711587958639, + "loss": 2.9937, + "step": 2399 + }, + { + "epoch": 0.1669623291244913, + "grad_norm": 0.69140625, + "learning_rate": 0.0019032744996449688, + "loss": 3.3972, + "step": 2400 + }, + { + "epoch": 0.1670318967616265, + "grad_norm": 0.76171875, + "learning_rate": 0.001903177794630347, + "loss": 3.1435, + "step": 2401 + }, + { + "epoch": 0.1671014643987617, + "grad_norm": 0.8125, + "learning_rate": 0.0019030810437569096, + "loss": 3.33, + "step": 2402 + }, + { + "epoch": 0.1671710320358969, + "grad_norm": 1.0546875, + "learning_rate": 0.0019029842470295682, + "loss": 3.0786, + "step": 2403 + }, + { + "epoch": 0.1672405996730321, + "grad_norm": 0.8515625, + "learning_rate": 0.0019028874044532383, + "loss": 3.3754, + "step": 2404 + }, + { + "epoch": 0.16731016731016732, + "grad_norm": 1.0390625, + "learning_rate": 0.001902790516032837, + "loss": 3.0439, + "step": 2405 + }, + { + "epoch": 0.16737973494730252, + "grad_norm": 0.69921875, + "learning_rate": 0.0019026935817732836, + "loss": 3.9029, + "step": 2406 + }, + { + "epoch": 0.16744930258443771, + "grad_norm": 1.0078125, + "learning_rate": 0.0019025966016795, + "loss": 3.1789, + "step": 2407 + }, + { + "epoch": 0.1675188702215729, + "grad_norm": 1.03125, + "learning_rate": 0.0019024995757564102, + "loss": 3.7364, + "step": 2408 + }, + { + "epoch": 0.16758843785870814, + "grad_norm": 0.73828125, + "learning_rate": 0.0019024025040089412, + "loss": 3.3665, + "step": 2409 + }, + { + "epoch": 0.16765800549584334, + "grad_norm": 0.8203125, + "learning_rate": 0.0019023053864420216, + "loss": 3.232, + "step": 2410 + }, + { + "epoch": 0.16772757313297854, + "grad_norm": 0.81640625, + "learning_rate": 0.0019022082230605822, + "loss": 3.2489, + "step": 2411 + }, + { + "epoch": 0.16779714077011373, + "grad_norm": 0.84375, + "learning_rate": 0.0019021110138695567, + "loss": 3.1511, + "step": 2412 + }, + { + "epoch": 0.16786670840724896, + "grad_norm": 0.78125, + "learning_rate": 0.0019020137588738808, + "loss": 3.2653, + "step": 2413 + }, + { + "epoch": 0.16793627604438416, + "grad_norm": 0.76953125, + "learning_rate": 0.001901916458078493, + "loss": 3.3701, + "step": 2414 + }, + { + "epoch": 0.16800584368151936, + "grad_norm": 0.74609375, + "learning_rate": 0.0019018191114883332, + "loss": 3.3645, + "step": 2415 + }, + { + "epoch": 0.16807541131865456, + "grad_norm": 0.99609375, + "learning_rate": 0.0019017217191083446, + "loss": 2.9964, + "step": 2416 + }, + { + "epoch": 0.16814497895578975, + "grad_norm": 0.98046875, + "learning_rate": 0.001901624280943472, + "loss": 3.108, + "step": 2417 + }, + { + "epoch": 0.16821454659292498, + "grad_norm": 0.78515625, + "learning_rate": 0.001901526796998663, + "loss": 3.2885, + "step": 2418 + }, + { + "epoch": 0.16828411423006018, + "grad_norm": 0.84765625, + "learning_rate": 0.0019014292672788673, + "loss": 3.3568, + "step": 2419 + }, + { + "epoch": 0.16835368186719538, + "grad_norm": 0.85546875, + "learning_rate": 0.0019013316917890369, + "loss": 3.131, + "step": 2420 + }, + { + "epoch": 0.16842324950433057, + "grad_norm": 0.72265625, + "learning_rate": 0.0019012340705341262, + "loss": 3.459, + "step": 2421 + }, + { + "epoch": 0.1684928171414658, + "grad_norm": 0.859375, + "learning_rate": 0.001901136403519092, + "loss": 3.1376, + "step": 2422 + }, + { + "epoch": 0.168562384778601, + "grad_norm": 0.890625, + "learning_rate": 0.0019010386907488933, + "loss": 2.9253, + "step": 2423 + }, + { + "epoch": 0.1686319524157362, + "grad_norm": 0.74609375, + "learning_rate": 0.0019009409322284915, + "loss": 3.3112, + "step": 2424 + }, + { + "epoch": 0.1687015200528714, + "grad_norm": 0.76171875, + "learning_rate": 0.00190084312796285, + "loss": 3.2567, + "step": 2425 + }, + { + "epoch": 0.16877108769000662, + "grad_norm": 0.74609375, + "learning_rate": 0.0019007452779569354, + "loss": 3.2253, + "step": 2426 + }, + { + "epoch": 0.16884065532714182, + "grad_norm": 0.671875, + "learning_rate": 0.0019006473822157153, + "loss": 3.2455, + "step": 2427 + }, + { + "epoch": 0.16891022296427702, + "grad_norm": 0.59765625, + "learning_rate": 0.001900549440744161, + "loss": 3.3872, + "step": 2428 + }, + { + "epoch": 0.16897979060141222, + "grad_norm": 0.80859375, + "learning_rate": 0.001900451453547245, + "loss": 3.3497, + "step": 2429 + }, + { + "epoch": 0.16904935823854741, + "grad_norm": 0.92578125, + "learning_rate": 0.001900353420629943, + "loss": 3.0166, + "step": 2430 + }, + { + "epoch": 0.16911892587568264, + "grad_norm": 0.72265625, + "learning_rate": 0.0019002553419972324, + "loss": 3.2642, + "step": 2431 + }, + { + "epoch": 0.16918849351281784, + "grad_norm": 0.79296875, + "learning_rate": 0.001900157217654093, + "loss": 3.4829, + "step": 2432 + }, + { + "epoch": 0.16925806114995304, + "grad_norm": 0.98046875, + "learning_rate": 0.0019000590476055076, + "loss": 3.4229, + "step": 2433 + }, + { + "epoch": 0.16932762878708824, + "grad_norm": 0.9140625, + "learning_rate": 0.00189996083185646, + "loss": 3.0214, + "step": 2434 + }, + { + "epoch": 0.16939719642422346, + "grad_norm": 0.85546875, + "learning_rate": 0.0018998625704119377, + "loss": 3.3381, + "step": 2435 + }, + { + "epoch": 0.16946676406135866, + "grad_norm": 0.75390625, + "learning_rate": 0.0018997642632769297, + "loss": 3.724, + "step": 2436 + }, + { + "epoch": 0.16953633169849386, + "grad_norm": 0.6953125, + "learning_rate": 0.0018996659104564273, + "loss": 3.2562, + "step": 2437 + }, + { + "epoch": 0.16960589933562906, + "grad_norm": 0.890625, + "learning_rate": 0.001899567511955425, + "loss": 2.8605, + "step": 2438 + }, + { + "epoch": 0.16967546697276428, + "grad_norm": 0.80859375, + "learning_rate": 0.0018994690677789183, + "loss": 3.4274, + "step": 2439 + }, + { + "epoch": 0.16974503460989948, + "grad_norm": 0.87109375, + "learning_rate": 0.0018993705779319062, + "loss": 3.4065, + "step": 2440 + }, + { + "epoch": 0.16981460224703468, + "grad_norm": 0.7265625, + "learning_rate": 0.0018992720424193892, + "loss": 3.3679, + "step": 2441 + }, + { + "epoch": 0.16988416988416988, + "grad_norm": 1.046875, + "learning_rate": 0.0018991734612463706, + "loss": 3.1656, + "step": 2442 + }, + { + "epoch": 0.16995373752130508, + "grad_norm": 0.9765625, + "learning_rate": 0.001899074834417856, + "loss": 2.9411, + "step": 2443 + }, + { + "epoch": 0.1700233051584403, + "grad_norm": 0.8046875, + "learning_rate": 0.0018989761619388527, + "loss": 3.1267, + "step": 2444 + }, + { + "epoch": 0.1700928727955755, + "grad_norm": 1.140625, + "learning_rate": 0.0018988774438143713, + "loss": 3.4626, + "step": 2445 + }, + { + "epoch": 0.1701624404327107, + "grad_norm": 0.6875, + "learning_rate": 0.0018987786800494235, + "loss": 2.9381, + "step": 2446 + }, + { + "epoch": 0.1702320080698459, + "grad_norm": 0.82421875, + "learning_rate": 0.001898679870649025, + "loss": 3.3637, + "step": 2447 + }, + { + "epoch": 0.17030157570698112, + "grad_norm": 1.03125, + "learning_rate": 0.0018985810156181922, + "loss": 3.1177, + "step": 2448 + }, + { + "epoch": 0.17037114334411632, + "grad_norm": 0.71484375, + "learning_rate": 0.0018984821149619444, + "loss": 3.3731, + "step": 2449 + }, + { + "epoch": 0.17044071098125152, + "grad_norm": 0.69921875, + "learning_rate": 0.001898383168685304, + "loss": 3.191, + "step": 2450 + }, + { + "epoch": 0.17051027861838672, + "grad_norm": 0.59375, + "learning_rate": 0.001898284176793294, + "loss": 3.696, + "step": 2451 + }, + { + "epoch": 0.17057984625552194, + "grad_norm": 0.6796875, + "learning_rate": 0.0018981851392909413, + "loss": 3.4107, + "step": 2452 + }, + { + "epoch": 0.17064941389265714, + "grad_norm": 0.97265625, + "learning_rate": 0.0018980860561832746, + "loss": 3.6169, + "step": 2453 + }, + { + "epoch": 0.17071898152979234, + "grad_norm": 0.76171875, + "learning_rate": 0.0018979869274753246, + "loss": 3.0298, + "step": 2454 + }, + { + "epoch": 0.17078854916692754, + "grad_norm": 0.765625, + "learning_rate": 0.0018978877531721245, + "loss": 3.2814, + "step": 2455 + }, + { + "epoch": 0.17085811680406274, + "grad_norm": 0.9609375, + "learning_rate": 0.00189778853327871, + "loss": 2.9714, + "step": 2456 + }, + { + "epoch": 0.17092768444119796, + "grad_norm": 1.140625, + "learning_rate": 0.001897689267800119, + "loss": 3.0664, + "step": 2457 + }, + { + "epoch": 0.17099725207833316, + "grad_norm": 1.0, + "learning_rate": 0.0018975899567413915, + "loss": 2.9897, + "step": 2458 + }, + { + "epoch": 0.17106681971546836, + "grad_norm": 1.0859375, + "learning_rate": 0.0018974906001075706, + "loss": 3.3436, + "step": 2459 + }, + { + "epoch": 0.17113638735260356, + "grad_norm": 0.734375, + "learning_rate": 0.0018973911979037004, + "loss": 3.2716, + "step": 2460 + }, + { + "epoch": 0.17120595498973878, + "grad_norm": 0.8671875, + "learning_rate": 0.0018972917501348283, + "loss": 3.4302, + "step": 2461 + }, + { + "epoch": 0.17127552262687398, + "grad_norm": 0.765625, + "learning_rate": 0.001897192256806004, + "loss": 3.034, + "step": 2462 + }, + { + "epoch": 0.17134509026400918, + "grad_norm": 0.8359375, + "learning_rate": 0.001897092717922279, + "loss": 3.1561, + "step": 2463 + }, + { + "epoch": 0.17141465790114438, + "grad_norm": 0.93359375, + "learning_rate": 0.0018969931334887073, + "loss": 3.3995, + "step": 2464 + }, + { + "epoch": 0.1714842255382796, + "grad_norm": 1.0234375, + "learning_rate": 0.0018968935035103458, + "loss": 3.1383, + "step": 2465 + }, + { + "epoch": 0.1715537931754148, + "grad_norm": 0.86328125, + "learning_rate": 0.0018967938279922528, + "loss": 3.1904, + "step": 2466 + }, + { + "epoch": 0.17162336081255, + "grad_norm": 0.8828125, + "learning_rate": 0.0018966941069394894, + "loss": 3.2457, + "step": 2467 + }, + { + "epoch": 0.1716929284496852, + "grad_norm": 1.046875, + "learning_rate": 0.001896594340357119, + "loss": 3.4366, + "step": 2468 + }, + { + "epoch": 0.1717624960868204, + "grad_norm": 0.75390625, + "learning_rate": 0.001896494528250207, + "loss": 3.1295, + "step": 2469 + }, + { + "epoch": 0.17183206372395562, + "grad_norm": 0.7109375, + "learning_rate": 0.0018963946706238213, + "loss": 3.4161, + "step": 2470 + }, + { + "epoch": 0.17190163136109082, + "grad_norm": 0.71484375, + "learning_rate": 0.0018962947674830324, + "loss": 3.6559, + "step": 2471 + }, + { + "epoch": 0.17197119899822602, + "grad_norm": 0.75, + "learning_rate": 0.0018961948188329133, + "loss": 3.0453, + "step": 2472 + }, + { + "epoch": 0.17204076663536122, + "grad_norm": 0.7890625, + "learning_rate": 0.0018960948246785382, + "loss": 3.4069, + "step": 2473 + }, + { + "epoch": 0.17211033427249645, + "grad_norm": 0.8359375, + "learning_rate": 0.0018959947850249845, + "loss": 3.2656, + "step": 2474 + }, + { + "epoch": 0.17217990190963164, + "grad_norm": 0.7265625, + "learning_rate": 0.0018958946998773318, + "loss": 3.3958, + "step": 2475 + }, + { + "epoch": 0.17224946954676684, + "grad_norm": 0.69140625, + "learning_rate": 0.0018957945692406621, + "loss": 3.2935, + "step": 2476 + }, + { + "epoch": 0.17231903718390204, + "grad_norm": 0.59375, + "learning_rate": 0.0018956943931200591, + "loss": 3.5589, + "step": 2477 + }, + { + "epoch": 0.17238860482103727, + "grad_norm": 0.78125, + "learning_rate": 0.0018955941715206096, + "loss": 2.9786, + "step": 2478 + }, + { + "epoch": 0.17245817245817247, + "grad_norm": 0.76171875, + "learning_rate": 0.001895493904447402, + "loss": 3.2443, + "step": 2479 + }, + { + "epoch": 0.17252774009530766, + "grad_norm": 1.0, + "learning_rate": 0.0018953935919055276, + "loss": 3.1005, + "step": 2480 + }, + { + "epoch": 0.17259730773244286, + "grad_norm": 0.90625, + "learning_rate": 0.00189529323390008, + "loss": 3.1678, + "step": 2481 + }, + { + "epoch": 0.17266687536957806, + "grad_norm": 0.828125, + "learning_rate": 0.0018951928304361543, + "loss": 2.9976, + "step": 2482 + }, + { + "epoch": 0.17273644300671329, + "grad_norm": 1.15625, + "learning_rate": 0.001895092381518849, + "loss": 3.3298, + "step": 2483 + }, + { + "epoch": 0.17280601064384848, + "grad_norm": 1.0390625, + "learning_rate": 0.0018949918871532638, + "loss": 3.0909, + "step": 2484 + }, + { + "epoch": 0.17287557828098368, + "grad_norm": 0.609375, + "learning_rate": 0.001894891347344502, + "loss": 3.2494, + "step": 2485 + }, + { + "epoch": 0.17294514591811888, + "grad_norm": 1.0078125, + "learning_rate": 0.001894790762097668, + "loss": 3.1686, + "step": 2486 + }, + { + "epoch": 0.1730147135552541, + "grad_norm": 0.65625, + "learning_rate": 0.0018946901314178693, + "loss": 3.6562, + "step": 2487 + }, + { + "epoch": 0.1730842811923893, + "grad_norm": 0.82421875, + "learning_rate": 0.0018945894553102152, + "loss": 3.07, + "step": 2488 + }, + { + "epoch": 0.1731538488295245, + "grad_norm": 0.8203125, + "learning_rate": 0.0018944887337798177, + "loss": 2.9494, + "step": 2489 + }, + { + "epoch": 0.1732234164666597, + "grad_norm": 0.76953125, + "learning_rate": 0.0018943879668317906, + "loss": 3.03, + "step": 2490 + }, + { + "epoch": 0.17329298410379493, + "grad_norm": 0.78515625, + "learning_rate": 0.0018942871544712508, + "loss": 3.5412, + "step": 2491 + }, + { + "epoch": 0.17336255174093013, + "grad_norm": 0.8515625, + "learning_rate": 0.001894186296703317, + "loss": 3.2025, + "step": 2492 + }, + { + "epoch": 0.17343211937806532, + "grad_norm": 0.91796875, + "learning_rate": 0.00189408539353311, + "loss": 2.7529, + "step": 2493 + }, + { + "epoch": 0.17350168701520052, + "grad_norm": 0.78125, + "learning_rate": 0.001893984444965753, + "loss": 3.3874, + "step": 2494 + }, + { + "epoch": 0.17357125465233572, + "grad_norm": 0.7265625, + "learning_rate": 0.001893883451006372, + "loss": 3.3101, + "step": 2495 + }, + { + "epoch": 0.17364082228947095, + "grad_norm": 0.65234375, + "learning_rate": 0.001893782411660095, + "loss": 3.3576, + "step": 2496 + }, + { + "epoch": 0.17371038992660615, + "grad_norm": 1.0078125, + "learning_rate": 0.001893681326932052, + "loss": 3.1649, + "step": 2497 + }, + { + "epoch": 0.17377995756374134, + "grad_norm": 0.7578125, + "learning_rate": 0.0018935801968273758, + "loss": 2.8563, + "step": 2498 + }, + { + "epoch": 0.17384952520087654, + "grad_norm": 0.671875, + "learning_rate": 0.0018934790213512014, + "loss": 3.3319, + "step": 2499 + }, + { + "epoch": 0.17391909283801177, + "grad_norm": 0.765625, + "learning_rate": 0.0018933778005086653, + "loss": 3.5107, + "step": 2500 + }, + { + "epoch": 0.17398866047514697, + "grad_norm": 0.78515625, + "learning_rate": 0.0018932765343049076, + "loss": 3.4601, + "step": 2501 + }, + { + "epoch": 0.17405822811228217, + "grad_norm": 0.78125, + "learning_rate": 0.0018931752227450702, + "loss": 3.3166, + "step": 2502 + }, + { + "epoch": 0.17412779574941736, + "grad_norm": 0.83984375, + "learning_rate": 0.0018930738658342965, + "loss": 3.2444, + "step": 2503 + }, + { + "epoch": 0.1741973633865526, + "grad_norm": 0.61328125, + "learning_rate": 0.0018929724635777336, + "loss": 3.1286, + "step": 2504 + }, + { + "epoch": 0.1742669310236878, + "grad_norm": 0.79296875, + "learning_rate": 0.00189287101598053, + "loss": 3.2518, + "step": 2505 + }, + { + "epoch": 0.17433649866082299, + "grad_norm": 0.7265625, + "learning_rate": 0.0018927695230478365, + "loss": 3.1074, + "step": 2506 + }, + { + "epoch": 0.17440606629795818, + "grad_norm": 0.81640625, + "learning_rate": 0.0018926679847848064, + "loss": 3.2652, + "step": 2507 + }, + { + "epoch": 0.17447563393509338, + "grad_norm": 0.66796875, + "learning_rate": 0.0018925664011965955, + "loss": 3.2207, + "step": 2508 + }, + { + "epoch": 0.1745452015722286, + "grad_norm": 0.9296875, + "learning_rate": 0.0018924647722883617, + "loss": 2.8783, + "step": 2509 + }, + { + "epoch": 0.1746147692093638, + "grad_norm": 1.03125, + "learning_rate": 0.0018923630980652649, + "loss": 3.0064, + "step": 2510 + }, + { + "epoch": 0.174684336846499, + "grad_norm": 0.9140625, + "learning_rate": 0.001892261378532468, + "loss": 3.5639, + "step": 2511 + }, + { + "epoch": 0.1747539044836342, + "grad_norm": 0.8359375, + "learning_rate": 0.0018921596136951355, + "loss": 3.1965, + "step": 2512 + }, + { + "epoch": 0.17482347212076943, + "grad_norm": 0.71875, + "learning_rate": 0.0018920578035584348, + "loss": 2.857, + "step": 2513 + }, + { + "epoch": 0.17489303975790463, + "grad_norm": 0.86328125, + "learning_rate": 0.001891955948127535, + "loss": 2.7816, + "step": 2514 + }, + { + "epoch": 0.17496260739503983, + "grad_norm": 0.8125, + "learning_rate": 0.0018918540474076081, + "loss": 3.2822, + "step": 2515 + }, + { + "epoch": 0.17503217503217502, + "grad_norm": 0.94921875, + "learning_rate": 0.0018917521014038278, + "loss": 3.0852, + "step": 2516 + }, + { + "epoch": 0.17510174266931022, + "grad_norm": 0.91796875, + "learning_rate": 0.0018916501101213705, + "loss": 3.4133, + "step": 2517 + }, + { + "epoch": 0.17517131030644545, + "grad_norm": 0.921875, + "learning_rate": 0.001891548073565415, + "loss": 3.0457, + "step": 2518 + }, + { + "epoch": 0.17524087794358065, + "grad_norm": 0.81640625, + "learning_rate": 0.0018914459917411422, + "loss": 2.9106, + "step": 2519 + }, + { + "epoch": 0.17531044558071585, + "grad_norm": 0.89453125, + "learning_rate": 0.0018913438646537349, + "loss": 3.2036, + "step": 2520 + }, + { + "epoch": 0.17538001321785104, + "grad_norm": 0.890625, + "learning_rate": 0.0018912416923083791, + "loss": 2.8513, + "step": 2521 + }, + { + "epoch": 0.17544958085498627, + "grad_norm": 0.88671875, + "learning_rate": 0.0018911394747102622, + "loss": 3.2675, + "step": 2522 + }, + { + "epoch": 0.17551914849212147, + "grad_norm": 0.77734375, + "learning_rate": 0.0018910372118645742, + "loss": 3.3492, + "step": 2523 + }, + { + "epoch": 0.17558871612925667, + "grad_norm": 0.81640625, + "learning_rate": 0.001890934903776508, + "loss": 3.0853, + "step": 2524 + }, + { + "epoch": 0.17565828376639187, + "grad_norm": 0.8046875, + "learning_rate": 0.001890832550451258, + "loss": 3.0819, + "step": 2525 + }, + { + "epoch": 0.1757278514035271, + "grad_norm": 0.8125, + "learning_rate": 0.0018907301518940214, + "loss": 3.1941, + "step": 2526 + }, + { + "epoch": 0.1757974190406623, + "grad_norm": 0.81640625, + "learning_rate": 0.0018906277081099973, + "loss": 3.2225, + "step": 2527 + }, + { + "epoch": 0.1758669866777975, + "grad_norm": 0.86328125, + "learning_rate": 0.0018905252191043869, + "loss": 3.2995, + "step": 2528 + }, + { + "epoch": 0.17593655431493269, + "grad_norm": 0.8203125, + "learning_rate": 0.0018904226848823948, + "loss": 2.9885, + "step": 2529 + }, + { + "epoch": 0.17600612195206788, + "grad_norm": 1.4609375, + "learning_rate": 0.0018903201054492266, + "loss": 3.4344, + "step": 2530 + }, + { + "epoch": 0.1760756895892031, + "grad_norm": 0.79296875, + "learning_rate": 0.0018902174808100912, + "loss": 3.1054, + "step": 2531 + }, + { + "epoch": 0.1761452572263383, + "grad_norm": 0.68359375, + "learning_rate": 0.0018901148109701988, + "loss": 3.2626, + "step": 2532 + }, + { + "epoch": 0.1762148248634735, + "grad_norm": 0.83203125, + "learning_rate": 0.0018900120959347633, + "loss": 3.1817, + "step": 2533 + }, + { + "epoch": 0.1762843925006087, + "grad_norm": 0.97265625, + "learning_rate": 0.0018899093357089992, + "loss": 3.0, + "step": 2534 + }, + { + "epoch": 0.17635396013774393, + "grad_norm": 0.71875, + "learning_rate": 0.0018898065302981246, + "loss": 3.1899, + "step": 2535 + }, + { + "epoch": 0.17642352777487913, + "grad_norm": 0.84375, + "learning_rate": 0.0018897036797073594, + "loss": 2.9696, + "step": 2536 + }, + { + "epoch": 0.17649309541201433, + "grad_norm": 0.81640625, + "learning_rate": 0.0018896007839419259, + "loss": 2.9419, + "step": 2537 + }, + { + "epoch": 0.17656266304914953, + "grad_norm": 0.70703125, + "learning_rate": 0.0018894978430070482, + "loss": 3.2079, + "step": 2538 + }, + { + "epoch": 0.17663223068628475, + "grad_norm": 0.953125, + "learning_rate": 0.0018893948569079536, + "loss": 3.162, + "step": 2539 + }, + { + "epoch": 0.17670179832341995, + "grad_norm": 0.9296875, + "learning_rate": 0.001889291825649871, + "loss": 3.4439, + "step": 2540 + }, + { + "epoch": 0.17677136596055515, + "grad_norm": 0.87109375, + "learning_rate": 0.001889188749238032, + "loss": 3.2802, + "step": 2541 + }, + { + "epoch": 0.17684093359769035, + "grad_norm": 0.609375, + "learning_rate": 0.00188908562767767, + "loss": 3.414, + "step": 2542 + }, + { + "epoch": 0.17691050123482555, + "grad_norm": 0.9453125, + "learning_rate": 0.001888982460974021, + "loss": 2.7324, + "step": 2543 + }, + { + "epoch": 0.17698006887196077, + "grad_norm": 0.8046875, + "learning_rate": 0.001888879249132324, + "loss": 3.433, + "step": 2544 + }, + { + "epoch": 0.17704963650909597, + "grad_norm": 0.79296875, + "learning_rate": 0.0018887759921578184, + "loss": 3.1548, + "step": 2545 + }, + { + "epoch": 0.17711920414623117, + "grad_norm": 0.90234375, + "learning_rate": 0.001888672690055748, + "loss": 3.1417, + "step": 2546 + }, + { + "epoch": 0.17718877178336637, + "grad_norm": 0.796875, + "learning_rate": 0.0018885693428313576, + "loss": 3.1294, + "step": 2547 + }, + { + "epoch": 0.1772583394205016, + "grad_norm": 0.734375, + "learning_rate": 0.0018884659504898947, + "loss": 3.7066, + "step": 2548 + }, + { + "epoch": 0.1773279070576368, + "grad_norm": 0.9296875, + "learning_rate": 0.001888362513036609, + "loss": 3.0523, + "step": 2549 + }, + { + "epoch": 0.177397474694772, + "grad_norm": 0.90234375, + "learning_rate": 0.0018882590304767526, + "loss": 2.8411, + "step": 2550 + }, + { + "epoch": 0.1774670423319072, + "grad_norm": 0.96875, + "learning_rate": 0.0018881555028155796, + "loss": 3.0223, + "step": 2551 + }, + { + "epoch": 0.1775366099690424, + "grad_norm": 0.7421875, + "learning_rate": 0.0018880519300583471, + "loss": 3.3749, + "step": 2552 + }, + { + "epoch": 0.1776061776061776, + "grad_norm": 0.8359375, + "learning_rate": 0.0018879483122103136, + "loss": 2.9869, + "step": 2553 + }, + { + "epoch": 0.1776757452433128, + "grad_norm": 0.77734375, + "learning_rate": 0.0018878446492767403, + "loss": 3.4166, + "step": 2554 + }, + { + "epoch": 0.177745312880448, + "grad_norm": 0.80859375, + "learning_rate": 0.0018877409412628907, + "loss": 3.3083, + "step": 2555 + }, + { + "epoch": 0.1778148805175832, + "grad_norm": 0.83984375, + "learning_rate": 0.0018876371881740308, + "loss": 2.9763, + "step": 2556 + }, + { + "epoch": 0.17788444815471843, + "grad_norm": 0.67578125, + "learning_rate": 0.0018875333900154289, + "loss": 3.5804, + "step": 2557 + }, + { + "epoch": 0.17795401579185363, + "grad_norm": 0.9375, + "learning_rate": 0.0018874295467923544, + "loss": 2.897, + "step": 2558 + }, + { + "epoch": 0.17802358342898883, + "grad_norm": 0.7265625, + "learning_rate": 0.0018873256585100807, + "loss": 3.2606, + "step": 2559 + }, + { + "epoch": 0.17809315106612403, + "grad_norm": 0.8984375, + "learning_rate": 0.0018872217251738824, + "loss": 3.6405, + "step": 2560 + }, + { + "epoch": 0.17816271870325925, + "grad_norm": 0.7734375, + "learning_rate": 0.0018871177467890369, + "loss": 3.1806, + "step": 2561 + }, + { + "epoch": 0.17823228634039445, + "grad_norm": 0.75, + "learning_rate": 0.0018870137233608236, + "loss": 3.3116, + "step": 2562 + }, + { + "epoch": 0.17830185397752965, + "grad_norm": 0.765625, + "learning_rate": 0.0018869096548945242, + "loss": 3.1411, + "step": 2563 + }, + { + "epoch": 0.17837142161466485, + "grad_norm": 0.703125, + "learning_rate": 0.0018868055413954231, + "loss": 3.2375, + "step": 2564 + }, + { + "epoch": 0.17844098925180008, + "grad_norm": 0.72265625, + "learning_rate": 0.0018867013828688065, + "loss": 3.5075, + "step": 2565 + }, + { + "epoch": 0.17851055688893527, + "grad_norm": 0.7265625, + "learning_rate": 0.0018865971793199626, + "loss": 3.1211, + "step": 2566 + }, + { + "epoch": 0.17858012452607047, + "grad_norm": 0.95703125, + "learning_rate": 0.001886492930754183, + "loss": 3.4119, + "step": 2567 + }, + { + "epoch": 0.17864969216320567, + "grad_norm": 0.82421875, + "learning_rate": 0.0018863886371767605, + "loss": 3.0101, + "step": 2568 + }, + { + "epoch": 0.17871925980034087, + "grad_norm": 0.828125, + "learning_rate": 0.0018862842985929906, + "loss": 3.0612, + "step": 2569 + }, + { + "epoch": 0.1787888274374761, + "grad_norm": 0.9140625, + "learning_rate": 0.0018861799150081719, + "loss": 3.1752, + "step": 2570 + }, + { + "epoch": 0.1788583950746113, + "grad_norm": 0.98046875, + "learning_rate": 0.0018860754864276031, + "loss": 3.3085, + "step": 2571 + }, + { + "epoch": 0.1789279627117465, + "grad_norm": 0.9453125, + "learning_rate": 0.0018859710128565875, + "loss": 3.4382, + "step": 2572 + }, + { + "epoch": 0.1789975303488817, + "grad_norm": 0.93359375, + "learning_rate": 0.0018858664943004295, + "loss": 3.635, + "step": 2573 + }, + { + "epoch": 0.17906709798601692, + "grad_norm": 0.72265625, + "learning_rate": 0.001885761930764436, + "loss": 3.3867, + "step": 2574 + }, + { + "epoch": 0.1791366656231521, + "grad_norm": 0.81640625, + "learning_rate": 0.0018856573222539163, + "loss": 3.0955, + "step": 2575 + }, + { + "epoch": 0.1792062332602873, + "grad_norm": 1.0234375, + "learning_rate": 0.0018855526687741816, + "loss": 3.4234, + "step": 2576 + }, + { + "epoch": 0.1792758008974225, + "grad_norm": 0.75, + "learning_rate": 0.001885447970330546, + "loss": 2.7572, + "step": 2577 + }, + { + "epoch": 0.17934536853455774, + "grad_norm": 1.03125, + "learning_rate": 0.0018853432269283254, + "loss": 3.193, + "step": 2578 + }, + { + "epoch": 0.17941493617169293, + "grad_norm": 0.8359375, + "learning_rate": 0.0018852384385728382, + "loss": 3.1121, + "step": 2579 + }, + { + "epoch": 0.17948450380882813, + "grad_norm": 0.8671875, + "learning_rate": 0.0018851336052694051, + "loss": 3.2991, + "step": 2580 + }, + { + "epoch": 0.17955407144596333, + "grad_norm": 0.7578125, + "learning_rate": 0.0018850287270233488, + "loss": 3.3188, + "step": 2581 + }, + { + "epoch": 0.17962363908309853, + "grad_norm": 0.921875, + "learning_rate": 0.001884923803839995, + "loss": 3.432, + "step": 2582 + }, + { + "epoch": 0.17969320672023376, + "grad_norm": 1.046875, + "learning_rate": 0.0018848188357246706, + "loss": 3.5215, + "step": 2583 + }, + { + "epoch": 0.17976277435736895, + "grad_norm": 0.953125, + "learning_rate": 0.0018847138226827053, + "loss": 3.0869, + "step": 2584 + }, + { + "epoch": 0.17983234199450415, + "grad_norm": 0.9296875, + "learning_rate": 0.0018846087647194315, + "loss": 3.4616, + "step": 2585 + }, + { + "epoch": 0.17990190963163935, + "grad_norm": 0.70703125, + "learning_rate": 0.0018845036618401834, + "loss": 3.6504, + "step": 2586 + }, + { + "epoch": 0.17997147726877458, + "grad_norm": 0.77734375, + "learning_rate": 0.0018843985140502976, + "loss": 2.9132, + "step": 2587 + }, + { + "epoch": 0.18004104490590978, + "grad_norm": 0.7578125, + "learning_rate": 0.001884293321355113, + "loss": 3.5805, + "step": 2588 + }, + { + "epoch": 0.18011061254304497, + "grad_norm": 1.0859375, + "learning_rate": 0.0018841880837599705, + "loss": 3.0303, + "step": 2589 + }, + { + "epoch": 0.18018018018018017, + "grad_norm": 0.9296875, + "learning_rate": 0.001884082801270214, + "loss": 3.5147, + "step": 2590 + }, + { + "epoch": 0.1802497478173154, + "grad_norm": 0.88671875, + "learning_rate": 0.0018839774738911889, + "loss": 3.0765, + "step": 2591 + }, + { + "epoch": 0.1803193154544506, + "grad_norm": 0.890625, + "learning_rate": 0.0018838721016282433, + "loss": 3.494, + "step": 2592 + }, + { + "epoch": 0.1803888830915858, + "grad_norm": 0.7578125, + "learning_rate": 0.0018837666844867273, + "loss": 3.1105, + "step": 2593 + }, + { + "epoch": 0.180458450728721, + "grad_norm": 0.7734375, + "learning_rate": 0.0018836612224719938, + "loss": 3.0279, + "step": 2594 + }, + { + "epoch": 0.1805280183658562, + "grad_norm": 0.99609375, + "learning_rate": 0.001883555715589397, + "loss": 3.5961, + "step": 2595 + }, + { + "epoch": 0.18059758600299142, + "grad_norm": 0.80078125, + "learning_rate": 0.0018834501638442947, + "loss": 3.3071, + "step": 2596 + }, + { + "epoch": 0.18066715364012662, + "grad_norm": 0.734375, + "learning_rate": 0.001883344567242046, + "loss": 3.5241, + "step": 2597 + }, + { + "epoch": 0.1807367212772618, + "grad_norm": 1.015625, + "learning_rate": 0.0018832389257880124, + "loss": 3.1847, + "step": 2598 + }, + { + "epoch": 0.180806288914397, + "grad_norm": 0.82421875, + "learning_rate": 0.0018831332394875582, + "loss": 3.5169, + "step": 2599 + }, + { + "epoch": 0.18087585655153224, + "grad_norm": 0.96875, + "learning_rate": 0.0018830275083460493, + "loss": 3.0003, + "step": 2600 + }, + { + "epoch": 0.18094542418866744, + "grad_norm": 1.125, + "learning_rate": 0.0018829217323688544, + "loss": 3.1013, + "step": 2601 + }, + { + "epoch": 0.18101499182580263, + "grad_norm": 0.9296875, + "learning_rate": 0.0018828159115613441, + "loss": 3.0551, + "step": 2602 + }, + { + "epoch": 0.18108455946293783, + "grad_norm": 0.77734375, + "learning_rate": 0.0018827100459288914, + "loss": 3.3201, + "step": 2603 + }, + { + "epoch": 0.18115412710007306, + "grad_norm": 0.91796875, + "learning_rate": 0.001882604135476872, + "loss": 3.3826, + "step": 2604 + }, + { + "epoch": 0.18122369473720826, + "grad_norm": 0.8671875, + "learning_rate": 0.0018824981802106633, + "loss": 3.7203, + "step": 2605 + }, + { + "epoch": 0.18129326237434346, + "grad_norm": 0.73046875, + "learning_rate": 0.001882392180135645, + "loss": 3.1752, + "step": 2606 + }, + { + "epoch": 0.18136283001147865, + "grad_norm": 0.8203125, + "learning_rate": 0.0018822861352571995, + "loss": 3.4479, + "step": 2607 + }, + { + "epoch": 0.18143239764861385, + "grad_norm": 0.7421875, + "learning_rate": 0.0018821800455807109, + "loss": 3.347, + "step": 2608 + }, + { + "epoch": 0.18150196528574908, + "grad_norm": 0.81640625, + "learning_rate": 0.001882073911111566, + "loss": 3.342, + "step": 2609 + }, + { + "epoch": 0.18157153292288428, + "grad_norm": 0.81640625, + "learning_rate": 0.0018819677318551542, + "loss": 3.4264, + "step": 2610 + }, + { + "epoch": 0.18164110056001948, + "grad_norm": 0.83984375, + "learning_rate": 0.0018818615078168661, + "loss": 3.2509, + "step": 2611 + }, + { + "epoch": 0.18171066819715467, + "grad_norm": 0.69140625, + "learning_rate": 0.0018817552390020958, + "loss": 3.0668, + "step": 2612 + }, + { + "epoch": 0.1817802358342899, + "grad_norm": 0.90234375, + "learning_rate": 0.0018816489254162387, + "loss": 3.3121, + "step": 2613 + }, + { + "epoch": 0.1818498034714251, + "grad_norm": 0.83984375, + "learning_rate": 0.001881542567064693, + "loss": 3.1481, + "step": 2614 + }, + { + "epoch": 0.1819193711085603, + "grad_norm": 0.80078125, + "learning_rate": 0.0018814361639528593, + "loss": 3.3161, + "step": 2615 + }, + { + "epoch": 0.1819889387456955, + "grad_norm": 0.65625, + "learning_rate": 0.0018813297160861398, + "loss": 2.8992, + "step": 2616 + }, + { + "epoch": 0.18205850638283072, + "grad_norm": 0.98046875, + "learning_rate": 0.0018812232234699394, + "loss": 3.1757, + "step": 2617 + }, + { + "epoch": 0.18212807401996592, + "grad_norm": 1.0390625, + "learning_rate": 0.0018811166861096656, + "loss": 2.7238, + "step": 2618 + }, + { + "epoch": 0.18219764165710112, + "grad_norm": 1.0, + "learning_rate": 0.0018810101040107276, + "loss": 3.3991, + "step": 2619 + }, + { + "epoch": 0.18226720929423632, + "grad_norm": 0.90234375, + "learning_rate": 0.001880903477178537, + "loss": 2.9947, + "step": 2620 + }, + { + "epoch": 0.1823367769313715, + "grad_norm": 0.75390625, + "learning_rate": 0.001880796805618508, + "loss": 3.1391, + "step": 2621 + }, + { + "epoch": 0.18240634456850674, + "grad_norm": 0.70703125, + "learning_rate": 0.0018806900893360567, + "loss": 3.1868, + "step": 2622 + }, + { + "epoch": 0.18247591220564194, + "grad_norm": 1.2265625, + "learning_rate": 0.001880583328336602, + "loss": 3.2925, + "step": 2623 + }, + { + "epoch": 0.18254547984277714, + "grad_norm": 0.75, + "learning_rate": 0.001880476522625564, + "loss": 3.3639, + "step": 2624 + }, + { + "epoch": 0.18261504747991233, + "grad_norm": 0.8203125, + "learning_rate": 0.0018803696722083662, + "loss": 2.7638, + "step": 2625 + }, + { + "epoch": 0.18268461511704756, + "grad_norm": 0.72265625, + "learning_rate": 0.0018802627770904338, + "loss": 3.1189, + "step": 2626 + }, + { + "epoch": 0.18275418275418276, + "grad_norm": 0.63671875, + "learning_rate": 0.0018801558372771945, + "loss": 3.4456, + "step": 2627 + }, + { + "epoch": 0.18282375039131796, + "grad_norm": 0.734375, + "learning_rate": 0.0018800488527740782, + "loss": 3.0031, + "step": 2628 + }, + { + "epoch": 0.18289331802845316, + "grad_norm": 0.8046875, + "learning_rate": 0.001879941823586517, + "loss": 3.2647, + "step": 2629 + }, + { + "epoch": 0.18296288566558838, + "grad_norm": 0.71484375, + "learning_rate": 0.001879834749719945, + "loss": 3.4376, + "step": 2630 + }, + { + "epoch": 0.18303245330272358, + "grad_norm": 0.6796875, + "learning_rate": 0.001879727631179799, + "loss": 2.9715, + "step": 2631 + }, + { + "epoch": 0.18310202093985878, + "grad_norm": 0.87890625, + "learning_rate": 0.0018796204679715183, + "loss": 3.1558, + "step": 2632 + }, + { + "epoch": 0.18317158857699398, + "grad_norm": 0.8515625, + "learning_rate": 0.0018795132601005435, + "loss": 3.399, + "step": 2633 + }, + { + "epoch": 0.18324115621412917, + "grad_norm": 0.71875, + "learning_rate": 0.0018794060075723188, + "loss": 3.0927, + "step": 2634 + }, + { + "epoch": 0.1833107238512644, + "grad_norm": 0.81640625, + "learning_rate": 0.0018792987103922894, + "loss": 3.4046, + "step": 2635 + }, + { + "epoch": 0.1833802914883996, + "grad_norm": 0.72265625, + "learning_rate": 0.0018791913685659036, + "loss": 3.0692, + "step": 2636 + }, + { + "epoch": 0.1834498591255348, + "grad_norm": 0.8984375, + "learning_rate": 0.0018790839820986113, + "loss": 3.1099, + "step": 2637 + }, + { + "epoch": 0.18351942676267, + "grad_norm": 0.83203125, + "learning_rate": 0.0018789765509958656, + "loss": 2.9657, + "step": 2638 + }, + { + "epoch": 0.18358899439980522, + "grad_norm": 0.734375, + "learning_rate": 0.001878869075263121, + "loss": 3.4141, + "step": 2639 + }, + { + "epoch": 0.18365856203694042, + "grad_norm": 0.8515625, + "learning_rate": 0.0018787615549058347, + "loss": 2.8735, + "step": 2640 + }, + { + "epoch": 0.18372812967407562, + "grad_norm": 0.91796875, + "learning_rate": 0.0018786539899294655, + "loss": 3.509, + "step": 2641 + }, + { + "epoch": 0.18379769731121082, + "grad_norm": 0.87890625, + "learning_rate": 0.0018785463803394757, + "loss": 3.4174, + "step": 2642 + }, + { + "epoch": 0.18386726494834604, + "grad_norm": 1.015625, + "learning_rate": 0.001878438726141329, + "loss": 2.6337, + "step": 2643 + }, + { + "epoch": 0.18393683258548124, + "grad_norm": 0.74609375, + "learning_rate": 0.001878331027340491, + "loss": 3.4049, + "step": 2644 + }, + { + "epoch": 0.18400640022261644, + "grad_norm": 0.91796875, + "learning_rate": 0.0018782232839424308, + "loss": 3.0697, + "step": 2645 + }, + { + "epoch": 0.18407596785975164, + "grad_norm": 0.93359375, + "learning_rate": 0.001878115495952619, + "loss": 3.241, + "step": 2646 + }, + { + "epoch": 0.18414553549688684, + "grad_norm": 0.70703125, + "learning_rate": 0.001878007663376528, + "loss": 3.2673, + "step": 2647 + }, + { + "epoch": 0.18421510313402206, + "grad_norm": 0.84375, + "learning_rate": 0.0018778997862196338, + "loss": 3.3684, + "step": 2648 + }, + { + "epoch": 0.18428467077115726, + "grad_norm": 0.8046875, + "learning_rate": 0.001877791864487413, + "loss": 2.8385, + "step": 2649 + }, + { + "epoch": 0.18435423840829246, + "grad_norm": 0.91015625, + "learning_rate": 0.001877683898185346, + "loss": 3.0731, + "step": 2650 + }, + { + "epoch": 0.18442380604542766, + "grad_norm": 0.87109375, + "learning_rate": 0.0018775758873189143, + "loss": 3.0177, + "step": 2651 + }, + { + "epoch": 0.18449337368256288, + "grad_norm": 0.875, + "learning_rate": 0.0018774678318936025, + "loss": 3.1539, + "step": 2652 + }, + { + "epoch": 0.18456294131969808, + "grad_norm": 0.7421875, + "learning_rate": 0.0018773597319148968, + "loss": 2.9013, + "step": 2653 + }, + { + "epoch": 0.18463250895683328, + "grad_norm": 0.9765625, + "learning_rate": 0.0018772515873882864, + "loss": 3.1443, + "step": 2654 + }, + { + "epoch": 0.18470207659396848, + "grad_norm": 0.84765625, + "learning_rate": 0.0018771433983192619, + "loss": 3.1884, + "step": 2655 + }, + { + "epoch": 0.1847716442311037, + "grad_norm": 0.890625, + "learning_rate": 0.0018770351647133165, + "loss": 3.4777, + "step": 2656 + }, + { + "epoch": 0.1848412118682389, + "grad_norm": 1.078125, + "learning_rate": 0.0018769268865759467, + "loss": 3.2733, + "step": 2657 + }, + { + "epoch": 0.1849107795053741, + "grad_norm": 1.0703125, + "learning_rate": 0.001876818563912649, + "loss": 3.135, + "step": 2658 + }, + { + "epoch": 0.1849803471425093, + "grad_norm": 0.86328125, + "learning_rate": 0.0018767101967289244, + "loss": 3.3096, + "step": 2659 + }, + { + "epoch": 0.1850499147796445, + "grad_norm": 0.95703125, + "learning_rate": 0.0018766017850302748, + "loss": 3.4159, + "step": 2660 + }, + { + "epoch": 0.18511948241677972, + "grad_norm": 0.71875, + "learning_rate": 0.001876493328822205, + "loss": 3.3289, + "step": 2661 + }, + { + "epoch": 0.18518905005391492, + "grad_norm": 0.828125, + "learning_rate": 0.0018763848281102221, + "loss": 3.3328, + "step": 2662 + }, + { + "epoch": 0.18525861769105012, + "grad_norm": 0.8828125, + "learning_rate": 0.0018762762828998345, + "loss": 3.0924, + "step": 2663 + }, + { + "epoch": 0.18532818532818532, + "grad_norm": 0.8671875, + "learning_rate": 0.0018761676931965542, + "loss": 2.9847, + "step": 2664 + }, + { + "epoch": 0.18539775296532054, + "grad_norm": 0.640625, + "learning_rate": 0.0018760590590058946, + "loss": 3.0087, + "step": 2665 + }, + { + "epoch": 0.18546732060245574, + "grad_norm": 0.96484375, + "learning_rate": 0.0018759503803333717, + "loss": 2.853, + "step": 2666 + }, + { + "epoch": 0.18553688823959094, + "grad_norm": 0.9609375, + "learning_rate": 0.0018758416571845037, + "loss": 2.8915, + "step": 2667 + }, + { + "epoch": 0.18560645587672614, + "grad_norm": 0.73828125, + "learning_rate": 0.0018757328895648109, + "loss": 3.0704, + "step": 2668 + }, + { + "epoch": 0.18567602351386134, + "grad_norm": 1.15625, + "learning_rate": 0.0018756240774798157, + "loss": 3.1353, + "step": 2669 + }, + { + "epoch": 0.18574559115099656, + "grad_norm": 0.84765625, + "learning_rate": 0.0018755152209350436, + "loss": 3.1826, + "step": 2670 + }, + { + "epoch": 0.18581515878813176, + "grad_norm": 1.078125, + "learning_rate": 0.0018754063199360217, + "loss": 3.1387, + "step": 2671 + }, + { + "epoch": 0.18588472642526696, + "grad_norm": 1.0234375, + "learning_rate": 0.0018752973744882789, + "loss": 2.8912, + "step": 2672 + }, + { + "epoch": 0.18595429406240216, + "grad_norm": 0.80078125, + "learning_rate": 0.001875188384597347, + "loss": 3.3143, + "step": 2673 + }, + { + "epoch": 0.18602386169953739, + "grad_norm": 1.0078125, + "learning_rate": 0.0018750793502687606, + "loss": 3.4538, + "step": 2674 + }, + { + "epoch": 0.18609342933667258, + "grad_norm": 0.74609375, + "learning_rate": 0.0018749702715080557, + "loss": 3.6592, + "step": 2675 + }, + { + "epoch": 0.18616299697380778, + "grad_norm": 0.890625, + "learning_rate": 0.0018748611483207704, + "loss": 3.3338, + "step": 2676 + }, + { + "epoch": 0.18623256461094298, + "grad_norm": 0.76171875, + "learning_rate": 0.0018747519807124453, + "loss": 2.9657, + "step": 2677 + }, + { + "epoch": 0.1863021322480782, + "grad_norm": 0.84375, + "learning_rate": 0.001874642768688624, + "loss": 3.3772, + "step": 2678 + }, + { + "epoch": 0.1863716998852134, + "grad_norm": 1.109375, + "learning_rate": 0.0018745335122548514, + "loss": 2.5748, + "step": 2679 + }, + { + "epoch": 0.1864412675223486, + "grad_norm": 0.84765625, + "learning_rate": 0.0018744242114166752, + "loss": 3.4695, + "step": 2680 + }, + { + "epoch": 0.1865108351594838, + "grad_norm": 1.015625, + "learning_rate": 0.0018743148661796447, + "loss": 3.6006, + "step": 2681 + }, + { + "epoch": 0.186580402796619, + "grad_norm": 1.0390625, + "learning_rate": 0.0018742054765493125, + "loss": 2.9741, + "step": 2682 + }, + { + "epoch": 0.18664997043375423, + "grad_norm": 0.7578125, + "learning_rate": 0.001874096042531232, + "loss": 3.1929, + "step": 2683 + }, + { + "epoch": 0.18671953807088942, + "grad_norm": 0.671875, + "learning_rate": 0.0018739865641309605, + "loss": 3.0261, + "step": 2684 + }, + { + "epoch": 0.18678910570802462, + "grad_norm": 0.73046875, + "learning_rate": 0.0018738770413540566, + "loss": 3.3793, + "step": 2685 + }, + { + "epoch": 0.18685867334515982, + "grad_norm": 1.0078125, + "learning_rate": 0.001873767474206081, + "loss": 2.8978, + "step": 2686 + }, + { + "epoch": 0.18692824098229505, + "grad_norm": 0.76953125, + "learning_rate": 0.0018736578626925976, + "loss": 3.3339, + "step": 2687 + }, + { + "epoch": 0.18699780861943024, + "grad_norm": 0.94921875, + "learning_rate": 0.0018735482068191712, + "loss": 2.9924, + "step": 2688 + }, + { + "epoch": 0.18706737625656544, + "grad_norm": 0.75, + "learning_rate": 0.0018734385065913698, + "loss": 3.4697, + "step": 2689 + }, + { + "epoch": 0.18713694389370064, + "grad_norm": 0.8046875, + "learning_rate": 0.0018733287620147634, + "loss": 3.0865, + "step": 2690 + }, + { + "epoch": 0.18720651153083587, + "grad_norm": 0.83984375, + "learning_rate": 0.0018732189730949246, + "loss": 3.5094, + "step": 2691 + }, + { + "epoch": 0.18727607916797107, + "grad_norm": 0.71875, + "learning_rate": 0.0018731091398374276, + "loss": 3.3612, + "step": 2692 + }, + { + "epoch": 0.18734564680510626, + "grad_norm": 0.84375, + "learning_rate": 0.0018729992622478493, + "loss": 3.5381, + "step": 2693 + }, + { + "epoch": 0.18741521444224146, + "grad_norm": 0.73828125, + "learning_rate": 0.0018728893403317686, + "loss": 3.3153, + "step": 2694 + }, + { + "epoch": 0.18748478207937666, + "grad_norm": 0.80078125, + "learning_rate": 0.0018727793740947669, + "loss": 3.1899, + "step": 2695 + }, + { + "epoch": 0.1875543497165119, + "grad_norm": 0.796875, + "learning_rate": 0.001872669363542428, + "loss": 3.0942, + "step": 2696 + }, + { + "epoch": 0.18762391735364709, + "grad_norm": 0.8828125, + "learning_rate": 0.0018725593086803371, + "loss": 3.0085, + "step": 2697 + }, + { + "epoch": 0.18769348499078228, + "grad_norm": 0.9296875, + "learning_rate": 0.0018724492095140825, + "loss": 3.2013, + "step": 2698 + }, + { + "epoch": 0.18776305262791748, + "grad_norm": 0.98828125, + "learning_rate": 0.0018723390660492548, + "loss": 2.9254, + "step": 2699 + }, + { + "epoch": 0.1878326202650527, + "grad_norm": 1.9921875, + "learning_rate": 0.001872228878291446, + "loss": 3.2803, + "step": 2700 + }, + { + "epoch": 0.1879021879021879, + "grad_norm": 0.96484375, + "learning_rate": 0.0018721186462462513, + "loss": 2.8896, + "step": 2701 + }, + { + "epoch": 0.1879717555393231, + "grad_norm": 0.80859375, + "learning_rate": 0.0018720083699192674, + "loss": 3.1164, + "step": 2702 + }, + { + "epoch": 0.1880413231764583, + "grad_norm": 0.71875, + "learning_rate": 0.0018718980493160938, + "loss": 3.4225, + "step": 2703 + }, + { + "epoch": 0.18811089081359353, + "grad_norm": 0.8515625, + "learning_rate": 0.0018717876844423318, + "loss": 3.3561, + "step": 2704 + }, + { + "epoch": 0.18818045845072873, + "grad_norm": 0.99609375, + "learning_rate": 0.0018716772753035852, + "loss": 2.6811, + "step": 2705 + }, + { + "epoch": 0.18825002608786393, + "grad_norm": 0.6953125, + "learning_rate": 0.0018715668219054606, + "loss": 3.4476, + "step": 2706 + }, + { + "epoch": 0.18831959372499912, + "grad_norm": 1.0859375, + "learning_rate": 0.0018714563242535657, + "loss": 3.7711, + "step": 2707 + }, + { + "epoch": 0.18838916136213432, + "grad_norm": 0.74609375, + "learning_rate": 0.0018713457823535107, + "loss": 3.125, + "step": 2708 + }, + { + "epoch": 0.18845872899926955, + "grad_norm": 0.8828125, + "learning_rate": 0.001871235196210909, + "loss": 3.4208, + "step": 2709 + }, + { + "epoch": 0.18852829663640475, + "grad_norm": 0.8828125, + "learning_rate": 0.0018711245658313755, + "loss": 2.6746, + "step": 2710 + }, + { + "epoch": 0.18859786427353994, + "grad_norm": 0.96484375, + "learning_rate": 0.0018710138912205274, + "loss": 3.2908, + "step": 2711 + }, + { + "epoch": 0.18866743191067514, + "grad_norm": 0.89453125, + "learning_rate": 0.0018709031723839842, + "loss": 3.5754, + "step": 2712 + }, + { + "epoch": 0.18873699954781037, + "grad_norm": 0.92578125, + "learning_rate": 0.0018707924093273674, + "loss": 3.3013, + "step": 2713 + }, + { + "epoch": 0.18880656718494557, + "grad_norm": 0.9453125, + "learning_rate": 0.0018706816020563012, + "loss": 3.2028, + "step": 2714 + }, + { + "epoch": 0.18887613482208077, + "grad_norm": 0.9296875, + "learning_rate": 0.0018705707505764116, + "loss": 3.3403, + "step": 2715 + }, + { + "epoch": 0.18894570245921596, + "grad_norm": 0.72265625, + "learning_rate": 0.0018704598548933277, + "loss": 3.3962, + "step": 2716 + }, + { + "epoch": 0.1890152700963512, + "grad_norm": 0.91796875, + "learning_rate": 0.0018703489150126793, + "loss": 2.7822, + "step": 2717 + }, + { + "epoch": 0.1890848377334864, + "grad_norm": 1.015625, + "learning_rate": 0.0018702379309401005, + "loss": 3.3276, + "step": 2718 + }, + { + "epoch": 0.1891544053706216, + "grad_norm": 1.109375, + "learning_rate": 0.0018701269026812253, + "loss": 3.452, + "step": 2719 + }, + { + "epoch": 0.18922397300775678, + "grad_norm": 1.0, + "learning_rate": 0.0018700158302416923, + "loss": 3.1536, + "step": 2720 + }, + { + "epoch": 0.18929354064489198, + "grad_norm": 0.82421875, + "learning_rate": 0.0018699047136271402, + "loss": 2.8866, + "step": 2721 + }, + { + "epoch": 0.1893631082820272, + "grad_norm": 0.97265625, + "learning_rate": 0.0018697935528432118, + "loss": 3.1771, + "step": 2722 + }, + { + "epoch": 0.1894326759191624, + "grad_norm": 0.79296875, + "learning_rate": 0.0018696823478955502, + "loss": 3.6915, + "step": 2723 + }, + { + "epoch": 0.1895022435562976, + "grad_norm": 1.078125, + "learning_rate": 0.0018695710987898032, + "loss": 3.3638, + "step": 2724 + }, + { + "epoch": 0.1895718111934328, + "grad_norm": 0.95703125, + "learning_rate": 0.0018694598055316184, + "loss": 3.0669, + "step": 2725 + }, + { + "epoch": 0.18964137883056803, + "grad_norm": 1.0703125, + "learning_rate": 0.0018693484681266473, + "loss": 3.0863, + "step": 2726 + }, + { + "epoch": 0.18971094646770323, + "grad_norm": 0.703125, + "learning_rate": 0.0018692370865805426, + "loss": 3.2591, + "step": 2727 + }, + { + "epoch": 0.18978051410483843, + "grad_norm": 0.97265625, + "learning_rate": 0.00186912566089896, + "loss": 3.1086, + "step": 2728 + }, + { + "epoch": 0.18985008174197363, + "grad_norm": 1.2109375, + "learning_rate": 0.001869014191087557, + "loss": 2.9723, + "step": 2729 + }, + { + "epoch": 0.18991964937910885, + "grad_norm": 0.8046875, + "learning_rate": 0.0018689026771519937, + "loss": 3.1802, + "step": 2730 + }, + { + "epoch": 0.18998921701624405, + "grad_norm": 0.890625, + "learning_rate": 0.001868791119097932, + "loss": 3.3697, + "step": 2731 + }, + { + "epoch": 0.19005878465337925, + "grad_norm": 0.84765625, + "learning_rate": 0.001868679516931036, + "loss": 3.5163, + "step": 2732 + }, + { + "epoch": 0.19012835229051445, + "grad_norm": 0.83203125, + "learning_rate": 0.001868567870656973, + "loss": 3.3499, + "step": 2733 + }, + { + "epoch": 0.19019791992764964, + "grad_norm": 0.8203125, + "learning_rate": 0.0018684561802814112, + "loss": 3.3363, + "step": 2734 + }, + { + "epoch": 0.19026748756478487, + "grad_norm": 0.984375, + "learning_rate": 0.0018683444458100222, + "loss": 2.8722, + "step": 2735 + }, + { + "epoch": 0.19033705520192007, + "grad_norm": 0.90625, + "learning_rate": 0.0018682326672484785, + "loss": 3.327, + "step": 2736 + }, + { + "epoch": 0.19040662283905527, + "grad_norm": 0.88671875, + "learning_rate": 0.0018681208446024566, + "loss": 3.1799, + "step": 2737 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 1.09375, + "learning_rate": 0.001868008977877634, + "loss": 3.1104, + "step": 2738 + }, + { + "epoch": 0.1905457581133257, + "grad_norm": 0.890625, + "learning_rate": 0.0018678970670796902, + "loss": 3.0156, + "step": 2739 + }, + { + "epoch": 0.1906153257504609, + "grad_norm": 0.7734375, + "learning_rate": 0.0018677851122143082, + "loss": 3.1659, + "step": 2740 + }, + { + "epoch": 0.1906848933875961, + "grad_norm": 0.7890625, + "learning_rate": 0.0018676731132871718, + "loss": 3.354, + "step": 2741 + }, + { + "epoch": 0.1907544610247313, + "grad_norm": 0.95703125, + "learning_rate": 0.0018675610703039682, + "loss": 3.3231, + "step": 2742 + }, + { + "epoch": 0.1908240286618665, + "grad_norm": 0.875, + "learning_rate": 0.0018674489832703864, + "loss": 3.2926, + "step": 2743 + }, + { + "epoch": 0.1908935962990017, + "grad_norm": 0.92578125, + "learning_rate": 0.0018673368521921177, + "loss": 3.3228, + "step": 2744 + }, + { + "epoch": 0.1909631639361369, + "grad_norm": 0.90625, + "learning_rate": 0.001867224677074855, + "loss": 2.9053, + "step": 2745 + }, + { + "epoch": 0.1910327315732721, + "grad_norm": 0.90234375, + "learning_rate": 0.0018671124579242944, + "loss": 3.0507, + "step": 2746 + }, + { + "epoch": 0.1911022992104073, + "grad_norm": 0.99609375, + "learning_rate": 0.0018670001947461339, + "loss": 3.5, + "step": 2747 + }, + { + "epoch": 0.19117186684754253, + "grad_norm": 1.2265625, + "learning_rate": 0.0018668878875460733, + "loss": 3.6022, + "step": 2748 + }, + { + "epoch": 0.19124143448467773, + "grad_norm": 0.8984375, + "learning_rate": 0.0018667755363298154, + "loss": 3.3031, + "step": 2749 + }, + { + "epoch": 0.19131100212181293, + "grad_norm": 0.92578125, + "learning_rate": 0.0018666631411030645, + "loss": 3.531, + "step": 2750 + }, + { + "epoch": 0.19138056975894813, + "grad_norm": 0.91796875, + "learning_rate": 0.0018665507018715277, + "loss": 3.1135, + "step": 2751 + }, + { + "epoch": 0.19145013739608335, + "grad_norm": 0.86328125, + "learning_rate": 0.001866438218640914, + "loss": 3.5832, + "step": 2752 + }, + { + "epoch": 0.19151970503321855, + "grad_norm": 1.1875, + "learning_rate": 0.0018663256914169346, + "loss": 2.8053, + "step": 2753 + }, + { + "epoch": 0.19158927267035375, + "grad_norm": 0.7890625, + "learning_rate": 0.0018662131202053032, + "loss": 3.1533, + "step": 2754 + }, + { + "epoch": 0.19165884030748895, + "grad_norm": 1.0, + "learning_rate": 0.0018661005050117359, + "loss": 3.1857, + "step": 2755 + }, + { + "epoch": 0.19172840794462417, + "grad_norm": 0.9296875, + "learning_rate": 0.0018659878458419498, + "loss": 3.2337, + "step": 2756 + }, + { + "epoch": 0.19179797558175937, + "grad_norm": 1.0, + "learning_rate": 0.0018658751427016664, + "loss": 2.8843, + "step": 2757 + }, + { + "epoch": 0.19186754321889457, + "grad_norm": 1.2421875, + "learning_rate": 0.0018657623955966075, + "loss": 3.1811, + "step": 2758 + }, + { + "epoch": 0.19193711085602977, + "grad_norm": 0.99609375, + "learning_rate": 0.0018656496045324977, + "loss": 3.4058, + "step": 2759 + }, + { + "epoch": 0.19200667849316497, + "grad_norm": 0.95703125, + "learning_rate": 0.0018655367695150642, + "loss": 3.3727, + "step": 2760 + }, + { + "epoch": 0.1920762461303002, + "grad_norm": 0.94140625, + "learning_rate": 0.0018654238905500362, + "loss": 3.2887, + "step": 2761 + }, + { + "epoch": 0.1921458137674354, + "grad_norm": 1.0859375, + "learning_rate": 0.0018653109676431453, + "loss": 3.2733, + "step": 2762 + }, + { + "epoch": 0.1922153814045706, + "grad_norm": 1.0859375, + "learning_rate": 0.0018651980008001247, + "loss": 3.2227, + "step": 2763 + }, + { + "epoch": 0.1922849490417058, + "grad_norm": 0.9921875, + "learning_rate": 0.001865084990026711, + "loss": 2.544, + "step": 2764 + }, + { + "epoch": 0.19235451667884101, + "grad_norm": 0.8359375, + "learning_rate": 0.0018649719353286411, + "loss": 2.9591, + "step": 2765 + }, + { + "epoch": 0.1924240843159762, + "grad_norm": 0.78125, + "learning_rate": 0.0018648588367116568, + "loss": 3.5255, + "step": 2766 + }, + { + "epoch": 0.1924936519531114, + "grad_norm": 1.015625, + "learning_rate": 0.0018647456941814995, + "loss": 3.4794, + "step": 2767 + }, + { + "epoch": 0.1925632195902466, + "grad_norm": 1.1640625, + "learning_rate": 0.0018646325077439148, + "loss": 3.0202, + "step": 2768 + }, + { + "epoch": 0.19263278722738184, + "grad_norm": 0.78125, + "learning_rate": 0.0018645192774046492, + "loss": 3.567, + "step": 2769 + }, + { + "epoch": 0.19270235486451703, + "grad_norm": 0.8671875, + "learning_rate": 0.0018644060031694522, + "loss": 3.051, + "step": 2770 + }, + { + "epoch": 0.19277192250165223, + "grad_norm": 0.875, + "learning_rate": 0.0018642926850440755, + "loss": 2.9566, + "step": 2771 + }, + { + "epoch": 0.19284149013878743, + "grad_norm": 0.91796875, + "learning_rate": 0.0018641793230342726, + "loss": 3.2604, + "step": 2772 + }, + { + "epoch": 0.19291105777592263, + "grad_norm": 0.87890625, + "learning_rate": 0.0018640659171457992, + "loss": 3.246, + "step": 2773 + }, + { + "epoch": 0.19298062541305785, + "grad_norm": 1.140625, + "learning_rate": 0.0018639524673844143, + "loss": 3.4788, + "step": 2774 + }, + { + "epoch": 0.19305019305019305, + "grad_norm": 1.0078125, + "learning_rate": 0.001863838973755877, + "loss": 2.9149, + "step": 2775 + }, + { + "epoch": 0.19311976068732825, + "grad_norm": 1.1328125, + "learning_rate": 0.001863725436265951, + "loss": 2.9041, + "step": 2776 + }, + { + "epoch": 0.19318932832446345, + "grad_norm": 1.0390625, + "learning_rate": 0.0018636118549204008, + "loss": 3.3209, + "step": 2777 + }, + { + "epoch": 0.19325889596159868, + "grad_norm": 0.85546875, + "learning_rate": 0.0018634982297249937, + "loss": 3.6583, + "step": 2778 + }, + { + "epoch": 0.19332846359873387, + "grad_norm": 1.1328125, + "learning_rate": 0.001863384560685499, + "loss": 3.1681, + "step": 2779 + }, + { + "epoch": 0.19339803123586907, + "grad_norm": 0.96875, + "learning_rate": 0.0018632708478076875, + "loss": 3.3337, + "step": 2780 + }, + { + "epoch": 0.19346759887300427, + "grad_norm": 1.0859375, + "learning_rate": 0.0018631570910973342, + "loss": 3.3942, + "step": 2781 + }, + { + "epoch": 0.1935371665101395, + "grad_norm": 0.95703125, + "learning_rate": 0.001863043290560214, + "loss": 3.2764, + "step": 2782 + }, + { + "epoch": 0.1936067341472747, + "grad_norm": 1.078125, + "learning_rate": 0.0018629294462021058, + "loss": 3.1924, + "step": 2783 + }, + { + "epoch": 0.1936763017844099, + "grad_norm": 1.109375, + "learning_rate": 0.0018628155580287897, + "loss": 3.2434, + "step": 2784 + }, + { + "epoch": 0.1937458694215451, + "grad_norm": 1.15625, + "learning_rate": 0.0018627016260460486, + "loss": 3.2823, + "step": 2785 + }, + { + "epoch": 0.1938154370586803, + "grad_norm": 1.0546875, + "learning_rate": 0.001862587650259667, + "loss": 3.1277, + "step": 2786 + }, + { + "epoch": 0.19388500469581552, + "grad_norm": 0.90625, + "learning_rate": 0.0018624736306754324, + "loss": 3.0137, + "step": 2787 + }, + { + "epoch": 0.19395457233295071, + "grad_norm": 1.3125, + "learning_rate": 0.0018623595672991342, + "loss": 3.1154, + "step": 2788 + }, + { + "epoch": 0.1940241399700859, + "grad_norm": 0.96875, + "learning_rate": 0.0018622454601365636, + "loss": 3.1293, + "step": 2789 + }, + { + "epoch": 0.1940937076072211, + "grad_norm": 1.1015625, + "learning_rate": 0.0018621313091935145, + "loss": 2.8241, + "step": 2790 + }, + { + "epoch": 0.19416327524435634, + "grad_norm": 1.1171875, + "learning_rate": 0.0018620171144757833, + "loss": 3.5697, + "step": 2791 + }, + { + "epoch": 0.19423284288149154, + "grad_norm": 1.1484375, + "learning_rate": 0.0018619028759891676, + "loss": 2.8754, + "step": 2792 + }, + { + "epoch": 0.19430241051862673, + "grad_norm": 1.359375, + "learning_rate": 0.0018617885937394685, + "loss": 2.8006, + "step": 2793 + }, + { + "epoch": 0.19437197815576193, + "grad_norm": 1.0546875, + "learning_rate": 0.001861674267732488, + "loss": 2.903, + "step": 2794 + }, + { + "epoch": 0.19444154579289716, + "grad_norm": 1.234375, + "learning_rate": 0.0018615598979740318, + "loss": 3.1794, + "step": 2795 + }, + { + "epoch": 0.19451111343003236, + "grad_norm": 1.1328125, + "learning_rate": 0.0018614454844699062, + "loss": 3.3262, + "step": 2796 + }, + { + "epoch": 0.19458068106716755, + "grad_norm": 1.2734375, + "learning_rate": 0.0018613310272259209, + "loss": 2.9255, + "step": 2797 + }, + { + "epoch": 0.19465024870430275, + "grad_norm": 1.234375, + "learning_rate": 0.0018612165262478875, + "loss": 3.1538, + "step": 2798 + }, + { + "epoch": 0.19471981634143795, + "grad_norm": 1.375, + "learning_rate": 0.0018611019815416197, + "loss": 2.8641, + "step": 2799 + }, + { + "epoch": 0.19478938397857318, + "grad_norm": 1.125, + "learning_rate": 0.0018609873931129338, + "loss": 2.8544, + "step": 2800 + }, + { + "epoch": 0.19485895161570838, + "grad_norm": 1.3671875, + "learning_rate": 0.0018608727609676476, + "loss": 3.2558, + "step": 2801 + }, + { + "epoch": 0.19492851925284357, + "grad_norm": 1.4140625, + "learning_rate": 0.0018607580851115817, + "loss": 3.1328, + "step": 2802 + }, + { + "epoch": 0.19499808688997877, + "grad_norm": 1.484375, + "learning_rate": 0.0018606433655505587, + "loss": 2.9437, + "step": 2803 + }, + { + "epoch": 0.195067654527114, + "grad_norm": 1.46875, + "learning_rate": 0.0018605286022904037, + "loss": 2.9838, + "step": 2804 + }, + { + "epoch": 0.1951372221642492, + "grad_norm": 2.203125, + "learning_rate": 0.0018604137953369439, + "loss": 3.0965, + "step": 2805 + }, + { + "epoch": 0.1952067898013844, + "grad_norm": 1.3359375, + "learning_rate": 0.0018602989446960079, + "loss": 3.2953, + "step": 2806 + }, + { + "epoch": 0.1952763574385196, + "grad_norm": 1.46875, + "learning_rate": 0.001860184050373428, + "loss": 3.4891, + "step": 2807 + }, + { + "epoch": 0.1953459250756548, + "grad_norm": 1.15625, + "learning_rate": 0.0018600691123750374, + "loss": 3.2762, + "step": 2808 + }, + { + "epoch": 0.19541549271279002, + "grad_norm": 1.328125, + "learning_rate": 0.0018599541307066727, + "loss": 2.7164, + "step": 2809 + }, + { + "epoch": 0.19548506034992522, + "grad_norm": 1.03125, + "learning_rate": 0.0018598391053741717, + "loss": 3.2189, + "step": 2810 + }, + { + "epoch": 0.19555462798706041, + "grad_norm": 1.1640625, + "learning_rate": 0.0018597240363833745, + "loss": 3.0099, + "step": 2811 + }, + { + "epoch": 0.1956241956241956, + "grad_norm": 1.1953125, + "learning_rate": 0.0018596089237401245, + "loss": 2.7148, + "step": 2812 + }, + { + "epoch": 0.19569376326133084, + "grad_norm": 1.125, + "learning_rate": 0.0018594937674502657, + "loss": 3.1438, + "step": 2813 + }, + { + "epoch": 0.19576333089846604, + "grad_norm": 1.359375, + "learning_rate": 0.001859378567519646, + "loss": 3.3735, + "step": 2814 + }, + { + "epoch": 0.19583289853560124, + "grad_norm": 1.3828125, + "learning_rate": 0.0018592633239541136, + "loss": 3.291, + "step": 2815 + }, + { + "epoch": 0.19590246617273643, + "grad_norm": 1.1875, + "learning_rate": 0.0018591480367595213, + "loss": 3.312, + "step": 2816 + }, + { + "epoch": 0.19597203380987166, + "grad_norm": 1.5234375, + "learning_rate": 0.0018590327059417216, + "loss": 3.276, + "step": 2817 + }, + { + "epoch": 0.19604160144700686, + "grad_norm": 1.359375, + "learning_rate": 0.0018589173315065712, + "loss": 2.7598, + "step": 2818 + }, + { + "epoch": 0.19611116908414206, + "grad_norm": 1.6796875, + "learning_rate": 0.001858801913459928, + "loss": 3.0982, + "step": 2819 + }, + { + "epoch": 0.19618073672127725, + "grad_norm": 1.6796875, + "learning_rate": 0.0018586864518076523, + "loss": 3.0716, + "step": 2820 + }, + { + "epoch": 0.19625030435841245, + "grad_norm": 1.4921875, + "learning_rate": 0.0018585709465556066, + "loss": 3.7368, + "step": 2821 + }, + { + "epoch": 0.19631987199554768, + "grad_norm": 1.6484375, + "learning_rate": 0.0018584553977096557, + "loss": 3.1684, + "step": 2822 + }, + { + "epoch": 0.19638943963268288, + "grad_norm": 1.828125, + "learning_rate": 0.0018583398052756665, + "loss": 2.9681, + "step": 2823 + }, + { + "epoch": 0.19645900726981808, + "grad_norm": 1.5078125, + "learning_rate": 0.0018582241692595089, + "loss": 3.0652, + "step": 2824 + }, + { + "epoch": 0.19652857490695327, + "grad_norm": 1.7265625, + "learning_rate": 0.0018581084896670532, + "loss": 2.9162, + "step": 2825 + }, + { + "epoch": 0.1965981425440885, + "grad_norm": 1.9765625, + "learning_rate": 0.0018579927665041739, + "loss": 3.1073, + "step": 2826 + }, + { + "epoch": 0.1966677101812237, + "grad_norm": 1.390625, + "learning_rate": 0.0018578769997767465, + "loss": 2.6965, + "step": 2827 + }, + { + "epoch": 0.1967372778183589, + "grad_norm": 1.484375, + "learning_rate": 0.001857761189490649, + "loss": 2.9426, + "step": 2828 + }, + { + "epoch": 0.1968068454554941, + "grad_norm": 1.7109375, + "learning_rate": 0.0018576453356517618, + "loss": 3.1776, + "step": 2829 + }, + { + "epoch": 0.19687641309262932, + "grad_norm": 1.484375, + "learning_rate": 0.001857529438265967, + "loss": 2.8095, + "step": 2830 + }, + { + "epoch": 0.19694598072976452, + "grad_norm": 1.8046875, + "learning_rate": 0.0018574134973391497, + "loss": 3.233, + "step": 2831 + }, + { + "epoch": 0.19701554836689972, + "grad_norm": 1.6796875, + "learning_rate": 0.001857297512877197, + "loss": 3.2599, + "step": 2832 + }, + { + "epoch": 0.19708511600403492, + "grad_norm": 1.5234375, + "learning_rate": 0.0018571814848859973, + "loss": 2.7753, + "step": 2833 + }, + { + "epoch": 0.19715468364117011, + "grad_norm": 1.53125, + "learning_rate": 0.0018570654133714425, + "loss": 2.755, + "step": 2834 + }, + { + "epoch": 0.19722425127830534, + "grad_norm": 2.15625, + "learning_rate": 0.001856949298339426, + "loss": 2.9832, + "step": 2835 + }, + { + "epoch": 0.19729381891544054, + "grad_norm": 2.078125, + "learning_rate": 0.0018568331397958435, + "loss": 3.0149, + "step": 2836 + }, + { + "epoch": 0.19736338655257574, + "grad_norm": 1.765625, + "learning_rate": 0.0018567169377465928, + "loss": 3.0255, + "step": 2837 + }, + { + "epoch": 0.19743295418971094, + "grad_norm": 1.578125, + "learning_rate": 0.0018566006921975741, + "loss": 2.6039, + "step": 2838 + }, + { + "epoch": 0.19750252182684616, + "grad_norm": 1.71875, + "learning_rate": 0.0018564844031546902, + "loss": 2.7991, + "step": 2839 + }, + { + "epoch": 0.19757208946398136, + "grad_norm": 1.7734375, + "learning_rate": 0.001856368070623845, + "loss": 2.8238, + "step": 2840 + }, + { + "epoch": 0.19764165710111656, + "grad_norm": 2.25, + "learning_rate": 0.0018562516946109455, + "loss": 2.7847, + "step": 2841 + }, + { + "epoch": 0.19771122473825176, + "grad_norm": 1.625, + "learning_rate": 0.001856135275121901, + "loss": 2.5704, + "step": 2842 + }, + { + "epoch": 0.19778079237538698, + "grad_norm": 4.5625, + "learning_rate": 0.0018560188121626224, + "loss": 3.1919, + "step": 2843 + }, + { + "epoch": 0.19785036001252218, + "grad_norm": 2.078125, + "learning_rate": 0.0018559023057390235, + "loss": 2.3178, + "step": 2844 + }, + { + "epoch": 0.19791992764965738, + "grad_norm": 2.25, + "learning_rate": 0.001855785755857019, + "loss": 2.9565, + "step": 2845 + }, + { + "epoch": 0.19798949528679258, + "grad_norm": 2.125, + "learning_rate": 0.0018556691625225277, + "loss": 2.532, + "step": 2846 + }, + { + "epoch": 0.19805906292392778, + "grad_norm": 3.484375, + "learning_rate": 0.001855552525741469, + "loss": 2.7136, + "step": 2847 + }, + { + "epoch": 0.198128630561063, + "grad_norm": 2.09375, + "learning_rate": 0.0018554358455197652, + "loss": 2.5548, + "step": 2848 + }, + { + "epoch": 0.1981981981981982, + "grad_norm": 2.015625, + "learning_rate": 0.0018553191218633415, + "loss": 2.5284, + "step": 2849 + }, + { + "epoch": 0.1982677658353334, + "grad_norm": 2.34375, + "learning_rate": 0.0018552023547781231, + "loss": 2.6736, + "step": 2850 + }, + { + "epoch": 0.1983373334724686, + "grad_norm": 2.390625, + "learning_rate": 0.0018550855442700403, + "loss": 2.8147, + "step": 2851 + }, + { + "epoch": 0.19840690110960382, + "grad_norm": 2.65625, + "learning_rate": 0.0018549686903450234, + "loss": 2.7503, + "step": 2852 + }, + { + "epoch": 0.19847646874673902, + "grad_norm": 2.484375, + "learning_rate": 0.0018548517930090057, + "loss": 2.8713, + "step": 2853 + }, + { + "epoch": 0.19854603638387422, + "grad_norm": 2.4375, + "learning_rate": 0.0018547348522679225, + "loss": 2.7856, + "step": 2854 + }, + { + "epoch": 0.19861560402100942, + "grad_norm": 2.40625, + "learning_rate": 0.0018546178681277119, + "loss": 2.6505, + "step": 2855 + }, + { + "epoch": 0.19868517165814464, + "grad_norm": 2.34375, + "learning_rate": 0.0018545008405943136, + "loss": 2.6597, + "step": 2856 + }, + { + "epoch": 0.19875473929527984, + "grad_norm": 2.78125, + "learning_rate": 0.0018543837696736694, + "loss": 2.4495, + "step": 2857 + }, + { + "epoch": 0.19882430693241504, + "grad_norm": 2.390625, + "learning_rate": 0.001854266655371724, + "loss": 2.5478, + "step": 2858 + }, + { + "epoch": 0.19889387456955024, + "grad_norm": 2.9375, + "learning_rate": 0.0018541494976944235, + "loss": 2.8368, + "step": 2859 + }, + { + "epoch": 0.19896344220668544, + "grad_norm": 2.8125, + "learning_rate": 0.0018540322966477168, + "loss": 2.3955, + "step": 2860 + }, + { + "epoch": 0.19903300984382066, + "grad_norm": 3.015625, + "learning_rate": 0.001853915052237555, + "loss": 2.4895, + "step": 2861 + }, + { + "epoch": 0.19910257748095586, + "grad_norm": 2.640625, + "learning_rate": 0.0018537977644698907, + "loss": 2.7793, + "step": 2862 + }, + { + "epoch": 0.19917214511809106, + "grad_norm": 2.34375, + "learning_rate": 0.0018536804333506793, + "loss": 2.3713, + "step": 2863 + }, + { + "epoch": 0.19924171275522626, + "grad_norm": 2.578125, + "learning_rate": 0.0018535630588858783, + "loss": 2.6354, + "step": 2864 + }, + { + "epoch": 0.19931128039236148, + "grad_norm": 3.390625, + "learning_rate": 0.0018534456410814473, + "loss": 2.3463, + "step": 2865 + }, + { + "epoch": 0.19938084802949668, + "grad_norm": 2.859375, + "learning_rate": 0.0018533281799433489, + "loss": 2.2692, + "step": 2866 + }, + { + "epoch": 0.19945041566663188, + "grad_norm": 2.90625, + "learning_rate": 0.0018532106754775462, + "loss": 2.1026, + "step": 2867 + }, + { + "epoch": 0.19951998330376708, + "grad_norm": 2.140625, + "learning_rate": 0.001853093127690006, + "loss": 2.2183, + "step": 2868 + }, + { + "epoch": 0.1995895509409023, + "grad_norm": 2.921875, + "learning_rate": 0.0018529755365866967, + "loss": 2.5848, + "step": 2869 + }, + { + "epoch": 0.1996591185780375, + "grad_norm": 3.203125, + "learning_rate": 0.001852857902173589, + "loss": 2.3433, + "step": 2870 + }, + { + "epoch": 0.1997286862151727, + "grad_norm": 2.5625, + "learning_rate": 0.0018527402244566554, + "loss": 2.0217, + "step": 2871 + }, + { + "epoch": 0.1997982538523079, + "grad_norm": 2.578125, + "learning_rate": 0.0018526225034418715, + "loss": 2.2611, + "step": 2872 + }, + { + "epoch": 0.1998678214894431, + "grad_norm": 2.78125, + "learning_rate": 0.0018525047391352144, + "loss": 2.3769, + "step": 2873 + }, + { + "epoch": 0.19993738912657832, + "grad_norm": 2.3125, + "learning_rate": 0.001852386931542664, + "loss": 2.2041, + "step": 2874 + }, + { + "epoch": 0.20000695676371352, + "grad_norm": 2.90625, + "learning_rate": 0.0018522690806702013, + "loss": 2.1939, + "step": 2875 + }, + { + "epoch": 0.20007652440084872, + "grad_norm": 2.671875, + "learning_rate": 0.0018521511865238103, + "loss": 2.1469, + "step": 2876 + }, + { + "epoch": 0.20014609203798392, + "grad_norm": 2.515625, + "learning_rate": 0.0018520332491094775, + "loss": 2.3859, + "step": 2877 + }, + { + "epoch": 0.20021565967511915, + "grad_norm": 2.609375, + "learning_rate": 0.0018519152684331906, + "loss": 2.2566, + "step": 2878 + }, + { + "epoch": 0.20028522731225434, + "grad_norm": 2.46875, + "learning_rate": 0.0018517972445009404, + "loss": 2.1774, + "step": 2879 + }, + { + "epoch": 0.20035479494938954, + "grad_norm": 2.328125, + "learning_rate": 0.0018516791773187196, + "loss": 1.9746, + "step": 2880 + }, + { + "epoch": 0.20042436258652474, + "grad_norm": 2.375, + "learning_rate": 0.0018515610668925228, + "loss": 2.3208, + "step": 2881 + }, + { + "epoch": 0.20049393022365997, + "grad_norm": 2.6875, + "learning_rate": 0.0018514429132283476, + "loss": 2.1074, + "step": 2882 + }, + { + "epoch": 0.20056349786079516, + "grad_norm": 2.3125, + "learning_rate": 0.0018513247163321925, + "loss": 2.1219, + "step": 2883 + }, + { + "epoch": 0.20063306549793036, + "grad_norm": 2.953125, + "learning_rate": 0.0018512064762100594, + "loss": 2.3607, + "step": 2884 + }, + { + "epoch": 0.20070263313506556, + "grad_norm": 2.203125, + "learning_rate": 0.0018510881928679517, + "loss": 2.1561, + "step": 2885 + }, + { + "epoch": 0.20077220077220076, + "grad_norm": 2.140625, + "learning_rate": 0.0018509698663118754, + "loss": 1.8336, + "step": 2886 + }, + { + "epoch": 0.20084176840933599, + "grad_norm": 2.359375, + "learning_rate": 0.0018508514965478384, + "loss": 2.1067, + "step": 2887 + }, + { + "epoch": 0.20091133604647118, + "grad_norm": 2.3125, + "learning_rate": 0.0018507330835818513, + "loss": 2.0895, + "step": 2888 + }, + { + "epoch": 0.20098090368360638, + "grad_norm": 2.953125, + "learning_rate": 0.0018506146274199261, + "loss": 1.8984, + "step": 2889 + }, + { + "epoch": 0.20105047132074158, + "grad_norm": 2.40625, + "learning_rate": 0.0018504961280680777, + "loss": 1.9448, + "step": 2890 + }, + { + "epoch": 0.2011200389578768, + "grad_norm": 2.25, + "learning_rate": 0.0018503775855323226, + "loss": 1.7361, + "step": 2891 + }, + { + "epoch": 0.201189606595012, + "grad_norm": 2.40625, + "learning_rate": 0.00185025899981868, + "loss": 1.8027, + "step": 2892 + }, + { + "epoch": 0.2012591742321472, + "grad_norm": 3.03125, + "learning_rate": 0.0018501403709331706, + "loss": 2.1839, + "step": 2893 + }, + { + "epoch": 0.2013287418692824, + "grad_norm": 2.671875, + "learning_rate": 0.0018500216988818186, + "loss": 1.9038, + "step": 2894 + }, + { + "epoch": 0.20139830950641763, + "grad_norm": 2.96875, + "learning_rate": 0.0018499029836706491, + "loss": 1.8661, + "step": 2895 + }, + { + "epoch": 0.20146787714355283, + "grad_norm": 2.421875, + "learning_rate": 0.0018497842253056898, + "loss": 1.9367, + "step": 2896 + }, + { + "epoch": 0.20153744478068802, + "grad_norm": 3.34375, + "learning_rate": 0.0018496654237929709, + "loss": 2.0319, + "step": 2897 + }, + { + "epoch": 0.20160701241782322, + "grad_norm": 2.8125, + "learning_rate": 0.0018495465791385243, + "loss": 1.7812, + "step": 2898 + }, + { + "epoch": 0.20167658005495842, + "grad_norm": 2.625, + "learning_rate": 0.0018494276913483846, + "loss": 2.0751, + "step": 2899 + }, + { + "epoch": 0.20174614769209365, + "grad_norm": 4.34375, + "learning_rate": 0.0018493087604285882, + "loss": 2.0154, + "step": 2900 + }, + { + "epoch": 0.20181571532922885, + "grad_norm": 3.125, + "learning_rate": 0.0018491897863851733, + "loss": 2.1483, + "step": 2901 + }, + { + "epoch": 0.20188528296636404, + "grad_norm": 2.59375, + "learning_rate": 0.001849070769224182, + "loss": 1.8735, + "step": 2902 + }, + { + "epoch": 0.20195485060349924, + "grad_norm": 2.4375, + "learning_rate": 0.001848951708951656, + "loss": 1.9557, + "step": 2903 + }, + { + "epoch": 0.20202441824063447, + "grad_norm": 2.765625, + "learning_rate": 0.0018488326055736417, + "loss": 1.9589, + "step": 2904 + }, + { + "epoch": 0.20209398587776967, + "grad_norm": 2.4375, + "learning_rate": 0.001848713459096186, + "loss": 1.814, + "step": 2905 + }, + { + "epoch": 0.20216355351490486, + "grad_norm": 2.140625, + "learning_rate": 0.0018485942695253387, + "loss": 2.0086, + "step": 2906 + }, + { + "epoch": 0.20223312115204006, + "grad_norm": 2.125, + "learning_rate": 0.0018484750368671515, + "loss": 1.9076, + "step": 2907 + }, + { + "epoch": 0.2023026887891753, + "grad_norm": 2.15625, + "learning_rate": 0.0018483557611276788, + "loss": 1.9679, + "step": 2908 + }, + { + "epoch": 0.2023722564263105, + "grad_norm": 6.5625, + "learning_rate": 0.0018482364423129762, + "loss": 1.9789, + "step": 2909 + }, + { + "epoch": 0.20244182406344569, + "grad_norm": 2.1875, + "learning_rate": 0.0018481170804291029, + "loss": 1.7068, + "step": 2910 + }, + { + "epoch": 0.20251139170058088, + "grad_norm": 2.90625, + "learning_rate": 0.0018479976754821187, + "loss": 1.8457, + "step": 2911 + }, + { + "epoch": 0.20258095933771608, + "grad_norm": 1.9921875, + "learning_rate": 0.001847878227478087, + "loss": 1.8104, + "step": 2912 + }, + { + "epoch": 0.2026505269748513, + "grad_norm": 2.484375, + "learning_rate": 0.0018477587364230726, + "loss": 2.1387, + "step": 2913 + }, + { + "epoch": 0.2027200946119865, + "grad_norm": 2.328125, + "learning_rate": 0.0018476392023231423, + "loss": 1.9466, + "step": 2914 + }, + { + "epoch": 0.2027896622491217, + "grad_norm": 2.015625, + "learning_rate": 0.001847519625184366, + "loss": 1.7601, + "step": 2915 + }, + { + "epoch": 0.2028592298862569, + "grad_norm": 2.0625, + "learning_rate": 0.0018474000050128147, + "loss": 1.6999, + "step": 2916 + }, + { + "epoch": 0.20292879752339213, + "grad_norm": 3.03125, + "learning_rate": 0.0018472803418145625, + "loss": 1.6827, + "step": 2917 + }, + { + "epoch": 0.20299836516052733, + "grad_norm": 2.015625, + "learning_rate": 0.001847160635595685, + "loss": 1.7657, + "step": 2918 + }, + { + "epoch": 0.20306793279766253, + "grad_norm": 2.265625, + "learning_rate": 0.0018470408863622608, + "loss": 1.7335, + "step": 2919 + }, + { + "epoch": 0.20313750043479772, + "grad_norm": 2.078125, + "learning_rate": 0.0018469210941203698, + "loss": 1.7775, + "step": 2920 + }, + { + "epoch": 0.20320706807193295, + "grad_norm": 2.765625, + "learning_rate": 0.001846801258876094, + "loss": 1.6758, + "step": 2921 + }, + { + "epoch": 0.20327663570906815, + "grad_norm": 2.296875, + "learning_rate": 0.0018466813806355187, + "loss": 1.8042, + "step": 2922 + }, + { + "epoch": 0.20334620334620335, + "grad_norm": 2.0625, + "learning_rate": 0.0018465614594047307, + "loss": 1.8612, + "step": 2923 + }, + { + "epoch": 0.20341577098333855, + "grad_norm": 2.0625, + "learning_rate": 0.0018464414951898185, + "loss": 1.7777, + "step": 2924 + }, + { + "epoch": 0.20348533862047374, + "grad_norm": 2.53125, + "learning_rate": 0.0018463214879968735, + "loss": 1.7603, + "step": 2925 + }, + { + "epoch": 0.20355490625760897, + "grad_norm": 2.0625, + "learning_rate": 0.0018462014378319892, + "loss": 1.7786, + "step": 2926 + }, + { + "epoch": 0.20362447389474417, + "grad_norm": 2.09375, + "learning_rate": 0.0018460813447012613, + "loss": 1.666, + "step": 2927 + }, + { + "epoch": 0.20369404153187937, + "grad_norm": 2.46875, + "learning_rate": 0.0018459612086107868, + "loss": 1.8724, + "step": 2928 + }, + { + "epoch": 0.20376360916901456, + "grad_norm": 4.3125, + "learning_rate": 0.0018458410295666664, + "loss": 2.3202, + "step": 2929 + }, + { + "epoch": 0.2038331768061498, + "grad_norm": 2.484375, + "learning_rate": 0.0018457208075750018, + "loss": 1.7222, + "step": 2930 + }, + { + "epoch": 0.203902744443285, + "grad_norm": 2.078125, + "learning_rate": 0.0018456005426418973, + "loss": 1.6514, + "step": 2931 + }, + { + "epoch": 0.2039723120804202, + "grad_norm": 2.078125, + "learning_rate": 0.0018454802347734596, + "loss": 1.9026, + "step": 2932 + }, + { + "epoch": 0.20404187971755539, + "grad_norm": 2.4375, + "learning_rate": 0.0018453598839757968, + "loss": 1.6522, + "step": 2933 + }, + { + "epoch": 0.2041114473546906, + "grad_norm": 2.03125, + "learning_rate": 0.0018452394902550202, + "loss": 1.6696, + "step": 2934 + }, + { + "epoch": 0.2041810149918258, + "grad_norm": 2.046875, + "learning_rate": 0.0018451190536172427, + "loss": 1.6172, + "step": 2935 + }, + { + "epoch": 0.204250582628961, + "grad_norm": 4.5, + "learning_rate": 0.0018449985740685794, + "loss": 1.675, + "step": 2936 + }, + { + "epoch": 0.2043201502660962, + "grad_norm": 2.125, + "learning_rate": 0.0018448780516151474, + "loss": 1.7131, + "step": 2937 + }, + { + "epoch": 0.2043897179032314, + "grad_norm": 2.28125, + "learning_rate": 0.0018447574862630663, + "loss": 1.7751, + "step": 2938 + }, + { + "epoch": 0.20445928554036663, + "grad_norm": 1.9453125, + "learning_rate": 0.0018446368780184583, + "loss": 1.5138, + "step": 2939 + }, + { + "epoch": 0.20452885317750183, + "grad_norm": 2.140625, + "learning_rate": 0.0018445162268874466, + "loss": 1.7987, + "step": 2940 + }, + { + "epoch": 0.20459842081463703, + "grad_norm": 2.0625, + "learning_rate": 0.0018443955328761579, + "loss": 1.6765, + "step": 2941 + }, + { + "epoch": 0.20466798845177223, + "grad_norm": 2.0, + "learning_rate": 0.00184427479599072, + "loss": 1.8651, + "step": 2942 + }, + { + "epoch": 0.20473755608890745, + "grad_norm": 3.046875, + "learning_rate": 0.0018441540162372632, + "loss": 1.6984, + "step": 2943 + }, + { + "epoch": 0.20480712372604265, + "grad_norm": 2.65625, + "learning_rate": 0.0018440331936219207, + "loss": 1.554, + "step": 2944 + }, + { + "epoch": 0.20487669136317785, + "grad_norm": 2.625, + "learning_rate": 0.0018439123281508265, + "loss": 1.8372, + "step": 2945 + }, + { + "epoch": 0.20494625900031305, + "grad_norm": 1.875, + "learning_rate": 0.0018437914198301182, + "loss": 1.6942, + "step": 2946 + }, + { + "epoch": 0.20501582663744825, + "grad_norm": 1.890625, + "learning_rate": 0.0018436704686659346, + "loss": 1.6305, + "step": 2947 + }, + { + "epoch": 0.20508539427458347, + "grad_norm": 2.375, + "learning_rate": 0.0018435494746644168, + "loss": 1.6892, + "step": 2948 + }, + { + "epoch": 0.20515496191171867, + "grad_norm": 2.03125, + "learning_rate": 0.0018434284378317086, + "loss": 1.6875, + "step": 2949 + }, + { + "epoch": 0.20522452954885387, + "grad_norm": 2.28125, + "learning_rate": 0.0018433073581739555, + "loss": 1.799, + "step": 2950 + }, + { + "epoch": 0.20529409718598907, + "grad_norm": 2.09375, + "learning_rate": 0.0018431862356973056, + "loss": 1.9487, + "step": 2951 + }, + { + "epoch": 0.2053636648231243, + "grad_norm": 1.9609375, + "learning_rate": 0.001843065070407908, + "loss": 1.5475, + "step": 2952 + }, + { + "epoch": 0.2054332324602595, + "grad_norm": 2.0, + "learning_rate": 0.0018429438623119162, + "loss": 1.4665, + "step": 2953 + }, + { + "epoch": 0.2055028000973947, + "grad_norm": 1.9296875, + "learning_rate": 0.0018428226114154832, + "loss": 1.5239, + "step": 2954 + }, + { + "epoch": 0.2055723677345299, + "grad_norm": 2.015625, + "learning_rate": 0.0018427013177247664, + "loss": 1.602, + "step": 2955 + }, + { + "epoch": 0.2056419353716651, + "grad_norm": 1.7109375, + "learning_rate": 0.0018425799812459244, + "loss": 1.4334, + "step": 2956 + }, + { + "epoch": 0.2057115030088003, + "grad_norm": 2.40625, + "learning_rate": 0.0018424586019851175, + "loss": 1.825, + "step": 2957 + }, + { + "epoch": 0.2057810706459355, + "grad_norm": 2.796875, + "learning_rate": 0.0018423371799485095, + "loss": 1.6815, + "step": 2958 + }, + { + "epoch": 0.2058506382830707, + "grad_norm": 1.828125, + "learning_rate": 0.001842215715142265, + "loss": 1.6816, + "step": 2959 + }, + { + "epoch": 0.2059202059202059, + "grad_norm": 1.9609375, + "learning_rate": 0.0018420942075725514, + "loss": 1.4627, + "step": 2960 + }, + { + "epoch": 0.20598977355734113, + "grad_norm": 2.375, + "learning_rate": 0.0018419726572455387, + "loss": 1.7165, + "step": 2961 + }, + { + "epoch": 0.20605934119447633, + "grad_norm": 2.21875, + "learning_rate": 0.0018418510641673982, + "loss": 1.6336, + "step": 2962 + }, + { + "epoch": 0.20612890883161153, + "grad_norm": 1.640625, + "learning_rate": 0.001841729428344304, + "loss": 1.2457, + "step": 2963 + }, + { + "epoch": 0.20619847646874673, + "grad_norm": 1.6796875, + "learning_rate": 0.001841607749782432, + "loss": 1.6383, + "step": 2964 + }, + { + "epoch": 0.20626804410588195, + "grad_norm": 1.703125, + "learning_rate": 0.0018414860284879603, + "loss": 1.4405, + "step": 2965 + }, + { + "epoch": 0.20633761174301715, + "grad_norm": 2.0, + "learning_rate": 0.0018413642644670696, + "loss": 1.8695, + "step": 2966 + }, + { + "epoch": 0.20640717938015235, + "grad_norm": 1.5703125, + "learning_rate": 0.0018412424577259423, + "loss": 1.6998, + "step": 2967 + }, + { + "epoch": 0.20647674701728755, + "grad_norm": 1.7578125, + "learning_rate": 0.0018411206082707633, + "loss": 1.4037, + "step": 2968 + }, + { + "epoch": 0.20654631465442277, + "grad_norm": 2.3125, + "learning_rate": 0.0018409987161077193, + "loss": 1.4348, + "step": 2969 + }, + { + "epoch": 0.20661588229155797, + "grad_norm": 1.8828125, + "learning_rate": 0.0018408767812429993, + "loss": 1.6154, + "step": 2970 + }, + { + "epoch": 0.20668544992869317, + "grad_norm": 1.8046875, + "learning_rate": 0.001840754803682795, + "loss": 1.5229, + "step": 2971 + }, + { + "epoch": 0.20675501756582837, + "grad_norm": 1.9296875, + "learning_rate": 0.0018406327834332994, + "loss": 1.7966, + "step": 2972 + }, + { + "epoch": 0.20682458520296357, + "grad_norm": 1.984375, + "learning_rate": 0.0018405107205007082, + "loss": 1.7795, + "step": 2973 + }, + { + "epoch": 0.2068941528400988, + "grad_norm": 1.7109375, + "learning_rate": 0.0018403886148912188, + "loss": 1.6348, + "step": 2974 + }, + { + "epoch": 0.206963720477234, + "grad_norm": 2.03125, + "learning_rate": 0.0018402664666110316, + "loss": 1.429, + "step": 2975 + }, + { + "epoch": 0.2070332881143692, + "grad_norm": 1.8046875, + "learning_rate": 0.0018401442756663484, + "loss": 1.4652, + "step": 2976 + }, + { + "epoch": 0.2071028557515044, + "grad_norm": 1.7578125, + "learning_rate": 0.0018400220420633736, + "loss": 1.3845, + "step": 2977 + }, + { + "epoch": 0.20717242338863961, + "grad_norm": 1.9609375, + "learning_rate": 0.0018398997658083136, + "loss": 1.7564, + "step": 2978 + }, + { + "epoch": 0.2072419910257748, + "grad_norm": 2.046875, + "learning_rate": 0.0018397774469073767, + "loss": 1.7363, + "step": 2979 + }, + { + "epoch": 0.20731155866291, + "grad_norm": 1.640625, + "learning_rate": 0.0018396550853667741, + "loss": 1.3616, + "step": 2980 + }, + { + "epoch": 0.2073811263000452, + "grad_norm": 1.6953125, + "learning_rate": 0.0018395326811927182, + "loss": 1.2341, + "step": 2981 + }, + { + "epoch": 0.20745069393718044, + "grad_norm": 1.6015625, + "learning_rate": 0.0018394102343914245, + "loss": 1.4868, + "step": 2982 + }, + { + "epoch": 0.20752026157431563, + "grad_norm": 1.921875, + "learning_rate": 0.0018392877449691098, + "loss": 1.5404, + "step": 2983 + }, + { + "epoch": 0.20758982921145083, + "grad_norm": 2.96875, + "learning_rate": 0.0018391652129319941, + "loss": 1.5061, + "step": 2984 + }, + { + "epoch": 0.20765939684858603, + "grad_norm": 1.671875, + "learning_rate": 0.001839042638286298, + "loss": 1.4491, + "step": 2985 + }, + { + "epoch": 0.20772896448572123, + "grad_norm": 2.046875, + "learning_rate": 0.0018389200210382464, + "loss": 1.4995, + "step": 2986 + }, + { + "epoch": 0.20779853212285646, + "grad_norm": 1.8828125, + "learning_rate": 0.0018387973611940645, + "loss": 1.7329, + "step": 2987 + }, + { + "epoch": 0.20786809975999165, + "grad_norm": 2.3125, + "learning_rate": 0.0018386746587599804, + "loss": 1.326, + "step": 2988 + }, + { + "epoch": 0.20793766739712685, + "grad_norm": 1.7421875, + "learning_rate": 0.001838551913742224, + "loss": 1.3855, + "step": 2989 + }, + { + "epoch": 0.20800723503426205, + "grad_norm": 1.6484375, + "learning_rate": 0.0018384291261470285, + "loss": 1.4359, + "step": 2990 + }, + { + "epoch": 0.20807680267139728, + "grad_norm": 2.015625, + "learning_rate": 0.0018383062959806279, + "loss": 1.6186, + "step": 2991 + }, + { + "epoch": 0.20814637030853247, + "grad_norm": 1.78125, + "learning_rate": 0.0018381834232492587, + "loss": 1.6195, + "step": 2992 + }, + { + "epoch": 0.20821593794566767, + "grad_norm": 1.4296875, + "learning_rate": 0.0018380605079591603, + "loss": 1.2774, + "step": 2993 + }, + { + "epoch": 0.20828550558280287, + "grad_norm": 1.875, + "learning_rate": 0.0018379375501165734, + "loss": 1.4691, + "step": 2994 + }, + { + "epoch": 0.2083550732199381, + "grad_norm": 1.6796875, + "learning_rate": 0.0018378145497277409, + "loss": 1.3312, + "step": 2995 + }, + { + "epoch": 0.2084246408570733, + "grad_norm": 1.640625, + "learning_rate": 0.001837691506798909, + "loss": 1.5309, + "step": 2996 + }, + { + "epoch": 0.2084942084942085, + "grad_norm": 1.8671875, + "learning_rate": 0.0018375684213363243, + "loss": 1.5467, + "step": 2997 + }, + { + "epoch": 0.2085637761313437, + "grad_norm": 1.7265625, + "learning_rate": 0.001837445293346237, + "loss": 1.6222, + "step": 2998 + }, + { + "epoch": 0.2086333437684789, + "grad_norm": 1.59375, + "learning_rate": 0.0018373221228348987, + "loss": 1.4877, + "step": 2999 + }, + { + "epoch": 0.20870291140561412, + "grad_norm": 2.421875, + "learning_rate": 0.0018371989098085633, + "loss": 1.4715, + "step": 3000 + }, + { + "epoch": 0.20877247904274931, + "grad_norm": 1.9609375, + "learning_rate": 0.0018370756542734872, + "loss": 1.6579, + "step": 3001 + }, + { + "epoch": 0.2088420466798845, + "grad_norm": 2.203125, + "learning_rate": 0.0018369523562359285, + "loss": 1.6065, + "step": 3002 + }, + { + "epoch": 0.2089116143170197, + "grad_norm": 1.75, + "learning_rate": 0.0018368290157021474, + "loss": 1.3236, + "step": 3003 + }, + { + "epoch": 0.20898118195415494, + "grad_norm": 1.640625, + "learning_rate": 0.0018367056326784074, + "loss": 1.462, + "step": 3004 + }, + { + "epoch": 0.20905074959129014, + "grad_norm": 1.796875, + "learning_rate": 0.0018365822071709724, + "loss": 1.4751, + "step": 3005 + }, + { + "epoch": 0.20912031722842533, + "grad_norm": 1.8125, + "learning_rate": 0.0018364587391861095, + "loss": 1.385, + "step": 3006 + }, + { + "epoch": 0.20918988486556053, + "grad_norm": 1.71875, + "learning_rate": 0.0018363352287300877, + "loss": 1.3311, + "step": 3007 + }, + { + "epoch": 0.20925945250269576, + "grad_norm": 2.15625, + "learning_rate": 0.001836211675809179, + "loss": 1.6793, + "step": 3008 + }, + { + "epoch": 0.20932902013983096, + "grad_norm": 1.65625, + "learning_rate": 0.001836088080429656, + "loss": 1.4785, + "step": 3009 + }, + { + "epoch": 0.20939858777696616, + "grad_norm": 1.796875, + "learning_rate": 0.0018359644425977942, + "loss": 1.5472, + "step": 3010 + }, + { + "epoch": 0.20946815541410135, + "grad_norm": 1.96875, + "learning_rate": 0.0018358407623198718, + "loss": 1.3917, + "step": 3011 + }, + { + "epoch": 0.20953772305123655, + "grad_norm": 1.796875, + "learning_rate": 0.0018357170396021685, + "loss": 1.3381, + "step": 3012 + }, + { + "epoch": 0.20960729068837178, + "grad_norm": 1.6640625, + "learning_rate": 0.0018355932744509662, + "loss": 1.3268, + "step": 3013 + }, + { + "epoch": 0.20967685832550698, + "grad_norm": 1.4296875, + "learning_rate": 0.0018354694668725488, + "loss": 1.5767, + "step": 3014 + }, + { + "epoch": 0.20974642596264217, + "grad_norm": 1.546875, + "learning_rate": 0.0018353456168732035, + "loss": 1.5286, + "step": 3015 + }, + { + "epoch": 0.20981599359977737, + "grad_norm": 1.359375, + "learning_rate": 0.001835221724459218, + "loss": 1.3449, + "step": 3016 + }, + { + "epoch": 0.2098855612369126, + "grad_norm": 2.03125, + "learning_rate": 0.0018350977896368832, + "loss": 1.4017, + "step": 3017 + }, + { + "epoch": 0.2099551288740478, + "grad_norm": 1.875, + "learning_rate": 0.0018349738124124918, + "loss": 1.4264, + "step": 3018 + }, + { + "epoch": 0.210024696511183, + "grad_norm": 1.84375, + "learning_rate": 0.0018348497927923387, + "loss": 1.4162, + "step": 3019 + }, + { + "epoch": 0.2100942641483182, + "grad_norm": 1.96875, + "learning_rate": 0.0018347257307827212, + "loss": 1.407, + "step": 3020 + }, + { + "epoch": 0.21016383178545342, + "grad_norm": 1.8984375, + "learning_rate": 0.0018346016263899383, + "loss": 1.4882, + "step": 3021 + }, + { + "epoch": 0.21023339942258862, + "grad_norm": 1.7734375, + "learning_rate": 0.0018344774796202916, + "loss": 1.4429, + "step": 3022 + }, + { + "epoch": 0.21030296705972382, + "grad_norm": 1.3671875, + "learning_rate": 0.0018343532904800846, + "loss": 1.3768, + "step": 3023 + }, + { + "epoch": 0.21037253469685901, + "grad_norm": 1.4921875, + "learning_rate": 0.0018342290589756227, + "loss": 1.2807, + "step": 3024 + }, + { + "epoch": 0.2104421023339942, + "grad_norm": 1.890625, + "learning_rate": 0.0018341047851132143, + "loss": 1.4446, + "step": 3025 + }, + { + "epoch": 0.21051166997112944, + "grad_norm": 2.28125, + "learning_rate": 0.001833980468899169, + "loss": 1.412, + "step": 3026 + }, + { + "epoch": 0.21058123760826464, + "grad_norm": 1.6015625, + "learning_rate": 0.001833856110339799, + "loss": 1.3339, + "step": 3027 + }, + { + "epoch": 0.21065080524539984, + "grad_norm": 1.59375, + "learning_rate": 0.0018337317094414187, + "loss": 1.5386, + "step": 3028 + }, + { + "epoch": 0.21072037288253503, + "grad_norm": 1.375, + "learning_rate": 0.0018336072662103447, + "loss": 1.5191, + "step": 3029 + }, + { + "epoch": 0.21078994051967026, + "grad_norm": 1.4609375, + "learning_rate": 0.0018334827806528954, + "loss": 1.5986, + "step": 3030 + }, + { + "epoch": 0.21085950815680546, + "grad_norm": 1.4921875, + "learning_rate": 0.0018333582527753913, + "loss": 1.4021, + "step": 3031 + }, + { + "epoch": 0.21092907579394066, + "grad_norm": 1.6953125, + "learning_rate": 0.0018332336825841557, + "loss": 1.5484, + "step": 3032 + }, + { + "epoch": 0.21099864343107586, + "grad_norm": 1.5390625, + "learning_rate": 0.0018331090700855134, + "loss": 1.4599, + "step": 3033 + }, + { + "epoch": 0.21106821106821108, + "grad_norm": 1.6796875, + "learning_rate": 0.001832984415285792, + "loss": 1.5886, + "step": 3034 + }, + { + "epoch": 0.21113777870534628, + "grad_norm": 1.6171875, + "learning_rate": 0.0018328597181913203, + "loss": 1.5175, + "step": 3035 + }, + { + "epoch": 0.21120734634248148, + "grad_norm": 1.859375, + "learning_rate": 0.0018327349788084303, + "loss": 1.4476, + "step": 3036 + }, + { + "epoch": 0.21127691397961668, + "grad_norm": 1.71875, + "learning_rate": 0.001832610197143455, + "loss": 1.539, + "step": 3037 + }, + { + "epoch": 0.21134648161675187, + "grad_norm": 1.5546875, + "learning_rate": 0.0018324853732027307, + "loss": 1.6018, + "step": 3038 + }, + { + "epoch": 0.2114160492538871, + "grad_norm": 1.546875, + "learning_rate": 0.0018323605069925954, + "loss": 1.7061, + "step": 3039 + }, + { + "epoch": 0.2114856168910223, + "grad_norm": 1.78125, + "learning_rate": 0.0018322355985193885, + "loss": 1.5131, + "step": 3040 + }, + { + "epoch": 0.2115551845281575, + "grad_norm": 1.5, + "learning_rate": 0.001832110647789453, + "loss": 1.5458, + "step": 3041 + }, + { + "epoch": 0.2116247521652927, + "grad_norm": 1.453125, + "learning_rate": 0.0018319856548091329, + "loss": 1.3315, + "step": 3042 + }, + { + "epoch": 0.21169431980242792, + "grad_norm": 1.40625, + "learning_rate": 0.0018318606195847748, + "loss": 1.2364, + "step": 3043 + }, + { + "epoch": 0.21176388743956312, + "grad_norm": 1.546875, + "learning_rate": 0.0018317355421227273, + "loss": 1.3256, + "step": 3044 + }, + { + "epoch": 0.21183345507669832, + "grad_norm": 1.7265625, + "learning_rate": 0.0018316104224293413, + "loss": 1.2375, + "step": 3045 + }, + { + "epoch": 0.21190302271383352, + "grad_norm": 1.75, + "learning_rate": 0.0018314852605109695, + "loss": 1.2515, + "step": 3046 + }, + { + "epoch": 0.21197259035096874, + "grad_norm": 1.53125, + "learning_rate": 0.0018313600563739673, + "loss": 1.3807, + "step": 3047 + }, + { + "epoch": 0.21204215798810394, + "grad_norm": 1.6171875, + "learning_rate": 0.0018312348100246918, + "loss": 1.3903, + "step": 3048 + }, + { + "epoch": 0.21211172562523914, + "grad_norm": 2.1875, + "learning_rate": 0.0018311095214695024, + "loss": 1.9212, + "step": 3049 + }, + { + "epoch": 0.21218129326237434, + "grad_norm": 1.78125, + "learning_rate": 0.0018309841907147605, + "loss": 1.1514, + "step": 3050 + }, + { + "epoch": 0.21225086089950954, + "grad_norm": 1.5, + "learning_rate": 0.00183085881776683, + "loss": 1.3666, + "step": 3051 + }, + { + "epoch": 0.21232042853664476, + "grad_norm": 1.8125, + "learning_rate": 0.0018307334026320765, + "loss": 1.6806, + "step": 3052 + }, + { + "epoch": 0.21238999617377996, + "grad_norm": 1.53125, + "learning_rate": 0.0018306079453168681, + "loss": 1.5053, + "step": 3053 + }, + { + "epoch": 0.21245956381091516, + "grad_norm": 1.734375, + "learning_rate": 0.0018304824458275745, + "loss": 1.2886, + "step": 3054 + }, + { + "epoch": 0.21252913144805036, + "grad_norm": 1.3359375, + "learning_rate": 0.0018303569041705685, + "loss": 1.4875, + "step": 3055 + }, + { + "epoch": 0.21259869908518558, + "grad_norm": 1.3671875, + "learning_rate": 0.0018302313203522242, + "loss": 1.5887, + "step": 3056 + }, + { + "epoch": 0.21266826672232078, + "grad_norm": 1.59375, + "learning_rate": 0.0018301056943789181, + "loss": 1.4743, + "step": 3057 + }, + { + "epoch": 0.21273783435945598, + "grad_norm": 2.03125, + "learning_rate": 0.001829980026257029, + "loss": 1.2388, + "step": 3058 + }, + { + "epoch": 0.21280740199659118, + "grad_norm": 1.78125, + "learning_rate": 0.0018298543159929372, + "loss": 1.6247, + "step": 3059 + }, + { + "epoch": 0.2128769696337264, + "grad_norm": 1.7734375, + "learning_rate": 0.0018297285635930265, + "loss": 1.4666, + "step": 3060 + }, + { + "epoch": 0.2129465372708616, + "grad_norm": 1.5390625, + "learning_rate": 0.0018296027690636813, + "loss": 1.4244, + "step": 3061 + }, + { + "epoch": 0.2130161049079968, + "grad_norm": 1.4296875, + "learning_rate": 0.001829476932411289, + "loss": 1.3429, + "step": 3062 + }, + { + "epoch": 0.213085672545132, + "grad_norm": 1.96875, + "learning_rate": 0.0018293510536422388, + "loss": 1.1101, + "step": 3063 + }, + { + "epoch": 0.2131552401822672, + "grad_norm": 1.2734375, + "learning_rate": 0.0018292251327629225, + "loss": 1.2017, + "step": 3064 + }, + { + "epoch": 0.21322480781940242, + "grad_norm": 1.328125, + "learning_rate": 0.0018290991697797335, + "loss": 1.3601, + "step": 3065 + }, + { + "epoch": 0.21329437545653762, + "grad_norm": 2.015625, + "learning_rate": 0.0018289731646990678, + "loss": 1.2416, + "step": 3066 + }, + { + "epoch": 0.21336394309367282, + "grad_norm": 1.8125, + "learning_rate": 0.0018288471175273227, + "loss": 1.4516, + "step": 3067 + }, + { + "epoch": 0.21343351073080802, + "grad_norm": 1.8359375, + "learning_rate": 0.001828721028270899, + "loss": 1.245, + "step": 3068 + }, + { + "epoch": 0.21350307836794324, + "grad_norm": 2.1875, + "learning_rate": 0.0018285948969361985, + "loss": 1.4108, + "step": 3069 + }, + { + "epoch": 0.21357264600507844, + "grad_norm": 1.5703125, + "learning_rate": 0.0018284687235296255, + "loss": 1.1665, + "step": 3070 + }, + { + "epoch": 0.21364221364221364, + "grad_norm": 1.9296875, + "learning_rate": 0.0018283425080575866, + "loss": 1.4434, + "step": 3071 + }, + { + "epoch": 0.21371178127934884, + "grad_norm": 1.5390625, + "learning_rate": 0.00182821625052649, + "loss": 1.3012, + "step": 3072 + }, + { + "epoch": 0.21378134891648407, + "grad_norm": 2.046875, + "learning_rate": 0.001828089950942747, + "loss": 1.5307, + "step": 3073 + }, + { + "epoch": 0.21385091655361926, + "grad_norm": 1.4375, + "learning_rate": 0.0018279636093127705, + "loss": 1.18, + "step": 3074 + }, + { + "epoch": 0.21392048419075446, + "grad_norm": 1.5234375, + "learning_rate": 0.0018278372256429747, + "loss": 1.4393, + "step": 3075 + }, + { + "epoch": 0.21399005182788966, + "grad_norm": 1.625, + "learning_rate": 0.0018277107999397774, + "loss": 1.3494, + "step": 3076 + }, + { + "epoch": 0.21405961946502486, + "grad_norm": 1.46875, + "learning_rate": 0.0018275843322095974, + "loss": 1.3145, + "step": 3077 + }, + { + "epoch": 0.21412918710216008, + "grad_norm": 1.578125, + "learning_rate": 0.0018274578224588564, + "loss": 1.6444, + "step": 3078 + }, + { + "epoch": 0.21419875473929528, + "grad_norm": 1.4453125, + "learning_rate": 0.001827331270693978, + "loss": 1.2429, + "step": 3079 + }, + { + "epoch": 0.21426832237643048, + "grad_norm": 1.6640625, + "learning_rate": 0.0018272046769213879, + "loss": 1.5588, + "step": 3080 + }, + { + "epoch": 0.21433789001356568, + "grad_norm": 1.8203125, + "learning_rate": 0.0018270780411475133, + "loss": 1.5714, + "step": 3081 + }, + { + "epoch": 0.2144074576507009, + "grad_norm": 1.6875, + "learning_rate": 0.0018269513633787848, + "loss": 1.2007, + "step": 3082 + }, + { + "epoch": 0.2144770252878361, + "grad_norm": 1.5078125, + "learning_rate": 0.0018268246436216342, + "loss": 1.2613, + "step": 3083 + }, + { + "epoch": 0.2145465929249713, + "grad_norm": 1.5, + "learning_rate": 0.0018266978818824958, + "loss": 1.6095, + "step": 3084 + }, + { + "epoch": 0.2146161605621065, + "grad_norm": 1.5859375, + "learning_rate": 0.0018265710781678055, + "loss": 1.3277, + "step": 3085 + }, + { + "epoch": 0.21468572819924173, + "grad_norm": 1.65625, + "learning_rate": 0.0018264442324840025, + "loss": 1.4692, + "step": 3086 + }, + { + "epoch": 0.21475529583637692, + "grad_norm": 1.4296875, + "learning_rate": 0.0018263173448375266, + "loss": 1.3238, + "step": 3087 + }, + { + "epoch": 0.21482486347351212, + "grad_norm": 1.5390625, + "learning_rate": 0.0018261904152348212, + "loss": 1.2368, + "step": 3088 + }, + { + "epoch": 0.21489443111064732, + "grad_norm": 1.71875, + "learning_rate": 0.0018260634436823304, + "loss": 1.4299, + "step": 3089 + }, + { + "epoch": 0.21496399874778252, + "grad_norm": 1.2734375, + "learning_rate": 0.001825936430186502, + "loss": 1.3339, + "step": 3090 + }, + { + "epoch": 0.21503356638491775, + "grad_norm": 1.3828125, + "learning_rate": 0.0018258093747537845, + "loss": 1.3982, + "step": 3091 + }, + { + "epoch": 0.21510313402205294, + "grad_norm": 2.25, + "learning_rate": 0.0018256822773906297, + "loss": 1.451, + "step": 3092 + }, + { + "epoch": 0.21517270165918814, + "grad_norm": 1.8203125, + "learning_rate": 0.00182555513810349, + "loss": 1.56, + "step": 3093 + }, + { + "epoch": 0.21524226929632334, + "grad_norm": 1.703125, + "learning_rate": 0.0018254279568988218, + "loss": 1.1239, + "step": 3094 + }, + { + "epoch": 0.21531183693345857, + "grad_norm": 1.5, + "learning_rate": 0.0018253007337830824, + "loss": 1.2984, + "step": 3095 + }, + { + "epoch": 0.21538140457059377, + "grad_norm": 1.75, + "learning_rate": 0.0018251734687627318, + "loss": 1.0488, + "step": 3096 + }, + { + "epoch": 0.21545097220772896, + "grad_norm": 1.765625, + "learning_rate": 0.0018250461618442312, + "loss": 1.0971, + "step": 3097 + }, + { + "epoch": 0.21552053984486416, + "grad_norm": 1.484375, + "learning_rate": 0.0018249188130340453, + "loss": 1.3812, + "step": 3098 + }, + { + "epoch": 0.21559010748199936, + "grad_norm": 1.3359375, + "learning_rate": 0.0018247914223386398, + "loss": 1.4449, + "step": 3099 + }, + { + "epoch": 0.2156596751191346, + "grad_norm": 1.5, + "learning_rate": 0.0018246639897644835, + "loss": 1.3833, + "step": 3100 + }, + { + "epoch": 0.21572924275626978, + "grad_norm": 1.265625, + "learning_rate": 0.001824536515318046, + "loss": 1.4897, + "step": 3101 + }, + { + "epoch": 0.21579881039340498, + "grad_norm": 2.21875, + "learning_rate": 0.0018244089990058004, + "loss": 1.369, + "step": 3102 + }, + { + "epoch": 0.21586837803054018, + "grad_norm": 2.15625, + "learning_rate": 0.0018242814408342212, + "loss": 1.5356, + "step": 3103 + }, + { + "epoch": 0.2159379456676754, + "grad_norm": 1.734375, + "learning_rate": 0.0018241538408097849, + "loss": 1.2144, + "step": 3104 + }, + { + "epoch": 0.2160075133048106, + "grad_norm": 1.375, + "learning_rate": 0.001824026198938971, + "loss": 1.2871, + "step": 3105 + }, + { + "epoch": 0.2160770809419458, + "grad_norm": 1.359375, + "learning_rate": 0.0018238985152282598, + "loss": 1.4326, + "step": 3106 + }, + { + "epoch": 0.216146648579081, + "grad_norm": 1.4609375, + "learning_rate": 0.0018237707896841347, + "loss": 1.3786, + "step": 3107 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 2.015625, + "learning_rate": 0.0018236430223130813, + "loss": 1.4567, + "step": 3108 + }, + { + "epoch": 0.21628578385335143, + "grad_norm": 1.5703125, + "learning_rate": 0.0018235152131215867, + "loss": 1.397, + "step": 3109 + }, + { + "epoch": 0.21635535149048662, + "grad_norm": 2.03125, + "learning_rate": 0.0018233873621161401, + "loss": 1.3467, + "step": 3110 + }, + { + "epoch": 0.21642491912762182, + "grad_norm": 2.0625, + "learning_rate": 0.0018232594693032337, + "loss": 1.4307, + "step": 3111 + }, + { + "epoch": 0.21649448676475702, + "grad_norm": 1.3125, + "learning_rate": 0.001823131534689361, + "loss": 1.3591, + "step": 3112 + }, + { + "epoch": 0.21656405440189225, + "grad_norm": 1.5546875, + "learning_rate": 0.0018230035582810175, + "loss": 1.3185, + "step": 3113 + }, + { + "epoch": 0.21663362203902745, + "grad_norm": 1.5703125, + "learning_rate": 0.0018228755400847016, + "loss": 1.3026, + "step": 3114 + }, + { + "epoch": 0.21670318967616264, + "grad_norm": 1.3125, + "learning_rate": 0.0018227474801069136, + "loss": 1.2597, + "step": 3115 + }, + { + "epoch": 0.21677275731329784, + "grad_norm": 1.6484375, + "learning_rate": 0.0018226193783541557, + "loss": 1.2826, + "step": 3116 + }, + { + "epoch": 0.21684232495043307, + "grad_norm": 1.765625, + "learning_rate": 0.0018224912348329316, + "loss": 1.2702, + "step": 3117 + }, + { + "epoch": 0.21691189258756827, + "grad_norm": 1.46875, + "learning_rate": 0.0018223630495497484, + "loss": 1.1387, + "step": 3118 + }, + { + "epoch": 0.21698146022470347, + "grad_norm": 1.4765625, + "learning_rate": 0.001822234822511115, + "loss": 1.3643, + "step": 3119 + }, + { + "epoch": 0.21705102786183866, + "grad_norm": 1.3828125, + "learning_rate": 0.0018221065537235412, + "loss": 1.0964, + "step": 3120 + }, + { + "epoch": 0.2171205954989739, + "grad_norm": 1.3984375, + "learning_rate": 0.0018219782431935405, + "loss": 1.242, + "step": 3121 + }, + { + "epoch": 0.2171901631361091, + "grad_norm": 1.4921875, + "learning_rate": 0.0018218498909276276, + "loss": 1.3961, + "step": 3122 + }, + { + "epoch": 0.2172597307732443, + "grad_norm": 1.5078125, + "learning_rate": 0.0018217214969323198, + "loss": 1.3259, + "step": 3123 + }, + { + "epoch": 0.21732929841037948, + "grad_norm": 1.6796875, + "learning_rate": 0.001821593061214136, + "loss": 1.1908, + "step": 3124 + }, + { + "epoch": 0.21739886604751468, + "grad_norm": 1.875, + "learning_rate": 0.0018214645837795979, + "loss": 1.5525, + "step": 3125 + }, + { + "epoch": 0.2174684336846499, + "grad_norm": 1.5078125, + "learning_rate": 0.0018213360646352286, + "loss": 1.1041, + "step": 3126 + }, + { + "epoch": 0.2175380013217851, + "grad_norm": 1.53125, + "learning_rate": 0.0018212075037875538, + "loss": 1.4062, + "step": 3127 + }, + { + "epoch": 0.2176075689589203, + "grad_norm": 1.421875, + "learning_rate": 0.001821078901243101, + "loss": 1.3016, + "step": 3128 + }, + { + "epoch": 0.2176771365960555, + "grad_norm": 1.4140625, + "learning_rate": 0.0018209502570084005, + "loss": 1.2827, + "step": 3129 + }, + { + "epoch": 0.21774670423319073, + "grad_norm": 1.4453125, + "learning_rate": 0.0018208215710899833, + "loss": 1.5377, + "step": 3130 + }, + { + "epoch": 0.21781627187032593, + "grad_norm": 1.5703125, + "learning_rate": 0.0018206928434943846, + "loss": 1.3466, + "step": 3131 + }, + { + "epoch": 0.21788583950746113, + "grad_norm": 1.6484375, + "learning_rate": 0.0018205640742281397, + "loss": 1.2992, + "step": 3132 + }, + { + "epoch": 0.21795540714459632, + "grad_norm": 1.3515625, + "learning_rate": 0.001820435263297787, + "loss": 1.181, + "step": 3133 + }, + { + "epoch": 0.21802497478173155, + "grad_norm": 1.6328125, + "learning_rate": 0.0018203064107098666, + "loss": 1.4374, + "step": 3134 + }, + { + "epoch": 0.21809454241886675, + "grad_norm": 1.4296875, + "learning_rate": 0.0018201775164709219, + "loss": 1.1472, + "step": 3135 + }, + { + "epoch": 0.21816411005600195, + "grad_norm": 1.625, + "learning_rate": 0.0018200485805874962, + "loss": 1.2548, + "step": 3136 + }, + { + "epoch": 0.21823367769313715, + "grad_norm": 1.515625, + "learning_rate": 0.0018199196030661375, + "loss": 1.1976, + "step": 3137 + }, + { + "epoch": 0.21830324533027234, + "grad_norm": 1.3359375, + "learning_rate": 0.001819790583913394, + "loss": 1.3096, + "step": 3138 + }, + { + "epoch": 0.21837281296740757, + "grad_norm": 1.609375, + "learning_rate": 0.0018196615231358165, + "loss": 1.3699, + "step": 3139 + }, + { + "epoch": 0.21844238060454277, + "grad_norm": 1.1796875, + "learning_rate": 0.0018195324207399587, + "loss": 1.2893, + "step": 3140 + }, + { + "epoch": 0.21851194824167797, + "grad_norm": 1.3828125, + "learning_rate": 0.0018194032767323747, + "loss": 1.2761, + "step": 3141 + }, + { + "epoch": 0.21858151587881317, + "grad_norm": 1.515625, + "learning_rate": 0.0018192740911196225, + "loss": 1.1881, + "step": 3142 + }, + { + "epoch": 0.2186510835159484, + "grad_norm": 1.375, + "learning_rate": 0.001819144863908262, + "loss": 1.2316, + "step": 3143 + }, + { + "epoch": 0.2187206511530836, + "grad_norm": 1.4453125, + "learning_rate": 0.0018190155951048534, + "loss": 1.3229, + "step": 3144 + }, + { + "epoch": 0.2187902187902188, + "grad_norm": 1.328125, + "learning_rate": 0.0018188862847159616, + "loss": 1.2341, + "step": 3145 + }, + { + "epoch": 0.218859786427354, + "grad_norm": 1.3359375, + "learning_rate": 0.0018187569327481512, + "loss": 1.1094, + "step": 3146 + }, + { + "epoch": 0.2189293540644892, + "grad_norm": 1.7578125, + "learning_rate": 0.001818627539207991, + "loss": 1.2033, + "step": 3147 + }, + { + "epoch": 0.2189989217016244, + "grad_norm": 1.4609375, + "learning_rate": 0.0018184981041020505, + "loss": 1.0848, + "step": 3148 + }, + { + "epoch": 0.2190684893387596, + "grad_norm": 2.265625, + "learning_rate": 0.0018183686274369016, + "loss": 1.4267, + "step": 3149 + }, + { + "epoch": 0.2191380569758948, + "grad_norm": 1.5859375, + "learning_rate": 0.001818239109219119, + "loss": 1.2773, + "step": 3150 + }, + { + "epoch": 0.21920762461303, + "grad_norm": 1.34375, + "learning_rate": 0.0018181095494552784, + "loss": 1.3228, + "step": 3151 + }, + { + "epoch": 0.21927719225016523, + "grad_norm": 1.515625, + "learning_rate": 0.0018179799481519586, + "loss": 1.2327, + "step": 3152 + }, + { + "epoch": 0.21934675988730043, + "grad_norm": 1.3359375, + "learning_rate": 0.00181785030531574, + "loss": 1.3893, + "step": 3153 + }, + { + "epoch": 0.21941632752443563, + "grad_norm": 1.5078125, + "learning_rate": 0.001817720620953205, + "loss": 1.3026, + "step": 3154 + }, + { + "epoch": 0.21948589516157083, + "grad_norm": 1.515625, + "learning_rate": 0.0018175908950709384, + "loss": 1.1513, + "step": 3155 + }, + { + "epoch": 0.21955546279870605, + "grad_norm": 1.5, + "learning_rate": 0.0018174611276755273, + "loss": 1.239, + "step": 3156 + }, + { + "epoch": 0.21962503043584125, + "grad_norm": 1.6640625, + "learning_rate": 0.0018173313187735602, + "loss": 1.1086, + "step": 3157 + }, + { + "epoch": 0.21969459807297645, + "grad_norm": 1.4453125, + "learning_rate": 0.0018172014683716287, + "loss": 1.2177, + "step": 3158 + }, + { + "epoch": 0.21976416571011165, + "grad_norm": 1.59375, + "learning_rate": 0.0018170715764763254, + "loss": 1.1821, + "step": 3159 + }, + { + "epoch": 0.21983373334724687, + "grad_norm": 1.4921875, + "learning_rate": 0.0018169416430942461, + "loss": 1.7046, + "step": 3160 + }, + { + "epoch": 0.21990330098438207, + "grad_norm": 1.40625, + "learning_rate": 0.0018168116682319875, + "loss": 1.3001, + "step": 3161 + }, + { + "epoch": 0.21997286862151727, + "grad_norm": 1.171875, + "learning_rate": 0.0018166816518961498, + "loss": 1.2037, + "step": 3162 + }, + { + "epoch": 0.22004243625865247, + "grad_norm": 1.5078125, + "learning_rate": 0.001816551594093334, + "loss": 1.4909, + "step": 3163 + }, + { + "epoch": 0.22011200389578767, + "grad_norm": 1.2734375, + "learning_rate": 0.001816421494830144, + "loss": 1.2482, + "step": 3164 + }, + { + "epoch": 0.2201815715329229, + "grad_norm": 1.4453125, + "learning_rate": 0.0018162913541131856, + "loss": 1.5315, + "step": 3165 + }, + { + "epoch": 0.2202511391700581, + "grad_norm": 1.625, + "learning_rate": 0.0018161611719490663, + "loss": 1.2521, + "step": 3166 + }, + { + "epoch": 0.2203207068071933, + "grad_norm": 1.4375, + "learning_rate": 0.0018160309483443969, + "loss": 1.2491, + "step": 3167 + }, + { + "epoch": 0.2203902744443285, + "grad_norm": 1.4453125, + "learning_rate": 0.001815900683305789, + "loss": 1.3487, + "step": 3168 + }, + { + "epoch": 0.2204598420814637, + "grad_norm": 1.453125, + "learning_rate": 0.0018157703768398566, + "loss": 1.2102, + "step": 3169 + }, + { + "epoch": 0.2205294097185989, + "grad_norm": 1.5703125, + "learning_rate": 0.0018156400289532164, + "loss": 1.2524, + "step": 3170 + }, + { + "epoch": 0.2205989773557341, + "grad_norm": 1.140625, + "learning_rate": 0.0018155096396524867, + "loss": 1.3116, + "step": 3171 + }, + { + "epoch": 0.2206685449928693, + "grad_norm": 1.2734375, + "learning_rate": 0.0018153792089442879, + "loss": 1.3005, + "step": 3172 + }, + { + "epoch": 0.22073811263000453, + "grad_norm": 1.5859375, + "learning_rate": 0.0018152487368352426, + "loss": 1.3617, + "step": 3173 + }, + { + "epoch": 0.22080768026713973, + "grad_norm": 1.296875, + "learning_rate": 0.0018151182233319756, + "loss": 1.3319, + "step": 3174 + }, + { + "epoch": 0.22087724790427493, + "grad_norm": 1.25, + "learning_rate": 0.001814987668441114, + "loss": 1.1319, + "step": 3175 + }, + { + "epoch": 0.22094681554141013, + "grad_norm": 1.546875, + "learning_rate": 0.0018148570721692862, + "loss": 1.1426, + "step": 3176 + }, + { + "epoch": 0.22101638317854533, + "grad_norm": 1.328125, + "learning_rate": 0.0018147264345231234, + "loss": 1.5089, + "step": 3177 + }, + { + "epoch": 0.22108595081568055, + "grad_norm": 1.3359375, + "learning_rate": 0.001814595755509259, + "loss": 1.2042, + "step": 3178 + }, + { + "epoch": 0.22115551845281575, + "grad_norm": 1.3046875, + "learning_rate": 0.0018144650351343277, + "loss": 1.3103, + "step": 3179 + }, + { + "epoch": 0.22122508608995095, + "grad_norm": 1.2578125, + "learning_rate": 0.0018143342734049672, + "loss": 1.256, + "step": 3180 + }, + { + "epoch": 0.22129465372708615, + "grad_norm": 1.390625, + "learning_rate": 0.0018142034703278172, + "loss": 1.4062, + "step": 3181 + }, + { + "epoch": 0.22136422136422138, + "grad_norm": 1.4453125, + "learning_rate": 0.0018140726259095186, + "loss": 1.5177, + "step": 3182 + }, + { + "epoch": 0.22143378900135657, + "grad_norm": 1.53125, + "learning_rate": 0.0018139417401567153, + "loss": 1.3101, + "step": 3183 + }, + { + "epoch": 0.22150335663849177, + "grad_norm": 1.2109375, + "learning_rate": 0.0018138108130760528, + "loss": 1.2181, + "step": 3184 + }, + { + "epoch": 0.22157292427562697, + "grad_norm": 1.4453125, + "learning_rate": 0.0018136798446741797, + "loss": 1.1762, + "step": 3185 + }, + { + "epoch": 0.2216424919127622, + "grad_norm": 1.34375, + "learning_rate": 0.001813548834957745, + "loss": 1.3214, + "step": 3186 + }, + { + "epoch": 0.2217120595498974, + "grad_norm": 1.2578125, + "learning_rate": 0.0018134177839334007, + "loss": 1.332, + "step": 3187 + }, + { + "epoch": 0.2217816271870326, + "grad_norm": 1.671875, + "learning_rate": 0.0018132866916078017, + "loss": 1.5935, + "step": 3188 + }, + { + "epoch": 0.2218511948241678, + "grad_norm": 1.046875, + "learning_rate": 0.0018131555579876037, + "loss": 1.1784, + "step": 3189 + }, + { + "epoch": 0.221920762461303, + "grad_norm": 1.328125, + "learning_rate": 0.001813024383079465, + "loss": 1.1413, + "step": 3190 + }, + { + "epoch": 0.22199033009843822, + "grad_norm": 1.1484375, + "learning_rate": 0.0018128931668900462, + "loss": 1.2292, + "step": 3191 + }, + { + "epoch": 0.2220598977355734, + "grad_norm": 1.21875, + "learning_rate": 0.0018127619094260095, + "loss": 1.0415, + "step": 3192 + }, + { + "epoch": 0.2221294653727086, + "grad_norm": 1.5078125, + "learning_rate": 0.0018126306106940198, + "loss": 1.2261, + "step": 3193 + }, + { + "epoch": 0.2221990330098438, + "grad_norm": 0.9453125, + "learning_rate": 0.0018124992707007435, + "loss": 0.8691, + "step": 3194 + }, + { + "epoch": 0.22226860064697904, + "grad_norm": 1.2578125, + "learning_rate": 0.0018123678894528498, + "loss": 1.3883, + "step": 3195 + }, + { + "epoch": 0.22233816828411423, + "grad_norm": 1.4609375, + "learning_rate": 0.0018122364669570091, + "loss": 1.3116, + "step": 3196 + }, + { + "epoch": 0.22240773592124943, + "grad_norm": 1.3828125, + "learning_rate": 0.0018121050032198945, + "loss": 1.032, + "step": 3197 + }, + { + "epoch": 0.22247730355838463, + "grad_norm": 1.4375, + "learning_rate": 0.0018119734982481814, + "loss": 1.2981, + "step": 3198 + }, + { + "epoch": 0.22254687119551986, + "grad_norm": 1.5625, + "learning_rate": 0.0018118419520485466, + "loss": 1.3269, + "step": 3199 + }, + { + "epoch": 0.22261643883265506, + "grad_norm": 1.71875, + "learning_rate": 0.0018117103646276692, + "loss": 1.2838, + "step": 3200 + }, + { + "epoch": 0.22268600646979025, + "grad_norm": 1.140625, + "learning_rate": 0.001811578735992231, + "loss": 1.3026, + "step": 3201 + }, + { + "epoch": 0.22275557410692545, + "grad_norm": 1.265625, + "learning_rate": 0.0018114470661489154, + "loss": 1.264, + "step": 3202 + }, + { + "epoch": 0.22282514174406065, + "grad_norm": 1.4453125, + "learning_rate": 0.0018113153551044077, + "loss": 1.1016, + "step": 3203 + }, + { + "epoch": 0.22289470938119588, + "grad_norm": 1.4921875, + "learning_rate": 0.0018111836028653957, + "loss": 1.1228, + "step": 3204 + }, + { + "epoch": 0.22296427701833108, + "grad_norm": 1.1875, + "learning_rate": 0.0018110518094385686, + "loss": 1.4613, + "step": 3205 + }, + { + "epoch": 0.22303384465546627, + "grad_norm": 1.6484375, + "learning_rate": 0.001810919974830619, + "loss": 1.4146, + "step": 3206 + }, + { + "epoch": 0.22310341229260147, + "grad_norm": 1.5, + "learning_rate": 0.0018107880990482403, + "loss": 1.2999, + "step": 3207 + }, + { + "epoch": 0.2231729799297367, + "grad_norm": 1.359375, + "learning_rate": 0.0018106561820981286, + "loss": 1.2888, + "step": 3208 + }, + { + "epoch": 0.2232425475668719, + "grad_norm": 1.25, + "learning_rate": 0.0018105242239869822, + "loss": 1.3011, + "step": 3209 + }, + { + "epoch": 0.2233121152040071, + "grad_norm": 1.890625, + "learning_rate": 0.0018103922247215008, + "loss": 1.2069, + "step": 3210 + }, + { + "epoch": 0.2233816828411423, + "grad_norm": 1.1875, + "learning_rate": 0.0018102601843083869, + "loss": 1.0903, + "step": 3211 + }, + { + "epoch": 0.22345125047827752, + "grad_norm": 1.28125, + "learning_rate": 0.0018101281027543448, + "loss": 1.044, + "step": 3212 + }, + { + "epoch": 0.22352081811541272, + "grad_norm": 1.3359375, + "learning_rate": 0.0018099959800660812, + "loss": 1.3568, + "step": 3213 + }, + { + "epoch": 0.22359038575254792, + "grad_norm": 1.578125, + "learning_rate": 0.0018098638162503042, + "loss": 1.4231, + "step": 3214 + }, + { + "epoch": 0.2236599533896831, + "grad_norm": 1.3984375, + "learning_rate": 0.001809731611313725, + "loss": 1.0716, + "step": 3215 + }, + { + "epoch": 0.2237295210268183, + "grad_norm": 1.578125, + "learning_rate": 0.0018095993652630555, + "loss": 1.3694, + "step": 3216 + }, + { + "epoch": 0.22379908866395354, + "grad_norm": 1.1484375, + "learning_rate": 0.001809467078105011, + "loss": 1.1203, + "step": 3217 + }, + { + "epoch": 0.22386865630108874, + "grad_norm": 1.3671875, + "learning_rate": 0.0018093347498463086, + "loss": 1.2607, + "step": 3218 + }, + { + "epoch": 0.22393822393822393, + "grad_norm": 1.2890625, + "learning_rate": 0.0018092023804936667, + "loss": 1.3865, + "step": 3219 + }, + { + "epoch": 0.22400779157535913, + "grad_norm": 2.28125, + "learning_rate": 0.0018090699700538068, + "loss": 1.1439, + "step": 3220 + }, + { + "epoch": 0.22407735921249436, + "grad_norm": 1.6328125, + "learning_rate": 0.0018089375185334515, + "loss": 1.3804, + "step": 3221 + }, + { + "epoch": 0.22414692684962956, + "grad_norm": 1.3515625, + "learning_rate": 0.0018088050259393268, + "loss": 1.1861, + "step": 3222 + }, + { + "epoch": 0.22421649448676476, + "grad_norm": 1.171875, + "learning_rate": 0.0018086724922781593, + "loss": 1.2895, + "step": 3223 + }, + { + "epoch": 0.22428606212389995, + "grad_norm": 1.1484375, + "learning_rate": 0.0018085399175566783, + "loss": 0.9787, + "step": 3224 + }, + { + "epoch": 0.22435562976103518, + "grad_norm": 1.3984375, + "learning_rate": 0.0018084073017816161, + "loss": 1.5001, + "step": 3225 + }, + { + "epoch": 0.22442519739817038, + "grad_norm": 1.2734375, + "learning_rate": 0.0018082746449597056, + "loss": 1.0478, + "step": 3226 + }, + { + "epoch": 0.22449476503530558, + "grad_norm": 1.3984375, + "learning_rate": 0.0018081419470976827, + "loss": 1.1353, + "step": 3227 + }, + { + "epoch": 0.22456433267244078, + "grad_norm": 1.4921875, + "learning_rate": 0.001808009208202285, + "loss": 1.3408, + "step": 3228 + }, + { + "epoch": 0.22463390030957597, + "grad_norm": 1.8671875, + "learning_rate": 0.0018078764282802526, + "loss": 1.3925, + "step": 3229 + }, + { + "epoch": 0.2247034679467112, + "grad_norm": 1.53125, + "learning_rate": 0.001807743607338327, + "loss": 1.2309, + "step": 3230 + }, + { + "epoch": 0.2247730355838464, + "grad_norm": 1.484375, + "learning_rate": 0.0018076107453832524, + "loss": 1.3426, + "step": 3231 + }, + { + "epoch": 0.2248426032209816, + "grad_norm": 1.484375, + "learning_rate": 0.0018074778424217745, + "loss": 1.1903, + "step": 3232 + }, + { + "epoch": 0.2249121708581168, + "grad_norm": 1.296875, + "learning_rate": 0.0018073448984606423, + "loss": 1.2461, + "step": 3233 + }, + { + "epoch": 0.22498173849525202, + "grad_norm": 1.6875, + "learning_rate": 0.0018072119135066052, + "loss": 1.4229, + "step": 3234 + }, + { + "epoch": 0.22505130613238722, + "grad_norm": 1.3671875, + "learning_rate": 0.0018070788875664157, + "loss": 1.1639, + "step": 3235 + }, + { + "epoch": 0.22512087376952242, + "grad_norm": 1.2421875, + "learning_rate": 0.0018069458206468284, + "loss": 1.0116, + "step": 3236 + }, + { + "epoch": 0.22519044140665762, + "grad_norm": 1.296875, + "learning_rate": 0.0018068127127545998, + "loss": 1.3706, + "step": 3237 + }, + { + "epoch": 0.2252600090437928, + "grad_norm": 1.4453125, + "learning_rate": 0.0018066795638964877, + "loss": 1.1156, + "step": 3238 + }, + { + "epoch": 0.22532957668092804, + "grad_norm": 1.4765625, + "learning_rate": 0.001806546374079254, + "loss": 1.3318, + "step": 3239 + }, + { + "epoch": 0.22539914431806324, + "grad_norm": 1.578125, + "learning_rate": 0.0018064131433096601, + "loss": 1.2423, + "step": 3240 + }, + { + "epoch": 0.22546871195519844, + "grad_norm": 1.2890625, + "learning_rate": 0.0018062798715944718, + "loss": 1.0989, + "step": 3241 + }, + { + "epoch": 0.22553827959233363, + "grad_norm": 1.28125, + "learning_rate": 0.0018061465589404556, + "loss": 1.5471, + "step": 3242 + }, + { + "epoch": 0.22560784722946886, + "grad_norm": 1.40625, + "learning_rate": 0.0018060132053543804, + "loss": 1.1482, + "step": 3243 + }, + { + "epoch": 0.22567741486660406, + "grad_norm": 1.515625, + "learning_rate": 0.0018058798108430167, + "loss": 1.229, + "step": 3244 + }, + { + "epoch": 0.22574698250373926, + "grad_norm": 1.4140625, + "learning_rate": 0.001805746375413139, + "loss": 1.4308, + "step": 3245 + }, + { + "epoch": 0.22581655014087446, + "grad_norm": 1.1953125, + "learning_rate": 0.001805612899071521, + "loss": 1.007, + "step": 3246 + }, + { + "epoch": 0.22588611777800968, + "grad_norm": 1.25, + "learning_rate": 0.0018054793818249406, + "loss": 1.3335, + "step": 3247 + }, + { + "epoch": 0.22595568541514488, + "grad_norm": 1.078125, + "learning_rate": 0.0018053458236801773, + "loss": 1.1016, + "step": 3248 + }, + { + "epoch": 0.22602525305228008, + "grad_norm": 1.5078125, + "learning_rate": 0.0018052122246440124, + "loss": 1.1453, + "step": 3249 + }, + { + "epoch": 0.22609482068941528, + "grad_norm": 1.328125, + "learning_rate": 0.0018050785847232294, + "loss": 1.2315, + "step": 3250 + }, + { + "epoch": 0.22616438832655048, + "grad_norm": 1.3125, + "learning_rate": 0.0018049449039246133, + "loss": 1.1245, + "step": 3251 + }, + { + "epoch": 0.2262339559636857, + "grad_norm": 1.1875, + "learning_rate": 0.0018048111822549524, + "loss": 1.0022, + "step": 3252 + }, + { + "epoch": 0.2263035236008209, + "grad_norm": 1.4375, + "learning_rate": 0.0018046774197210365, + "loss": 1.274, + "step": 3253 + }, + { + "epoch": 0.2263730912379561, + "grad_norm": 1.4375, + "learning_rate": 0.0018045436163296566, + "loss": 1.3615, + "step": 3254 + }, + { + "epoch": 0.2264426588750913, + "grad_norm": 1.3046875, + "learning_rate": 0.0018044097720876077, + "loss": 1.3341, + "step": 3255 + }, + { + "epoch": 0.22651222651222652, + "grad_norm": 1.3828125, + "learning_rate": 0.0018042758870016847, + "loss": 1.013, + "step": 3256 + }, + { + "epoch": 0.22658179414936172, + "grad_norm": 1.8359375, + "learning_rate": 0.001804141961078686, + "loss": 1.3983, + "step": 3257 + }, + { + "epoch": 0.22665136178649692, + "grad_norm": 1.21875, + "learning_rate": 0.0018040079943254118, + "loss": 1.1396, + "step": 3258 + }, + { + "epoch": 0.22672092942363212, + "grad_norm": 1.6953125, + "learning_rate": 0.001803873986748664, + "loss": 1.4228, + "step": 3259 + }, + { + "epoch": 0.22679049706076734, + "grad_norm": 1.296875, + "learning_rate": 0.0018037399383552472, + "loss": 1.3769, + "step": 3260 + }, + { + "epoch": 0.22686006469790254, + "grad_norm": 1.484375, + "learning_rate": 0.001803605849151967, + "loss": 1.754, + "step": 3261 + }, + { + "epoch": 0.22692963233503774, + "grad_norm": 1.1953125, + "learning_rate": 0.0018034717191456327, + "loss": 1.2549, + "step": 3262 + }, + { + "epoch": 0.22699919997217294, + "grad_norm": 1.2734375, + "learning_rate": 0.0018033375483430542, + "loss": 1.3371, + "step": 3263 + }, + { + "epoch": 0.22706876760930814, + "grad_norm": 1.453125, + "learning_rate": 0.0018032033367510443, + "loss": 1.3409, + "step": 3264 + }, + { + "epoch": 0.22713833524644336, + "grad_norm": 1.15625, + "learning_rate": 0.0018030690843764173, + "loss": 1.0501, + "step": 3265 + }, + { + "epoch": 0.22720790288357856, + "grad_norm": 1.203125, + "learning_rate": 0.0018029347912259896, + "loss": 1.2673, + "step": 3266 + }, + { + "epoch": 0.22727747052071376, + "grad_norm": 1.1875, + "learning_rate": 0.0018028004573065806, + "loss": 1.1292, + "step": 3267 + }, + { + "epoch": 0.22734703815784896, + "grad_norm": 2.4375, + "learning_rate": 0.0018026660826250106, + "loss": 1.181, + "step": 3268 + }, + { + "epoch": 0.22741660579498418, + "grad_norm": 1.2421875, + "learning_rate": 0.0018025316671881032, + "loss": 1.2582, + "step": 3269 + }, + { + "epoch": 0.22748617343211938, + "grad_norm": 1.1171875, + "learning_rate": 0.0018023972110026822, + "loss": 1.1066, + "step": 3270 + }, + { + "epoch": 0.22755574106925458, + "grad_norm": 1.3515625, + "learning_rate": 0.0018022627140755754, + "loss": 1.1172, + "step": 3271 + }, + { + "epoch": 0.22762530870638978, + "grad_norm": 1.1796875, + "learning_rate": 0.0018021281764136119, + "loss": 1.0191, + "step": 3272 + }, + { + "epoch": 0.227694876343525, + "grad_norm": 1.0625, + "learning_rate": 0.0018019935980236224, + "loss": 1.1673, + "step": 3273 + }, + { + "epoch": 0.2277644439806602, + "grad_norm": 1.4296875, + "learning_rate": 0.0018018589789124404, + "loss": 1.2633, + "step": 3274 + }, + { + "epoch": 0.2278340116177954, + "grad_norm": 1.109375, + "learning_rate": 0.001801724319086901, + "loss": 1.1363, + "step": 3275 + }, + { + "epoch": 0.2279035792549306, + "grad_norm": 1.2421875, + "learning_rate": 0.0018015896185538418, + "loss": 1.1201, + "step": 3276 + }, + { + "epoch": 0.2279731468920658, + "grad_norm": 1.09375, + "learning_rate": 0.001801454877320102, + "loss": 1.094, + "step": 3277 + }, + { + "epoch": 0.22804271452920102, + "grad_norm": 1.3515625, + "learning_rate": 0.0018013200953925232, + "loss": 1.1318, + "step": 3278 + }, + { + "epoch": 0.22811228216633622, + "grad_norm": 1.2265625, + "learning_rate": 0.001801185272777949, + "loss": 1.079, + "step": 3279 + }, + { + "epoch": 0.22818184980347142, + "grad_norm": 1.6953125, + "learning_rate": 0.001801050409483225, + "loss": 1.4306, + "step": 3280 + }, + { + "epoch": 0.22825141744060662, + "grad_norm": 1.5, + "learning_rate": 0.0018009155055151984, + "loss": 1.4664, + "step": 3281 + }, + { + "epoch": 0.22832098507774184, + "grad_norm": 1.296875, + "learning_rate": 0.0018007805608807198, + "loss": 1.2103, + "step": 3282 + }, + { + "epoch": 0.22839055271487704, + "grad_norm": 1.3203125, + "learning_rate": 0.0018006455755866404, + "loss": 1.1655, + "step": 3283 + }, + { + "epoch": 0.22846012035201224, + "grad_norm": 1.1953125, + "learning_rate": 0.0018005105496398139, + "loss": 1.2991, + "step": 3284 + }, + { + "epoch": 0.22852968798914744, + "grad_norm": 1.6640625, + "learning_rate": 0.0018003754830470968, + "loss": 1.04, + "step": 3285 + }, + { + "epoch": 0.22859925562628267, + "grad_norm": 1.3203125, + "learning_rate": 0.001800240375815347, + "loss": 1.297, + "step": 3286 + }, + { + "epoch": 0.22866882326341786, + "grad_norm": 1.1171875, + "learning_rate": 0.0018001052279514242, + "loss": 1.0963, + "step": 3287 + }, + { + "epoch": 0.22873839090055306, + "grad_norm": 1.3359375, + "learning_rate": 0.001799970039462191, + "loss": 0.9655, + "step": 3288 + }, + { + "epoch": 0.22880795853768826, + "grad_norm": 1.7109375, + "learning_rate": 0.0017998348103545113, + "loss": 1.0901, + "step": 3289 + }, + { + "epoch": 0.22887752617482346, + "grad_norm": 1.2109375, + "learning_rate": 0.0017996995406352513, + "loss": 1.0801, + "step": 3290 + }, + { + "epoch": 0.22894709381195869, + "grad_norm": 1.2890625, + "learning_rate": 0.0017995642303112794, + "loss": 1.3627, + "step": 3291 + }, + { + "epoch": 0.22901666144909388, + "grad_norm": 1.1953125, + "learning_rate": 0.001799428879389466, + "loss": 1.0702, + "step": 3292 + }, + { + "epoch": 0.22908622908622908, + "grad_norm": 1.2734375, + "learning_rate": 0.0017992934878766835, + "loss": 1.3215, + "step": 3293 + }, + { + "epoch": 0.22915579672336428, + "grad_norm": 1.2421875, + "learning_rate": 0.0017991580557798065, + "loss": 1.1234, + "step": 3294 + }, + { + "epoch": 0.2292253643604995, + "grad_norm": 1.1328125, + "learning_rate": 0.0017990225831057114, + "loss": 1.1273, + "step": 3295 + }, + { + "epoch": 0.2292949319976347, + "grad_norm": 1.1796875, + "learning_rate": 0.001798887069861277, + "loss": 1.0268, + "step": 3296 + }, + { + "epoch": 0.2293644996347699, + "grad_norm": 1.0234375, + "learning_rate": 0.0017987515160533837, + "loss": 1.0267, + "step": 3297 + }, + { + "epoch": 0.2294340672719051, + "grad_norm": 1.0859375, + "learning_rate": 0.0017986159216889146, + "loss": 0.9197, + "step": 3298 + }, + { + "epoch": 0.22950363490904033, + "grad_norm": 1.453125, + "learning_rate": 0.0017984802867747542, + "loss": 1.3159, + "step": 3299 + }, + { + "epoch": 0.22957320254617553, + "grad_norm": 1.390625, + "learning_rate": 0.0017983446113177895, + "loss": 1.6719, + "step": 3300 + }, + { + "epoch": 0.22964277018331072, + "grad_norm": 1.1875, + "learning_rate": 0.0017982088953249096, + "loss": 1.0058, + "step": 3301 + }, + { + "epoch": 0.22971233782044592, + "grad_norm": 1.1640625, + "learning_rate": 0.0017980731388030052, + "loss": 1.0851, + "step": 3302 + }, + { + "epoch": 0.22978190545758112, + "grad_norm": 1.15625, + "learning_rate": 0.0017979373417589693, + "loss": 0.8804, + "step": 3303 + }, + { + "epoch": 0.22985147309471635, + "grad_norm": 1.140625, + "learning_rate": 0.0017978015041996969, + "loss": 1.0491, + "step": 3304 + }, + { + "epoch": 0.22992104073185154, + "grad_norm": 1.5234375, + "learning_rate": 0.0017976656261320856, + "loss": 1.2712, + "step": 3305 + }, + { + "epoch": 0.22999060836898674, + "grad_norm": 1.125, + "learning_rate": 0.0017975297075630342, + "loss": 1.0895, + "step": 3306 + }, + { + "epoch": 0.23006017600612194, + "grad_norm": 1.40625, + "learning_rate": 0.0017973937484994443, + "loss": 1.0422, + "step": 3307 + }, + { + "epoch": 0.23012974364325717, + "grad_norm": 1.5703125, + "learning_rate": 0.0017972577489482188, + "loss": 0.9921, + "step": 3308 + }, + { + "epoch": 0.23019931128039237, + "grad_norm": 1.2578125, + "learning_rate": 0.0017971217089162632, + "loss": 0.9485, + "step": 3309 + }, + { + "epoch": 0.23026887891752756, + "grad_norm": 1.171875, + "learning_rate": 0.0017969856284104854, + "loss": 1.1615, + "step": 3310 + }, + { + "epoch": 0.23033844655466276, + "grad_norm": 1.25, + "learning_rate": 0.001796849507437794, + "loss": 1.1171, + "step": 3311 + }, + { + "epoch": 0.230408014191798, + "grad_norm": 1.2421875, + "learning_rate": 0.0017967133460051014, + "loss": 1.1948, + "step": 3312 + }, + { + "epoch": 0.2304775818289332, + "grad_norm": 1.4453125, + "learning_rate": 0.0017965771441193206, + "loss": 1.2799, + "step": 3313 + }, + { + "epoch": 0.23054714946606839, + "grad_norm": 1.1796875, + "learning_rate": 0.0017964409017873675, + "loss": 1.0338, + "step": 3314 + }, + { + "epoch": 0.23061671710320358, + "grad_norm": 1.1171875, + "learning_rate": 0.0017963046190161598, + "loss": 1.2925, + "step": 3315 + }, + { + "epoch": 0.23068628474033878, + "grad_norm": 1.5625, + "learning_rate": 0.0017961682958126174, + "loss": 1.0702, + "step": 3316 + }, + { + "epoch": 0.230755852377474, + "grad_norm": 1.1015625, + "learning_rate": 0.0017960319321836619, + "loss": 1.0485, + "step": 3317 + }, + { + "epoch": 0.2308254200146092, + "grad_norm": 1.375, + "learning_rate": 0.001795895528136217, + "loss": 1.0255, + "step": 3318 + }, + { + "epoch": 0.2308949876517444, + "grad_norm": 1.171875, + "learning_rate": 0.0017957590836772091, + "loss": 1.1028, + "step": 3319 + }, + { + "epoch": 0.2309645552888796, + "grad_norm": 1.359375, + "learning_rate": 0.0017956225988135653, + "loss": 1.2513, + "step": 3320 + }, + { + "epoch": 0.23103412292601483, + "grad_norm": 1.3671875, + "learning_rate": 0.0017954860735522166, + "loss": 1.1436, + "step": 3321 + }, + { + "epoch": 0.23110369056315003, + "grad_norm": 1.2734375, + "learning_rate": 0.0017953495079000945, + "loss": 1.1775, + "step": 3322 + }, + { + "epoch": 0.23117325820028523, + "grad_norm": 1.1953125, + "learning_rate": 0.0017952129018641333, + "loss": 1.1371, + "step": 3323 + }, + { + "epoch": 0.23124282583742042, + "grad_norm": 1.4609375, + "learning_rate": 0.001795076255451269, + "loss": 1.3008, + "step": 3324 + }, + { + "epoch": 0.23131239347455565, + "grad_norm": 1.2734375, + "learning_rate": 0.00179493956866844, + "loss": 1.0581, + "step": 3325 + }, + { + "epoch": 0.23138196111169085, + "grad_norm": 1.1875, + "learning_rate": 0.0017948028415225865, + "loss": 1.0475, + "step": 3326 + }, + { + "epoch": 0.23145152874882605, + "grad_norm": 1.3046875, + "learning_rate": 0.0017946660740206508, + "loss": 1.3079, + "step": 3327 + }, + { + "epoch": 0.23152109638596124, + "grad_norm": 1.234375, + "learning_rate": 0.0017945292661695773, + "loss": 1.2221, + "step": 3328 + }, + { + "epoch": 0.23159066402309644, + "grad_norm": 1.3046875, + "learning_rate": 0.0017943924179763125, + "loss": 1.0878, + "step": 3329 + }, + { + "epoch": 0.23166023166023167, + "grad_norm": 1.5234375, + "learning_rate": 0.0017942555294478044, + "loss": 1.4133, + "step": 3330 + }, + { + "epoch": 0.23172979929736687, + "grad_norm": 1.1953125, + "learning_rate": 0.0017941186005910042, + "loss": 1.1608, + "step": 3331 + }, + { + "epoch": 0.23179936693450207, + "grad_norm": 1.25, + "learning_rate": 0.001793981631412864, + "loss": 1.3104, + "step": 3332 + }, + { + "epoch": 0.23186893457163726, + "grad_norm": 1.2421875, + "learning_rate": 0.0017938446219203385, + "loss": 1.2514, + "step": 3333 + }, + { + "epoch": 0.2319385022087725, + "grad_norm": 1.125, + "learning_rate": 0.0017937075721203843, + "loss": 1.1533, + "step": 3334 + }, + { + "epoch": 0.2320080698459077, + "grad_norm": 1.15625, + "learning_rate": 0.0017935704820199604, + "loss": 1.4404, + "step": 3335 + }, + { + "epoch": 0.2320776374830429, + "grad_norm": 1.109375, + "learning_rate": 0.0017934333516260272, + "loss": 1.0626, + "step": 3336 + }, + { + "epoch": 0.23214720512017809, + "grad_norm": 1.3671875, + "learning_rate": 0.0017932961809455476, + "loss": 1.2449, + "step": 3337 + }, + { + "epoch": 0.2322167727573133, + "grad_norm": 1.234375, + "learning_rate": 0.0017931589699854865, + "loss": 1.0743, + "step": 3338 + }, + { + "epoch": 0.2322863403944485, + "grad_norm": 1.15625, + "learning_rate": 0.0017930217187528106, + "loss": 1.2008, + "step": 3339 + }, + { + "epoch": 0.2323559080315837, + "grad_norm": 1.34375, + "learning_rate": 0.0017928844272544892, + "loss": 1.1208, + "step": 3340 + }, + { + "epoch": 0.2324254756687189, + "grad_norm": 1.203125, + "learning_rate": 0.0017927470954974924, + "loss": 1.2209, + "step": 3341 + }, + { + "epoch": 0.2324950433058541, + "grad_norm": 1.21875, + "learning_rate": 0.0017926097234887944, + "loss": 1.5304, + "step": 3342 + }, + { + "epoch": 0.23256461094298933, + "grad_norm": 0.9921875, + "learning_rate": 0.0017924723112353695, + "loss": 0.9315, + "step": 3343 + }, + { + "epoch": 0.23263417858012453, + "grad_norm": 1.3046875, + "learning_rate": 0.0017923348587441951, + "loss": 1.0834, + "step": 3344 + }, + { + "epoch": 0.23270374621725973, + "grad_norm": 1.2578125, + "learning_rate": 0.0017921973660222502, + "loss": 1.0801, + "step": 3345 + }, + { + "epoch": 0.23277331385439493, + "grad_norm": 1.2109375, + "learning_rate": 0.001792059833076516, + "loss": 0.9739, + "step": 3346 + }, + { + "epoch": 0.23284288149153015, + "grad_norm": 1.5546875, + "learning_rate": 0.0017919222599139758, + "loss": 1.2927, + "step": 3347 + }, + { + "epoch": 0.23291244912866535, + "grad_norm": 1.2578125, + "learning_rate": 0.0017917846465416148, + "loss": 0.9077, + "step": 3348 + }, + { + "epoch": 0.23298201676580055, + "grad_norm": 1.03125, + "learning_rate": 0.00179164699296642, + "loss": 0.7696, + "step": 3349 + }, + { + "epoch": 0.23305158440293575, + "grad_norm": 1.296875, + "learning_rate": 0.0017915092991953815, + "loss": 1.3586, + "step": 3350 + }, + { + "epoch": 0.23312115204007097, + "grad_norm": 1.28125, + "learning_rate": 0.0017913715652354903, + "loss": 1.1076, + "step": 3351 + }, + { + "epoch": 0.23319071967720617, + "grad_norm": 1.046875, + "learning_rate": 0.0017912337910937395, + "loss": 1.0509, + "step": 3352 + }, + { + "epoch": 0.23326028731434137, + "grad_norm": 1.2421875, + "learning_rate": 0.0017910959767771253, + "loss": 1.5079, + "step": 3353 + }, + { + "epoch": 0.23332985495147657, + "grad_norm": 1.078125, + "learning_rate": 0.0017909581222926446, + "loss": 0.9356, + "step": 3354 + }, + { + "epoch": 0.23339942258861177, + "grad_norm": 1.0625, + "learning_rate": 0.001790820227647297, + "loss": 1.0451, + "step": 3355 + }, + { + "epoch": 0.233468990225747, + "grad_norm": 1.0546875, + "learning_rate": 0.0017906822928480848, + "loss": 1.0699, + "step": 3356 + }, + { + "epoch": 0.2335385578628822, + "grad_norm": 1.1953125, + "learning_rate": 0.0017905443179020107, + "loss": 1.1171, + "step": 3357 + }, + { + "epoch": 0.2336081255000174, + "grad_norm": 1.3203125, + "learning_rate": 0.0017904063028160806, + "loss": 1.2825, + "step": 3358 + }, + { + "epoch": 0.2336776931371526, + "grad_norm": 1.25, + "learning_rate": 0.001790268247597303, + "loss": 1.0923, + "step": 3359 + }, + { + "epoch": 0.2337472607742878, + "grad_norm": 1.2890625, + "learning_rate": 0.0017901301522526864, + "loss": 1.2474, + "step": 3360 + }, + { + "epoch": 0.233816828411423, + "grad_norm": 1.1328125, + "learning_rate": 0.0017899920167892436, + "loss": 1.1623, + "step": 3361 + }, + { + "epoch": 0.2338863960485582, + "grad_norm": 1.890625, + "learning_rate": 0.001789853841213988, + "loss": 1.3545, + "step": 3362 + }, + { + "epoch": 0.2339559636856934, + "grad_norm": 1.3984375, + "learning_rate": 0.0017897156255339355, + "loss": 1.0498, + "step": 3363 + }, + { + "epoch": 0.23402553132282863, + "grad_norm": 1.2578125, + "learning_rate": 0.0017895773697561039, + "loss": 1.158, + "step": 3364 + }, + { + "epoch": 0.23409509895996383, + "grad_norm": 0.99609375, + "learning_rate": 0.0017894390738875132, + "loss": 0.9316, + "step": 3365 + }, + { + "epoch": 0.23416466659709903, + "grad_norm": 0.984375, + "learning_rate": 0.0017893007379351854, + "loss": 0.9714, + "step": 3366 + }, + { + "epoch": 0.23423423423423423, + "grad_norm": 1.3984375, + "learning_rate": 0.0017891623619061445, + "loss": 1.3723, + "step": 3367 + }, + { + "epoch": 0.23430380187136943, + "grad_norm": 1.2734375, + "learning_rate": 0.0017890239458074166, + "loss": 1.2839, + "step": 3368 + }, + { + "epoch": 0.23437336950850465, + "grad_norm": 1.3359375, + "learning_rate": 0.0017888854896460297, + "loss": 1.397, + "step": 3369 + }, + { + "epoch": 0.23444293714563985, + "grad_norm": 1.28125, + "learning_rate": 0.0017887469934290139, + "loss": 1.3389, + "step": 3370 + }, + { + "epoch": 0.23451250478277505, + "grad_norm": 1.2578125, + "learning_rate": 0.0017886084571634014, + "loss": 1.0145, + "step": 3371 + }, + { + "epoch": 0.23458207241991025, + "grad_norm": 1.34375, + "learning_rate": 0.0017884698808562263, + "loss": 1.1883, + "step": 3372 + }, + { + "epoch": 0.23465164005704547, + "grad_norm": 1.0078125, + "learning_rate": 0.0017883312645145249, + "loss": 1.115, + "step": 3373 + }, + { + "epoch": 0.23472120769418067, + "grad_norm": 1.2265625, + "learning_rate": 0.0017881926081453354, + "loss": 1.1842, + "step": 3374 + }, + { + "epoch": 0.23479077533131587, + "grad_norm": 1.1171875, + "learning_rate": 0.0017880539117556978, + "loss": 0.9997, + "step": 3375 + }, + { + "epoch": 0.23486034296845107, + "grad_norm": 1.2734375, + "learning_rate": 0.0017879151753526549, + "loss": 0.9747, + "step": 3376 + }, + { + "epoch": 0.2349299106055863, + "grad_norm": 1.140625, + "learning_rate": 0.0017877763989432504, + "loss": 1.0838, + "step": 3377 + }, + { + "epoch": 0.2349994782427215, + "grad_norm": 1.2109375, + "learning_rate": 0.0017876375825345314, + "loss": 1.1765, + "step": 3378 + }, + { + "epoch": 0.2350690458798567, + "grad_norm": 1.2578125, + "learning_rate": 0.001787498726133546, + "loss": 1.1738, + "step": 3379 + }, + { + "epoch": 0.2351386135169919, + "grad_norm": 1.1328125, + "learning_rate": 0.0017873598297473445, + "loss": 1.1137, + "step": 3380 + }, + { + "epoch": 0.2352081811541271, + "grad_norm": 1.2421875, + "learning_rate": 0.0017872208933829793, + "loss": 1.1858, + "step": 3381 + }, + { + "epoch": 0.23527774879126231, + "grad_norm": 1.25, + "learning_rate": 0.0017870819170475053, + "loss": 1.1267, + "step": 3382 + }, + { + "epoch": 0.2353473164283975, + "grad_norm": 1.21875, + "learning_rate": 0.0017869429007479783, + "loss": 1.1509, + "step": 3383 + }, + { + "epoch": 0.2354168840655327, + "grad_norm": 1.1328125, + "learning_rate": 0.0017868038444914577, + "loss": 1.2707, + "step": 3384 + }, + { + "epoch": 0.2354864517026679, + "grad_norm": 1.1015625, + "learning_rate": 0.0017866647482850033, + "loss": 1.1556, + "step": 3385 + }, + { + "epoch": 0.23555601933980314, + "grad_norm": 1.5546875, + "learning_rate": 0.0017865256121356783, + "loss": 1.5673, + "step": 3386 + }, + { + "epoch": 0.23562558697693833, + "grad_norm": 1.2109375, + "learning_rate": 0.001786386436050547, + "loss": 1.1818, + "step": 3387 + }, + { + "epoch": 0.23569515461407353, + "grad_norm": 1.2421875, + "learning_rate": 0.0017862472200366763, + "loss": 1.1394, + "step": 3388 + }, + { + "epoch": 0.23576472225120873, + "grad_norm": 1.3046875, + "learning_rate": 0.0017861079641011345, + "loss": 0.9994, + "step": 3389 + }, + { + "epoch": 0.23583428988834393, + "grad_norm": 1.203125, + "learning_rate": 0.0017859686682509927, + "loss": 1.1313, + "step": 3390 + }, + { + "epoch": 0.23590385752547915, + "grad_norm": 1.453125, + "learning_rate": 0.0017858293324933237, + "loss": 1.2673, + "step": 3391 + }, + { + "epoch": 0.23597342516261435, + "grad_norm": 1.5078125, + "learning_rate": 0.0017856899568352018, + "loss": 1.2771, + "step": 3392 + }, + { + "epoch": 0.23604299279974955, + "grad_norm": 1.046875, + "learning_rate": 0.0017855505412837044, + "loss": 1.0559, + "step": 3393 + }, + { + "epoch": 0.23611256043688475, + "grad_norm": 1.1171875, + "learning_rate": 0.0017854110858459094, + "loss": 1.2803, + "step": 3394 + }, + { + "epoch": 0.23618212807401998, + "grad_norm": 1.21875, + "learning_rate": 0.0017852715905288985, + "loss": 1.2212, + "step": 3395 + }, + { + "epoch": 0.23625169571115517, + "grad_norm": 1.28125, + "learning_rate": 0.0017851320553397545, + "loss": 1.2095, + "step": 3396 + }, + { + "epoch": 0.23632126334829037, + "grad_norm": 1.2265625, + "learning_rate": 0.001784992480285562, + "loss": 1.1702, + "step": 3397 + }, + { + "epoch": 0.23639083098542557, + "grad_norm": 1.4296875, + "learning_rate": 0.0017848528653734079, + "loss": 1.1098, + "step": 3398 + }, + { + "epoch": 0.2364603986225608, + "grad_norm": 1.3671875, + "learning_rate": 0.0017847132106103812, + "loss": 1.2043, + "step": 3399 + }, + { + "epoch": 0.236529966259696, + "grad_norm": 1.28125, + "learning_rate": 0.0017845735160035732, + "loss": 1.3687, + "step": 3400 + }, + { + "epoch": 0.2365995338968312, + "grad_norm": 1.3359375, + "learning_rate": 0.0017844337815600762, + "loss": 1.0551, + "step": 3401 + }, + { + "epoch": 0.2366691015339664, + "grad_norm": 1.109375, + "learning_rate": 0.0017842940072869858, + "loss": 1.0612, + "step": 3402 + }, + { + "epoch": 0.2367386691711016, + "grad_norm": 1.3046875, + "learning_rate": 0.001784154193191399, + "loss": 1.1619, + "step": 3403 + }, + { + "epoch": 0.23680823680823682, + "grad_norm": 1.140625, + "learning_rate": 0.0017840143392804145, + "loss": 1.0387, + "step": 3404 + }, + { + "epoch": 0.23687780444537201, + "grad_norm": 1.6171875, + "learning_rate": 0.0017838744455611337, + "loss": 1.0793, + "step": 3405 + }, + { + "epoch": 0.2369473720825072, + "grad_norm": 1.3828125, + "learning_rate": 0.0017837345120406596, + "loss": 1.1494, + "step": 3406 + }, + { + "epoch": 0.2370169397196424, + "grad_norm": 1.3359375, + "learning_rate": 0.001783594538726097, + "loss": 0.9968, + "step": 3407 + }, + { + "epoch": 0.23708650735677764, + "grad_norm": 1.125, + "learning_rate": 0.0017834545256245535, + "loss": 0.9704, + "step": 3408 + }, + { + "epoch": 0.23715607499391284, + "grad_norm": 1.1953125, + "learning_rate": 0.0017833144727431383, + "loss": 1.1628, + "step": 3409 + }, + { + "epoch": 0.23722564263104803, + "grad_norm": 1.140625, + "learning_rate": 0.0017831743800889623, + "loss": 1.0879, + "step": 3410 + }, + { + "epoch": 0.23729521026818323, + "grad_norm": 1.0546875, + "learning_rate": 0.0017830342476691386, + "loss": 1.3391, + "step": 3411 + }, + { + "epoch": 0.23736477790531846, + "grad_norm": 1.15625, + "learning_rate": 0.0017828940754907828, + "loss": 1.1479, + "step": 3412 + }, + { + "epoch": 0.23743434554245366, + "grad_norm": 1.234375, + "learning_rate": 0.0017827538635610117, + "loss": 1.2333, + "step": 3413 + }, + { + "epoch": 0.23750391317958885, + "grad_norm": 1.0234375, + "learning_rate": 0.0017826136118869447, + "loss": 1.0714, + "step": 3414 + }, + { + "epoch": 0.23757348081672405, + "grad_norm": 1.3203125, + "learning_rate": 0.0017824733204757034, + "loss": 1.0966, + "step": 3415 + }, + { + "epoch": 0.23764304845385925, + "grad_norm": 1.0, + "learning_rate": 0.0017823329893344106, + "loss": 0.9634, + "step": 3416 + }, + { + "epoch": 0.23771261609099448, + "grad_norm": 0.8984375, + "learning_rate": 0.0017821926184701923, + "loss": 0.8942, + "step": 3417 + }, + { + "epoch": 0.23778218372812968, + "grad_norm": 1.2109375, + "learning_rate": 0.001782052207890175, + "loss": 1.1385, + "step": 3418 + }, + { + "epoch": 0.23785175136526487, + "grad_norm": 0.9453125, + "learning_rate": 0.0017819117576014884, + "loss": 0.9763, + "step": 3419 + }, + { + "epoch": 0.23792131900240007, + "grad_norm": 1.125, + "learning_rate": 0.001781771267611264, + "loss": 1.2327, + "step": 3420 + }, + { + "epoch": 0.2379908866395353, + "grad_norm": 1.5078125, + "learning_rate": 0.0017816307379266351, + "loss": 1.1842, + "step": 3421 + }, + { + "epoch": 0.2380604542766705, + "grad_norm": 1.1171875, + "learning_rate": 0.0017814901685547372, + "loss": 1.172, + "step": 3422 + }, + { + "epoch": 0.2381300219138057, + "grad_norm": 1.3984375, + "learning_rate": 0.0017813495595027072, + "loss": 1.2356, + "step": 3423 + }, + { + "epoch": 0.2381995895509409, + "grad_norm": 1.640625, + "learning_rate": 0.0017812089107776847, + "loss": 1.1543, + "step": 3424 + }, + { + "epoch": 0.23826915718807612, + "grad_norm": 1.015625, + "learning_rate": 0.0017810682223868117, + "loss": 1.3024, + "step": 3425 + }, + { + "epoch": 0.23833872482521132, + "grad_norm": 1.265625, + "learning_rate": 0.0017809274943372312, + "loss": 1.1413, + "step": 3426 + }, + { + "epoch": 0.23840829246234652, + "grad_norm": 1.1015625, + "learning_rate": 0.0017807867266360887, + "loss": 1.1717, + "step": 3427 + }, + { + "epoch": 0.23847786009948171, + "grad_norm": 1.40625, + "learning_rate": 0.0017806459192905315, + "loss": 1.2329, + "step": 3428 + }, + { + "epoch": 0.2385474277366169, + "grad_norm": 1.4140625, + "learning_rate": 0.0017805050723077095, + "loss": 1.3393, + "step": 3429 + }, + { + "epoch": 0.23861699537375214, + "grad_norm": 1.125, + "learning_rate": 0.0017803641856947738, + "loss": 1.1232, + "step": 3430 + }, + { + "epoch": 0.23868656301088734, + "grad_norm": 1.2734375, + "learning_rate": 0.0017802232594588778, + "loss": 1.2795, + "step": 3431 + }, + { + "epoch": 0.23875613064802254, + "grad_norm": 1.390625, + "learning_rate": 0.001780082293607178, + "loss": 1.2462, + "step": 3432 + }, + { + "epoch": 0.23882569828515773, + "grad_norm": 0.890625, + "learning_rate": 0.0017799412881468306, + "loss": 1.0268, + "step": 3433 + }, + { + "epoch": 0.23889526592229296, + "grad_norm": 1.09375, + "learning_rate": 0.001779800243084996, + "loss": 1.2895, + "step": 3434 + }, + { + "epoch": 0.23896483355942816, + "grad_norm": 1.1171875, + "learning_rate": 0.0017796591584288356, + "loss": 1.1936, + "step": 3435 + }, + { + "epoch": 0.23903440119656336, + "grad_norm": 1.34375, + "learning_rate": 0.001779518034185513, + "loss": 1.4595, + "step": 3436 + }, + { + "epoch": 0.23910396883369855, + "grad_norm": 1.265625, + "learning_rate": 0.0017793768703621936, + "loss": 1.2653, + "step": 3437 + }, + { + "epoch": 0.23917353647083378, + "grad_norm": 1.109375, + "learning_rate": 0.001779235666966045, + "loss": 0.8807, + "step": 3438 + }, + { + "epoch": 0.23924310410796898, + "grad_norm": 1.4375, + "learning_rate": 0.0017790944240042368, + "loss": 1.3037, + "step": 3439 + }, + { + "epoch": 0.23931267174510418, + "grad_norm": 1.2109375, + "learning_rate": 0.0017789531414839409, + "loss": 1.1275, + "step": 3440 + }, + { + "epoch": 0.23938223938223938, + "grad_norm": 1.1484375, + "learning_rate": 0.0017788118194123307, + "loss": 1.1571, + "step": 3441 + }, + { + "epoch": 0.23945180701937457, + "grad_norm": 1.390625, + "learning_rate": 0.0017786704577965814, + "loss": 1.0926, + "step": 3442 + }, + { + "epoch": 0.2395213746565098, + "grad_norm": 1.265625, + "learning_rate": 0.0017785290566438717, + "loss": 1.2125, + "step": 3443 + }, + { + "epoch": 0.239590942293645, + "grad_norm": 1.15625, + "learning_rate": 0.0017783876159613802, + "loss": 1.1799, + "step": 3444 + }, + { + "epoch": 0.2396605099307802, + "grad_norm": 1.0859375, + "learning_rate": 0.0017782461357562886, + "loss": 1.2677, + "step": 3445 + }, + { + "epoch": 0.2397300775679154, + "grad_norm": 1.3125, + "learning_rate": 0.0017781046160357814, + "loss": 1.1287, + "step": 3446 + }, + { + "epoch": 0.23979964520505062, + "grad_norm": 1.3125, + "learning_rate": 0.0017779630568070435, + "loss": 1.292, + "step": 3447 + }, + { + "epoch": 0.23986921284218582, + "grad_norm": 1.140625, + "learning_rate": 0.0017778214580772627, + "loss": 1.0825, + "step": 3448 + }, + { + "epoch": 0.23993878047932102, + "grad_norm": 1.3515625, + "learning_rate": 0.001777679819853629, + "loss": 1.2443, + "step": 3449 + }, + { + "epoch": 0.24000834811645622, + "grad_norm": 1.3203125, + "learning_rate": 0.001777538142143334, + "loss": 1.2154, + "step": 3450 + }, + { + "epoch": 0.24007791575359144, + "grad_norm": 1.4453125, + "learning_rate": 0.001777396424953571, + "loss": 1.3522, + "step": 3451 + }, + { + "epoch": 0.24014748339072664, + "grad_norm": 1.4296875, + "learning_rate": 0.0017772546682915359, + "loss": 1.2646, + "step": 3452 + }, + { + "epoch": 0.24021705102786184, + "grad_norm": 1.1640625, + "learning_rate": 0.0017771128721644264, + "loss": 0.9994, + "step": 3453 + }, + { + "epoch": 0.24028661866499704, + "grad_norm": 1.375, + "learning_rate": 0.001776971036579442, + "loss": 0.911, + "step": 3454 + }, + { + "epoch": 0.24035618630213224, + "grad_norm": 1.2265625, + "learning_rate": 0.0017768291615437848, + "loss": 1.0556, + "step": 3455 + }, + { + "epoch": 0.24042575393926746, + "grad_norm": 1.5546875, + "learning_rate": 0.0017766872470646583, + "loss": 1.2429, + "step": 3456 + }, + { + "epoch": 0.24049532157640266, + "grad_norm": 1.5546875, + "learning_rate": 0.0017765452931492681, + "loss": 1.3328, + "step": 3457 + }, + { + "epoch": 0.24056488921353786, + "grad_norm": 1.2734375, + "learning_rate": 0.001776403299804822, + "loss": 1.1474, + "step": 3458 + }, + { + "epoch": 0.24063445685067306, + "grad_norm": 1.6015625, + "learning_rate": 0.0017762612670385299, + "loss": 1.0391, + "step": 3459 + }, + { + "epoch": 0.24070402448780828, + "grad_norm": 1.34375, + "learning_rate": 0.001776119194857603, + "loss": 1.2632, + "step": 3460 + }, + { + "epoch": 0.24077359212494348, + "grad_norm": 1.3359375, + "learning_rate": 0.0017759770832692556, + "loss": 1.2716, + "step": 3461 + }, + { + "epoch": 0.24084315976207868, + "grad_norm": 1.1328125, + "learning_rate": 0.001775834932280703, + "loss": 0.9257, + "step": 3462 + }, + { + "epoch": 0.24091272739921388, + "grad_norm": 1.296875, + "learning_rate": 0.001775692741899163, + "loss": 1.1569, + "step": 3463 + }, + { + "epoch": 0.2409822950363491, + "grad_norm": 1.3203125, + "learning_rate": 0.0017755505121318552, + "loss": 1.3196, + "step": 3464 + }, + { + "epoch": 0.2410518626734843, + "grad_norm": 1.34375, + "learning_rate": 0.0017754082429860018, + "loss": 1.1923, + "step": 3465 + }, + { + "epoch": 0.2411214303106195, + "grad_norm": 1.140625, + "learning_rate": 0.001775265934468826, + "loss": 1.182, + "step": 3466 + }, + { + "epoch": 0.2411909979477547, + "grad_norm": 1.2421875, + "learning_rate": 0.0017751235865875537, + "loss": 1.1103, + "step": 3467 + }, + { + "epoch": 0.2412605655848899, + "grad_norm": 1.1328125, + "learning_rate": 0.0017749811993494125, + "loss": 1.133, + "step": 3468 + }, + { + "epoch": 0.24133013322202512, + "grad_norm": 1.15625, + "learning_rate": 0.0017748387727616322, + "loss": 1.1061, + "step": 3469 + }, + { + "epoch": 0.24139970085916032, + "grad_norm": 1.28125, + "learning_rate": 0.0017746963068314447, + "loss": 1.079, + "step": 3470 + }, + { + "epoch": 0.24146926849629552, + "grad_norm": 1.3203125, + "learning_rate": 0.0017745538015660834, + "loss": 1.24, + "step": 3471 + }, + { + "epoch": 0.24153883613343072, + "grad_norm": 1.0625, + "learning_rate": 0.0017744112569727838, + "loss": 1.0469, + "step": 3472 + }, + { + "epoch": 0.24160840377056594, + "grad_norm": 1.1796875, + "learning_rate": 0.0017742686730587841, + "loss": 1.1172, + "step": 3473 + }, + { + "epoch": 0.24167797140770114, + "grad_norm": 1.2578125, + "learning_rate": 0.001774126049831324, + "loss": 1.0453, + "step": 3474 + }, + { + "epoch": 0.24174753904483634, + "grad_norm": 1.3125, + "learning_rate": 0.0017739833872976447, + "loss": 0.9883, + "step": 3475 + }, + { + "epoch": 0.24181710668197154, + "grad_norm": 1.46875, + "learning_rate": 0.0017738406854649902, + "loss": 1.3942, + "step": 3476 + }, + { + "epoch": 0.24188667431910676, + "grad_norm": 1.28125, + "learning_rate": 0.0017736979443406062, + "loss": 0.8944, + "step": 3477 + }, + { + "epoch": 0.24195624195624196, + "grad_norm": 1.28125, + "learning_rate": 0.0017735551639317402, + "loss": 1.2261, + "step": 3478 + }, + { + "epoch": 0.24202580959337716, + "grad_norm": 1.078125, + "learning_rate": 0.0017734123442456422, + "loss": 1.1733, + "step": 3479 + }, + { + "epoch": 0.24209537723051236, + "grad_norm": 1.2109375, + "learning_rate": 0.0017732694852895636, + "loss": 1.0871, + "step": 3480 + }, + { + "epoch": 0.24216494486764756, + "grad_norm": 1.0078125, + "learning_rate": 0.001773126587070758, + "loss": 1.0592, + "step": 3481 + }, + { + "epoch": 0.24223451250478278, + "grad_norm": 1.5625, + "learning_rate": 0.001772983649596481, + "loss": 1.2381, + "step": 3482 + }, + { + "epoch": 0.24230408014191798, + "grad_norm": 1.53125, + "learning_rate": 0.0017728406728739908, + "loss": 0.8463, + "step": 3483 + }, + { + "epoch": 0.24237364777905318, + "grad_norm": 1.0234375, + "learning_rate": 0.0017726976569105463, + "loss": 1.07, + "step": 3484 + }, + { + "epoch": 0.24244321541618838, + "grad_norm": 0.93359375, + "learning_rate": 0.0017725546017134098, + "loss": 1.1426, + "step": 3485 + }, + { + "epoch": 0.2425127830533236, + "grad_norm": 1.140625, + "learning_rate": 0.0017724115072898442, + "loss": 1.222, + "step": 3486 + }, + { + "epoch": 0.2425823506904588, + "grad_norm": 1.4765625, + "learning_rate": 0.0017722683736471159, + "loss": 1.0897, + "step": 3487 + }, + { + "epoch": 0.242651918327594, + "grad_norm": 1.515625, + "learning_rate": 0.001772125200792492, + "loss": 0.8916, + "step": 3488 + }, + { + "epoch": 0.2427214859647292, + "grad_norm": 1.2578125, + "learning_rate": 0.0017719819887332417, + "loss": 1.1342, + "step": 3489 + }, + { + "epoch": 0.24279105360186443, + "grad_norm": 1.125, + "learning_rate": 0.0017718387374766379, + "loss": 0.9353, + "step": 3490 + }, + { + "epoch": 0.24286062123899962, + "grad_norm": 1.1171875, + "learning_rate": 0.001771695447029953, + "loss": 1.2812, + "step": 3491 + }, + { + "epoch": 0.24293018887613482, + "grad_norm": 1.125, + "learning_rate": 0.0017715521174004624, + "loss": 1.2196, + "step": 3492 + }, + { + "epoch": 0.24299975651327002, + "grad_norm": 1.1484375, + "learning_rate": 0.0017714087485954449, + "loss": 1.0235, + "step": 3493 + }, + { + "epoch": 0.24306932415040522, + "grad_norm": 0.9921875, + "learning_rate": 0.001771265340622179, + "loss": 0.9502, + "step": 3494 + }, + { + "epoch": 0.24313889178754045, + "grad_norm": 1.09375, + "learning_rate": 0.0017711218934879467, + "loss": 0.9577, + "step": 3495 + }, + { + "epoch": 0.24320845942467564, + "grad_norm": 1.3125, + "learning_rate": 0.0017709784072000314, + "loss": 1.001, + "step": 3496 + }, + { + "epoch": 0.24327802706181084, + "grad_norm": 1.265625, + "learning_rate": 0.0017708348817657188, + "loss": 1.1289, + "step": 3497 + }, + { + "epoch": 0.24334759469894604, + "grad_norm": 1.296875, + "learning_rate": 0.0017706913171922959, + "loss": 1.1366, + "step": 3498 + }, + { + "epoch": 0.24341716233608127, + "grad_norm": 1.0390625, + "learning_rate": 0.0017705477134870526, + "loss": 1.0603, + "step": 3499 + }, + { + "epoch": 0.24348672997321646, + "grad_norm": 1.171875, + "learning_rate": 0.00177040407065728, + "loss": 1.1888, + "step": 3500 + }, + { + "epoch": 0.24355629761035166, + "grad_norm": 0.9609375, + "learning_rate": 0.0017702603887102721, + "loss": 0.9626, + "step": 3501 + }, + { + "epoch": 0.24362586524748686, + "grad_norm": 1.3046875, + "learning_rate": 0.001770116667653324, + "loss": 1.2676, + "step": 3502 + }, + { + "epoch": 0.2436954328846221, + "grad_norm": 1.28125, + "learning_rate": 0.0017699729074937332, + "loss": 1.0256, + "step": 3503 + }, + { + "epoch": 0.24376500052175729, + "grad_norm": 1.3125, + "learning_rate": 0.001769829108238799, + "loss": 1.3075, + "step": 3504 + }, + { + "epoch": 0.24383456815889248, + "grad_norm": 1.4375, + "learning_rate": 0.001769685269895823, + "loss": 1.1977, + "step": 3505 + }, + { + "epoch": 0.24390413579602768, + "grad_norm": 1.3046875, + "learning_rate": 0.0017695413924721088, + "loss": 1.1401, + "step": 3506 + }, + { + "epoch": 0.24397370343316288, + "grad_norm": 1.1171875, + "learning_rate": 0.0017693974759749609, + "loss": 1.0515, + "step": 3507 + }, + { + "epoch": 0.2440432710702981, + "grad_norm": 1.0, + "learning_rate": 0.0017692535204116876, + "loss": 1.1446, + "step": 3508 + }, + { + "epoch": 0.2441128387074333, + "grad_norm": 1.03125, + "learning_rate": 0.0017691095257895977, + "loss": 1.0242, + "step": 3509 + }, + { + "epoch": 0.2441824063445685, + "grad_norm": 1.265625, + "learning_rate": 0.0017689654921160028, + "loss": 1.2903, + "step": 3510 + }, + { + "epoch": 0.2442519739817037, + "grad_norm": 1.1328125, + "learning_rate": 0.0017688214193982159, + "loss": 0.9139, + "step": 3511 + }, + { + "epoch": 0.24432154161883893, + "grad_norm": 1.265625, + "learning_rate": 0.0017686773076435527, + "loss": 0.9588, + "step": 3512 + }, + { + "epoch": 0.24439110925597413, + "grad_norm": 1.2578125, + "learning_rate": 0.00176853315685933, + "loss": 1.2142, + "step": 3513 + }, + { + "epoch": 0.24446067689310932, + "grad_norm": 1.078125, + "learning_rate": 0.0017683889670528675, + "loss": 1.0694, + "step": 3514 + }, + { + "epoch": 0.24453024453024452, + "grad_norm": 1.1953125, + "learning_rate": 0.0017682447382314861, + "loss": 1.073, + "step": 3515 + }, + { + "epoch": 0.24459981216737975, + "grad_norm": 1.234375, + "learning_rate": 0.0017681004704025091, + "loss": 0.9615, + "step": 3516 + }, + { + "epoch": 0.24466937980451495, + "grad_norm": 1.2265625, + "learning_rate": 0.001767956163573262, + "loss": 0.8559, + "step": 3517 + }, + { + "epoch": 0.24473894744165015, + "grad_norm": 1.0390625, + "learning_rate": 0.0017678118177510713, + "loss": 1.0005, + "step": 3518 + }, + { + "epoch": 0.24480851507878534, + "grad_norm": 1.2578125, + "learning_rate": 0.0017676674329432669, + "loss": 1.0934, + "step": 3519 + }, + { + "epoch": 0.24487808271592054, + "grad_norm": 1.046875, + "learning_rate": 0.0017675230091571791, + "loss": 1.0709, + "step": 3520 + }, + { + "epoch": 0.24494765035305577, + "grad_norm": 1.046875, + "learning_rate": 0.001767378546400142, + "loss": 1.1684, + "step": 3521 + }, + { + "epoch": 0.24501721799019097, + "grad_norm": 1.0859375, + "learning_rate": 0.00176723404467949, + "loss": 1.1248, + "step": 3522 + }, + { + "epoch": 0.24508678562732616, + "grad_norm": 1.03125, + "learning_rate": 0.0017670895040025605, + "loss": 1.1665, + "step": 3523 + }, + { + "epoch": 0.24515635326446136, + "grad_norm": 1.6484375, + "learning_rate": 0.0017669449243766923, + "loss": 1.1104, + "step": 3524 + }, + { + "epoch": 0.2452259209015966, + "grad_norm": 1.234375, + "learning_rate": 0.0017668003058092263, + "loss": 0.9688, + "step": 3525 + }, + { + "epoch": 0.2452954885387318, + "grad_norm": 1.265625, + "learning_rate": 0.001766655648307506, + "loss": 1.1997, + "step": 3526 + }, + { + "epoch": 0.24536505617586699, + "grad_norm": 1.203125, + "learning_rate": 0.001766510951878876, + "loss": 0.9125, + "step": 3527 + }, + { + "epoch": 0.24543462381300218, + "grad_norm": 1.1875, + "learning_rate": 0.0017663662165306833, + "loss": 1.3179, + "step": 3528 + }, + { + "epoch": 0.24550419145013738, + "grad_norm": 1.46875, + "learning_rate": 0.0017662214422702772, + "loss": 1.1226, + "step": 3529 + }, + { + "epoch": 0.2455737590872726, + "grad_norm": 0.9765625, + "learning_rate": 0.0017660766291050082, + "loss": 0.8704, + "step": 3530 + }, + { + "epoch": 0.2456433267244078, + "grad_norm": 1.359375, + "learning_rate": 0.001765931777042229, + "loss": 1.2727, + "step": 3531 + }, + { + "epoch": 0.245712894361543, + "grad_norm": 1.046875, + "learning_rate": 0.001765786886089295, + "loss": 0.854, + "step": 3532 + }, + { + "epoch": 0.2457824619986782, + "grad_norm": 1.3828125, + "learning_rate": 0.0017656419562535625, + "loss": 1.1171, + "step": 3533 + }, + { + "epoch": 0.24585202963581343, + "grad_norm": 1.1484375, + "learning_rate": 0.001765496987542391, + "loss": 0.929, + "step": 3534 + }, + { + "epoch": 0.24592159727294863, + "grad_norm": 1.1171875, + "learning_rate": 0.0017653519799631407, + "loss": 1.2473, + "step": 3535 + }, + { + "epoch": 0.24599116491008383, + "grad_norm": 1.34375, + "learning_rate": 0.0017652069335231744, + "loss": 1.2321, + "step": 3536 + }, + { + "epoch": 0.24606073254721902, + "grad_norm": 1.21875, + "learning_rate": 0.001765061848229857, + "loss": 1.0668, + "step": 3537 + }, + { + "epoch": 0.24613030018435425, + "grad_norm": 1.1484375, + "learning_rate": 0.0017649167240905554, + "loss": 1.1731, + "step": 3538 + }, + { + "epoch": 0.24619986782148945, + "grad_norm": 1.3125, + "learning_rate": 0.0017647715611126375, + "loss": 0.8925, + "step": 3539 + }, + { + "epoch": 0.24626943545862465, + "grad_norm": 1.3515625, + "learning_rate": 0.0017646263593034748, + "loss": 1.1732, + "step": 3540 + }, + { + "epoch": 0.24633900309575985, + "grad_norm": 1.3359375, + "learning_rate": 0.0017644811186704396, + "loss": 1.1054, + "step": 3541 + }, + { + "epoch": 0.24640857073289504, + "grad_norm": 1.265625, + "learning_rate": 0.0017643358392209062, + "loss": 1.2514, + "step": 3542 + }, + { + "epoch": 0.24647813837003027, + "grad_norm": 1.15625, + "learning_rate": 0.0017641905209622518, + "loss": 0.8752, + "step": 3543 + }, + { + "epoch": 0.24654770600716547, + "grad_norm": 1.140625, + "learning_rate": 0.0017640451639018542, + "loss": 1.1269, + "step": 3544 + }, + { + "epoch": 0.24661727364430067, + "grad_norm": 0.96875, + "learning_rate": 0.0017638997680470944, + "loss": 1.1035, + "step": 3545 + }, + { + "epoch": 0.24668684128143586, + "grad_norm": 1.1875, + "learning_rate": 0.001763754333405355, + "loss": 1.3187, + "step": 3546 + }, + { + "epoch": 0.2467564089185711, + "grad_norm": 1.2578125, + "learning_rate": 0.0017636088599840196, + "loss": 1.2283, + "step": 3547 + }, + { + "epoch": 0.2468259765557063, + "grad_norm": 1.1796875, + "learning_rate": 0.0017634633477904755, + "loss": 1.1916, + "step": 3548 + }, + { + "epoch": 0.2468955441928415, + "grad_norm": 1.0546875, + "learning_rate": 0.0017633177968321109, + "loss": 0.9575, + "step": 3549 + }, + { + "epoch": 0.24696511182997669, + "grad_norm": 1.046875, + "learning_rate": 0.0017631722071163156, + "loss": 0.8847, + "step": 3550 + }, + { + "epoch": 0.2470346794671119, + "grad_norm": 1.0625, + "learning_rate": 0.0017630265786504824, + "loss": 0.888, + "step": 3551 + }, + { + "epoch": 0.2471042471042471, + "grad_norm": 1.03125, + "learning_rate": 0.0017628809114420057, + "loss": 0.951, + "step": 3552 + }, + { + "epoch": 0.2471738147413823, + "grad_norm": 1.3125, + "learning_rate": 0.0017627352054982812, + "loss": 1.0284, + "step": 3553 + }, + { + "epoch": 0.2472433823785175, + "grad_norm": 1.2734375, + "learning_rate": 0.0017625894608267077, + "loss": 1.2441, + "step": 3554 + }, + { + "epoch": 0.2473129500156527, + "grad_norm": 1.390625, + "learning_rate": 0.001762443677434685, + "loss": 1.1451, + "step": 3555 + }, + { + "epoch": 0.24738251765278793, + "grad_norm": 1.2109375, + "learning_rate": 0.0017622978553296154, + "loss": 1.1002, + "step": 3556 + }, + { + "epoch": 0.24745208528992313, + "grad_norm": 1.25, + "learning_rate": 0.0017621519945189028, + "loss": 1.0644, + "step": 3557 + }, + { + "epoch": 0.24752165292705833, + "grad_norm": 1.171875, + "learning_rate": 0.0017620060950099537, + "loss": 0.9999, + "step": 3558 + }, + { + "epoch": 0.24759122056419353, + "grad_norm": 1.2578125, + "learning_rate": 0.0017618601568101758, + "loss": 1.3499, + "step": 3559 + }, + { + "epoch": 0.24766078820132875, + "grad_norm": 1.1328125, + "learning_rate": 0.001761714179926979, + "loss": 0.9083, + "step": 3560 + }, + { + "epoch": 0.24773035583846395, + "grad_norm": 1.2421875, + "learning_rate": 0.001761568164367776, + "loss": 0.9532, + "step": 3561 + }, + { + "epoch": 0.24779992347559915, + "grad_norm": 1.390625, + "learning_rate": 0.0017614221101399797, + "loss": 1.064, + "step": 3562 + }, + { + "epoch": 0.24786949111273435, + "grad_norm": 1.4375, + "learning_rate": 0.0017612760172510066, + "loss": 1.0452, + "step": 3563 + }, + { + "epoch": 0.24793905874986957, + "grad_norm": 1.4140625, + "learning_rate": 0.0017611298857082745, + "loss": 0.9763, + "step": 3564 + }, + { + "epoch": 0.24800862638700477, + "grad_norm": 1.0703125, + "learning_rate": 0.0017609837155192032, + "loss": 0.8372, + "step": 3565 + }, + { + "epoch": 0.24807819402413997, + "grad_norm": 1.125, + "learning_rate": 0.0017608375066912143, + "loss": 1.0637, + "step": 3566 + }, + { + "epoch": 0.24814776166127517, + "grad_norm": 0.9765625, + "learning_rate": 0.0017606912592317322, + "loss": 1.0154, + "step": 3567 + }, + { + "epoch": 0.24821732929841037, + "grad_norm": 1.3125, + "learning_rate": 0.0017605449731481816, + "loss": 1.1124, + "step": 3568 + }, + { + "epoch": 0.2482868969355456, + "grad_norm": 1.2578125, + "learning_rate": 0.001760398648447991, + "loss": 1.1471, + "step": 3569 + }, + { + "epoch": 0.2483564645726808, + "grad_norm": 1.296875, + "learning_rate": 0.0017602522851385895, + "loss": 1.1284, + "step": 3570 + }, + { + "epoch": 0.248426032209816, + "grad_norm": 1.0859375, + "learning_rate": 0.001760105883227409, + "loss": 1.1458, + "step": 3571 + }, + { + "epoch": 0.2484955998469512, + "grad_norm": 1.0234375, + "learning_rate": 0.001759959442721883, + "loss": 0.9884, + "step": 3572 + }, + { + "epoch": 0.2485651674840864, + "grad_norm": 1.2734375, + "learning_rate": 0.001759812963629447, + "loss": 1.043, + "step": 3573 + }, + { + "epoch": 0.2486347351212216, + "grad_norm": 1.125, + "learning_rate": 0.0017596664459575385, + "loss": 1.0459, + "step": 3574 + }, + { + "epoch": 0.2487043027583568, + "grad_norm": 1.0546875, + "learning_rate": 0.0017595198897135968, + "loss": 1.1725, + "step": 3575 + }, + { + "epoch": 0.248773870395492, + "grad_norm": 1.140625, + "learning_rate": 0.0017593732949050633, + "loss": 1.0612, + "step": 3576 + }, + { + "epoch": 0.24884343803262723, + "grad_norm": 1.0859375, + "learning_rate": 0.0017592266615393815, + "loss": 0.7711, + "step": 3577 + }, + { + "epoch": 0.24891300566976243, + "grad_norm": 1.2734375, + "learning_rate": 0.0017590799896239969, + "loss": 1.2447, + "step": 3578 + }, + { + "epoch": 0.24898257330689763, + "grad_norm": 0.984375, + "learning_rate": 0.001758933279166356, + "loss": 0.9262, + "step": 3579 + }, + { + "epoch": 0.24905214094403283, + "grad_norm": 1.0625, + "learning_rate": 0.0017587865301739085, + "loss": 1.075, + "step": 3580 + }, + { + "epoch": 0.24912170858116803, + "grad_norm": 1.0859375, + "learning_rate": 0.001758639742654106, + "loss": 1.109, + "step": 3581 + }, + { + "epoch": 0.24919127621830325, + "grad_norm": 1.1015625, + "learning_rate": 0.0017584929166144009, + "loss": 1.0506, + "step": 3582 + }, + { + "epoch": 0.24926084385543845, + "grad_norm": 1.09375, + "learning_rate": 0.0017583460520622482, + "loss": 1.1858, + "step": 3583 + }, + { + "epoch": 0.24933041149257365, + "grad_norm": 1.046875, + "learning_rate": 0.001758199149005106, + "loss": 1.0795, + "step": 3584 + }, + { + "epoch": 0.24939997912970885, + "grad_norm": 0.9296875, + "learning_rate": 0.0017580522074504324, + "loss": 1.1447, + "step": 3585 + }, + { + "epoch": 0.24946954676684407, + "grad_norm": 1.4921875, + "learning_rate": 0.0017579052274056884, + "loss": 0.9276, + "step": 3586 + }, + { + "epoch": 0.24953911440397927, + "grad_norm": 1.265625, + "learning_rate": 0.0017577582088783373, + "loss": 1.0903, + "step": 3587 + }, + { + "epoch": 0.24960868204111447, + "grad_norm": 0.9296875, + "learning_rate": 0.0017576111518758436, + "loss": 0.9308, + "step": 3588 + }, + { + "epoch": 0.24967824967824967, + "grad_norm": 0.890625, + "learning_rate": 0.001757464056405674, + "loss": 0.8891, + "step": 3589 + }, + { + "epoch": 0.2497478173153849, + "grad_norm": 1.484375, + "learning_rate": 0.0017573169224752978, + "loss": 1.0226, + "step": 3590 + }, + { + "epoch": 0.2498173849525201, + "grad_norm": 1.078125, + "learning_rate": 0.0017571697500921857, + "loss": 1.2338, + "step": 3591 + }, + { + "epoch": 0.2498869525896553, + "grad_norm": 1.3125, + "learning_rate": 0.0017570225392638098, + "loss": 0.9897, + "step": 3592 + }, + { + "epoch": 0.2499565202267905, + "grad_norm": 1.34375, + "learning_rate": 0.001756875289997645, + "loss": 1.0784, + "step": 3593 + }, + { + "epoch": 0.2500260878639257, + "grad_norm": 1.609375, + "learning_rate": 0.001756728002301168, + "loss": 1.2579, + "step": 3594 + }, + { + "epoch": 0.2500956555010609, + "grad_norm": 1.171875, + "learning_rate": 0.0017565806761818572, + "loss": 1.1706, + "step": 3595 + }, + { + "epoch": 0.2501652231381961, + "grad_norm": 1.203125, + "learning_rate": 0.001756433311647193, + "loss": 1.167, + "step": 3596 + }, + { + "epoch": 0.25023479077533134, + "grad_norm": 1.078125, + "learning_rate": 0.0017562859087046584, + "loss": 1.1437, + "step": 3597 + }, + { + "epoch": 0.2503043584124665, + "grad_norm": 1.109375, + "learning_rate": 0.001756138467361737, + "loss": 0.9237, + "step": 3598 + }, + { + "epoch": 0.25037392604960174, + "grad_norm": 1.3046875, + "learning_rate": 0.0017559909876259155, + "loss": 1.0883, + "step": 3599 + }, + { + "epoch": 0.2504434936867369, + "grad_norm": 1.0390625, + "learning_rate": 0.0017558434695046819, + "loss": 1.121, + "step": 3600 + }, + { + "epoch": 0.25051306132387213, + "grad_norm": 1.265625, + "learning_rate": 0.0017556959130055267, + "loss": 1.0369, + "step": 3601 + }, + { + "epoch": 0.25058262896100736, + "grad_norm": 1.1328125, + "learning_rate": 0.001755548318135942, + "loss": 1.0228, + "step": 3602 + }, + { + "epoch": 0.25065219659814253, + "grad_norm": 1.171875, + "learning_rate": 0.0017554006849034222, + "loss": 0.9694, + "step": 3603 + }, + { + "epoch": 0.25072176423527776, + "grad_norm": 1.3125, + "learning_rate": 0.0017552530133154631, + "loss": 1.0473, + "step": 3604 + }, + { + "epoch": 0.2507913318724129, + "grad_norm": 1.21875, + "learning_rate": 0.0017551053033795627, + "loss": 1.1891, + "step": 3605 + }, + { + "epoch": 0.25086089950954815, + "grad_norm": 1.1875, + "learning_rate": 0.001754957555103221, + "loss": 0.9631, + "step": 3606 + }, + { + "epoch": 0.2509304671466834, + "grad_norm": 1.078125, + "learning_rate": 0.00175480976849394, + "loss": 0.9455, + "step": 3607 + }, + { + "epoch": 0.25100003478381855, + "grad_norm": 1.046875, + "learning_rate": 0.0017546619435592232, + "loss": 1.0606, + "step": 3608 + }, + { + "epoch": 0.2510696024209538, + "grad_norm": 1.03125, + "learning_rate": 0.001754514080306577, + "loss": 1.0355, + "step": 3609 + }, + { + "epoch": 0.251139170058089, + "grad_norm": 1.4609375, + "learning_rate": 0.0017543661787435085, + "loss": 0.9412, + "step": 3610 + }, + { + "epoch": 0.25120873769522417, + "grad_norm": 1.28125, + "learning_rate": 0.0017542182388775279, + "loss": 1.322, + "step": 3611 + }, + { + "epoch": 0.2512783053323594, + "grad_norm": 1.2890625, + "learning_rate": 0.0017540702607161467, + "loss": 0.9471, + "step": 3612 + }, + { + "epoch": 0.25134787296949457, + "grad_norm": 1.546875, + "learning_rate": 0.0017539222442668784, + "loss": 1.0245, + "step": 3613 + }, + { + "epoch": 0.2514174406066298, + "grad_norm": 1.1171875, + "learning_rate": 0.0017537741895372388, + "loss": 1.2477, + "step": 3614 + }, + { + "epoch": 0.251487008243765, + "grad_norm": 1.6953125, + "learning_rate": 0.0017536260965347447, + "loss": 1.1399, + "step": 3615 + }, + { + "epoch": 0.2515565758809002, + "grad_norm": 1.265625, + "learning_rate": 0.0017534779652669163, + "loss": 1.2683, + "step": 3616 + }, + { + "epoch": 0.2516261435180354, + "grad_norm": 1.0625, + "learning_rate": 0.0017533297957412746, + "loss": 0.993, + "step": 3617 + }, + { + "epoch": 0.2516957111551706, + "grad_norm": 0.99609375, + "learning_rate": 0.0017531815879653432, + "loss": 1.0794, + "step": 3618 + }, + { + "epoch": 0.2517652787923058, + "grad_norm": 1.2421875, + "learning_rate": 0.0017530333419466468, + "loss": 1.1167, + "step": 3619 + }, + { + "epoch": 0.25183484642944104, + "grad_norm": 1.1640625, + "learning_rate": 0.0017528850576927128, + "loss": 1.032, + "step": 3620 + }, + { + "epoch": 0.2519044140665762, + "grad_norm": 1.4140625, + "learning_rate": 0.0017527367352110704, + "loss": 1.1043, + "step": 3621 + }, + { + "epoch": 0.25197398170371144, + "grad_norm": 1.1875, + "learning_rate": 0.0017525883745092509, + "loss": 1.0413, + "step": 3622 + }, + { + "epoch": 0.25204354934084666, + "grad_norm": 1.1171875, + "learning_rate": 0.0017524399755947865, + "loss": 1.0989, + "step": 3623 + }, + { + "epoch": 0.25211311697798183, + "grad_norm": 1.296875, + "learning_rate": 0.0017522915384752134, + "loss": 0.9765, + "step": 3624 + }, + { + "epoch": 0.25218268461511706, + "grad_norm": 1.6640625, + "learning_rate": 0.0017521430631580674, + "loss": 1.2161, + "step": 3625 + }, + { + "epoch": 0.25225225225225223, + "grad_norm": 1.140625, + "learning_rate": 0.001751994549650888, + "loss": 1.4329, + "step": 3626 + }, + { + "epoch": 0.25232181988938746, + "grad_norm": 1.1953125, + "learning_rate": 0.0017518459979612155, + "loss": 1.2085, + "step": 3627 + }, + { + "epoch": 0.2523913875265227, + "grad_norm": 1.1953125, + "learning_rate": 0.001751697408096593, + "loss": 1.2017, + "step": 3628 + }, + { + "epoch": 0.25246095516365785, + "grad_norm": 1.15625, + "learning_rate": 0.0017515487800645647, + "loss": 0.9696, + "step": 3629 + }, + { + "epoch": 0.2525305228007931, + "grad_norm": 0.94140625, + "learning_rate": 0.0017514001138726775, + "loss": 1.0484, + "step": 3630 + }, + { + "epoch": 0.25260009043792825, + "grad_norm": 1.0859375, + "learning_rate": 0.00175125140952848, + "loss": 1.0938, + "step": 3631 + }, + { + "epoch": 0.2526696580750635, + "grad_norm": 1.046875, + "learning_rate": 0.0017511026670395222, + "loss": 1.158, + "step": 3632 + }, + { + "epoch": 0.2527392257121987, + "grad_norm": 1.1484375, + "learning_rate": 0.0017509538864133574, + "loss": 0.862, + "step": 3633 + }, + { + "epoch": 0.25280879334933387, + "grad_norm": 1.3984375, + "learning_rate": 0.0017508050676575389, + "loss": 0.8049, + "step": 3634 + }, + { + "epoch": 0.2528783609864691, + "grad_norm": 0.9140625, + "learning_rate": 0.0017506562107796233, + "loss": 0.9967, + "step": 3635 + }, + { + "epoch": 0.2529479286236043, + "grad_norm": 1.265625, + "learning_rate": 0.001750507315787169, + "loss": 1.1872, + "step": 3636 + }, + { + "epoch": 0.2530174962607395, + "grad_norm": 0.91015625, + "learning_rate": 0.0017503583826877364, + "loss": 1.0083, + "step": 3637 + }, + { + "epoch": 0.2530870638978747, + "grad_norm": 1.203125, + "learning_rate": 0.001750209411488887, + "loss": 1.1361, + "step": 3638 + }, + { + "epoch": 0.2531566315350099, + "grad_norm": 0.9609375, + "learning_rate": 0.0017500604021981848, + "loss": 1.0634, + "step": 3639 + }, + { + "epoch": 0.2532261991721451, + "grad_norm": 1.109375, + "learning_rate": 0.0017499113548231963, + "loss": 0.7385, + "step": 3640 + }, + { + "epoch": 0.25329576680928034, + "grad_norm": 1.140625, + "learning_rate": 0.0017497622693714886, + "loss": 1.0432, + "step": 3641 + }, + { + "epoch": 0.2533653344464155, + "grad_norm": 1.2421875, + "learning_rate": 0.001749613145850632, + "loss": 0.9483, + "step": 3642 + }, + { + "epoch": 0.25343490208355074, + "grad_norm": 1.2109375, + "learning_rate": 0.0017494639842681986, + "loss": 1.1768, + "step": 3643 + }, + { + "epoch": 0.2535044697206859, + "grad_norm": 1.109375, + "learning_rate": 0.0017493147846317613, + "loss": 1.2575, + "step": 3644 + }, + { + "epoch": 0.25357403735782114, + "grad_norm": 1.1640625, + "learning_rate": 0.0017491655469488963, + "loss": 1.0326, + "step": 3645 + }, + { + "epoch": 0.25364360499495636, + "grad_norm": 1.4609375, + "learning_rate": 0.0017490162712271808, + "loss": 1.0368, + "step": 3646 + }, + { + "epoch": 0.25371317263209153, + "grad_norm": 1.3359375, + "learning_rate": 0.0017488669574741943, + "loss": 0.9942, + "step": 3647 + }, + { + "epoch": 0.25378274026922676, + "grad_norm": 1.1640625, + "learning_rate": 0.0017487176056975185, + "loss": 1.1022, + "step": 3648 + }, + { + "epoch": 0.253852307906362, + "grad_norm": 1.2265625, + "learning_rate": 0.001748568215904736, + "loss": 1.1188, + "step": 3649 + }, + { + "epoch": 0.25392187554349716, + "grad_norm": 1.0703125, + "learning_rate": 0.001748418788103433, + "loss": 1.158, + "step": 3650 + }, + { + "epoch": 0.2539914431806324, + "grad_norm": 0.98046875, + "learning_rate": 0.0017482693223011961, + "loss": 1.0782, + "step": 3651 + }, + { + "epoch": 0.25406101081776755, + "grad_norm": 1.09375, + "learning_rate": 0.0017481198185056146, + "loss": 0.964, + "step": 3652 + }, + { + "epoch": 0.2541305784549028, + "grad_norm": 1.140625, + "learning_rate": 0.0017479702767242795, + "loss": 1.0249, + "step": 3653 + }, + { + "epoch": 0.254200146092038, + "grad_norm": 1.09375, + "learning_rate": 0.001747820696964784, + "loss": 1.0348, + "step": 3654 + }, + { + "epoch": 0.2542697137291732, + "grad_norm": 1.265625, + "learning_rate": 0.0017476710792347226, + "loss": 1.1565, + "step": 3655 + }, + { + "epoch": 0.2543392813663084, + "grad_norm": 1.109375, + "learning_rate": 0.0017475214235416923, + "loss": 1.1294, + "step": 3656 + }, + { + "epoch": 0.25440884900344357, + "grad_norm": 1.0703125, + "learning_rate": 0.0017473717298932918, + "loss": 0.8729, + "step": 3657 + }, + { + "epoch": 0.2544784166405788, + "grad_norm": 1.1796875, + "learning_rate": 0.0017472219982971222, + "loss": 0.9156, + "step": 3658 + }, + { + "epoch": 0.254547984277714, + "grad_norm": 1.1171875, + "learning_rate": 0.0017470722287607856, + "loss": 1.1243, + "step": 3659 + }, + { + "epoch": 0.2546175519148492, + "grad_norm": 1.375, + "learning_rate": 0.001746922421291887, + "loss": 1.0525, + "step": 3660 + }, + { + "epoch": 0.2546871195519844, + "grad_norm": 1.171875, + "learning_rate": 0.0017467725758980323, + "loss": 1.0949, + "step": 3661 + }, + { + "epoch": 0.25475668718911965, + "grad_norm": 1.0078125, + "learning_rate": 0.0017466226925868305, + "loss": 0.918, + "step": 3662 + }, + { + "epoch": 0.2548262548262548, + "grad_norm": 1.234375, + "learning_rate": 0.0017464727713658915, + "loss": 1.2199, + "step": 3663 + }, + { + "epoch": 0.25489582246339004, + "grad_norm": 1.3046875, + "learning_rate": 0.0017463228122428275, + "loss": 1.2338, + "step": 3664 + }, + { + "epoch": 0.2549653901005252, + "grad_norm": 1.1171875, + "learning_rate": 0.0017461728152252528, + "loss": 0.93, + "step": 3665 + }, + { + "epoch": 0.25503495773766044, + "grad_norm": 1.078125, + "learning_rate": 0.0017460227803207838, + "loss": 1.1836, + "step": 3666 + }, + { + "epoch": 0.25510452537479567, + "grad_norm": 1.2109375, + "learning_rate": 0.0017458727075370382, + "loss": 0.9921, + "step": 3667 + }, + { + "epoch": 0.25517409301193084, + "grad_norm": 1.0859375, + "learning_rate": 0.001745722596881636, + "loss": 1.1594, + "step": 3668 + }, + { + "epoch": 0.25524366064906606, + "grad_norm": 1.0390625, + "learning_rate": 0.0017455724483621989, + "loss": 1.2506, + "step": 3669 + }, + { + "epoch": 0.25531322828620123, + "grad_norm": 1.4140625, + "learning_rate": 0.001745422261986351, + "loss": 1.1569, + "step": 3670 + }, + { + "epoch": 0.25538279592333646, + "grad_norm": 1.03125, + "learning_rate": 0.0017452720377617178, + "loss": 0.8423, + "step": 3671 + }, + { + "epoch": 0.2554523635604717, + "grad_norm": 1.359375, + "learning_rate": 0.0017451217756959268, + "loss": 1.1653, + "step": 3672 + }, + { + "epoch": 0.25552193119760686, + "grad_norm": 1.0703125, + "learning_rate": 0.001744971475796608, + "loss": 0.7858, + "step": 3673 + }, + { + "epoch": 0.2555914988347421, + "grad_norm": 1.0, + "learning_rate": 0.0017448211380713923, + "loss": 1.1199, + "step": 3674 + }, + { + "epoch": 0.2556610664718773, + "grad_norm": 1.0390625, + "learning_rate": 0.0017446707625279135, + "loss": 0.9626, + "step": 3675 + }, + { + "epoch": 0.2557306341090125, + "grad_norm": 1.03125, + "learning_rate": 0.001744520349173807, + "loss": 0.9203, + "step": 3676 + }, + { + "epoch": 0.2558002017461477, + "grad_norm": 1.0234375, + "learning_rate": 0.0017443698980167096, + "loss": 0.8413, + "step": 3677 + }, + { + "epoch": 0.2558697693832829, + "grad_norm": 1.671875, + "learning_rate": 0.0017442194090642607, + "loss": 1.3561, + "step": 3678 + }, + { + "epoch": 0.2559393370204181, + "grad_norm": 1.3046875, + "learning_rate": 0.0017440688823241012, + "loss": 0.9417, + "step": 3679 + }, + { + "epoch": 0.2560089046575533, + "grad_norm": 1.2578125, + "learning_rate": 0.0017439183178038747, + "loss": 0.8635, + "step": 3680 + }, + { + "epoch": 0.2560784722946885, + "grad_norm": 1.1875, + "learning_rate": 0.0017437677155112252, + "loss": 1.0922, + "step": 3681 + }, + { + "epoch": 0.2561480399318237, + "grad_norm": 1.171875, + "learning_rate": 0.0017436170754538001, + "loss": 1.0156, + "step": 3682 + }, + { + "epoch": 0.2562176075689589, + "grad_norm": 1.1953125, + "learning_rate": 0.0017434663976392483, + "loss": 1.2135, + "step": 3683 + }, + { + "epoch": 0.2562871752060941, + "grad_norm": 1.2734375, + "learning_rate": 0.00174331568207522, + "loss": 1.1355, + "step": 3684 + }, + { + "epoch": 0.25635674284322935, + "grad_norm": 1.5390625, + "learning_rate": 0.0017431649287693678, + "loss": 1.1804, + "step": 3685 + }, + { + "epoch": 0.2564263104803645, + "grad_norm": 0.98828125, + "learning_rate": 0.0017430141377293466, + "loss": 1.0658, + "step": 3686 + }, + { + "epoch": 0.25649587811749974, + "grad_norm": 1.21875, + "learning_rate": 0.0017428633089628122, + "loss": 1.0869, + "step": 3687 + }, + { + "epoch": 0.25656544575463497, + "grad_norm": 0.96484375, + "learning_rate": 0.0017427124424774236, + "loss": 0.7553, + "step": 3688 + }, + { + "epoch": 0.25663501339177014, + "grad_norm": 1.1953125, + "learning_rate": 0.0017425615382808406, + "loss": 1.0089, + "step": 3689 + }, + { + "epoch": 0.25670458102890537, + "grad_norm": 1.015625, + "learning_rate": 0.0017424105963807252, + "loss": 1.1103, + "step": 3690 + }, + { + "epoch": 0.25677414866604054, + "grad_norm": 1.09375, + "learning_rate": 0.0017422596167847421, + "loss": 0.9985, + "step": 3691 + }, + { + "epoch": 0.25684371630317576, + "grad_norm": 1.453125, + "learning_rate": 0.001742108599500557, + "loss": 1.1166, + "step": 3692 + }, + { + "epoch": 0.256913283940311, + "grad_norm": 0.90625, + "learning_rate": 0.0017419575445358376, + "loss": 0.9415, + "step": 3693 + }, + { + "epoch": 0.25698285157744616, + "grad_norm": 1.1328125, + "learning_rate": 0.0017418064518982539, + "loss": 0.9779, + "step": 3694 + }, + { + "epoch": 0.2570524192145814, + "grad_norm": 0.94921875, + "learning_rate": 0.0017416553215954774, + "loss": 1.0644, + "step": 3695 + }, + { + "epoch": 0.25712198685171656, + "grad_norm": 0.94921875, + "learning_rate": 0.0017415041536351819, + "loss": 0.8074, + "step": 3696 + }, + { + "epoch": 0.2571915544888518, + "grad_norm": 1.046875, + "learning_rate": 0.001741352948025043, + "loss": 1.1636, + "step": 3697 + }, + { + "epoch": 0.257261122125987, + "grad_norm": 1.171875, + "learning_rate": 0.001741201704772738, + "loss": 1.1348, + "step": 3698 + }, + { + "epoch": 0.2573306897631222, + "grad_norm": 1.1953125, + "learning_rate": 0.0017410504238859464, + "loss": 0.9444, + "step": 3699 + }, + { + "epoch": 0.2574002574002574, + "grad_norm": 0.984375, + "learning_rate": 0.0017408991053723495, + "loss": 0.9938, + "step": 3700 + }, + { + "epoch": 0.25746982503739263, + "grad_norm": 1.1640625, + "learning_rate": 0.0017407477492396306, + "loss": 1.0754, + "step": 3701 + }, + { + "epoch": 0.2575393926745278, + "grad_norm": 1.1796875, + "learning_rate": 0.0017405963554954745, + "loss": 0.981, + "step": 3702 + }, + { + "epoch": 0.257608960311663, + "grad_norm": 1.2265625, + "learning_rate": 0.0017404449241475682, + "loss": 1.3793, + "step": 3703 + }, + { + "epoch": 0.2576785279487982, + "grad_norm": 1.0625, + "learning_rate": 0.0017402934552036007, + "loss": 1.051, + "step": 3704 + }, + { + "epoch": 0.2577480955859334, + "grad_norm": 1.359375, + "learning_rate": 0.0017401419486712632, + "loss": 1.3719, + "step": 3705 + }, + { + "epoch": 0.25781766322306865, + "grad_norm": 1.59375, + "learning_rate": 0.001739990404558248, + "loss": 1.1657, + "step": 3706 + }, + { + "epoch": 0.2578872308602038, + "grad_norm": 1.0859375, + "learning_rate": 0.00173983882287225, + "loss": 1.097, + "step": 3707 + }, + { + "epoch": 0.25795679849733905, + "grad_norm": 1.2578125, + "learning_rate": 0.0017396872036209655, + "loss": 0.9704, + "step": 3708 + }, + { + "epoch": 0.2580263661344742, + "grad_norm": 1.125, + "learning_rate": 0.001739535546812093, + "loss": 1.1431, + "step": 3709 + }, + { + "epoch": 0.25809593377160944, + "grad_norm": 1.234375, + "learning_rate": 0.0017393838524533333, + "loss": 0.9777, + "step": 3710 + }, + { + "epoch": 0.25816550140874467, + "grad_norm": 1.328125, + "learning_rate": 0.001739232120552388, + "loss": 1.1802, + "step": 3711 + }, + { + "epoch": 0.25823506904587984, + "grad_norm": 1.3046875, + "learning_rate": 0.0017390803511169617, + "loss": 1.0882, + "step": 3712 + }, + { + "epoch": 0.25830463668301507, + "grad_norm": 1.421875, + "learning_rate": 0.0017389285441547606, + "loss": 1.0451, + "step": 3713 + }, + { + "epoch": 0.2583742043201503, + "grad_norm": 1.09375, + "learning_rate": 0.0017387766996734924, + "loss": 1.0731, + "step": 3714 + }, + { + "epoch": 0.25844377195728546, + "grad_norm": 1.0859375, + "learning_rate": 0.0017386248176808673, + "loss": 1.101, + "step": 3715 + }, + { + "epoch": 0.2585133395944207, + "grad_norm": 1.125, + "learning_rate": 0.0017384728981845966, + "loss": 1.0954, + "step": 3716 + }, + { + "epoch": 0.25858290723155586, + "grad_norm": 1.046875, + "learning_rate": 0.0017383209411923944, + "loss": 1.0016, + "step": 3717 + }, + { + "epoch": 0.2586524748686911, + "grad_norm": 0.953125, + "learning_rate": 0.0017381689467119764, + "loss": 1.1121, + "step": 3718 + }, + { + "epoch": 0.2587220425058263, + "grad_norm": 1.3359375, + "learning_rate": 0.0017380169147510594, + "loss": 1.2915, + "step": 3719 + }, + { + "epoch": 0.2587916101429615, + "grad_norm": 1.21875, + "learning_rate": 0.0017378648453173638, + "loss": 1.2362, + "step": 3720 + }, + { + "epoch": 0.2588611777800967, + "grad_norm": 1.1328125, + "learning_rate": 0.0017377127384186105, + "loss": 1.1448, + "step": 3721 + }, + { + "epoch": 0.2589307454172319, + "grad_norm": 1.328125, + "learning_rate": 0.0017375605940625225, + "loss": 1.2008, + "step": 3722 + }, + { + "epoch": 0.2590003130543671, + "grad_norm": 1.4296875, + "learning_rate": 0.001737408412256825, + "loss": 1.0659, + "step": 3723 + }, + { + "epoch": 0.25906988069150233, + "grad_norm": 1.1484375, + "learning_rate": 0.0017372561930092455, + "loss": 1.1304, + "step": 3724 + }, + { + "epoch": 0.2591394483286375, + "grad_norm": 1.1484375, + "learning_rate": 0.0017371039363275123, + "loss": 0.945, + "step": 3725 + }, + { + "epoch": 0.2592090159657727, + "grad_norm": 1.1015625, + "learning_rate": 0.0017369516422193567, + "loss": 1.0949, + "step": 3726 + }, + { + "epoch": 0.25927858360290795, + "grad_norm": 1.1171875, + "learning_rate": 0.001736799310692511, + "loss": 1.1905, + "step": 3727 + }, + { + "epoch": 0.2593481512400431, + "grad_norm": 1.1171875, + "learning_rate": 0.0017366469417547101, + "loss": 1.1927, + "step": 3728 + }, + { + "epoch": 0.25941771887717835, + "grad_norm": 1.4375, + "learning_rate": 0.0017364945354136907, + "loss": 1.1854, + "step": 3729 + }, + { + "epoch": 0.2594872865143135, + "grad_norm": 1.1640625, + "learning_rate": 0.0017363420916771909, + "loss": 0.9458, + "step": 3730 + }, + { + "epoch": 0.25955685415144875, + "grad_norm": 1.359375, + "learning_rate": 0.0017361896105529508, + "loss": 1.3304, + "step": 3731 + }, + { + "epoch": 0.25962642178858397, + "grad_norm": 1.0078125, + "learning_rate": 0.0017360370920487134, + "loss": 0.9451, + "step": 3732 + }, + { + "epoch": 0.25969598942571914, + "grad_norm": 1.15625, + "learning_rate": 0.0017358845361722221, + "loss": 0.6876, + "step": 3733 + }, + { + "epoch": 0.25976555706285437, + "grad_norm": 1.15625, + "learning_rate": 0.0017357319429312232, + "loss": 1.0501, + "step": 3734 + }, + { + "epoch": 0.25983512469998954, + "grad_norm": 0.95703125, + "learning_rate": 0.0017355793123334648, + "loss": 0.8675, + "step": 3735 + }, + { + "epoch": 0.25990469233712477, + "grad_norm": 1.09375, + "learning_rate": 0.0017354266443866961, + "loss": 1.1924, + "step": 3736 + }, + { + "epoch": 0.25997425997426, + "grad_norm": 1.1171875, + "learning_rate": 0.0017352739390986696, + "loss": 0.9601, + "step": 3737 + }, + { + "epoch": 0.26004382761139516, + "grad_norm": 1.3984375, + "learning_rate": 0.0017351211964771384, + "loss": 1.0359, + "step": 3738 + }, + { + "epoch": 0.2601133952485304, + "grad_norm": 1.15625, + "learning_rate": 0.0017349684165298583, + "loss": 1.1129, + "step": 3739 + }, + { + "epoch": 0.2601829628856656, + "grad_norm": 0.97265625, + "learning_rate": 0.0017348155992645863, + "loss": 1.0371, + "step": 3740 + }, + { + "epoch": 0.2602525305228008, + "grad_norm": 1.3203125, + "learning_rate": 0.001734662744689082, + "loss": 1.1541, + "step": 3741 + }, + { + "epoch": 0.260322098159936, + "grad_norm": 1.375, + "learning_rate": 0.0017345098528111062, + "loss": 1.1502, + "step": 3742 + }, + { + "epoch": 0.2603916657970712, + "grad_norm": 1.296875, + "learning_rate": 0.0017343569236384227, + "loss": 1.1974, + "step": 3743 + }, + { + "epoch": 0.2604612334342064, + "grad_norm": 1.3984375, + "learning_rate": 0.001734203957178796, + "loss": 0.9742, + "step": 3744 + }, + { + "epoch": 0.26053080107134163, + "grad_norm": 1.078125, + "learning_rate": 0.0017340509534399928, + "loss": 1.1656, + "step": 3745 + }, + { + "epoch": 0.2606003687084768, + "grad_norm": 0.99609375, + "learning_rate": 0.0017338979124297822, + "loss": 1.1089, + "step": 3746 + }, + { + "epoch": 0.26066993634561203, + "grad_norm": 1.0859375, + "learning_rate": 0.0017337448341559348, + "loss": 0.8266, + "step": 3747 + }, + { + "epoch": 0.2607395039827472, + "grad_norm": 1.0390625, + "learning_rate": 0.001733591718626223, + "loss": 0.8495, + "step": 3748 + }, + { + "epoch": 0.2608090716198824, + "grad_norm": 1.234375, + "learning_rate": 0.0017334385658484212, + "loss": 1.2051, + "step": 3749 + }, + { + "epoch": 0.26087863925701765, + "grad_norm": 1.4765625, + "learning_rate": 0.0017332853758303059, + "loss": 1.307, + "step": 3750 + }, + { + "epoch": 0.2609482068941528, + "grad_norm": 1.203125, + "learning_rate": 0.0017331321485796554, + "loss": 1.1944, + "step": 3751 + }, + { + "epoch": 0.26101777453128805, + "grad_norm": 1.2265625, + "learning_rate": 0.0017329788841042495, + "loss": 0.9252, + "step": 3752 + }, + { + "epoch": 0.2610873421684233, + "grad_norm": 1.265625, + "learning_rate": 0.0017328255824118704, + "loss": 0.9735, + "step": 3753 + }, + { + "epoch": 0.26115690980555845, + "grad_norm": 1.3359375, + "learning_rate": 0.001732672243510302, + "loss": 1.2298, + "step": 3754 + }, + { + "epoch": 0.26122647744269367, + "grad_norm": 1.1484375, + "learning_rate": 0.00173251886740733, + "loss": 1.1572, + "step": 3755 + }, + { + "epoch": 0.26129604507982884, + "grad_norm": 0.92578125, + "learning_rate": 0.0017323654541107419, + "loss": 0.9327, + "step": 3756 + }, + { + "epoch": 0.26136561271696407, + "grad_norm": 1.265625, + "learning_rate": 0.0017322120036283276, + "loss": 0.8942, + "step": 3757 + }, + { + "epoch": 0.2614351803540993, + "grad_norm": 1.1796875, + "learning_rate": 0.0017320585159678783, + "loss": 0.9664, + "step": 3758 + }, + { + "epoch": 0.26150474799123447, + "grad_norm": 0.9765625, + "learning_rate": 0.0017319049911371876, + "loss": 1.1331, + "step": 3759 + }, + { + "epoch": 0.2615743156283697, + "grad_norm": 1.0, + "learning_rate": 0.00173175142914405, + "loss": 0.9512, + "step": 3760 + }, + { + "epoch": 0.26164388326550486, + "grad_norm": 1.0703125, + "learning_rate": 0.0017315978299962636, + "loss": 0.7905, + "step": 3761 + }, + { + "epoch": 0.2617134509026401, + "grad_norm": 1.46875, + "learning_rate": 0.001731444193701627, + "loss": 1.2855, + "step": 3762 + }, + { + "epoch": 0.2617830185397753, + "grad_norm": 1.3046875, + "learning_rate": 0.0017312905202679408, + "loss": 1.0728, + "step": 3763 + }, + { + "epoch": 0.2618525861769105, + "grad_norm": 1.2578125, + "learning_rate": 0.001731136809703008, + "loss": 1.2534, + "step": 3764 + }, + { + "epoch": 0.2619221538140457, + "grad_norm": 0.94921875, + "learning_rate": 0.0017309830620146332, + "loss": 0.7704, + "step": 3765 + }, + { + "epoch": 0.26199172145118094, + "grad_norm": 1.296875, + "learning_rate": 0.0017308292772106229, + "loss": 1.0189, + "step": 3766 + }, + { + "epoch": 0.2620612890883161, + "grad_norm": 1.1484375, + "learning_rate": 0.0017306754552987855, + "loss": 1.1517, + "step": 3767 + }, + { + "epoch": 0.26213085672545133, + "grad_norm": 1.015625, + "learning_rate": 0.0017305215962869313, + "loss": 1.0454, + "step": 3768 + }, + { + "epoch": 0.2622004243625865, + "grad_norm": 1.03125, + "learning_rate": 0.0017303677001828729, + "loss": 0.8192, + "step": 3769 + }, + { + "epoch": 0.26226999199972173, + "grad_norm": 1.4296875, + "learning_rate": 0.0017302137669944235, + "loss": 1.2484, + "step": 3770 + }, + { + "epoch": 0.26233955963685696, + "grad_norm": 1.0703125, + "learning_rate": 0.0017300597967294, + "loss": 0.9947, + "step": 3771 + }, + { + "epoch": 0.2624091272739921, + "grad_norm": 0.90234375, + "learning_rate": 0.0017299057893956195, + "loss": 1.0141, + "step": 3772 + }, + { + "epoch": 0.26247869491112735, + "grad_norm": 1.328125, + "learning_rate": 0.0017297517450009022, + "loss": 1.3139, + "step": 3773 + }, + { + "epoch": 0.2625482625482625, + "grad_norm": 1.171875, + "learning_rate": 0.0017295976635530695, + "loss": 1.2733, + "step": 3774 + }, + { + "epoch": 0.26261783018539775, + "grad_norm": 1.1328125, + "learning_rate": 0.001729443545059945, + "loss": 0.9708, + "step": 3775 + }, + { + "epoch": 0.262687397822533, + "grad_norm": 1.0625, + "learning_rate": 0.0017292893895293538, + "loss": 1.1274, + "step": 3776 + }, + { + "epoch": 0.26275696545966815, + "grad_norm": 0.88671875, + "learning_rate": 0.0017291351969691232, + "loss": 0.9228, + "step": 3777 + }, + { + "epoch": 0.26282653309680337, + "grad_norm": 1.25, + "learning_rate": 0.0017289809673870825, + "loss": 1.2935, + "step": 3778 + }, + { + "epoch": 0.2628961007339386, + "grad_norm": 1.1640625, + "learning_rate": 0.0017288267007910627, + "loss": 0.8132, + "step": 3779 + }, + { + "epoch": 0.26296566837107377, + "grad_norm": 1.203125, + "learning_rate": 0.0017286723971888965, + "loss": 0.9086, + "step": 3780 + }, + { + "epoch": 0.263035236008209, + "grad_norm": 1.3359375, + "learning_rate": 0.0017285180565884187, + "loss": 1.0525, + "step": 3781 + }, + { + "epoch": 0.26310480364534417, + "grad_norm": 1.2109375, + "learning_rate": 0.0017283636789974662, + "loss": 0.8169, + "step": 3782 + }, + { + "epoch": 0.2631743712824794, + "grad_norm": 0.84765625, + "learning_rate": 0.001728209264423877, + "loss": 0.8526, + "step": 3783 + }, + { + "epoch": 0.2632439389196146, + "grad_norm": 0.921875, + "learning_rate": 0.001728054812875492, + "loss": 0.965, + "step": 3784 + }, + { + "epoch": 0.2633135065567498, + "grad_norm": 1.046875, + "learning_rate": 0.0017279003243601532, + "loss": 0.8445, + "step": 3785 + }, + { + "epoch": 0.263383074193885, + "grad_norm": 1.2578125, + "learning_rate": 0.001727745798885705, + "loss": 1.0094, + "step": 3786 + }, + { + "epoch": 0.2634526418310202, + "grad_norm": 0.96875, + "learning_rate": 0.0017275912364599928, + "loss": 1.0176, + "step": 3787 + }, + { + "epoch": 0.2635222094681554, + "grad_norm": 1.296875, + "learning_rate": 0.0017274366370908655, + "loss": 0.941, + "step": 3788 + }, + { + "epoch": 0.26359177710529064, + "grad_norm": 1.046875, + "learning_rate": 0.0017272820007861718, + "loss": 1.2155, + "step": 3789 + }, + { + "epoch": 0.2636613447424258, + "grad_norm": 1.0859375, + "learning_rate": 0.0017271273275537642, + "loss": 0.8125, + "step": 3790 + }, + { + "epoch": 0.26373091237956103, + "grad_norm": 1.515625, + "learning_rate": 0.0017269726174014956, + "loss": 0.8297, + "step": 3791 + }, + { + "epoch": 0.26380048001669626, + "grad_norm": 1.609375, + "learning_rate": 0.001726817870337222, + "loss": 1.0904, + "step": 3792 + }, + { + "epoch": 0.26387004765383143, + "grad_norm": 1.46875, + "learning_rate": 0.0017266630863688004, + "loss": 1.1033, + "step": 3793 + }, + { + "epoch": 0.26393961529096666, + "grad_norm": 0.9453125, + "learning_rate": 0.0017265082655040897, + "loss": 0.8511, + "step": 3794 + }, + { + "epoch": 0.2640091829281018, + "grad_norm": 1.3984375, + "learning_rate": 0.0017263534077509514, + "loss": 1.1242, + "step": 3795 + }, + { + "epoch": 0.26407875056523705, + "grad_norm": 1.0390625, + "learning_rate": 0.0017261985131172479, + "loss": 0.952, + "step": 3796 + }, + { + "epoch": 0.2641483182023723, + "grad_norm": 1.0078125, + "learning_rate": 0.0017260435816108446, + "loss": 0.9841, + "step": 3797 + }, + { + "epoch": 0.26421788583950745, + "grad_norm": 1.0078125, + "learning_rate": 0.0017258886132396074, + "loss": 0.8861, + "step": 3798 + }, + { + "epoch": 0.2642874534766427, + "grad_norm": 1.28125, + "learning_rate": 0.0017257336080114052, + "loss": 1.0207, + "step": 3799 + }, + { + "epoch": 0.26435702111377785, + "grad_norm": 1.015625, + "learning_rate": 0.0017255785659341086, + "loss": 0.9993, + "step": 3800 + }, + { + "epoch": 0.26442658875091307, + "grad_norm": 1.0625, + "learning_rate": 0.0017254234870155893, + "loss": 0.749, + "step": 3801 + }, + { + "epoch": 0.2644961563880483, + "grad_norm": 1.1015625, + "learning_rate": 0.0017252683712637219, + "loss": 1.1282, + "step": 3802 + }, + { + "epoch": 0.26456572402518347, + "grad_norm": 1.09375, + "learning_rate": 0.0017251132186863823, + "loss": 1.0138, + "step": 3803 + }, + { + "epoch": 0.2646352916623187, + "grad_norm": 1.1796875, + "learning_rate": 0.001724958029291448, + "loss": 0.9922, + "step": 3804 + }, + { + "epoch": 0.2647048592994539, + "grad_norm": 1.2421875, + "learning_rate": 0.0017248028030867992, + "loss": 1.4224, + "step": 3805 + }, + { + "epoch": 0.2647744269365891, + "grad_norm": 0.97265625, + "learning_rate": 0.0017246475400803174, + "loss": 0.8931, + "step": 3806 + }, + { + "epoch": 0.2648439945737243, + "grad_norm": 1.15625, + "learning_rate": 0.001724492240279886, + "loss": 0.9911, + "step": 3807 + }, + { + "epoch": 0.2649135622108595, + "grad_norm": 1.15625, + "learning_rate": 0.0017243369036933904, + "loss": 1.0472, + "step": 3808 + }, + { + "epoch": 0.2649831298479947, + "grad_norm": 1.0546875, + "learning_rate": 0.0017241815303287176, + "loss": 0.9658, + "step": 3809 + }, + { + "epoch": 0.26505269748512994, + "grad_norm": 1.25, + "learning_rate": 0.001724026120193757, + "loss": 1.0667, + "step": 3810 + }, + { + "epoch": 0.2651222651222651, + "grad_norm": 1.0390625, + "learning_rate": 0.0017238706732963993, + "loss": 0.9033, + "step": 3811 + }, + { + "epoch": 0.26519183275940034, + "grad_norm": 1.0625, + "learning_rate": 0.0017237151896445373, + "loss": 1.1824, + "step": 3812 + }, + { + "epoch": 0.2652614003965355, + "grad_norm": 0.91015625, + "learning_rate": 0.001723559669246066, + "loss": 0.9525, + "step": 3813 + }, + { + "epoch": 0.26533096803367073, + "grad_norm": 1.109375, + "learning_rate": 0.0017234041121088814, + "loss": 0.8767, + "step": 3814 + }, + { + "epoch": 0.26540053567080596, + "grad_norm": 1.2734375, + "learning_rate": 0.0017232485182408824, + "loss": 1.0737, + "step": 3815 + }, + { + "epoch": 0.26547010330794113, + "grad_norm": 1.109375, + "learning_rate": 0.001723092887649969, + "loss": 1.0869, + "step": 3816 + }, + { + "epoch": 0.26553967094507636, + "grad_norm": 1.3046875, + "learning_rate": 0.0017229372203440435, + "loss": 1.1493, + "step": 3817 + }, + { + "epoch": 0.2656092385822116, + "grad_norm": 1.3984375, + "learning_rate": 0.00172278151633101, + "loss": 1.3631, + "step": 3818 + }, + { + "epoch": 0.26567880621934675, + "grad_norm": 1.1484375, + "learning_rate": 0.001722625775618774, + "loss": 1.1792, + "step": 3819 + }, + { + "epoch": 0.265748373856482, + "grad_norm": 1.25, + "learning_rate": 0.0017224699982152432, + "loss": 0.8632, + "step": 3820 + }, + { + "epoch": 0.26581794149361715, + "grad_norm": 1.1875, + "learning_rate": 0.0017223141841283276, + "loss": 1.177, + "step": 3821 + }, + { + "epoch": 0.2658875091307524, + "grad_norm": 1.1875, + "learning_rate": 0.0017221583333659385, + "loss": 0.9559, + "step": 3822 + }, + { + "epoch": 0.2659570767678876, + "grad_norm": 1.3359375, + "learning_rate": 0.0017220024459359893, + "loss": 1.1445, + "step": 3823 + }, + { + "epoch": 0.26602664440502277, + "grad_norm": 1.234375, + "learning_rate": 0.0017218465218463948, + "loss": 1.3609, + "step": 3824 + }, + { + "epoch": 0.266096212042158, + "grad_norm": 1.2578125, + "learning_rate": 0.0017216905611050725, + "loss": 1.3015, + "step": 3825 + }, + { + "epoch": 0.26616577967929317, + "grad_norm": 1.203125, + "learning_rate": 0.0017215345637199412, + "loss": 1.1312, + "step": 3826 + }, + { + "epoch": 0.2662353473164284, + "grad_norm": 1.265625, + "learning_rate": 0.0017213785296989212, + "loss": 1.1181, + "step": 3827 + }, + { + "epoch": 0.2663049149535636, + "grad_norm": 1.0546875, + "learning_rate": 0.0017212224590499358, + "loss": 1.0499, + "step": 3828 + }, + { + "epoch": 0.2663744825906988, + "grad_norm": 1.203125, + "learning_rate": 0.001721066351780909, + "loss": 1.0541, + "step": 3829 + }, + { + "epoch": 0.266444050227834, + "grad_norm": 1.03125, + "learning_rate": 0.0017209102078997673, + "loss": 0.9934, + "step": 3830 + }, + { + "epoch": 0.26651361786496924, + "grad_norm": 1.015625, + "learning_rate": 0.0017207540274144387, + "loss": 1.0731, + "step": 3831 + }, + { + "epoch": 0.2665831855021044, + "grad_norm": 1.28125, + "learning_rate": 0.0017205978103328537, + "loss": 0.9135, + "step": 3832 + }, + { + "epoch": 0.26665275313923964, + "grad_norm": 1.0625, + "learning_rate": 0.001720441556662944, + "loss": 1.1729, + "step": 3833 + }, + { + "epoch": 0.2667223207763748, + "grad_norm": 1.09375, + "learning_rate": 0.0017202852664126432, + "loss": 0.9803, + "step": 3834 + }, + { + "epoch": 0.26679188841351004, + "grad_norm": 0.9765625, + "learning_rate": 0.001720128939589887, + "loss": 0.8769, + "step": 3835 + }, + { + "epoch": 0.26686145605064526, + "grad_norm": 1.0546875, + "learning_rate": 0.0017199725762026136, + "loss": 0.8438, + "step": 3836 + }, + { + "epoch": 0.26693102368778043, + "grad_norm": 1.25, + "learning_rate": 0.001719816176258761, + "loss": 1.1329, + "step": 3837 + }, + { + "epoch": 0.26700059132491566, + "grad_norm": 1.1328125, + "learning_rate": 0.0017196597397662714, + "loss": 1.0036, + "step": 3838 + }, + { + "epoch": 0.26707015896205083, + "grad_norm": 1.203125, + "learning_rate": 0.0017195032667330875, + "loss": 0.8832, + "step": 3839 + }, + { + "epoch": 0.26713972659918606, + "grad_norm": 1.078125, + "learning_rate": 0.0017193467571671541, + "loss": 0.9158, + "step": 3840 + }, + { + "epoch": 0.2672092942363213, + "grad_norm": 1.1328125, + "learning_rate": 0.0017191902110764183, + "loss": 0.8351, + "step": 3841 + }, + { + "epoch": 0.26727886187345645, + "grad_norm": 1.0859375, + "learning_rate": 0.0017190336284688289, + "loss": 1.0573, + "step": 3842 + }, + { + "epoch": 0.2673484295105917, + "grad_norm": 1.328125, + "learning_rate": 0.001718877009352336, + "loss": 1.0967, + "step": 3843 + }, + { + "epoch": 0.2674179971477269, + "grad_norm": 1.0, + "learning_rate": 0.0017187203537348914, + "loss": 0.8831, + "step": 3844 + }, + { + "epoch": 0.2674875647848621, + "grad_norm": 1.1328125, + "learning_rate": 0.0017185636616244503, + "loss": 0.8737, + "step": 3845 + }, + { + "epoch": 0.2675571324219973, + "grad_norm": 1.5390625, + "learning_rate": 0.0017184069330289681, + "loss": 1.2432, + "step": 3846 + }, + { + "epoch": 0.26762670005913247, + "grad_norm": 1.0234375, + "learning_rate": 0.0017182501679564029, + "loss": 1.2132, + "step": 3847 + }, + { + "epoch": 0.2676962676962677, + "grad_norm": 0.91015625, + "learning_rate": 0.0017180933664147147, + "loss": 0.7472, + "step": 3848 + }, + { + "epoch": 0.2677658353334029, + "grad_norm": 1.3125, + "learning_rate": 0.0017179365284118644, + "loss": 0.9909, + "step": 3849 + }, + { + "epoch": 0.2678354029705381, + "grad_norm": 1.2109375, + "learning_rate": 0.0017177796539558162, + "loss": 1.064, + "step": 3850 + }, + { + "epoch": 0.2679049706076733, + "grad_norm": 0.91015625, + "learning_rate": 0.0017176227430545348, + "loss": 0.9202, + "step": 3851 + }, + { + "epoch": 0.2679745382448085, + "grad_norm": 1.078125, + "learning_rate": 0.0017174657957159875, + "loss": 0.865, + "step": 3852 + }, + { + "epoch": 0.2680441058819437, + "grad_norm": 1.0625, + "learning_rate": 0.001717308811948144, + "loss": 1.0918, + "step": 3853 + }, + { + "epoch": 0.26811367351907894, + "grad_norm": 1.0859375, + "learning_rate": 0.0017171517917589738, + "loss": 1.0774, + "step": 3854 + }, + { + "epoch": 0.2681832411562141, + "grad_norm": 1.171875, + "learning_rate": 0.0017169947351564508, + "loss": 1.165, + "step": 3855 + }, + { + "epoch": 0.26825280879334934, + "grad_norm": 1.28125, + "learning_rate": 0.0017168376421485489, + "loss": 1.0177, + "step": 3856 + }, + { + "epoch": 0.26832237643048457, + "grad_norm": 0.8359375, + "learning_rate": 0.0017166805127432447, + "loss": 0.879, + "step": 3857 + }, + { + "epoch": 0.26839194406761974, + "grad_norm": 1.3359375, + "learning_rate": 0.0017165233469485163, + "loss": 1.1177, + "step": 3858 + }, + { + "epoch": 0.26846151170475496, + "grad_norm": 1.2578125, + "learning_rate": 0.001716366144772344, + "loss": 1.1613, + "step": 3859 + }, + { + "epoch": 0.26853107934189013, + "grad_norm": 1.0390625, + "learning_rate": 0.0017162089062227096, + "loss": 0.8865, + "step": 3860 + }, + { + "epoch": 0.26860064697902536, + "grad_norm": 1.140625, + "learning_rate": 0.0017160516313075968, + "loss": 1.176, + "step": 3861 + }, + { + "epoch": 0.2686702146161606, + "grad_norm": 1.2421875, + "learning_rate": 0.0017158943200349915, + "loss": 1.0508, + "step": 3862 + }, + { + "epoch": 0.26873978225329576, + "grad_norm": 1.15625, + "learning_rate": 0.0017157369724128812, + "loss": 0.884, + "step": 3863 + }, + { + "epoch": 0.268809349890431, + "grad_norm": 1.3125, + "learning_rate": 0.0017155795884492547, + "loss": 1.3607, + "step": 3864 + }, + { + "epoch": 0.26887891752756615, + "grad_norm": 1.0234375, + "learning_rate": 0.0017154221681521034, + "loss": 0.9544, + "step": 3865 + }, + { + "epoch": 0.2689484851647014, + "grad_norm": 1.109375, + "learning_rate": 0.0017152647115294204, + "loss": 1.084, + "step": 3866 + }, + { + "epoch": 0.2690180528018366, + "grad_norm": 1.0390625, + "learning_rate": 0.0017151072185892008, + "loss": 1.0318, + "step": 3867 + }, + { + "epoch": 0.2690876204389718, + "grad_norm": 1.1171875, + "learning_rate": 0.001714949689339441, + "loss": 1.0706, + "step": 3868 + }, + { + "epoch": 0.269157188076107, + "grad_norm": 1.28125, + "learning_rate": 0.0017147921237881394, + "loss": 0.9735, + "step": 3869 + }, + { + "epoch": 0.2692267557132422, + "grad_norm": 1.0625, + "learning_rate": 0.0017146345219432966, + "loss": 0.8209, + "step": 3870 + }, + { + "epoch": 0.2692963233503774, + "grad_norm": 1.109375, + "learning_rate": 0.0017144768838129147, + "loss": 0.8533, + "step": 3871 + }, + { + "epoch": 0.2693658909875126, + "grad_norm": 0.96875, + "learning_rate": 0.0017143192094049985, + "loss": 1.0087, + "step": 3872 + }, + { + "epoch": 0.2694354586246478, + "grad_norm": 1.2734375, + "learning_rate": 0.0017141614987275526, + "loss": 0.8013, + "step": 3873 + }, + { + "epoch": 0.269505026261783, + "grad_norm": 1.5859375, + "learning_rate": 0.0017140037517885856, + "loss": 1.1604, + "step": 3874 + }, + { + "epoch": 0.26957459389891825, + "grad_norm": 1.171875, + "learning_rate": 0.001713845968596107, + "loss": 1.0919, + "step": 3875 + }, + { + "epoch": 0.2696441615360534, + "grad_norm": 1.4140625, + "learning_rate": 0.0017136881491581284, + "loss": 0.9652, + "step": 3876 + }, + { + "epoch": 0.26971372917318864, + "grad_norm": 0.99609375, + "learning_rate": 0.0017135302934826627, + "loss": 0.9007, + "step": 3877 + }, + { + "epoch": 0.2697832968103238, + "grad_norm": 1.1875, + "learning_rate": 0.001713372401577725, + "loss": 1.0811, + "step": 3878 + }, + { + "epoch": 0.26985286444745904, + "grad_norm": 1.046875, + "learning_rate": 0.0017132144734513324, + "loss": 1.2171, + "step": 3879 + }, + { + "epoch": 0.26992243208459427, + "grad_norm": 1.0703125, + "learning_rate": 0.0017130565091115037, + "loss": 0.9081, + "step": 3880 + }, + { + "epoch": 0.26999199972172944, + "grad_norm": 1.0078125, + "learning_rate": 0.0017128985085662599, + "loss": 1.0543, + "step": 3881 + }, + { + "epoch": 0.27006156735886466, + "grad_norm": 1.1953125, + "learning_rate": 0.0017127404718236226, + "loss": 0.9564, + "step": 3882 + }, + { + "epoch": 0.27013113499599983, + "grad_norm": 1.125, + "learning_rate": 0.001712582398891617, + "loss": 0.9315, + "step": 3883 + }, + { + "epoch": 0.27020070263313506, + "grad_norm": 1.484375, + "learning_rate": 0.0017124242897782684, + "loss": 1.186, + "step": 3884 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.0390625, + "learning_rate": 0.0017122661444916058, + "loss": 0.9827, + "step": 3885 + }, + { + "epoch": 0.27033983790740546, + "grad_norm": 0.9765625, + "learning_rate": 0.0017121079630396583, + "loss": 0.95, + "step": 3886 + }, + { + "epoch": 0.2704094055445407, + "grad_norm": 1.015625, + "learning_rate": 0.0017119497454304575, + "loss": 1.0094, + "step": 3887 + }, + { + "epoch": 0.2704789731816759, + "grad_norm": 1.3984375, + "learning_rate": 0.0017117914916720373, + "loss": 1.2051, + "step": 3888 + }, + { + "epoch": 0.2705485408188111, + "grad_norm": 1.0078125, + "learning_rate": 0.001711633201772433, + "loss": 1.0079, + "step": 3889 + }, + { + "epoch": 0.2706181084559463, + "grad_norm": 1.1953125, + "learning_rate": 0.0017114748757396812, + "loss": 1.0215, + "step": 3890 + }, + { + "epoch": 0.2706876760930815, + "grad_norm": 1.15625, + "learning_rate": 0.0017113165135818217, + "loss": 0.8982, + "step": 3891 + }, + { + "epoch": 0.2707572437302167, + "grad_norm": 1.0390625, + "learning_rate": 0.0017111581153068948, + "loss": 0.8377, + "step": 3892 + }, + { + "epoch": 0.2708268113673519, + "grad_norm": 1.1875, + "learning_rate": 0.0017109996809229434, + "loss": 0.9026, + "step": 3893 + }, + { + "epoch": 0.2708963790044871, + "grad_norm": 1.125, + "learning_rate": 0.0017108412104380117, + "loss": 1.0206, + "step": 3894 + }, + { + "epoch": 0.2709659466416223, + "grad_norm": 1.1796875, + "learning_rate": 0.0017106827038601464, + "loss": 1.1209, + "step": 3895 + }, + { + "epoch": 0.2710355142787575, + "grad_norm": 0.9375, + "learning_rate": 0.0017105241611973954, + "loss": 1.0785, + "step": 3896 + }, + { + "epoch": 0.2711050819158927, + "grad_norm": 1.21875, + "learning_rate": 0.001710365582457809, + "loss": 1.0814, + "step": 3897 + }, + { + "epoch": 0.27117464955302795, + "grad_norm": 1.8828125, + "learning_rate": 0.0017102069676494386, + "loss": 1.086, + "step": 3898 + }, + { + "epoch": 0.2712442171901631, + "grad_norm": 1.28125, + "learning_rate": 0.0017100483167803381, + "loss": 0.8824, + "step": 3899 + }, + { + "epoch": 0.27131378482729834, + "grad_norm": 1.0703125, + "learning_rate": 0.0017098896298585631, + "loss": 0.927, + "step": 3900 + }, + { + "epoch": 0.27138335246443357, + "grad_norm": 1.109375, + "learning_rate": 0.0017097309068921708, + "loss": 1.0531, + "step": 3901 + }, + { + "epoch": 0.27145292010156874, + "grad_norm": 1.1875, + "learning_rate": 0.00170957214788922, + "loss": 1.1599, + "step": 3902 + }, + { + "epoch": 0.27152248773870397, + "grad_norm": 1.046875, + "learning_rate": 0.0017094133528577724, + "loss": 1.1276, + "step": 3903 + }, + { + "epoch": 0.27159205537583914, + "grad_norm": 1.0078125, + "learning_rate": 0.0017092545218058905, + "loss": 0.8651, + "step": 3904 + }, + { + "epoch": 0.27166162301297436, + "grad_norm": 1.1328125, + "learning_rate": 0.0017090956547416388, + "loss": 1.0644, + "step": 3905 + }, + { + "epoch": 0.2717311906501096, + "grad_norm": 1.0859375, + "learning_rate": 0.001708936751673084, + "loss": 1.1555, + "step": 3906 + }, + { + "epoch": 0.27180075828724476, + "grad_norm": 1.078125, + "learning_rate": 0.001708777812608294, + "loss": 1.024, + "step": 3907 + }, + { + "epoch": 0.27187032592438, + "grad_norm": 1.3671875, + "learning_rate": 0.0017086188375553394, + "loss": 1.188, + "step": 3908 + }, + { + "epoch": 0.27193989356151516, + "grad_norm": 1.1484375, + "learning_rate": 0.0017084598265222919, + "loss": 0.8403, + "step": 3909 + }, + { + "epoch": 0.2720094611986504, + "grad_norm": 1.484375, + "learning_rate": 0.0017083007795172251, + "loss": 1.0971, + "step": 3910 + }, + { + "epoch": 0.2720790288357856, + "grad_norm": 1.1328125, + "learning_rate": 0.001708141696548215, + "loss": 1.1551, + "step": 3911 + }, + { + "epoch": 0.2721485964729208, + "grad_norm": 1.3515625, + "learning_rate": 0.001707982577623339, + "loss": 1.1604, + "step": 3912 + }, + { + "epoch": 0.272218164110056, + "grad_norm": 1.234375, + "learning_rate": 0.0017078234227506756, + "loss": 0.9916, + "step": 3913 + }, + { + "epoch": 0.27228773174719123, + "grad_norm": 0.88671875, + "learning_rate": 0.0017076642319383071, + "loss": 0.8347, + "step": 3914 + }, + { + "epoch": 0.2723572993843264, + "grad_norm": 1.1953125, + "learning_rate": 0.0017075050051943155, + "loss": 0.8493, + "step": 3915 + }, + { + "epoch": 0.2724268670214616, + "grad_norm": 1.2421875, + "learning_rate": 0.001707345742526786, + "loss": 1.1102, + "step": 3916 + }, + { + "epoch": 0.2724964346585968, + "grad_norm": 1.1640625, + "learning_rate": 0.001707186443943805, + "loss": 0.7921, + "step": 3917 + }, + { + "epoch": 0.272566002295732, + "grad_norm": 0.96875, + "learning_rate": 0.0017070271094534607, + "loss": 0.9222, + "step": 3918 + }, + { + "epoch": 0.27263556993286725, + "grad_norm": 1.078125, + "learning_rate": 0.0017068677390638435, + "loss": 1.2166, + "step": 3919 + }, + { + "epoch": 0.2727051375700024, + "grad_norm": 0.9375, + "learning_rate": 0.0017067083327830454, + "loss": 0.8154, + "step": 3920 + }, + { + "epoch": 0.27277470520713765, + "grad_norm": 1.609375, + "learning_rate": 0.0017065488906191602, + "loss": 1.3034, + "step": 3921 + }, + { + "epoch": 0.2728442728442728, + "grad_norm": 1.1484375, + "learning_rate": 0.0017063894125802835, + "loss": 1.0304, + "step": 3922 + }, + { + "epoch": 0.27291384048140804, + "grad_norm": 1.0859375, + "learning_rate": 0.0017062298986745131, + "loss": 0.8899, + "step": 3923 + }, + { + "epoch": 0.27298340811854327, + "grad_norm": 1.0390625, + "learning_rate": 0.001706070348909948, + "loss": 0.951, + "step": 3924 + }, + { + "epoch": 0.27305297575567844, + "grad_norm": 1.140625, + "learning_rate": 0.0017059107632946895, + "loss": 1.087, + "step": 3925 + }, + { + "epoch": 0.27312254339281367, + "grad_norm": 1.7265625, + "learning_rate": 0.0017057511418368408, + "loss": 1.0523, + "step": 3926 + }, + { + "epoch": 0.2731921110299489, + "grad_norm": 1.2265625, + "learning_rate": 0.0017055914845445059, + "loss": 0.9484, + "step": 3927 + }, + { + "epoch": 0.27326167866708406, + "grad_norm": 1.4140625, + "learning_rate": 0.001705431791425792, + "loss": 1.2164, + "step": 3928 + }, + { + "epoch": 0.2733312463042193, + "grad_norm": 1.0078125, + "learning_rate": 0.0017052720624888074, + "loss": 0.9607, + "step": 3929 + }, + { + "epoch": 0.27340081394135446, + "grad_norm": 1.1875, + "learning_rate": 0.0017051122977416622, + "loss": 0.9486, + "step": 3930 + }, + { + "epoch": 0.2734703815784897, + "grad_norm": 1.4296875, + "learning_rate": 0.0017049524971924686, + "loss": 0.8929, + "step": 3931 + }, + { + "epoch": 0.2735399492156249, + "grad_norm": 1.3671875, + "learning_rate": 0.0017047926608493404, + "loss": 0.9276, + "step": 3932 + }, + { + "epoch": 0.2736095168527601, + "grad_norm": 1.328125, + "learning_rate": 0.0017046327887203937, + "loss": 1.1083, + "step": 3933 + }, + { + "epoch": 0.2736790844898953, + "grad_norm": 1.125, + "learning_rate": 0.0017044728808137451, + "loss": 1.0824, + "step": 3934 + }, + { + "epoch": 0.2737486521270305, + "grad_norm": 1.6875, + "learning_rate": 0.0017043129371375147, + "loss": 1.3346, + "step": 3935 + }, + { + "epoch": 0.2738182197641657, + "grad_norm": 1.2421875, + "learning_rate": 0.0017041529576998229, + "loss": 0.9817, + "step": 3936 + }, + { + "epoch": 0.27388778740130093, + "grad_norm": 1.140625, + "learning_rate": 0.0017039929425087938, + "loss": 0.8586, + "step": 3937 + }, + { + "epoch": 0.2739573550384361, + "grad_norm": 1.2421875, + "learning_rate": 0.0017038328915725508, + "loss": 1.3273, + "step": 3938 + }, + { + "epoch": 0.2740269226755713, + "grad_norm": 0.96484375, + "learning_rate": 0.0017036728048992215, + "loss": 1.0559, + "step": 3939 + }, + { + "epoch": 0.27409649031270655, + "grad_norm": 1.1953125, + "learning_rate": 0.0017035126824969339, + "loss": 1.0056, + "step": 3940 + }, + { + "epoch": 0.2741660579498417, + "grad_norm": 1.0546875, + "learning_rate": 0.0017033525243738182, + "loss": 1.1017, + "step": 3941 + }, + { + "epoch": 0.27423562558697695, + "grad_norm": 1.1640625, + "learning_rate": 0.0017031923305380063, + "loss": 1.0774, + "step": 3942 + }, + { + "epoch": 0.2743051932241121, + "grad_norm": 1.203125, + "learning_rate": 0.001703032100997633, + "loss": 1.1158, + "step": 3943 + }, + { + "epoch": 0.27437476086124735, + "grad_norm": 1.0078125, + "learning_rate": 0.0017028718357608322, + "loss": 0.9334, + "step": 3944 + }, + { + "epoch": 0.2744443284983826, + "grad_norm": 1.03125, + "learning_rate": 0.0017027115348357427, + "loss": 1.0335, + "step": 3945 + }, + { + "epoch": 0.27451389613551774, + "grad_norm": 1.3359375, + "learning_rate": 0.0017025511982305033, + "loss": 0.9501, + "step": 3946 + }, + { + "epoch": 0.27458346377265297, + "grad_norm": 1.53125, + "learning_rate": 0.0017023908259532552, + "loss": 1.0681, + "step": 3947 + }, + { + "epoch": 0.27465303140978814, + "grad_norm": 1.0078125, + "learning_rate": 0.0017022304180121415, + "loss": 1.0523, + "step": 3948 + }, + { + "epoch": 0.27472259904692337, + "grad_norm": 1.140625, + "learning_rate": 0.0017020699744153065, + "loss": 1.0081, + "step": 3949 + }, + { + "epoch": 0.2747921666840586, + "grad_norm": 1.2890625, + "learning_rate": 0.0017019094951708968, + "loss": 0.8158, + "step": 3950 + }, + { + "epoch": 0.27486173432119376, + "grad_norm": 1.125, + "learning_rate": 0.0017017489802870606, + "loss": 1.0486, + "step": 3951 + }, + { + "epoch": 0.274931301958329, + "grad_norm": 1.265625, + "learning_rate": 0.001701588429771949, + "loss": 1.0314, + "step": 3952 + }, + { + "epoch": 0.2750008695954642, + "grad_norm": 1.0, + "learning_rate": 0.0017014278436337125, + "loss": 0.8683, + "step": 3953 + }, + { + "epoch": 0.2750704372325994, + "grad_norm": 1.109375, + "learning_rate": 0.001701267221880506, + "loss": 0.7564, + "step": 3954 + }, + { + "epoch": 0.2751400048697346, + "grad_norm": 1.2734375, + "learning_rate": 0.0017011065645204844, + "loss": 1.0051, + "step": 3955 + }, + { + "epoch": 0.2752095725068698, + "grad_norm": 0.9453125, + "learning_rate": 0.0017009458715618053, + "loss": 0.9184, + "step": 3956 + }, + { + "epoch": 0.275279140144005, + "grad_norm": 1.03125, + "learning_rate": 0.0017007851430126278, + "loss": 0.9399, + "step": 3957 + }, + { + "epoch": 0.27534870778114023, + "grad_norm": 1.1171875, + "learning_rate": 0.001700624378881113, + "loss": 0.9812, + "step": 3958 + }, + { + "epoch": 0.2754182754182754, + "grad_norm": 1.0625, + "learning_rate": 0.0017004635791754237, + "loss": 0.985, + "step": 3959 + }, + { + "epoch": 0.27548784305541063, + "grad_norm": 1.1171875, + "learning_rate": 0.0017003027439037245, + "loss": 0.8774, + "step": 3960 + }, + { + "epoch": 0.2755574106925458, + "grad_norm": 1.171875, + "learning_rate": 0.0017001418730741818, + "loss": 0.94, + "step": 3961 + }, + { + "epoch": 0.275626978329681, + "grad_norm": 1.015625, + "learning_rate": 0.0016999809666949637, + "loss": 0.8023, + "step": 3962 + }, + { + "epoch": 0.27569654596681625, + "grad_norm": 0.9921875, + "learning_rate": 0.0016998200247742403, + "loss": 1.077, + "step": 3963 + }, + { + "epoch": 0.2757661136039514, + "grad_norm": 1.3671875, + "learning_rate": 0.0016996590473201834, + "loss": 0.8863, + "step": 3964 + }, + { + "epoch": 0.27583568124108665, + "grad_norm": 1.0703125, + "learning_rate": 0.0016994980343409665, + "loss": 1.0414, + "step": 3965 + }, + { + "epoch": 0.2759052488782219, + "grad_norm": 1.140625, + "learning_rate": 0.001699336985844765, + "loss": 0.7603, + "step": 3966 + }, + { + "epoch": 0.27597481651535705, + "grad_norm": 1.0078125, + "learning_rate": 0.0016991759018397568, + "loss": 0.9134, + "step": 3967 + }, + { + "epoch": 0.2760443841524923, + "grad_norm": 1.3125, + "learning_rate": 0.00169901478233412, + "loss": 1.108, + "step": 3968 + }, + { + "epoch": 0.27611395178962744, + "grad_norm": 1.28125, + "learning_rate": 0.001698853627336036, + "loss": 1.099, + "step": 3969 + }, + { + "epoch": 0.27618351942676267, + "grad_norm": 1.1171875, + "learning_rate": 0.0016986924368536872, + "loss": 0.981, + "step": 3970 + }, + { + "epoch": 0.2762530870638979, + "grad_norm": 1.296875, + "learning_rate": 0.0016985312108952582, + "loss": 0.9792, + "step": 3971 + }, + { + "epoch": 0.27632265470103307, + "grad_norm": 1.0546875, + "learning_rate": 0.001698369949468935, + "loss": 0.9086, + "step": 3972 + }, + { + "epoch": 0.2763922223381683, + "grad_norm": 1.2265625, + "learning_rate": 0.0016982086525829062, + "loss": 1.0202, + "step": 3973 + }, + { + "epoch": 0.27646178997530346, + "grad_norm": 1.2890625, + "learning_rate": 0.0016980473202453609, + "loss": 1.1561, + "step": 3974 + }, + { + "epoch": 0.2765313576124387, + "grad_norm": 1.1015625, + "learning_rate": 0.0016978859524644913, + "loss": 0.9192, + "step": 3975 + }, + { + "epoch": 0.2766009252495739, + "grad_norm": 1.3203125, + "learning_rate": 0.0016977245492484905, + "loss": 1.0597, + "step": 3976 + }, + { + "epoch": 0.2766704928867091, + "grad_norm": 0.9609375, + "learning_rate": 0.0016975631106055538, + "loss": 0.8171, + "step": 3977 + }, + { + "epoch": 0.2767400605238443, + "grad_norm": 1.140625, + "learning_rate": 0.0016974016365438787, + "loss": 0.9362, + "step": 3978 + }, + { + "epoch": 0.27680962816097954, + "grad_norm": 1.2578125, + "learning_rate": 0.0016972401270716633, + "loss": 1.1577, + "step": 3979 + }, + { + "epoch": 0.2768791957981147, + "grad_norm": 1.1875, + "learning_rate": 0.0016970785821971087, + "loss": 0.908, + "step": 3980 + }, + { + "epoch": 0.27694876343524993, + "grad_norm": 1.1015625, + "learning_rate": 0.0016969170019284173, + "loss": 0.9814, + "step": 3981 + }, + { + "epoch": 0.2770183310723851, + "grad_norm": 1.0390625, + "learning_rate": 0.001696755386273793, + "loss": 0.9795, + "step": 3982 + }, + { + "epoch": 0.27708789870952033, + "grad_norm": 1.1328125, + "learning_rate": 0.0016965937352414425, + "loss": 1.0025, + "step": 3983 + }, + { + "epoch": 0.27715746634665556, + "grad_norm": 0.9765625, + "learning_rate": 0.001696432048839573, + "loss": 1.045, + "step": 3984 + }, + { + "epoch": 0.2772270339837907, + "grad_norm": 1.0703125, + "learning_rate": 0.0016962703270763941, + "loss": 0.9926, + "step": 3985 + }, + { + "epoch": 0.27729660162092595, + "grad_norm": 1.15625, + "learning_rate": 0.001696108569960118, + "loss": 0.9154, + "step": 3986 + }, + { + "epoch": 0.2773661692580611, + "grad_norm": 1.140625, + "learning_rate": 0.001695946777498957, + "loss": 0.8187, + "step": 3987 + }, + { + "epoch": 0.27743573689519635, + "grad_norm": 1.65625, + "learning_rate": 0.0016957849497011264, + "loss": 1.3706, + "step": 3988 + }, + { + "epoch": 0.2775053045323316, + "grad_norm": 1.078125, + "learning_rate": 0.0016956230865748433, + "loss": 1.0324, + "step": 3989 + }, + { + "epoch": 0.27757487216946675, + "grad_norm": 1.0234375, + "learning_rate": 0.001695461188128326, + "loss": 0.7268, + "step": 3990 + }, + { + "epoch": 0.277644439806602, + "grad_norm": 1.1875, + "learning_rate": 0.001695299254369795, + "loss": 0.9714, + "step": 3991 + }, + { + "epoch": 0.2777140074437372, + "grad_norm": 1.0625, + "learning_rate": 0.0016951372853074723, + "loss": 0.8341, + "step": 3992 + }, + { + "epoch": 0.27778357508087237, + "grad_norm": 0.890625, + "learning_rate": 0.001694975280949582, + "loss": 1.0248, + "step": 3993 + }, + { + "epoch": 0.2778531427180076, + "grad_norm": 1.03125, + "learning_rate": 0.00169481324130435, + "loss": 1.0205, + "step": 3994 + }, + { + "epoch": 0.27792271035514277, + "grad_norm": 1.046875, + "learning_rate": 0.0016946511663800035, + "loss": 0.965, + "step": 3995 + }, + { + "epoch": 0.277992277992278, + "grad_norm": 1.125, + "learning_rate": 0.0016944890561847723, + "loss": 0.9839, + "step": 3996 + }, + { + "epoch": 0.2780618456294132, + "grad_norm": 1.109375, + "learning_rate": 0.0016943269107268873, + "loss": 0.8865, + "step": 3997 + }, + { + "epoch": 0.2781314132665484, + "grad_norm": 0.98046875, + "learning_rate": 0.0016941647300145813, + "loss": 0.9313, + "step": 3998 + }, + { + "epoch": 0.2782009809036836, + "grad_norm": 1.2578125, + "learning_rate": 0.0016940025140560894, + "loss": 0.9325, + "step": 3999 + }, + { + "epoch": 0.2782705485408188, + "grad_norm": 1.046875, + "learning_rate": 0.0016938402628596477, + "loss": 0.8353, + "step": 4000 + }, + { + "epoch": 0.278340116177954, + "grad_norm": 0.984375, + "learning_rate": 0.0016936779764334946, + "loss": 0.9596, + "step": 4001 + }, + { + "epoch": 0.27840968381508924, + "grad_norm": 1.1484375, + "learning_rate": 0.00169351565478587, + "loss": 1.1228, + "step": 4002 + }, + { + "epoch": 0.2784792514522244, + "grad_norm": 1.2265625, + "learning_rate": 0.0016933532979250166, + "loss": 1.0892, + "step": 4003 + }, + { + "epoch": 0.27854881908935963, + "grad_norm": 1.2734375, + "learning_rate": 0.0016931909058591772, + "loss": 0.811, + "step": 4004 + }, + { + "epoch": 0.27861838672649486, + "grad_norm": 1.21875, + "learning_rate": 0.0016930284785965975, + "loss": 0.9714, + "step": 4005 + }, + { + "epoch": 0.27868795436363003, + "grad_norm": 1.2421875, + "learning_rate": 0.001692866016145525, + "loss": 0.9037, + "step": 4006 + }, + { + "epoch": 0.27875752200076526, + "grad_norm": 1.359375, + "learning_rate": 0.0016927035185142084, + "loss": 1.0398, + "step": 4007 + }, + { + "epoch": 0.2788270896379004, + "grad_norm": 1.203125, + "learning_rate": 0.0016925409857108985, + "loss": 0.9304, + "step": 4008 + }, + { + "epoch": 0.27889665727503565, + "grad_norm": 1.0, + "learning_rate": 0.0016923784177438482, + "loss": 0.962, + "step": 4009 + }, + { + "epoch": 0.2789662249121709, + "grad_norm": 1.109375, + "learning_rate": 0.0016922158146213113, + "loss": 0.9493, + "step": 4010 + }, + { + "epoch": 0.27903579254930605, + "grad_norm": 0.98828125, + "learning_rate": 0.0016920531763515447, + "loss": 0.8456, + "step": 4011 + }, + { + "epoch": 0.2791053601864413, + "grad_norm": 1.0546875, + "learning_rate": 0.001691890502942806, + "loss": 0.7361, + "step": 4012 + }, + { + "epoch": 0.27917492782357645, + "grad_norm": 1.234375, + "learning_rate": 0.0016917277944033548, + "loss": 1.0424, + "step": 4013 + }, + { + "epoch": 0.2792444954607117, + "grad_norm": 1.1796875, + "learning_rate": 0.001691565050741453, + "loss": 0.9785, + "step": 4014 + }, + { + "epoch": 0.2793140630978469, + "grad_norm": 1.1328125, + "learning_rate": 0.0016914022719653637, + "loss": 0.8496, + "step": 4015 + }, + { + "epoch": 0.27938363073498207, + "grad_norm": 1.2578125, + "learning_rate": 0.0016912394580833516, + "loss": 0.9109, + "step": 4016 + }, + { + "epoch": 0.2794531983721173, + "grad_norm": 1.40625, + "learning_rate": 0.0016910766091036843, + "loss": 1.1295, + "step": 4017 + }, + { + "epoch": 0.2795227660092525, + "grad_norm": 1.2421875, + "learning_rate": 0.0016909137250346298, + "loss": 0.9687, + "step": 4018 + }, + { + "epoch": 0.2795923336463877, + "grad_norm": 0.921875, + "learning_rate": 0.0016907508058844588, + "loss": 0.9258, + "step": 4019 + }, + { + "epoch": 0.2796619012835229, + "grad_norm": 1.2421875, + "learning_rate": 0.0016905878516614437, + "loss": 1.2742, + "step": 4020 + }, + { + "epoch": 0.2797314689206581, + "grad_norm": 0.9921875, + "learning_rate": 0.0016904248623738584, + "loss": 0.7806, + "step": 4021 + }, + { + "epoch": 0.2798010365577933, + "grad_norm": 1.25, + "learning_rate": 0.0016902618380299783, + "loss": 1.0143, + "step": 4022 + }, + { + "epoch": 0.27987060419492854, + "grad_norm": 1.4296875, + "learning_rate": 0.0016900987786380812, + "loss": 0.8619, + "step": 4023 + }, + { + "epoch": 0.2799401718320637, + "grad_norm": 1.046875, + "learning_rate": 0.0016899356842064468, + "loss": 0.9301, + "step": 4024 + }, + { + "epoch": 0.28000973946919894, + "grad_norm": 1.140625, + "learning_rate": 0.0016897725547433556, + "loss": 0.9563, + "step": 4025 + }, + { + "epoch": 0.2800793071063341, + "grad_norm": 1.03125, + "learning_rate": 0.001689609390257091, + "loss": 1.0522, + "step": 4026 + }, + { + "epoch": 0.28014887474346933, + "grad_norm": 1.09375, + "learning_rate": 0.0016894461907559374, + "loss": 1.0169, + "step": 4027 + }, + { + "epoch": 0.28021844238060456, + "grad_norm": 1.2421875, + "learning_rate": 0.001689282956248181, + "loss": 1.1033, + "step": 4028 + }, + { + "epoch": 0.28028801001773973, + "grad_norm": 0.9453125, + "learning_rate": 0.0016891196867421109, + "loss": 0.791, + "step": 4029 + }, + { + "epoch": 0.28035757765487496, + "grad_norm": 1.1328125, + "learning_rate": 0.0016889563822460158, + "loss": 1.0634, + "step": 4030 + }, + { + "epoch": 0.2804271452920102, + "grad_norm": 1.1328125, + "learning_rate": 0.001688793042768189, + "loss": 1.2211, + "step": 4031 + }, + { + "epoch": 0.28049671292914535, + "grad_norm": 1.109375, + "learning_rate": 0.0016886296683169227, + "loss": 1.0032, + "step": 4032 + }, + { + "epoch": 0.2805662805662806, + "grad_norm": 1.1640625, + "learning_rate": 0.001688466258900513, + "loss": 1.1016, + "step": 4033 + }, + { + "epoch": 0.28063584820341575, + "grad_norm": 1.2265625, + "learning_rate": 0.0016883028145272567, + "loss": 0.8212, + "step": 4034 + }, + { + "epoch": 0.280705415840551, + "grad_norm": 1.3984375, + "learning_rate": 0.0016881393352054528, + "loss": 1.1214, + "step": 4035 + }, + { + "epoch": 0.2807749834776862, + "grad_norm": 1.078125, + "learning_rate": 0.0016879758209434022, + "loss": 0.9552, + "step": 4036 + }, + { + "epoch": 0.2808445511148214, + "grad_norm": 1.2109375, + "learning_rate": 0.0016878122717494067, + "loss": 0.7728, + "step": 4037 + }, + { + "epoch": 0.2809141187519566, + "grad_norm": 1.234375, + "learning_rate": 0.0016876486876317711, + "loss": 1.2132, + "step": 4038 + }, + { + "epoch": 0.28098368638909177, + "grad_norm": 1.2890625, + "learning_rate": 0.001687485068598801, + "loss": 1.1734, + "step": 4039 + }, + { + "epoch": 0.281053254026227, + "grad_norm": 1.34375, + "learning_rate": 0.0016873214146588046, + "loss": 0.7552, + "step": 4040 + }, + { + "epoch": 0.2811228216633622, + "grad_norm": 1.1015625, + "learning_rate": 0.0016871577258200908, + "loss": 0.9213, + "step": 4041 + }, + { + "epoch": 0.2811923893004974, + "grad_norm": 1.1640625, + "learning_rate": 0.0016869940020909713, + "loss": 0.8805, + "step": 4042 + }, + { + "epoch": 0.2812619569376326, + "grad_norm": 0.9609375, + "learning_rate": 0.0016868302434797592, + "loss": 0.9129, + "step": 4043 + }, + { + "epoch": 0.28133152457476784, + "grad_norm": 1.125, + "learning_rate": 0.0016866664499947687, + "loss": 0.9635, + "step": 4044 + }, + { + "epoch": 0.281401092211903, + "grad_norm": 1.3671875, + "learning_rate": 0.0016865026216443177, + "loss": 1.1198, + "step": 4045 + }, + { + "epoch": 0.28147065984903824, + "grad_norm": 0.953125, + "learning_rate": 0.0016863387584367233, + "loss": 0.8736, + "step": 4046 + }, + { + "epoch": 0.2815402274861734, + "grad_norm": 1.265625, + "learning_rate": 0.0016861748603803062, + "loss": 1.2039, + "step": 4047 + }, + { + "epoch": 0.28160979512330864, + "grad_norm": 1.328125, + "learning_rate": 0.0016860109274833884, + "loss": 1.3537, + "step": 4048 + }, + { + "epoch": 0.28167936276044386, + "grad_norm": 1.1953125, + "learning_rate": 0.0016858469597542936, + "loss": 0.9092, + "step": 4049 + }, + { + "epoch": 0.28174893039757903, + "grad_norm": 1.0546875, + "learning_rate": 0.0016856829572013468, + "loss": 1.0306, + "step": 4050 + }, + { + "epoch": 0.28181849803471426, + "grad_norm": 1.046875, + "learning_rate": 0.0016855189198328757, + "loss": 0.9224, + "step": 4051 + }, + { + "epoch": 0.28188806567184943, + "grad_norm": 1.234375, + "learning_rate": 0.0016853548476572092, + "loss": 0.9718, + "step": 4052 + }, + { + "epoch": 0.28195763330898466, + "grad_norm": 1.2421875, + "learning_rate": 0.0016851907406826776, + "loss": 1.1605, + "step": 4053 + }, + { + "epoch": 0.2820272009461199, + "grad_norm": 1.140625, + "learning_rate": 0.001685026598917614, + "loss": 1.1018, + "step": 4054 + }, + { + "epoch": 0.28209676858325505, + "grad_norm": 1.1640625, + "learning_rate": 0.0016848624223703527, + "loss": 0.8536, + "step": 4055 + }, + { + "epoch": 0.2821663362203903, + "grad_norm": 0.9609375, + "learning_rate": 0.0016846982110492292, + "loss": 0.7116, + "step": 4056 + }, + { + "epoch": 0.2822359038575255, + "grad_norm": 0.96484375, + "learning_rate": 0.0016845339649625818, + "loss": 0.7503, + "step": 4057 + }, + { + "epoch": 0.2823054714946607, + "grad_norm": 1.015625, + "learning_rate": 0.0016843696841187504, + "loss": 0.9039, + "step": 4058 + }, + { + "epoch": 0.2823750391317959, + "grad_norm": 1.125, + "learning_rate": 0.0016842053685260754, + "loss": 0.9546, + "step": 4059 + }, + { + "epoch": 0.2824446067689311, + "grad_norm": 1.2734375, + "learning_rate": 0.0016840410181929006, + "loss": 1.0785, + "step": 4060 + }, + { + "epoch": 0.2825141744060663, + "grad_norm": 1.1328125, + "learning_rate": 0.001683876633127571, + "loss": 0.7847, + "step": 4061 + }, + { + "epoch": 0.2825837420432015, + "grad_norm": 1.1953125, + "learning_rate": 0.0016837122133384326, + "loss": 1.0703, + "step": 4062 + }, + { + "epoch": 0.2826533096803367, + "grad_norm": 1.1015625, + "learning_rate": 0.001683547758833834, + "loss": 0.9288, + "step": 4063 + }, + { + "epoch": 0.2827228773174719, + "grad_norm": 0.99609375, + "learning_rate": 0.0016833832696221262, + "loss": 1.0972, + "step": 4064 + }, + { + "epoch": 0.2827924449546071, + "grad_norm": 1.28125, + "learning_rate": 0.00168321874571166, + "loss": 1.1096, + "step": 4065 + }, + { + "epoch": 0.2828620125917423, + "grad_norm": 1.234375, + "learning_rate": 0.0016830541871107893, + "loss": 1.0317, + "step": 4066 + }, + { + "epoch": 0.28293158022887754, + "grad_norm": 0.9609375, + "learning_rate": 0.0016828895938278703, + "loss": 0.7199, + "step": 4067 + }, + { + "epoch": 0.2830011478660127, + "grad_norm": 1.0859375, + "learning_rate": 0.0016827249658712597, + "loss": 0.9799, + "step": 4068 + }, + { + "epoch": 0.28307071550314794, + "grad_norm": 1.3125, + "learning_rate": 0.0016825603032493163, + "loss": 1.0341, + "step": 4069 + }, + { + "epoch": 0.28314028314028317, + "grad_norm": 1.5546875, + "learning_rate": 0.0016823956059704012, + "loss": 0.9213, + "step": 4070 + }, + { + "epoch": 0.28320985077741834, + "grad_norm": 1.2109375, + "learning_rate": 0.001682230874042877, + "loss": 1.188, + "step": 4071 + }, + { + "epoch": 0.28327941841455356, + "grad_norm": 0.88671875, + "learning_rate": 0.0016820661074751074, + "loss": 1.047, + "step": 4072 + }, + { + "epoch": 0.28334898605168873, + "grad_norm": 0.98046875, + "learning_rate": 0.0016819013062754587, + "loss": 0.8298, + "step": 4073 + }, + { + "epoch": 0.28341855368882396, + "grad_norm": 1.125, + "learning_rate": 0.0016817364704522987, + "loss": 0.9393, + "step": 4074 + }, + { + "epoch": 0.2834881213259592, + "grad_norm": 1.21875, + "learning_rate": 0.0016815716000139972, + "loss": 1.2058, + "step": 4075 + }, + { + "epoch": 0.28355768896309436, + "grad_norm": 1.1171875, + "learning_rate": 0.0016814066949689252, + "loss": 1.0323, + "step": 4076 + }, + { + "epoch": 0.2836272566002296, + "grad_norm": 1.0625, + "learning_rate": 0.0016812417553254556, + "loss": 1.2022, + "step": 4077 + }, + { + "epoch": 0.28369682423736475, + "grad_norm": 1.1640625, + "learning_rate": 0.0016810767810919633, + "loss": 0.9194, + "step": 4078 + }, + { + "epoch": 0.2837663918745, + "grad_norm": 1.125, + "learning_rate": 0.001680911772276825, + "loss": 0.8541, + "step": 4079 + }, + { + "epoch": 0.2838359595116352, + "grad_norm": 1.0234375, + "learning_rate": 0.0016807467288884191, + "loss": 0.9392, + "step": 4080 + }, + { + "epoch": 0.2839055271487704, + "grad_norm": 1.0390625, + "learning_rate": 0.0016805816509351255, + "loss": 1.1768, + "step": 4081 + }, + { + "epoch": 0.2839750947859056, + "grad_norm": 1.2265625, + "learning_rate": 0.001680416538425326, + "loss": 1.1473, + "step": 4082 + }, + { + "epoch": 0.28404466242304083, + "grad_norm": 1.21875, + "learning_rate": 0.0016802513913674042, + "loss": 0.9821, + "step": 4083 + }, + { + "epoch": 0.284114230060176, + "grad_norm": 1.2421875, + "learning_rate": 0.0016800862097697453, + "loss": 0.9646, + "step": 4084 + }, + { + "epoch": 0.2841837976973112, + "grad_norm": 1.140625, + "learning_rate": 0.0016799209936407369, + "loss": 1.1557, + "step": 4085 + }, + { + "epoch": 0.2842533653344464, + "grad_norm": 1.015625, + "learning_rate": 0.0016797557429887673, + "loss": 0.8796, + "step": 4086 + }, + { + "epoch": 0.2843229329715816, + "grad_norm": 1.1015625, + "learning_rate": 0.0016795904578222275, + "loss": 0.7061, + "step": 4087 + }, + { + "epoch": 0.28439250060871685, + "grad_norm": 1.1875, + "learning_rate": 0.0016794251381495094, + "loss": 1.0073, + "step": 4088 + }, + { + "epoch": 0.284462068245852, + "grad_norm": 1.203125, + "learning_rate": 0.0016792597839790074, + "loss": 0.8033, + "step": 4089 + }, + { + "epoch": 0.28453163588298724, + "grad_norm": 1.1640625, + "learning_rate": 0.0016790943953191174, + "loss": 1.1437, + "step": 4090 + }, + { + "epoch": 0.2846012035201224, + "grad_norm": 1.03125, + "learning_rate": 0.0016789289721782367, + "loss": 0.9835, + "step": 4091 + }, + { + "epoch": 0.28467077115725764, + "grad_norm": 1.0546875, + "learning_rate": 0.0016787635145647651, + "loss": 1.0082, + "step": 4092 + }, + { + "epoch": 0.28474033879439287, + "grad_norm": 1.453125, + "learning_rate": 0.0016785980224871032, + "loss": 0.8653, + "step": 4093 + }, + { + "epoch": 0.28480990643152804, + "grad_norm": 0.90625, + "learning_rate": 0.0016784324959536541, + "loss": 0.6891, + "step": 4094 + }, + { + "epoch": 0.28487947406866326, + "grad_norm": 1.3984375, + "learning_rate": 0.0016782669349728226, + "loss": 1.1953, + "step": 4095 + }, + { + "epoch": 0.2849490417057985, + "grad_norm": 1.234375, + "learning_rate": 0.0016781013395530148, + "loss": 0.8658, + "step": 4096 + }, + { + "epoch": 0.28501860934293366, + "grad_norm": 1.1328125, + "learning_rate": 0.0016779357097026389, + "loss": 0.8288, + "step": 4097 + }, + { + "epoch": 0.2850881769800689, + "grad_norm": 1.015625, + "learning_rate": 0.0016777700454301046, + "loss": 0.7104, + "step": 4098 + }, + { + "epoch": 0.28515774461720406, + "grad_norm": 1.3359375, + "learning_rate": 0.0016776043467438236, + "loss": 0.9844, + "step": 4099 + }, + { + "epoch": 0.2852273122543393, + "grad_norm": 1.3046875, + "learning_rate": 0.0016774386136522092, + "loss": 0.9912, + "step": 4100 + }, + { + "epoch": 0.2852968798914745, + "grad_norm": 1.328125, + "learning_rate": 0.0016772728461636767, + "loss": 0.9511, + "step": 4101 + }, + { + "epoch": 0.2853664475286097, + "grad_norm": 1.046875, + "learning_rate": 0.0016771070442866427, + "loss": 0.8666, + "step": 4102 + }, + { + "epoch": 0.2854360151657449, + "grad_norm": 1.0703125, + "learning_rate": 0.001676941208029526, + "loss": 1.2521, + "step": 4103 + }, + { + "epoch": 0.2855055828028801, + "grad_norm": 0.96484375, + "learning_rate": 0.0016767753374007466, + "loss": 1.1681, + "step": 4104 + }, + { + "epoch": 0.2855751504400153, + "grad_norm": 1.1015625, + "learning_rate": 0.001676609432408727, + "loss": 0.8885, + "step": 4105 + }, + { + "epoch": 0.28564471807715053, + "grad_norm": 0.9765625, + "learning_rate": 0.001676443493061891, + "loss": 0.9408, + "step": 4106 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.0234375, + "learning_rate": 0.0016762775193686632, + "loss": 1.078, + "step": 4107 + }, + { + "epoch": 0.2857838533514209, + "grad_norm": 1.328125, + "learning_rate": 0.001676111511337472, + "loss": 1.1927, + "step": 4108 + }, + { + "epoch": 0.28585342098855615, + "grad_norm": 1.3984375, + "learning_rate": 0.0016759454689767464, + "loss": 1.0382, + "step": 4109 + }, + { + "epoch": 0.2859229886256913, + "grad_norm": 1.421875, + "learning_rate": 0.0016757793922949165, + "loss": 0.9997, + "step": 4110 + }, + { + "epoch": 0.28599255626282655, + "grad_norm": 1.4296875, + "learning_rate": 0.0016756132813004153, + "loss": 1.2973, + "step": 4111 + }, + { + "epoch": 0.2860621238999617, + "grad_norm": 1.328125, + "learning_rate": 0.0016754471360016772, + "loss": 1.0318, + "step": 4112 + }, + { + "epoch": 0.28613169153709694, + "grad_norm": 0.828125, + "learning_rate": 0.001675280956407138, + "loss": 0.8628, + "step": 4113 + }, + { + "epoch": 0.28620125917423217, + "grad_norm": 1.0546875, + "learning_rate": 0.0016751147425252354, + "loss": 0.9316, + "step": 4114 + }, + { + "epoch": 0.28627082681136734, + "grad_norm": 1.1953125, + "learning_rate": 0.001674948494364409, + "loss": 0.772, + "step": 4115 + }, + { + "epoch": 0.28634039444850257, + "grad_norm": 1.453125, + "learning_rate": 0.0016747822119331003, + "loss": 1.4195, + "step": 4116 + }, + { + "epoch": 0.28640996208563774, + "grad_norm": 1.3125, + "learning_rate": 0.0016746158952397519, + "loss": 1.1103, + "step": 4117 + }, + { + "epoch": 0.28647952972277296, + "grad_norm": 1.171875, + "learning_rate": 0.0016744495442928085, + "loss": 0.9768, + "step": 4118 + }, + { + "epoch": 0.2865490973599082, + "grad_norm": 1.0625, + "learning_rate": 0.0016742831591007171, + "loss": 1.1117, + "step": 4119 + }, + { + "epoch": 0.28661866499704336, + "grad_norm": 1.453125, + "learning_rate": 0.001674116739671925, + "loss": 0.9725, + "step": 4120 + }, + { + "epoch": 0.2866882326341786, + "grad_norm": 1.3515625, + "learning_rate": 0.001673950286014883, + "loss": 1.0254, + "step": 4121 + }, + { + "epoch": 0.2867578002713138, + "grad_norm": 1.1328125, + "learning_rate": 0.001673783798138042, + "loss": 1.1598, + "step": 4122 + }, + { + "epoch": 0.286827367908449, + "grad_norm": 1.15625, + "learning_rate": 0.0016736172760498564, + "loss": 1.0397, + "step": 4123 + }, + { + "epoch": 0.2868969355455842, + "grad_norm": 1.265625, + "learning_rate": 0.0016734507197587807, + "loss": 1.1071, + "step": 4124 + }, + { + "epoch": 0.2869665031827194, + "grad_norm": 1.484375, + "learning_rate": 0.001673284129273272, + "loss": 1.0425, + "step": 4125 + }, + { + "epoch": 0.2870360708198546, + "grad_norm": 1.203125, + "learning_rate": 0.0016731175046017883, + "loss": 0.944, + "step": 4126 + }, + { + "epoch": 0.28710563845698983, + "grad_norm": 1.0859375, + "learning_rate": 0.0016729508457527908, + "loss": 0.878, + "step": 4127 + }, + { + "epoch": 0.287175206094125, + "grad_norm": 0.953125, + "learning_rate": 0.0016727841527347414, + "loss": 0.9369, + "step": 4128 + }, + { + "epoch": 0.28724477373126023, + "grad_norm": 1.140625, + "learning_rate": 0.0016726174255561035, + "loss": 0.896, + "step": 4129 + }, + { + "epoch": 0.2873143413683954, + "grad_norm": 1.0078125, + "learning_rate": 0.0016724506642253432, + "loss": 0.885, + "step": 4130 + }, + { + "epoch": 0.2873839090055306, + "grad_norm": 1.1875, + "learning_rate": 0.0016722838687509276, + "loss": 1.1057, + "step": 4131 + }, + { + "epoch": 0.28745347664266585, + "grad_norm": 1.0078125, + "learning_rate": 0.0016721170391413257, + "loss": 0.7977, + "step": 4132 + }, + { + "epoch": 0.287523044279801, + "grad_norm": 0.9375, + "learning_rate": 0.0016719501754050082, + "loss": 0.8088, + "step": 4133 + }, + { + "epoch": 0.28759261191693625, + "grad_norm": 1.078125, + "learning_rate": 0.0016717832775504475, + "loss": 0.9313, + "step": 4134 + }, + { + "epoch": 0.2876621795540715, + "grad_norm": 1.15625, + "learning_rate": 0.0016716163455861182, + "loss": 0.9587, + "step": 4135 + }, + { + "epoch": 0.28773174719120664, + "grad_norm": 1.1015625, + "learning_rate": 0.0016714493795204962, + "loss": 1.2193, + "step": 4136 + }, + { + "epoch": 0.28780131482834187, + "grad_norm": 1.125, + "learning_rate": 0.0016712823793620588, + "loss": 1.0766, + "step": 4137 + }, + { + "epoch": 0.28787088246547704, + "grad_norm": 1.2265625, + "learning_rate": 0.001671115345119286, + "loss": 0.92, + "step": 4138 + }, + { + "epoch": 0.28794045010261227, + "grad_norm": 0.859375, + "learning_rate": 0.0016709482768006584, + "loss": 0.821, + "step": 4139 + }, + { + "epoch": 0.2880100177397475, + "grad_norm": 1.125, + "learning_rate": 0.001670781174414659, + "loss": 1.1517, + "step": 4140 + }, + { + "epoch": 0.28807958537688266, + "grad_norm": 1.1171875, + "learning_rate": 0.0016706140379697727, + "loss": 0.9332, + "step": 4141 + }, + { + "epoch": 0.2881491530140179, + "grad_norm": 1.28125, + "learning_rate": 0.001670446867474486, + "loss": 0.9166, + "step": 4142 + }, + { + "epoch": 0.28821872065115306, + "grad_norm": 1.0546875, + "learning_rate": 0.0016702796629372862, + "loss": 0.9084, + "step": 4143 + }, + { + "epoch": 0.2882882882882883, + "grad_norm": 1.09375, + "learning_rate": 0.0016701124243666636, + "loss": 0.6126, + "step": 4144 + }, + { + "epoch": 0.2883578559254235, + "grad_norm": 1.15625, + "learning_rate": 0.0016699451517711102, + "loss": 0.9703, + "step": 4145 + }, + { + "epoch": 0.2884274235625587, + "grad_norm": 1.359375, + "learning_rate": 0.0016697778451591184, + "loss": 0.8664, + "step": 4146 + }, + { + "epoch": 0.2884969911996939, + "grad_norm": 1.0390625, + "learning_rate": 0.0016696105045391836, + "loss": 0.614, + "step": 4147 + }, + { + "epoch": 0.28856655883682913, + "grad_norm": 1.2109375, + "learning_rate": 0.0016694431299198024, + "loss": 1.1663, + "step": 4148 + }, + { + "epoch": 0.2886361264739643, + "grad_norm": 1.2109375, + "learning_rate": 0.0016692757213094733, + "loss": 1.2911, + "step": 4149 + }, + { + "epoch": 0.28870569411109953, + "grad_norm": 1.0546875, + "learning_rate": 0.0016691082787166967, + "loss": 0.8398, + "step": 4150 + }, + { + "epoch": 0.2887752617482347, + "grad_norm": 1.0234375, + "learning_rate": 0.001668940802149974, + "loss": 0.9779, + "step": 4151 + }, + { + "epoch": 0.28884482938536993, + "grad_norm": 1.1484375, + "learning_rate": 0.0016687732916178092, + "loss": 1.0343, + "step": 4152 + }, + { + "epoch": 0.28891439702250515, + "grad_norm": 1.1484375, + "learning_rate": 0.0016686057471287077, + "loss": 1.0199, + "step": 4153 + }, + { + "epoch": 0.2889839646596403, + "grad_norm": 0.90625, + "learning_rate": 0.0016684381686911762, + "loss": 0.8642, + "step": 4154 + }, + { + "epoch": 0.28905353229677555, + "grad_norm": 1.2421875, + "learning_rate": 0.0016682705563137237, + "loss": 0.954, + "step": 4155 + }, + { + "epoch": 0.2891230999339107, + "grad_norm": 1.3671875, + "learning_rate": 0.0016681029100048606, + "loss": 1.064, + "step": 4156 + }, + { + "epoch": 0.28919266757104595, + "grad_norm": 1.3046875, + "learning_rate": 0.0016679352297730991, + "loss": 1.0502, + "step": 4157 + }, + { + "epoch": 0.2892622352081812, + "grad_norm": 1.0703125, + "learning_rate": 0.0016677675156269536, + "loss": 1.0124, + "step": 4158 + }, + { + "epoch": 0.28933180284531634, + "grad_norm": 1.1875, + "learning_rate": 0.0016675997675749392, + "loss": 1.1415, + "step": 4159 + }, + { + "epoch": 0.28940137048245157, + "grad_norm": 1.140625, + "learning_rate": 0.0016674319856255738, + "loss": 0.9647, + "step": 4160 + }, + { + "epoch": 0.2894709381195868, + "grad_norm": 1.2578125, + "learning_rate": 0.0016672641697873761, + "loss": 0.9417, + "step": 4161 + }, + { + "epoch": 0.28954050575672197, + "grad_norm": 1.21875, + "learning_rate": 0.0016670963200688669, + "loss": 0.8899, + "step": 4162 + }, + { + "epoch": 0.2896100733938572, + "grad_norm": 1.140625, + "learning_rate": 0.0016669284364785692, + "loss": 1.1638, + "step": 4163 + }, + { + "epoch": 0.28967964103099236, + "grad_norm": 1.21875, + "learning_rate": 0.0016667605190250072, + "loss": 1.2574, + "step": 4164 + }, + { + "epoch": 0.2897492086681276, + "grad_norm": 1.375, + "learning_rate": 0.0016665925677167067, + "loss": 0.9816, + "step": 4165 + }, + { + "epoch": 0.2898187763052628, + "grad_norm": 1.3125, + "learning_rate": 0.0016664245825621954, + "loss": 0.9461, + "step": 4166 + }, + { + "epoch": 0.289888343942398, + "grad_norm": 1.03125, + "learning_rate": 0.0016662565635700028, + "loss": 0.8763, + "step": 4167 + }, + { + "epoch": 0.2899579115795332, + "grad_norm": 1.171875, + "learning_rate": 0.0016660885107486606, + "loss": 0.8769, + "step": 4168 + }, + { + "epoch": 0.2900274792166684, + "grad_norm": 1.0625, + "learning_rate": 0.0016659204241067003, + "loss": 1.0802, + "step": 4169 + }, + { + "epoch": 0.2900970468538036, + "grad_norm": 1.1640625, + "learning_rate": 0.001665752303652658, + "loss": 0.9512, + "step": 4170 + }, + { + "epoch": 0.29016661449093883, + "grad_norm": 1.203125, + "learning_rate": 0.001665584149395069, + "loss": 0.9257, + "step": 4171 + }, + { + "epoch": 0.290236182128074, + "grad_norm": 1.0078125, + "learning_rate": 0.0016654159613424717, + "loss": 0.9806, + "step": 4172 + }, + { + "epoch": 0.29030574976520923, + "grad_norm": 1.328125, + "learning_rate": 0.0016652477395034061, + "loss": 1.2123, + "step": 4173 + }, + { + "epoch": 0.2903753174023444, + "grad_norm": 1.1328125, + "learning_rate": 0.0016650794838864132, + "loss": 1.0177, + "step": 4174 + }, + { + "epoch": 0.29044488503947963, + "grad_norm": 1.0703125, + "learning_rate": 0.0016649111945000363, + "loss": 0.8756, + "step": 4175 + }, + { + "epoch": 0.29051445267661485, + "grad_norm": 1.21875, + "learning_rate": 0.0016647428713528205, + "loss": 0.9949, + "step": 4176 + }, + { + "epoch": 0.29058402031375, + "grad_norm": 1.125, + "learning_rate": 0.0016645745144533122, + "loss": 1.2037, + "step": 4177 + }, + { + "epoch": 0.29065358795088525, + "grad_norm": 1.03125, + "learning_rate": 0.0016644061238100596, + "loss": 0.8776, + "step": 4178 + }, + { + "epoch": 0.2907231555880205, + "grad_norm": 0.9453125, + "learning_rate": 0.0016642376994316132, + "loss": 1.0536, + "step": 4179 + }, + { + "epoch": 0.29079272322515565, + "grad_norm": 1.1171875, + "learning_rate": 0.001664069241326524, + "loss": 1.0989, + "step": 4180 + }, + { + "epoch": 0.2908622908622909, + "grad_norm": 1.421875, + "learning_rate": 0.0016639007495033462, + "loss": 1.1989, + "step": 4181 + }, + { + "epoch": 0.29093185849942604, + "grad_norm": 1.109375, + "learning_rate": 0.0016637322239706348, + "loss": 0.9185, + "step": 4182 + }, + { + "epoch": 0.29100142613656127, + "grad_norm": 1.03125, + "learning_rate": 0.0016635636647369463, + "loss": 0.8248, + "step": 4183 + }, + { + "epoch": 0.2910709937736965, + "grad_norm": 0.9609375, + "learning_rate": 0.0016633950718108394, + "loss": 0.8318, + "step": 4184 + }, + { + "epoch": 0.29114056141083167, + "grad_norm": 1.3125, + "learning_rate": 0.0016632264452008747, + "loss": 0.8321, + "step": 4185 + }, + { + "epoch": 0.2912101290479669, + "grad_norm": 1.203125, + "learning_rate": 0.0016630577849156142, + "loss": 1.1485, + "step": 4186 + }, + { + "epoch": 0.29127969668510206, + "grad_norm": 1.28125, + "learning_rate": 0.001662889090963621, + "loss": 1.0078, + "step": 4187 + }, + { + "epoch": 0.2913492643222373, + "grad_norm": 1.125, + "learning_rate": 0.001662720363353461, + "loss": 1.0449, + "step": 4188 + }, + { + "epoch": 0.2914188319593725, + "grad_norm": 1.109375, + "learning_rate": 0.0016625516020937015, + "loss": 0.8293, + "step": 4189 + }, + { + "epoch": 0.2914883995965077, + "grad_norm": 1.140625, + "learning_rate": 0.0016623828071929113, + "loss": 0.7138, + "step": 4190 + }, + { + "epoch": 0.2915579672336429, + "grad_norm": 1.2421875, + "learning_rate": 0.0016622139786596603, + "loss": 0.9308, + "step": 4191 + }, + { + "epoch": 0.29162753487077814, + "grad_norm": 1.3828125, + "learning_rate": 0.0016620451165025218, + "loss": 1.4084, + "step": 4192 + }, + { + "epoch": 0.2916971025079133, + "grad_norm": 0.94921875, + "learning_rate": 0.001661876220730069, + "loss": 0.8537, + "step": 4193 + }, + { + "epoch": 0.29176667014504853, + "grad_norm": 1.2109375, + "learning_rate": 0.001661707291350878, + "loss": 0.8307, + "step": 4194 + }, + { + "epoch": 0.2918362377821837, + "grad_norm": 0.91015625, + "learning_rate": 0.0016615383283735256, + "loss": 0.797, + "step": 4195 + }, + { + "epoch": 0.29190580541931893, + "grad_norm": 0.9296875, + "learning_rate": 0.0016613693318065917, + "loss": 0.8068, + "step": 4196 + }, + { + "epoch": 0.29197537305645416, + "grad_norm": 1.265625, + "learning_rate": 0.0016612003016586562, + "loss": 1.0044, + "step": 4197 + }, + { + "epoch": 0.2920449406935893, + "grad_norm": 1.1796875, + "learning_rate": 0.0016610312379383028, + "loss": 0.9487, + "step": 4198 + }, + { + "epoch": 0.29211450833072455, + "grad_norm": 1.125, + "learning_rate": 0.0016608621406541144, + "loss": 0.7799, + "step": 4199 + }, + { + "epoch": 0.2921840759678597, + "grad_norm": 1.140625, + "learning_rate": 0.0016606930098146777, + "loss": 1.0704, + "step": 4200 + }, + { + "epoch": 0.29225364360499495, + "grad_norm": 1.0546875, + "learning_rate": 0.0016605238454285801, + "loss": 1.0219, + "step": 4201 + }, + { + "epoch": 0.2923232112421302, + "grad_norm": 1.15625, + "learning_rate": 0.001660354647504411, + "loss": 1.1202, + "step": 4202 + }, + { + "epoch": 0.29239277887926535, + "grad_norm": 1.0, + "learning_rate": 0.0016601854160507613, + "loss": 0.8611, + "step": 4203 + }, + { + "epoch": 0.2924623465164006, + "grad_norm": 1.1640625, + "learning_rate": 0.0016600161510762232, + "loss": 0.8448, + "step": 4204 + }, + { + "epoch": 0.2925319141535358, + "grad_norm": 1.2109375, + "learning_rate": 0.0016598468525893923, + "loss": 0.7808, + "step": 4205 + }, + { + "epoch": 0.29260148179067097, + "grad_norm": 1.140625, + "learning_rate": 0.001659677520598864, + "loss": 1.0908, + "step": 4206 + }, + { + "epoch": 0.2926710494278062, + "grad_norm": 1.0234375, + "learning_rate": 0.0016595081551132364, + "loss": 0.6729, + "step": 4207 + }, + { + "epoch": 0.29274061706494137, + "grad_norm": 1.1484375, + "learning_rate": 0.0016593387561411085, + "loss": 1.0274, + "step": 4208 + }, + { + "epoch": 0.2928101847020766, + "grad_norm": 1.2578125, + "learning_rate": 0.0016591693236910818, + "loss": 1.0735, + "step": 4209 + }, + { + "epoch": 0.2928797523392118, + "grad_norm": 1.0078125, + "learning_rate": 0.0016589998577717596, + "loss": 0.8966, + "step": 4210 + }, + { + "epoch": 0.292949319976347, + "grad_norm": 1.046875, + "learning_rate": 0.0016588303583917462, + "loss": 1.036, + "step": 4211 + }, + { + "epoch": 0.2930188876134822, + "grad_norm": 1.03125, + "learning_rate": 0.0016586608255596477, + "loss": 1.0781, + "step": 4212 + }, + { + "epoch": 0.2930884552506174, + "grad_norm": 1.0859375, + "learning_rate": 0.0016584912592840727, + "loss": 1.0266, + "step": 4213 + }, + { + "epoch": 0.2931580228877526, + "grad_norm": 0.953125, + "learning_rate": 0.0016583216595736304, + "loss": 0.8677, + "step": 4214 + }, + { + "epoch": 0.29322759052488784, + "grad_norm": 1.046875, + "learning_rate": 0.0016581520264369325, + "loss": 0.9807, + "step": 4215 + }, + { + "epoch": 0.293297158162023, + "grad_norm": 0.9921875, + "learning_rate": 0.001657982359882592, + "loss": 0.7129, + "step": 4216 + }, + { + "epoch": 0.29336672579915823, + "grad_norm": 0.89453125, + "learning_rate": 0.0016578126599192237, + "loss": 1.0546, + "step": 4217 + }, + { + "epoch": 0.29343629343629346, + "grad_norm": 1.3046875, + "learning_rate": 0.001657642926555444, + "loss": 0.8812, + "step": 4218 + }, + { + "epoch": 0.29350586107342863, + "grad_norm": 1.1640625, + "learning_rate": 0.0016574731597998715, + "loss": 0.9671, + "step": 4219 + }, + { + "epoch": 0.29357542871056386, + "grad_norm": 1.359375, + "learning_rate": 0.001657303359661126, + "loss": 1.1712, + "step": 4220 + }, + { + "epoch": 0.293644996347699, + "grad_norm": 0.89453125, + "learning_rate": 0.001657133526147829, + "loss": 1.011, + "step": 4221 + }, + { + "epoch": 0.29371456398483425, + "grad_norm": 1.328125, + "learning_rate": 0.0016569636592686033, + "loss": 1.0849, + "step": 4222 + }, + { + "epoch": 0.2937841316219695, + "grad_norm": 1.0546875, + "learning_rate": 0.0016567937590320745, + "loss": 1.1929, + "step": 4223 + }, + { + "epoch": 0.29385369925910465, + "grad_norm": 1.125, + "learning_rate": 0.0016566238254468691, + "loss": 0.9215, + "step": 4224 + }, + { + "epoch": 0.2939232668962399, + "grad_norm": 1.5703125, + "learning_rate": 0.0016564538585216153, + "loss": 1.2911, + "step": 4225 + }, + { + "epoch": 0.29399283453337505, + "grad_norm": 1.1796875, + "learning_rate": 0.0016562838582649439, + "loss": 0.9647, + "step": 4226 + }, + { + "epoch": 0.2940624021705103, + "grad_norm": 1.1796875, + "learning_rate": 0.0016561138246854853, + "loss": 1.0962, + "step": 4227 + }, + { + "epoch": 0.2941319698076455, + "grad_norm": 1.015625, + "learning_rate": 0.0016559437577918744, + "loss": 1.0047, + "step": 4228 + }, + { + "epoch": 0.29420153744478067, + "grad_norm": 0.9609375, + "learning_rate": 0.0016557736575927454, + "loss": 0.8579, + "step": 4229 + }, + { + "epoch": 0.2942711050819159, + "grad_norm": 1.375, + "learning_rate": 0.0016556035240967355, + "loss": 0.9159, + "step": 4230 + }, + { + "epoch": 0.2943406727190511, + "grad_norm": 1.0703125, + "learning_rate": 0.0016554333573124832, + "loss": 0.8801, + "step": 4231 + }, + { + "epoch": 0.2944102403561863, + "grad_norm": 1.046875, + "learning_rate": 0.0016552631572486283, + "loss": 0.8854, + "step": 4232 + }, + { + "epoch": 0.2944798079933215, + "grad_norm": 0.9609375, + "learning_rate": 0.0016550929239138132, + "loss": 0.8029, + "step": 4233 + }, + { + "epoch": 0.2945493756304567, + "grad_norm": 1.0546875, + "learning_rate": 0.0016549226573166816, + "loss": 0.7103, + "step": 4234 + }, + { + "epoch": 0.2946189432675919, + "grad_norm": 1.2578125, + "learning_rate": 0.0016547523574658783, + "loss": 0.858, + "step": 4235 + }, + { + "epoch": 0.29468851090472714, + "grad_norm": 1.078125, + "learning_rate": 0.0016545820243700504, + "loss": 0.8784, + "step": 4236 + }, + { + "epoch": 0.2947580785418623, + "grad_norm": 1.0625, + "learning_rate": 0.001654411658037847, + "loss": 0.8463, + "step": 4237 + }, + { + "epoch": 0.29482764617899754, + "grad_norm": 1.109375, + "learning_rate": 0.0016542412584779175, + "loss": 0.9727, + "step": 4238 + }, + { + "epoch": 0.2948972138161327, + "grad_norm": 1.1328125, + "learning_rate": 0.001654070825698915, + "loss": 0.9683, + "step": 4239 + }, + { + "epoch": 0.29496678145326793, + "grad_norm": 1.234375, + "learning_rate": 0.0016539003597094927, + "loss": 1.2157, + "step": 4240 + }, + { + "epoch": 0.29503634909040316, + "grad_norm": 1.1953125, + "learning_rate": 0.0016537298605183058, + "loss": 1.0825, + "step": 4241 + }, + { + "epoch": 0.29510591672753833, + "grad_norm": 1.328125, + "learning_rate": 0.0016535593281340117, + "loss": 1.0039, + "step": 4242 + }, + { + "epoch": 0.29517548436467356, + "grad_norm": 1.21875, + "learning_rate": 0.0016533887625652692, + "loss": 0.8681, + "step": 4243 + }, + { + "epoch": 0.2952450520018088, + "grad_norm": 0.984375, + "learning_rate": 0.0016532181638207386, + "loss": 0.7514, + "step": 4244 + }, + { + "epoch": 0.29531461963894395, + "grad_norm": 1.203125, + "learning_rate": 0.001653047531909082, + "loss": 1.2113, + "step": 4245 + }, + { + "epoch": 0.2953841872760792, + "grad_norm": 1.171875, + "learning_rate": 0.0016528768668389636, + "loss": 1.1541, + "step": 4246 + }, + { + "epoch": 0.29545375491321435, + "grad_norm": 0.98828125, + "learning_rate": 0.0016527061686190485, + "loss": 1.0614, + "step": 4247 + }, + { + "epoch": 0.2955233225503496, + "grad_norm": 1.0234375, + "learning_rate": 0.001652535437258004, + "loss": 0.9572, + "step": 4248 + }, + { + "epoch": 0.2955928901874848, + "grad_norm": 0.91015625, + "learning_rate": 0.0016523646727644992, + "loss": 0.9534, + "step": 4249 + }, + { + "epoch": 0.29566245782462, + "grad_norm": 1.1484375, + "learning_rate": 0.001652193875147204, + "loss": 1.1222, + "step": 4250 + }, + { + "epoch": 0.2957320254617552, + "grad_norm": 1.2734375, + "learning_rate": 0.0016520230444147916, + "loss": 1.1133, + "step": 4251 + }, + { + "epoch": 0.29580159309889037, + "grad_norm": 1.078125, + "learning_rate": 0.0016518521805759352, + "loss": 1.1543, + "step": 4252 + }, + { + "epoch": 0.2958711607360256, + "grad_norm": 1.234375, + "learning_rate": 0.001651681283639311, + "loss": 0.9325, + "step": 4253 + }, + { + "epoch": 0.2959407283731608, + "grad_norm": 1.2421875, + "learning_rate": 0.0016515103536135956, + "loss": 1.1222, + "step": 4254 + }, + { + "epoch": 0.296010296010296, + "grad_norm": 1.2578125, + "learning_rate": 0.0016513393905074683, + "loss": 0.9288, + "step": 4255 + }, + { + "epoch": 0.2960798636474312, + "grad_norm": 1.0, + "learning_rate": 0.00165116839432961, + "loss": 0.8688, + "step": 4256 + }, + { + "epoch": 0.29614943128456644, + "grad_norm": 1.0703125, + "learning_rate": 0.0016509973650887023, + "loss": 0.9634, + "step": 4257 + }, + { + "epoch": 0.2962189989217016, + "grad_norm": 1.0, + "learning_rate": 0.0016508263027934303, + "loss": 0.9483, + "step": 4258 + }, + { + "epoch": 0.29628856655883684, + "grad_norm": 1.125, + "learning_rate": 0.0016506552074524784, + "loss": 1.0677, + "step": 4259 + }, + { + "epoch": 0.296358134195972, + "grad_norm": 0.95703125, + "learning_rate": 0.001650484079074535, + "loss": 0.8843, + "step": 4260 + }, + { + "epoch": 0.29642770183310724, + "grad_norm": 0.921875, + "learning_rate": 0.0016503129176682887, + "loss": 1.0493, + "step": 4261 + }, + { + "epoch": 0.29649726947024246, + "grad_norm": 1.3203125, + "learning_rate": 0.00165014172324243, + "loss": 1.1126, + "step": 4262 + }, + { + "epoch": 0.29656683710737763, + "grad_norm": 1.3359375, + "learning_rate": 0.0016499704958056521, + "loss": 1.1817, + "step": 4263 + }, + { + "epoch": 0.29663640474451286, + "grad_norm": 1.1015625, + "learning_rate": 0.001649799235366648, + "loss": 0.7771, + "step": 4264 + }, + { + "epoch": 0.29670597238164803, + "grad_norm": 1.3125, + "learning_rate": 0.0016496279419341143, + "loss": 1.1077, + "step": 4265 + }, + { + "epoch": 0.29677554001878326, + "grad_norm": 1.53125, + "learning_rate": 0.001649456615516748, + "loss": 1.017, + "step": 4266 + }, + { + "epoch": 0.2968451076559185, + "grad_norm": 1.375, + "learning_rate": 0.0016492852561232482, + "loss": 1.0378, + "step": 4267 + }, + { + "epoch": 0.29691467529305365, + "grad_norm": 1.0, + "learning_rate": 0.0016491138637623156, + "loss": 0.8971, + "step": 4268 + }, + { + "epoch": 0.2969842429301889, + "grad_norm": 1.328125, + "learning_rate": 0.0016489424384426529, + "loss": 1.1585, + "step": 4269 + }, + { + "epoch": 0.2970538105673241, + "grad_norm": 1.015625, + "learning_rate": 0.001648770980172964, + "loss": 0.9772, + "step": 4270 + }, + { + "epoch": 0.2971233782044593, + "grad_norm": 1.2109375, + "learning_rate": 0.0016485994889619549, + "loss": 0.9343, + "step": 4271 + }, + { + "epoch": 0.2971929458415945, + "grad_norm": 1.109375, + "learning_rate": 0.0016484279648183331, + "loss": 0.8485, + "step": 4272 + }, + { + "epoch": 0.2972625134787297, + "grad_norm": 1.296875, + "learning_rate": 0.0016482564077508074, + "loss": 0.9716, + "step": 4273 + }, + { + "epoch": 0.2973320811158649, + "grad_norm": 1.125, + "learning_rate": 0.0016480848177680887, + "loss": 0.9989, + "step": 4274 + }, + { + "epoch": 0.2974016487530001, + "grad_norm": 1.09375, + "learning_rate": 0.0016479131948788895, + "loss": 0.735, + "step": 4275 + }, + { + "epoch": 0.2974712163901353, + "grad_norm": 1.1796875, + "learning_rate": 0.001647741539091924, + "loss": 1.0061, + "step": 4276 + }, + { + "epoch": 0.2975407840272705, + "grad_norm": 1.0078125, + "learning_rate": 0.0016475698504159083, + "loss": 0.7155, + "step": 4277 + }, + { + "epoch": 0.2976103516644057, + "grad_norm": 1.421875, + "learning_rate": 0.0016473981288595589, + "loss": 1.0526, + "step": 4278 + }, + { + "epoch": 0.2976799193015409, + "grad_norm": 1.15625, + "learning_rate": 0.0016472263744315963, + "loss": 0.819, + "step": 4279 + }, + { + "epoch": 0.29774948693867614, + "grad_norm": 1.359375, + "learning_rate": 0.0016470545871407405, + "loss": 1.3974, + "step": 4280 + }, + { + "epoch": 0.2978190545758113, + "grad_norm": 1.1328125, + "learning_rate": 0.0016468827669957142, + "loss": 0.959, + "step": 4281 + }, + { + "epoch": 0.29788862221294654, + "grad_norm": 1.1640625, + "learning_rate": 0.0016467109140052415, + "loss": 1.0633, + "step": 4282 + }, + { + "epoch": 0.29795818985008177, + "grad_norm": 1.03125, + "learning_rate": 0.001646539028178048, + "loss": 0.8363, + "step": 4283 + }, + { + "epoch": 0.29802775748721694, + "grad_norm": 1.0546875, + "learning_rate": 0.0016463671095228618, + "loss": 0.9048, + "step": 4284 + }, + { + "epoch": 0.29809732512435216, + "grad_norm": 1.3046875, + "learning_rate": 0.0016461951580484116, + "loss": 1.3229, + "step": 4285 + }, + { + "epoch": 0.29816689276148733, + "grad_norm": 1.234375, + "learning_rate": 0.0016460231737634283, + "loss": 1.0129, + "step": 4286 + }, + { + "epoch": 0.29823646039862256, + "grad_norm": 0.98046875, + "learning_rate": 0.0016458511566766446, + "loss": 0.7665, + "step": 4287 + }, + { + "epoch": 0.2983060280357578, + "grad_norm": 1.6015625, + "learning_rate": 0.0016456791067967942, + "loss": 1.1054, + "step": 4288 + }, + { + "epoch": 0.29837559567289296, + "grad_norm": 1.296875, + "learning_rate": 0.0016455070241326133, + "loss": 1.0423, + "step": 4289 + }, + { + "epoch": 0.2984451633100282, + "grad_norm": 1.0859375, + "learning_rate": 0.0016453349086928395, + "loss": 0.8518, + "step": 4290 + }, + { + "epoch": 0.29851473094716335, + "grad_norm": 1.0625, + "learning_rate": 0.0016451627604862115, + "loss": 0.7496, + "step": 4291 + }, + { + "epoch": 0.2985842985842986, + "grad_norm": 1.2890625, + "learning_rate": 0.0016449905795214706, + "loss": 1.0012, + "step": 4292 + }, + { + "epoch": 0.2986538662214338, + "grad_norm": 1.078125, + "learning_rate": 0.001644818365807359, + "loss": 1.0817, + "step": 4293 + }, + { + "epoch": 0.298723433858569, + "grad_norm": 1.546875, + "learning_rate": 0.001644646119352621, + "loss": 1.3537, + "step": 4294 + }, + { + "epoch": 0.2987930014957042, + "grad_norm": 1.078125, + "learning_rate": 0.0016444738401660021, + "loss": 0.8799, + "step": 4295 + }, + { + "epoch": 0.29886256913283943, + "grad_norm": 1.0234375, + "learning_rate": 0.0016443015282562499, + "loss": 0.9406, + "step": 4296 + }, + { + "epoch": 0.2989321367699746, + "grad_norm": 1.015625, + "learning_rate": 0.0016441291836321139, + "loss": 0.9568, + "step": 4297 + }, + { + "epoch": 0.2990017044071098, + "grad_norm": 1.0390625, + "learning_rate": 0.0016439568063023446, + "loss": 0.8902, + "step": 4298 + }, + { + "epoch": 0.299071272044245, + "grad_norm": 0.9453125, + "learning_rate": 0.0016437843962756942, + "loss": 0.9022, + "step": 4299 + }, + { + "epoch": 0.2991408396813802, + "grad_norm": 1.0078125, + "learning_rate": 0.0016436119535609176, + "loss": 0.96, + "step": 4300 + }, + { + "epoch": 0.29921040731851545, + "grad_norm": 1.1015625, + "learning_rate": 0.0016434394781667696, + "loss": 1.0838, + "step": 4301 + }, + { + "epoch": 0.2992799749556506, + "grad_norm": 1.109375, + "learning_rate": 0.0016432669701020083, + "loss": 1.1138, + "step": 4302 + }, + { + "epoch": 0.29934954259278584, + "grad_norm": 1.1015625, + "learning_rate": 0.0016430944293753921, + "loss": 0.8477, + "step": 4303 + }, + { + "epoch": 0.299419110229921, + "grad_norm": 0.98046875, + "learning_rate": 0.0016429218559956826, + "loss": 0.8323, + "step": 4304 + }, + { + "epoch": 0.29948867786705624, + "grad_norm": 1.2890625, + "learning_rate": 0.001642749249971642, + "loss": 0.9095, + "step": 4305 + }, + { + "epoch": 0.29955824550419147, + "grad_norm": 1.1015625, + "learning_rate": 0.0016425766113120337, + "loss": 0.7988, + "step": 4306 + }, + { + "epoch": 0.29962781314132664, + "grad_norm": 0.9375, + "learning_rate": 0.0016424039400256244, + "loss": 0.8935, + "step": 4307 + }, + { + "epoch": 0.29969738077846186, + "grad_norm": 0.98828125, + "learning_rate": 0.0016422312361211806, + "loss": 1.1378, + "step": 4308 + }, + { + "epoch": 0.2997669484155971, + "grad_norm": 1.0546875, + "learning_rate": 0.001642058499607472, + "loss": 1.0826, + "step": 4309 + }, + { + "epoch": 0.29983651605273226, + "grad_norm": 0.80859375, + "learning_rate": 0.0016418857304932686, + "loss": 0.682, + "step": 4310 + }, + { + "epoch": 0.2999060836898675, + "grad_norm": 1.0078125, + "learning_rate": 0.0016417129287873435, + "loss": 0.8288, + "step": 4311 + }, + { + "epoch": 0.29997565132700266, + "grad_norm": 1.1328125, + "learning_rate": 0.0016415400944984702, + "loss": 1.1236, + "step": 4312 + }, + { + "epoch": 0.3000452189641379, + "grad_norm": 1.09375, + "learning_rate": 0.0016413672276354245, + "loss": 0.8868, + "step": 4313 + }, + { + "epoch": 0.3001147866012731, + "grad_norm": 0.859375, + "learning_rate": 0.0016411943282069838, + "loss": 0.6118, + "step": 4314 + }, + { + "epoch": 0.3001843542384083, + "grad_norm": 1.03125, + "learning_rate": 0.001641021396221927, + "loss": 0.8889, + "step": 4315 + }, + { + "epoch": 0.3002539218755435, + "grad_norm": 0.87109375, + "learning_rate": 0.0016408484316890347, + "loss": 0.7158, + "step": 4316 + }, + { + "epoch": 0.3003234895126787, + "grad_norm": 1.0546875, + "learning_rate": 0.001640675434617089, + "loss": 1.0108, + "step": 4317 + }, + { + "epoch": 0.3003930571498139, + "grad_norm": 1.1171875, + "learning_rate": 0.001640502405014874, + "loss": 0.9027, + "step": 4318 + }, + { + "epoch": 0.30046262478694913, + "grad_norm": 0.96484375, + "learning_rate": 0.0016403293428911754, + "loss": 0.9381, + "step": 4319 + }, + { + "epoch": 0.3005321924240843, + "grad_norm": 1.2265625, + "learning_rate": 0.00164015624825478, + "loss": 1.1626, + "step": 4320 + }, + { + "epoch": 0.3006017600612195, + "grad_norm": 1.140625, + "learning_rate": 0.0016399831211144772, + "loss": 1.0997, + "step": 4321 + }, + { + "epoch": 0.30067132769835475, + "grad_norm": 1.1953125, + "learning_rate": 0.001639809961479057, + "loss": 1.1818, + "step": 4322 + }, + { + "epoch": 0.3007408953354899, + "grad_norm": 1.1640625, + "learning_rate": 0.0016396367693573119, + "loss": 1.1089, + "step": 4323 + }, + { + "epoch": 0.30081046297262515, + "grad_norm": 1.2578125, + "learning_rate": 0.0016394635447580358, + "loss": 1.216, + "step": 4324 + }, + { + "epoch": 0.3008800306097603, + "grad_norm": 1.0859375, + "learning_rate": 0.0016392902876900242, + "loss": 1.1119, + "step": 4325 + }, + { + "epoch": 0.30094959824689554, + "grad_norm": 1.2109375, + "learning_rate": 0.001639116998162074, + "loss": 1.2958, + "step": 4326 + }, + { + "epoch": 0.30101916588403077, + "grad_norm": 1.03125, + "learning_rate": 0.0016389436761829836, + "loss": 0.8656, + "step": 4327 + }, + { + "epoch": 0.30108873352116594, + "grad_norm": 1.125, + "learning_rate": 0.0016387703217615541, + "loss": 0.9134, + "step": 4328 + }, + { + "epoch": 0.30115830115830117, + "grad_norm": 1.0234375, + "learning_rate": 0.0016385969349065875, + "loss": 0.7285, + "step": 4329 + }, + { + "epoch": 0.30122786879543634, + "grad_norm": 1.1796875, + "learning_rate": 0.001638423515626887, + "loss": 1.0429, + "step": 4330 + }, + { + "epoch": 0.30129743643257156, + "grad_norm": 1.2421875, + "learning_rate": 0.0016382500639312582, + "loss": 1.1032, + "step": 4331 + }, + { + "epoch": 0.3013670040697068, + "grad_norm": 1.140625, + "learning_rate": 0.0016380765798285086, + "loss": 0.9027, + "step": 4332 + }, + { + "epoch": 0.30143657170684196, + "grad_norm": 1.1796875, + "learning_rate": 0.0016379030633274462, + "loss": 0.8908, + "step": 4333 + }, + { + "epoch": 0.3015061393439772, + "grad_norm": 1.1953125, + "learning_rate": 0.0016377295144368816, + "loss": 0.9926, + "step": 4334 + }, + { + "epoch": 0.3015757069811124, + "grad_norm": 1.21875, + "learning_rate": 0.0016375559331656265, + "loss": 0.8325, + "step": 4335 + }, + { + "epoch": 0.3016452746182476, + "grad_norm": 1.6796875, + "learning_rate": 0.0016373823195224943, + "loss": 0.8746, + "step": 4336 + }, + { + "epoch": 0.3017148422553828, + "grad_norm": 0.94140625, + "learning_rate": 0.0016372086735163011, + "loss": 0.9151, + "step": 4337 + }, + { + "epoch": 0.301784409892518, + "grad_norm": 0.96484375, + "learning_rate": 0.0016370349951558632, + "loss": 0.8559, + "step": 4338 + }, + { + "epoch": 0.3018539775296532, + "grad_norm": 1.21875, + "learning_rate": 0.001636861284449999, + "loss": 1.004, + "step": 4339 + }, + { + "epoch": 0.30192354516678843, + "grad_norm": 1.2109375, + "learning_rate": 0.0016366875414075288, + "loss": 0.8747, + "step": 4340 + }, + { + "epoch": 0.3019931128039236, + "grad_norm": 1.40625, + "learning_rate": 0.0016365137660372744, + "loss": 0.9914, + "step": 4341 + }, + { + "epoch": 0.30206268044105883, + "grad_norm": 1.046875, + "learning_rate": 0.0016363399583480592, + "loss": 0.9239, + "step": 4342 + }, + { + "epoch": 0.302132248078194, + "grad_norm": 0.98828125, + "learning_rate": 0.0016361661183487085, + "loss": 0.8221, + "step": 4343 + }, + { + "epoch": 0.3022018157153292, + "grad_norm": 0.9140625, + "learning_rate": 0.0016359922460480484, + "loss": 0.7647, + "step": 4344 + }, + { + "epoch": 0.30227138335246445, + "grad_norm": 1.203125, + "learning_rate": 0.001635818341454908, + "loss": 1.116, + "step": 4345 + }, + { + "epoch": 0.3023409509895996, + "grad_norm": 1.2265625, + "learning_rate": 0.001635644404578117, + "loss": 1.0775, + "step": 4346 + }, + { + "epoch": 0.30241051862673485, + "grad_norm": 0.67578125, + "learning_rate": 0.0016354704354265071, + "loss": 0.7777, + "step": 4347 + }, + { + "epoch": 0.3024800862638701, + "grad_norm": 0.9765625, + "learning_rate": 0.0016352964340089113, + "loss": 0.9641, + "step": 4348 + }, + { + "epoch": 0.30254965390100524, + "grad_norm": 1.09375, + "learning_rate": 0.0016351224003341644, + "loss": 1.0605, + "step": 4349 + }, + { + "epoch": 0.30261922153814047, + "grad_norm": 1.0859375, + "learning_rate": 0.0016349483344111038, + "loss": 0.6386, + "step": 4350 + }, + { + "epoch": 0.30268878917527564, + "grad_norm": 1.03125, + "learning_rate": 0.0016347742362485672, + "loss": 0.9836, + "step": 4351 + }, + { + "epoch": 0.30275835681241087, + "grad_norm": 1.2109375, + "learning_rate": 0.0016346001058553938, + "loss": 0.9887, + "step": 4352 + }, + { + "epoch": 0.3028279244495461, + "grad_norm": 1.171875, + "learning_rate": 0.001634425943240426, + "loss": 1.0901, + "step": 4353 + }, + { + "epoch": 0.30289749208668126, + "grad_norm": 0.9453125, + "learning_rate": 0.0016342517484125069, + "loss": 1.0028, + "step": 4354 + }, + { + "epoch": 0.3029670597238165, + "grad_norm": 1.1484375, + "learning_rate": 0.0016340775213804803, + "loss": 0.976, + "step": 4355 + }, + { + "epoch": 0.30303662736095166, + "grad_norm": 1.0234375, + "learning_rate": 0.0016339032621531936, + "loss": 0.8511, + "step": 4356 + }, + { + "epoch": 0.3031061949980869, + "grad_norm": 0.9140625, + "learning_rate": 0.0016337289707394939, + "loss": 0.7611, + "step": 4357 + }, + { + "epoch": 0.3031757626352221, + "grad_norm": 1.0, + "learning_rate": 0.0016335546471482317, + "loss": 0.7739, + "step": 4358 + }, + { + "epoch": 0.3032453302723573, + "grad_norm": 1.046875, + "learning_rate": 0.0016333802913882573, + "loss": 0.8467, + "step": 4359 + }, + { + "epoch": 0.3033148979094925, + "grad_norm": 0.859375, + "learning_rate": 0.0016332059034684248, + "loss": 0.9224, + "step": 4360 + }, + { + "epoch": 0.30338446554662774, + "grad_norm": 1.46875, + "learning_rate": 0.001633031483397588, + "loss": 0.8146, + "step": 4361 + }, + { + "epoch": 0.3034540331837629, + "grad_norm": 1.3671875, + "learning_rate": 0.0016328570311846032, + "loss": 1.1802, + "step": 4362 + }, + { + "epoch": 0.30352360082089813, + "grad_norm": 1.0625, + "learning_rate": 0.001632682546838328, + "loss": 1.055, + "step": 4363 + }, + { + "epoch": 0.3035931684580333, + "grad_norm": 1.1875, + "learning_rate": 0.0016325080303676218, + "loss": 1.0319, + "step": 4364 + }, + { + "epoch": 0.30366273609516853, + "grad_norm": 1.203125, + "learning_rate": 0.0016323334817813465, + "loss": 0.8394, + "step": 4365 + }, + { + "epoch": 0.30373230373230375, + "grad_norm": 1.125, + "learning_rate": 0.001632158901088364, + "loss": 1.1423, + "step": 4366 + }, + { + "epoch": 0.3038018713694389, + "grad_norm": 1.0703125, + "learning_rate": 0.0016319842882975386, + "loss": 1.1017, + "step": 4367 + }, + { + "epoch": 0.30387143900657415, + "grad_norm": 1.1796875, + "learning_rate": 0.0016318096434177365, + "loss": 1.069, + "step": 4368 + }, + { + "epoch": 0.3039410066437093, + "grad_norm": 1.0625, + "learning_rate": 0.0016316349664578253, + "loss": 1.0585, + "step": 4369 + }, + { + "epoch": 0.30401057428084455, + "grad_norm": 0.9296875, + "learning_rate": 0.0016314602574266743, + "loss": 0.893, + "step": 4370 + }, + { + "epoch": 0.3040801419179798, + "grad_norm": 1.1796875, + "learning_rate": 0.0016312855163331543, + "loss": 1.1369, + "step": 4371 + }, + { + "epoch": 0.30414970955511494, + "grad_norm": 1.15625, + "learning_rate": 0.0016311107431861377, + "loss": 0.8239, + "step": 4372 + }, + { + "epoch": 0.30421927719225017, + "grad_norm": 1.21875, + "learning_rate": 0.001630935937994498, + "loss": 0.9099, + "step": 4373 + }, + { + "epoch": 0.3042888448293854, + "grad_norm": 1.0546875, + "learning_rate": 0.0016307611007671122, + "loss": 0.8812, + "step": 4374 + }, + { + "epoch": 0.30435841246652057, + "grad_norm": 1.09375, + "learning_rate": 0.0016305862315128565, + "loss": 1.1323, + "step": 4375 + }, + { + "epoch": 0.3044279801036558, + "grad_norm": 0.95703125, + "learning_rate": 0.001630411330240611, + "loss": 0.9536, + "step": 4376 + }, + { + "epoch": 0.30449754774079096, + "grad_norm": 1.421875, + "learning_rate": 0.0016302363969592551, + "loss": 1.147, + "step": 4377 + }, + { + "epoch": 0.3045671153779262, + "grad_norm": 1.1015625, + "learning_rate": 0.0016300614316776718, + "loss": 1.0402, + "step": 4378 + }, + { + "epoch": 0.3046366830150614, + "grad_norm": 1.21875, + "learning_rate": 0.0016298864344047447, + "loss": 1.0486, + "step": 4379 + }, + { + "epoch": 0.3047062506521966, + "grad_norm": 1.1328125, + "learning_rate": 0.0016297114051493592, + "loss": 0.9422, + "step": 4380 + }, + { + "epoch": 0.3047758182893318, + "grad_norm": 1.234375, + "learning_rate": 0.0016295363439204028, + "loss": 1.076, + "step": 4381 + }, + { + "epoch": 0.304845385926467, + "grad_norm": 1.1484375, + "learning_rate": 0.0016293612507267637, + "loss": 0.9279, + "step": 4382 + }, + { + "epoch": 0.3049149535636022, + "grad_norm": 1.1328125, + "learning_rate": 0.0016291861255773325, + "loss": 0.9118, + "step": 4383 + }, + { + "epoch": 0.30498452120073744, + "grad_norm": 1.0546875, + "learning_rate": 0.0016290109684810013, + "loss": 0.8204, + "step": 4384 + }, + { + "epoch": 0.3050540888378726, + "grad_norm": 1.1328125, + "learning_rate": 0.0016288357794466638, + "loss": 1.0852, + "step": 4385 + }, + { + "epoch": 0.30512365647500783, + "grad_norm": 0.9921875, + "learning_rate": 0.0016286605584832144, + "loss": 0.9775, + "step": 4386 + }, + { + "epoch": 0.30519322411214306, + "grad_norm": 1.0078125, + "learning_rate": 0.001628485305599551, + "loss": 0.8048, + "step": 4387 + }, + { + "epoch": 0.30526279174927823, + "grad_norm": 0.94140625, + "learning_rate": 0.0016283100208045714, + "loss": 0.9148, + "step": 4388 + }, + { + "epoch": 0.30533235938641345, + "grad_norm": 1.203125, + "learning_rate": 0.0016281347041071758, + "loss": 1.0056, + "step": 4389 + }, + { + "epoch": 0.3054019270235486, + "grad_norm": 0.89453125, + "learning_rate": 0.0016279593555162662, + "loss": 0.9933, + "step": 4390 + }, + { + "epoch": 0.30547149466068385, + "grad_norm": 0.984375, + "learning_rate": 0.0016277839750407455, + "loss": 0.9661, + "step": 4391 + }, + { + "epoch": 0.3055410622978191, + "grad_norm": 1.2109375, + "learning_rate": 0.001627608562689519, + "loss": 1.0881, + "step": 4392 + }, + { + "epoch": 0.30561062993495425, + "grad_norm": 0.94140625, + "learning_rate": 0.0016274331184714928, + "loss": 0.8765, + "step": 4393 + }, + { + "epoch": 0.3056801975720895, + "grad_norm": 1.046875, + "learning_rate": 0.0016272576423955753, + "loss": 0.859, + "step": 4394 + }, + { + "epoch": 0.30574976520922464, + "grad_norm": 1.0390625, + "learning_rate": 0.0016270821344706765, + "loss": 0.8187, + "step": 4395 + }, + { + "epoch": 0.30581933284635987, + "grad_norm": 1.0234375, + "learning_rate": 0.0016269065947057079, + "loss": 1.2265, + "step": 4396 + }, + { + "epoch": 0.3058889004834951, + "grad_norm": 1.28125, + "learning_rate": 0.0016267310231095817, + "loss": 1.0202, + "step": 4397 + }, + { + "epoch": 0.30595846812063027, + "grad_norm": 1.140625, + "learning_rate": 0.0016265554196912137, + "loss": 0.7479, + "step": 4398 + }, + { + "epoch": 0.3060280357577655, + "grad_norm": 1.1171875, + "learning_rate": 0.001626379784459519, + "loss": 0.9621, + "step": 4399 + }, + { + "epoch": 0.3060976033949007, + "grad_norm": 1.3203125, + "learning_rate": 0.0016262041174234163, + "loss": 0.9615, + "step": 4400 + }, + { + "epoch": 0.3061671710320359, + "grad_norm": 1.2734375, + "learning_rate": 0.001626028418591825, + "loss": 1.0641, + "step": 4401 + }, + { + "epoch": 0.3062367386691711, + "grad_norm": 1.09375, + "learning_rate": 0.0016258526879736658, + "loss": 0.8778, + "step": 4402 + }, + { + "epoch": 0.3063063063063063, + "grad_norm": 1.234375, + "learning_rate": 0.0016256769255778615, + "loss": 0.8062, + "step": 4403 + }, + { + "epoch": 0.3063758739434415, + "grad_norm": 1.1796875, + "learning_rate": 0.001625501131413337, + "loss": 0.783, + "step": 4404 + }, + { + "epoch": 0.30644544158057674, + "grad_norm": 1.140625, + "learning_rate": 0.0016253253054890173, + "loss": 1.1104, + "step": 4405 + }, + { + "epoch": 0.3065150092177119, + "grad_norm": 1.5390625, + "learning_rate": 0.001625149447813831, + "loss": 0.8862, + "step": 4406 + }, + { + "epoch": 0.30658457685484714, + "grad_norm": 1.0859375, + "learning_rate": 0.001624973558396706, + "loss": 0.992, + "step": 4407 + }, + { + "epoch": 0.3066541444919823, + "grad_norm": 1.3359375, + "learning_rate": 0.0016247976372465744, + "loss": 0.7775, + "step": 4408 + }, + { + "epoch": 0.30672371212911753, + "grad_norm": 1.0703125, + "learning_rate": 0.001624621684372368, + "loss": 0.8106, + "step": 4409 + }, + { + "epoch": 0.30679327976625276, + "grad_norm": 0.93359375, + "learning_rate": 0.0016244456997830203, + "loss": 1.0442, + "step": 4410 + }, + { + "epoch": 0.30686284740338793, + "grad_norm": 1.078125, + "learning_rate": 0.001624269683487468, + "loss": 1.1166, + "step": 4411 + }, + { + "epoch": 0.30693241504052315, + "grad_norm": 1.078125, + "learning_rate": 0.0016240936354946474, + "loss": 0.896, + "step": 4412 + }, + { + "epoch": 0.3070019826776584, + "grad_norm": 0.90625, + "learning_rate": 0.0016239175558134976, + "loss": 1.0908, + "step": 4413 + }, + { + "epoch": 0.30707155031479355, + "grad_norm": 1.1484375, + "learning_rate": 0.0016237414444529592, + "loss": 1.1792, + "step": 4414 + }, + { + "epoch": 0.3071411179519288, + "grad_norm": 1.21875, + "learning_rate": 0.0016235653014219742, + "loss": 0.9716, + "step": 4415 + }, + { + "epoch": 0.30721068558906395, + "grad_norm": 1.8125, + "learning_rate": 0.001623389126729486, + "loss": 0.9999, + "step": 4416 + }, + { + "epoch": 0.3072802532261992, + "grad_norm": 0.88671875, + "learning_rate": 0.0016232129203844403, + "loss": 1.0203, + "step": 4417 + }, + { + "epoch": 0.3073498208633344, + "grad_norm": 0.94140625, + "learning_rate": 0.0016230366823957836, + "loss": 1.0839, + "step": 4418 + }, + { + "epoch": 0.30741938850046957, + "grad_norm": 0.890625, + "learning_rate": 0.0016228604127724645, + "loss": 0.8569, + "step": 4419 + }, + { + "epoch": 0.3074889561376048, + "grad_norm": 1.3359375, + "learning_rate": 0.0016226841115234332, + "loss": 1.134, + "step": 4420 + }, + { + "epoch": 0.30755852377473997, + "grad_norm": 0.953125, + "learning_rate": 0.0016225077786576412, + "loss": 0.795, + "step": 4421 + }, + { + "epoch": 0.3076280914118752, + "grad_norm": 1.234375, + "learning_rate": 0.0016223314141840417, + "loss": 1.0879, + "step": 4422 + }, + { + "epoch": 0.3076976590490104, + "grad_norm": 1.1015625, + "learning_rate": 0.0016221550181115898, + "loss": 0.8837, + "step": 4423 + }, + { + "epoch": 0.3077672266861456, + "grad_norm": 1.203125, + "learning_rate": 0.0016219785904492423, + "loss": 1.3526, + "step": 4424 + }, + { + "epoch": 0.3078367943232808, + "grad_norm": 1.2890625, + "learning_rate": 0.001621802131205957, + "loss": 0.8536, + "step": 4425 + }, + { + "epoch": 0.30790636196041604, + "grad_norm": 1.0234375, + "learning_rate": 0.0016216256403906932, + "loss": 0.9965, + "step": 4426 + }, + { + "epoch": 0.3079759295975512, + "grad_norm": 1.34375, + "learning_rate": 0.0016214491180124128, + "loss": 1.1338, + "step": 4427 + }, + { + "epoch": 0.30804549723468644, + "grad_norm": 0.84375, + "learning_rate": 0.0016212725640800784, + "loss": 0.9712, + "step": 4428 + }, + { + "epoch": 0.3081150648718216, + "grad_norm": 1.25, + "learning_rate": 0.001621095978602655, + "loss": 1.1477, + "step": 4429 + }, + { + "epoch": 0.30818463250895684, + "grad_norm": 1.0625, + "learning_rate": 0.0016209193615891078, + "loss": 0.8307, + "step": 4430 + }, + { + "epoch": 0.30825420014609206, + "grad_norm": 1.15625, + "learning_rate": 0.0016207427130484056, + "loss": 1.0563, + "step": 4431 + }, + { + "epoch": 0.30832376778322723, + "grad_norm": 1.1015625, + "learning_rate": 0.001620566032989517, + "loss": 1.1016, + "step": 4432 + }, + { + "epoch": 0.30839333542036246, + "grad_norm": 1.2734375, + "learning_rate": 0.001620389321421413, + "loss": 1.1264, + "step": 4433 + }, + { + "epoch": 0.30846290305749763, + "grad_norm": 1.171875, + "learning_rate": 0.0016202125783530666, + "loss": 1.1547, + "step": 4434 + }, + { + "epoch": 0.30853247069463285, + "grad_norm": 1.203125, + "learning_rate": 0.0016200358037934512, + "loss": 1.0316, + "step": 4435 + }, + { + "epoch": 0.3086020383317681, + "grad_norm": 1.015625, + "learning_rate": 0.0016198589977515431, + "loss": 0.9454, + "step": 4436 + }, + { + "epoch": 0.30867160596890325, + "grad_norm": 1.0859375, + "learning_rate": 0.0016196821602363193, + "loss": 0.9787, + "step": 4437 + }, + { + "epoch": 0.3087411736060385, + "grad_norm": 0.96484375, + "learning_rate": 0.001619505291256759, + "loss": 0.9956, + "step": 4438 + }, + { + "epoch": 0.3088107412431737, + "grad_norm": 1.2578125, + "learning_rate": 0.0016193283908218423, + "loss": 1.0377, + "step": 4439 + }, + { + "epoch": 0.3088803088803089, + "grad_norm": 1.2265625, + "learning_rate": 0.001619151458940552, + "loss": 1.0214, + "step": 4440 + }, + { + "epoch": 0.3089498765174441, + "grad_norm": 0.96875, + "learning_rate": 0.001618974495621871, + "loss": 0.7111, + "step": 4441 + }, + { + "epoch": 0.30901944415457927, + "grad_norm": 1.171875, + "learning_rate": 0.001618797500874785, + "loss": 1.1319, + "step": 4442 + }, + { + "epoch": 0.3090890117917145, + "grad_norm": 0.921875, + "learning_rate": 0.001618620474708281, + "loss": 1.0498, + "step": 4443 + }, + { + "epoch": 0.3091585794288497, + "grad_norm": 1.21875, + "learning_rate": 0.0016184434171313473, + "loss": 1.1306, + "step": 4444 + }, + { + "epoch": 0.3092281470659849, + "grad_norm": 1.0546875, + "learning_rate": 0.001618266328152974, + "loss": 0.8588, + "step": 4445 + }, + { + "epoch": 0.3092977147031201, + "grad_norm": 1.375, + "learning_rate": 0.0016180892077821529, + "loss": 0.9119, + "step": 4446 + }, + { + "epoch": 0.3093672823402553, + "grad_norm": 0.953125, + "learning_rate": 0.0016179120560278772, + "loss": 0.8275, + "step": 4447 + }, + { + "epoch": 0.3094368499773905, + "grad_norm": 1.0234375, + "learning_rate": 0.0016177348728991419, + "loss": 1.0409, + "step": 4448 + }, + { + "epoch": 0.30950641761452574, + "grad_norm": 1.265625, + "learning_rate": 0.0016175576584049431, + "loss": 0.9388, + "step": 4449 + }, + { + "epoch": 0.3095759852516609, + "grad_norm": 1.0, + "learning_rate": 0.0016173804125542797, + "loss": 0.8171, + "step": 4450 + }, + { + "epoch": 0.30964555288879614, + "grad_norm": 1.625, + "learning_rate": 0.0016172031353561503, + "loss": 1.3405, + "step": 4451 + }, + { + "epoch": 0.30971512052593136, + "grad_norm": 1.21875, + "learning_rate": 0.0016170258268195568, + "loss": 1.105, + "step": 4452 + }, + { + "epoch": 0.30978468816306653, + "grad_norm": 1.046875, + "learning_rate": 0.0016168484869535015, + "loss": 0.5778, + "step": 4453 + }, + { + "epoch": 0.30985425580020176, + "grad_norm": 1.046875, + "learning_rate": 0.0016166711157669898, + "loss": 0.752, + "step": 4454 + }, + { + "epoch": 0.30992382343733693, + "grad_norm": 1.1015625, + "learning_rate": 0.0016164937132690266, + "loss": 1.0001, + "step": 4455 + }, + { + "epoch": 0.30999339107447216, + "grad_norm": 1.0390625, + "learning_rate": 0.0016163162794686201, + "loss": 1.0588, + "step": 4456 + }, + { + "epoch": 0.3100629587116074, + "grad_norm": 1.0703125, + "learning_rate": 0.0016161388143747797, + "loss": 0.8943, + "step": 4457 + }, + { + "epoch": 0.31013252634874255, + "grad_norm": 0.875, + "learning_rate": 0.0016159613179965156, + "loss": 0.9216, + "step": 4458 + }, + { + "epoch": 0.3102020939858778, + "grad_norm": 0.9609375, + "learning_rate": 0.0016157837903428404, + "loss": 0.8055, + "step": 4459 + }, + { + "epoch": 0.31027166162301295, + "grad_norm": 0.90234375, + "learning_rate": 0.0016156062314227682, + "loss": 0.8656, + "step": 4460 + }, + { + "epoch": 0.3103412292601482, + "grad_norm": 1.2578125, + "learning_rate": 0.0016154286412453144, + "loss": 1.1291, + "step": 4461 + }, + { + "epoch": 0.3104107968972834, + "grad_norm": 1.1875, + "learning_rate": 0.0016152510198194966, + "loss": 0.8579, + "step": 4462 + }, + { + "epoch": 0.3104803645344186, + "grad_norm": 1.1015625, + "learning_rate": 0.0016150733671543324, + "loss": 0.821, + "step": 4463 + }, + { + "epoch": 0.3105499321715538, + "grad_norm": 0.875, + "learning_rate": 0.0016148956832588435, + "loss": 0.8403, + "step": 4464 + }, + { + "epoch": 0.31061949980868897, + "grad_norm": 1.0859375, + "learning_rate": 0.0016147179681420506, + "loss": 0.9634, + "step": 4465 + }, + { + "epoch": 0.3106890674458242, + "grad_norm": 1.2734375, + "learning_rate": 0.001614540221812978, + "loss": 1.0434, + "step": 4466 + }, + { + "epoch": 0.3107586350829594, + "grad_norm": 1.03125, + "learning_rate": 0.00161436244428065, + "loss": 0.9125, + "step": 4467 + }, + { + "epoch": 0.3108282027200946, + "grad_norm": 1.046875, + "learning_rate": 0.0016141846355540942, + "loss": 1.0275, + "step": 4468 + }, + { + "epoch": 0.3108977703572298, + "grad_norm": 1.2578125, + "learning_rate": 0.0016140067956423381, + "loss": 0.7801, + "step": 4469 + }, + { + "epoch": 0.31096733799436505, + "grad_norm": 1.1640625, + "learning_rate": 0.0016138289245544116, + "loss": 0.8483, + "step": 4470 + }, + { + "epoch": 0.3110369056315002, + "grad_norm": 1.2421875, + "learning_rate": 0.0016136510222993464, + "loss": 1.0084, + "step": 4471 + }, + { + "epoch": 0.31110647326863544, + "grad_norm": 1.2265625, + "learning_rate": 0.0016134730888861754, + "loss": 0.9062, + "step": 4472 + }, + { + "epoch": 0.3111760409057706, + "grad_norm": 1.140625, + "learning_rate": 0.0016132951243239331, + "loss": 0.9388, + "step": 4473 + }, + { + "epoch": 0.31124560854290584, + "grad_norm": 0.8984375, + "learning_rate": 0.0016131171286216555, + "loss": 0.7988, + "step": 4474 + }, + { + "epoch": 0.31131517618004106, + "grad_norm": 1.2109375, + "learning_rate": 0.0016129391017883803, + "loss": 0.9827, + "step": 4475 + }, + { + "epoch": 0.31138474381717623, + "grad_norm": 1.203125, + "learning_rate": 0.0016127610438331473, + "loss": 0.7951, + "step": 4476 + }, + { + "epoch": 0.31145431145431146, + "grad_norm": 1.1328125, + "learning_rate": 0.0016125829547649967, + "loss": 0.9847, + "step": 4477 + }, + { + "epoch": 0.31152387909144663, + "grad_norm": 1.1015625, + "learning_rate": 0.0016124048345929716, + "loss": 0.7996, + "step": 4478 + }, + { + "epoch": 0.31159344672858186, + "grad_norm": 0.95703125, + "learning_rate": 0.0016122266833261158, + "loss": 1.0155, + "step": 4479 + }, + { + "epoch": 0.3116630143657171, + "grad_norm": 1.078125, + "learning_rate": 0.0016120485009734743, + "loss": 0.9282, + "step": 4480 + }, + { + "epoch": 0.31173258200285225, + "grad_norm": 1.0390625, + "learning_rate": 0.0016118702875440954, + "loss": 0.9679, + "step": 4481 + }, + { + "epoch": 0.3118021496399875, + "grad_norm": 0.98828125, + "learning_rate": 0.0016116920430470272, + "loss": 0.7549, + "step": 4482 + }, + { + "epoch": 0.3118717172771227, + "grad_norm": 1.25, + "learning_rate": 0.0016115137674913202, + "loss": 1.0342, + "step": 4483 + }, + { + "epoch": 0.3119412849142579, + "grad_norm": 1.21875, + "learning_rate": 0.0016113354608860264, + "loss": 0.7318, + "step": 4484 + }, + { + "epoch": 0.3120108525513931, + "grad_norm": 1.2578125, + "learning_rate": 0.0016111571232401993, + "loss": 0.868, + "step": 4485 + }, + { + "epoch": 0.3120804201885283, + "grad_norm": 1.09375, + "learning_rate": 0.0016109787545628938, + "loss": 0.9422, + "step": 4486 + }, + { + "epoch": 0.3121499878256635, + "grad_norm": 1.03125, + "learning_rate": 0.001610800354863167, + "loss": 0.8755, + "step": 4487 + }, + { + "epoch": 0.3122195554627987, + "grad_norm": 1.140625, + "learning_rate": 0.0016106219241500766, + "loss": 1.0344, + "step": 4488 + }, + { + "epoch": 0.3122891230999339, + "grad_norm": 0.984375, + "learning_rate": 0.0016104434624326825, + "loss": 0.8669, + "step": 4489 + }, + { + "epoch": 0.3123586907370691, + "grad_norm": 0.953125, + "learning_rate": 0.0016102649697200464, + "loss": 0.961, + "step": 4490 + }, + { + "epoch": 0.3124282583742043, + "grad_norm": 1.2109375, + "learning_rate": 0.001610086446021231, + "loss": 0.9783, + "step": 4491 + }, + { + "epoch": 0.3124978260113395, + "grad_norm": 1.1875, + "learning_rate": 0.0016099078913453014, + "loss": 0.6967, + "step": 4492 + }, + { + "epoch": 0.31256739364847475, + "grad_norm": 1.171875, + "learning_rate": 0.0016097293057013226, + "loss": 0.9887, + "step": 4493 + }, + { + "epoch": 0.3126369612856099, + "grad_norm": 1.25, + "learning_rate": 0.0016095506890983634, + "loss": 1.0331, + "step": 4494 + }, + { + "epoch": 0.31270652892274514, + "grad_norm": 1.1484375, + "learning_rate": 0.0016093720415454925, + "loss": 0.8642, + "step": 4495 + }, + { + "epoch": 0.31277609655988037, + "grad_norm": 1.1171875, + "learning_rate": 0.0016091933630517806, + "loss": 0.8122, + "step": 4496 + }, + { + "epoch": 0.31284566419701554, + "grad_norm": 1.0703125, + "learning_rate": 0.0016090146536263002, + "loss": 0.8992, + "step": 4497 + }, + { + "epoch": 0.31291523183415076, + "grad_norm": 1.2265625, + "learning_rate": 0.0016088359132781253, + "loss": 0.8872, + "step": 4498 + }, + { + "epoch": 0.31298479947128593, + "grad_norm": 1.53125, + "learning_rate": 0.0016086571420163322, + "loss": 1.1528, + "step": 4499 + }, + { + "epoch": 0.31305436710842116, + "grad_norm": 1.078125, + "learning_rate": 0.0016084783398499964, + "loss": 0.897, + "step": 4500 + }, + { + "epoch": 0.3131239347455564, + "grad_norm": 2.578125, + "learning_rate": 0.0016082995067881979, + "loss": 1.0002, + "step": 4501 + }, + { + "epoch": 0.31319350238269156, + "grad_norm": 1.2265625, + "learning_rate": 0.0016081206428400165, + "loss": 1.1553, + "step": 4502 + }, + { + "epoch": 0.3132630700198268, + "grad_norm": 1.0, + "learning_rate": 0.0016079417480145339, + "loss": 0.8074, + "step": 4503 + }, + { + "epoch": 0.31333263765696195, + "grad_norm": 1.25, + "learning_rate": 0.0016077628223208338, + "loss": 1.249, + "step": 4504 + }, + { + "epoch": 0.3134022052940972, + "grad_norm": 1.015625, + "learning_rate": 0.0016075838657680004, + "loss": 1.1904, + "step": 4505 + }, + { + "epoch": 0.3134717729312324, + "grad_norm": 1.109375, + "learning_rate": 0.0016074048783651213, + "loss": 0.8828, + "step": 4506 + }, + { + "epoch": 0.3135413405683676, + "grad_norm": 1.5078125, + "learning_rate": 0.0016072258601212838, + "loss": 0.8629, + "step": 4507 + }, + { + "epoch": 0.3136109082055028, + "grad_norm": 1.0390625, + "learning_rate": 0.001607046811045578, + "loss": 0.9527, + "step": 4508 + }, + { + "epoch": 0.31368047584263803, + "grad_norm": 1.0859375, + "learning_rate": 0.0016068677311470948, + "loss": 0.9314, + "step": 4509 + }, + { + "epoch": 0.3137500434797732, + "grad_norm": 0.953125, + "learning_rate": 0.0016066886204349267, + "loss": 0.7579, + "step": 4510 + }, + { + "epoch": 0.3138196111169084, + "grad_norm": 0.90234375, + "learning_rate": 0.0016065094789181687, + "loss": 0.7607, + "step": 4511 + }, + { + "epoch": 0.3138891787540436, + "grad_norm": 0.9140625, + "learning_rate": 0.0016063303066059162, + "loss": 0.7219, + "step": 4512 + }, + { + "epoch": 0.3139587463911788, + "grad_norm": 1.34375, + "learning_rate": 0.001606151103507267, + "loss": 1.1225, + "step": 4513 + }, + { + "epoch": 0.31402831402831405, + "grad_norm": 1.1796875, + "learning_rate": 0.0016059718696313202, + "loss": 0.7852, + "step": 4514 + }, + { + "epoch": 0.3140978816654492, + "grad_norm": 1.28125, + "learning_rate": 0.001605792604987176, + "loss": 1.0759, + "step": 4515 + }, + { + "epoch": 0.31416744930258445, + "grad_norm": 1.1640625, + "learning_rate": 0.0016056133095839365, + "loss": 1.0242, + "step": 4516 + }, + { + "epoch": 0.3142370169397196, + "grad_norm": 1.125, + "learning_rate": 0.0016054339834307059, + "loss": 1.0011, + "step": 4517 + }, + { + "epoch": 0.31430658457685484, + "grad_norm": 1.0625, + "learning_rate": 0.0016052546265365893, + "loss": 1.2239, + "step": 4518 + }, + { + "epoch": 0.31437615221399007, + "grad_norm": 1.046875, + "learning_rate": 0.0016050752389106934, + "loss": 1.11, + "step": 4519 + }, + { + "epoch": 0.31444571985112524, + "grad_norm": 1.09375, + "learning_rate": 0.0016048958205621268, + "loss": 0.8507, + "step": 4520 + }, + { + "epoch": 0.31451528748826046, + "grad_norm": 1.34375, + "learning_rate": 0.0016047163714999991, + "loss": 0.9852, + "step": 4521 + }, + { + "epoch": 0.3145848551253957, + "grad_norm": 1.0703125, + "learning_rate": 0.001604536891733422, + "loss": 0.9424, + "step": 4522 + }, + { + "epoch": 0.31465442276253086, + "grad_norm": 1.3203125, + "learning_rate": 0.0016043573812715086, + "loss": 1.0454, + "step": 4523 + }, + { + "epoch": 0.3147239903996661, + "grad_norm": 1.390625, + "learning_rate": 0.001604177840123374, + "loss": 1.1769, + "step": 4524 + }, + { + "epoch": 0.31479355803680126, + "grad_norm": 1.2578125, + "learning_rate": 0.0016039982682981336, + "loss": 0.9743, + "step": 4525 + }, + { + "epoch": 0.3148631256739365, + "grad_norm": 1.03125, + "learning_rate": 0.0016038186658049055, + "loss": 0.9401, + "step": 4526 + }, + { + "epoch": 0.3149326933110717, + "grad_norm": 1.0546875, + "learning_rate": 0.0016036390326528093, + "loss": 0.9003, + "step": 4527 + }, + { + "epoch": 0.3150022609482069, + "grad_norm": 1.1015625, + "learning_rate": 0.0016034593688509654, + "loss": 0.9977, + "step": 4528 + }, + { + "epoch": 0.3150718285853421, + "grad_norm": 1.1953125, + "learning_rate": 0.0016032796744084963, + "loss": 0.9259, + "step": 4529 + }, + { + "epoch": 0.3151413962224773, + "grad_norm": 1.390625, + "learning_rate": 0.0016030999493345261, + "loss": 0.7179, + "step": 4530 + }, + { + "epoch": 0.3152109638596125, + "grad_norm": 1.359375, + "learning_rate": 0.0016029201936381804, + "loss": 0.9177, + "step": 4531 + }, + { + "epoch": 0.31528053149674773, + "grad_norm": 1.296875, + "learning_rate": 0.0016027404073285863, + "loss": 0.7988, + "step": 4532 + }, + { + "epoch": 0.3153500991338829, + "grad_norm": 1.1640625, + "learning_rate": 0.0016025605904148726, + "loss": 0.8419, + "step": 4533 + }, + { + "epoch": 0.3154196667710181, + "grad_norm": 1.0859375, + "learning_rate": 0.0016023807429061687, + "loss": 1.1772, + "step": 4534 + }, + { + "epoch": 0.31548923440815335, + "grad_norm": 1.0859375, + "learning_rate": 0.0016022008648116071, + "loss": 1.1462, + "step": 4535 + }, + { + "epoch": 0.3155588020452885, + "grad_norm": 1.109375, + "learning_rate": 0.0016020209561403212, + "loss": 0.8713, + "step": 4536 + }, + { + "epoch": 0.31562836968242375, + "grad_norm": 1.2578125, + "learning_rate": 0.001601841016901445, + "loss": 0.9731, + "step": 4537 + }, + { + "epoch": 0.3156979373195589, + "grad_norm": 1.4140625, + "learning_rate": 0.001601661047104116, + "loss": 1.0064, + "step": 4538 + }, + { + "epoch": 0.31576750495669414, + "grad_norm": 0.95703125, + "learning_rate": 0.0016014810467574712, + "loss": 0.7273, + "step": 4539 + }, + { + "epoch": 0.31583707259382937, + "grad_norm": 1.0859375, + "learning_rate": 0.001601301015870651, + "loss": 0.9267, + "step": 4540 + }, + { + "epoch": 0.31590664023096454, + "grad_norm": 1.4453125, + "learning_rate": 0.0016011209544527956, + "loss": 1.2038, + "step": 4541 + }, + { + "epoch": 0.31597620786809977, + "grad_norm": 0.96484375, + "learning_rate": 0.001600940862513048, + "loss": 0.8266, + "step": 4542 + }, + { + "epoch": 0.31604577550523494, + "grad_norm": 1.0, + "learning_rate": 0.0016007607400605527, + "loss": 0.8463, + "step": 4543 + }, + { + "epoch": 0.31611534314237016, + "grad_norm": 1.140625, + "learning_rate": 0.0016005805871044548, + "loss": 0.9369, + "step": 4544 + }, + { + "epoch": 0.3161849107795054, + "grad_norm": 1.0078125, + "learning_rate": 0.0016004004036539018, + "loss": 0.9409, + "step": 4545 + }, + { + "epoch": 0.31625447841664056, + "grad_norm": 1.0625, + "learning_rate": 0.0016002201897180426, + "loss": 0.8067, + "step": 4546 + }, + { + "epoch": 0.3163240460537758, + "grad_norm": 1.1015625, + "learning_rate": 0.0016000399453060276, + "loss": 1.0236, + "step": 4547 + }, + { + "epoch": 0.316393613690911, + "grad_norm": 1.0078125, + "learning_rate": 0.0015998596704270085, + "loss": 0.8579, + "step": 4548 + }, + { + "epoch": 0.3164631813280462, + "grad_norm": 1.1015625, + "learning_rate": 0.001599679365090139, + "loss": 1.0777, + "step": 4549 + }, + { + "epoch": 0.3165327489651814, + "grad_norm": 1.1875, + "learning_rate": 0.0015994990293045738, + "loss": 0.911, + "step": 4550 + }, + { + "epoch": 0.3166023166023166, + "grad_norm": 1.0078125, + "learning_rate": 0.0015993186630794698, + "loss": 0.8788, + "step": 4551 + }, + { + "epoch": 0.3166718842394518, + "grad_norm": 1.0390625, + "learning_rate": 0.0015991382664239846, + "loss": 1.1761, + "step": 4552 + }, + { + "epoch": 0.31674145187658703, + "grad_norm": 1.171875, + "learning_rate": 0.0015989578393472783, + "loss": 1.0325, + "step": 4553 + }, + { + "epoch": 0.3168110195137222, + "grad_norm": 0.92578125, + "learning_rate": 0.0015987773818585118, + "loss": 0.8483, + "step": 4554 + }, + { + "epoch": 0.31688058715085743, + "grad_norm": 1.0390625, + "learning_rate": 0.001598596893966848, + "loss": 0.7744, + "step": 4555 + }, + { + "epoch": 0.3169501547879926, + "grad_norm": 1.34375, + "learning_rate": 0.0015984163756814509, + "loss": 0.9896, + "step": 4556 + }, + { + "epoch": 0.3170197224251278, + "grad_norm": 0.87890625, + "learning_rate": 0.0015982358270114868, + "loss": 0.804, + "step": 4557 + }, + { + "epoch": 0.31708929006226305, + "grad_norm": 1.171875, + "learning_rate": 0.0015980552479661224, + "loss": 0.8124, + "step": 4558 + }, + { + "epoch": 0.3171588576993982, + "grad_norm": 1.1484375, + "learning_rate": 0.0015978746385545272, + "loss": 1.0928, + "step": 4559 + }, + { + "epoch": 0.31722842533653345, + "grad_norm": 1.0, + "learning_rate": 0.001597693998785871, + "loss": 0.744, + "step": 4560 + }, + { + "epoch": 0.3172979929736687, + "grad_norm": 1.0390625, + "learning_rate": 0.0015975133286693266, + "loss": 0.7445, + "step": 4561 + }, + { + "epoch": 0.31736756061080384, + "grad_norm": 1.1640625, + "learning_rate": 0.0015973326282140668, + "loss": 0.9578, + "step": 4562 + }, + { + "epoch": 0.31743712824793907, + "grad_norm": 1.2421875, + "learning_rate": 0.001597151897429267, + "loss": 0.9887, + "step": 4563 + }, + { + "epoch": 0.31750669588507424, + "grad_norm": 0.93359375, + "learning_rate": 0.0015969711363241035, + "loss": 0.7945, + "step": 4564 + }, + { + "epoch": 0.31757626352220947, + "grad_norm": 0.96484375, + "learning_rate": 0.0015967903449077548, + "loss": 0.9663, + "step": 4565 + }, + { + "epoch": 0.3176458311593447, + "grad_norm": 1.2578125, + "learning_rate": 0.0015966095231894006, + "loss": 0.9744, + "step": 4566 + }, + { + "epoch": 0.31771539879647986, + "grad_norm": 1.40625, + "learning_rate": 0.001596428671178222, + "loss": 0.8752, + "step": 4567 + }, + { + "epoch": 0.3177849664336151, + "grad_norm": 0.9609375, + "learning_rate": 0.0015962477888834012, + "loss": 0.8157, + "step": 4568 + }, + { + "epoch": 0.31785453407075026, + "grad_norm": 0.99609375, + "learning_rate": 0.0015960668763141234, + "loss": 1.0015, + "step": 4569 + }, + { + "epoch": 0.3179241017078855, + "grad_norm": 1.0625, + "learning_rate": 0.001595885933479574, + "loss": 0.7865, + "step": 4570 + }, + { + "epoch": 0.3179936693450207, + "grad_norm": 0.92578125, + "learning_rate": 0.0015957049603889401, + "loss": 0.8637, + "step": 4571 + }, + { + "epoch": 0.3180632369821559, + "grad_norm": 1.21875, + "learning_rate": 0.0015955239570514112, + "loss": 0.9391, + "step": 4572 + }, + { + "epoch": 0.3181328046192911, + "grad_norm": 1.0234375, + "learning_rate": 0.0015953429234761773, + "loss": 1.0095, + "step": 4573 + }, + { + "epoch": 0.31820237225642634, + "grad_norm": 0.9765625, + "learning_rate": 0.0015951618596724306, + "loss": 0.8901, + "step": 4574 + }, + { + "epoch": 0.3182719398935615, + "grad_norm": 1.2265625, + "learning_rate": 0.0015949807656493644, + "loss": 0.9012, + "step": 4575 + }, + { + "epoch": 0.31834150753069673, + "grad_norm": 1.109375, + "learning_rate": 0.001594799641416174, + "loss": 1.0944, + "step": 4576 + }, + { + "epoch": 0.3184110751678319, + "grad_norm": 1.1484375, + "learning_rate": 0.0015946184869820557, + "loss": 0.9183, + "step": 4577 + }, + { + "epoch": 0.31848064280496713, + "grad_norm": 1.203125, + "learning_rate": 0.0015944373023562075, + "loss": 0.808, + "step": 4578 + }, + { + "epoch": 0.31855021044210236, + "grad_norm": 1.265625, + "learning_rate": 0.0015942560875478295, + "loss": 0.9312, + "step": 4579 + }, + { + "epoch": 0.3186197780792375, + "grad_norm": 0.92578125, + "learning_rate": 0.0015940748425661226, + "loss": 0.7663, + "step": 4580 + }, + { + "epoch": 0.31868934571637275, + "grad_norm": 1.015625, + "learning_rate": 0.0015938935674202897, + "loss": 0.8847, + "step": 4581 + }, + { + "epoch": 0.3187589133535079, + "grad_norm": 1.25, + "learning_rate": 0.0015937122621195348, + "loss": 0.9319, + "step": 4582 + }, + { + "epoch": 0.31882848099064315, + "grad_norm": 1.0625, + "learning_rate": 0.0015935309266730635, + "loss": 0.9495, + "step": 4583 + }, + { + "epoch": 0.3188980486277784, + "grad_norm": 1.125, + "learning_rate": 0.0015933495610900839, + "loss": 0.8488, + "step": 4584 + }, + { + "epoch": 0.31896761626491354, + "grad_norm": 1.1953125, + "learning_rate": 0.001593168165379804, + "loss": 0.9934, + "step": 4585 + }, + { + "epoch": 0.31903718390204877, + "grad_norm": 1.0703125, + "learning_rate": 0.0015929867395514344, + "loss": 1.0, + "step": 4586 + }, + { + "epoch": 0.319106751539184, + "grad_norm": 0.9453125, + "learning_rate": 0.0015928052836141871, + "loss": 0.8157, + "step": 4587 + }, + { + "epoch": 0.31917631917631917, + "grad_norm": 0.9375, + "learning_rate": 0.0015926237975772755, + "loss": 0.7274, + "step": 4588 + }, + { + "epoch": 0.3192458868134544, + "grad_norm": 1.2265625, + "learning_rate": 0.0015924422814499145, + "loss": 0.8182, + "step": 4589 + }, + { + "epoch": 0.31931545445058956, + "grad_norm": 1.015625, + "learning_rate": 0.0015922607352413204, + "loss": 0.8247, + "step": 4590 + }, + { + "epoch": 0.3193850220877248, + "grad_norm": 1.046875, + "learning_rate": 0.0015920791589607115, + "loss": 0.8285, + "step": 4591 + }, + { + "epoch": 0.31945458972486, + "grad_norm": 1.0859375, + "learning_rate": 0.0015918975526173073, + "loss": 1.0798, + "step": 4592 + }, + { + "epoch": 0.3195241573619952, + "grad_norm": 1.1328125, + "learning_rate": 0.0015917159162203284, + "loss": 0.912, + "step": 4593 + }, + { + "epoch": 0.3195937249991304, + "grad_norm": 1.0625, + "learning_rate": 0.0015915342497789982, + "loss": 0.8307, + "step": 4594 + }, + { + "epoch": 0.3196632926362656, + "grad_norm": 0.859375, + "learning_rate": 0.0015913525533025402, + "loss": 0.8298, + "step": 4595 + }, + { + "epoch": 0.3197328602734008, + "grad_norm": 1.03125, + "learning_rate": 0.0015911708268001802, + "loss": 0.8014, + "step": 4596 + }, + { + "epoch": 0.31980242791053604, + "grad_norm": 1.15625, + "learning_rate": 0.0015909890702811452, + "loss": 0.8576, + "step": 4597 + }, + { + "epoch": 0.3198719955476712, + "grad_norm": 1.2734375, + "learning_rate": 0.0015908072837546642, + "loss": 1.0694, + "step": 4598 + }, + { + "epoch": 0.31994156318480643, + "grad_norm": 1.09375, + "learning_rate": 0.001590625467229967, + "loss": 1.1575, + "step": 4599 + }, + { + "epoch": 0.32001113082194166, + "grad_norm": 0.95703125, + "learning_rate": 0.0015904436207162856, + "loss": 0.6767, + "step": 4600 + }, + { + "epoch": 0.32008069845907683, + "grad_norm": 1.046875, + "learning_rate": 0.0015902617442228532, + "loss": 1.1852, + "step": 4601 + }, + { + "epoch": 0.32015026609621206, + "grad_norm": 1.1640625, + "learning_rate": 0.0015900798377589047, + "loss": 1.0463, + "step": 4602 + }, + { + "epoch": 0.3202198337333472, + "grad_norm": 1.0703125, + "learning_rate": 0.0015898979013336764, + "loss": 1.0375, + "step": 4603 + }, + { + "epoch": 0.32028940137048245, + "grad_norm": 1.0390625, + "learning_rate": 0.0015897159349564057, + "loss": 1.0322, + "step": 4604 + }, + { + "epoch": 0.3203589690076177, + "grad_norm": 1.1015625, + "learning_rate": 0.0015895339386363322, + "loss": 1.1247, + "step": 4605 + }, + { + "epoch": 0.32042853664475285, + "grad_norm": 1.1484375, + "learning_rate": 0.0015893519123826969, + "loss": 0.9166, + "step": 4606 + }, + { + "epoch": 0.3204981042818881, + "grad_norm": 0.95703125, + "learning_rate": 0.0015891698562047422, + "loss": 1.0436, + "step": 4607 + }, + { + "epoch": 0.32056767191902324, + "grad_norm": 1.03125, + "learning_rate": 0.0015889877701117114, + "loss": 0.8429, + "step": 4608 + }, + { + "epoch": 0.32063723955615847, + "grad_norm": 1.203125, + "learning_rate": 0.0015888056541128505, + "loss": 0.9104, + "step": 4609 + }, + { + "epoch": 0.3207068071932937, + "grad_norm": 1.03125, + "learning_rate": 0.0015886235082174065, + "loss": 1.0487, + "step": 4610 + }, + { + "epoch": 0.32077637483042887, + "grad_norm": 1.171875, + "learning_rate": 0.0015884413324346275, + "loss": 0.9555, + "step": 4611 + }, + { + "epoch": 0.3208459424675641, + "grad_norm": 1.078125, + "learning_rate": 0.0015882591267737639, + "loss": 0.7592, + "step": 4612 + }, + { + "epoch": 0.3209155101046993, + "grad_norm": 1.09375, + "learning_rate": 0.001588076891244066, + "loss": 0.7827, + "step": 4613 + }, + { + "epoch": 0.3209850777418345, + "grad_norm": 1.0546875, + "learning_rate": 0.0015878946258547889, + "loss": 0.8938, + "step": 4614 + }, + { + "epoch": 0.3210546453789697, + "grad_norm": 0.96875, + "learning_rate": 0.0015877123306151848, + "loss": 0.8164, + "step": 4615 + }, + { + "epoch": 0.3211242130161049, + "grad_norm": 1.09375, + "learning_rate": 0.0015875300055345114, + "loss": 0.929, + "step": 4616 + }, + { + "epoch": 0.3211937806532401, + "grad_norm": 1.1015625, + "learning_rate": 0.001587347650622026, + "loss": 1.0975, + "step": 4617 + }, + { + "epoch": 0.32126334829037534, + "grad_norm": 1.0078125, + "learning_rate": 0.0015871652658869869, + "loss": 0.9612, + "step": 4618 + }, + { + "epoch": 0.3213329159275105, + "grad_norm": 1.046875, + "learning_rate": 0.001586982851338655, + "loss": 0.818, + "step": 4619 + }, + { + "epoch": 0.32140248356464574, + "grad_norm": 1.140625, + "learning_rate": 0.001586800406986293, + "loss": 0.8031, + "step": 4620 + }, + { + "epoch": 0.3214720512017809, + "grad_norm": 1.0859375, + "learning_rate": 0.0015866179328391636, + "loss": 1.0245, + "step": 4621 + }, + { + "epoch": 0.32154161883891613, + "grad_norm": 0.92578125, + "learning_rate": 0.0015864354289065324, + "loss": 0.7114, + "step": 4622 + }, + { + "epoch": 0.32161118647605136, + "grad_norm": 1.0078125, + "learning_rate": 0.001586252895197666, + "loss": 0.848, + "step": 4623 + }, + { + "epoch": 0.32168075411318653, + "grad_norm": 1.2109375, + "learning_rate": 0.0015860703317218325, + "loss": 0.9736, + "step": 4624 + }, + { + "epoch": 0.32175032175032175, + "grad_norm": 1.265625, + "learning_rate": 0.0015858877384883018, + "loss": 0.899, + "step": 4625 + }, + { + "epoch": 0.321819889387457, + "grad_norm": 0.99609375, + "learning_rate": 0.001585705115506345, + "loss": 0.912, + "step": 4626 + }, + { + "epoch": 0.32188945702459215, + "grad_norm": 1.03125, + "learning_rate": 0.001585522462785234, + "loss": 1.0388, + "step": 4627 + }, + { + "epoch": 0.3219590246617274, + "grad_norm": 1.0390625, + "learning_rate": 0.001585339780334244, + "loss": 0.8447, + "step": 4628 + }, + { + "epoch": 0.32202859229886255, + "grad_norm": 1.03125, + "learning_rate": 0.0015851570681626502, + "loss": 0.9812, + "step": 4629 + }, + { + "epoch": 0.3220981599359978, + "grad_norm": 0.98046875, + "learning_rate": 0.0015849743262797299, + "loss": 0.8619, + "step": 4630 + }, + { + "epoch": 0.322167727573133, + "grad_norm": 0.95703125, + "learning_rate": 0.0015847915546947618, + "loss": 0.8297, + "step": 4631 + }, + { + "epoch": 0.32223729521026817, + "grad_norm": 1.421875, + "learning_rate": 0.001584608753417026, + "loss": 1.0828, + "step": 4632 + }, + { + "epoch": 0.3223068628474034, + "grad_norm": 1.203125, + "learning_rate": 0.0015844259224558044, + "loss": 0.9393, + "step": 4633 + }, + { + "epoch": 0.32237643048453857, + "grad_norm": 1.203125, + "learning_rate": 0.0015842430618203803, + "loss": 1.045, + "step": 4634 + }, + { + "epoch": 0.3224459981216738, + "grad_norm": 1.1015625, + "learning_rate": 0.0015840601715200382, + "loss": 0.9415, + "step": 4635 + }, + { + "epoch": 0.322515565758809, + "grad_norm": 1.203125, + "learning_rate": 0.0015838772515640645, + "loss": 1.0985, + "step": 4636 + }, + { + "epoch": 0.3225851333959442, + "grad_norm": 0.91015625, + "learning_rate": 0.0015836943019617467, + "loss": 0.8123, + "step": 4637 + }, + { + "epoch": 0.3226547010330794, + "grad_norm": 1.0546875, + "learning_rate": 0.0015835113227223748, + "loss": 0.5864, + "step": 4638 + }, + { + "epoch": 0.32272426867021464, + "grad_norm": 1.234375, + "learning_rate": 0.0015833283138552386, + "loss": 1.0993, + "step": 4639 + }, + { + "epoch": 0.3227938363073498, + "grad_norm": 1.3046875, + "learning_rate": 0.0015831452753696312, + "loss": 1.0382, + "step": 4640 + }, + { + "epoch": 0.32286340394448504, + "grad_norm": 1.34375, + "learning_rate": 0.0015829622072748455, + "loss": 0.9563, + "step": 4641 + }, + { + "epoch": 0.3229329715816202, + "grad_norm": 0.92578125, + "learning_rate": 0.0015827791095801777, + "loss": 0.7008, + "step": 4642 + }, + { + "epoch": 0.32300253921875544, + "grad_norm": 1.2265625, + "learning_rate": 0.001582595982294924, + "loss": 0.9963, + "step": 4643 + }, + { + "epoch": 0.32307210685589066, + "grad_norm": 1.15625, + "learning_rate": 0.0015824128254283828, + "loss": 1.0364, + "step": 4644 + }, + { + "epoch": 0.32314167449302583, + "grad_norm": 1.109375, + "learning_rate": 0.0015822296389898538, + "loss": 1.0407, + "step": 4645 + }, + { + "epoch": 0.32321124213016106, + "grad_norm": 0.91796875, + "learning_rate": 0.0015820464229886384, + "loss": 0.7409, + "step": 4646 + }, + { + "epoch": 0.32328080976729623, + "grad_norm": 1.03125, + "learning_rate": 0.0015818631774340394, + "loss": 0.9008, + "step": 4647 + }, + { + "epoch": 0.32335037740443145, + "grad_norm": 1.125, + "learning_rate": 0.0015816799023353613, + "loss": 0.9936, + "step": 4648 + }, + { + "epoch": 0.3234199450415667, + "grad_norm": 1.359375, + "learning_rate": 0.0015814965977019094, + "loss": 1.1239, + "step": 4649 + }, + { + "epoch": 0.32348951267870185, + "grad_norm": 1.2734375, + "learning_rate": 0.0015813132635429912, + "loss": 1.0262, + "step": 4650 + }, + { + "epoch": 0.3235590803158371, + "grad_norm": 0.92578125, + "learning_rate": 0.0015811298998679156, + "loss": 0.6669, + "step": 4651 + }, + { + "epoch": 0.3236286479529723, + "grad_norm": 0.96484375, + "learning_rate": 0.0015809465066859928, + "loss": 1.0144, + "step": 4652 + }, + { + "epoch": 0.3236982155901075, + "grad_norm": 0.828125, + "learning_rate": 0.0015807630840065346, + "loss": 0.7395, + "step": 4653 + }, + { + "epoch": 0.3237677832272427, + "grad_norm": 0.98046875, + "learning_rate": 0.0015805796318388544, + "loss": 0.9497, + "step": 4654 + }, + { + "epoch": 0.32383735086437787, + "grad_norm": 0.953125, + "learning_rate": 0.0015803961501922666, + "loss": 1.0169, + "step": 4655 + }, + { + "epoch": 0.3239069185015131, + "grad_norm": 1.1171875, + "learning_rate": 0.0015802126390760875, + "loss": 1.2319, + "step": 4656 + }, + { + "epoch": 0.3239764861386483, + "grad_norm": 1.1640625, + "learning_rate": 0.0015800290984996355, + "loss": 0.7076, + "step": 4657 + }, + { + "epoch": 0.3240460537757835, + "grad_norm": 0.984375, + "learning_rate": 0.0015798455284722294, + "loss": 1.0573, + "step": 4658 + }, + { + "epoch": 0.3241156214129187, + "grad_norm": 1.421875, + "learning_rate": 0.0015796619290031897, + "loss": 1.17, + "step": 4659 + }, + { + "epoch": 0.3241851890500539, + "grad_norm": 1.3828125, + "learning_rate": 0.001579478300101839, + "loss": 1.0127, + "step": 4660 + }, + { + "epoch": 0.3242547566871891, + "grad_norm": 0.94921875, + "learning_rate": 0.0015792946417775013, + "loss": 0.927, + "step": 4661 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 1.0390625, + "learning_rate": 0.0015791109540395014, + "loss": 1.0321, + "step": 4662 + }, + { + "epoch": 0.3243938919614595, + "grad_norm": 1.15625, + "learning_rate": 0.0015789272368971663, + "loss": 0.758, + "step": 4663 + }, + { + "epoch": 0.32446345959859474, + "grad_norm": 1.328125, + "learning_rate": 0.001578743490359824, + "loss": 1.1223, + "step": 4664 + }, + { + "epoch": 0.32453302723572997, + "grad_norm": 1.1640625, + "learning_rate": 0.0015785597144368042, + "loss": 0.9884, + "step": 4665 + }, + { + "epoch": 0.32460259487286514, + "grad_norm": 1.1015625, + "learning_rate": 0.0015783759091374386, + "loss": 1.0554, + "step": 4666 + }, + { + "epoch": 0.32467216251000036, + "grad_norm": 1.109375, + "learning_rate": 0.0015781920744710593, + "loss": 0.7594, + "step": 4667 + }, + { + "epoch": 0.32474173014713553, + "grad_norm": 1.0390625, + "learning_rate": 0.0015780082104470009, + "loss": 0.9857, + "step": 4668 + }, + { + "epoch": 0.32481129778427076, + "grad_norm": 1.203125, + "learning_rate": 0.0015778243170745988, + "loss": 0.815, + "step": 4669 + }, + { + "epoch": 0.324880865421406, + "grad_norm": 1.3359375, + "learning_rate": 0.0015776403943631905, + "loss": 1.0044, + "step": 4670 + }, + { + "epoch": 0.32495043305854115, + "grad_norm": 1.2578125, + "learning_rate": 0.0015774564423221143, + "loss": 1.1304, + "step": 4671 + }, + { + "epoch": 0.3250200006956764, + "grad_norm": 1.46875, + "learning_rate": 0.0015772724609607108, + "loss": 0.9644, + "step": 4672 + }, + { + "epoch": 0.32508956833281155, + "grad_norm": 0.96484375, + "learning_rate": 0.001577088450288321, + "loss": 0.7194, + "step": 4673 + }, + { + "epoch": 0.3251591359699468, + "grad_norm": 0.9609375, + "learning_rate": 0.001576904410314289, + "loss": 0.8332, + "step": 4674 + }, + { + "epoch": 0.325228703607082, + "grad_norm": 0.95703125, + "learning_rate": 0.0015767203410479587, + "loss": 0.9187, + "step": 4675 + }, + { + "epoch": 0.3252982712442172, + "grad_norm": 1.265625, + "learning_rate": 0.001576536242498676, + "loss": 1.1336, + "step": 4676 + }, + { + "epoch": 0.3253678388813524, + "grad_norm": 0.83203125, + "learning_rate": 0.0015763521146757893, + "loss": 0.6083, + "step": 4677 + }, + { + "epoch": 0.3254374065184876, + "grad_norm": 1.1953125, + "learning_rate": 0.001576167957588647, + "loss": 0.9975, + "step": 4678 + }, + { + "epoch": 0.3255069741556228, + "grad_norm": 1.421875, + "learning_rate": 0.0015759837712465998, + "loss": 1.0539, + "step": 4679 + }, + { + "epoch": 0.325576541792758, + "grad_norm": 1.5625, + "learning_rate": 0.001575799555659, + "loss": 1.2816, + "step": 4680 + }, + { + "epoch": 0.3256461094298932, + "grad_norm": 1.1796875, + "learning_rate": 0.0015756153108352012, + "loss": 1.0234, + "step": 4681 + }, + { + "epoch": 0.3257156770670284, + "grad_norm": 1.0625, + "learning_rate": 0.0015754310367845582, + "loss": 1.026, + "step": 4682 + }, + { + "epoch": 0.32578524470416365, + "grad_norm": 1.609375, + "learning_rate": 0.001575246733516427, + "loss": 1.1533, + "step": 4683 + }, + { + "epoch": 0.3258548123412988, + "grad_norm": 1.078125, + "learning_rate": 0.001575062401040167, + "loss": 0.6291, + "step": 4684 + }, + { + "epoch": 0.32592437997843404, + "grad_norm": 1.21875, + "learning_rate": 0.001574878039365136, + "loss": 0.7976, + "step": 4685 + }, + { + "epoch": 0.3259939476155692, + "grad_norm": 0.9765625, + "learning_rate": 0.0015746936485006961, + "loss": 0.9032, + "step": 4686 + }, + { + "epoch": 0.32606351525270444, + "grad_norm": 1.0078125, + "learning_rate": 0.0015745092284562094, + "loss": 0.8683, + "step": 4687 + }, + { + "epoch": 0.32613308288983966, + "grad_norm": 1.078125, + "learning_rate": 0.00157432477924104, + "loss": 0.9426, + "step": 4688 + }, + { + "epoch": 0.32620265052697484, + "grad_norm": 1.234375, + "learning_rate": 0.001574140300864553, + "loss": 0.8228, + "step": 4689 + }, + { + "epoch": 0.32627221816411006, + "grad_norm": 0.92578125, + "learning_rate": 0.0015739557933361153, + "loss": 0.9683, + "step": 4690 + }, + { + "epoch": 0.3263417858012453, + "grad_norm": 1.109375, + "learning_rate": 0.0015737712566650955, + "loss": 1.0398, + "step": 4691 + }, + { + "epoch": 0.32641135343838046, + "grad_norm": 1.296875, + "learning_rate": 0.0015735866908608632, + "loss": 1.0982, + "step": 4692 + }, + { + "epoch": 0.3264809210755157, + "grad_norm": 1.0625, + "learning_rate": 0.00157340209593279, + "loss": 0.8874, + "step": 4693 + }, + { + "epoch": 0.32655048871265085, + "grad_norm": 1.1484375, + "learning_rate": 0.001573217471890248, + "loss": 1.1152, + "step": 4694 + }, + { + "epoch": 0.3266200563497861, + "grad_norm": 1.4609375, + "learning_rate": 0.0015730328187426126, + "loss": 1.0359, + "step": 4695 + }, + { + "epoch": 0.3266896239869213, + "grad_norm": 1.0625, + "learning_rate": 0.0015728481364992587, + "loss": 0.8886, + "step": 4696 + }, + { + "epoch": 0.3267591916240565, + "grad_norm": 1.0625, + "learning_rate": 0.001572663425169564, + "loss": 0.7187, + "step": 4697 + }, + { + "epoch": 0.3268287592611917, + "grad_norm": 1.2578125, + "learning_rate": 0.0015724786847629067, + "loss": 0.9773, + "step": 4698 + }, + { + "epoch": 0.3268983268983269, + "grad_norm": 1.0078125, + "learning_rate": 0.0015722939152886676, + "loss": 0.7418, + "step": 4699 + }, + { + "epoch": 0.3269678945354621, + "grad_norm": 1.3203125, + "learning_rate": 0.0015721091167562279, + "loss": 0.9311, + "step": 4700 + }, + { + "epoch": 0.3270374621725973, + "grad_norm": 1.0625, + "learning_rate": 0.0015719242891749708, + "loss": 0.8655, + "step": 4701 + }, + { + "epoch": 0.3271070298097325, + "grad_norm": 1.0625, + "learning_rate": 0.0015717394325542814, + "loss": 0.5923, + "step": 4702 + }, + { + "epoch": 0.3271765974468677, + "grad_norm": 0.984375, + "learning_rate": 0.0015715545469035448, + "loss": 0.7129, + "step": 4703 + }, + { + "epoch": 0.32724616508400295, + "grad_norm": 1.3515625, + "learning_rate": 0.0015713696322321496, + "loss": 1.1562, + "step": 4704 + }, + { + "epoch": 0.3273157327211381, + "grad_norm": 1.3125, + "learning_rate": 0.0015711846885494843, + "loss": 1.1207, + "step": 4705 + }, + { + "epoch": 0.32738530035827335, + "grad_norm": 1.2578125, + "learning_rate": 0.0015709997158649394, + "loss": 0.846, + "step": 4706 + }, + { + "epoch": 0.3274548679954085, + "grad_norm": 1.28125, + "learning_rate": 0.001570814714187907, + "loss": 0.8207, + "step": 4707 + }, + { + "epoch": 0.32752443563254374, + "grad_norm": 0.859375, + "learning_rate": 0.0015706296835277804, + "loss": 0.7765, + "step": 4708 + }, + { + "epoch": 0.32759400326967897, + "grad_norm": 1.1328125, + "learning_rate": 0.001570444623893955, + "loss": 0.7479, + "step": 4709 + }, + { + "epoch": 0.32766357090681414, + "grad_norm": 1.078125, + "learning_rate": 0.0015702595352958266, + "loss": 1.097, + "step": 4710 + }, + { + "epoch": 0.32773313854394936, + "grad_norm": 1.0078125, + "learning_rate": 0.0015700744177427933, + "loss": 0.7712, + "step": 4711 + }, + { + "epoch": 0.32780270618108454, + "grad_norm": 0.9921875, + "learning_rate": 0.0015698892712442546, + "loss": 0.8461, + "step": 4712 + }, + { + "epoch": 0.32787227381821976, + "grad_norm": 1.4609375, + "learning_rate": 0.0015697040958096112, + "loss": 1.0414, + "step": 4713 + }, + { + "epoch": 0.327941841455355, + "grad_norm": 1.015625, + "learning_rate": 0.0015695188914482655, + "loss": 0.8578, + "step": 4714 + }, + { + "epoch": 0.32801140909249016, + "grad_norm": 1.171875, + "learning_rate": 0.0015693336581696204, + "loss": 0.996, + "step": 4715 + }, + { + "epoch": 0.3280809767296254, + "grad_norm": 1.046875, + "learning_rate": 0.0015691483959830825, + "loss": 0.8574, + "step": 4716 + }, + { + "epoch": 0.3281505443667606, + "grad_norm": 1.171875, + "learning_rate": 0.0015689631048980575, + "loss": 0.9643, + "step": 4717 + }, + { + "epoch": 0.3282201120038958, + "grad_norm": 1.1015625, + "learning_rate": 0.0015687777849239537, + "loss": 0.6624, + "step": 4718 + }, + { + "epoch": 0.328289679641031, + "grad_norm": 1.03125, + "learning_rate": 0.001568592436070181, + "loss": 0.7277, + "step": 4719 + }, + { + "epoch": 0.3283592472781662, + "grad_norm": 1.0859375, + "learning_rate": 0.0015684070583461504, + "loss": 0.8696, + "step": 4720 + }, + { + "epoch": 0.3284288149153014, + "grad_norm": 1.265625, + "learning_rate": 0.0015682216517612741, + "loss": 0.8684, + "step": 4721 + }, + { + "epoch": 0.32849838255243663, + "grad_norm": 0.98046875, + "learning_rate": 0.0015680362163249665, + "loss": 0.8038, + "step": 4722 + }, + { + "epoch": 0.3285679501895718, + "grad_norm": 0.9921875, + "learning_rate": 0.001567850752046643, + "loss": 0.8145, + "step": 4723 + }, + { + "epoch": 0.328637517826707, + "grad_norm": 1.1875, + "learning_rate": 0.0015676652589357203, + "loss": 1.1258, + "step": 4724 + }, + { + "epoch": 0.3287070854638422, + "grad_norm": 1.0234375, + "learning_rate": 0.0015674797370016172, + "loss": 0.9553, + "step": 4725 + }, + { + "epoch": 0.3287766531009774, + "grad_norm": 1.21875, + "learning_rate": 0.0015672941862537534, + "loss": 1.1712, + "step": 4726 + }, + { + "epoch": 0.32884622073811265, + "grad_norm": 1.15625, + "learning_rate": 0.0015671086067015501, + "loss": 1.0842, + "step": 4727 + }, + { + "epoch": 0.3289157883752478, + "grad_norm": 0.9765625, + "learning_rate": 0.0015669229983544303, + "loss": 0.854, + "step": 4728 + }, + { + "epoch": 0.32898535601238305, + "grad_norm": 1.21875, + "learning_rate": 0.0015667373612218176, + "loss": 0.9932, + "step": 4729 + }, + { + "epoch": 0.32905492364951827, + "grad_norm": 1.1328125, + "learning_rate": 0.001566551695313139, + "loss": 1.0276, + "step": 4730 + }, + { + "epoch": 0.32912449128665344, + "grad_norm": 1.0546875, + "learning_rate": 0.0015663660006378203, + "loss": 0.9543, + "step": 4731 + }, + { + "epoch": 0.32919405892378867, + "grad_norm": 1.3671875, + "learning_rate": 0.0015661802772052914, + "loss": 0.8605, + "step": 4732 + }, + { + "epoch": 0.32926362656092384, + "grad_norm": 1.2109375, + "learning_rate": 0.0015659945250249814, + "loss": 1.184, + "step": 4733 + }, + { + "epoch": 0.32933319419805906, + "grad_norm": 0.86328125, + "learning_rate": 0.0015658087441063225, + "loss": 0.7605, + "step": 4734 + }, + { + "epoch": 0.3294027618351943, + "grad_norm": 1.3203125, + "learning_rate": 0.0015656229344587472, + "loss": 0.9397, + "step": 4735 + }, + { + "epoch": 0.32947232947232946, + "grad_norm": 1.0546875, + "learning_rate": 0.0015654370960916904, + "loss": 0.6376, + "step": 4736 + }, + { + "epoch": 0.3295418971094647, + "grad_norm": 1.6015625, + "learning_rate": 0.001565251229014588, + "loss": 0.9573, + "step": 4737 + }, + { + "epoch": 0.32961146474659986, + "grad_norm": 1.171875, + "learning_rate": 0.001565065333236877, + "loss": 0.8613, + "step": 4738 + }, + { + "epoch": 0.3296810323837351, + "grad_norm": 0.859375, + "learning_rate": 0.0015648794087679968, + "loss": 0.7105, + "step": 4739 + }, + { + "epoch": 0.3297506000208703, + "grad_norm": 1.2734375, + "learning_rate": 0.0015646934556173872, + "loss": 1.0448, + "step": 4740 + }, + { + "epoch": 0.3298201676580055, + "grad_norm": 0.98828125, + "learning_rate": 0.0015645074737944897, + "loss": 0.9482, + "step": 4741 + }, + { + "epoch": 0.3298897352951407, + "grad_norm": 1.109375, + "learning_rate": 0.0015643214633087488, + "loss": 0.9157, + "step": 4742 + }, + { + "epoch": 0.32995930293227593, + "grad_norm": 1.140625, + "learning_rate": 0.0015641354241696082, + "loss": 0.8553, + "step": 4743 + }, + { + "epoch": 0.3300288705694111, + "grad_norm": 1.5625, + "learning_rate": 0.001563949356386514, + "loss": 0.8695, + "step": 4744 + }, + { + "epoch": 0.33009843820654633, + "grad_norm": 1.1328125, + "learning_rate": 0.0015637632599689141, + "loss": 1.2751, + "step": 4745 + }, + { + "epoch": 0.3301680058436815, + "grad_norm": 1.0703125, + "learning_rate": 0.0015635771349262577, + "loss": 1.0886, + "step": 4746 + }, + { + "epoch": 0.3302375734808167, + "grad_norm": 1.1640625, + "learning_rate": 0.0015633909812679948, + "loss": 0.7407, + "step": 4747 + }, + { + "epoch": 0.33030714111795195, + "grad_norm": 1.125, + "learning_rate": 0.0015632047990035774, + "loss": 1.0269, + "step": 4748 + }, + { + "epoch": 0.3303767087550871, + "grad_norm": 1.28125, + "learning_rate": 0.0015630185881424592, + "loss": 1.027, + "step": 4749 + }, + { + "epoch": 0.33044627639222235, + "grad_norm": 0.86328125, + "learning_rate": 0.0015628323486940952, + "loss": 0.6675, + "step": 4750 + }, + { + "epoch": 0.3305158440293575, + "grad_norm": 0.9453125, + "learning_rate": 0.0015626460806679413, + "loss": 0.764, + "step": 4751 + }, + { + "epoch": 0.33058541166649275, + "grad_norm": 1.109375, + "learning_rate": 0.0015624597840734552, + "loss": 1.0752, + "step": 4752 + }, + { + "epoch": 0.33065497930362797, + "grad_norm": 1.1796875, + "learning_rate": 0.0015622734589200962, + "loss": 0.9472, + "step": 4753 + }, + { + "epoch": 0.33072454694076314, + "grad_norm": 1.0625, + "learning_rate": 0.001562087105217325, + "loss": 1.0138, + "step": 4754 + }, + { + "epoch": 0.33079411457789837, + "grad_norm": 1.1484375, + "learning_rate": 0.0015619007229746038, + "loss": 0.9164, + "step": 4755 + }, + { + "epoch": 0.33086368221503354, + "grad_norm": 1.203125, + "learning_rate": 0.0015617143122013963, + "loss": 0.8213, + "step": 4756 + }, + { + "epoch": 0.33093324985216876, + "grad_norm": 1.109375, + "learning_rate": 0.001561527872907167, + "loss": 0.8078, + "step": 4757 + }, + { + "epoch": 0.331002817489304, + "grad_norm": 1.125, + "learning_rate": 0.0015613414051013827, + "loss": 1.1892, + "step": 4758 + }, + { + "epoch": 0.33107238512643916, + "grad_norm": 0.98046875, + "learning_rate": 0.0015611549087935115, + "loss": 0.8594, + "step": 4759 + }, + { + "epoch": 0.3311419527635744, + "grad_norm": 0.98828125, + "learning_rate": 0.0015609683839930223, + "loss": 0.8934, + "step": 4760 + }, + { + "epoch": 0.3312115204007096, + "grad_norm": 1.140625, + "learning_rate": 0.0015607818307093856, + "loss": 0.7721, + "step": 4761 + }, + { + "epoch": 0.3312810880378448, + "grad_norm": 1.15625, + "learning_rate": 0.0015605952489520748, + "loss": 0.9527, + "step": 4762 + }, + { + "epoch": 0.33135065567498, + "grad_norm": 1.109375, + "learning_rate": 0.0015604086387305625, + "loss": 0.9401, + "step": 4763 + }, + { + "epoch": 0.3314202233121152, + "grad_norm": 0.99609375, + "learning_rate": 0.0015602220000543242, + "loss": 0.8029, + "step": 4764 + }, + { + "epoch": 0.3314897909492504, + "grad_norm": 0.84765625, + "learning_rate": 0.0015600353329328364, + "loss": 0.8822, + "step": 4765 + }, + { + "epoch": 0.33155935858638563, + "grad_norm": 1.125, + "learning_rate": 0.0015598486373755774, + "loss": 1.0305, + "step": 4766 + }, + { + "epoch": 0.3316289262235208, + "grad_norm": 1.0390625, + "learning_rate": 0.0015596619133920262, + "loss": 0.9862, + "step": 4767 + }, + { + "epoch": 0.33169849386065603, + "grad_norm": 1.4140625, + "learning_rate": 0.0015594751609916643, + "loss": 0.9982, + "step": 4768 + }, + { + "epoch": 0.3317680614977912, + "grad_norm": 0.984375, + "learning_rate": 0.0015592883801839733, + "loss": 0.7449, + "step": 4769 + }, + { + "epoch": 0.3318376291349264, + "grad_norm": 1.71875, + "learning_rate": 0.0015591015709784375, + "loss": 0.6865, + "step": 4770 + }, + { + "epoch": 0.33190719677206165, + "grad_norm": 1.21875, + "learning_rate": 0.001558914733384542, + "loss": 0.9245, + "step": 4771 + }, + { + "epoch": 0.3319767644091968, + "grad_norm": 0.98828125, + "learning_rate": 0.0015587278674117735, + "loss": 0.7876, + "step": 4772 + }, + { + "epoch": 0.33204633204633205, + "grad_norm": 0.98828125, + "learning_rate": 0.00155854097306962, + "loss": 0.8075, + "step": 4773 + }, + { + "epoch": 0.3321158996834673, + "grad_norm": 1.015625, + "learning_rate": 0.0015583540503675715, + "loss": 1.0019, + "step": 4774 + }, + { + "epoch": 0.33218546732060245, + "grad_norm": 1.1640625, + "learning_rate": 0.0015581670993151183, + "loss": 0.9934, + "step": 4775 + }, + { + "epoch": 0.33225503495773767, + "grad_norm": 1.3984375, + "learning_rate": 0.0015579801199217533, + "loss": 0.9749, + "step": 4776 + }, + { + "epoch": 0.33232460259487284, + "grad_norm": 1.109375, + "learning_rate": 0.0015577931121969703, + "loss": 0.7169, + "step": 4777 + }, + { + "epoch": 0.33239417023200807, + "grad_norm": 1.0859375, + "learning_rate": 0.0015576060761502643, + "loss": 0.9887, + "step": 4778 + }, + { + "epoch": 0.3324637378691433, + "grad_norm": 1.078125, + "learning_rate": 0.0015574190117911325, + "loss": 1.1363, + "step": 4779 + }, + { + "epoch": 0.33253330550627846, + "grad_norm": 1.0625, + "learning_rate": 0.0015572319191290726, + "loss": 0.9078, + "step": 4780 + }, + { + "epoch": 0.3326028731434137, + "grad_norm": 1.0546875, + "learning_rate": 0.001557044798173585, + "loss": 0.8433, + "step": 4781 + }, + { + "epoch": 0.33267244078054886, + "grad_norm": 1.0078125, + "learning_rate": 0.0015568576489341699, + "loss": 0.9377, + "step": 4782 + }, + { + "epoch": 0.3327420084176841, + "grad_norm": 0.875, + "learning_rate": 0.00155667047142033, + "loss": 0.775, + "step": 4783 + }, + { + "epoch": 0.3328115760548193, + "grad_norm": 1.375, + "learning_rate": 0.0015564832656415697, + "loss": 1.1054, + "step": 4784 + }, + { + "epoch": 0.3328811436919545, + "grad_norm": 1.015625, + "learning_rate": 0.0015562960316073938, + "loss": 0.8957, + "step": 4785 + }, + { + "epoch": 0.3329507113290897, + "grad_norm": 0.9453125, + "learning_rate": 0.0015561087693273098, + "loss": 0.6607, + "step": 4786 + }, + { + "epoch": 0.33302027896622494, + "grad_norm": 1.046875, + "learning_rate": 0.001555921478810825, + "loss": 0.8963, + "step": 4787 + }, + { + "epoch": 0.3330898466033601, + "grad_norm": 0.97265625, + "learning_rate": 0.00155573416006745, + "loss": 0.9921, + "step": 4788 + }, + { + "epoch": 0.33315941424049533, + "grad_norm": 1.390625, + "learning_rate": 0.001555546813106695, + "loss": 1.1758, + "step": 4789 + }, + { + "epoch": 0.3332289818776305, + "grad_norm": 1.40625, + "learning_rate": 0.0015553594379380733, + "loss": 1.159, + "step": 4790 + }, + { + "epoch": 0.33329854951476573, + "grad_norm": 1.03125, + "learning_rate": 0.0015551720345710987, + "loss": 1.0012, + "step": 4791 + }, + { + "epoch": 0.33336811715190096, + "grad_norm": 1.359375, + "learning_rate": 0.0015549846030152858, + "loss": 0.9196, + "step": 4792 + }, + { + "epoch": 0.3334376847890361, + "grad_norm": 0.8984375, + "learning_rate": 0.0015547971432801528, + "loss": 0.9451, + "step": 4793 + }, + { + "epoch": 0.33350725242617135, + "grad_norm": 1.1796875, + "learning_rate": 0.001554609655375217, + "loss": 0.8466, + "step": 4794 + }, + { + "epoch": 0.3335768200633065, + "grad_norm": 1.0078125, + "learning_rate": 0.0015544221393099984, + "loss": 0.7053, + "step": 4795 + }, + { + "epoch": 0.33364638770044175, + "grad_norm": 1.1796875, + "learning_rate": 0.0015542345950940177, + "loss": 0.8763, + "step": 4796 + }, + { + "epoch": 0.333715955337577, + "grad_norm": 1.25, + "learning_rate": 0.0015540470227367984, + "loss": 1.0129, + "step": 4797 + }, + { + "epoch": 0.33378552297471215, + "grad_norm": 1.1484375, + "learning_rate": 0.0015538594222478635, + "loss": 0.9119, + "step": 4798 + }, + { + "epoch": 0.33385509061184737, + "grad_norm": 1.328125, + "learning_rate": 0.001553671793636739, + "loss": 0.9252, + "step": 4799 + }, + { + "epoch": 0.3339246582489826, + "grad_norm": 1.0859375, + "learning_rate": 0.0015534841369129514, + "loss": 0.9074, + "step": 4800 + }, + { + "epoch": 0.33399422588611777, + "grad_norm": 1.3203125, + "learning_rate": 0.001553296452086029, + "loss": 0.8542, + "step": 4801 + }, + { + "epoch": 0.334063793523253, + "grad_norm": 1.3203125, + "learning_rate": 0.0015531087391655017, + "loss": 1.1449, + "step": 4802 + }, + { + "epoch": 0.33413336116038816, + "grad_norm": 1.328125, + "learning_rate": 0.0015529209981609005, + "loss": 0.9562, + "step": 4803 + }, + { + "epoch": 0.3342029287975234, + "grad_norm": 1.234375, + "learning_rate": 0.001552733229081758, + "loss": 0.9129, + "step": 4804 + }, + { + "epoch": 0.3342724964346586, + "grad_norm": 0.9921875, + "learning_rate": 0.0015525454319376079, + "loss": 1.0567, + "step": 4805 + }, + { + "epoch": 0.3343420640717938, + "grad_norm": 1.4296875, + "learning_rate": 0.0015523576067379861, + "loss": 1.0974, + "step": 4806 + }, + { + "epoch": 0.334411631708929, + "grad_norm": 0.99609375, + "learning_rate": 0.001552169753492429, + "loss": 0.9468, + "step": 4807 + }, + { + "epoch": 0.3344811993460642, + "grad_norm": 0.9921875, + "learning_rate": 0.0015519818722104747, + "loss": 0.7054, + "step": 4808 + }, + { + "epoch": 0.3345507669831994, + "grad_norm": 1.3203125, + "learning_rate": 0.0015517939629016634, + "loss": 1.3383, + "step": 4809 + }, + { + "epoch": 0.33462033462033464, + "grad_norm": 1.203125, + "learning_rate": 0.001551606025575536, + "loss": 0.8769, + "step": 4810 + }, + { + "epoch": 0.3346899022574698, + "grad_norm": 1.109375, + "learning_rate": 0.0015514180602416348, + "loss": 0.8681, + "step": 4811 + }, + { + "epoch": 0.33475946989460503, + "grad_norm": 1.1484375, + "learning_rate": 0.0015512300669095036, + "loss": 0.9219, + "step": 4812 + }, + { + "epoch": 0.33482903753174026, + "grad_norm": 1.25, + "learning_rate": 0.0015510420455886885, + "loss": 1.2348, + "step": 4813 + }, + { + "epoch": 0.33489860516887543, + "grad_norm": 1.0703125, + "learning_rate": 0.0015508539962887356, + "loss": 0.6768, + "step": 4814 + }, + { + "epoch": 0.33496817280601066, + "grad_norm": 1.171875, + "learning_rate": 0.001550665919019193, + "loss": 0.8907, + "step": 4815 + }, + { + "epoch": 0.3350377404431458, + "grad_norm": 1.015625, + "learning_rate": 0.0015504778137896108, + "loss": 0.93, + "step": 4816 + }, + { + "epoch": 0.33510730808028105, + "grad_norm": 1.3125, + "learning_rate": 0.0015502896806095397, + "loss": 1.2813, + "step": 4817 + }, + { + "epoch": 0.3351768757174163, + "grad_norm": 0.984375, + "learning_rate": 0.0015501015194885326, + "loss": 1.2485, + "step": 4818 + }, + { + "epoch": 0.33524644335455145, + "grad_norm": 1.1015625, + "learning_rate": 0.0015499133304361426, + "loss": 0.9347, + "step": 4819 + }, + { + "epoch": 0.3353160109916867, + "grad_norm": 1.1484375, + "learning_rate": 0.001549725113461926, + "loss": 0.8417, + "step": 4820 + }, + { + "epoch": 0.33538557862882185, + "grad_norm": 1.0234375, + "learning_rate": 0.0015495368685754386, + "loss": 0.765, + "step": 4821 + }, + { + "epoch": 0.33545514626595707, + "grad_norm": 1.0546875, + "learning_rate": 0.0015493485957862388, + "loss": 1.0378, + "step": 4822 + }, + { + "epoch": 0.3355247139030923, + "grad_norm": 0.9453125, + "learning_rate": 0.0015491602951038866, + "loss": 0.891, + "step": 4823 + }, + { + "epoch": 0.33559428154022747, + "grad_norm": 1.1796875, + "learning_rate": 0.0015489719665379422, + "loss": 1.2746, + "step": 4824 + }, + { + "epoch": 0.3356638491773627, + "grad_norm": 1.1015625, + "learning_rate": 0.0015487836100979686, + "loss": 1.024, + "step": 4825 + }, + { + "epoch": 0.3357334168144979, + "grad_norm": 1.09375, + "learning_rate": 0.0015485952257935293, + "loss": 0.8937, + "step": 4826 + }, + { + "epoch": 0.3358029844516331, + "grad_norm": 1.265625, + "learning_rate": 0.0015484068136341898, + "loss": 1.1165, + "step": 4827 + }, + { + "epoch": 0.3358725520887683, + "grad_norm": 0.96875, + "learning_rate": 0.001548218373629516, + "loss": 0.8166, + "step": 4828 + }, + { + "epoch": 0.3359421197259035, + "grad_norm": 1.1015625, + "learning_rate": 0.0015480299057890768, + "loss": 0.7577, + "step": 4829 + }, + { + "epoch": 0.3360116873630387, + "grad_norm": 1.3046875, + "learning_rate": 0.0015478414101224409, + "loss": 1.0613, + "step": 4830 + }, + { + "epoch": 0.33608125500017394, + "grad_norm": 1.2421875, + "learning_rate": 0.0015476528866391797, + "loss": 0.8719, + "step": 4831 + }, + { + "epoch": 0.3361508226373091, + "grad_norm": 1.1171875, + "learning_rate": 0.0015474643353488653, + "loss": 0.876, + "step": 4832 + }, + { + "epoch": 0.33622039027444434, + "grad_norm": 1.03125, + "learning_rate": 0.0015472757562610714, + "loss": 0.6715, + "step": 4833 + }, + { + "epoch": 0.3362899579115795, + "grad_norm": 1.1328125, + "learning_rate": 0.0015470871493853734, + "loss": 0.7938, + "step": 4834 + }, + { + "epoch": 0.33635952554871473, + "grad_norm": 0.74609375, + "learning_rate": 0.0015468985147313468, + "loss": 0.7811, + "step": 4835 + }, + { + "epoch": 0.33642909318584996, + "grad_norm": 1.2421875, + "learning_rate": 0.0015467098523085706, + "loss": 0.9007, + "step": 4836 + }, + { + "epoch": 0.33649866082298513, + "grad_norm": 0.93359375, + "learning_rate": 0.0015465211621266237, + "loss": 0.8734, + "step": 4837 + }, + { + "epoch": 0.33656822846012036, + "grad_norm": 0.9609375, + "learning_rate": 0.0015463324441950868, + "loss": 0.8147, + "step": 4838 + }, + { + "epoch": 0.3366377960972556, + "grad_norm": 1.171875, + "learning_rate": 0.0015461436985235422, + "loss": 1.0236, + "step": 4839 + }, + { + "epoch": 0.33670736373439075, + "grad_norm": 0.984375, + "learning_rate": 0.0015459549251215733, + "loss": 0.9286, + "step": 4840 + }, + { + "epoch": 0.336776931371526, + "grad_norm": 0.94921875, + "learning_rate": 0.001545766123998765, + "loss": 0.6887, + "step": 4841 + }, + { + "epoch": 0.33684649900866115, + "grad_norm": 1.328125, + "learning_rate": 0.001545577295164704, + "loss": 0.8163, + "step": 4842 + }, + { + "epoch": 0.3369160666457964, + "grad_norm": 1.1640625, + "learning_rate": 0.0015453884386289775, + "loss": 0.9204, + "step": 4843 + }, + { + "epoch": 0.3369856342829316, + "grad_norm": 1.109375, + "learning_rate": 0.0015451995544011755, + "loss": 0.9626, + "step": 4844 + }, + { + "epoch": 0.33705520192006677, + "grad_norm": 1.140625, + "learning_rate": 0.0015450106424908876, + "loss": 0.8108, + "step": 4845 + }, + { + "epoch": 0.337124769557202, + "grad_norm": 1.125, + "learning_rate": 0.001544821702907707, + "loss": 1.0259, + "step": 4846 + }, + { + "epoch": 0.33719433719433717, + "grad_norm": 1.265625, + "learning_rate": 0.001544632735661226, + "loss": 0.906, + "step": 4847 + }, + { + "epoch": 0.3372639048314724, + "grad_norm": 1.1640625, + "learning_rate": 0.00154444374076104, + "loss": 1.0432, + "step": 4848 + }, + { + "epoch": 0.3373334724686076, + "grad_norm": 0.93359375, + "learning_rate": 0.0015442547182167449, + "loss": 0.8352, + "step": 4849 + }, + { + "epoch": 0.3374030401057428, + "grad_norm": 1.0078125, + "learning_rate": 0.0015440656680379386, + "loss": 0.8683, + "step": 4850 + }, + { + "epoch": 0.337472607742878, + "grad_norm": 1.078125, + "learning_rate": 0.0015438765902342198, + "loss": 0.7643, + "step": 4851 + }, + { + "epoch": 0.33754217538001324, + "grad_norm": 1.1015625, + "learning_rate": 0.0015436874848151893, + "loss": 0.9901, + "step": 4852 + }, + { + "epoch": 0.3376117430171484, + "grad_norm": 1.1484375, + "learning_rate": 0.0015434983517904485, + "loss": 0.7876, + "step": 4853 + }, + { + "epoch": 0.33768131065428364, + "grad_norm": 1.3203125, + "learning_rate": 0.0015433091911696009, + "loss": 1.1143, + "step": 4854 + }, + { + "epoch": 0.3377508782914188, + "grad_norm": 1.265625, + "learning_rate": 0.0015431200029622511, + "loss": 1.0496, + "step": 4855 + }, + { + "epoch": 0.33782044592855404, + "grad_norm": 1.21875, + "learning_rate": 0.001542930787178005, + "loss": 0.9421, + "step": 4856 + }, + { + "epoch": 0.33789001356568926, + "grad_norm": 0.875, + "learning_rate": 0.0015427415438264702, + "loss": 0.6855, + "step": 4857 + }, + { + "epoch": 0.33795958120282443, + "grad_norm": 1.109375, + "learning_rate": 0.0015425522729172552, + "loss": 0.639, + "step": 4858 + }, + { + "epoch": 0.33802914883995966, + "grad_norm": 1.015625, + "learning_rate": 0.0015423629744599709, + "loss": 0.854, + "step": 4859 + }, + { + "epoch": 0.33809871647709483, + "grad_norm": 1.2578125, + "learning_rate": 0.001542173648464228, + "loss": 0.9457, + "step": 4860 + }, + { + "epoch": 0.33816828411423006, + "grad_norm": 1.25, + "learning_rate": 0.0015419842949396404, + "loss": 1.1196, + "step": 4861 + }, + { + "epoch": 0.3382378517513653, + "grad_norm": 1.4453125, + "learning_rate": 0.0015417949138958218, + "loss": 0.901, + "step": 4862 + }, + { + "epoch": 0.33830741938850045, + "grad_norm": 1.1484375, + "learning_rate": 0.0015416055053423885, + "loss": 1.0023, + "step": 4863 + }, + { + "epoch": 0.3383769870256357, + "grad_norm": 1.15625, + "learning_rate": 0.0015414160692889575, + "loss": 0.8722, + "step": 4864 + }, + { + "epoch": 0.3384465546627709, + "grad_norm": 1.1796875, + "learning_rate": 0.0015412266057451471, + "loss": 0.8777, + "step": 4865 + }, + { + "epoch": 0.3385161222999061, + "grad_norm": 0.97265625, + "learning_rate": 0.001541037114720578, + "loss": 0.7736, + "step": 4866 + }, + { + "epoch": 0.3385856899370413, + "grad_norm": 1.03125, + "learning_rate": 0.001540847596224871, + "loss": 0.8343, + "step": 4867 + }, + { + "epoch": 0.33865525757417647, + "grad_norm": 0.98828125, + "learning_rate": 0.0015406580502676497, + "loss": 0.7841, + "step": 4868 + }, + { + "epoch": 0.3387248252113117, + "grad_norm": 1.2109375, + "learning_rate": 0.0015404684768585374, + "loss": 0.9431, + "step": 4869 + }, + { + "epoch": 0.3387943928484469, + "grad_norm": 0.97265625, + "learning_rate": 0.0015402788760071598, + "loss": 0.9309, + "step": 4870 + }, + { + "epoch": 0.3388639604855821, + "grad_norm": 1.1640625, + "learning_rate": 0.0015400892477231442, + "loss": 1.0093, + "step": 4871 + }, + { + "epoch": 0.3389335281227173, + "grad_norm": 1.046875, + "learning_rate": 0.001539899592016119, + "loss": 0.8758, + "step": 4872 + }, + { + "epoch": 0.3390030957598525, + "grad_norm": 1.5, + "learning_rate": 0.0015397099088957137, + "loss": 1.0624, + "step": 4873 + }, + { + "epoch": 0.3390726633969877, + "grad_norm": 0.86328125, + "learning_rate": 0.0015395201983715594, + "loss": 0.6798, + "step": 4874 + }, + { + "epoch": 0.33914223103412294, + "grad_norm": 1.171875, + "learning_rate": 0.001539330460453289, + "loss": 1.0312, + "step": 4875 + }, + { + "epoch": 0.3392117986712581, + "grad_norm": 1.03125, + "learning_rate": 0.0015391406951505361, + "loss": 0.9884, + "step": 4876 + }, + { + "epoch": 0.33928136630839334, + "grad_norm": 1.1171875, + "learning_rate": 0.0015389509024729365, + "loss": 0.9795, + "step": 4877 + }, + { + "epoch": 0.33935093394552857, + "grad_norm": 1.3125, + "learning_rate": 0.0015387610824301263, + "loss": 0.9417, + "step": 4878 + }, + { + "epoch": 0.33942050158266374, + "grad_norm": 0.94921875, + "learning_rate": 0.001538571235031744, + "loss": 0.7121, + "step": 4879 + }, + { + "epoch": 0.33949006921979896, + "grad_norm": 1.1171875, + "learning_rate": 0.0015383813602874291, + "loss": 0.8359, + "step": 4880 + }, + { + "epoch": 0.33955963685693413, + "grad_norm": 1.1640625, + "learning_rate": 0.0015381914582068223, + "loss": 1.0214, + "step": 4881 + }, + { + "epoch": 0.33962920449406936, + "grad_norm": 1.25, + "learning_rate": 0.0015380015287995655, + "loss": 0.8943, + "step": 4882 + }, + { + "epoch": 0.3396987721312046, + "grad_norm": 1.0390625, + "learning_rate": 0.0015378115720753032, + "loss": 0.7631, + "step": 4883 + }, + { + "epoch": 0.33976833976833976, + "grad_norm": 1.1171875, + "learning_rate": 0.00153762158804368, + "loss": 0.9479, + "step": 4884 + }, + { + "epoch": 0.339837907405475, + "grad_norm": 1.1484375, + "learning_rate": 0.0015374315767143422, + "loss": 1.1355, + "step": 4885 + }, + { + "epoch": 0.33990747504261015, + "grad_norm": 1.1953125, + "learning_rate": 0.001537241538096938, + "loss": 0.9332, + "step": 4886 + }, + { + "epoch": 0.3399770426797454, + "grad_norm": 0.9765625, + "learning_rate": 0.0015370514722011163, + "loss": 0.7488, + "step": 4887 + }, + { + "epoch": 0.3400466103168806, + "grad_norm": 1.1640625, + "learning_rate": 0.001536861379036528, + "loss": 0.9931, + "step": 4888 + }, + { + "epoch": 0.3401161779540158, + "grad_norm": 0.99609375, + "learning_rate": 0.0015366712586128246, + "loss": 0.9932, + "step": 4889 + }, + { + "epoch": 0.340185745591151, + "grad_norm": 1.0234375, + "learning_rate": 0.00153648111093966, + "loss": 0.7979, + "step": 4890 + }, + { + "epoch": 0.3402553132282862, + "grad_norm": 1.078125, + "learning_rate": 0.0015362909360266883, + "loss": 1.0964, + "step": 4891 + }, + { + "epoch": 0.3403248808654214, + "grad_norm": 1.28125, + "learning_rate": 0.0015361007338835662, + "loss": 0.938, + "step": 4892 + }, + { + "epoch": 0.3403944485025566, + "grad_norm": 1.125, + "learning_rate": 0.0015359105045199511, + "loss": 0.8411, + "step": 4893 + }, + { + "epoch": 0.3404640161396918, + "grad_norm": 1.109375, + "learning_rate": 0.0015357202479455016, + "loss": 1.1284, + "step": 4894 + }, + { + "epoch": 0.340533583776827, + "grad_norm": 0.9140625, + "learning_rate": 0.001535529964169878, + "loss": 1.0382, + "step": 4895 + }, + { + "epoch": 0.34060315141396225, + "grad_norm": 0.91796875, + "learning_rate": 0.0015353396532027423, + "loss": 0.8875, + "step": 4896 + }, + { + "epoch": 0.3406727190510974, + "grad_norm": 1.2265625, + "learning_rate": 0.001535149315053757, + "loss": 1.2211, + "step": 4897 + }, + { + "epoch": 0.34074228668823264, + "grad_norm": 1.359375, + "learning_rate": 0.0015349589497325872, + "loss": 0.9812, + "step": 4898 + }, + { + "epoch": 0.3408118543253678, + "grad_norm": 1.375, + "learning_rate": 0.001534768557248898, + "loss": 0.8877, + "step": 4899 + }, + { + "epoch": 0.34088142196250304, + "grad_norm": 1.1875, + "learning_rate": 0.0015345781376123573, + "loss": 0.8944, + "step": 4900 + }, + { + "epoch": 0.34095098959963827, + "grad_norm": 1.09375, + "learning_rate": 0.001534387690832633, + "loss": 0.8149, + "step": 4901 + }, + { + "epoch": 0.34102055723677344, + "grad_norm": 1.015625, + "learning_rate": 0.0015341972169193952, + "loss": 0.6687, + "step": 4902 + }, + { + "epoch": 0.34109012487390866, + "grad_norm": 0.96484375, + "learning_rate": 0.0015340067158823155, + "loss": 0.8705, + "step": 4903 + }, + { + "epoch": 0.3411596925110439, + "grad_norm": 1.0625, + "learning_rate": 0.001533816187731066, + "loss": 0.8809, + "step": 4904 + }, + { + "epoch": 0.34122926014817906, + "grad_norm": 0.96875, + "learning_rate": 0.0015336256324753215, + "loss": 0.868, + "step": 4905 + }, + { + "epoch": 0.3412988277853143, + "grad_norm": 1.140625, + "learning_rate": 0.0015334350501247569, + "loss": 0.7088, + "step": 4906 + }, + { + "epoch": 0.34136839542244946, + "grad_norm": 1.203125, + "learning_rate": 0.001533244440689049, + "loss": 0.904, + "step": 4907 + }, + { + "epoch": 0.3414379630595847, + "grad_norm": 1.171875, + "learning_rate": 0.0015330538041778766, + "loss": 1.0991, + "step": 4908 + }, + { + "epoch": 0.3415075306967199, + "grad_norm": 0.9921875, + "learning_rate": 0.0015328631406009183, + "loss": 0.9877, + "step": 4909 + }, + { + "epoch": 0.3415770983338551, + "grad_norm": 1.0859375, + "learning_rate": 0.001532672449967856, + "loss": 0.7401, + "step": 4910 + }, + { + "epoch": 0.3416466659709903, + "grad_norm": 0.984375, + "learning_rate": 0.0015324817322883715, + "loss": 0.6729, + "step": 4911 + }, + { + "epoch": 0.3417162336081255, + "grad_norm": 1.1875, + "learning_rate": 0.0015322909875721481, + "loss": 0.925, + "step": 4912 + }, + { + "epoch": 0.3417858012452607, + "grad_norm": 1.2421875, + "learning_rate": 0.001532100215828872, + "loss": 0.9261, + "step": 4913 + }, + { + "epoch": 0.3418553688823959, + "grad_norm": 0.984375, + "learning_rate": 0.0015319094170682282, + "loss": 0.7656, + "step": 4914 + }, + { + "epoch": 0.3419249365195311, + "grad_norm": 1.2265625, + "learning_rate": 0.0015317185912999056, + "loss": 0.8597, + "step": 4915 + }, + { + "epoch": 0.3419945041566663, + "grad_norm": 0.98046875, + "learning_rate": 0.001531527738533593, + "loss": 0.6822, + "step": 4916 + }, + { + "epoch": 0.34206407179380155, + "grad_norm": 0.9921875, + "learning_rate": 0.001531336858778981, + "loss": 1.1433, + "step": 4917 + }, + { + "epoch": 0.3421336394309367, + "grad_norm": 1.09375, + "learning_rate": 0.0015311459520457613, + "loss": 0.9465, + "step": 4918 + }, + { + "epoch": 0.34220320706807195, + "grad_norm": 0.9296875, + "learning_rate": 0.0015309550183436273, + "loss": 0.5912, + "step": 4919 + }, + { + "epoch": 0.3422727747052071, + "grad_norm": 1.171875, + "learning_rate": 0.0015307640576822737, + "loss": 0.9429, + "step": 4920 + }, + { + "epoch": 0.34234234234234234, + "grad_norm": 1.0, + "learning_rate": 0.0015305730700713965, + "loss": 0.7966, + "step": 4921 + }, + { + "epoch": 0.34241190997947757, + "grad_norm": 1.1328125, + "learning_rate": 0.0015303820555206931, + "loss": 0.9875, + "step": 4922 + }, + { + "epoch": 0.34248147761661274, + "grad_norm": 1.21875, + "learning_rate": 0.0015301910140398623, + "loss": 1.0872, + "step": 4923 + }, + { + "epoch": 0.34255104525374797, + "grad_norm": 1.3046875, + "learning_rate": 0.001529999945638604, + "loss": 0.9393, + "step": 4924 + }, + { + "epoch": 0.34262061289088314, + "grad_norm": 1.140625, + "learning_rate": 0.00152980885032662, + "loss": 0.9258, + "step": 4925 + }, + { + "epoch": 0.34269018052801836, + "grad_norm": 1.0390625, + "learning_rate": 0.001529617728113613, + "loss": 0.9441, + "step": 4926 + }, + { + "epoch": 0.3427597481651536, + "grad_norm": 0.98828125, + "learning_rate": 0.0015294265790092873, + "loss": 0.8605, + "step": 4927 + }, + { + "epoch": 0.34282931580228876, + "grad_norm": 0.859375, + "learning_rate": 0.0015292354030233483, + "loss": 0.7356, + "step": 4928 + }, + { + "epoch": 0.342898883439424, + "grad_norm": 1.0390625, + "learning_rate": 0.0015290442001655031, + "loss": 1.0271, + "step": 4929 + }, + { + "epoch": 0.3429684510765592, + "grad_norm": 1.3828125, + "learning_rate": 0.0015288529704454601, + "loss": 1.0625, + "step": 4930 + }, + { + "epoch": 0.3430380187136944, + "grad_norm": 0.9609375, + "learning_rate": 0.0015286617138729288, + "loss": 0.9363, + "step": 4931 + }, + { + "epoch": 0.3431075863508296, + "grad_norm": 1.0390625, + "learning_rate": 0.0015284704304576204, + "loss": 0.9087, + "step": 4932 + }, + { + "epoch": 0.3431771539879648, + "grad_norm": 1.359375, + "learning_rate": 0.0015282791202092475, + "loss": 1.0606, + "step": 4933 + }, + { + "epoch": 0.3432467216251, + "grad_norm": 0.9765625, + "learning_rate": 0.001528087783137523, + "loss": 0.9597, + "step": 4934 + }, + { + "epoch": 0.34331628926223523, + "grad_norm": 1.2734375, + "learning_rate": 0.0015278964192521629, + "loss": 1.043, + "step": 4935 + }, + { + "epoch": 0.3433858568993704, + "grad_norm": 1.046875, + "learning_rate": 0.0015277050285628835, + "loss": 0.7973, + "step": 4936 + }, + { + "epoch": 0.3434554245365056, + "grad_norm": 0.96875, + "learning_rate": 0.0015275136110794027, + "loss": 0.9741, + "step": 4937 + }, + { + "epoch": 0.3435249921736408, + "grad_norm": 1.0390625, + "learning_rate": 0.0015273221668114392, + "loss": 0.8344, + "step": 4938 + }, + { + "epoch": 0.343594559810776, + "grad_norm": 1.3828125, + "learning_rate": 0.0015271306957687142, + "loss": 0.991, + "step": 4939 + }, + { + "epoch": 0.34366412744791125, + "grad_norm": 1.171875, + "learning_rate": 0.001526939197960949, + "loss": 0.6336, + "step": 4940 + }, + { + "epoch": 0.3437336950850464, + "grad_norm": 1.0859375, + "learning_rate": 0.001526747673397868, + "loss": 0.8836, + "step": 4941 + }, + { + "epoch": 0.34380326272218165, + "grad_norm": 1.4921875, + "learning_rate": 0.0015265561220891948, + "loss": 0.9956, + "step": 4942 + }, + { + "epoch": 0.3438728303593169, + "grad_norm": 0.86328125, + "learning_rate": 0.0015263645440446558, + "loss": 0.7178, + "step": 4943 + }, + { + "epoch": 0.34394239799645204, + "grad_norm": 1.0703125, + "learning_rate": 0.0015261729392739786, + "loss": 0.7417, + "step": 4944 + }, + { + "epoch": 0.34401196563358727, + "grad_norm": 1.5078125, + "learning_rate": 0.001525981307786891, + "loss": 0.6659, + "step": 4945 + }, + { + "epoch": 0.34408153327072244, + "grad_norm": 1.2109375, + "learning_rate": 0.0015257896495931244, + "loss": 1.1262, + "step": 4946 + }, + { + "epoch": 0.34415110090785767, + "grad_norm": 0.9921875, + "learning_rate": 0.001525597964702409, + "loss": 1.0506, + "step": 4947 + }, + { + "epoch": 0.3442206685449929, + "grad_norm": 1.421875, + "learning_rate": 0.0015254062531244786, + "loss": 1.0611, + "step": 4948 + }, + { + "epoch": 0.34429023618212806, + "grad_norm": 1.0546875, + "learning_rate": 0.0015252145148690666, + "loss": 0.8728, + "step": 4949 + }, + { + "epoch": 0.3443598038192633, + "grad_norm": 0.91015625, + "learning_rate": 0.0015250227499459088, + "loss": 0.8899, + "step": 4950 + }, + { + "epoch": 0.34442937145639846, + "grad_norm": 0.9453125, + "learning_rate": 0.0015248309583647424, + "loss": 0.9292, + "step": 4951 + }, + { + "epoch": 0.3444989390935337, + "grad_norm": 0.9609375, + "learning_rate": 0.0015246391401353052, + "loss": 0.7855, + "step": 4952 + }, + { + "epoch": 0.3445685067306689, + "grad_norm": 0.7890625, + "learning_rate": 0.0015244472952673368, + "loss": 0.8608, + "step": 4953 + }, + { + "epoch": 0.3446380743678041, + "grad_norm": 1.03125, + "learning_rate": 0.0015242554237705778, + "loss": 0.6868, + "step": 4954 + }, + { + "epoch": 0.3447076420049393, + "grad_norm": 1.109375, + "learning_rate": 0.0015240635256547712, + "loss": 0.8707, + "step": 4955 + }, + { + "epoch": 0.34477720964207453, + "grad_norm": 1.03125, + "learning_rate": 0.00152387160092966, + "loss": 0.968, + "step": 4956 + }, + { + "epoch": 0.3448467772792097, + "grad_norm": 1.1875, + "learning_rate": 0.0015236796496049898, + "loss": 0.8383, + "step": 4957 + }, + { + "epoch": 0.34491634491634493, + "grad_norm": 1.0625, + "learning_rate": 0.0015234876716905062, + "loss": 0.8117, + "step": 4958 + }, + { + "epoch": 0.3449859125534801, + "grad_norm": 0.94140625, + "learning_rate": 0.0015232956671959574, + "loss": 0.9149, + "step": 4959 + }, + { + "epoch": 0.3450554801906153, + "grad_norm": 1.34375, + "learning_rate": 0.001523103636131092, + "loss": 0.8582, + "step": 4960 + }, + { + "epoch": 0.34512504782775055, + "grad_norm": 1.09375, + "learning_rate": 0.001522911578505661, + "loss": 0.7958, + "step": 4961 + }, + { + "epoch": 0.3451946154648857, + "grad_norm": 0.9609375, + "learning_rate": 0.0015227194943294154, + "loss": 0.6385, + "step": 4962 + }, + { + "epoch": 0.34526418310202095, + "grad_norm": 1.0625, + "learning_rate": 0.0015225273836121085, + "loss": 0.8825, + "step": 4963 + }, + { + "epoch": 0.3453337507391561, + "grad_norm": 1.140625, + "learning_rate": 0.001522335246363495, + "loss": 0.8911, + "step": 4964 + }, + { + "epoch": 0.34540331837629135, + "grad_norm": 1.15625, + "learning_rate": 0.0015221430825933305, + "loss": 0.7903, + "step": 4965 + }, + { + "epoch": 0.34547288601342657, + "grad_norm": 0.953125, + "learning_rate": 0.001521950892311372, + "loss": 0.844, + "step": 4966 + }, + { + "epoch": 0.34554245365056174, + "grad_norm": 1.265625, + "learning_rate": 0.0015217586755273778, + "loss": 0.9461, + "step": 4967 + }, + { + "epoch": 0.34561202128769697, + "grad_norm": 1.109375, + "learning_rate": 0.001521566432251108, + "loss": 0.8895, + "step": 4968 + }, + { + "epoch": 0.3456815889248322, + "grad_norm": 0.99609375, + "learning_rate": 0.0015213741624923239, + "loss": 0.6103, + "step": 4969 + }, + { + "epoch": 0.34575115656196737, + "grad_norm": 1.5234375, + "learning_rate": 0.0015211818662607872, + "loss": 0.8747, + "step": 4970 + }, + { + "epoch": 0.3458207241991026, + "grad_norm": 1.28125, + "learning_rate": 0.001520989543566263, + "loss": 0.7802, + "step": 4971 + }, + { + "epoch": 0.34589029183623776, + "grad_norm": 0.953125, + "learning_rate": 0.0015207971944185155, + "loss": 0.8913, + "step": 4972 + }, + { + "epoch": 0.345959859473373, + "grad_norm": 1.1171875, + "learning_rate": 0.0015206048188273113, + "loss": 0.9981, + "step": 4973 + }, + { + "epoch": 0.3460294271105082, + "grad_norm": 1.265625, + "learning_rate": 0.0015204124168024184, + "loss": 0.9906, + "step": 4974 + }, + { + "epoch": 0.3460989947476434, + "grad_norm": 1.2890625, + "learning_rate": 0.0015202199883536064, + "loss": 0.807, + "step": 4975 + }, + { + "epoch": 0.3461685623847786, + "grad_norm": 1.4921875, + "learning_rate": 0.0015200275334906453, + "loss": 1.1194, + "step": 4976 + }, + { + "epoch": 0.3462381300219138, + "grad_norm": 1.1171875, + "learning_rate": 0.0015198350522233068, + "loss": 0.9465, + "step": 4977 + }, + { + "epoch": 0.346307697659049, + "grad_norm": 1.25, + "learning_rate": 0.001519642544561365, + "loss": 1.0371, + "step": 4978 + }, + { + "epoch": 0.34637726529618423, + "grad_norm": 1.015625, + "learning_rate": 0.0015194500105145936, + "loss": 0.8513, + "step": 4979 + }, + { + "epoch": 0.3464468329333194, + "grad_norm": 1.4609375, + "learning_rate": 0.0015192574500927695, + "loss": 0.8837, + "step": 4980 + }, + { + "epoch": 0.34651640057045463, + "grad_norm": 1.234375, + "learning_rate": 0.001519064863305669, + "loss": 0.7, + "step": 4981 + }, + { + "epoch": 0.34658596820758986, + "grad_norm": 1.1328125, + "learning_rate": 0.0015188722501630711, + "loss": 0.8776, + "step": 4982 + }, + { + "epoch": 0.346655535844725, + "grad_norm": 0.890625, + "learning_rate": 0.0015186796106747553, + "loss": 0.7257, + "step": 4983 + }, + { + "epoch": 0.34672510348186025, + "grad_norm": 1.015625, + "learning_rate": 0.0015184869448505035, + "loss": 0.911, + "step": 4984 + }, + { + "epoch": 0.3467946711189954, + "grad_norm": 1.203125, + "learning_rate": 0.0015182942527000982, + "loss": 0.5819, + "step": 4985 + }, + { + "epoch": 0.34686423875613065, + "grad_norm": 1.109375, + "learning_rate": 0.0015181015342333227, + "loss": 0.7151, + "step": 4986 + }, + { + "epoch": 0.3469338063932659, + "grad_norm": 1.0859375, + "learning_rate": 0.001517908789459963, + "loss": 0.9317, + "step": 4987 + }, + { + "epoch": 0.34700337403040105, + "grad_norm": 0.87890625, + "learning_rate": 0.0015177160183898054, + "loss": 0.7153, + "step": 4988 + }, + { + "epoch": 0.34707294166753627, + "grad_norm": 1.171875, + "learning_rate": 0.0015175232210326377, + "loss": 0.8403, + "step": 4989 + }, + { + "epoch": 0.34714250930467144, + "grad_norm": 1.140625, + "learning_rate": 0.0015173303973982498, + "loss": 0.8124, + "step": 4990 + }, + { + "epoch": 0.34721207694180667, + "grad_norm": 1.640625, + "learning_rate": 0.0015171375474964312, + "loss": 0.7931, + "step": 4991 + }, + { + "epoch": 0.3472816445789419, + "grad_norm": 1.25, + "learning_rate": 0.001516944671336975, + "loss": 1.0845, + "step": 4992 + }, + { + "epoch": 0.34735121221607707, + "grad_norm": 0.9375, + "learning_rate": 0.0015167517689296734, + "loss": 0.8126, + "step": 4993 + }, + { + "epoch": 0.3474207798532123, + "grad_norm": 1.1171875, + "learning_rate": 0.0015165588402843225, + "loss": 1.018, + "step": 4994 + }, + { + "epoch": 0.3474903474903475, + "grad_norm": 0.8515625, + "learning_rate": 0.0015163658854107165, + "loss": 0.8798, + "step": 4995 + }, + { + "epoch": 0.3475599151274827, + "grad_norm": 1.109375, + "learning_rate": 0.0015161729043186541, + "loss": 0.988, + "step": 4996 + }, + { + "epoch": 0.3476294827646179, + "grad_norm": 0.9765625, + "learning_rate": 0.001515979897017933, + "loss": 0.8558, + "step": 4997 + }, + { + "epoch": 0.3476990504017531, + "grad_norm": 1.15625, + "learning_rate": 0.0015157868635183537, + "loss": 1.0063, + "step": 4998 + }, + { + "epoch": 0.3477686180388883, + "grad_norm": 1.40625, + "learning_rate": 0.001515593803829717, + "loss": 1.202, + "step": 4999 + }, + { + "epoch": 0.34783818567602354, + "grad_norm": 1.390625, + "learning_rate": 0.0015154007179618257, + "loss": 1.1614, + "step": 5000 + }, + { + "epoch": 0.3479077533131587, + "grad_norm": 1.15625, + "learning_rate": 0.0015152076059244842, + "loss": 0.6836, + "step": 5001 + }, + { + "epoch": 0.34797732095029393, + "grad_norm": 1.0546875, + "learning_rate": 0.0015150144677274966, + "loss": 0.8049, + "step": 5002 + }, + { + "epoch": 0.3480468885874291, + "grad_norm": 1.0234375, + "learning_rate": 0.0015148213033806708, + "loss": 1.0787, + "step": 5003 + }, + { + "epoch": 0.34811645622456433, + "grad_norm": 1.0859375, + "learning_rate": 0.001514628112893814, + "loss": 0.781, + "step": 5004 + }, + { + "epoch": 0.34818602386169956, + "grad_norm": 1.203125, + "learning_rate": 0.0015144348962767352, + "loss": 0.9506, + "step": 5005 + }, + { + "epoch": 0.3482555914988347, + "grad_norm": 1.0234375, + "learning_rate": 0.0015142416535392457, + "loss": 0.8873, + "step": 5006 + }, + { + "epoch": 0.34832515913596995, + "grad_norm": 1.0390625, + "learning_rate": 0.0015140483846911566, + "loss": 0.879, + "step": 5007 + }, + { + "epoch": 0.3483947267731052, + "grad_norm": 1.2890625, + "learning_rate": 0.001513855089742282, + "loss": 0.969, + "step": 5008 + }, + { + "epoch": 0.34846429441024035, + "grad_norm": 1.15625, + "learning_rate": 0.0015136617687024354, + "loss": 0.9758, + "step": 5009 + }, + { + "epoch": 0.3485338620473756, + "grad_norm": 0.984375, + "learning_rate": 0.0015134684215814338, + "loss": 0.7726, + "step": 5010 + }, + { + "epoch": 0.34860342968451075, + "grad_norm": 0.9921875, + "learning_rate": 0.0015132750483890934, + "loss": 0.853, + "step": 5011 + }, + { + "epoch": 0.34867299732164597, + "grad_norm": 1.0625, + "learning_rate": 0.0015130816491352333, + "loss": 0.879, + "step": 5012 + }, + { + "epoch": 0.3487425649587812, + "grad_norm": 1.1328125, + "learning_rate": 0.0015128882238296733, + "loss": 0.9025, + "step": 5013 + }, + { + "epoch": 0.34881213259591637, + "grad_norm": 0.9921875, + "learning_rate": 0.0015126947724822342, + "loss": 0.7474, + "step": 5014 + }, + { + "epoch": 0.3488817002330516, + "grad_norm": 1.0546875, + "learning_rate": 0.001512501295102739, + "loss": 0.5989, + "step": 5015 + }, + { + "epoch": 0.34895126787018677, + "grad_norm": 1.2890625, + "learning_rate": 0.0015123077917010108, + "loss": 1.0787, + "step": 5016 + }, + { + "epoch": 0.349020835507322, + "grad_norm": 1.2265625, + "learning_rate": 0.0015121142622868758, + "loss": 0.8695, + "step": 5017 + }, + { + "epoch": 0.3490904031444572, + "grad_norm": 1.1953125, + "learning_rate": 0.0015119207068701593, + "loss": 0.9261, + "step": 5018 + }, + { + "epoch": 0.3491599707815924, + "grad_norm": 0.96484375, + "learning_rate": 0.0015117271254606898, + "loss": 0.8961, + "step": 5019 + }, + { + "epoch": 0.3492295384187276, + "grad_norm": 1.1796875, + "learning_rate": 0.0015115335180682964, + "loss": 0.6799, + "step": 5020 + }, + { + "epoch": 0.34929910605586284, + "grad_norm": 1.046875, + "learning_rate": 0.0015113398847028086, + "loss": 0.7744, + "step": 5021 + }, + { + "epoch": 0.349368673692998, + "grad_norm": 1.0, + "learning_rate": 0.0015111462253740594, + "loss": 1.0222, + "step": 5022 + }, + { + "epoch": 0.34943824133013324, + "grad_norm": 1.0703125, + "learning_rate": 0.0015109525400918806, + "loss": 0.8946, + "step": 5023 + }, + { + "epoch": 0.3495078089672684, + "grad_norm": 1.203125, + "learning_rate": 0.0015107588288661078, + "loss": 0.9303, + "step": 5024 + }, + { + "epoch": 0.34957737660440363, + "grad_norm": 1.171875, + "learning_rate": 0.0015105650917065759, + "loss": 0.7738, + "step": 5025 + }, + { + "epoch": 0.34964694424153886, + "grad_norm": 1.0546875, + "learning_rate": 0.0015103713286231221, + "loss": 0.9991, + "step": 5026 + }, + { + "epoch": 0.34971651187867403, + "grad_norm": 0.94921875, + "learning_rate": 0.0015101775396255848, + "loss": 0.7473, + "step": 5027 + }, + { + "epoch": 0.34978607951580926, + "grad_norm": 1.03125, + "learning_rate": 0.0015099837247238032, + "loss": 0.908, + "step": 5028 + }, + { + "epoch": 0.3498556471529444, + "grad_norm": 1.375, + "learning_rate": 0.0015097898839276191, + "loss": 1.0346, + "step": 5029 + }, + { + "epoch": 0.34992521479007965, + "grad_norm": 1.0703125, + "learning_rate": 0.0015095960172468736, + "loss": 0.7678, + "step": 5030 + }, + { + "epoch": 0.3499947824272149, + "grad_norm": 0.9140625, + "learning_rate": 0.0015094021246914117, + "loss": 0.6541, + "step": 5031 + }, + { + "epoch": 0.35006435006435005, + "grad_norm": 1.1640625, + "learning_rate": 0.0015092082062710766, + "loss": 0.829, + "step": 5032 + }, + { + "epoch": 0.3501339177014853, + "grad_norm": 1.1171875, + "learning_rate": 0.0015090142619957158, + "loss": 1.0688, + "step": 5033 + }, + { + "epoch": 0.35020348533862045, + "grad_norm": 1.0625, + "learning_rate": 0.0015088202918751763, + "loss": 1.0056, + "step": 5034 + }, + { + "epoch": 0.35027305297575567, + "grad_norm": 1.2265625, + "learning_rate": 0.0015086262959193074, + "loss": 1.0583, + "step": 5035 + }, + { + "epoch": 0.3503426206128909, + "grad_norm": 1.03125, + "learning_rate": 0.0015084322741379585, + "loss": 0.9359, + "step": 5036 + }, + { + "epoch": 0.35041218825002607, + "grad_norm": 1.015625, + "learning_rate": 0.0015082382265409811, + "loss": 0.8333, + "step": 5037 + }, + { + "epoch": 0.3504817558871613, + "grad_norm": 0.88671875, + "learning_rate": 0.001508044153138229, + "loss": 0.7506, + "step": 5038 + }, + { + "epoch": 0.3505513235242965, + "grad_norm": 0.99609375, + "learning_rate": 0.001507850053939555, + "loss": 0.7149, + "step": 5039 + }, + { + "epoch": 0.3506208911614317, + "grad_norm": 1.1484375, + "learning_rate": 0.0015076559289548153, + "loss": 0.9226, + "step": 5040 + }, + { + "epoch": 0.3506904587985669, + "grad_norm": 1.0625, + "learning_rate": 0.001507461778193866, + "loss": 0.8957, + "step": 5041 + }, + { + "epoch": 0.3507600264357021, + "grad_norm": 1.1171875, + "learning_rate": 0.0015072676016665656, + "loss": 0.8803, + "step": 5042 + }, + { + "epoch": 0.3508295940728373, + "grad_norm": 0.86328125, + "learning_rate": 0.0015070733993827732, + "loss": 0.781, + "step": 5043 + }, + { + "epoch": 0.35089916170997254, + "grad_norm": 1.171875, + "learning_rate": 0.0015068791713523492, + "loss": 1.0205, + "step": 5044 + }, + { + "epoch": 0.3509687293471077, + "grad_norm": 0.98828125, + "learning_rate": 0.0015066849175851562, + "loss": 0.8004, + "step": 5045 + }, + { + "epoch": 0.35103829698424294, + "grad_norm": 1.2265625, + "learning_rate": 0.0015064906380910566, + "loss": 0.9252, + "step": 5046 + }, + { + "epoch": 0.3511078646213781, + "grad_norm": 1.0390625, + "learning_rate": 0.0015062963328799155, + "loss": 0.9361, + "step": 5047 + }, + { + "epoch": 0.35117743225851333, + "grad_norm": 0.97265625, + "learning_rate": 0.0015061020019615982, + "loss": 0.7734, + "step": 5048 + }, + { + "epoch": 0.35124699989564856, + "grad_norm": 1.1328125, + "learning_rate": 0.0015059076453459727, + "loss": 0.9989, + "step": 5049 + }, + { + "epoch": 0.35131656753278373, + "grad_norm": 1.3046875, + "learning_rate": 0.0015057132630429066, + "loss": 0.9318, + "step": 5050 + }, + { + "epoch": 0.35138613516991896, + "grad_norm": 1.0234375, + "learning_rate": 0.00150551885506227, + "loss": 0.9111, + "step": 5051 + }, + { + "epoch": 0.3514557028070542, + "grad_norm": 0.953125, + "learning_rate": 0.0015053244214139343, + "loss": 0.8417, + "step": 5052 + }, + { + "epoch": 0.35152527044418935, + "grad_norm": 1.2890625, + "learning_rate": 0.001505129962107771, + "loss": 1.3842, + "step": 5053 + }, + { + "epoch": 0.3515948380813246, + "grad_norm": 1.0625, + "learning_rate": 0.0015049354771536545, + "loss": 0.8825, + "step": 5054 + }, + { + "epoch": 0.35166440571845975, + "grad_norm": 1.1796875, + "learning_rate": 0.0015047409665614594, + "loss": 0.9776, + "step": 5055 + }, + { + "epoch": 0.351733973355595, + "grad_norm": 1.2265625, + "learning_rate": 0.0015045464303410623, + "loss": 0.9288, + "step": 5056 + }, + { + "epoch": 0.3518035409927302, + "grad_norm": 0.87109375, + "learning_rate": 0.0015043518685023403, + "loss": 0.6324, + "step": 5057 + }, + { + "epoch": 0.35187310862986537, + "grad_norm": 0.80078125, + "learning_rate": 0.0015041572810551727, + "loss": 0.7409, + "step": 5058 + }, + { + "epoch": 0.3519426762670006, + "grad_norm": 1.1328125, + "learning_rate": 0.0015039626680094398, + "loss": 0.8132, + "step": 5059 + }, + { + "epoch": 0.35201224390413577, + "grad_norm": 1.09375, + "learning_rate": 0.0015037680293750223, + "loss": 1.0056, + "step": 5060 + }, + { + "epoch": 0.352081811541271, + "grad_norm": 1.1875, + "learning_rate": 0.0015035733651618038, + "loss": 0.8673, + "step": 5061 + }, + { + "epoch": 0.3521513791784062, + "grad_norm": 1.2890625, + "learning_rate": 0.0015033786753796676, + "loss": 0.8713, + "step": 5062 + }, + { + "epoch": 0.3522209468155414, + "grad_norm": 1.265625, + "learning_rate": 0.0015031839600385, + "loss": 0.89, + "step": 5063 + }, + { + "epoch": 0.3522905144526766, + "grad_norm": 1.1640625, + "learning_rate": 0.0015029892191481867, + "loss": 0.9799, + "step": 5064 + }, + { + "epoch": 0.35236008208981184, + "grad_norm": 0.90625, + "learning_rate": 0.001502794452718616, + "loss": 0.58, + "step": 5065 + }, + { + "epoch": 0.352429649726947, + "grad_norm": 1.1171875, + "learning_rate": 0.0015025996607596777, + "loss": 0.9915, + "step": 5066 + }, + { + "epoch": 0.35249921736408224, + "grad_norm": 1.3828125, + "learning_rate": 0.001502404843281262, + "loss": 1.0942, + "step": 5067 + }, + { + "epoch": 0.3525687850012174, + "grad_norm": 0.953125, + "learning_rate": 0.0015022100002932606, + "loss": 0.8471, + "step": 5068 + }, + { + "epoch": 0.35263835263835264, + "grad_norm": 0.92578125, + "learning_rate": 0.0015020151318055662, + "loss": 0.7711, + "step": 5069 + }, + { + "epoch": 0.35270792027548786, + "grad_norm": 1.0078125, + "learning_rate": 0.0015018202378280746, + "loss": 0.8061, + "step": 5070 + }, + { + "epoch": 0.35277748791262303, + "grad_norm": 0.95703125, + "learning_rate": 0.0015016253183706798, + "loss": 0.8691, + "step": 5071 + }, + { + "epoch": 0.35284705554975826, + "grad_norm": 1.265625, + "learning_rate": 0.00150143037344328, + "loss": 1.1209, + "step": 5072 + }, + { + "epoch": 0.35291662318689343, + "grad_norm": 1.1796875, + "learning_rate": 0.0015012354030557735, + "loss": 1.1614, + "step": 5073 + }, + { + "epoch": 0.35298619082402866, + "grad_norm": 0.84765625, + "learning_rate": 0.0015010404072180595, + "loss": 0.9, + "step": 5074 + }, + { + "epoch": 0.3530557584611639, + "grad_norm": 1.140625, + "learning_rate": 0.001500845385940039, + "loss": 0.8916, + "step": 5075 + }, + { + "epoch": 0.35312532609829905, + "grad_norm": 1.0546875, + "learning_rate": 0.0015006503392316142, + "loss": 0.8861, + "step": 5076 + }, + { + "epoch": 0.3531948937354343, + "grad_norm": 1.3125, + "learning_rate": 0.001500455267102689, + "loss": 0.9147, + "step": 5077 + }, + { + "epoch": 0.3532644613725695, + "grad_norm": 0.9609375, + "learning_rate": 0.0015002601695631673, + "loss": 0.9116, + "step": 5078 + }, + { + "epoch": 0.3533340290097047, + "grad_norm": 1.1796875, + "learning_rate": 0.001500065046622956, + "loss": 0.7114, + "step": 5079 + }, + { + "epoch": 0.3534035966468399, + "grad_norm": 1.25, + "learning_rate": 0.0014998698982919621, + "loss": 1.2384, + "step": 5080 + }, + { + "epoch": 0.35347316428397507, + "grad_norm": 1.125, + "learning_rate": 0.0014996747245800942, + "loss": 0.9718, + "step": 5081 + }, + { + "epoch": 0.3535427319211103, + "grad_norm": 0.953125, + "learning_rate": 0.0014994795254972622, + "loss": 0.9054, + "step": 5082 + }, + { + "epoch": 0.3536122995582455, + "grad_norm": 1.328125, + "learning_rate": 0.0014992843010533776, + "loss": 0.7544, + "step": 5083 + }, + { + "epoch": 0.3536818671953807, + "grad_norm": 1.171875, + "learning_rate": 0.0014990890512583534, + "loss": 1.0095, + "step": 5084 + }, + { + "epoch": 0.3537514348325159, + "grad_norm": 0.97265625, + "learning_rate": 0.0014988937761221018, + "loss": 0.9885, + "step": 5085 + }, + { + "epoch": 0.3538210024696511, + "grad_norm": 1.265625, + "learning_rate": 0.0014986984756545393, + "loss": 1.0343, + "step": 5086 + }, + { + "epoch": 0.3538905701067863, + "grad_norm": 0.8828125, + "learning_rate": 0.0014985031498655817, + "loss": 0.7228, + "step": 5087 + }, + { + "epoch": 0.35396013774392154, + "grad_norm": 1.28125, + "learning_rate": 0.001498307798765147, + "loss": 1.2578, + "step": 5088 + }, + { + "epoch": 0.3540297053810567, + "grad_norm": 1.3671875, + "learning_rate": 0.0014981124223631538, + "loss": 0.9989, + "step": 5089 + }, + { + "epoch": 0.35409927301819194, + "grad_norm": 1.2265625, + "learning_rate": 0.0014979170206695226, + "loss": 0.8049, + "step": 5090 + }, + { + "epoch": 0.35416884065532717, + "grad_norm": 1.1015625, + "learning_rate": 0.0014977215936941746, + "loss": 0.9737, + "step": 5091 + }, + { + "epoch": 0.35423840829246234, + "grad_norm": 1.015625, + "learning_rate": 0.0014975261414470328, + "loss": 0.8074, + "step": 5092 + }, + { + "epoch": 0.35430797592959756, + "grad_norm": 0.84765625, + "learning_rate": 0.0014973306639380214, + "loss": 0.7584, + "step": 5093 + }, + { + "epoch": 0.35437754356673273, + "grad_norm": 1.1484375, + "learning_rate": 0.0014971351611770653, + "loss": 0.9073, + "step": 5094 + }, + { + "epoch": 0.35444711120386796, + "grad_norm": 1.3125, + "learning_rate": 0.0014969396331740916, + "loss": 0.9608, + "step": 5095 + }, + { + "epoch": 0.3545166788410032, + "grad_norm": 1.234375, + "learning_rate": 0.0014967440799390284, + "loss": 1.0383, + "step": 5096 + }, + { + "epoch": 0.35458624647813836, + "grad_norm": 1.0703125, + "learning_rate": 0.0014965485014818043, + "loss": 0.8309, + "step": 5097 + }, + { + "epoch": 0.3546558141152736, + "grad_norm": 0.98046875, + "learning_rate": 0.0014963528978123501, + "loss": 0.6906, + "step": 5098 + }, + { + "epoch": 0.35472538175240875, + "grad_norm": 1.015625, + "learning_rate": 0.0014961572689405976, + "loss": 0.7935, + "step": 5099 + }, + { + "epoch": 0.354794949389544, + "grad_norm": 1.0234375, + "learning_rate": 0.0014959616148764799, + "loss": 0.7032, + "step": 5100 + }, + { + "epoch": 0.3548645170266792, + "grad_norm": 1.2578125, + "learning_rate": 0.001495765935629931, + "loss": 0.8586, + "step": 5101 + }, + { + "epoch": 0.3549340846638144, + "grad_norm": 1.3515625, + "learning_rate": 0.0014955702312108867, + "loss": 0.8868, + "step": 5102 + }, + { + "epoch": 0.3550036523009496, + "grad_norm": 1.34375, + "learning_rate": 0.0014953745016292844, + "loss": 0.9855, + "step": 5103 + }, + { + "epoch": 0.3550732199380848, + "grad_norm": 0.99609375, + "learning_rate": 0.0014951787468950612, + "loss": 0.9177, + "step": 5104 + }, + { + "epoch": 0.35514278757522, + "grad_norm": 0.9921875, + "learning_rate": 0.0014949829670181573, + "loss": 0.6573, + "step": 5105 + }, + { + "epoch": 0.3552123552123552, + "grad_norm": 0.90625, + "learning_rate": 0.0014947871620085134, + "loss": 0.8794, + "step": 5106 + }, + { + "epoch": 0.3552819228494904, + "grad_norm": 1.0859375, + "learning_rate": 0.0014945913318760715, + "loss": 0.8961, + "step": 5107 + }, + { + "epoch": 0.3553514904866256, + "grad_norm": 1.0234375, + "learning_rate": 0.0014943954766307743, + "loss": 0.8715, + "step": 5108 + }, + { + "epoch": 0.35542105812376085, + "grad_norm": 1.2578125, + "learning_rate": 0.0014941995962825668, + "loss": 0.9822, + "step": 5109 + }, + { + "epoch": 0.355490625760896, + "grad_norm": 1.4140625, + "learning_rate": 0.0014940036908413948, + "loss": 0.9308, + "step": 5110 + }, + { + "epoch": 0.35556019339803124, + "grad_norm": 0.94140625, + "learning_rate": 0.0014938077603172052, + "loss": 0.6205, + "step": 5111 + }, + { + "epoch": 0.3556297610351664, + "grad_norm": 1.2421875, + "learning_rate": 0.0014936118047199467, + "loss": 0.8679, + "step": 5112 + }, + { + "epoch": 0.35569932867230164, + "grad_norm": 1.1328125, + "learning_rate": 0.0014934158240595687, + "loss": 0.913, + "step": 5113 + }, + { + "epoch": 0.35576889630943687, + "grad_norm": 1.1484375, + "learning_rate": 0.0014932198183460223, + "loss": 0.7827, + "step": 5114 + }, + { + "epoch": 0.35583846394657204, + "grad_norm": 1.015625, + "learning_rate": 0.0014930237875892594, + "loss": 0.7733, + "step": 5115 + }, + { + "epoch": 0.35590803158370726, + "grad_norm": 0.9140625, + "learning_rate": 0.0014928277317992338, + "loss": 0.686, + "step": 5116 + }, + { + "epoch": 0.3559775992208425, + "grad_norm": 1.1796875, + "learning_rate": 0.0014926316509858996, + "loss": 0.8916, + "step": 5117 + }, + { + "epoch": 0.35604716685797766, + "grad_norm": 0.9140625, + "learning_rate": 0.0014924355451592134, + "loss": 0.9492, + "step": 5118 + }, + { + "epoch": 0.3561167344951129, + "grad_norm": 0.8828125, + "learning_rate": 0.0014922394143291322, + "loss": 0.9, + "step": 5119 + }, + { + "epoch": 0.35618630213224806, + "grad_norm": 1.21875, + "learning_rate": 0.0014920432585056147, + "loss": 0.9832, + "step": 5120 + }, + { + "epoch": 0.3562558697693833, + "grad_norm": 0.9921875, + "learning_rate": 0.001491847077698621, + "loss": 0.7843, + "step": 5121 + }, + { + "epoch": 0.3563254374065185, + "grad_norm": 1.0078125, + "learning_rate": 0.001491650871918111, + "loss": 0.829, + "step": 5122 + }, + { + "epoch": 0.3563950050436537, + "grad_norm": 1.140625, + "learning_rate": 0.0014914546411740487, + "loss": 0.9153, + "step": 5123 + }, + { + "epoch": 0.3564645726807889, + "grad_norm": 1.0078125, + "learning_rate": 0.001491258385476396, + "loss": 0.8968, + "step": 5124 + }, + { + "epoch": 0.3565341403179241, + "grad_norm": 1.234375, + "learning_rate": 0.001491062104835119, + "loss": 0.9404, + "step": 5125 + }, + { + "epoch": 0.3566037079550593, + "grad_norm": 1.421875, + "learning_rate": 0.0014908657992601833, + "loss": 1.2725, + "step": 5126 + }, + { + "epoch": 0.3566732755921945, + "grad_norm": 1.0546875, + "learning_rate": 0.0014906694687615567, + "loss": 0.7339, + "step": 5127 + }, + { + "epoch": 0.3567428432293297, + "grad_norm": 1.453125, + "learning_rate": 0.0014904731133492076, + "loss": 0.813, + "step": 5128 + }, + { + "epoch": 0.3568124108664649, + "grad_norm": 1.0234375, + "learning_rate": 0.001490276733033106, + "loss": 0.8772, + "step": 5129 + }, + { + "epoch": 0.35688197850360015, + "grad_norm": 1.046875, + "learning_rate": 0.0014900803278232227, + "loss": 0.7159, + "step": 5130 + }, + { + "epoch": 0.3569515461407353, + "grad_norm": 0.94921875, + "learning_rate": 0.0014898838977295311, + "loss": 0.8305, + "step": 5131 + }, + { + "epoch": 0.35702111377787055, + "grad_norm": 0.9296875, + "learning_rate": 0.0014896874427620039, + "loss": 0.8337, + "step": 5132 + }, + { + "epoch": 0.3570906814150057, + "grad_norm": 1.0078125, + "learning_rate": 0.0014894909629306168, + "loss": 0.759, + "step": 5133 + }, + { + "epoch": 0.35716024905214094, + "grad_norm": 1.296875, + "learning_rate": 0.001489294458245346, + "loss": 1.0684, + "step": 5134 + }, + { + "epoch": 0.35722981668927617, + "grad_norm": 1.1484375, + "learning_rate": 0.0014890979287161684, + "loss": 0.8084, + "step": 5135 + }, + { + "epoch": 0.35729938432641134, + "grad_norm": 0.9609375, + "learning_rate": 0.0014889013743530632, + "loss": 0.8792, + "step": 5136 + }, + { + "epoch": 0.35736895196354657, + "grad_norm": 1.0625, + "learning_rate": 0.001488704795166011, + "loss": 0.8146, + "step": 5137 + }, + { + "epoch": 0.35743851960068174, + "grad_norm": 1.140625, + "learning_rate": 0.001488508191164992, + "loss": 0.7602, + "step": 5138 + }, + { + "epoch": 0.35750808723781696, + "grad_norm": 1.2578125, + "learning_rate": 0.0014883115623599897, + "loss": 1.1143, + "step": 5139 + }, + { + "epoch": 0.3575776548749522, + "grad_norm": 0.92578125, + "learning_rate": 0.0014881149087609873, + "loss": 0.8189, + "step": 5140 + }, + { + "epoch": 0.35764722251208736, + "grad_norm": 1.0859375, + "learning_rate": 0.0014879182303779701, + "loss": 0.8648, + "step": 5141 + }, + { + "epoch": 0.3577167901492226, + "grad_norm": 1.546875, + "learning_rate": 0.0014877215272209245, + "loss": 1.0972, + "step": 5142 + }, + { + "epoch": 0.3577863577863578, + "grad_norm": 1.109375, + "learning_rate": 0.0014875247992998382, + "loss": 0.8679, + "step": 5143 + }, + { + "epoch": 0.357855925423493, + "grad_norm": 1.3046875, + "learning_rate": 0.0014873280466247, + "loss": 0.907, + "step": 5144 + }, + { + "epoch": 0.3579254930606282, + "grad_norm": 0.859375, + "learning_rate": 0.0014871312692054995, + "loss": 0.8558, + "step": 5145 + }, + { + "epoch": 0.3579950606977634, + "grad_norm": 0.78125, + "learning_rate": 0.0014869344670522286, + "loss": 0.607, + "step": 5146 + }, + { + "epoch": 0.3580646283348986, + "grad_norm": 1.0, + "learning_rate": 0.00148673764017488, + "loss": 0.8488, + "step": 5147 + }, + { + "epoch": 0.35813419597203383, + "grad_norm": 1.015625, + "learning_rate": 0.0014865407885834472, + "loss": 0.8936, + "step": 5148 + }, + { + "epoch": 0.358203763609169, + "grad_norm": 0.98046875, + "learning_rate": 0.0014863439122879253, + "loss": 0.8855, + "step": 5149 + }, + { + "epoch": 0.3582733312463042, + "grad_norm": 1.0234375, + "learning_rate": 0.0014861470112983116, + "loss": 0.8521, + "step": 5150 + }, + { + "epoch": 0.3583428988834394, + "grad_norm": 1.125, + "learning_rate": 0.0014859500856246024, + "loss": 1.1338, + "step": 5151 + }, + { + "epoch": 0.3584124665205746, + "grad_norm": 1.3046875, + "learning_rate": 0.0014857531352767972, + "loss": 1.0693, + "step": 5152 + }, + { + "epoch": 0.35848203415770985, + "grad_norm": 0.97265625, + "learning_rate": 0.0014855561602648965, + "loss": 0.9105, + "step": 5153 + }, + { + "epoch": 0.358551601794845, + "grad_norm": 1.125, + "learning_rate": 0.0014853591605989013, + "loss": 0.9043, + "step": 5154 + }, + { + "epoch": 0.35862116943198025, + "grad_norm": 1.03125, + "learning_rate": 0.0014851621362888142, + "loss": 0.9153, + "step": 5155 + }, + { + "epoch": 0.3586907370691155, + "grad_norm": 0.98046875, + "learning_rate": 0.001484965087344639, + "loss": 1.0623, + "step": 5156 + }, + { + "epoch": 0.35876030470625064, + "grad_norm": 1.1796875, + "learning_rate": 0.0014847680137763815, + "loss": 1.1121, + "step": 5157 + }, + { + "epoch": 0.35882987234338587, + "grad_norm": 1.0, + "learning_rate": 0.0014845709155940474, + "loss": 0.8297, + "step": 5158 + }, + { + "epoch": 0.35889943998052104, + "grad_norm": 0.828125, + "learning_rate": 0.0014843737928076448, + "loss": 0.9511, + "step": 5159 + }, + { + "epoch": 0.35896900761765627, + "grad_norm": 1.34375, + "learning_rate": 0.0014841766454271824, + "loss": 1.129, + "step": 5160 + }, + { + "epoch": 0.3590385752547915, + "grad_norm": 1.1796875, + "learning_rate": 0.0014839794734626704, + "loss": 0.8025, + "step": 5161 + }, + { + "epoch": 0.35910814289192666, + "grad_norm": 1.3984375, + "learning_rate": 0.00148378227692412, + "loss": 0.8423, + "step": 5162 + }, + { + "epoch": 0.3591777105290619, + "grad_norm": 1.125, + "learning_rate": 0.001483585055821544, + "loss": 1.051, + "step": 5163 + }, + { + "epoch": 0.35924727816619706, + "grad_norm": 1.3828125, + "learning_rate": 0.0014833878101649565, + "loss": 0.9794, + "step": 5164 + }, + { + "epoch": 0.3593168458033323, + "grad_norm": 1.0703125, + "learning_rate": 0.0014831905399643724, + "loss": 0.9215, + "step": 5165 + }, + { + "epoch": 0.3593864134404675, + "grad_norm": 1.0625, + "learning_rate": 0.001482993245229808, + "loss": 0.7982, + "step": 5166 + }, + { + "epoch": 0.3594559810776027, + "grad_norm": 1.2421875, + "learning_rate": 0.0014827959259712813, + "loss": 0.9635, + "step": 5167 + }, + { + "epoch": 0.3595255487147379, + "grad_norm": 1.2421875, + "learning_rate": 0.0014825985821988108, + "loss": 1.0276, + "step": 5168 + }, + { + "epoch": 0.35959511635187313, + "grad_norm": 1.078125, + "learning_rate": 0.001482401213922417, + "loss": 0.9149, + "step": 5169 + }, + { + "epoch": 0.3596646839890083, + "grad_norm": 1.265625, + "learning_rate": 0.0014822038211521208, + "loss": 1.3285, + "step": 5170 + }, + { + "epoch": 0.35973425162614353, + "grad_norm": 0.88671875, + "learning_rate": 0.0014820064038979452, + "loss": 0.7815, + "step": 5171 + }, + { + "epoch": 0.3598038192632787, + "grad_norm": 1.015625, + "learning_rate": 0.0014818089621699139, + "loss": 0.799, + "step": 5172 + }, + { + "epoch": 0.3598733869004139, + "grad_norm": 1.1328125, + "learning_rate": 0.0014816114959780517, + "loss": 1.0051, + "step": 5173 + }, + { + "epoch": 0.35994295453754915, + "grad_norm": 1.046875, + "learning_rate": 0.0014814140053323855, + "loss": 0.687, + "step": 5174 + }, + { + "epoch": 0.3600125221746843, + "grad_norm": 1.125, + "learning_rate": 0.0014812164902429426, + "loss": 0.8509, + "step": 5175 + }, + { + "epoch": 0.36008208981181955, + "grad_norm": 0.83203125, + "learning_rate": 0.0014810189507197518, + "loss": 0.561, + "step": 5176 + }, + { + "epoch": 0.3601516574489547, + "grad_norm": 1.0859375, + "learning_rate": 0.0014808213867728434, + "loss": 0.8071, + "step": 5177 + }, + { + "epoch": 0.36022122508608995, + "grad_norm": 0.92578125, + "learning_rate": 0.0014806237984122481, + "loss": 0.6641, + "step": 5178 + }, + { + "epoch": 0.3602907927232252, + "grad_norm": 0.99609375, + "learning_rate": 0.001480426185647999, + "loss": 0.7432, + "step": 5179 + }, + { + "epoch": 0.36036036036036034, + "grad_norm": 1.0234375, + "learning_rate": 0.0014802285484901297, + "loss": 0.8742, + "step": 5180 + }, + { + "epoch": 0.36042992799749557, + "grad_norm": 1.265625, + "learning_rate": 0.0014800308869486753, + "loss": 0.886, + "step": 5181 + }, + { + "epoch": 0.3604994956346308, + "grad_norm": 1.21875, + "learning_rate": 0.0014798332010336722, + "loss": 0.9504, + "step": 5182 + }, + { + "epoch": 0.36056906327176597, + "grad_norm": 1.140625, + "learning_rate": 0.0014796354907551574, + "loss": 0.8872, + "step": 5183 + }, + { + "epoch": 0.3606386309089012, + "grad_norm": 1.40625, + "learning_rate": 0.00147943775612317, + "loss": 1.0139, + "step": 5184 + }, + { + "epoch": 0.36070819854603636, + "grad_norm": 1.015625, + "learning_rate": 0.00147923999714775, + "loss": 0.8477, + "step": 5185 + }, + { + "epoch": 0.3607777661831716, + "grad_norm": 1.28125, + "learning_rate": 0.0014790422138389384, + "loss": 0.9271, + "step": 5186 + }, + { + "epoch": 0.3608473338203068, + "grad_norm": 0.953125, + "learning_rate": 0.0014788444062067776, + "loss": 0.8271, + "step": 5187 + }, + { + "epoch": 0.360916901457442, + "grad_norm": 1.2578125, + "learning_rate": 0.0014786465742613116, + "loss": 1.1401, + "step": 5188 + }, + { + "epoch": 0.3609864690945772, + "grad_norm": 1.046875, + "learning_rate": 0.001478448718012585, + "loss": 0.8429, + "step": 5189 + }, + { + "epoch": 0.3610560367317124, + "grad_norm": 1.15625, + "learning_rate": 0.001478250837470644, + "loss": 0.9735, + "step": 5190 + }, + { + "epoch": 0.3611256043688476, + "grad_norm": 1.1171875, + "learning_rate": 0.0014780529326455362, + "loss": 0.8389, + "step": 5191 + }, + { + "epoch": 0.36119517200598283, + "grad_norm": 1.2109375, + "learning_rate": 0.00147785500354731, + "loss": 0.831, + "step": 5192 + }, + { + "epoch": 0.361264739643118, + "grad_norm": 1.140625, + "learning_rate": 0.0014776570501860153, + "loss": 1.0879, + "step": 5193 + }, + { + "epoch": 0.36133430728025323, + "grad_norm": 0.89453125, + "learning_rate": 0.0014774590725717032, + "loss": 0.8871, + "step": 5194 + }, + { + "epoch": 0.36140387491738846, + "grad_norm": 0.90234375, + "learning_rate": 0.0014772610707144257, + "loss": 0.9803, + "step": 5195 + }, + { + "epoch": 0.3614734425545236, + "grad_norm": 1.0234375, + "learning_rate": 0.001477063044624237, + "loss": 0.8462, + "step": 5196 + }, + { + "epoch": 0.36154301019165885, + "grad_norm": 1.546875, + "learning_rate": 0.0014768649943111911, + "loss": 1.0829, + "step": 5197 + }, + { + "epoch": 0.361612577828794, + "grad_norm": 1.0859375, + "learning_rate": 0.0014766669197853446, + "loss": 0.7897, + "step": 5198 + }, + { + "epoch": 0.36168214546592925, + "grad_norm": 1.03125, + "learning_rate": 0.0014764688210567546, + "loss": 0.7425, + "step": 5199 + }, + { + "epoch": 0.3617517131030645, + "grad_norm": 1.03125, + "learning_rate": 0.0014762706981354791, + "loss": 0.9892, + "step": 5200 + }, + { + "epoch": 0.36182128074019965, + "grad_norm": 1.0859375, + "learning_rate": 0.0014760725510315784, + "loss": 0.8421, + "step": 5201 + }, + { + "epoch": 0.3618908483773349, + "grad_norm": 1.140625, + "learning_rate": 0.001475874379755113, + "loss": 0.8271, + "step": 5202 + }, + { + "epoch": 0.36196041601447004, + "grad_norm": 1.1328125, + "learning_rate": 0.0014756761843161452, + "loss": 1.192, + "step": 5203 + }, + { + "epoch": 0.36202998365160527, + "grad_norm": 0.83203125, + "learning_rate": 0.0014754779647247385, + "loss": 0.6577, + "step": 5204 + }, + { + "epoch": 0.3620995512887405, + "grad_norm": 1.28125, + "learning_rate": 0.0014752797209909572, + "loss": 0.7588, + "step": 5205 + }, + { + "epoch": 0.36216911892587567, + "grad_norm": 1.0625, + "learning_rate": 0.0014750814531248673, + "loss": 0.7133, + "step": 5206 + }, + { + "epoch": 0.3622386865630109, + "grad_norm": 0.984375, + "learning_rate": 0.001474883161136536, + "loss": 0.9246, + "step": 5207 + }, + { + "epoch": 0.3623082542001461, + "grad_norm": 0.8515625, + "learning_rate": 0.001474684845036031, + "loss": 0.6941, + "step": 5208 + }, + { + "epoch": 0.3623778218372813, + "grad_norm": 1.0234375, + "learning_rate": 0.0014744865048334221, + "loss": 0.777, + "step": 5209 + }, + { + "epoch": 0.3624473894744165, + "grad_norm": 1.15625, + "learning_rate": 0.0014742881405387803, + "loss": 0.7638, + "step": 5210 + }, + { + "epoch": 0.3625169571115517, + "grad_norm": 1.078125, + "learning_rate": 0.0014740897521621772, + "loss": 0.8489, + "step": 5211 + }, + { + "epoch": 0.3625865247486869, + "grad_norm": 1.28125, + "learning_rate": 0.0014738913397136862, + "loss": 0.8281, + "step": 5212 + }, + { + "epoch": 0.36265609238582214, + "grad_norm": 1.0859375, + "learning_rate": 0.0014736929032033816, + "loss": 0.8747, + "step": 5213 + }, + { + "epoch": 0.3627256600229573, + "grad_norm": 1.0234375, + "learning_rate": 0.0014734944426413388, + "loss": 0.7183, + "step": 5214 + }, + { + "epoch": 0.36279522766009253, + "grad_norm": 1.03125, + "learning_rate": 0.001473295958037635, + "loss": 0.9431, + "step": 5215 + }, + { + "epoch": 0.3628647952972277, + "grad_norm": 1.1484375, + "learning_rate": 0.0014730974494023478, + "loss": 0.9157, + "step": 5216 + }, + { + "epoch": 0.36293436293436293, + "grad_norm": 1.078125, + "learning_rate": 0.001472898916745557, + "loss": 0.9453, + "step": 5217 + }, + { + "epoch": 0.36300393057149816, + "grad_norm": 1.0234375, + "learning_rate": 0.0014727003600773425, + "loss": 0.6536, + "step": 5218 + }, + { + "epoch": 0.3630734982086333, + "grad_norm": 1.0546875, + "learning_rate": 0.0014725017794077863, + "loss": 0.9153, + "step": 5219 + }, + { + "epoch": 0.36314306584576855, + "grad_norm": 1.1484375, + "learning_rate": 0.0014723031747469713, + "loss": 0.8485, + "step": 5220 + }, + { + "epoch": 0.3632126334829038, + "grad_norm": 1.0859375, + "learning_rate": 0.001472104546104982, + "loss": 0.7691, + "step": 5221 + }, + { + "epoch": 0.36328220112003895, + "grad_norm": 1.1875, + "learning_rate": 0.0014719058934919034, + "loss": 0.9719, + "step": 5222 + }, + { + "epoch": 0.3633517687571742, + "grad_norm": 1.28125, + "learning_rate": 0.0014717072169178219, + "loss": 0.9064, + "step": 5223 + }, + { + "epoch": 0.36342133639430935, + "grad_norm": 1.1875, + "learning_rate": 0.0014715085163928255, + "loss": 0.9613, + "step": 5224 + }, + { + "epoch": 0.3634909040314446, + "grad_norm": 1.0625, + "learning_rate": 0.0014713097919270032, + "loss": 1.0228, + "step": 5225 + }, + { + "epoch": 0.3635604716685798, + "grad_norm": 1.1796875, + "learning_rate": 0.0014711110435304455, + "loss": 0.9179, + "step": 5226 + }, + { + "epoch": 0.36363003930571497, + "grad_norm": 1.0625, + "learning_rate": 0.0014709122712132433, + "loss": 0.7914, + "step": 5227 + }, + { + "epoch": 0.3636996069428502, + "grad_norm": 1.1328125, + "learning_rate": 0.0014707134749854898, + "loss": 0.7456, + "step": 5228 + }, + { + "epoch": 0.36376917457998537, + "grad_norm": 1.15625, + "learning_rate": 0.0014705146548572782, + "loss": 1.0304, + "step": 5229 + }, + { + "epoch": 0.3638387422171206, + "grad_norm": 1.0625, + "learning_rate": 0.0014703158108387044, + "loss": 1.0316, + "step": 5230 + }, + { + "epoch": 0.3639083098542558, + "grad_norm": 1.078125, + "learning_rate": 0.0014701169429398643, + "loss": 0.8354, + "step": 5231 + }, + { + "epoch": 0.363977877491391, + "grad_norm": 1.0625, + "learning_rate": 0.0014699180511708553, + "loss": 0.7559, + "step": 5232 + }, + { + "epoch": 0.3640474451285262, + "grad_norm": 1.4140625, + "learning_rate": 0.0014697191355417761, + "loss": 1.1107, + "step": 5233 + }, + { + "epoch": 0.36411701276566144, + "grad_norm": 1.203125, + "learning_rate": 0.0014695201960627266, + "loss": 0.8169, + "step": 5234 + }, + { + "epoch": 0.3641865804027966, + "grad_norm": 1.0390625, + "learning_rate": 0.0014693212327438086, + "loss": 0.6953, + "step": 5235 + }, + { + "epoch": 0.36425614803993184, + "grad_norm": 1.0859375, + "learning_rate": 0.0014691222455951235, + "loss": 0.9619, + "step": 5236 + }, + { + "epoch": 0.364325715677067, + "grad_norm": 1.03125, + "learning_rate": 0.0014689232346267755, + "loss": 0.9999, + "step": 5237 + }, + { + "epoch": 0.36439528331420223, + "grad_norm": 1.296875, + "learning_rate": 0.0014687241998488695, + "loss": 1.0531, + "step": 5238 + }, + { + "epoch": 0.36446485095133746, + "grad_norm": 0.96484375, + "learning_rate": 0.0014685251412715106, + "loss": 0.7005, + "step": 5239 + }, + { + "epoch": 0.36453441858847263, + "grad_norm": 1.078125, + "learning_rate": 0.0014683260589048069, + "loss": 0.7436, + "step": 5240 + }, + { + "epoch": 0.36460398622560786, + "grad_norm": 1.3515625, + "learning_rate": 0.0014681269527588663, + "loss": 1.1294, + "step": 5241 + }, + { + "epoch": 0.364673553862743, + "grad_norm": 0.9296875, + "learning_rate": 0.001467927822843799, + "loss": 0.7699, + "step": 5242 + }, + { + "epoch": 0.36474312149987825, + "grad_norm": 1.109375, + "learning_rate": 0.0014677286691697146, + "loss": 0.827, + "step": 5243 + }, + { + "epoch": 0.3648126891370135, + "grad_norm": 1.09375, + "learning_rate": 0.0014675294917467269, + "loss": 0.8862, + "step": 5244 + }, + { + "epoch": 0.36488225677414865, + "grad_norm": 1.2890625, + "learning_rate": 0.0014673302905849476, + "loss": 0.9959, + "step": 5245 + }, + { + "epoch": 0.3649518244112839, + "grad_norm": 0.96484375, + "learning_rate": 0.0014671310656944915, + "loss": 0.7926, + "step": 5246 + }, + { + "epoch": 0.3650213920484191, + "grad_norm": 0.9609375, + "learning_rate": 0.0014669318170854747, + "loss": 0.8058, + "step": 5247 + }, + { + "epoch": 0.3650909596855543, + "grad_norm": 0.96484375, + "learning_rate": 0.0014667325447680136, + "loss": 0.6047, + "step": 5248 + }, + { + "epoch": 0.3651605273226895, + "grad_norm": 1.671875, + "learning_rate": 0.0014665332487522262, + "loss": 1.2125, + "step": 5249 + }, + { + "epoch": 0.36523009495982467, + "grad_norm": 1.4296875, + "learning_rate": 0.001466333929048232, + "loss": 1.0838, + "step": 5250 + }, + { + "epoch": 0.3652996625969599, + "grad_norm": 0.921875, + "learning_rate": 0.0014661345856661517, + "loss": 0.7864, + "step": 5251 + }, + { + "epoch": 0.3653692302340951, + "grad_norm": 1.0390625, + "learning_rate": 0.0014659352186161064, + "loss": 1.1664, + "step": 5252 + }, + { + "epoch": 0.3654387978712303, + "grad_norm": 1.3828125, + "learning_rate": 0.0014657358279082193, + "loss": 0.9814, + "step": 5253 + }, + { + "epoch": 0.3655083655083655, + "grad_norm": 1.2734375, + "learning_rate": 0.0014655364135526142, + "loss": 0.9766, + "step": 5254 + }, + { + "epoch": 0.3655779331455007, + "grad_norm": 1.0625, + "learning_rate": 0.0014653369755594165, + "loss": 0.8501, + "step": 5255 + }, + { + "epoch": 0.3656475007826359, + "grad_norm": 1.15625, + "learning_rate": 0.001465137513938753, + "loss": 0.9529, + "step": 5256 + }, + { + "epoch": 0.36571706841977114, + "grad_norm": 1.015625, + "learning_rate": 0.0014649380287007504, + "loss": 0.8705, + "step": 5257 + }, + { + "epoch": 0.3657866360569063, + "grad_norm": 1.078125, + "learning_rate": 0.0014647385198555388, + "loss": 0.909, + "step": 5258 + }, + { + "epoch": 0.36585620369404154, + "grad_norm": 1.1328125, + "learning_rate": 0.001464538987413247, + "loss": 1.0303, + "step": 5259 + }, + { + "epoch": 0.36592577133117676, + "grad_norm": 1.328125, + "learning_rate": 0.0014643394313840076, + "loss": 1.0606, + "step": 5260 + }, + { + "epoch": 0.36599533896831193, + "grad_norm": 1.125, + "learning_rate": 0.0014641398517779517, + "loss": 0.8019, + "step": 5261 + }, + { + "epoch": 0.36606490660544716, + "grad_norm": 1.4140625, + "learning_rate": 0.0014639402486052138, + "loss": 0.9951, + "step": 5262 + }, + { + "epoch": 0.36613447424258233, + "grad_norm": 1.234375, + "learning_rate": 0.0014637406218759284, + "loss": 0.9101, + "step": 5263 + }, + { + "epoch": 0.36620404187971756, + "grad_norm": 1.0625, + "learning_rate": 0.0014635409716002314, + "loss": 0.8821, + "step": 5264 + }, + { + "epoch": 0.3662736095168528, + "grad_norm": 1.0234375, + "learning_rate": 0.001463341297788261, + "loss": 0.753, + "step": 5265 + }, + { + "epoch": 0.36634317715398795, + "grad_norm": 1.34375, + "learning_rate": 0.0014631416004501543, + "loss": 0.7014, + "step": 5266 + }, + { + "epoch": 0.3664127447911232, + "grad_norm": 1.1953125, + "learning_rate": 0.0014629418795960517, + "loss": 1.1205, + "step": 5267 + }, + { + "epoch": 0.36648231242825835, + "grad_norm": 1.140625, + "learning_rate": 0.001462742135236094, + "loss": 0.8285, + "step": 5268 + }, + { + "epoch": 0.3665518800653936, + "grad_norm": 1.1875, + "learning_rate": 0.001462542367380423, + "loss": 0.9696, + "step": 5269 + }, + { + "epoch": 0.3666214477025288, + "grad_norm": 1.203125, + "learning_rate": 0.001462342576039182, + "loss": 1.135, + "step": 5270 + }, + { + "epoch": 0.366691015339664, + "grad_norm": 1.125, + "learning_rate": 0.0014621427612225154, + "loss": 0.8164, + "step": 5271 + }, + { + "epoch": 0.3667605829767992, + "grad_norm": 1.078125, + "learning_rate": 0.0014619429229405685, + "loss": 0.8362, + "step": 5272 + }, + { + "epoch": 0.3668301506139344, + "grad_norm": 1.0625, + "learning_rate": 0.0014617430612034884, + "loss": 0.6728, + "step": 5273 + }, + { + "epoch": 0.3668997182510696, + "grad_norm": 1.171875, + "learning_rate": 0.0014615431760214232, + "loss": 0.7946, + "step": 5274 + }, + { + "epoch": 0.3669692858882048, + "grad_norm": 0.97265625, + "learning_rate": 0.0014613432674045216, + "loss": 0.9709, + "step": 5275 + }, + { + "epoch": 0.36703885352534, + "grad_norm": 1.1484375, + "learning_rate": 0.0014611433353629347, + "loss": 0.8764, + "step": 5276 + }, + { + "epoch": 0.3671084211624752, + "grad_norm": 1.0390625, + "learning_rate": 0.0014609433799068132, + "loss": 0.7207, + "step": 5277 + }, + { + "epoch": 0.36717798879961044, + "grad_norm": 1.15625, + "learning_rate": 0.0014607434010463103, + "loss": 0.8309, + "step": 5278 + }, + { + "epoch": 0.3672475564367456, + "grad_norm": 1.1796875, + "learning_rate": 0.0014605433987915797, + "loss": 0.7943, + "step": 5279 + }, + { + "epoch": 0.36731712407388084, + "grad_norm": 1.40625, + "learning_rate": 0.0014603433731527767, + "loss": 1.1244, + "step": 5280 + }, + { + "epoch": 0.367386691711016, + "grad_norm": 1.15625, + "learning_rate": 0.0014601433241400576, + "loss": 0.9917, + "step": 5281 + }, + { + "epoch": 0.36745625934815124, + "grad_norm": 0.92578125, + "learning_rate": 0.0014599432517635796, + "loss": 0.9159, + "step": 5282 + }, + { + "epoch": 0.36752582698528646, + "grad_norm": 1.03125, + "learning_rate": 0.0014597431560335018, + "loss": 0.8739, + "step": 5283 + }, + { + "epoch": 0.36759539462242163, + "grad_norm": 1.3203125, + "learning_rate": 0.0014595430369599837, + "loss": 1.2074, + "step": 5284 + }, + { + "epoch": 0.36766496225955686, + "grad_norm": 1.21875, + "learning_rate": 0.0014593428945531863, + "loss": 0.9927, + "step": 5285 + }, + { + "epoch": 0.3677345298966921, + "grad_norm": 1.2421875, + "learning_rate": 0.0014591427288232722, + "loss": 0.8677, + "step": 5286 + }, + { + "epoch": 0.36780409753382726, + "grad_norm": 1.0390625, + "learning_rate": 0.0014589425397804044, + "loss": 1.0091, + "step": 5287 + }, + { + "epoch": 0.3678736651709625, + "grad_norm": 0.97265625, + "learning_rate": 0.0014587423274347478, + "loss": 1.0127, + "step": 5288 + }, + { + "epoch": 0.36794323280809765, + "grad_norm": 1.0859375, + "learning_rate": 0.0014585420917964677, + "loss": 1.1267, + "step": 5289 + }, + { + "epoch": 0.3680128004452329, + "grad_norm": 1.1171875, + "learning_rate": 0.001458341832875732, + "loss": 0.7491, + "step": 5290 + }, + { + "epoch": 0.3680823680823681, + "grad_norm": 1.0390625, + "learning_rate": 0.0014581415506827078, + "loss": 0.7425, + "step": 5291 + }, + { + "epoch": 0.3681519357195033, + "grad_norm": 1.0, + "learning_rate": 0.0014579412452275654, + "loss": 0.721, + "step": 5292 + }, + { + "epoch": 0.3682215033566385, + "grad_norm": 1.296875, + "learning_rate": 0.0014577409165204742, + "loss": 0.6941, + "step": 5293 + }, + { + "epoch": 0.3682910709937737, + "grad_norm": 1.265625, + "learning_rate": 0.0014575405645716065, + "loss": 1.0086, + "step": 5294 + }, + { + "epoch": 0.3683606386309089, + "grad_norm": 0.953125, + "learning_rate": 0.0014573401893911353, + "loss": 0.7428, + "step": 5295 + }, + { + "epoch": 0.3684302062680441, + "grad_norm": 0.9296875, + "learning_rate": 0.0014571397909892343, + "loss": 0.7044, + "step": 5296 + }, + { + "epoch": 0.3684997739051793, + "grad_norm": 1.3671875, + "learning_rate": 0.001456939369376079, + "loss": 0.9907, + "step": 5297 + }, + { + "epoch": 0.3685693415423145, + "grad_norm": 0.9609375, + "learning_rate": 0.0014567389245618454, + "loss": 0.6828, + "step": 5298 + }, + { + "epoch": 0.36863890917944975, + "grad_norm": 1.09375, + "learning_rate": 0.001456538456556712, + "loss": 1.0632, + "step": 5299 + }, + { + "epoch": 0.3687084768165849, + "grad_norm": 1.09375, + "learning_rate": 0.0014563379653708562, + "loss": 0.7712, + "step": 5300 + }, + { + "epoch": 0.36877804445372014, + "grad_norm": 1.28125, + "learning_rate": 0.0014561374510144588, + "loss": 1.0056, + "step": 5301 + }, + { + "epoch": 0.3688476120908553, + "grad_norm": 1.09375, + "learning_rate": 0.001455936913497701, + "loss": 0.8905, + "step": 5302 + }, + { + "epoch": 0.36891717972799054, + "grad_norm": 1.203125, + "learning_rate": 0.0014557363528307646, + "loss": 0.9535, + "step": 5303 + }, + { + "epoch": 0.36898674736512577, + "grad_norm": 1.21875, + "learning_rate": 0.0014555357690238333, + "loss": 0.8259, + "step": 5304 + }, + { + "epoch": 0.36905631500226094, + "grad_norm": 1.125, + "learning_rate": 0.0014553351620870917, + "loss": 0.7287, + "step": 5305 + }, + { + "epoch": 0.36912588263939616, + "grad_norm": 1.421875, + "learning_rate": 0.001455134532030726, + "loss": 1.0635, + "step": 5306 + }, + { + "epoch": 0.36919545027653133, + "grad_norm": 1.359375, + "learning_rate": 0.0014549338788649223, + "loss": 1.118, + "step": 5307 + }, + { + "epoch": 0.36926501791366656, + "grad_norm": 1.3046875, + "learning_rate": 0.0014547332025998693, + "loss": 0.7552, + "step": 5308 + }, + { + "epoch": 0.3693345855508018, + "grad_norm": 1.3203125, + "learning_rate": 0.0014545325032457566, + "loss": 0.8597, + "step": 5309 + }, + { + "epoch": 0.36940415318793696, + "grad_norm": 1.0859375, + "learning_rate": 0.0014543317808127741, + "loss": 0.9539, + "step": 5310 + }, + { + "epoch": 0.3694737208250722, + "grad_norm": 1.125, + "learning_rate": 0.001454131035311114, + "loss": 0.8004, + "step": 5311 + }, + { + "epoch": 0.3695432884622074, + "grad_norm": 0.93359375, + "learning_rate": 0.001453930266750969, + "loss": 0.9075, + "step": 5312 + }, + { + "epoch": 0.3696128560993426, + "grad_norm": 1.03125, + "learning_rate": 0.001453729475142533, + "loss": 0.7649, + "step": 5313 + }, + { + "epoch": 0.3696824237364778, + "grad_norm": 1.0625, + "learning_rate": 0.0014535286604960007, + "loss": 0.8814, + "step": 5314 + }, + { + "epoch": 0.369751991373613, + "grad_norm": 0.9921875, + "learning_rate": 0.0014533278228215697, + "loss": 0.6539, + "step": 5315 + }, + { + "epoch": 0.3698215590107482, + "grad_norm": 1.1875, + "learning_rate": 0.0014531269621294366, + "loss": 0.967, + "step": 5316 + }, + { + "epoch": 0.36989112664788343, + "grad_norm": 0.9375, + "learning_rate": 0.0014529260784297998, + "loss": 0.7468, + "step": 5317 + }, + { + "epoch": 0.3699606942850186, + "grad_norm": 0.97265625, + "learning_rate": 0.0014527251717328603, + "loss": 0.7324, + "step": 5318 + }, + { + "epoch": 0.3700302619221538, + "grad_norm": 1.1015625, + "learning_rate": 0.0014525242420488178, + "loss": 0.7423, + "step": 5319 + }, + { + "epoch": 0.370099829559289, + "grad_norm": 0.85546875, + "learning_rate": 0.001452323289387876, + "loss": 0.6566, + "step": 5320 + }, + { + "epoch": 0.3701693971964242, + "grad_norm": 1.2734375, + "learning_rate": 0.0014521223137602367, + "loss": 0.976, + "step": 5321 + }, + { + "epoch": 0.37023896483355945, + "grad_norm": 0.7890625, + "learning_rate": 0.0014519213151761056, + "loss": 0.8218, + "step": 5322 + }, + { + "epoch": 0.3703085324706946, + "grad_norm": 1.2265625, + "learning_rate": 0.0014517202936456877, + "loss": 0.9544, + "step": 5323 + }, + { + "epoch": 0.37037810010782984, + "grad_norm": 1.125, + "learning_rate": 0.0014515192491791904, + "loss": 0.987, + "step": 5324 + }, + { + "epoch": 0.370447667744965, + "grad_norm": 1.09375, + "learning_rate": 0.0014513181817868215, + "loss": 1.1131, + "step": 5325 + }, + { + "epoch": 0.37051723538210024, + "grad_norm": 1.0390625, + "learning_rate": 0.0014511170914787899, + "loss": 0.7968, + "step": 5326 + }, + { + "epoch": 0.37058680301923547, + "grad_norm": 1.2109375, + "learning_rate": 0.0014509159782653063, + "loss": 1.0672, + "step": 5327 + }, + { + "epoch": 0.37065637065637064, + "grad_norm": 1.1796875, + "learning_rate": 0.001450714842156582, + "loss": 0.7135, + "step": 5328 + }, + { + "epoch": 0.37072593829350586, + "grad_norm": 1.2109375, + "learning_rate": 0.00145051368316283, + "loss": 0.7866, + "step": 5329 + }, + { + "epoch": 0.3707955059306411, + "grad_norm": 1.0234375, + "learning_rate": 0.0014503125012942637, + "loss": 0.9565, + "step": 5330 + }, + { + "epoch": 0.37086507356777626, + "grad_norm": 1.140625, + "learning_rate": 0.0014501112965610986, + "loss": 0.9084, + "step": 5331 + }, + { + "epoch": 0.3709346412049115, + "grad_norm": 1.0234375, + "learning_rate": 0.0014499100689735504, + "loss": 1.0043, + "step": 5332 + }, + { + "epoch": 0.37100420884204666, + "grad_norm": 0.99609375, + "learning_rate": 0.0014497088185418364, + "loss": 0.8889, + "step": 5333 + }, + { + "epoch": 0.3710737764791819, + "grad_norm": 0.96484375, + "learning_rate": 0.0014495075452761758, + "loss": 0.9213, + "step": 5334 + }, + { + "epoch": 0.3711433441163171, + "grad_norm": 1.1015625, + "learning_rate": 0.0014493062491867871, + "loss": 0.8642, + "step": 5335 + }, + { + "epoch": 0.3712129117534523, + "grad_norm": 1.2109375, + "learning_rate": 0.0014491049302838923, + "loss": 0.7673, + "step": 5336 + }, + { + "epoch": 0.3712824793905875, + "grad_norm": 0.921875, + "learning_rate": 0.0014489035885777125, + "loss": 0.9089, + "step": 5337 + }, + { + "epoch": 0.3713520470277227, + "grad_norm": 1.0390625, + "learning_rate": 0.0014487022240784713, + "loss": 0.8892, + "step": 5338 + }, + { + "epoch": 0.3714216146648579, + "grad_norm": 1.1484375, + "learning_rate": 0.0014485008367963927, + "loss": 0.8305, + "step": 5339 + }, + { + "epoch": 0.37149118230199313, + "grad_norm": 1.296875, + "learning_rate": 0.0014482994267417022, + "loss": 1.1563, + "step": 5340 + }, + { + "epoch": 0.3715607499391283, + "grad_norm": 1.03125, + "learning_rate": 0.0014480979939246266, + "loss": 0.7855, + "step": 5341 + }, + { + "epoch": 0.3716303175762635, + "grad_norm": 1.0234375, + "learning_rate": 0.001447896538355393, + "loss": 0.8052, + "step": 5342 + }, + { + "epoch": 0.37169988521339875, + "grad_norm": 1.0234375, + "learning_rate": 0.0014476950600442315, + "loss": 0.8226, + "step": 5343 + }, + { + "epoch": 0.3717694528505339, + "grad_norm": 1.078125, + "learning_rate": 0.0014474935590013704, + "loss": 0.7776, + "step": 5344 + }, + { + "epoch": 0.37183902048766915, + "grad_norm": 1.0078125, + "learning_rate": 0.0014472920352370426, + "loss": 0.7737, + "step": 5345 + }, + { + "epoch": 0.3719085881248043, + "grad_norm": 1.1484375, + "learning_rate": 0.0014470904887614795, + "loss": 0.8102, + "step": 5346 + }, + { + "epoch": 0.37197815576193954, + "grad_norm": 1.0078125, + "learning_rate": 0.001446888919584915, + "loss": 0.8435, + "step": 5347 + }, + { + "epoch": 0.37204772339907477, + "grad_norm": 0.98828125, + "learning_rate": 0.0014466873277175839, + "loss": 0.832, + "step": 5348 + }, + { + "epoch": 0.37211729103620994, + "grad_norm": 1.234375, + "learning_rate": 0.0014464857131697214, + "loss": 0.8213, + "step": 5349 + }, + { + "epoch": 0.37218685867334517, + "grad_norm": 1.1171875, + "learning_rate": 0.001446284075951565, + "loss": 0.6133, + "step": 5350 + }, + { + "epoch": 0.37225642631048034, + "grad_norm": 1.234375, + "learning_rate": 0.0014460824160733524, + "loss": 0.9, + "step": 5351 + }, + { + "epoch": 0.37232599394761556, + "grad_norm": 1.1484375, + "learning_rate": 0.0014458807335453235, + "loss": 0.6688, + "step": 5352 + }, + { + "epoch": 0.3723955615847508, + "grad_norm": 0.84765625, + "learning_rate": 0.0014456790283777182, + "loss": 0.7935, + "step": 5353 + }, + { + "epoch": 0.37246512922188596, + "grad_norm": 1.046875, + "learning_rate": 0.001445477300580778, + "loss": 0.8331, + "step": 5354 + }, + { + "epoch": 0.3725346968590212, + "grad_norm": 1.0703125, + "learning_rate": 0.001445275550164746, + "loss": 1.0071, + "step": 5355 + }, + { + "epoch": 0.3726042644961564, + "grad_norm": 1.15625, + "learning_rate": 0.0014450737771398662, + "loss": 0.6313, + "step": 5356 + }, + { + "epoch": 0.3726738321332916, + "grad_norm": 1.328125, + "learning_rate": 0.0014448719815163833, + "loss": 0.9666, + "step": 5357 + }, + { + "epoch": 0.3727433997704268, + "grad_norm": 1.21875, + "learning_rate": 0.0014446701633045432, + "loss": 0.989, + "step": 5358 + }, + { + "epoch": 0.372812967407562, + "grad_norm": 1.109375, + "learning_rate": 0.0014444683225145938, + "loss": 0.795, + "step": 5359 + }, + { + "epoch": 0.3728825350446972, + "grad_norm": 1.171875, + "learning_rate": 0.001444266459156783, + "loss": 0.9144, + "step": 5360 + }, + { + "epoch": 0.37295210268183243, + "grad_norm": 1.0390625, + "learning_rate": 0.0014440645732413607, + "loss": 1.0063, + "step": 5361 + }, + { + "epoch": 0.3730216703189676, + "grad_norm": 1.0234375, + "learning_rate": 0.0014438626647785779, + "loss": 0.8353, + "step": 5362 + }, + { + "epoch": 0.37309123795610283, + "grad_norm": 1.0859375, + "learning_rate": 0.0014436607337786859, + "loss": 0.707, + "step": 5363 + }, + { + "epoch": 0.373160805593238, + "grad_norm": 1.59375, + "learning_rate": 0.0014434587802519383, + "loss": 1.0309, + "step": 5364 + }, + { + "epoch": 0.3732303732303732, + "grad_norm": 1.015625, + "learning_rate": 0.0014432568042085886, + "loss": 0.9463, + "step": 5365 + }, + { + "epoch": 0.37329994086750845, + "grad_norm": 0.98828125, + "learning_rate": 0.001443054805658893, + "loss": 0.8642, + "step": 5366 + }, + { + "epoch": 0.3733695085046436, + "grad_norm": 0.98828125, + "learning_rate": 0.0014428527846131072, + "loss": 0.9723, + "step": 5367 + }, + { + "epoch": 0.37343907614177885, + "grad_norm": 0.89453125, + "learning_rate": 0.0014426507410814895, + "loss": 0.9318, + "step": 5368 + }, + { + "epoch": 0.3735086437789141, + "grad_norm": 1.265625, + "learning_rate": 0.001442448675074298, + "loss": 0.8876, + "step": 5369 + }, + { + "epoch": 0.37357821141604924, + "grad_norm": 1.078125, + "learning_rate": 0.001442246586601793, + "loss": 0.8524, + "step": 5370 + }, + { + "epoch": 0.37364777905318447, + "grad_norm": 0.890625, + "learning_rate": 0.0014420444756742354, + "loss": 0.6487, + "step": 5371 + }, + { + "epoch": 0.37371734669031964, + "grad_norm": 1.0859375, + "learning_rate": 0.0014418423423018876, + "loss": 0.7525, + "step": 5372 + }, + { + "epoch": 0.37378691432745487, + "grad_norm": 1.3125, + "learning_rate": 0.001441640186495013, + "loss": 1.0137, + "step": 5373 + }, + { + "epoch": 0.3738564819645901, + "grad_norm": 1.21875, + "learning_rate": 0.0014414380082638748, + "loss": 0.9516, + "step": 5374 + }, + { + "epoch": 0.37392604960172526, + "grad_norm": 0.97265625, + "learning_rate": 0.0014412358076187402, + "loss": 0.7149, + "step": 5375 + }, + { + "epoch": 0.3739956172388605, + "grad_norm": 0.984375, + "learning_rate": 0.001441033584569875, + "loss": 0.9754, + "step": 5376 + }, + { + "epoch": 0.37406518487599566, + "grad_norm": 1.0234375, + "learning_rate": 0.0014408313391275475, + "loss": 0.6876, + "step": 5377 + }, + { + "epoch": 0.3741347525131309, + "grad_norm": 1.0703125, + "learning_rate": 0.0014406290713020265, + "loss": 0.8897, + "step": 5378 + }, + { + "epoch": 0.3742043201502661, + "grad_norm": 1.1875, + "learning_rate": 0.0014404267811035823, + "loss": 1.1495, + "step": 5379 + }, + { + "epoch": 0.3742738877874013, + "grad_norm": 1.2734375, + "learning_rate": 0.0014402244685424862, + "loss": 0.9556, + "step": 5380 + }, + { + "epoch": 0.3743434554245365, + "grad_norm": 0.9765625, + "learning_rate": 0.00144002213362901, + "loss": 0.975, + "step": 5381 + }, + { + "epoch": 0.37441302306167173, + "grad_norm": 1.234375, + "learning_rate": 0.0014398197763734282, + "loss": 0.9633, + "step": 5382 + }, + { + "epoch": 0.3744825906988069, + "grad_norm": 1.140625, + "learning_rate": 0.0014396173967860149, + "loss": 0.9289, + "step": 5383 + }, + { + "epoch": 0.37455215833594213, + "grad_norm": 1.0703125, + "learning_rate": 0.001439414994877046, + "loss": 0.7651, + "step": 5384 + }, + { + "epoch": 0.3746217259730773, + "grad_norm": 0.94140625, + "learning_rate": 0.0014392125706567981, + "loss": 0.7602, + "step": 5385 + }, + { + "epoch": 0.37469129361021253, + "grad_norm": 0.921875, + "learning_rate": 0.0014390101241355503, + "loss": 0.5503, + "step": 5386 + }, + { + "epoch": 0.37476086124734775, + "grad_norm": 1.015625, + "learning_rate": 0.0014388076553235808, + "loss": 0.9548, + "step": 5387 + }, + { + "epoch": 0.3748304288844829, + "grad_norm": 0.86328125, + "learning_rate": 0.0014386051642311705, + "loss": 0.8298, + "step": 5388 + }, + { + "epoch": 0.37489999652161815, + "grad_norm": 1.2265625, + "learning_rate": 0.0014384026508686006, + "loss": 0.9172, + "step": 5389 + }, + { + "epoch": 0.3749695641587533, + "grad_norm": 1.0546875, + "learning_rate": 0.0014382001152461537, + "loss": 0.8015, + "step": 5390 + }, + { + "epoch": 0.37503913179588855, + "grad_norm": 1.3125, + "learning_rate": 0.0014379975573741135, + "loss": 0.8311, + "step": 5391 + }, + { + "epoch": 0.3751086994330238, + "grad_norm": 0.82421875, + "learning_rate": 0.0014377949772627651, + "loss": 0.703, + "step": 5392 + }, + { + "epoch": 0.37517826707015894, + "grad_norm": 1.0859375, + "learning_rate": 0.0014375923749223947, + "loss": 0.8893, + "step": 5393 + }, + { + "epoch": 0.37524783470729417, + "grad_norm": 1.03125, + "learning_rate": 0.001437389750363289, + "loss": 0.7522, + "step": 5394 + }, + { + "epoch": 0.3753174023444294, + "grad_norm": 1.0703125, + "learning_rate": 0.0014371871035957363, + "loss": 0.9608, + "step": 5395 + }, + { + "epoch": 0.37538696998156457, + "grad_norm": 1.3359375, + "learning_rate": 0.0014369844346300265, + "loss": 0.9194, + "step": 5396 + }, + { + "epoch": 0.3754565376186998, + "grad_norm": 1.1484375, + "learning_rate": 0.001436781743476449, + "loss": 0.9093, + "step": 5397 + }, + { + "epoch": 0.37552610525583496, + "grad_norm": 1.0703125, + "learning_rate": 0.0014365790301452963, + "loss": 0.8586, + "step": 5398 + }, + { + "epoch": 0.3755956728929702, + "grad_norm": 1.265625, + "learning_rate": 0.001436376294646861, + "loss": 0.9752, + "step": 5399 + }, + { + "epoch": 0.3756652405301054, + "grad_norm": 1.0625, + "learning_rate": 0.001436173536991437, + "loss": 0.8677, + "step": 5400 + }, + { + "epoch": 0.3757348081672406, + "grad_norm": 1.0234375, + "learning_rate": 0.0014359707571893194, + "loss": 0.8918, + "step": 5401 + }, + { + "epoch": 0.3758043758043758, + "grad_norm": 1.15625, + "learning_rate": 0.0014357679552508041, + "loss": 0.7776, + "step": 5402 + }, + { + "epoch": 0.375873943441511, + "grad_norm": 1.078125, + "learning_rate": 0.0014355651311861886, + "loss": 0.8775, + "step": 5403 + }, + { + "epoch": 0.3759435110786462, + "grad_norm": 1.21875, + "learning_rate": 0.0014353622850057709, + "loss": 1.008, + "step": 5404 + }, + { + "epoch": 0.37601307871578143, + "grad_norm": 1.1484375, + "learning_rate": 0.0014351594167198508, + "loss": 0.8674, + "step": 5405 + }, + { + "epoch": 0.3760826463529166, + "grad_norm": 1.078125, + "learning_rate": 0.001434956526338729, + "loss": 0.831, + "step": 5406 + }, + { + "epoch": 0.37615221399005183, + "grad_norm": 1.15625, + "learning_rate": 0.001434753613872707, + "loss": 0.9915, + "step": 5407 + }, + { + "epoch": 0.37622178162718706, + "grad_norm": 1.59375, + "learning_rate": 0.001434550679332088, + "loss": 1.0004, + "step": 5408 + }, + { + "epoch": 0.37629134926432223, + "grad_norm": 1.046875, + "learning_rate": 0.0014343477227271757, + "loss": 0.7761, + "step": 5409 + }, + { + "epoch": 0.37636091690145745, + "grad_norm": 1.2109375, + "learning_rate": 0.0014341447440682754, + "loss": 0.9584, + "step": 5410 + }, + { + "epoch": 0.3764304845385926, + "grad_norm": 1.15625, + "learning_rate": 0.001433941743365693, + "loss": 0.7237, + "step": 5411 + }, + { + "epoch": 0.37650005217572785, + "grad_norm": 1.2421875, + "learning_rate": 0.0014337387206297364, + "loss": 0.6604, + "step": 5412 + }, + { + "epoch": 0.3765696198128631, + "grad_norm": 1.078125, + "learning_rate": 0.0014335356758707137, + "loss": 0.8516, + "step": 5413 + }, + { + "epoch": 0.37663918744999825, + "grad_norm": 0.8984375, + "learning_rate": 0.0014333326090989345, + "loss": 0.85, + "step": 5414 + }, + { + "epoch": 0.3767087550871335, + "grad_norm": 1.234375, + "learning_rate": 0.0014331295203247095, + "loss": 0.8663, + "step": 5415 + }, + { + "epoch": 0.37677832272426864, + "grad_norm": 1.0234375, + "learning_rate": 0.0014329264095583505, + "loss": 0.7427, + "step": 5416 + }, + { + "epoch": 0.37684789036140387, + "grad_norm": 1.1171875, + "learning_rate": 0.0014327232768101708, + "loss": 0.9801, + "step": 5417 + }, + { + "epoch": 0.3769174579985391, + "grad_norm": 0.93359375, + "learning_rate": 0.001432520122090484, + "loss": 1.0844, + "step": 5418 + }, + { + "epoch": 0.37698702563567427, + "grad_norm": 1.296875, + "learning_rate": 0.0014323169454096057, + "loss": 1.1017, + "step": 5419 + }, + { + "epoch": 0.3770565932728095, + "grad_norm": 1.3984375, + "learning_rate": 0.0014321137467778518, + "loss": 0.9933, + "step": 5420 + }, + { + "epoch": 0.3771261609099447, + "grad_norm": 1.03125, + "learning_rate": 0.0014319105262055399, + "loss": 0.974, + "step": 5421 + }, + { + "epoch": 0.3771957285470799, + "grad_norm": 1.203125, + "learning_rate": 0.0014317072837029883, + "loss": 0.8098, + "step": 5422 + }, + { + "epoch": 0.3772652961842151, + "grad_norm": 1.03125, + "learning_rate": 0.001431504019280517, + "loss": 0.7295, + "step": 5423 + }, + { + "epoch": 0.3773348638213503, + "grad_norm": 0.8984375, + "learning_rate": 0.0014313007329484462, + "loss": 0.6766, + "step": 5424 + }, + { + "epoch": 0.3774044314584855, + "grad_norm": 0.90625, + "learning_rate": 0.0014310974247170984, + "loss": 0.9285, + "step": 5425 + }, + { + "epoch": 0.37747399909562074, + "grad_norm": 1.203125, + "learning_rate": 0.0014308940945967964, + "loss": 0.9051, + "step": 5426 + }, + { + "epoch": 0.3775435667327559, + "grad_norm": 1.21875, + "learning_rate": 0.001430690742597864, + "loss": 0.9952, + "step": 5427 + }, + { + "epoch": 0.37761313436989113, + "grad_norm": 1.046875, + "learning_rate": 0.0014304873687306264, + "loss": 1.0094, + "step": 5428 + }, + { + "epoch": 0.3776827020070263, + "grad_norm": 1.28125, + "learning_rate": 0.00143028397300541, + "loss": 0.8287, + "step": 5429 + }, + { + "epoch": 0.37775226964416153, + "grad_norm": 1.1015625, + "learning_rate": 0.0014300805554325424, + "loss": 0.6493, + "step": 5430 + }, + { + "epoch": 0.37782183728129676, + "grad_norm": 1.1640625, + "learning_rate": 0.001429877116022352, + "loss": 0.8438, + "step": 5431 + }, + { + "epoch": 0.37789140491843193, + "grad_norm": 1.0, + "learning_rate": 0.0014296736547851684, + "loss": 0.8506, + "step": 5432 + }, + { + "epoch": 0.37796097255556715, + "grad_norm": 1.2109375, + "learning_rate": 0.001429470171731322, + "loss": 0.889, + "step": 5433 + }, + { + "epoch": 0.3780305401927024, + "grad_norm": 1.109375, + "learning_rate": 0.0014292666668711453, + "loss": 0.8362, + "step": 5434 + }, + { + "epoch": 0.37810010782983755, + "grad_norm": 1.1875, + "learning_rate": 0.0014290631402149709, + "loss": 0.7322, + "step": 5435 + }, + { + "epoch": 0.3781696754669728, + "grad_norm": 1.0703125, + "learning_rate": 0.0014288595917731329, + "loss": 0.9391, + "step": 5436 + }, + { + "epoch": 0.37823924310410795, + "grad_norm": 1.1328125, + "learning_rate": 0.0014286560215559664, + "loss": 0.7884, + "step": 5437 + }, + { + "epoch": 0.3783088107412432, + "grad_norm": 1.21875, + "learning_rate": 0.0014284524295738075, + "loss": 0.8923, + "step": 5438 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 0.94921875, + "learning_rate": 0.001428248815836994, + "loss": 0.8666, + "step": 5439 + }, + { + "epoch": 0.37844794601551357, + "grad_norm": 1.2890625, + "learning_rate": 0.001428045180355864, + "loss": 0.9711, + "step": 5440 + }, + { + "epoch": 0.3785175136526488, + "grad_norm": 1.0078125, + "learning_rate": 0.0014278415231407575, + "loss": 0.5071, + "step": 5441 + }, + { + "epoch": 0.37858708128978397, + "grad_norm": 1.015625, + "learning_rate": 0.0014276378442020148, + "loss": 0.8519, + "step": 5442 + }, + { + "epoch": 0.3786566489269192, + "grad_norm": 1.25, + "learning_rate": 0.0014274341435499779, + "loss": 1.1103, + "step": 5443 + }, + { + "epoch": 0.3787262165640544, + "grad_norm": 1.0625, + "learning_rate": 0.0014272304211949895, + "loss": 1.0963, + "step": 5444 + }, + { + "epoch": 0.3787957842011896, + "grad_norm": 1.203125, + "learning_rate": 0.0014270266771473938, + "loss": 0.914, + "step": 5445 + }, + { + "epoch": 0.3788653518383248, + "grad_norm": 1.2890625, + "learning_rate": 0.0014268229114175357, + "loss": 0.9931, + "step": 5446 + }, + { + "epoch": 0.37893491947546004, + "grad_norm": 1.375, + "learning_rate": 0.0014266191240157617, + "loss": 0.775, + "step": 5447 + }, + { + "epoch": 0.3790044871125952, + "grad_norm": 1.03125, + "learning_rate": 0.0014264153149524189, + "loss": 0.9089, + "step": 5448 + }, + { + "epoch": 0.37907405474973044, + "grad_norm": 1.0, + "learning_rate": 0.0014262114842378555, + "loss": 0.7882, + "step": 5449 + }, + { + "epoch": 0.3791436223868656, + "grad_norm": 1.34375, + "learning_rate": 0.0014260076318824211, + "loss": 0.9675, + "step": 5450 + }, + { + "epoch": 0.37921319002400083, + "grad_norm": 1.3828125, + "learning_rate": 0.0014258037578964667, + "loss": 0.8656, + "step": 5451 + }, + { + "epoch": 0.37928275766113606, + "grad_norm": 0.9296875, + "learning_rate": 0.0014255998622903433, + "loss": 0.8968, + "step": 5452 + }, + { + "epoch": 0.37935232529827123, + "grad_norm": 0.9921875, + "learning_rate": 0.0014253959450744045, + "loss": 0.8417, + "step": 5453 + }, + { + "epoch": 0.37942189293540646, + "grad_norm": 1.2265625, + "learning_rate": 0.0014251920062590036, + "loss": 1.1655, + "step": 5454 + }, + { + "epoch": 0.37949146057254163, + "grad_norm": 1.265625, + "learning_rate": 0.0014249880458544956, + "loss": 0.9787, + "step": 5455 + }, + { + "epoch": 0.37956102820967685, + "grad_norm": 0.8125, + "learning_rate": 0.001424784063871237, + "loss": 0.4948, + "step": 5456 + }, + { + "epoch": 0.3796305958468121, + "grad_norm": 1.0234375, + "learning_rate": 0.0014245800603195846, + "loss": 0.6736, + "step": 5457 + }, + { + "epoch": 0.37970016348394725, + "grad_norm": 0.99609375, + "learning_rate": 0.0014243760352098968, + "loss": 0.9282, + "step": 5458 + }, + { + "epoch": 0.3797697311210825, + "grad_norm": 1.40625, + "learning_rate": 0.001424171988552533, + "loss": 0.7448, + "step": 5459 + }, + { + "epoch": 0.3798392987582177, + "grad_norm": 1.0625, + "learning_rate": 0.0014239679203578532, + "loss": 0.9497, + "step": 5460 + }, + { + "epoch": 0.3799088663953529, + "grad_norm": 1.1328125, + "learning_rate": 0.00142376383063622, + "loss": 1.0226, + "step": 5461 + }, + { + "epoch": 0.3799784340324881, + "grad_norm": 0.75390625, + "learning_rate": 0.001423559719397995, + "loss": 1.1281, + "step": 5462 + }, + { + "epoch": 0.38004800166962327, + "grad_norm": 0.828125, + "learning_rate": 0.0014233555866535424, + "loss": 0.8493, + "step": 5463 + }, + { + "epoch": 0.3801175693067585, + "grad_norm": 1.359375, + "learning_rate": 0.0014231514324132269, + "loss": 0.9384, + "step": 5464 + }, + { + "epoch": 0.3801871369438937, + "grad_norm": 1.0625, + "learning_rate": 0.0014229472566874147, + "loss": 0.6505, + "step": 5465 + }, + { + "epoch": 0.3802567045810289, + "grad_norm": 0.89453125, + "learning_rate": 0.0014227430594864726, + "loss": 0.7257, + "step": 5466 + }, + { + "epoch": 0.3803262722181641, + "grad_norm": 1.359375, + "learning_rate": 0.0014225388408207684, + "loss": 1.0635, + "step": 5467 + }, + { + "epoch": 0.3803958398552993, + "grad_norm": 1.09375, + "learning_rate": 0.001422334600700672, + "loss": 0.6463, + "step": 5468 + }, + { + "epoch": 0.3804654074924345, + "grad_norm": 1.296875, + "learning_rate": 0.0014221303391365532, + "loss": 1.1023, + "step": 5469 + }, + { + "epoch": 0.38053497512956974, + "grad_norm": 1.1640625, + "learning_rate": 0.0014219260561387835, + "loss": 0.7375, + "step": 5470 + }, + { + "epoch": 0.3806045427667049, + "grad_norm": 1.2578125, + "learning_rate": 0.0014217217517177353, + "loss": 0.8715, + "step": 5471 + }, + { + "epoch": 0.38067411040384014, + "grad_norm": 0.87109375, + "learning_rate": 0.001421517425883782, + "loss": 0.6879, + "step": 5472 + }, + { + "epoch": 0.38074367804097536, + "grad_norm": 1.1484375, + "learning_rate": 0.0014213130786472985, + "loss": 0.891, + "step": 5473 + }, + { + "epoch": 0.38081324567811053, + "grad_norm": 1.0625, + "learning_rate": 0.0014211087100186605, + "loss": 0.9219, + "step": 5474 + }, + { + "epoch": 0.38088281331524576, + "grad_norm": 1.0390625, + "learning_rate": 0.001420904320008245, + "loss": 0.9867, + "step": 5475 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 1.21875, + "learning_rate": 0.0014206999086264292, + "loss": 1.0281, + "step": 5476 + }, + { + "epoch": 0.38102194858951616, + "grad_norm": 1.09375, + "learning_rate": 0.0014204954758835929, + "loss": 0.8644, + "step": 5477 + }, + { + "epoch": 0.3810915162266514, + "grad_norm": 1.4140625, + "learning_rate": 0.0014202910217901155, + "loss": 0.914, + "step": 5478 + }, + { + "epoch": 0.38116108386378655, + "grad_norm": 1.09375, + "learning_rate": 0.0014200865463563786, + "loss": 0.8536, + "step": 5479 + }, + { + "epoch": 0.3812306515009218, + "grad_norm": 1.1640625, + "learning_rate": 0.0014198820495927643, + "loss": 1.0379, + "step": 5480 + }, + { + "epoch": 0.38130021913805695, + "grad_norm": 1.1875, + "learning_rate": 0.0014196775315096558, + "loss": 0.879, + "step": 5481 + }, + { + "epoch": 0.3813697867751922, + "grad_norm": 1.1484375, + "learning_rate": 0.0014194729921174374, + "loss": 0.9843, + "step": 5482 + }, + { + "epoch": 0.3814393544123274, + "grad_norm": 1.2109375, + "learning_rate": 0.0014192684314264952, + "loss": 0.9935, + "step": 5483 + }, + { + "epoch": 0.3815089220494626, + "grad_norm": 1.109375, + "learning_rate": 0.001419063849447215, + "loss": 0.9397, + "step": 5484 + }, + { + "epoch": 0.3815784896865978, + "grad_norm": 1.53125, + "learning_rate": 0.0014188592461899848, + "loss": 0.958, + "step": 5485 + }, + { + "epoch": 0.381648057323733, + "grad_norm": 1.1796875, + "learning_rate": 0.0014186546216651932, + "loss": 0.9617, + "step": 5486 + }, + { + "epoch": 0.3817176249608682, + "grad_norm": 0.94140625, + "learning_rate": 0.0014184499758832304, + "loss": 0.8839, + "step": 5487 + }, + { + "epoch": 0.3817871925980034, + "grad_norm": 1.1796875, + "learning_rate": 0.0014182453088544867, + "loss": 1.0621, + "step": 5488 + }, + { + "epoch": 0.3818567602351386, + "grad_norm": 1.2109375, + "learning_rate": 0.0014180406205893546, + "loss": 1.0318, + "step": 5489 + }, + { + "epoch": 0.3819263278722738, + "grad_norm": 1.03125, + "learning_rate": 0.0014178359110982265, + "loss": 0.7853, + "step": 5490 + }, + { + "epoch": 0.38199589550940904, + "grad_norm": 1.015625, + "learning_rate": 0.0014176311803914972, + "loss": 0.9092, + "step": 5491 + }, + { + "epoch": 0.3820654631465442, + "grad_norm": 1.1953125, + "learning_rate": 0.0014174264284795614, + "loss": 0.8226, + "step": 5492 + }, + { + "epoch": 0.38213503078367944, + "grad_norm": 0.96484375, + "learning_rate": 0.0014172216553728152, + "loss": 0.8707, + "step": 5493 + }, + { + "epoch": 0.3822045984208146, + "grad_norm": 1.078125, + "learning_rate": 0.001417016861081657, + "loss": 0.9307, + "step": 5494 + }, + { + "epoch": 0.38227416605794984, + "grad_norm": 1.0859375, + "learning_rate": 0.001416812045616484, + "loss": 0.5553, + "step": 5495 + }, + { + "epoch": 0.38234373369508506, + "grad_norm": 1.0703125, + "learning_rate": 0.0014166072089876968, + "loss": 0.9959, + "step": 5496 + }, + { + "epoch": 0.38241330133222023, + "grad_norm": 1.03125, + "learning_rate": 0.001416402351205695, + "loss": 0.7066, + "step": 5497 + }, + { + "epoch": 0.38248286896935546, + "grad_norm": 1.0234375, + "learning_rate": 0.0014161974722808803, + "loss": 0.7988, + "step": 5498 + }, + { + "epoch": 0.3825524366064907, + "grad_norm": 1.3125, + "learning_rate": 0.001415992572223656, + "loss": 1.0347, + "step": 5499 + }, + { + "epoch": 0.38262200424362586, + "grad_norm": 1.3515625, + "learning_rate": 0.0014157876510444256, + "loss": 0.923, + "step": 5500 + }, + { + "epoch": 0.3826915718807611, + "grad_norm": 0.984375, + "learning_rate": 0.0014155827087535943, + "loss": 0.6956, + "step": 5501 + }, + { + "epoch": 0.38276113951789625, + "grad_norm": 1.3125, + "learning_rate": 0.0014153777453615678, + "loss": 0.7743, + "step": 5502 + }, + { + "epoch": 0.3828307071550315, + "grad_norm": 1.8046875, + "learning_rate": 0.0014151727608787525, + "loss": 1.0726, + "step": 5503 + }, + { + "epoch": 0.3829002747921667, + "grad_norm": 1.1640625, + "learning_rate": 0.0014149677553155575, + "loss": 0.8967, + "step": 5504 + }, + { + "epoch": 0.3829698424293019, + "grad_norm": 1.078125, + "learning_rate": 0.0014147627286823915, + "loss": 0.9015, + "step": 5505 + }, + { + "epoch": 0.3830394100664371, + "grad_norm": 1.3828125, + "learning_rate": 0.0014145576809896643, + "loss": 0.7625, + "step": 5506 + }, + { + "epoch": 0.3831089777035723, + "grad_norm": 1.15625, + "learning_rate": 0.0014143526122477879, + "loss": 0.8154, + "step": 5507 + }, + { + "epoch": 0.3831785453407075, + "grad_norm": 1.0703125, + "learning_rate": 0.0014141475224671743, + "loss": 0.9021, + "step": 5508 + }, + { + "epoch": 0.3832481129778427, + "grad_norm": 1.140625, + "learning_rate": 0.0014139424116582364, + "loss": 1.1319, + "step": 5509 + }, + { + "epoch": 0.3833176806149779, + "grad_norm": 1.1640625, + "learning_rate": 0.00141373727983139, + "loss": 1.0978, + "step": 5510 + }, + { + "epoch": 0.3833872482521131, + "grad_norm": 1.0625, + "learning_rate": 0.0014135321269970497, + "loss": 0.9746, + "step": 5511 + }, + { + "epoch": 0.38345681588924835, + "grad_norm": 1.0625, + "learning_rate": 0.0014133269531656323, + "loss": 0.7934, + "step": 5512 + }, + { + "epoch": 0.3835263835263835, + "grad_norm": 1.0625, + "learning_rate": 0.0014131217583475558, + "loss": 0.7595, + "step": 5513 + }, + { + "epoch": 0.38359595116351874, + "grad_norm": 1.3046875, + "learning_rate": 0.0014129165425532384, + "loss": 1.0819, + "step": 5514 + }, + { + "epoch": 0.3836655188006539, + "grad_norm": 1.2109375, + "learning_rate": 0.0014127113057931003, + "loss": 0.8422, + "step": 5515 + }, + { + "epoch": 0.38373508643778914, + "grad_norm": 1.3203125, + "learning_rate": 0.001412506048077562, + "loss": 1.0966, + "step": 5516 + }, + { + "epoch": 0.38380465407492437, + "grad_norm": 1.4609375, + "learning_rate": 0.0014123007694170461, + "loss": 1.1284, + "step": 5517 + }, + { + "epoch": 0.38387422171205954, + "grad_norm": 1.09375, + "learning_rate": 0.0014120954698219755, + "loss": 0.7266, + "step": 5518 + }, + { + "epoch": 0.38394378934919476, + "grad_norm": 0.7578125, + "learning_rate": 0.0014118901493027738, + "loss": 0.5555, + "step": 5519 + }, + { + "epoch": 0.38401335698632993, + "grad_norm": 1.21875, + "learning_rate": 0.0014116848078698663, + "loss": 0.8231, + "step": 5520 + }, + { + "epoch": 0.38408292462346516, + "grad_norm": 1.0, + "learning_rate": 0.0014114794455336794, + "loss": 0.8969, + "step": 5521 + }, + { + "epoch": 0.3841524922606004, + "grad_norm": 1.4765625, + "learning_rate": 0.0014112740623046403, + "loss": 1.1414, + "step": 5522 + }, + { + "epoch": 0.38422205989773556, + "grad_norm": 0.984375, + "learning_rate": 0.0014110686581931772, + "loss": 0.823, + "step": 5523 + }, + { + "epoch": 0.3842916275348708, + "grad_norm": 1.0234375, + "learning_rate": 0.0014108632332097198, + "loss": 0.8634, + "step": 5524 + }, + { + "epoch": 0.384361195172006, + "grad_norm": 0.96875, + "learning_rate": 0.0014106577873646982, + "loss": 0.9133, + "step": 5525 + }, + { + "epoch": 0.3844307628091412, + "grad_norm": 1.2421875, + "learning_rate": 0.001410452320668544, + "loss": 0.9964, + "step": 5526 + }, + { + "epoch": 0.3845003304462764, + "grad_norm": 1.234375, + "learning_rate": 0.0014102468331316897, + "loss": 1.0287, + "step": 5527 + }, + { + "epoch": 0.3845698980834116, + "grad_norm": 0.98828125, + "learning_rate": 0.001410041324764569, + "loss": 0.8902, + "step": 5528 + }, + { + "epoch": 0.3846394657205468, + "grad_norm": 1.265625, + "learning_rate": 0.0014098357955776167, + "loss": 1.0256, + "step": 5529 + }, + { + "epoch": 0.38470903335768203, + "grad_norm": 1.046875, + "learning_rate": 0.0014096302455812683, + "loss": 0.858, + "step": 5530 + }, + { + "epoch": 0.3847786009948172, + "grad_norm": 1.125, + "learning_rate": 0.0014094246747859609, + "loss": 0.997, + "step": 5531 + }, + { + "epoch": 0.3848481686319524, + "grad_norm": 1.0546875, + "learning_rate": 0.0014092190832021318, + "loss": 0.7877, + "step": 5532 + }, + { + "epoch": 0.3849177362690876, + "grad_norm": 1.2265625, + "learning_rate": 0.001409013470840221, + "loss": 1.1309, + "step": 5533 + }, + { + "epoch": 0.3849873039062228, + "grad_norm": 1.078125, + "learning_rate": 0.0014088078377106673, + "loss": 0.7741, + "step": 5534 + }, + { + "epoch": 0.38505687154335805, + "grad_norm": 1.34375, + "learning_rate": 0.001408602183823912, + "loss": 1.0019, + "step": 5535 + }, + { + "epoch": 0.3851264391804932, + "grad_norm": 0.83984375, + "learning_rate": 0.0014083965091903974, + "loss": 0.8954, + "step": 5536 + }, + { + "epoch": 0.38519600681762844, + "grad_norm": 1.1328125, + "learning_rate": 0.0014081908138205664, + "loss": 0.8182, + "step": 5537 + }, + { + "epoch": 0.38526557445476367, + "grad_norm": 1.125, + "learning_rate": 0.0014079850977248638, + "loss": 1.0547, + "step": 5538 + }, + { + "epoch": 0.38533514209189884, + "grad_norm": 1.140625, + "learning_rate": 0.0014077793609137336, + "loss": 1.1989, + "step": 5539 + }, + { + "epoch": 0.38540470972903407, + "grad_norm": 0.7734375, + "learning_rate": 0.0014075736033976236, + "loss": 0.7847, + "step": 5540 + }, + { + "epoch": 0.38547427736616924, + "grad_norm": 1.4140625, + "learning_rate": 0.00140736782518698, + "loss": 1.0591, + "step": 5541 + }, + { + "epoch": 0.38554384500330446, + "grad_norm": 0.98046875, + "learning_rate": 0.0014071620262922516, + "loss": 0.8005, + "step": 5542 + }, + { + "epoch": 0.3856134126404397, + "grad_norm": 1.3671875, + "learning_rate": 0.0014069562067238874, + "loss": 0.8348, + "step": 5543 + }, + { + "epoch": 0.38568298027757486, + "grad_norm": 1.2265625, + "learning_rate": 0.0014067503664923387, + "loss": 0.655, + "step": 5544 + }, + { + "epoch": 0.3857525479147101, + "grad_norm": 1.25, + "learning_rate": 0.0014065445056080563, + "loss": 0.8986, + "step": 5545 + }, + { + "epoch": 0.38582211555184526, + "grad_norm": 1.0546875, + "learning_rate": 0.001406338624081493, + "loss": 0.7869, + "step": 5546 + }, + { + "epoch": 0.3858916831889805, + "grad_norm": 1.1640625, + "learning_rate": 0.0014061327219231025, + "loss": 0.9504, + "step": 5547 + }, + { + "epoch": 0.3859612508261157, + "grad_norm": 1.3515625, + "learning_rate": 0.0014059267991433394, + "loss": 0.9495, + "step": 5548 + }, + { + "epoch": 0.3860308184632509, + "grad_norm": 0.93359375, + "learning_rate": 0.00140572085575266, + "loss": 0.7159, + "step": 5549 + }, + { + "epoch": 0.3861003861003861, + "grad_norm": 1.1328125, + "learning_rate": 0.00140551489176152, + "loss": 0.8652, + "step": 5550 + }, + { + "epoch": 0.38616995373752133, + "grad_norm": 1.15625, + "learning_rate": 0.0014053089071803778, + "loss": 0.919, + "step": 5551 + }, + { + "epoch": 0.3862395213746565, + "grad_norm": 1.5546875, + "learning_rate": 0.001405102902019692, + "loss": 0.7285, + "step": 5552 + }, + { + "epoch": 0.38630908901179173, + "grad_norm": 1.0859375, + "learning_rate": 0.001404896876289923, + "loss": 1.0848, + "step": 5553 + }, + { + "epoch": 0.3863786566489269, + "grad_norm": 0.9921875, + "learning_rate": 0.0014046908300015316, + "loss": 0.8936, + "step": 5554 + }, + { + "epoch": 0.3864482242860621, + "grad_norm": 1.0625, + "learning_rate": 0.0014044847631649792, + "loss": 0.9112, + "step": 5555 + }, + { + "epoch": 0.38651779192319735, + "grad_norm": 1.140625, + "learning_rate": 0.0014042786757907297, + "loss": 0.8496, + "step": 5556 + }, + { + "epoch": 0.3865873595603325, + "grad_norm": 1.1171875, + "learning_rate": 0.0014040725678892466, + "loss": 0.8048, + "step": 5557 + }, + { + "epoch": 0.38665692719746775, + "grad_norm": 1.3828125, + "learning_rate": 0.0014038664394709953, + "loss": 1.0822, + "step": 5558 + }, + { + "epoch": 0.3867264948346029, + "grad_norm": 1.421875, + "learning_rate": 0.0014036602905464414, + "loss": 0.8533, + "step": 5559 + }, + { + "epoch": 0.38679606247173814, + "grad_norm": 0.953125, + "learning_rate": 0.0014034541211260527, + "loss": 0.7703, + "step": 5560 + }, + { + "epoch": 0.38686563010887337, + "grad_norm": 1.0078125, + "learning_rate": 0.0014032479312202977, + "loss": 0.6636, + "step": 5561 + }, + { + "epoch": 0.38693519774600854, + "grad_norm": 1.3046875, + "learning_rate": 0.001403041720839645, + "loss": 0.8638, + "step": 5562 + }, + { + "epoch": 0.38700476538314377, + "grad_norm": 1.0078125, + "learning_rate": 0.0014028354899945652, + "loss": 0.8446, + "step": 5563 + }, + { + "epoch": 0.387074333020279, + "grad_norm": 0.96484375, + "learning_rate": 0.0014026292386955296, + "loss": 0.724, + "step": 5564 + }, + { + "epoch": 0.38714390065741416, + "grad_norm": 1.2265625, + "learning_rate": 0.0014024229669530109, + "loss": 1.009, + "step": 5565 + }, + { + "epoch": 0.3872134682945494, + "grad_norm": 0.88671875, + "learning_rate": 0.0014022166747774821, + "loss": 0.6755, + "step": 5566 + }, + { + "epoch": 0.38728303593168456, + "grad_norm": 1.2578125, + "learning_rate": 0.0014020103621794177, + "loss": 0.6224, + "step": 5567 + }, + { + "epoch": 0.3873526035688198, + "grad_norm": 1.03125, + "learning_rate": 0.001401804029169294, + "loss": 0.9592, + "step": 5568 + }, + { + "epoch": 0.387422171205955, + "grad_norm": 1.03125, + "learning_rate": 0.001401597675757586, + "loss": 0.6962, + "step": 5569 + }, + { + "epoch": 0.3874917388430902, + "grad_norm": 1.0625, + "learning_rate": 0.0014013913019547731, + "loss": 0.9164, + "step": 5570 + }, + { + "epoch": 0.3875613064802254, + "grad_norm": 1.09375, + "learning_rate": 0.0014011849077713325, + "loss": 0.8144, + "step": 5571 + }, + { + "epoch": 0.3876308741173606, + "grad_norm": 1.453125, + "learning_rate": 0.0014009784932177446, + "loss": 1.1767, + "step": 5572 + }, + { + "epoch": 0.3877004417544958, + "grad_norm": 1.1796875, + "learning_rate": 0.0014007720583044901, + "loss": 0.8022, + "step": 5573 + }, + { + "epoch": 0.38777000939163103, + "grad_norm": 1.234375, + "learning_rate": 0.0014005656030420502, + "loss": 0.9998, + "step": 5574 + }, + { + "epoch": 0.3878395770287662, + "grad_norm": 1.1328125, + "learning_rate": 0.0014003591274409084, + "loss": 0.8166, + "step": 5575 + }, + { + "epoch": 0.38790914466590143, + "grad_norm": 1.46875, + "learning_rate": 0.0014001526315115475, + "loss": 0.8817, + "step": 5576 + }, + { + "epoch": 0.38797871230303665, + "grad_norm": 0.99609375, + "learning_rate": 0.0013999461152644536, + "loss": 0.8195, + "step": 5577 + }, + { + "epoch": 0.3880482799401718, + "grad_norm": 1.140625, + "learning_rate": 0.001399739578710111, + "loss": 1.1256, + "step": 5578 + }, + { + "epoch": 0.38811784757730705, + "grad_norm": 1.109375, + "learning_rate": 0.0013995330218590082, + "loss": 0.744, + "step": 5579 + }, + { + "epoch": 0.3881874152144422, + "grad_norm": 1.5703125, + "learning_rate": 0.0013993264447216317, + "loss": 1.0807, + "step": 5580 + }, + { + "epoch": 0.38825698285157745, + "grad_norm": 1.328125, + "learning_rate": 0.001399119847308471, + "loss": 1.1585, + "step": 5581 + }, + { + "epoch": 0.3883265504887127, + "grad_norm": 1.2578125, + "learning_rate": 0.0013989132296300172, + "loss": 0.9526, + "step": 5582 + }, + { + "epoch": 0.38839611812584784, + "grad_norm": 1.1328125, + "learning_rate": 0.0013987065916967595, + "loss": 0.7696, + "step": 5583 + }, + { + "epoch": 0.38846568576298307, + "grad_norm": 1.0234375, + "learning_rate": 0.0013984999335191909, + "loss": 1.1156, + "step": 5584 + }, + { + "epoch": 0.38853525340011824, + "grad_norm": 1.0078125, + "learning_rate": 0.0013982932551078041, + "loss": 0.9445, + "step": 5585 + }, + { + "epoch": 0.38860482103725347, + "grad_norm": 1.171875, + "learning_rate": 0.0013980865564730935, + "loss": 0.777, + "step": 5586 + }, + { + "epoch": 0.3886743886743887, + "grad_norm": 0.859375, + "learning_rate": 0.0013978798376255536, + "loss": 0.9426, + "step": 5587 + }, + { + "epoch": 0.38874395631152386, + "grad_norm": 0.95703125, + "learning_rate": 0.0013976730985756818, + "loss": 0.8967, + "step": 5588 + }, + { + "epoch": 0.3888135239486591, + "grad_norm": 1.109375, + "learning_rate": 0.0013974663393339739, + "loss": 1.0173, + "step": 5589 + }, + { + "epoch": 0.3888830915857943, + "grad_norm": 1.2734375, + "learning_rate": 0.0013972595599109287, + "loss": 0.864, + "step": 5590 + }, + { + "epoch": 0.3889526592229295, + "grad_norm": 1.421875, + "learning_rate": 0.0013970527603170458, + "loss": 0.7937, + "step": 5591 + }, + { + "epoch": 0.3890222268600647, + "grad_norm": 0.98828125, + "learning_rate": 0.0013968459405628247, + "loss": 0.7681, + "step": 5592 + }, + { + "epoch": 0.3890917944971999, + "grad_norm": 1.03125, + "learning_rate": 0.001396639100658767, + "loss": 0.7632, + "step": 5593 + }, + { + "epoch": 0.3891613621343351, + "grad_norm": 1.015625, + "learning_rate": 0.001396432240615375, + "loss": 0.955, + "step": 5594 + }, + { + "epoch": 0.38923092977147034, + "grad_norm": 1.078125, + "learning_rate": 0.0013962253604431524, + "loss": 0.7488, + "step": 5595 + }, + { + "epoch": 0.3893004974086055, + "grad_norm": 1.015625, + "learning_rate": 0.0013960184601526024, + "loss": 0.8588, + "step": 5596 + }, + { + "epoch": 0.38937006504574073, + "grad_norm": 1.1015625, + "learning_rate": 0.0013958115397542314, + "loss": 0.7897, + "step": 5597 + }, + { + "epoch": 0.3894396326828759, + "grad_norm": 1.2734375, + "learning_rate": 0.0013956045992585457, + "loss": 0.7896, + "step": 5598 + }, + { + "epoch": 0.38950920032001113, + "grad_norm": 1.1953125, + "learning_rate": 0.001395397638676052, + "loss": 0.8627, + "step": 5599 + }, + { + "epoch": 0.38957876795714635, + "grad_norm": 1.3046875, + "learning_rate": 0.0013951906580172595, + "loss": 0.908, + "step": 5600 + }, + { + "epoch": 0.3896483355942815, + "grad_norm": 1.046875, + "learning_rate": 0.0013949836572926771, + "loss": 1.0089, + "step": 5601 + }, + { + "epoch": 0.38971790323141675, + "grad_norm": 0.94921875, + "learning_rate": 0.0013947766365128157, + "loss": 0.8768, + "step": 5602 + }, + { + "epoch": 0.389787470868552, + "grad_norm": 1.34375, + "learning_rate": 0.001394569595688186, + "loss": 1.0241, + "step": 5603 + }, + { + "epoch": 0.38985703850568715, + "grad_norm": 1.1328125, + "learning_rate": 0.0013943625348293014, + "loss": 0.9728, + "step": 5604 + }, + { + "epoch": 0.3899266061428224, + "grad_norm": 1.0546875, + "learning_rate": 0.0013941554539466752, + "loss": 0.8221, + "step": 5605 + }, + { + "epoch": 0.38999617377995754, + "grad_norm": 1.1953125, + "learning_rate": 0.0013939483530508213, + "loss": 1.0314, + "step": 5606 + }, + { + "epoch": 0.39006574141709277, + "grad_norm": 1.78125, + "learning_rate": 0.001393741232152256, + "loss": 1.1567, + "step": 5607 + }, + { + "epoch": 0.390135309054228, + "grad_norm": 1.1171875, + "learning_rate": 0.0013935340912614954, + "loss": 0.7966, + "step": 5608 + }, + { + "epoch": 0.39020487669136317, + "grad_norm": 1.296875, + "learning_rate": 0.0013933269303890575, + "loss": 1.0969, + "step": 5609 + }, + { + "epoch": 0.3902744443284984, + "grad_norm": 1.1796875, + "learning_rate": 0.00139311974954546, + "loss": 0.8874, + "step": 5610 + }, + { + "epoch": 0.39034401196563356, + "grad_norm": 1.234375, + "learning_rate": 0.0013929125487412233, + "loss": 0.7099, + "step": 5611 + }, + { + "epoch": 0.3904135796027688, + "grad_norm": 1.109375, + "learning_rate": 0.0013927053279868683, + "loss": 0.9192, + "step": 5612 + }, + { + "epoch": 0.390483147239904, + "grad_norm": 1.078125, + "learning_rate": 0.0013924980872929153, + "loss": 1.0022, + "step": 5613 + }, + { + "epoch": 0.3905527148770392, + "grad_norm": 1.2421875, + "learning_rate": 0.0013922908266698884, + "loss": 0.9358, + "step": 5614 + }, + { + "epoch": 0.3906222825141744, + "grad_norm": 1.109375, + "learning_rate": 0.00139208354612831, + "loss": 0.8857, + "step": 5615 + }, + { + "epoch": 0.3906918501513096, + "grad_norm": 1.0625, + "learning_rate": 0.0013918762456787061, + "loss": 0.7615, + "step": 5616 + }, + { + "epoch": 0.3907614177884448, + "grad_norm": 1.15625, + "learning_rate": 0.0013916689253316013, + "loss": 0.769, + "step": 5617 + }, + { + "epoch": 0.39083098542558004, + "grad_norm": 1.0625, + "learning_rate": 0.0013914615850975226, + "loss": 0.9046, + "step": 5618 + }, + { + "epoch": 0.3909005530627152, + "grad_norm": 1.4296875, + "learning_rate": 0.0013912542249869978, + "loss": 0.9263, + "step": 5619 + }, + { + "epoch": 0.39097012069985043, + "grad_norm": 1.25, + "learning_rate": 0.0013910468450105556, + "loss": 1.1956, + "step": 5620 + }, + { + "epoch": 0.39103968833698566, + "grad_norm": 0.91796875, + "learning_rate": 0.0013908394451787255, + "loss": 0.8908, + "step": 5621 + }, + { + "epoch": 0.39110925597412083, + "grad_norm": 1.1875, + "learning_rate": 0.0013906320255020384, + "loss": 0.7833, + "step": 5622 + }, + { + "epoch": 0.39117882361125605, + "grad_norm": 1.234375, + "learning_rate": 0.001390424585991026, + "loss": 1.0066, + "step": 5623 + }, + { + "epoch": 0.3912483912483912, + "grad_norm": 1.109375, + "learning_rate": 0.001390217126656221, + "loss": 0.7541, + "step": 5624 + }, + { + "epoch": 0.39131795888552645, + "grad_norm": 1.3046875, + "learning_rate": 0.0013900096475081571, + "loss": 0.8519, + "step": 5625 + }, + { + "epoch": 0.3913875265226617, + "grad_norm": 0.99609375, + "learning_rate": 0.0013898021485573688, + "loss": 0.6573, + "step": 5626 + }, + { + "epoch": 0.39145709415979685, + "grad_norm": 1.1640625, + "learning_rate": 0.0013895946298143923, + "loss": 0.8274, + "step": 5627 + }, + { + "epoch": 0.3915266617969321, + "grad_norm": 1.2109375, + "learning_rate": 0.0013893870912897648, + "loss": 0.9457, + "step": 5628 + }, + { + "epoch": 0.39159622943406724, + "grad_norm": 1.09375, + "learning_rate": 0.001389179532994023, + "loss": 0.8788, + "step": 5629 + }, + { + "epoch": 0.39166579707120247, + "grad_norm": 0.9765625, + "learning_rate": 0.0013889719549377063, + "loss": 0.9339, + "step": 5630 + }, + { + "epoch": 0.3917353647083377, + "grad_norm": 1.1171875, + "learning_rate": 0.0013887643571313538, + "loss": 0.8774, + "step": 5631 + }, + { + "epoch": 0.39180493234547287, + "grad_norm": 0.90234375, + "learning_rate": 0.0013885567395855072, + "loss": 0.7744, + "step": 5632 + }, + { + "epoch": 0.3918744999826081, + "grad_norm": 0.875, + "learning_rate": 0.0013883491023107075, + "loss": 0.846, + "step": 5633 + }, + { + "epoch": 0.3919440676197433, + "grad_norm": 1.2890625, + "learning_rate": 0.001388141445317498, + "loss": 0.775, + "step": 5634 + }, + { + "epoch": 0.3920136352568785, + "grad_norm": 1.015625, + "learning_rate": 0.0013879337686164223, + "loss": 0.7387, + "step": 5635 + }, + { + "epoch": 0.3920832028940137, + "grad_norm": 1.015625, + "learning_rate": 0.0013877260722180253, + "loss": 0.8283, + "step": 5636 + }, + { + "epoch": 0.3921527705311489, + "grad_norm": 1.015625, + "learning_rate": 0.0013875183561328527, + "loss": 0.9993, + "step": 5637 + }, + { + "epoch": 0.3922223381682841, + "grad_norm": 1.25, + "learning_rate": 0.001387310620371451, + "loss": 1.0272, + "step": 5638 + }, + { + "epoch": 0.39229190580541934, + "grad_norm": 1.2265625, + "learning_rate": 0.0013871028649443682, + "loss": 0.8058, + "step": 5639 + }, + { + "epoch": 0.3923614734425545, + "grad_norm": 0.90234375, + "learning_rate": 0.001386895089862153, + "loss": 0.7725, + "step": 5640 + }, + { + "epoch": 0.39243104107968974, + "grad_norm": 1.390625, + "learning_rate": 0.0013866872951353553, + "loss": 0.952, + "step": 5641 + }, + { + "epoch": 0.3925006087168249, + "grad_norm": 1.0234375, + "learning_rate": 0.0013864794807745258, + "loss": 0.7073, + "step": 5642 + }, + { + "epoch": 0.39257017635396013, + "grad_norm": 0.890625, + "learning_rate": 0.0013862716467902163, + "loss": 0.7186, + "step": 5643 + }, + { + "epoch": 0.39263974399109536, + "grad_norm": 1.203125, + "learning_rate": 0.0013860637931929797, + "loss": 1.0442, + "step": 5644 + }, + { + "epoch": 0.39270931162823053, + "grad_norm": 0.953125, + "learning_rate": 0.0013858559199933693, + "loss": 0.9521, + "step": 5645 + }, + { + "epoch": 0.39277887926536575, + "grad_norm": 1.4609375, + "learning_rate": 0.0013856480272019405, + "loss": 0.933, + "step": 5646 + }, + { + "epoch": 0.392848446902501, + "grad_norm": 1.4765625, + "learning_rate": 0.001385440114829248, + "loss": 0.6352, + "step": 5647 + }, + { + "epoch": 0.39291801453963615, + "grad_norm": 1.1484375, + "learning_rate": 0.0013852321828858498, + "loss": 1.0762, + "step": 5648 + }, + { + "epoch": 0.3929875821767714, + "grad_norm": 0.8828125, + "learning_rate": 0.001385024231382303, + "loss": 0.7916, + "step": 5649 + }, + { + "epoch": 0.39305714981390655, + "grad_norm": 0.890625, + "learning_rate": 0.001384816260329166, + "loss": 0.8968, + "step": 5650 + }, + { + "epoch": 0.3931267174510418, + "grad_norm": 1.125, + "learning_rate": 0.0013846082697369995, + "loss": 0.7204, + "step": 5651 + }, + { + "epoch": 0.393196285088177, + "grad_norm": 1.3203125, + "learning_rate": 0.0013844002596163634, + "loss": 1.3161, + "step": 5652 + }, + { + "epoch": 0.39326585272531217, + "grad_norm": 1.2265625, + "learning_rate": 0.0013841922299778198, + "loss": 0.7805, + "step": 5653 + }, + { + "epoch": 0.3933354203624474, + "grad_norm": 1.0703125, + "learning_rate": 0.0013839841808319306, + "loss": 0.6772, + "step": 5654 + }, + { + "epoch": 0.39340498799958257, + "grad_norm": 0.9375, + "learning_rate": 0.0013837761121892607, + "loss": 0.6694, + "step": 5655 + }, + { + "epoch": 0.3934745556367178, + "grad_norm": 1.1640625, + "learning_rate": 0.001383568024060374, + "loss": 0.795, + "step": 5656 + }, + { + "epoch": 0.393544123273853, + "grad_norm": 0.83203125, + "learning_rate": 0.0013833599164558366, + "loss": 0.8358, + "step": 5657 + }, + { + "epoch": 0.3936136909109882, + "grad_norm": 0.88671875, + "learning_rate": 0.0013831517893862146, + "loss": 0.7724, + "step": 5658 + }, + { + "epoch": 0.3936832585481234, + "grad_norm": 0.84375, + "learning_rate": 0.001382943642862076, + "loss": 0.7712, + "step": 5659 + }, + { + "epoch": 0.39375282618525864, + "grad_norm": 1.078125, + "learning_rate": 0.00138273547689399, + "loss": 0.7206, + "step": 5660 + }, + { + "epoch": 0.3938223938223938, + "grad_norm": 0.984375, + "learning_rate": 0.001382527291492525, + "loss": 0.7366, + "step": 5661 + }, + { + "epoch": 0.39389196145952904, + "grad_norm": 0.90234375, + "learning_rate": 0.0013823190866682526, + "loss": 0.7126, + "step": 5662 + }, + { + "epoch": 0.3939615290966642, + "grad_norm": 1.2109375, + "learning_rate": 0.0013821108624317434, + "loss": 0.8909, + "step": 5663 + }, + { + "epoch": 0.39403109673379944, + "grad_norm": 0.8359375, + "learning_rate": 0.0013819026187935708, + "loss": 0.7864, + "step": 5664 + }, + { + "epoch": 0.39410066437093466, + "grad_norm": 1.1953125, + "learning_rate": 0.0013816943557643081, + "loss": 0.8352, + "step": 5665 + }, + { + "epoch": 0.39417023200806983, + "grad_norm": 1.0859375, + "learning_rate": 0.0013814860733545303, + "loss": 0.9868, + "step": 5666 + }, + { + "epoch": 0.39423979964520506, + "grad_norm": 1.53125, + "learning_rate": 0.0013812777715748125, + "loss": 0.8523, + "step": 5667 + }, + { + "epoch": 0.39430936728234023, + "grad_norm": 0.98828125, + "learning_rate": 0.0013810694504357308, + "loss": 0.7038, + "step": 5668 + }, + { + "epoch": 0.39437893491947545, + "grad_norm": 1.546875, + "learning_rate": 0.0013808611099478637, + "loss": 1.205, + "step": 5669 + }, + { + "epoch": 0.3944485025566107, + "grad_norm": 0.9140625, + "learning_rate": 0.0013806527501217885, + "loss": 0.7109, + "step": 5670 + }, + { + "epoch": 0.39451807019374585, + "grad_norm": 1.03125, + "learning_rate": 0.0013804443709680857, + "loss": 0.6902, + "step": 5671 + }, + { + "epoch": 0.3945876378308811, + "grad_norm": 0.9765625, + "learning_rate": 0.001380235972497335, + "loss": 0.7888, + "step": 5672 + }, + { + "epoch": 0.3946572054680163, + "grad_norm": 1.109375, + "learning_rate": 0.0013800275547201184, + "loss": 0.8962, + "step": 5673 + }, + { + "epoch": 0.3947267731051515, + "grad_norm": 1.125, + "learning_rate": 0.001379819117647018, + "loss": 0.8865, + "step": 5674 + }, + { + "epoch": 0.3947963407422867, + "grad_norm": 1.046875, + "learning_rate": 0.0013796106612886173, + "loss": 0.7484, + "step": 5675 + }, + { + "epoch": 0.39486590837942187, + "grad_norm": 0.921875, + "learning_rate": 0.0013794021856555008, + "loss": 0.735, + "step": 5676 + }, + { + "epoch": 0.3949354760165571, + "grad_norm": 1.0546875, + "learning_rate": 0.0013791936907582532, + "loss": 0.7644, + "step": 5677 + }, + { + "epoch": 0.3950050436536923, + "grad_norm": 1.2265625, + "learning_rate": 0.0013789851766074614, + "loss": 0.8093, + "step": 5678 + }, + { + "epoch": 0.3950746112908275, + "grad_norm": 0.953125, + "learning_rate": 0.0013787766432137127, + "loss": 0.8287, + "step": 5679 + }, + { + "epoch": 0.3951441789279627, + "grad_norm": 1.2265625, + "learning_rate": 0.001378568090587595, + "loss": 0.8146, + "step": 5680 + }, + { + "epoch": 0.3952137465650979, + "grad_norm": 1.0546875, + "learning_rate": 0.001378359518739698, + "loss": 0.6935, + "step": 5681 + }, + { + "epoch": 0.3952833142022331, + "grad_norm": 1.0390625, + "learning_rate": 0.0013781509276806117, + "loss": 1.1332, + "step": 5682 + }, + { + "epoch": 0.39535288183936834, + "grad_norm": 1.140625, + "learning_rate": 0.001377942317420927, + "loss": 0.907, + "step": 5683 + }, + { + "epoch": 0.3954224494765035, + "grad_norm": 1.078125, + "learning_rate": 0.0013777336879712367, + "loss": 0.8557, + "step": 5684 + }, + { + "epoch": 0.39549201711363874, + "grad_norm": 1.15625, + "learning_rate": 0.0013775250393421336, + "loss": 0.7406, + "step": 5685 + }, + { + "epoch": 0.39556158475077396, + "grad_norm": 1.0390625, + "learning_rate": 0.0013773163715442118, + "loss": 0.7507, + "step": 5686 + }, + { + "epoch": 0.39563115238790914, + "grad_norm": 1.1484375, + "learning_rate": 0.0013771076845880668, + "loss": 0.9068, + "step": 5687 + }, + { + "epoch": 0.39570072002504436, + "grad_norm": 1.1015625, + "learning_rate": 0.0013768989784842941, + "loss": 0.8867, + "step": 5688 + }, + { + "epoch": 0.39577028766217953, + "grad_norm": 1.1875, + "learning_rate": 0.001376690253243491, + "loss": 0.9804, + "step": 5689 + }, + { + "epoch": 0.39583985529931476, + "grad_norm": 1.328125, + "learning_rate": 0.0013764815088762553, + "loss": 1.0954, + "step": 5690 + }, + { + "epoch": 0.39590942293645, + "grad_norm": 0.9921875, + "learning_rate": 0.0013762727453931862, + "loss": 0.7848, + "step": 5691 + }, + { + "epoch": 0.39597899057358515, + "grad_norm": 1.5234375, + "learning_rate": 0.0013760639628048838, + "loss": 0.6177, + "step": 5692 + }, + { + "epoch": 0.3960485582107204, + "grad_norm": 1.1484375, + "learning_rate": 0.001375855161121949, + "loss": 0.8519, + "step": 5693 + }, + { + "epoch": 0.39611812584785555, + "grad_norm": 1.0625, + "learning_rate": 0.0013756463403549835, + "loss": 0.671, + "step": 5694 + }, + { + "epoch": 0.3961876934849908, + "grad_norm": 0.96484375, + "learning_rate": 0.00137543750051459, + "loss": 0.8336, + "step": 5695 + }, + { + "epoch": 0.396257261122126, + "grad_norm": 1.046875, + "learning_rate": 0.0013752286416113728, + "loss": 0.9169, + "step": 5696 + }, + { + "epoch": 0.3963268287592612, + "grad_norm": 1.015625, + "learning_rate": 0.0013750197636559363, + "loss": 0.8185, + "step": 5697 + }, + { + "epoch": 0.3963963963963964, + "grad_norm": 1.2109375, + "learning_rate": 0.0013748108666588865, + "loss": 1.1148, + "step": 5698 + }, + { + "epoch": 0.3964659640335316, + "grad_norm": 0.953125, + "learning_rate": 0.0013746019506308302, + "loss": 0.903, + "step": 5699 + }, + { + "epoch": 0.3965355316706668, + "grad_norm": 1.0625, + "learning_rate": 0.001374393015582375, + "loss": 0.9774, + "step": 5700 + }, + { + "epoch": 0.396605099307802, + "grad_norm": 1.4921875, + "learning_rate": 0.0013741840615241294, + "loss": 1.2874, + "step": 5701 + }, + { + "epoch": 0.3966746669449372, + "grad_norm": 1.4765625, + "learning_rate": 0.001373975088466703, + "loss": 0.7905, + "step": 5702 + }, + { + "epoch": 0.3967442345820724, + "grad_norm": 1.21875, + "learning_rate": 0.0013737660964207071, + "loss": 0.8377, + "step": 5703 + }, + { + "epoch": 0.39681380221920765, + "grad_norm": 1.1328125, + "learning_rate": 0.0013735570853967522, + "loss": 0.9901, + "step": 5704 + }, + { + "epoch": 0.3968833698563428, + "grad_norm": 1.2421875, + "learning_rate": 0.0013733480554054519, + "loss": 0.8395, + "step": 5705 + }, + { + "epoch": 0.39695293749347804, + "grad_norm": 1.53125, + "learning_rate": 0.0013731390064574188, + "loss": 0.8949, + "step": 5706 + }, + { + "epoch": 0.3970225051306132, + "grad_norm": 1.0078125, + "learning_rate": 0.0013729299385632676, + "loss": 0.6607, + "step": 5707 + }, + { + "epoch": 0.39709207276774844, + "grad_norm": 0.9453125, + "learning_rate": 0.001372720851733614, + "loss": 0.9082, + "step": 5708 + }, + { + "epoch": 0.39716164040488366, + "grad_norm": 1.203125, + "learning_rate": 0.0013725117459790744, + "loss": 1.0577, + "step": 5709 + }, + { + "epoch": 0.39723120804201884, + "grad_norm": 1.1953125, + "learning_rate": 0.0013723026213102658, + "loss": 0.6586, + "step": 5710 + }, + { + "epoch": 0.39730077567915406, + "grad_norm": 1.40625, + "learning_rate": 0.0013720934777378064, + "loss": 0.8832, + "step": 5711 + }, + { + "epoch": 0.3973703433162893, + "grad_norm": 1.25, + "learning_rate": 0.001371884315272316, + "loss": 0.9176, + "step": 5712 + }, + { + "epoch": 0.39743991095342446, + "grad_norm": 1.234375, + "learning_rate": 0.0013716751339244145, + "loss": 0.9529, + "step": 5713 + }, + { + "epoch": 0.3975094785905597, + "grad_norm": 1.171875, + "learning_rate": 0.0013714659337047228, + "loss": 0.9422, + "step": 5714 + }, + { + "epoch": 0.39757904622769485, + "grad_norm": 1.28125, + "learning_rate": 0.0013712567146238635, + "loss": 0.8914, + "step": 5715 + }, + { + "epoch": 0.3976486138648301, + "grad_norm": 1.2265625, + "learning_rate": 0.0013710474766924596, + "loss": 0.9733, + "step": 5716 + }, + { + "epoch": 0.3977181815019653, + "grad_norm": 1.1875, + "learning_rate": 0.0013708382199211348, + "loss": 0.9387, + "step": 5717 + }, + { + "epoch": 0.3977877491391005, + "grad_norm": 1.3359375, + "learning_rate": 0.0013706289443205146, + "loss": 0.9794, + "step": 5718 + }, + { + "epoch": 0.3978573167762357, + "grad_norm": 1.1171875, + "learning_rate": 0.0013704196499012247, + "loss": 0.8667, + "step": 5719 + }, + { + "epoch": 0.3979268844133709, + "grad_norm": 1.078125, + "learning_rate": 0.0013702103366738919, + "loss": 0.8185, + "step": 5720 + }, + { + "epoch": 0.3979964520505061, + "grad_norm": 1.125, + "learning_rate": 0.0013700010046491442, + "loss": 1.0267, + "step": 5721 + }, + { + "epoch": 0.3980660196876413, + "grad_norm": 1.0546875, + "learning_rate": 0.0013697916538376106, + "loss": 0.9751, + "step": 5722 + }, + { + "epoch": 0.3981355873247765, + "grad_norm": 0.98828125, + "learning_rate": 0.0013695822842499203, + "loss": 0.6295, + "step": 5723 + }, + { + "epoch": 0.3982051549619117, + "grad_norm": 1.2265625, + "learning_rate": 0.001369372895896705, + "loss": 1.0598, + "step": 5724 + }, + { + "epoch": 0.39827472259904695, + "grad_norm": 1.1640625, + "learning_rate": 0.0013691634887885954, + "loss": 0.8897, + "step": 5725 + }, + { + "epoch": 0.3983442902361821, + "grad_norm": 0.9296875, + "learning_rate": 0.0013689540629362247, + "loss": 0.6013, + "step": 5726 + }, + { + "epoch": 0.39841385787331735, + "grad_norm": 1.2265625, + "learning_rate": 0.0013687446183502264, + "loss": 0.8765, + "step": 5727 + }, + { + "epoch": 0.3984834255104525, + "grad_norm": 1.3515625, + "learning_rate": 0.001368535155041235, + "loss": 0.8201, + "step": 5728 + }, + { + "epoch": 0.39855299314758774, + "grad_norm": 1.0, + "learning_rate": 0.0013683256730198858, + "loss": 0.6924, + "step": 5729 + }, + { + "epoch": 0.39862256078472297, + "grad_norm": 1.3359375, + "learning_rate": 0.0013681161722968157, + "loss": 0.9522, + "step": 5730 + }, + { + "epoch": 0.39869212842185814, + "grad_norm": 0.8515625, + "learning_rate": 0.0013679066528826617, + "loss": 0.7778, + "step": 5731 + }, + { + "epoch": 0.39876169605899336, + "grad_norm": 1.3203125, + "learning_rate": 0.001367697114788062, + "loss": 1.1964, + "step": 5732 + }, + { + "epoch": 0.39883126369612854, + "grad_norm": 1.078125, + "learning_rate": 0.0013674875580236563, + "loss": 0.7515, + "step": 5733 + }, + { + "epoch": 0.39890083133326376, + "grad_norm": 0.984375, + "learning_rate": 0.001367277982600085, + "loss": 0.6181, + "step": 5734 + }, + { + "epoch": 0.398970398970399, + "grad_norm": 1.34375, + "learning_rate": 0.0013670683885279886, + "loss": 0.9628, + "step": 5735 + }, + { + "epoch": 0.39903996660753416, + "grad_norm": 1.2890625, + "learning_rate": 0.0013668587758180095, + "loss": 1.1417, + "step": 5736 + }, + { + "epoch": 0.3991095342446694, + "grad_norm": 1.0703125, + "learning_rate": 0.0013666491444807912, + "loss": 0.9542, + "step": 5737 + }, + { + "epoch": 0.3991791018818046, + "grad_norm": 1.046875, + "learning_rate": 0.0013664394945269774, + "loss": 0.8635, + "step": 5738 + }, + { + "epoch": 0.3992486695189398, + "grad_norm": 1.4609375, + "learning_rate": 0.0013662298259672129, + "loss": 0.8736, + "step": 5739 + }, + { + "epoch": 0.399318237156075, + "grad_norm": 1.25, + "learning_rate": 0.0013660201388121438, + "loss": 0.8686, + "step": 5740 + }, + { + "epoch": 0.3993878047932102, + "grad_norm": 1.4140625, + "learning_rate": 0.0013658104330724168, + "loss": 1.0662, + "step": 5741 + }, + { + "epoch": 0.3994573724303454, + "grad_norm": 0.9296875, + "learning_rate": 0.00136560070875868, + "loss": 0.6878, + "step": 5742 + }, + { + "epoch": 0.39952694006748063, + "grad_norm": 1.296875, + "learning_rate": 0.001365390965881582, + "loss": 0.9694, + "step": 5743 + }, + { + "epoch": 0.3995965077046158, + "grad_norm": 1.0234375, + "learning_rate": 0.0013651812044517722, + "loss": 0.8046, + "step": 5744 + }, + { + "epoch": 0.399666075341751, + "grad_norm": 1.0859375, + "learning_rate": 0.0013649714244799017, + "loss": 0.8767, + "step": 5745 + }, + { + "epoch": 0.3997356429788862, + "grad_norm": 1.171875, + "learning_rate": 0.0013647616259766218, + "loss": 0.809, + "step": 5746 + }, + { + "epoch": 0.3998052106160214, + "grad_norm": 1.09375, + "learning_rate": 0.001364551808952585, + "loss": 0.96, + "step": 5747 + }, + { + "epoch": 0.39987477825315665, + "grad_norm": 0.78515625, + "learning_rate": 0.001364341973418445, + "loss": 0.6902, + "step": 5748 + }, + { + "epoch": 0.3999443458902918, + "grad_norm": 1.0859375, + "learning_rate": 0.0013641321193848558, + "loss": 0.8762, + "step": 5749 + }, + { + "epoch": 0.40001391352742705, + "grad_norm": 1.1328125, + "learning_rate": 0.0013639222468624732, + "loss": 0.7761, + "step": 5750 + }, + { + "epoch": 0.40008348116456227, + "grad_norm": 0.99609375, + "learning_rate": 0.0013637123558619532, + "loss": 0.6942, + "step": 5751 + }, + { + "epoch": 0.40015304880169744, + "grad_norm": 1.0078125, + "learning_rate": 0.0013635024463939528, + "loss": 0.7257, + "step": 5752 + }, + { + "epoch": 0.40022261643883267, + "grad_norm": 1.0234375, + "learning_rate": 0.0013632925184691304, + "loss": 0.9273, + "step": 5753 + }, + { + "epoch": 0.40029218407596784, + "grad_norm": 1.421875, + "learning_rate": 0.001363082572098145, + "loss": 1.032, + "step": 5754 + }, + { + "epoch": 0.40036175171310306, + "grad_norm": 1.3203125, + "learning_rate": 0.0013628726072916568, + "loss": 0.7682, + "step": 5755 + }, + { + "epoch": 0.4004313193502383, + "grad_norm": 0.99609375, + "learning_rate": 0.0013626626240603266, + "loss": 0.8368, + "step": 5756 + }, + { + "epoch": 0.40050088698737346, + "grad_norm": 1.0234375, + "learning_rate": 0.0013624526224148162, + "loss": 0.9481, + "step": 5757 + }, + { + "epoch": 0.4005704546245087, + "grad_norm": 1.2109375, + "learning_rate": 0.0013622426023657886, + "loss": 0.9786, + "step": 5758 + }, + { + "epoch": 0.40064002226164386, + "grad_norm": 1.40625, + "learning_rate": 0.0013620325639239076, + "loss": 0.852, + "step": 5759 + }, + { + "epoch": 0.4007095898987791, + "grad_norm": 1.0546875, + "learning_rate": 0.0013618225070998375, + "loss": 0.9142, + "step": 5760 + }, + { + "epoch": 0.4007791575359143, + "grad_norm": 1.015625, + "learning_rate": 0.0013616124319042445, + "loss": 0.8978, + "step": 5761 + }, + { + "epoch": 0.4008487251730495, + "grad_norm": 1.28125, + "learning_rate": 0.0013614023383477947, + "loss": 1.0515, + "step": 5762 + }, + { + "epoch": 0.4009182928101847, + "grad_norm": 1.1015625, + "learning_rate": 0.0013611922264411558, + "loss": 0.9575, + "step": 5763 + }, + { + "epoch": 0.40098786044731993, + "grad_norm": 1.609375, + "learning_rate": 0.0013609820961949961, + "loss": 1.222, + "step": 5764 + }, + { + "epoch": 0.4010574280844551, + "grad_norm": 1.125, + "learning_rate": 0.0013607719476199853, + "loss": 0.9202, + "step": 5765 + }, + { + "epoch": 0.40112699572159033, + "grad_norm": 1.0625, + "learning_rate": 0.0013605617807267933, + "loss": 1.0188, + "step": 5766 + }, + { + "epoch": 0.4011965633587255, + "grad_norm": 1.1484375, + "learning_rate": 0.0013603515955260912, + "loss": 0.8969, + "step": 5767 + }, + { + "epoch": 0.4012661309958607, + "grad_norm": 1.0625, + "learning_rate": 0.0013601413920285516, + "loss": 0.6961, + "step": 5768 + }, + { + "epoch": 0.40133569863299595, + "grad_norm": 1.2421875, + "learning_rate": 0.0013599311702448473, + "loss": 0.7371, + "step": 5769 + }, + { + "epoch": 0.4014052662701311, + "grad_norm": 1.1328125, + "learning_rate": 0.0013597209301856525, + "loss": 0.7546, + "step": 5770 + }, + { + "epoch": 0.40147483390726635, + "grad_norm": 1.28125, + "learning_rate": 0.0013595106718616418, + "loss": 0.9842, + "step": 5771 + }, + { + "epoch": 0.4015444015444015, + "grad_norm": 1.1015625, + "learning_rate": 0.0013593003952834914, + "loss": 0.9441, + "step": 5772 + }, + { + "epoch": 0.40161396918153675, + "grad_norm": 1.5859375, + "learning_rate": 0.0013590901004618776, + "loss": 0.6869, + "step": 5773 + }, + { + "epoch": 0.40168353681867197, + "grad_norm": 1.234375, + "learning_rate": 0.0013588797874074792, + "loss": 0.751, + "step": 5774 + }, + { + "epoch": 0.40175310445580714, + "grad_norm": 1.3828125, + "learning_rate": 0.0013586694561309736, + "loss": 1.1005, + "step": 5775 + }, + { + "epoch": 0.40182267209294237, + "grad_norm": 1.1484375, + "learning_rate": 0.0013584591066430408, + "loss": 1.0155, + "step": 5776 + }, + { + "epoch": 0.4018922397300776, + "grad_norm": 1.25, + "learning_rate": 0.0013582487389543615, + "loss": 1.0699, + "step": 5777 + }, + { + "epoch": 0.40196180736721276, + "grad_norm": 0.828125, + "learning_rate": 0.001358038353075617, + "loss": 0.9256, + "step": 5778 + }, + { + "epoch": 0.402031375004348, + "grad_norm": 1.3046875, + "learning_rate": 0.0013578279490174892, + "loss": 0.9979, + "step": 5779 + }, + { + "epoch": 0.40210094264148316, + "grad_norm": 1.0703125, + "learning_rate": 0.0013576175267906619, + "loss": 0.8082, + "step": 5780 + }, + { + "epoch": 0.4021705102786184, + "grad_norm": 1.0546875, + "learning_rate": 0.0013574070864058193, + "loss": 0.7615, + "step": 5781 + }, + { + "epoch": 0.4022400779157536, + "grad_norm": 1.3359375, + "learning_rate": 0.001357196627873646, + "loss": 1.0908, + "step": 5782 + }, + { + "epoch": 0.4023096455528888, + "grad_norm": 1.0, + "learning_rate": 0.0013569861512048285, + "loss": 0.712, + "step": 5783 + }, + { + "epoch": 0.402379213190024, + "grad_norm": 1.2578125, + "learning_rate": 0.0013567756564100537, + "loss": 0.9485, + "step": 5784 + }, + { + "epoch": 0.4024487808271592, + "grad_norm": 1.140625, + "learning_rate": 0.0013565651435000093, + "loss": 0.986, + "step": 5785 + }, + { + "epoch": 0.4025183484642944, + "grad_norm": 0.96484375, + "learning_rate": 0.001356354612485384, + "loss": 0.8165, + "step": 5786 + }, + { + "epoch": 0.40258791610142963, + "grad_norm": 1.203125, + "learning_rate": 0.0013561440633768679, + "loss": 1.107, + "step": 5787 + }, + { + "epoch": 0.4026574837385648, + "grad_norm": 1.359375, + "learning_rate": 0.001355933496185151, + "loss": 0.9454, + "step": 5788 + }, + { + "epoch": 0.40272705137570003, + "grad_norm": 1.3359375, + "learning_rate": 0.0013557229109209252, + "loss": 0.877, + "step": 5789 + }, + { + "epoch": 0.40279661901283526, + "grad_norm": 0.99609375, + "learning_rate": 0.0013555123075948835, + "loss": 0.8719, + "step": 5790 + }, + { + "epoch": 0.4028661866499704, + "grad_norm": 0.97265625, + "learning_rate": 0.0013553016862177182, + "loss": 0.871, + "step": 5791 + }, + { + "epoch": 0.40293575428710565, + "grad_norm": 1.1796875, + "learning_rate": 0.0013550910468001244, + "loss": 0.9059, + "step": 5792 + }, + { + "epoch": 0.4030053219242408, + "grad_norm": 1.1328125, + "learning_rate": 0.0013548803893527971, + "loss": 0.8285, + "step": 5793 + }, + { + "epoch": 0.40307488956137605, + "grad_norm": 1.390625, + "learning_rate": 0.0013546697138864321, + "loss": 1.0612, + "step": 5794 + }, + { + "epoch": 0.4031444571985113, + "grad_norm": 1.3671875, + "learning_rate": 0.001354459020411727, + "loss": 0.7523, + "step": 5795 + }, + { + "epoch": 0.40321402483564645, + "grad_norm": 1.3984375, + "learning_rate": 0.0013542483089393788, + "loss": 0.9797, + "step": 5796 + }, + { + "epoch": 0.40328359247278167, + "grad_norm": 1.0546875, + "learning_rate": 0.0013540375794800876, + "loss": 0.8473, + "step": 5797 + }, + { + "epoch": 0.40335316010991684, + "grad_norm": 1.1875, + "learning_rate": 0.0013538268320445526, + "loss": 0.9444, + "step": 5798 + }, + { + "epoch": 0.40342272774705207, + "grad_norm": 1.234375, + "learning_rate": 0.0013536160666434746, + "loss": 0.7539, + "step": 5799 + }, + { + "epoch": 0.4034922953841873, + "grad_norm": 1.140625, + "learning_rate": 0.0013534052832875547, + "loss": 0.7358, + "step": 5800 + }, + { + "epoch": 0.40356186302132246, + "grad_norm": 0.92578125, + "learning_rate": 0.001353194481987496, + "loss": 0.6306, + "step": 5801 + }, + { + "epoch": 0.4036314306584577, + "grad_norm": 0.88671875, + "learning_rate": 0.0013529836627540015, + "loss": 0.7535, + "step": 5802 + }, + { + "epoch": 0.4037009982955929, + "grad_norm": 1.40625, + "learning_rate": 0.0013527728255977758, + "loss": 1.0439, + "step": 5803 + }, + { + "epoch": 0.4037705659327281, + "grad_norm": 1.0078125, + "learning_rate": 0.0013525619705295245, + "loss": 0.6658, + "step": 5804 + }, + { + "epoch": 0.4038401335698633, + "grad_norm": 0.96484375, + "learning_rate": 0.001352351097559953, + "loss": 0.8662, + "step": 5805 + }, + { + "epoch": 0.4039097012069985, + "grad_norm": 1.1640625, + "learning_rate": 0.0013521402066997692, + "loss": 0.8351, + "step": 5806 + }, + { + "epoch": 0.4039792688441337, + "grad_norm": 1.34375, + "learning_rate": 0.0013519292979596801, + "loss": 0.9935, + "step": 5807 + }, + { + "epoch": 0.40404883648126894, + "grad_norm": 1.015625, + "learning_rate": 0.0013517183713503955, + "loss": 0.7469, + "step": 5808 + }, + { + "epoch": 0.4041184041184041, + "grad_norm": 1.3671875, + "learning_rate": 0.0013515074268826246, + "loss": 1.154, + "step": 5809 + }, + { + "epoch": 0.40418797175553933, + "grad_norm": 0.9296875, + "learning_rate": 0.0013512964645670783, + "loss": 0.5069, + "step": 5810 + }, + { + "epoch": 0.4042575393926745, + "grad_norm": 1.0703125, + "learning_rate": 0.0013510854844144685, + "loss": 0.8358, + "step": 5811 + }, + { + "epoch": 0.40432710702980973, + "grad_norm": 1.09375, + "learning_rate": 0.0013508744864355066, + "loss": 0.8857, + "step": 5812 + }, + { + "epoch": 0.40439667466694496, + "grad_norm": 1.1953125, + "learning_rate": 0.0013506634706409078, + "loss": 1.0108, + "step": 5813 + }, + { + "epoch": 0.4044662423040801, + "grad_norm": 1.1796875, + "learning_rate": 0.0013504524370413849, + "loss": 0.8618, + "step": 5814 + }, + { + "epoch": 0.40453580994121535, + "grad_norm": 1.2109375, + "learning_rate": 0.0013502413856476539, + "loss": 0.8774, + "step": 5815 + }, + { + "epoch": 0.4046053775783506, + "grad_norm": 1.109375, + "learning_rate": 0.0013500303164704305, + "loss": 0.8373, + "step": 5816 + }, + { + "epoch": 0.40467494521548575, + "grad_norm": 1.390625, + "learning_rate": 0.0013498192295204317, + "loss": 1.0593, + "step": 5817 + }, + { + "epoch": 0.404744512852621, + "grad_norm": 1.46875, + "learning_rate": 0.001349608124808376, + "loss": 0.7007, + "step": 5818 + }, + { + "epoch": 0.40481408048975615, + "grad_norm": 1.2578125, + "learning_rate": 0.0013493970023449814, + "loss": 0.7079, + "step": 5819 + }, + { + "epoch": 0.40488364812689137, + "grad_norm": 1.078125, + "learning_rate": 0.0013491858621409688, + "loss": 0.8407, + "step": 5820 + }, + { + "epoch": 0.4049532157640266, + "grad_norm": 0.890625, + "learning_rate": 0.0013489747042070576, + "loss": 0.6917, + "step": 5821 + }, + { + "epoch": 0.40502278340116177, + "grad_norm": 0.98046875, + "learning_rate": 0.0013487635285539703, + "loss": 0.9607, + "step": 5822 + }, + { + "epoch": 0.405092351038297, + "grad_norm": 1.03125, + "learning_rate": 0.0013485523351924288, + "loss": 0.9759, + "step": 5823 + }, + { + "epoch": 0.40516191867543216, + "grad_norm": 1.0234375, + "learning_rate": 0.0013483411241331565, + "loss": 1.0282, + "step": 5824 + }, + { + "epoch": 0.4052314863125674, + "grad_norm": 1.078125, + "learning_rate": 0.0013481298953868777, + "loss": 0.9879, + "step": 5825 + }, + { + "epoch": 0.4053010539497026, + "grad_norm": 1.2421875, + "learning_rate": 0.0013479186489643172, + "loss": 0.962, + "step": 5826 + }, + { + "epoch": 0.4053706215868378, + "grad_norm": 1.1953125, + "learning_rate": 0.0013477073848762017, + "loss": 1.0321, + "step": 5827 + }, + { + "epoch": 0.405440189223973, + "grad_norm": 1.1640625, + "learning_rate": 0.0013474961031332575, + "loss": 0.972, + "step": 5828 + }, + { + "epoch": 0.40550975686110824, + "grad_norm": 1.0703125, + "learning_rate": 0.0013472848037462133, + "loss": 0.8074, + "step": 5829 + }, + { + "epoch": 0.4055793244982434, + "grad_norm": 1.2265625, + "learning_rate": 0.0013470734867257967, + "loss": 0.9583, + "step": 5830 + }, + { + "epoch": 0.40564889213537864, + "grad_norm": 1.03125, + "learning_rate": 0.001346862152082738, + "loss": 0.72, + "step": 5831 + }, + { + "epoch": 0.4057184597725138, + "grad_norm": 1.3515625, + "learning_rate": 0.0013466507998277674, + "loss": 0.8751, + "step": 5832 + }, + { + "epoch": 0.40578802740964903, + "grad_norm": 1.015625, + "learning_rate": 0.0013464394299716163, + "loss": 0.647, + "step": 5833 + }, + { + "epoch": 0.40585759504678426, + "grad_norm": 1.0546875, + "learning_rate": 0.0013462280425250175, + "loss": 0.8034, + "step": 5834 + }, + { + "epoch": 0.40592716268391943, + "grad_norm": 0.94921875, + "learning_rate": 0.0013460166374987036, + "loss": 0.7786, + "step": 5835 + }, + { + "epoch": 0.40599673032105466, + "grad_norm": 1.046875, + "learning_rate": 0.001345805214903409, + "loss": 0.6817, + "step": 5836 + }, + { + "epoch": 0.4060662979581898, + "grad_norm": 1.53125, + "learning_rate": 0.0013455937747498686, + "loss": 0.8713, + "step": 5837 + }, + { + "epoch": 0.40613586559532505, + "grad_norm": 1.140625, + "learning_rate": 0.0013453823170488182, + "loss": 0.8209, + "step": 5838 + }, + { + "epoch": 0.4062054332324603, + "grad_norm": 0.8828125, + "learning_rate": 0.0013451708418109945, + "loss": 0.7567, + "step": 5839 + }, + { + "epoch": 0.40627500086959545, + "grad_norm": 1.5078125, + "learning_rate": 0.0013449593490471351, + "loss": 1.2558, + "step": 5840 + }, + { + "epoch": 0.4063445685067307, + "grad_norm": 1.15625, + "learning_rate": 0.001344747838767979, + "loss": 0.9353, + "step": 5841 + }, + { + "epoch": 0.4064141361438659, + "grad_norm": 1.078125, + "learning_rate": 0.001344536310984265, + "loss": 0.8507, + "step": 5842 + }, + { + "epoch": 0.40648370378100107, + "grad_norm": 0.875, + "learning_rate": 0.0013443247657067342, + "loss": 0.828, + "step": 5843 + }, + { + "epoch": 0.4065532714181363, + "grad_norm": 1.0, + "learning_rate": 0.0013441132029461268, + "loss": 0.844, + "step": 5844 + }, + { + "epoch": 0.40662283905527147, + "grad_norm": 0.85546875, + "learning_rate": 0.0013439016227131857, + "loss": 0.7064, + "step": 5845 + }, + { + "epoch": 0.4066924066924067, + "grad_norm": 0.99609375, + "learning_rate": 0.0013436900250186536, + "loss": 0.6783, + "step": 5846 + }, + { + "epoch": 0.4067619743295419, + "grad_norm": 1.0390625, + "learning_rate": 0.0013434784098732742, + "loss": 0.8343, + "step": 5847 + }, + { + "epoch": 0.4068315419666771, + "grad_norm": 1.1953125, + "learning_rate": 0.0013432667772877926, + "loss": 0.8044, + "step": 5848 + }, + { + "epoch": 0.4069011096038123, + "grad_norm": 0.98828125, + "learning_rate": 0.0013430551272729538, + "loss": 0.8037, + "step": 5849 + }, + { + "epoch": 0.4069706772409475, + "grad_norm": 0.9765625, + "learning_rate": 0.0013428434598395055, + "loss": 0.7308, + "step": 5850 + }, + { + "epoch": 0.4070402448780827, + "grad_norm": 1.421875, + "learning_rate": 0.0013426317749981936, + "loss": 1.059, + "step": 5851 + }, + { + "epoch": 0.40710981251521794, + "grad_norm": 1.078125, + "learning_rate": 0.0013424200727597678, + "loss": 0.7579, + "step": 5852 + }, + { + "epoch": 0.4071793801523531, + "grad_norm": 1.015625, + "learning_rate": 0.0013422083531349762, + "loss": 0.7347, + "step": 5853 + }, + { + "epoch": 0.40724894778948834, + "grad_norm": 1.140625, + "learning_rate": 0.0013419966161345694, + "loss": 0.9378, + "step": 5854 + }, + { + "epoch": 0.40731851542662356, + "grad_norm": 0.96875, + "learning_rate": 0.0013417848617692984, + "loss": 0.7056, + "step": 5855 + }, + { + "epoch": 0.40738808306375873, + "grad_norm": 1.1640625, + "learning_rate": 0.0013415730900499146, + "loss": 0.6704, + "step": 5856 + }, + { + "epoch": 0.40745765070089396, + "grad_norm": 1.125, + "learning_rate": 0.0013413613009871713, + "loss": 0.8853, + "step": 5857 + }, + { + "epoch": 0.40752721833802913, + "grad_norm": 1.0390625, + "learning_rate": 0.001341149494591821, + "loss": 0.7417, + "step": 5858 + }, + { + "epoch": 0.40759678597516436, + "grad_norm": 1.3046875, + "learning_rate": 0.0013409376708746197, + "loss": 0.7576, + "step": 5859 + }, + { + "epoch": 0.4076663536122996, + "grad_norm": 1.078125, + "learning_rate": 0.0013407258298463215, + "loss": 0.8548, + "step": 5860 + }, + { + "epoch": 0.40773592124943475, + "grad_norm": 1.203125, + "learning_rate": 0.0013405139715176833, + "loss": 0.9098, + "step": 5861 + }, + { + "epoch": 0.40780548888657, + "grad_norm": 1.2109375, + "learning_rate": 0.0013403020958994616, + "loss": 0.7366, + "step": 5862 + }, + { + "epoch": 0.40787505652370515, + "grad_norm": 0.95703125, + "learning_rate": 0.0013400902030024147, + "loss": 0.7845, + "step": 5863 + }, + { + "epoch": 0.4079446241608404, + "grad_norm": 1.515625, + "learning_rate": 0.0013398782928373018, + "loss": 1.0852, + "step": 5864 + }, + { + "epoch": 0.4080141917979756, + "grad_norm": 1.15625, + "learning_rate": 0.0013396663654148822, + "loss": 1.0065, + "step": 5865 + }, + { + "epoch": 0.40808375943511077, + "grad_norm": 1.0390625, + "learning_rate": 0.0013394544207459167, + "loss": 0.7465, + "step": 5866 + }, + { + "epoch": 0.408153327072246, + "grad_norm": 1.1484375, + "learning_rate": 0.0013392424588411665, + "loss": 0.871, + "step": 5867 + }, + { + "epoch": 0.4082228947093812, + "grad_norm": 1.40625, + "learning_rate": 0.0013390304797113943, + "loss": 0.708, + "step": 5868 + }, + { + "epoch": 0.4082924623465164, + "grad_norm": 1.0234375, + "learning_rate": 0.0013388184833673631, + "loss": 0.6567, + "step": 5869 + }, + { + "epoch": 0.4083620299836516, + "grad_norm": 1.046875, + "learning_rate": 0.001338606469819837, + "loss": 0.7794, + "step": 5870 + }, + { + "epoch": 0.4084315976207868, + "grad_norm": 0.890625, + "learning_rate": 0.0013383944390795812, + "loss": 0.899, + "step": 5871 + }, + { + "epoch": 0.408501165257922, + "grad_norm": 1.1640625, + "learning_rate": 0.001338182391157361, + "loss": 0.8562, + "step": 5872 + }, + { + "epoch": 0.40857073289505724, + "grad_norm": 1.140625, + "learning_rate": 0.0013379703260639442, + "loss": 0.9008, + "step": 5873 + }, + { + "epoch": 0.4086403005321924, + "grad_norm": 1.0078125, + "learning_rate": 0.0013377582438100972, + "loss": 0.8773, + "step": 5874 + }, + { + "epoch": 0.40870986816932764, + "grad_norm": 0.953125, + "learning_rate": 0.0013375461444065896, + "loss": 0.7937, + "step": 5875 + }, + { + "epoch": 0.4087794358064628, + "grad_norm": 1.4140625, + "learning_rate": 0.0013373340278641894, + "loss": 0.7907, + "step": 5876 + }, + { + "epoch": 0.40884900344359804, + "grad_norm": 1.1015625, + "learning_rate": 0.0013371218941936683, + "loss": 0.8613, + "step": 5877 + }, + { + "epoch": 0.40891857108073326, + "grad_norm": 1.0546875, + "learning_rate": 0.0013369097434057964, + "loss": 0.7615, + "step": 5878 + }, + { + "epoch": 0.40898813871786843, + "grad_norm": 0.98828125, + "learning_rate": 0.0013366975755113456, + "loss": 0.9024, + "step": 5879 + }, + { + "epoch": 0.40905770635500366, + "grad_norm": 1.125, + "learning_rate": 0.0013364853905210893, + "loss": 0.7378, + "step": 5880 + }, + { + "epoch": 0.4091272739921389, + "grad_norm": 1.140625, + "learning_rate": 0.0013362731884458006, + "loss": 0.7813, + "step": 5881 + }, + { + "epoch": 0.40919684162927406, + "grad_norm": 1.2421875, + "learning_rate": 0.0013360609692962546, + "loss": 1.0286, + "step": 5882 + }, + { + "epoch": 0.4092664092664093, + "grad_norm": 1.1484375, + "learning_rate": 0.001335848733083226, + "loss": 0.7492, + "step": 5883 + }, + { + "epoch": 0.40933597690354445, + "grad_norm": 1.0703125, + "learning_rate": 0.001335636479817492, + "loss": 0.8455, + "step": 5884 + }, + { + "epoch": 0.4094055445406797, + "grad_norm": 1.0234375, + "learning_rate": 0.0013354242095098294, + "loss": 0.9679, + "step": 5885 + }, + { + "epoch": 0.4094751121778149, + "grad_norm": 1.046875, + "learning_rate": 0.0013352119221710158, + "loss": 0.9595, + "step": 5886 + }, + { + "epoch": 0.4095446798149501, + "grad_norm": 1.0078125, + "learning_rate": 0.0013349996178118305, + "loss": 0.8342, + "step": 5887 + }, + { + "epoch": 0.4096142474520853, + "grad_norm": 1.1484375, + "learning_rate": 0.0013347872964430527, + "loss": 0.7592, + "step": 5888 + }, + { + "epoch": 0.40968381508922047, + "grad_norm": 1.125, + "learning_rate": 0.0013345749580754643, + "loss": 0.9402, + "step": 5889 + }, + { + "epoch": 0.4097533827263557, + "grad_norm": 1.046875, + "learning_rate": 0.0013343626027198451, + "loss": 0.8261, + "step": 5890 + }, + { + "epoch": 0.4098229503634909, + "grad_norm": 1.359375, + "learning_rate": 0.0013341502303869787, + "loss": 0.8737, + "step": 5891 + }, + { + "epoch": 0.4098925180006261, + "grad_norm": 1.09375, + "learning_rate": 0.0013339378410876478, + "loss": 0.8406, + "step": 5892 + }, + { + "epoch": 0.4099620856377613, + "grad_norm": 1.078125, + "learning_rate": 0.0013337254348326363, + "loss": 0.8002, + "step": 5893 + }, + { + "epoch": 0.4100316532748965, + "grad_norm": 1.3125, + "learning_rate": 0.0013335130116327296, + "loss": 0.7979, + "step": 5894 + }, + { + "epoch": 0.4101012209120317, + "grad_norm": 1.3515625, + "learning_rate": 0.0013333005714987127, + "loss": 0.9424, + "step": 5895 + }, + { + "epoch": 0.41017078854916694, + "grad_norm": 1.3046875, + "learning_rate": 0.0013330881144413733, + "loss": 0.9167, + "step": 5896 + }, + { + "epoch": 0.4102403561863021, + "grad_norm": 1.2734375, + "learning_rate": 0.0013328756404714982, + "loss": 1.0018, + "step": 5897 + }, + { + "epoch": 0.41030992382343734, + "grad_norm": 1.0234375, + "learning_rate": 0.0013326631495998759, + "loss": 0.8047, + "step": 5898 + }, + { + "epoch": 0.41037949146057257, + "grad_norm": 1.0078125, + "learning_rate": 0.0013324506418372953, + "loss": 0.918, + "step": 5899 + }, + { + "epoch": 0.41044905909770774, + "grad_norm": 1.0859375, + "learning_rate": 0.001332238117194547, + "loss": 0.9236, + "step": 5900 + }, + { + "epoch": 0.41051862673484296, + "grad_norm": 1.0546875, + "learning_rate": 0.001332025575682422, + "loss": 0.9684, + "step": 5901 + }, + { + "epoch": 0.41058819437197813, + "grad_norm": 1.6484375, + "learning_rate": 0.0013318130173117111, + "loss": 1.0373, + "step": 5902 + }, + { + "epoch": 0.41065776200911336, + "grad_norm": 1.359375, + "learning_rate": 0.0013316004420932085, + "loss": 0.8822, + "step": 5903 + }, + { + "epoch": 0.4107273296462486, + "grad_norm": 0.890625, + "learning_rate": 0.001331387850037706, + "loss": 0.7408, + "step": 5904 + }, + { + "epoch": 0.41079689728338376, + "grad_norm": 1.0859375, + "learning_rate": 0.0013311752411559994, + "loss": 0.8438, + "step": 5905 + }, + { + "epoch": 0.410866464920519, + "grad_norm": 1.1171875, + "learning_rate": 0.001330962615458883, + "loss": 0.8473, + "step": 5906 + }, + { + "epoch": 0.41093603255765415, + "grad_norm": 1.171875, + "learning_rate": 0.0013307499729571532, + "loss": 0.784, + "step": 5907 + }, + { + "epoch": 0.4110056001947894, + "grad_norm": 1.3984375, + "learning_rate": 0.001330537313661607, + "loss": 1.2242, + "step": 5908 + }, + { + "epoch": 0.4110751678319246, + "grad_norm": 1.0390625, + "learning_rate": 0.001330324637583042, + "loss": 0.7399, + "step": 5909 + }, + { + "epoch": 0.4111447354690598, + "grad_norm": 1.21875, + "learning_rate": 0.001330111944732257, + "loss": 0.7732, + "step": 5910 + }, + { + "epoch": 0.411214303106195, + "grad_norm": 1.3671875, + "learning_rate": 0.0013298992351200509, + "loss": 1.0661, + "step": 5911 + }, + { + "epoch": 0.4112838707433302, + "grad_norm": 1.2421875, + "learning_rate": 0.001329686508757225, + "loss": 0.6823, + "step": 5912 + }, + { + "epoch": 0.4113534383804654, + "grad_norm": 1.0390625, + "learning_rate": 0.0013294737656545795, + "loss": 0.8745, + "step": 5913 + }, + { + "epoch": 0.4114230060176006, + "grad_norm": 1.015625, + "learning_rate": 0.0013292610058229168, + "loss": 0.7367, + "step": 5914 + }, + { + "epoch": 0.4114925736547358, + "grad_norm": 0.87890625, + "learning_rate": 0.0013290482292730402, + "loss": 0.9285, + "step": 5915 + }, + { + "epoch": 0.411562141291871, + "grad_norm": 1.2578125, + "learning_rate": 0.0013288354360157528, + "loss": 0.7864, + "step": 5916 + }, + { + "epoch": 0.41163170892900625, + "grad_norm": 1.1015625, + "learning_rate": 0.0013286226260618597, + "loss": 0.8476, + "step": 5917 + }, + { + "epoch": 0.4117012765661414, + "grad_norm": 1.34375, + "learning_rate": 0.0013284097994221656, + "loss": 0.7926, + "step": 5918 + }, + { + "epoch": 0.41177084420327664, + "grad_norm": 1.03125, + "learning_rate": 0.0013281969561074775, + "loss": 1.0299, + "step": 5919 + }, + { + "epoch": 0.4118404118404118, + "grad_norm": 1.1328125, + "learning_rate": 0.001327984096128602, + "loss": 0.9616, + "step": 5920 + }, + { + "epoch": 0.41190997947754704, + "grad_norm": 1.2265625, + "learning_rate": 0.0013277712194963475, + "loss": 0.9856, + "step": 5921 + }, + { + "epoch": 0.41197954711468227, + "grad_norm": 1.265625, + "learning_rate": 0.0013275583262215224, + "loss": 1.2096, + "step": 5922 + }, + { + "epoch": 0.41204911475181744, + "grad_norm": 1.15625, + "learning_rate": 0.0013273454163149365, + "loss": 0.8468, + "step": 5923 + }, + { + "epoch": 0.41211868238895266, + "grad_norm": 1.5625, + "learning_rate": 0.0013271324897874007, + "loss": 1.0125, + "step": 5924 + }, + { + "epoch": 0.4121882500260879, + "grad_norm": 1.015625, + "learning_rate": 0.0013269195466497252, + "loss": 0.728, + "step": 5925 + }, + { + "epoch": 0.41225781766322306, + "grad_norm": 1.1796875, + "learning_rate": 0.0013267065869127235, + "loss": 1.0771, + "step": 5926 + }, + { + "epoch": 0.4123273853003583, + "grad_norm": 1.0, + "learning_rate": 0.0013264936105872077, + "loss": 0.6939, + "step": 5927 + }, + { + "epoch": 0.41239695293749346, + "grad_norm": 1.1171875, + "learning_rate": 0.001326280617683992, + "loss": 0.8318, + "step": 5928 + }, + { + "epoch": 0.4124665205746287, + "grad_norm": 0.9296875, + "learning_rate": 0.0013260676082138914, + "loss": 0.8941, + "step": 5929 + }, + { + "epoch": 0.4125360882117639, + "grad_norm": 1.234375, + "learning_rate": 0.001325854582187721, + "loss": 0.8441, + "step": 5930 + }, + { + "epoch": 0.4126056558488991, + "grad_norm": 0.9296875, + "learning_rate": 0.0013256415396162976, + "loss": 0.9476, + "step": 5931 + }, + { + "epoch": 0.4126752234860343, + "grad_norm": 0.96875, + "learning_rate": 0.0013254284805104377, + "loss": 0.8693, + "step": 5932 + }, + { + "epoch": 0.4127447911231695, + "grad_norm": 1.46875, + "learning_rate": 0.0013252154048809604, + "loss": 1.0098, + "step": 5933 + }, + { + "epoch": 0.4128143587603047, + "grad_norm": 0.85546875, + "learning_rate": 0.0013250023127386835, + "loss": 0.5385, + "step": 5934 + }, + { + "epoch": 0.4128839263974399, + "grad_norm": 0.95703125, + "learning_rate": 0.0013247892040944276, + "loss": 0.7623, + "step": 5935 + }, + { + "epoch": 0.4129534940345751, + "grad_norm": 0.90234375, + "learning_rate": 0.001324576078959013, + "loss": 0.6963, + "step": 5936 + }, + { + "epoch": 0.4130230616717103, + "grad_norm": 1.1875, + "learning_rate": 0.0013243629373432609, + "loss": 0.827, + "step": 5937 + }, + { + "epoch": 0.41309262930884555, + "grad_norm": 1.1171875, + "learning_rate": 0.0013241497792579938, + "loss": 0.8957, + "step": 5938 + }, + { + "epoch": 0.4131621969459807, + "grad_norm": 1.1796875, + "learning_rate": 0.0013239366047140347, + "loss": 0.8064, + "step": 5939 + }, + { + "epoch": 0.41323176458311595, + "grad_norm": 1.2734375, + "learning_rate": 0.001323723413722208, + "loss": 0.8238, + "step": 5940 + }, + { + "epoch": 0.4133013322202511, + "grad_norm": 0.9296875, + "learning_rate": 0.0013235102062933372, + "loss": 0.8791, + "step": 5941 + }, + { + "epoch": 0.41337089985738634, + "grad_norm": 1.4765625, + "learning_rate": 0.0013232969824382497, + "loss": 0.7921, + "step": 5942 + }, + { + "epoch": 0.41344046749452157, + "grad_norm": 1.078125, + "learning_rate": 0.0013230837421677702, + "loss": 0.8456, + "step": 5943 + }, + { + "epoch": 0.41351003513165674, + "grad_norm": 1.1171875, + "learning_rate": 0.0013228704854927268, + "loss": 0.8593, + "step": 5944 + }, + { + "epoch": 0.41357960276879197, + "grad_norm": 0.85546875, + "learning_rate": 0.001322657212423948, + "loss": 0.8399, + "step": 5945 + }, + { + "epoch": 0.41364917040592714, + "grad_norm": 0.96875, + "learning_rate": 0.001322443922972262, + "loss": 0.9466, + "step": 5946 + }, + { + "epoch": 0.41371873804306236, + "grad_norm": 0.97265625, + "learning_rate": 0.001322230617148499, + "loss": 0.8536, + "step": 5947 + }, + { + "epoch": 0.4137883056801976, + "grad_norm": 0.93359375, + "learning_rate": 0.0013220172949634892, + "loss": 0.8135, + "step": 5948 + }, + { + "epoch": 0.41385787331733276, + "grad_norm": 1.015625, + "learning_rate": 0.0013218039564280647, + "loss": 0.7584, + "step": 5949 + }, + { + "epoch": 0.413927440954468, + "grad_norm": 1.0390625, + "learning_rate": 0.0013215906015530568, + "loss": 0.7815, + "step": 5950 + }, + { + "epoch": 0.4139970085916032, + "grad_norm": 1.484375, + "learning_rate": 0.001321377230349299, + "loss": 1.0563, + "step": 5951 + }, + { + "epoch": 0.4140665762287384, + "grad_norm": 1.2421875, + "learning_rate": 0.0013211638428276256, + "loss": 0.962, + "step": 5952 + }, + { + "epoch": 0.4141361438658736, + "grad_norm": 1.140625, + "learning_rate": 0.0013209504389988709, + "loss": 0.7863, + "step": 5953 + }, + { + "epoch": 0.4142057115030088, + "grad_norm": 1.2578125, + "learning_rate": 0.0013207370188738708, + "loss": 0.9184, + "step": 5954 + }, + { + "epoch": 0.414275279140144, + "grad_norm": 0.96875, + "learning_rate": 0.0013205235824634615, + "loss": 0.8869, + "step": 5955 + }, + { + "epoch": 0.41434484677727923, + "grad_norm": 1.3515625, + "learning_rate": 0.0013203101297784804, + "loss": 0.786, + "step": 5956 + }, + { + "epoch": 0.4144144144144144, + "grad_norm": 1.1171875, + "learning_rate": 0.0013200966608297648, + "loss": 0.9513, + "step": 5957 + }, + { + "epoch": 0.4144839820515496, + "grad_norm": 1.109375, + "learning_rate": 0.0013198831756281546, + "loss": 1.0198, + "step": 5958 + }, + { + "epoch": 0.4145535496886848, + "grad_norm": 1.0234375, + "learning_rate": 0.001319669674184489, + "loss": 0.6842, + "step": 5959 + }, + { + "epoch": 0.41462311732582, + "grad_norm": 1.5, + "learning_rate": 0.0013194561565096085, + "loss": 1.0767, + "step": 5960 + }, + { + "epoch": 0.41469268496295525, + "grad_norm": 1.4140625, + "learning_rate": 0.0013192426226143548, + "loss": 0.9905, + "step": 5961 + }, + { + "epoch": 0.4147622526000904, + "grad_norm": 1.2109375, + "learning_rate": 0.0013190290725095695, + "loss": 0.8643, + "step": 5962 + }, + { + "epoch": 0.41483182023722565, + "grad_norm": 1.1171875, + "learning_rate": 0.0013188155062060962, + "loss": 0.6039, + "step": 5963 + }, + { + "epoch": 0.41490138787436087, + "grad_norm": 1.0625, + "learning_rate": 0.0013186019237147785, + "loss": 0.8251, + "step": 5964 + }, + { + "epoch": 0.41497095551149604, + "grad_norm": 0.94140625, + "learning_rate": 0.0013183883250464606, + "loss": 0.8622, + "step": 5965 + }, + { + "epoch": 0.41504052314863127, + "grad_norm": 1.1015625, + "learning_rate": 0.0013181747102119887, + "loss": 0.8189, + "step": 5966 + }, + { + "epoch": 0.41511009078576644, + "grad_norm": 1.4765625, + "learning_rate": 0.0013179610792222085, + "loss": 0.936, + "step": 5967 + }, + { + "epoch": 0.41517965842290167, + "grad_norm": 0.9609375, + "learning_rate": 0.0013177474320879674, + "loss": 0.857, + "step": 5968 + }, + { + "epoch": 0.4152492260600369, + "grad_norm": 1.0234375, + "learning_rate": 0.0013175337688201135, + "loss": 0.9093, + "step": 5969 + }, + { + "epoch": 0.41531879369717206, + "grad_norm": 0.94921875, + "learning_rate": 0.001317320089429495, + "loss": 0.8407, + "step": 5970 + }, + { + "epoch": 0.4153883613343073, + "grad_norm": 1.03125, + "learning_rate": 0.001317106393926962, + "loss": 0.7999, + "step": 5971 + }, + { + "epoch": 0.41545792897144246, + "grad_norm": 0.9921875, + "learning_rate": 0.0013168926823233645, + "loss": 0.9324, + "step": 5972 + }, + { + "epoch": 0.4155274966085777, + "grad_norm": 1.4609375, + "learning_rate": 0.001316678954629554, + "loss": 1.0212, + "step": 5973 + }, + { + "epoch": 0.4155970642457129, + "grad_norm": 1.3828125, + "learning_rate": 0.0013164652108563822, + "loss": 0.9504, + "step": 5974 + }, + { + "epoch": 0.4156666318828481, + "grad_norm": 1.3671875, + "learning_rate": 0.0013162514510147022, + "loss": 0.8767, + "step": 5975 + }, + { + "epoch": 0.4157361995199833, + "grad_norm": 1.359375, + "learning_rate": 0.0013160376751153674, + "loss": 0.7471, + "step": 5976 + }, + { + "epoch": 0.41580576715711853, + "grad_norm": 1.1015625, + "learning_rate": 0.0013158238831692324, + "loss": 0.9097, + "step": 5977 + }, + { + "epoch": 0.4158753347942537, + "grad_norm": 1.359375, + "learning_rate": 0.0013156100751871528, + "loss": 1.1735, + "step": 5978 + }, + { + "epoch": 0.41594490243138893, + "grad_norm": 1.1640625, + "learning_rate": 0.0013153962511799843, + "loss": 0.8713, + "step": 5979 + }, + { + "epoch": 0.4160144700685241, + "grad_norm": 1.3984375, + "learning_rate": 0.0013151824111585836, + "loss": 1.0516, + "step": 5980 + }, + { + "epoch": 0.4160840377056593, + "grad_norm": 1.0703125, + "learning_rate": 0.0013149685551338086, + "loss": 0.796, + "step": 5981 + }, + { + "epoch": 0.41615360534279455, + "grad_norm": 0.9296875, + "learning_rate": 0.0013147546831165182, + "loss": 0.9837, + "step": 5982 + }, + { + "epoch": 0.4162231729799297, + "grad_norm": 1.03125, + "learning_rate": 0.0013145407951175717, + "loss": 0.751, + "step": 5983 + }, + { + "epoch": 0.41629274061706495, + "grad_norm": 0.99609375, + "learning_rate": 0.0013143268911478287, + "loss": 0.7369, + "step": 5984 + }, + { + "epoch": 0.4163623082542001, + "grad_norm": 1.1875, + "learning_rate": 0.0013141129712181505, + "loss": 0.9013, + "step": 5985 + }, + { + "epoch": 0.41643187589133535, + "grad_norm": 0.92578125, + "learning_rate": 0.0013138990353393988, + "loss": 1.1253, + "step": 5986 + }, + { + "epoch": 0.41650144352847057, + "grad_norm": 1.078125, + "learning_rate": 0.0013136850835224366, + "loss": 0.6227, + "step": 5987 + }, + { + "epoch": 0.41657101116560574, + "grad_norm": 0.98046875, + "learning_rate": 0.0013134711157781268, + "loss": 0.6232, + "step": 5988 + }, + { + "epoch": 0.41664057880274097, + "grad_norm": 1.25, + "learning_rate": 0.0013132571321173337, + "loss": 0.7733, + "step": 5989 + }, + { + "epoch": 0.4167101464398762, + "grad_norm": 1.375, + "learning_rate": 0.0013130431325509221, + "loss": 0.9784, + "step": 5990 + }, + { + "epoch": 0.41677971407701137, + "grad_norm": 0.91796875, + "learning_rate": 0.0013128291170897584, + "loss": 0.8261, + "step": 5991 + }, + { + "epoch": 0.4168492817141466, + "grad_norm": 1.078125, + "learning_rate": 0.0013126150857447087, + "loss": 0.8843, + "step": 5992 + }, + { + "epoch": 0.41691884935128176, + "grad_norm": 1.2109375, + "learning_rate": 0.001312401038526641, + "loss": 0.7726, + "step": 5993 + }, + { + "epoch": 0.416988416988417, + "grad_norm": 1.25, + "learning_rate": 0.0013121869754464228, + "loss": 0.9126, + "step": 5994 + }, + { + "epoch": 0.4170579846255522, + "grad_norm": 1.3671875, + "learning_rate": 0.0013119728965149237, + "loss": 1.1141, + "step": 5995 + }, + { + "epoch": 0.4171275522626874, + "grad_norm": 1.28125, + "learning_rate": 0.0013117588017430134, + "loss": 0.9249, + "step": 5996 + }, + { + "epoch": 0.4171971198998226, + "grad_norm": 1.046875, + "learning_rate": 0.0013115446911415626, + "loss": 0.773, + "step": 5997 + }, + { + "epoch": 0.4172666875369578, + "grad_norm": 1.1015625, + "learning_rate": 0.0013113305647214424, + "loss": 0.7567, + "step": 5998 + }, + { + "epoch": 0.417336255174093, + "grad_norm": 1.1640625, + "learning_rate": 0.0013111164224935256, + "loss": 0.8103, + "step": 5999 + }, + { + "epoch": 0.41740582281122823, + "grad_norm": 0.96484375, + "learning_rate": 0.001310902264468685, + "loss": 0.6928, + "step": 6000 + }, + { + "epoch": 0.4174753904483634, + "grad_norm": 0.90234375, + "learning_rate": 0.0013106880906577944, + "loss": 0.6852, + "step": 6001 + }, + { + "epoch": 0.41754495808549863, + "grad_norm": 1.125, + "learning_rate": 0.0013104739010717287, + "loss": 0.8544, + "step": 6002 + }, + { + "epoch": 0.41761452572263386, + "grad_norm": 1.296875, + "learning_rate": 0.0013102596957213631, + "loss": 0.8348, + "step": 6003 + }, + { + "epoch": 0.417684093359769, + "grad_norm": 1.1015625, + "learning_rate": 0.0013100454746175739, + "loss": 0.8061, + "step": 6004 + }, + { + "epoch": 0.41775366099690425, + "grad_norm": 1.109375, + "learning_rate": 0.0013098312377712383, + "loss": 0.7722, + "step": 6005 + }, + { + "epoch": 0.4178232286340394, + "grad_norm": 0.98828125, + "learning_rate": 0.0013096169851932338, + "loss": 1.0377, + "step": 6006 + }, + { + "epoch": 0.41789279627117465, + "grad_norm": 0.9921875, + "learning_rate": 0.0013094027168944397, + "loss": 0.8373, + "step": 6007 + }, + { + "epoch": 0.4179623639083099, + "grad_norm": 1.140625, + "learning_rate": 0.001309188432885735, + "loss": 1.1225, + "step": 6008 + }, + { + "epoch": 0.41803193154544505, + "grad_norm": 1.2578125, + "learning_rate": 0.0013089741331780004, + "loss": 0.9004, + "step": 6009 + }, + { + "epoch": 0.41810149918258027, + "grad_norm": 1.109375, + "learning_rate": 0.0013087598177821166, + "loss": 0.8053, + "step": 6010 + }, + { + "epoch": 0.41817106681971544, + "grad_norm": 1.5, + "learning_rate": 0.0013085454867089652, + "loss": 0.8599, + "step": 6011 + }, + { + "epoch": 0.41824063445685067, + "grad_norm": 1.125, + "learning_rate": 0.0013083311399694293, + "loss": 0.7888, + "step": 6012 + }, + { + "epoch": 0.4183102020939859, + "grad_norm": 1.09375, + "learning_rate": 0.0013081167775743925, + "loss": 0.814, + "step": 6013 + }, + { + "epoch": 0.41837976973112107, + "grad_norm": 1.2421875, + "learning_rate": 0.0013079023995347385, + "loss": 0.9164, + "step": 6014 + }, + { + "epoch": 0.4184493373682563, + "grad_norm": 1.265625, + "learning_rate": 0.0013076880058613524, + "loss": 0.8386, + "step": 6015 + }, + { + "epoch": 0.4185189050053915, + "grad_norm": 1.203125, + "learning_rate": 0.0013074735965651206, + "loss": 0.7152, + "step": 6016 + }, + { + "epoch": 0.4185884726425267, + "grad_norm": 0.93359375, + "learning_rate": 0.0013072591716569294, + "loss": 0.6224, + "step": 6017 + }, + { + "epoch": 0.4186580402796619, + "grad_norm": 0.96484375, + "learning_rate": 0.001307044731147666, + "loss": 0.902, + "step": 6018 + }, + { + "epoch": 0.4187276079167971, + "grad_norm": 1.375, + "learning_rate": 0.0013068302750482185, + "loss": 0.7997, + "step": 6019 + }, + { + "epoch": 0.4187971755539323, + "grad_norm": 1.1640625, + "learning_rate": 0.0013066158033694763, + "loss": 0.9071, + "step": 6020 + }, + { + "epoch": 0.41886674319106754, + "grad_norm": 1.0703125, + "learning_rate": 0.0013064013161223293, + "loss": 0.913, + "step": 6021 + }, + { + "epoch": 0.4189363108282027, + "grad_norm": 1.1171875, + "learning_rate": 0.0013061868133176678, + "loss": 0.9266, + "step": 6022 + }, + { + "epoch": 0.41900587846533793, + "grad_norm": 0.87109375, + "learning_rate": 0.001305972294966383, + "loss": 0.9229, + "step": 6023 + }, + { + "epoch": 0.4190754461024731, + "grad_norm": 0.96484375, + "learning_rate": 0.0013057577610793673, + "loss": 0.8808, + "step": 6024 + }, + { + "epoch": 0.41914501373960833, + "grad_norm": 1.1171875, + "learning_rate": 0.001305543211667514, + "loss": 0.8094, + "step": 6025 + }, + { + "epoch": 0.41921458137674356, + "grad_norm": 1.3515625, + "learning_rate": 0.001305328646741716, + "loss": 0.8917, + "step": 6026 + }, + { + "epoch": 0.4192841490138787, + "grad_norm": 1.1796875, + "learning_rate": 0.0013051140663128686, + "loss": 0.8215, + "step": 6027 + }, + { + "epoch": 0.41935371665101395, + "grad_norm": 1.1640625, + "learning_rate": 0.0013048994703918667, + "loss": 0.869, + "step": 6028 + }, + { + "epoch": 0.4194232842881492, + "grad_norm": 1.1953125, + "learning_rate": 0.0013046848589896066, + "loss": 0.9581, + "step": 6029 + }, + { + "epoch": 0.41949285192528435, + "grad_norm": 1.0703125, + "learning_rate": 0.0013044702321169848, + "loss": 0.7993, + "step": 6030 + }, + { + "epoch": 0.4195624195624196, + "grad_norm": 1.265625, + "learning_rate": 0.0013042555897848996, + "loss": 0.9372, + "step": 6031 + }, + { + "epoch": 0.41963198719955475, + "grad_norm": 1.21875, + "learning_rate": 0.0013040409320042488, + "loss": 0.7678, + "step": 6032 + }, + { + "epoch": 0.41970155483668997, + "grad_norm": 1.0078125, + "learning_rate": 0.0013038262587859323, + "loss": 0.9445, + "step": 6033 + }, + { + "epoch": 0.4197711224738252, + "grad_norm": 1.53125, + "learning_rate": 0.0013036115701408493, + "loss": 0.631, + "step": 6034 + }, + { + "epoch": 0.41984069011096037, + "grad_norm": 1.5234375, + "learning_rate": 0.0013033968660799014, + "loss": 1.0625, + "step": 6035 + }, + { + "epoch": 0.4199102577480956, + "grad_norm": 1.09375, + "learning_rate": 0.00130318214661399, + "loss": 0.6111, + "step": 6036 + }, + { + "epoch": 0.41997982538523077, + "grad_norm": 0.921875, + "learning_rate": 0.001302967411754017, + "loss": 0.8909, + "step": 6037 + }, + { + "epoch": 0.420049393022366, + "grad_norm": 1.1015625, + "learning_rate": 0.0013027526615108863, + "loss": 0.8946, + "step": 6038 + }, + { + "epoch": 0.4201189606595012, + "grad_norm": 0.9296875, + "learning_rate": 0.001302537895895501, + "loss": 0.8895, + "step": 6039 + }, + { + "epoch": 0.4201885282966364, + "grad_norm": 1.28125, + "learning_rate": 0.0013023231149187663, + "loss": 0.9964, + "step": 6040 + }, + { + "epoch": 0.4202580959337716, + "grad_norm": 1.4609375, + "learning_rate": 0.0013021083185915882, + "loss": 1.0044, + "step": 6041 + }, + { + "epoch": 0.42032766357090684, + "grad_norm": 0.95703125, + "learning_rate": 0.0013018935069248718, + "loss": 0.7365, + "step": 6042 + }, + { + "epoch": 0.420397231208042, + "grad_norm": 0.87890625, + "learning_rate": 0.0013016786799295251, + "loss": 0.7353, + "step": 6043 + }, + { + "epoch": 0.42046679884517724, + "grad_norm": 1.03125, + "learning_rate": 0.0013014638376164555, + "loss": 0.7773, + "step": 6044 + }, + { + "epoch": 0.4205363664823124, + "grad_norm": 1.171875, + "learning_rate": 0.0013012489799965716, + "loss": 1.0401, + "step": 6045 + }, + { + "epoch": 0.42060593411944763, + "grad_norm": 1.0234375, + "learning_rate": 0.001301034107080783, + "loss": 0.9157, + "step": 6046 + }, + { + "epoch": 0.42067550175658286, + "grad_norm": 0.9921875, + "learning_rate": 0.00130081921888, + "loss": 0.7776, + "step": 6047 + }, + { + "epoch": 0.42074506939371803, + "grad_norm": 1.203125, + "learning_rate": 0.0013006043154051331, + "loss": 0.9943, + "step": 6048 + }, + { + "epoch": 0.42081463703085326, + "grad_norm": 1.1640625, + "learning_rate": 0.0013003893966670942, + "loss": 1.0388, + "step": 6049 + }, + { + "epoch": 0.4208842046679884, + "grad_norm": 1.265625, + "learning_rate": 0.0013001744626767958, + "loss": 0.7578, + "step": 6050 + }, + { + "epoch": 0.42095377230512365, + "grad_norm": 1.3203125, + "learning_rate": 0.0012999595134451512, + "loss": 0.9979, + "step": 6051 + }, + { + "epoch": 0.4210233399422589, + "grad_norm": 1.84375, + "learning_rate": 0.0012997445489830745, + "loss": 0.855, + "step": 6052 + }, + { + "epoch": 0.42109290757939405, + "grad_norm": 1.1875, + "learning_rate": 0.0012995295693014803, + "loss": 1.2088, + "step": 6053 + }, + { + "epoch": 0.4211624752165293, + "grad_norm": 1.1953125, + "learning_rate": 0.0012993145744112844, + "loss": 0.7431, + "step": 6054 + }, + { + "epoch": 0.4212320428536645, + "grad_norm": 0.92578125, + "learning_rate": 0.001299099564323403, + "loss": 1.0207, + "step": 6055 + }, + { + "epoch": 0.42130161049079967, + "grad_norm": 1.1953125, + "learning_rate": 0.0012988845390487533, + "loss": 0.898, + "step": 6056 + }, + { + "epoch": 0.4213711781279349, + "grad_norm": 1.3671875, + "learning_rate": 0.0012986694985982533, + "loss": 1.0087, + "step": 6057 + }, + { + "epoch": 0.42144074576507007, + "grad_norm": 1.2265625, + "learning_rate": 0.0012984544429828215, + "loss": 1.1273, + "step": 6058 + }, + { + "epoch": 0.4215103134022053, + "grad_norm": 1.0625, + "learning_rate": 0.0012982393722133774, + "loss": 0.7767, + "step": 6059 + }, + { + "epoch": 0.4215798810393405, + "grad_norm": 0.9375, + "learning_rate": 0.0012980242863008412, + "loss": 0.6377, + "step": 6060 + }, + { + "epoch": 0.4216494486764757, + "grad_norm": 1.1484375, + "learning_rate": 0.001297809185256134, + "loss": 1.0128, + "step": 6061 + }, + { + "epoch": 0.4217190163136109, + "grad_norm": 0.93359375, + "learning_rate": 0.0012975940690901772, + "loss": 0.7818, + "step": 6062 + }, + { + "epoch": 0.4217885839507461, + "grad_norm": 0.96484375, + "learning_rate": 0.0012973789378138939, + "loss": 0.6688, + "step": 6063 + }, + { + "epoch": 0.4218581515878813, + "grad_norm": 1.4296875, + "learning_rate": 0.001297163791438207, + "loss": 0.8949, + "step": 6064 + }, + { + "epoch": 0.42192771922501654, + "grad_norm": 1.3515625, + "learning_rate": 0.0012969486299740402, + "loss": 0.7757, + "step": 6065 + }, + { + "epoch": 0.4219972868621517, + "grad_norm": 0.97265625, + "learning_rate": 0.001296733453432319, + "loss": 0.871, + "step": 6066 + }, + { + "epoch": 0.42206685449928694, + "grad_norm": 1.0390625, + "learning_rate": 0.0012965182618239685, + "loss": 0.8648, + "step": 6067 + }, + { + "epoch": 0.42213642213642216, + "grad_norm": 1.1953125, + "learning_rate": 0.0012963030551599154, + "loss": 0.9459, + "step": 6068 + }, + { + "epoch": 0.42220598977355733, + "grad_norm": 0.84765625, + "learning_rate": 0.0012960878334510864, + "loss": 0.5526, + "step": 6069 + }, + { + "epoch": 0.42227555741069256, + "grad_norm": 0.9921875, + "learning_rate": 0.00129587259670841, + "loss": 0.9363, + "step": 6070 + }, + { + "epoch": 0.42234512504782773, + "grad_norm": 1.3125, + "learning_rate": 0.001295657344942814, + "loss": 1.0081, + "step": 6071 + }, + { + "epoch": 0.42241469268496296, + "grad_norm": 1.2265625, + "learning_rate": 0.0012954420781652288, + "loss": 1.0101, + "step": 6072 + }, + { + "epoch": 0.4224842603220982, + "grad_norm": 1.0625, + "learning_rate": 0.0012952267963865839, + "loss": 1.1445, + "step": 6073 + }, + { + "epoch": 0.42255382795923335, + "grad_norm": 1.1171875, + "learning_rate": 0.00129501149961781, + "loss": 0.9124, + "step": 6074 + }, + { + "epoch": 0.4226233955963686, + "grad_norm": 1.25, + "learning_rate": 0.001294796187869839, + "loss": 0.8919, + "step": 6075 + }, + { + "epoch": 0.42269296323350375, + "grad_norm": 1.046875, + "learning_rate": 0.0012945808611536038, + "loss": 0.5887, + "step": 6076 + }, + { + "epoch": 0.422762530870639, + "grad_norm": 1.46875, + "learning_rate": 0.0012943655194800371, + "loss": 0.8486, + "step": 6077 + }, + { + "epoch": 0.4228320985077742, + "grad_norm": 1.1640625, + "learning_rate": 0.0012941501628600733, + "loss": 0.9304, + "step": 6078 + }, + { + "epoch": 0.42290166614490937, + "grad_norm": 1.09375, + "learning_rate": 0.0012939347913046466, + "loss": 0.6868, + "step": 6079 + }, + { + "epoch": 0.4229712337820446, + "grad_norm": 1.03125, + "learning_rate": 0.001293719404824693, + "loss": 0.7462, + "step": 6080 + }, + { + "epoch": 0.4230408014191798, + "grad_norm": 0.99609375, + "learning_rate": 0.0012935040034311482, + "loss": 0.8727, + "step": 6081 + }, + { + "epoch": 0.423110369056315, + "grad_norm": 0.98046875, + "learning_rate": 0.0012932885871349497, + "loss": 1.006, + "step": 6082 + }, + { + "epoch": 0.4231799366934502, + "grad_norm": 1.0234375, + "learning_rate": 0.0012930731559470346, + "loss": 0.6387, + "step": 6083 + }, + { + "epoch": 0.4232495043305854, + "grad_norm": 1.1875, + "learning_rate": 0.0012928577098783422, + "loss": 0.9473, + "step": 6084 + }, + { + "epoch": 0.4233190719677206, + "grad_norm": 1.296875, + "learning_rate": 0.0012926422489398114, + "loss": 0.8836, + "step": 6085 + }, + { + "epoch": 0.42338863960485584, + "grad_norm": 1.2890625, + "learning_rate": 0.0012924267731423823, + "loss": 0.8869, + "step": 6086 + }, + { + "epoch": 0.423458207241991, + "grad_norm": 1.109375, + "learning_rate": 0.0012922112824969953, + "loss": 0.7571, + "step": 6087 + }, + { + "epoch": 0.42352777487912624, + "grad_norm": 0.96484375, + "learning_rate": 0.0012919957770145924, + "loss": 0.7119, + "step": 6088 + }, + { + "epoch": 0.4235973425162614, + "grad_norm": 0.98046875, + "learning_rate": 0.001291780256706116, + "loss": 0.6646, + "step": 6089 + }, + { + "epoch": 0.42366691015339664, + "grad_norm": 1.0625, + "learning_rate": 0.0012915647215825082, + "loss": 0.7336, + "step": 6090 + }, + { + "epoch": 0.42373647779053186, + "grad_norm": 1.1875, + "learning_rate": 0.001291349171654714, + "loss": 0.8615, + "step": 6091 + }, + { + "epoch": 0.42380604542766703, + "grad_norm": 1.34375, + "learning_rate": 0.001291133606933677, + "loss": 0.8484, + "step": 6092 + }, + { + "epoch": 0.42387561306480226, + "grad_norm": 1.0703125, + "learning_rate": 0.0012909180274303432, + "loss": 0.8087, + "step": 6093 + }, + { + "epoch": 0.4239451807019375, + "grad_norm": 1.078125, + "learning_rate": 0.001290702433155658, + "loss": 0.6936, + "step": 6094 + }, + { + "epoch": 0.42401474833907266, + "grad_norm": 1.0859375, + "learning_rate": 0.0012904868241205686, + "loss": 0.8051, + "step": 6095 + }, + { + "epoch": 0.4240843159762079, + "grad_norm": 0.9453125, + "learning_rate": 0.0012902712003360227, + "loss": 0.7881, + "step": 6096 + }, + { + "epoch": 0.42415388361334305, + "grad_norm": 1.0859375, + "learning_rate": 0.001290055561812968, + "loss": 0.8346, + "step": 6097 + }, + { + "epoch": 0.4242234512504783, + "grad_norm": 1.15625, + "learning_rate": 0.0012898399085623537, + "loss": 0.9084, + "step": 6098 + }, + { + "epoch": 0.4242930188876135, + "grad_norm": 0.97265625, + "learning_rate": 0.00128962424059513, + "loss": 0.6617, + "step": 6099 + }, + { + "epoch": 0.4243625865247487, + "grad_norm": 1.0625, + "learning_rate": 0.0012894085579222472, + "loss": 0.8488, + "step": 6100 + }, + { + "epoch": 0.4244321541618839, + "grad_norm": 0.98046875, + "learning_rate": 0.0012891928605546564, + "loss": 0.7682, + "step": 6101 + }, + { + "epoch": 0.42450172179901907, + "grad_norm": 1.0390625, + "learning_rate": 0.00128897714850331, + "loss": 0.8503, + "step": 6102 + }, + { + "epoch": 0.4245712894361543, + "grad_norm": 3.171875, + "learning_rate": 0.0012887614217791605, + "loss": 0.905, + "step": 6103 + }, + { + "epoch": 0.4246408570732895, + "grad_norm": 1.2421875, + "learning_rate": 0.0012885456803931614, + "loss": 0.9455, + "step": 6104 + }, + { + "epoch": 0.4247104247104247, + "grad_norm": 1.234375, + "learning_rate": 0.0012883299243562673, + "loss": 1.0008, + "step": 6105 + }, + { + "epoch": 0.4247799923475599, + "grad_norm": 1.25, + "learning_rate": 0.0012881141536794322, + "loss": 0.8103, + "step": 6106 + }, + { + "epoch": 0.42484955998469515, + "grad_norm": 1.2265625, + "learning_rate": 0.001287898368373613, + "loss": 0.9041, + "step": 6107 + }, + { + "epoch": 0.4249191276218303, + "grad_norm": 1.4296875, + "learning_rate": 0.0012876825684497658, + "loss": 0.8348, + "step": 6108 + }, + { + "epoch": 0.42498869525896554, + "grad_norm": 1.2265625, + "learning_rate": 0.001287466753918848, + "loss": 1.0223, + "step": 6109 + }, + { + "epoch": 0.4250582628961007, + "grad_norm": 1.0859375, + "learning_rate": 0.0012872509247918173, + "loss": 0.8618, + "step": 6110 + }, + { + "epoch": 0.42512783053323594, + "grad_norm": 1.0078125, + "learning_rate": 0.0012870350810796323, + "loss": 0.8407, + "step": 6111 + }, + { + "epoch": 0.42519739817037117, + "grad_norm": 1.1875, + "learning_rate": 0.0012868192227932526, + "loss": 0.864, + "step": 6112 + }, + { + "epoch": 0.42526696580750634, + "grad_norm": 0.9296875, + "learning_rate": 0.0012866033499436384, + "loss": 0.7074, + "step": 6113 + }, + { + "epoch": 0.42533653344464156, + "grad_norm": 1.3046875, + "learning_rate": 0.0012863874625417514, + "loss": 0.919, + "step": 6114 + }, + { + "epoch": 0.42540610108177673, + "grad_norm": 1.0234375, + "learning_rate": 0.0012861715605985515, + "loss": 0.8278, + "step": 6115 + }, + { + "epoch": 0.42547566871891196, + "grad_norm": 1.1796875, + "learning_rate": 0.0012859556441250032, + "loss": 1.0292, + "step": 6116 + }, + { + "epoch": 0.4255452363560472, + "grad_norm": 1.0859375, + "learning_rate": 0.0012857397131320677, + "loss": 0.803, + "step": 6117 + }, + { + "epoch": 0.42561480399318236, + "grad_norm": 1.1171875, + "learning_rate": 0.0012855237676307103, + "loss": 0.8109, + "step": 6118 + }, + { + "epoch": 0.4256843716303176, + "grad_norm": 1.125, + "learning_rate": 0.0012853078076318952, + "loss": 1.0028, + "step": 6119 + }, + { + "epoch": 0.4257539392674528, + "grad_norm": 1.125, + "learning_rate": 0.0012850918331465872, + "loss": 0.9196, + "step": 6120 + }, + { + "epoch": 0.425823506904588, + "grad_norm": 1.3125, + "learning_rate": 0.0012848758441857534, + "loss": 0.9724, + "step": 6121 + }, + { + "epoch": 0.4258930745417232, + "grad_norm": 0.890625, + "learning_rate": 0.0012846598407603596, + "loss": 0.7952, + "step": 6122 + }, + { + "epoch": 0.4259626421788584, + "grad_norm": 1.09375, + "learning_rate": 0.0012844438228813745, + "loss": 0.8355, + "step": 6123 + }, + { + "epoch": 0.4260322098159936, + "grad_norm": 1.0546875, + "learning_rate": 0.0012842277905597652, + "loss": 0.6901, + "step": 6124 + }, + { + "epoch": 0.4261017774531288, + "grad_norm": 1.203125, + "learning_rate": 0.0012840117438065017, + "loss": 0.9057, + "step": 6125 + }, + { + "epoch": 0.426171345090264, + "grad_norm": 0.92578125, + "learning_rate": 0.0012837956826325532, + "loss": 0.7786, + "step": 6126 + }, + { + "epoch": 0.4262409127273992, + "grad_norm": 1.515625, + "learning_rate": 0.0012835796070488903, + "loss": 1.1199, + "step": 6127 + }, + { + "epoch": 0.4263104803645344, + "grad_norm": 1.296875, + "learning_rate": 0.0012833635170664845, + "loss": 0.9589, + "step": 6128 + }, + { + "epoch": 0.4263800480016696, + "grad_norm": 1.1640625, + "learning_rate": 0.0012831474126963074, + "loss": 0.7652, + "step": 6129 + }, + { + "epoch": 0.42644961563880485, + "grad_norm": 1.171875, + "learning_rate": 0.001282931293949332, + "loss": 0.8077, + "step": 6130 + }, + { + "epoch": 0.42651918327594, + "grad_norm": 1.015625, + "learning_rate": 0.0012827151608365312, + "loss": 0.9046, + "step": 6131 + }, + { + "epoch": 0.42658875091307524, + "grad_norm": 1.046875, + "learning_rate": 0.0012824990133688803, + "loss": 0.9335, + "step": 6132 + }, + { + "epoch": 0.42665831855021047, + "grad_norm": 0.78515625, + "learning_rate": 0.0012822828515573527, + "loss": 0.6665, + "step": 6133 + }, + { + "epoch": 0.42672788618734564, + "grad_norm": 1.0703125, + "learning_rate": 0.0012820666754129251, + "loss": 0.791, + "step": 6134 + }, + { + "epoch": 0.42679745382448087, + "grad_norm": 0.96875, + "learning_rate": 0.001281850484946573, + "loss": 0.7337, + "step": 6135 + }, + { + "epoch": 0.42686702146161604, + "grad_norm": 1.0, + "learning_rate": 0.001281634280169274, + "loss": 0.867, + "step": 6136 + }, + { + "epoch": 0.42693658909875126, + "grad_norm": 1.453125, + "learning_rate": 0.0012814180610920063, + "loss": 0.7578, + "step": 6137 + }, + { + "epoch": 0.4270061567358865, + "grad_norm": 1.2421875, + "learning_rate": 0.0012812018277257474, + "loss": 0.8261, + "step": 6138 + }, + { + "epoch": 0.42707572437302166, + "grad_norm": 1.1171875, + "learning_rate": 0.0012809855800814773, + "loss": 0.9987, + "step": 6139 + }, + { + "epoch": 0.4271452920101569, + "grad_norm": 1.5390625, + "learning_rate": 0.0012807693181701757, + "loss": 1.1341, + "step": 6140 + }, + { + "epoch": 0.42721485964729206, + "grad_norm": 1.0703125, + "learning_rate": 0.0012805530420028233, + "loss": 1.0794, + "step": 6141 + }, + { + "epoch": 0.4272844272844273, + "grad_norm": 1.0390625, + "learning_rate": 0.0012803367515904017, + "loss": 0.838, + "step": 6142 + }, + { + "epoch": 0.4273539949215625, + "grad_norm": 1.4140625, + "learning_rate": 0.0012801204469438923, + "loss": 1.0831, + "step": 6143 + }, + { + "epoch": 0.4274235625586977, + "grad_norm": 1.15625, + "learning_rate": 0.001279904128074279, + "loss": 1.0262, + "step": 6144 + }, + { + "epoch": 0.4274931301958329, + "grad_norm": 1.1484375, + "learning_rate": 0.0012796877949925445, + "loss": 0.8847, + "step": 6145 + }, + { + "epoch": 0.42756269783296813, + "grad_norm": 1.0546875, + "learning_rate": 0.0012794714477096741, + "loss": 0.96, + "step": 6146 + }, + { + "epoch": 0.4276322654701033, + "grad_norm": 1.4609375, + "learning_rate": 0.0012792550862366517, + "loss": 1.0317, + "step": 6147 + }, + { + "epoch": 0.4277018331072385, + "grad_norm": 1.1171875, + "learning_rate": 0.0012790387105844638, + "loss": 0.8646, + "step": 6148 + }, + { + "epoch": 0.4277714007443737, + "grad_norm": 1.0078125, + "learning_rate": 0.0012788223207640963, + "loss": 0.8444, + "step": 6149 + }, + { + "epoch": 0.4278409683815089, + "grad_norm": 0.9453125, + "learning_rate": 0.0012786059167865372, + "loss": 0.7417, + "step": 6150 + }, + { + "epoch": 0.42791053601864415, + "grad_norm": 0.83984375, + "learning_rate": 0.0012783894986627738, + "loss": 0.686, + "step": 6151 + }, + { + "epoch": 0.4279801036557793, + "grad_norm": 0.9609375, + "learning_rate": 0.0012781730664037944, + "loss": 0.7532, + "step": 6152 + }, + { + "epoch": 0.42804967129291455, + "grad_norm": 1.046875, + "learning_rate": 0.0012779566200205894, + "loss": 0.8931, + "step": 6153 + }, + { + "epoch": 0.4281192389300497, + "grad_norm": 1.6171875, + "learning_rate": 0.0012777401595241479, + "loss": 0.7948, + "step": 6154 + }, + { + "epoch": 0.42818880656718494, + "grad_norm": 1.2265625, + "learning_rate": 0.0012775236849254612, + "loss": 0.9641, + "step": 6155 + }, + { + "epoch": 0.42825837420432017, + "grad_norm": 1.109375, + "learning_rate": 0.0012773071962355203, + "loss": 0.8638, + "step": 6156 + }, + { + "epoch": 0.42832794184145534, + "grad_norm": 1.1484375, + "learning_rate": 0.001277090693465318, + "loss": 0.8303, + "step": 6157 + }, + { + "epoch": 0.42839750947859057, + "grad_norm": 1.21875, + "learning_rate": 0.001276874176625847, + "loss": 1.1343, + "step": 6158 + }, + { + "epoch": 0.4284670771157258, + "grad_norm": 1.3046875, + "learning_rate": 0.0012766576457281006, + "loss": 0.7364, + "step": 6159 + }, + { + "epoch": 0.42853664475286096, + "grad_norm": 1.0546875, + "learning_rate": 0.0012764411007830736, + "loss": 0.8541, + "step": 6160 + }, + { + "epoch": 0.4286062123899962, + "grad_norm": 1.203125, + "learning_rate": 0.0012762245418017606, + "loss": 1.0099, + "step": 6161 + }, + { + "epoch": 0.42867578002713136, + "grad_norm": 1.0390625, + "learning_rate": 0.001276007968795158, + "loss": 0.5945, + "step": 6162 + }, + { + "epoch": 0.4287453476642666, + "grad_norm": 1.109375, + "learning_rate": 0.0012757913817742614, + "loss": 0.8322, + "step": 6163 + }, + { + "epoch": 0.4288149153014018, + "grad_norm": 1.1015625, + "learning_rate": 0.001275574780750069, + "loss": 0.9469, + "step": 6164 + }, + { + "epoch": 0.428884482938537, + "grad_norm": 0.9296875, + "learning_rate": 0.0012753581657335782, + "loss": 0.9341, + "step": 6165 + }, + { + "epoch": 0.4289540505756722, + "grad_norm": 0.95703125, + "learning_rate": 0.0012751415367357876, + "loss": 0.7707, + "step": 6166 + }, + { + "epoch": 0.4290236182128074, + "grad_norm": 1.09375, + "learning_rate": 0.0012749248937676968, + "loss": 0.7381, + "step": 6167 + }, + { + "epoch": 0.4290931858499426, + "grad_norm": 1.09375, + "learning_rate": 0.0012747082368403048, + "loss": 0.8526, + "step": 6168 + }, + { + "epoch": 0.42916275348707783, + "grad_norm": 1.2578125, + "learning_rate": 0.0012744915659646141, + "loss": 1.0997, + "step": 6169 + }, + { + "epoch": 0.429232321124213, + "grad_norm": 1.2109375, + "learning_rate": 0.0012742748811516247, + "loss": 0.9804, + "step": 6170 + }, + { + "epoch": 0.4293018887613482, + "grad_norm": 1.171875, + "learning_rate": 0.0012740581824123396, + "loss": 0.8845, + "step": 6171 + }, + { + "epoch": 0.42937145639848345, + "grad_norm": 1.1796875, + "learning_rate": 0.0012738414697577609, + "loss": 0.9485, + "step": 6172 + }, + { + "epoch": 0.4294410240356186, + "grad_norm": 1.0859375, + "learning_rate": 0.001273624743198893, + "loss": 0.6952, + "step": 6173 + }, + { + "epoch": 0.42951059167275385, + "grad_norm": 1.03125, + "learning_rate": 0.0012734080027467399, + "loss": 0.7642, + "step": 6174 + }, + { + "epoch": 0.429580159309889, + "grad_norm": 0.953125, + "learning_rate": 0.001273191248412306, + "loss": 0.6815, + "step": 6175 + }, + { + "epoch": 0.42964972694702425, + "grad_norm": 1.0, + "learning_rate": 0.001272974480206598, + "loss": 0.8678, + "step": 6176 + }, + { + "epoch": 0.4297192945841595, + "grad_norm": 1.265625, + "learning_rate": 0.0012727576981406215, + "loss": 1.0667, + "step": 6177 + }, + { + "epoch": 0.42978886222129464, + "grad_norm": 0.99609375, + "learning_rate": 0.0012725409022253842, + "loss": 0.9059, + "step": 6178 + }, + { + "epoch": 0.42985842985842987, + "grad_norm": 1.3984375, + "learning_rate": 0.001272324092471893, + "loss": 0.77, + "step": 6179 + }, + { + "epoch": 0.42992799749556504, + "grad_norm": 1.125, + "learning_rate": 0.0012721072688911576, + "loss": 0.8317, + "step": 6180 + }, + { + "epoch": 0.42999756513270027, + "grad_norm": 0.9921875, + "learning_rate": 0.0012718904314941866, + "loss": 0.5463, + "step": 6181 + }, + { + "epoch": 0.4300671327698355, + "grad_norm": 1.359375, + "learning_rate": 0.0012716735802919894, + "loss": 0.7145, + "step": 6182 + }, + { + "epoch": 0.43013670040697066, + "grad_norm": 1.1875, + "learning_rate": 0.0012714567152955776, + "loss": 0.9652, + "step": 6183 + }, + { + "epoch": 0.4302062680441059, + "grad_norm": 1.15625, + "learning_rate": 0.0012712398365159617, + "loss": 0.9703, + "step": 6184 + }, + { + "epoch": 0.43027583568124106, + "grad_norm": 1.078125, + "learning_rate": 0.0012710229439641544, + "loss": 0.8048, + "step": 6185 + }, + { + "epoch": 0.4303454033183763, + "grad_norm": 1.1171875, + "learning_rate": 0.0012708060376511677, + "loss": 0.8256, + "step": 6186 + }, + { + "epoch": 0.4304149709555115, + "grad_norm": 1.203125, + "learning_rate": 0.0012705891175880156, + "loss": 1.0206, + "step": 6187 + }, + { + "epoch": 0.4304845385926467, + "grad_norm": 1.328125, + "learning_rate": 0.0012703721837857118, + "loss": 0.6895, + "step": 6188 + }, + { + "epoch": 0.4305541062297819, + "grad_norm": 1.1640625, + "learning_rate": 0.0012701552362552714, + "loss": 0.7009, + "step": 6189 + }, + { + "epoch": 0.43062367386691713, + "grad_norm": 1.109375, + "learning_rate": 0.0012699382750077102, + "loss": 0.7006, + "step": 6190 + }, + { + "epoch": 0.4306932415040523, + "grad_norm": 1.296875, + "learning_rate": 0.0012697213000540434, + "loss": 0.9176, + "step": 6191 + }, + { + "epoch": 0.43076280914118753, + "grad_norm": 0.99609375, + "learning_rate": 0.0012695043114052886, + "loss": 0.7925, + "step": 6192 + }, + { + "epoch": 0.4308323767783227, + "grad_norm": 1.2109375, + "learning_rate": 0.0012692873090724632, + "loss": 0.7769, + "step": 6193 + }, + { + "epoch": 0.4309019444154579, + "grad_norm": 1.3125, + "learning_rate": 0.001269070293066586, + "loss": 1.0246, + "step": 6194 + }, + { + "epoch": 0.43097151205259315, + "grad_norm": 1.0078125, + "learning_rate": 0.001268853263398675, + "loss": 0.9114, + "step": 6195 + }, + { + "epoch": 0.4310410796897283, + "grad_norm": 1.109375, + "learning_rate": 0.0012686362200797507, + "loss": 0.6292, + "step": 6196 + }, + { + "epoch": 0.43111064732686355, + "grad_norm": 1.0546875, + "learning_rate": 0.0012684191631208333, + "loss": 0.8646, + "step": 6197 + }, + { + "epoch": 0.4311802149639987, + "grad_norm": 1.3671875, + "learning_rate": 0.0012682020925329433, + "loss": 0.9575, + "step": 6198 + }, + { + "epoch": 0.43124978260113395, + "grad_norm": 1.078125, + "learning_rate": 0.0012679850083271034, + "loss": 0.9149, + "step": 6199 + }, + { + "epoch": 0.4313193502382692, + "grad_norm": 1.046875, + "learning_rate": 0.0012677679105143349, + "loss": 0.9711, + "step": 6200 + }, + { + "epoch": 0.43138891787540434, + "grad_norm": 0.8046875, + "learning_rate": 0.0012675507991056622, + "loss": 0.644, + "step": 6201 + }, + { + "epoch": 0.43145848551253957, + "grad_norm": 1.2265625, + "learning_rate": 0.001267333674112108, + "loss": 0.706, + "step": 6202 + }, + { + "epoch": 0.4315280531496748, + "grad_norm": 1.171875, + "learning_rate": 0.0012671165355446973, + "loss": 0.7567, + "step": 6203 + }, + { + "epoch": 0.43159762078680997, + "grad_norm": 1.0859375, + "learning_rate": 0.0012668993834144555, + "loss": 0.9832, + "step": 6204 + }, + { + "epoch": 0.4316671884239452, + "grad_norm": 1.2734375, + "learning_rate": 0.0012666822177324082, + "loss": 0.8723, + "step": 6205 + }, + { + "epoch": 0.43173675606108036, + "grad_norm": 0.98828125, + "learning_rate": 0.0012664650385095825, + "loss": 0.8979, + "step": 6206 + }, + { + "epoch": 0.4318063236982156, + "grad_norm": 0.8359375, + "learning_rate": 0.0012662478457570044, + "loss": 0.7803, + "step": 6207 + }, + { + "epoch": 0.4318758913353508, + "grad_norm": 1.765625, + "learning_rate": 0.0012660306394857033, + "loss": 1.0101, + "step": 6208 + }, + { + "epoch": 0.431945458972486, + "grad_norm": 1.4140625, + "learning_rate": 0.0012658134197067069, + "loss": 0.778, + "step": 6209 + }, + { + "epoch": 0.4320150266096212, + "grad_norm": 0.953125, + "learning_rate": 0.001265596186431045, + "loss": 0.8064, + "step": 6210 + }, + { + "epoch": 0.4320845942467564, + "grad_norm": 1.1015625, + "learning_rate": 0.0012653789396697476, + "loss": 0.9607, + "step": 6211 + }, + { + "epoch": 0.4321541618838916, + "grad_norm": 1.0625, + "learning_rate": 0.0012651616794338448, + "loss": 0.9648, + "step": 6212 + }, + { + "epoch": 0.43222372952102683, + "grad_norm": 1.1484375, + "learning_rate": 0.0012649444057343691, + "loss": 0.7897, + "step": 6213 + }, + { + "epoch": 0.432293297158162, + "grad_norm": 1.0703125, + "learning_rate": 0.0012647271185823512, + "loss": 0.9157, + "step": 6214 + }, + { + "epoch": 0.43236286479529723, + "grad_norm": 0.86328125, + "learning_rate": 0.001264509817988825, + "loss": 0.5547, + "step": 6215 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 0.9375, + "learning_rate": 0.0012642925039648232, + "loss": 0.7265, + "step": 6216 + }, + { + "epoch": 0.4325020000695676, + "grad_norm": 1.203125, + "learning_rate": 0.0012640751765213803, + "loss": 1.0178, + "step": 6217 + }, + { + "epoch": 0.43257156770670285, + "grad_norm": 0.95703125, + "learning_rate": 0.001263857835669531, + "loss": 0.9097, + "step": 6218 + }, + { + "epoch": 0.432641135343838, + "grad_norm": 0.9609375, + "learning_rate": 0.0012636404814203106, + "loss": 0.6847, + "step": 6219 + }, + { + "epoch": 0.43271070298097325, + "grad_norm": 1.0390625, + "learning_rate": 0.0012634231137847556, + "loss": 1.0189, + "step": 6220 + }, + { + "epoch": 0.4327802706181085, + "grad_norm": 0.8671875, + "learning_rate": 0.0012632057327739026, + "loss": 0.8217, + "step": 6221 + }, + { + "epoch": 0.43284983825524365, + "grad_norm": 1.234375, + "learning_rate": 0.0012629883383987893, + "loss": 0.7257, + "step": 6222 + }, + { + "epoch": 0.4329194058923789, + "grad_norm": 1.1328125, + "learning_rate": 0.0012627709306704533, + "loss": 0.8535, + "step": 6223 + }, + { + "epoch": 0.43298897352951404, + "grad_norm": 1.3203125, + "learning_rate": 0.0012625535095999341, + "loss": 1.0538, + "step": 6224 + }, + { + "epoch": 0.43305854116664927, + "grad_norm": 1.3984375, + "learning_rate": 0.0012623360751982712, + "loss": 1.0261, + "step": 6225 + }, + { + "epoch": 0.4331281088037845, + "grad_norm": 1.390625, + "learning_rate": 0.0012621186274765044, + "loss": 0.8567, + "step": 6226 + }, + { + "epoch": 0.43319767644091967, + "grad_norm": 1.6328125, + "learning_rate": 0.001261901166445675, + "loss": 1.0199, + "step": 6227 + }, + { + "epoch": 0.4332672440780549, + "grad_norm": 1.203125, + "learning_rate": 0.0012616836921168243, + "loss": 0.8066, + "step": 6228 + }, + { + "epoch": 0.4333368117151901, + "grad_norm": 1.015625, + "learning_rate": 0.0012614662045009953, + "loss": 0.7862, + "step": 6229 + }, + { + "epoch": 0.4334063793523253, + "grad_norm": 1.15625, + "learning_rate": 0.0012612487036092297, + "loss": 0.8326, + "step": 6230 + }, + { + "epoch": 0.4334759469894605, + "grad_norm": 0.95703125, + "learning_rate": 0.0012610311894525718, + "loss": 0.8385, + "step": 6231 + }, + { + "epoch": 0.4335455146265957, + "grad_norm": 1.25, + "learning_rate": 0.001260813662042066, + "loss": 0.9319, + "step": 6232 + }, + { + "epoch": 0.4336150822637309, + "grad_norm": 1.1640625, + "learning_rate": 0.001260596121388757, + "loss": 0.7254, + "step": 6233 + }, + { + "epoch": 0.43368464990086614, + "grad_norm": 1.109375, + "learning_rate": 0.0012603785675036905, + "loss": 0.8268, + "step": 6234 + }, + { + "epoch": 0.4337542175380013, + "grad_norm": 1.0859375, + "learning_rate": 0.0012601610003979125, + "loss": 0.9525, + "step": 6235 + }, + { + "epoch": 0.43382378517513653, + "grad_norm": 0.9453125, + "learning_rate": 0.0012599434200824705, + "loss": 0.7402, + "step": 6236 + }, + { + "epoch": 0.4338933528122717, + "grad_norm": 1.2421875, + "learning_rate": 0.0012597258265684118, + "loss": 0.8009, + "step": 6237 + }, + { + "epoch": 0.43396292044940693, + "grad_norm": 1.140625, + "learning_rate": 0.0012595082198667846, + "loss": 0.8976, + "step": 6238 + }, + { + "epoch": 0.43403248808654216, + "grad_norm": 1.0625, + "learning_rate": 0.001259290599988638, + "loss": 0.8552, + "step": 6239 + }, + { + "epoch": 0.4341020557236773, + "grad_norm": 1.3984375, + "learning_rate": 0.0012590729669450219, + "loss": 1.1349, + "step": 6240 + }, + { + "epoch": 0.43417162336081255, + "grad_norm": 1.640625, + "learning_rate": 0.001258855320746986, + "loss": 0.9319, + "step": 6241 + }, + { + "epoch": 0.4342411909979478, + "grad_norm": 1.3046875, + "learning_rate": 0.001258637661405582, + "loss": 1.2469, + "step": 6242 + }, + { + "epoch": 0.43431075863508295, + "grad_norm": 0.8828125, + "learning_rate": 0.0012584199889318609, + "loss": 0.7951, + "step": 6243 + }, + { + "epoch": 0.4343803262722182, + "grad_norm": 1.015625, + "learning_rate": 0.0012582023033368755, + "loss": 0.5938, + "step": 6244 + }, + { + "epoch": 0.43444989390935335, + "grad_norm": 1.0078125, + "learning_rate": 0.0012579846046316782, + "loss": 0.9024, + "step": 6245 + }, + { + "epoch": 0.4345194615464886, + "grad_norm": 1.078125, + "learning_rate": 0.0012577668928273234, + "loss": 1.0139, + "step": 6246 + }, + { + "epoch": 0.4345890291836238, + "grad_norm": 1.3203125, + "learning_rate": 0.001257549167934865, + "loss": 0.9458, + "step": 6247 + }, + { + "epoch": 0.43465859682075897, + "grad_norm": 0.9921875, + "learning_rate": 0.0012573314299653578, + "loss": 0.8125, + "step": 6248 + }, + { + "epoch": 0.4347281644578942, + "grad_norm": 1.03125, + "learning_rate": 0.0012571136789298579, + "loss": 0.8901, + "step": 6249 + }, + { + "epoch": 0.43479773209502937, + "grad_norm": 1.03125, + "learning_rate": 0.0012568959148394213, + "loss": 0.6321, + "step": 6250 + }, + { + "epoch": 0.4348672997321646, + "grad_norm": 1.2578125, + "learning_rate": 0.0012566781377051047, + "loss": 0.934, + "step": 6251 + }, + { + "epoch": 0.4349368673692998, + "grad_norm": 1.0078125, + "learning_rate": 0.0012564603475379663, + "loss": 0.6629, + "step": 6252 + }, + { + "epoch": 0.435006435006435, + "grad_norm": 1.09375, + "learning_rate": 0.001256242544349064, + "loss": 0.9645, + "step": 6253 + }, + { + "epoch": 0.4350760026435702, + "grad_norm": 1.1796875, + "learning_rate": 0.0012560247281494569, + "loss": 0.8213, + "step": 6254 + }, + { + "epoch": 0.43514557028070544, + "grad_norm": 1.234375, + "learning_rate": 0.0012558068989502044, + "loss": 0.9691, + "step": 6255 + }, + { + "epoch": 0.4352151379178406, + "grad_norm": 1.296875, + "learning_rate": 0.0012555890567623668, + "loss": 0.9168, + "step": 6256 + }, + { + "epoch": 0.43528470555497584, + "grad_norm": 1.6484375, + "learning_rate": 0.0012553712015970055, + "loss": 0.9164, + "step": 6257 + }, + { + "epoch": 0.435354273192111, + "grad_norm": 1.0625, + "learning_rate": 0.0012551533334651816, + "loss": 0.7233, + "step": 6258 + }, + { + "epoch": 0.43542384082924623, + "grad_norm": 1.125, + "learning_rate": 0.0012549354523779578, + "loss": 0.6932, + "step": 6259 + }, + { + "epoch": 0.43549340846638146, + "grad_norm": 1.75, + "learning_rate": 0.0012547175583463963, + "loss": 1.0742, + "step": 6260 + }, + { + "epoch": 0.43556297610351663, + "grad_norm": 1.3125, + "learning_rate": 0.0012544996513815614, + "loss": 0.9574, + "step": 6261 + }, + { + "epoch": 0.43563254374065186, + "grad_norm": 0.9375, + "learning_rate": 0.0012542817314945168, + "loss": 0.8548, + "step": 6262 + }, + { + "epoch": 0.435702111377787, + "grad_norm": 1.234375, + "learning_rate": 0.0012540637986963275, + "loss": 0.8449, + "step": 6263 + }, + { + "epoch": 0.43577167901492225, + "grad_norm": 1.203125, + "learning_rate": 0.001253845852998059, + "loss": 0.9364, + "step": 6264 + }, + { + "epoch": 0.4358412466520575, + "grad_norm": 1.203125, + "learning_rate": 0.0012536278944107776, + "loss": 0.8772, + "step": 6265 + }, + { + "epoch": 0.43591081428919265, + "grad_norm": 1.2578125, + "learning_rate": 0.0012534099229455505, + "loss": 1.127, + "step": 6266 + }, + { + "epoch": 0.4359803819263279, + "grad_norm": 0.94140625, + "learning_rate": 0.0012531919386134444, + "loss": 0.7243, + "step": 6267 + }, + { + "epoch": 0.4360499495634631, + "grad_norm": 1.1875, + "learning_rate": 0.001252973941425528, + "loss": 1.2287, + "step": 6268 + }, + { + "epoch": 0.4361195172005983, + "grad_norm": 0.98046875, + "learning_rate": 0.0012527559313928699, + "loss": 0.7078, + "step": 6269 + }, + { + "epoch": 0.4361890848377335, + "grad_norm": 1.0546875, + "learning_rate": 0.0012525379085265393, + "loss": 0.8298, + "step": 6270 + }, + { + "epoch": 0.43625865247486867, + "grad_norm": 1.0234375, + "learning_rate": 0.0012523198728376069, + "loss": 0.9078, + "step": 6271 + }, + { + "epoch": 0.4363282201120039, + "grad_norm": 1.0, + "learning_rate": 0.001252101824337143, + "loss": 0.898, + "step": 6272 + }, + { + "epoch": 0.4363977877491391, + "grad_norm": 1.1171875, + "learning_rate": 0.0012518837630362194, + "loss": 1.1462, + "step": 6273 + }, + { + "epoch": 0.4364673553862743, + "grad_norm": 1.1171875, + "learning_rate": 0.0012516656889459078, + "loss": 0.8964, + "step": 6274 + }, + { + "epoch": 0.4365369230234095, + "grad_norm": 1.0625, + "learning_rate": 0.0012514476020772808, + "loss": 0.9142, + "step": 6275 + }, + { + "epoch": 0.4366064906605447, + "grad_norm": 1.109375, + "learning_rate": 0.001251229502441412, + "loss": 1.0183, + "step": 6276 + }, + { + "epoch": 0.4366760582976799, + "grad_norm": 1.0625, + "learning_rate": 0.0012510113900493756, + "loss": 0.7949, + "step": 6277 + }, + { + "epoch": 0.43674562593481514, + "grad_norm": 1.0390625, + "learning_rate": 0.0012507932649122458, + "loss": 0.9381, + "step": 6278 + }, + { + "epoch": 0.4368151935719503, + "grad_norm": 1.2421875, + "learning_rate": 0.0012505751270410982, + "loss": 0.9689, + "step": 6279 + }, + { + "epoch": 0.43688476120908554, + "grad_norm": 0.859375, + "learning_rate": 0.0012503569764470085, + "loss": 0.8015, + "step": 6280 + }, + { + "epoch": 0.43695432884622076, + "grad_norm": 0.984375, + "learning_rate": 0.0012501388131410537, + "loss": 0.6995, + "step": 6281 + }, + { + "epoch": 0.43702389648335593, + "grad_norm": 1.0390625, + "learning_rate": 0.0012499206371343104, + "loss": 0.7622, + "step": 6282 + }, + { + "epoch": 0.43709346412049116, + "grad_norm": 0.984375, + "learning_rate": 0.001249702448437857, + "loss": 0.8139, + "step": 6283 + }, + { + "epoch": 0.43716303175762633, + "grad_norm": 1.1640625, + "learning_rate": 0.0012494842470627719, + "loss": 0.914, + "step": 6284 + }, + { + "epoch": 0.43723259939476156, + "grad_norm": 0.96875, + "learning_rate": 0.0012492660330201341, + "loss": 0.7262, + "step": 6285 + }, + { + "epoch": 0.4373021670318968, + "grad_norm": 1.109375, + "learning_rate": 0.0012490478063210237, + "loss": 1.0207, + "step": 6286 + }, + { + "epoch": 0.43737173466903195, + "grad_norm": 0.86328125, + "learning_rate": 0.001248829566976521, + "loss": 0.8092, + "step": 6287 + }, + { + "epoch": 0.4374413023061672, + "grad_norm": 1.0625, + "learning_rate": 0.001248611314997707, + "loss": 0.6535, + "step": 6288 + }, + { + "epoch": 0.43751086994330235, + "grad_norm": 1.1171875, + "learning_rate": 0.0012483930503956635, + "loss": 0.9891, + "step": 6289 + }, + { + "epoch": 0.4375804375804376, + "grad_norm": 1.140625, + "learning_rate": 0.001248174773181473, + "loss": 0.9172, + "step": 6290 + }, + { + "epoch": 0.4376500052175728, + "grad_norm": 1.0546875, + "learning_rate": 0.0012479564833662185, + "loss": 0.6461, + "step": 6291 + }, + { + "epoch": 0.437719572854708, + "grad_norm": 1.140625, + "learning_rate": 0.0012477381809609834, + "loss": 0.7959, + "step": 6292 + }, + { + "epoch": 0.4377891404918432, + "grad_norm": 1.140625, + "learning_rate": 0.0012475198659768522, + "loss": 0.8386, + "step": 6293 + }, + { + "epoch": 0.4378587081289784, + "grad_norm": 1.2421875, + "learning_rate": 0.0012473015384249096, + "loss": 0.8491, + "step": 6294 + }, + { + "epoch": 0.4379282757661136, + "grad_norm": 1.1875, + "learning_rate": 0.0012470831983162416, + "loss": 0.9507, + "step": 6295 + }, + { + "epoch": 0.4379978434032488, + "grad_norm": 1.6328125, + "learning_rate": 0.001246864845661934, + "loss": 1.2016, + "step": 6296 + }, + { + "epoch": 0.438067411040384, + "grad_norm": 1.1796875, + "learning_rate": 0.001246646480473074, + "loss": 0.8521, + "step": 6297 + }, + { + "epoch": 0.4381369786775192, + "grad_norm": 1.1171875, + "learning_rate": 0.0012464281027607489, + "loss": 0.7937, + "step": 6298 + }, + { + "epoch": 0.43820654631465444, + "grad_norm": 1.0859375, + "learning_rate": 0.0012462097125360467, + "loss": 0.7912, + "step": 6299 + }, + { + "epoch": 0.4382761139517896, + "grad_norm": 1.15625, + "learning_rate": 0.0012459913098100566, + "loss": 0.8754, + "step": 6300 + }, + { + "epoch": 0.43834568158892484, + "grad_norm": 1.046875, + "learning_rate": 0.0012457728945938673, + "loss": 0.6109, + "step": 6301 + }, + { + "epoch": 0.43841524922606, + "grad_norm": 1.21875, + "learning_rate": 0.0012455544668985693, + "loss": 0.8394, + "step": 6302 + }, + { + "epoch": 0.43848481686319524, + "grad_norm": 1.3359375, + "learning_rate": 0.0012453360267352534, + "loss": 1.0656, + "step": 6303 + }, + { + "epoch": 0.43855438450033046, + "grad_norm": 1.1796875, + "learning_rate": 0.0012451175741150105, + "loss": 0.8723, + "step": 6304 + }, + { + "epoch": 0.43862395213746563, + "grad_norm": 1.1015625, + "learning_rate": 0.0012448991090489325, + "loss": 0.6628, + "step": 6305 + }, + { + "epoch": 0.43869351977460086, + "grad_norm": 1.2265625, + "learning_rate": 0.0012446806315481124, + "loss": 0.7501, + "step": 6306 + }, + { + "epoch": 0.4387630874117361, + "grad_norm": 1.078125, + "learning_rate": 0.0012444621416236427, + "loss": 1.0503, + "step": 6307 + }, + { + "epoch": 0.43883265504887126, + "grad_norm": 1.1484375, + "learning_rate": 0.0012442436392866181, + "loss": 0.6441, + "step": 6308 + }, + { + "epoch": 0.4389022226860065, + "grad_norm": 1.1796875, + "learning_rate": 0.0012440251245481324, + "loss": 1.0388, + "step": 6309 + }, + { + "epoch": 0.43897179032314165, + "grad_norm": 0.8984375, + "learning_rate": 0.0012438065974192808, + "loss": 0.7253, + "step": 6310 + }, + { + "epoch": 0.4390413579602769, + "grad_norm": 1.296875, + "learning_rate": 0.001243588057911159, + "loss": 0.8736, + "step": 6311 + }, + { + "epoch": 0.4391109255974121, + "grad_norm": 0.96484375, + "learning_rate": 0.0012433695060348636, + "loss": 0.9115, + "step": 6312 + }, + { + "epoch": 0.4391804932345473, + "grad_norm": 0.8828125, + "learning_rate": 0.0012431509418014913, + "loss": 0.8458, + "step": 6313 + }, + { + "epoch": 0.4392500608716825, + "grad_norm": 1.1953125, + "learning_rate": 0.0012429323652221396, + "loss": 0.9502, + "step": 6314 + }, + { + "epoch": 0.43931962850881767, + "grad_norm": 1.2734375, + "learning_rate": 0.001242713776307907, + "loss": 1.1474, + "step": 6315 + }, + { + "epoch": 0.4393891961459529, + "grad_norm": 1.203125, + "learning_rate": 0.001242495175069892, + "loss": 0.8861, + "step": 6316 + }, + { + "epoch": 0.4394587637830881, + "grad_norm": 0.9375, + "learning_rate": 0.0012422765615191947, + "loss": 0.7672, + "step": 6317 + }, + { + "epoch": 0.4395283314202233, + "grad_norm": 1.171875, + "learning_rate": 0.0012420579356669144, + "loss": 0.7132, + "step": 6318 + }, + { + "epoch": 0.4395978990573585, + "grad_norm": 1.2109375, + "learning_rate": 0.0012418392975241522, + "loss": 1.0153, + "step": 6319 + }, + { + "epoch": 0.43966746669449375, + "grad_norm": 1.1640625, + "learning_rate": 0.0012416206471020095, + "loss": 0.7832, + "step": 6320 + }, + { + "epoch": 0.4397370343316289, + "grad_norm": 1.1484375, + "learning_rate": 0.0012414019844115883, + "loss": 1.0355, + "step": 6321 + }, + { + "epoch": 0.43980660196876414, + "grad_norm": 1.0546875, + "learning_rate": 0.001241183309463991, + "loss": 0.8978, + "step": 6322 + }, + { + "epoch": 0.4398761696058993, + "grad_norm": 1.15625, + "learning_rate": 0.001240964622270321, + "loss": 0.8164, + "step": 6323 + }, + { + "epoch": 0.43994573724303454, + "grad_norm": 1.109375, + "learning_rate": 0.0012407459228416819, + "loss": 1.0299, + "step": 6324 + }, + { + "epoch": 0.44001530488016977, + "grad_norm": 0.9765625, + "learning_rate": 0.0012405272111891783, + "loss": 0.7, + "step": 6325 + }, + { + "epoch": 0.44008487251730494, + "grad_norm": 1.015625, + "learning_rate": 0.0012403084873239152, + "loss": 0.7141, + "step": 6326 + }, + { + "epoch": 0.44015444015444016, + "grad_norm": 1.234375, + "learning_rate": 0.0012400897512569987, + "loss": 1.0925, + "step": 6327 + }, + { + "epoch": 0.44022400779157533, + "grad_norm": 1.2890625, + "learning_rate": 0.0012398710029995345, + "loss": 1.0427, + "step": 6328 + }, + { + "epoch": 0.44029357542871056, + "grad_norm": 1.390625, + "learning_rate": 0.0012396522425626299, + "loss": 0.8871, + "step": 6329 + }, + { + "epoch": 0.4403631430658458, + "grad_norm": 1.1640625, + "learning_rate": 0.001239433469957392, + "loss": 0.9091, + "step": 6330 + }, + { + "epoch": 0.44043271070298096, + "grad_norm": 1.1796875, + "learning_rate": 0.0012392146851949296, + "loss": 0.6989, + "step": 6331 + }, + { + "epoch": 0.4405022783401162, + "grad_norm": 1.2265625, + "learning_rate": 0.0012389958882863515, + "loss": 0.5548, + "step": 6332 + }, + { + "epoch": 0.4405718459772514, + "grad_norm": 0.90625, + "learning_rate": 0.0012387770792427664, + "loss": 0.8693, + "step": 6333 + }, + { + "epoch": 0.4406414136143866, + "grad_norm": 1.1328125, + "learning_rate": 0.001238558258075285, + "loss": 0.7976, + "step": 6334 + }, + { + "epoch": 0.4407109812515218, + "grad_norm": 1.03125, + "learning_rate": 0.0012383394247950175, + "loss": 0.7941, + "step": 6335 + }, + { + "epoch": 0.440780548888657, + "grad_norm": 1.078125, + "learning_rate": 0.0012381205794130754, + "loss": 0.7462, + "step": 6336 + }, + { + "epoch": 0.4408501165257922, + "grad_norm": 0.9140625, + "learning_rate": 0.0012379017219405705, + "loss": 0.7067, + "step": 6337 + }, + { + "epoch": 0.4409196841629274, + "grad_norm": 1.0859375, + "learning_rate": 0.0012376828523886151, + "loss": 0.948, + "step": 6338 + }, + { + "epoch": 0.4409892518000626, + "grad_norm": 0.81640625, + "learning_rate": 0.0012374639707683228, + "loss": 0.7899, + "step": 6339 + }, + { + "epoch": 0.4410588194371978, + "grad_norm": 1.0625, + "learning_rate": 0.0012372450770908067, + "loss": 0.8701, + "step": 6340 + }, + { + "epoch": 0.441128387074333, + "grad_norm": 1.0703125, + "learning_rate": 0.0012370261713671817, + "loss": 0.8288, + "step": 6341 + }, + { + "epoch": 0.4411979547114682, + "grad_norm": 1.1171875, + "learning_rate": 0.001236807253608562, + "loss": 0.8467, + "step": 6342 + }, + { + "epoch": 0.44126752234860345, + "grad_norm": 1.0859375, + "learning_rate": 0.001236588323826064, + "loss": 0.934, + "step": 6343 + }, + { + "epoch": 0.4413370899857386, + "grad_norm": 1.078125, + "learning_rate": 0.0012363693820308032, + "loss": 0.7287, + "step": 6344 + }, + { + "epoch": 0.44140665762287384, + "grad_norm": 1.0390625, + "learning_rate": 0.0012361504282338964, + "loss": 0.7829, + "step": 6345 + }, + { + "epoch": 0.44147622526000907, + "grad_norm": 1.1796875, + "learning_rate": 0.0012359314624464616, + "loss": 0.9451, + "step": 6346 + }, + { + "epoch": 0.44154579289714424, + "grad_norm": 1.3203125, + "learning_rate": 0.001235712484679616, + "loss": 0.7119, + "step": 6347 + }, + { + "epoch": 0.44161536053427947, + "grad_norm": 1.4765625, + "learning_rate": 0.0012354934949444785, + "loss": 1.2297, + "step": 6348 + }, + { + "epoch": 0.44168492817141464, + "grad_norm": 1.0234375, + "learning_rate": 0.001235274493252168, + "loss": 0.8412, + "step": 6349 + }, + { + "epoch": 0.44175449580854986, + "grad_norm": 0.88671875, + "learning_rate": 0.0012350554796138051, + "loss": 0.7168, + "step": 6350 + }, + { + "epoch": 0.4418240634456851, + "grad_norm": 1.1484375, + "learning_rate": 0.0012348364540405096, + "loss": 0.8528, + "step": 6351 + }, + { + "epoch": 0.44189363108282026, + "grad_norm": 1.09375, + "learning_rate": 0.0012346174165434026, + "loss": 0.7164, + "step": 6352 + }, + { + "epoch": 0.4419631987199555, + "grad_norm": 1.28125, + "learning_rate": 0.0012343983671336057, + "loss": 1.1093, + "step": 6353 + }, + { + "epoch": 0.44203276635709066, + "grad_norm": 0.96875, + "learning_rate": 0.0012341793058222412, + "loss": 0.6447, + "step": 6354 + }, + { + "epoch": 0.4421023339942259, + "grad_norm": 0.9765625, + "learning_rate": 0.001233960232620432, + "loss": 0.7405, + "step": 6355 + }, + { + "epoch": 0.4421719016313611, + "grad_norm": 1.0, + "learning_rate": 0.001233741147539301, + "loss": 0.7226, + "step": 6356 + }, + { + "epoch": 0.4422414692684963, + "grad_norm": 0.96484375, + "learning_rate": 0.001233522050589973, + "loss": 0.7298, + "step": 6357 + }, + { + "epoch": 0.4423110369056315, + "grad_norm": 1.0859375, + "learning_rate": 0.0012333029417835725, + "loss": 0.8299, + "step": 6358 + }, + { + "epoch": 0.44238060454276673, + "grad_norm": 0.87109375, + "learning_rate": 0.0012330838211312243, + "loss": 0.6317, + "step": 6359 + }, + { + "epoch": 0.4424501721799019, + "grad_norm": 1.125, + "learning_rate": 0.0012328646886440547, + "loss": 0.9076, + "step": 6360 + }, + { + "epoch": 0.4425197398170371, + "grad_norm": 1.265625, + "learning_rate": 0.0012326455443331897, + "loss": 0.98, + "step": 6361 + }, + { + "epoch": 0.4425893074541723, + "grad_norm": 1.0625, + "learning_rate": 0.0012324263882097567, + "loss": 0.8287, + "step": 6362 + }, + { + "epoch": 0.4426588750913075, + "grad_norm": 1.3046875, + "learning_rate": 0.0012322072202848831, + "loss": 0.9169, + "step": 6363 + }, + { + "epoch": 0.44272844272844275, + "grad_norm": 1.15625, + "learning_rate": 0.0012319880405696974, + "loss": 1.0304, + "step": 6364 + }, + { + "epoch": 0.4427980103655779, + "grad_norm": 0.921875, + "learning_rate": 0.0012317688490753281, + "loss": 0.7497, + "step": 6365 + }, + { + "epoch": 0.44286757800271315, + "grad_norm": 1.078125, + "learning_rate": 0.0012315496458129053, + "loss": 0.8016, + "step": 6366 + }, + { + "epoch": 0.4429371456398483, + "grad_norm": 1.375, + "learning_rate": 0.0012313304307935583, + "loss": 1.0096, + "step": 6367 + }, + { + "epoch": 0.44300671327698354, + "grad_norm": 1.25, + "learning_rate": 0.001231111204028418, + "loss": 0.7129, + "step": 6368 + }, + { + "epoch": 0.44307628091411877, + "grad_norm": 0.9609375, + "learning_rate": 0.0012308919655286154, + "loss": 0.6975, + "step": 6369 + }, + { + "epoch": 0.44314584855125394, + "grad_norm": 0.97265625, + "learning_rate": 0.001230672715305283, + "loss": 0.9218, + "step": 6370 + }, + { + "epoch": 0.44321541618838917, + "grad_norm": 1.1796875, + "learning_rate": 0.0012304534533695527, + "loss": 0.9669, + "step": 6371 + }, + { + "epoch": 0.4432849838255244, + "grad_norm": 1.1953125, + "learning_rate": 0.0012302341797325572, + "loss": 0.7856, + "step": 6372 + }, + { + "epoch": 0.44335455146265956, + "grad_norm": 1.203125, + "learning_rate": 0.001230014894405431, + "loss": 0.7618, + "step": 6373 + }, + { + "epoch": 0.4434241190997948, + "grad_norm": 1.140625, + "learning_rate": 0.0012297955973993076, + "loss": 0.8446, + "step": 6374 + }, + { + "epoch": 0.44349368673692996, + "grad_norm": 0.9453125, + "learning_rate": 0.001229576288725322, + "loss": 0.8408, + "step": 6375 + }, + { + "epoch": 0.4435632543740652, + "grad_norm": 1.75, + "learning_rate": 0.00122935696839461, + "loss": 0.9296, + "step": 6376 + }, + { + "epoch": 0.4436328220112004, + "grad_norm": 1.0078125, + "learning_rate": 0.0012291376364183069, + "loss": 0.6171, + "step": 6377 + }, + { + "epoch": 0.4437023896483356, + "grad_norm": 1.1796875, + "learning_rate": 0.0012289182928075495, + "loss": 0.7637, + "step": 6378 + }, + { + "epoch": 0.4437719572854708, + "grad_norm": 1.15625, + "learning_rate": 0.0012286989375734749, + "loss": 0.697, + "step": 6379 + }, + { + "epoch": 0.443841524922606, + "grad_norm": 0.94921875, + "learning_rate": 0.0012284795707272213, + "loss": 0.7791, + "step": 6380 + }, + { + "epoch": 0.4439110925597412, + "grad_norm": 1.3046875, + "learning_rate": 0.0012282601922799263, + "loss": 0.9094, + "step": 6381 + }, + { + "epoch": 0.44398066019687643, + "grad_norm": 0.94140625, + "learning_rate": 0.0012280408022427298, + "loss": 0.7983, + "step": 6382 + }, + { + "epoch": 0.4440502278340116, + "grad_norm": 1.0546875, + "learning_rate": 0.0012278214006267705, + "loss": 0.872, + "step": 6383 + }, + { + "epoch": 0.4441197954711468, + "grad_norm": 0.93359375, + "learning_rate": 0.0012276019874431887, + "loss": 0.7685, + "step": 6384 + }, + { + "epoch": 0.44418936310828205, + "grad_norm": 1.09375, + "learning_rate": 0.0012273825627031254, + "loss": 0.8077, + "step": 6385 + }, + { + "epoch": 0.4442589307454172, + "grad_norm": 1.0625, + "learning_rate": 0.0012271631264177212, + "loss": 0.7381, + "step": 6386 + }, + { + "epoch": 0.44432849838255245, + "grad_norm": 0.94921875, + "learning_rate": 0.001226943678598119, + "loss": 0.7848, + "step": 6387 + }, + { + "epoch": 0.4443980660196876, + "grad_norm": 1.046875, + "learning_rate": 0.0012267242192554601, + "loss": 0.9076, + "step": 6388 + }, + { + "epoch": 0.44446763365682285, + "grad_norm": 1.0078125, + "learning_rate": 0.0012265047484008886, + "loss": 0.9831, + "step": 6389 + }, + { + "epoch": 0.4445372012939581, + "grad_norm": 1.53125, + "learning_rate": 0.0012262852660455477, + "loss": 1.0663, + "step": 6390 + }, + { + "epoch": 0.44460676893109324, + "grad_norm": 1.1171875, + "learning_rate": 0.0012260657722005812, + "loss": 0.8143, + "step": 6391 + }, + { + "epoch": 0.44467633656822847, + "grad_norm": 0.9921875, + "learning_rate": 0.0012258462668771344, + "loss": 0.6318, + "step": 6392 + }, + { + "epoch": 0.44474590420536364, + "grad_norm": 1.0078125, + "learning_rate": 0.0012256267500863522, + "loss": 0.7823, + "step": 6393 + }, + { + "epoch": 0.44481547184249887, + "grad_norm": 1.125, + "learning_rate": 0.0012254072218393815, + "loss": 0.8409, + "step": 6394 + }, + { + "epoch": 0.4448850394796341, + "grad_norm": 1.3984375, + "learning_rate": 0.0012251876821473676, + "loss": 0.9326, + "step": 6395 + }, + { + "epoch": 0.44495460711676926, + "grad_norm": 1.1796875, + "learning_rate": 0.001224968131021459, + "loss": 0.8727, + "step": 6396 + }, + { + "epoch": 0.4450241747539045, + "grad_norm": 0.94140625, + "learning_rate": 0.0012247485684728017, + "loss": 0.6485, + "step": 6397 + }, + { + "epoch": 0.4450937423910397, + "grad_norm": 0.98828125, + "learning_rate": 0.0012245289945125458, + "loss": 0.8116, + "step": 6398 + }, + { + "epoch": 0.4451633100281749, + "grad_norm": 0.98046875, + "learning_rate": 0.0012243094091518387, + "loss": 0.5843, + "step": 6399 + }, + { + "epoch": 0.4452328776653101, + "grad_norm": 1.421875, + "learning_rate": 0.0012240898124018303, + "loss": 0.6947, + "step": 6400 + }, + { + "epoch": 0.4453024453024453, + "grad_norm": 1.2890625, + "learning_rate": 0.001223870204273671, + "loss": 0.9957, + "step": 6401 + }, + { + "epoch": 0.4453720129395805, + "grad_norm": 1.2265625, + "learning_rate": 0.0012236505847785112, + "loss": 0.902, + "step": 6402 + }, + { + "epoch": 0.44544158057671573, + "grad_norm": 1.2578125, + "learning_rate": 0.0012234309539275018, + "loss": 0.857, + "step": 6403 + }, + { + "epoch": 0.4455111482138509, + "grad_norm": 1.40625, + "learning_rate": 0.0012232113117317948, + "loss": 0.7336, + "step": 6404 + }, + { + "epoch": 0.44558071585098613, + "grad_norm": 1.2421875, + "learning_rate": 0.0012229916582025427, + "loss": 1.2174, + "step": 6405 + }, + { + "epoch": 0.4456502834881213, + "grad_norm": 1.1328125, + "learning_rate": 0.0012227719933508977, + "loss": 0.8288, + "step": 6406 + }, + { + "epoch": 0.4457198511252565, + "grad_norm": 1.203125, + "learning_rate": 0.001222552317188014, + "loss": 0.8039, + "step": 6407 + }, + { + "epoch": 0.44578941876239175, + "grad_norm": 1.359375, + "learning_rate": 0.0012223326297250453, + "loss": 1.055, + "step": 6408 + }, + { + "epoch": 0.4458589863995269, + "grad_norm": 1.09375, + "learning_rate": 0.0012221129309731463, + "loss": 0.8263, + "step": 6409 + }, + { + "epoch": 0.44592855403666215, + "grad_norm": 1.140625, + "learning_rate": 0.0012218932209434722, + "loss": 0.8756, + "step": 6410 + }, + { + "epoch": 0.4459981216737974, + "grad_norm": 0.94921875, + "learning_rate": 0.0012216734996471788, + "loss": 0.8042, + "step": 6411 + }, + { + "epoch": 0.44606768931093255, + "grad_norm": 0.94921875, + "learning_rate": 0.0012214537670954225, + "loss": 0.8315, + "step": 6412 + }, + { + "epoch": 0.4461372569480678, + "grad_norm": 1.359375, + "learning_rate": 0.0012212340232993597, + "loss": 0.8429, + "step": 6413 + }, + { + "epoch": 0.44620682458520294, + "grad_norm": 1.0703125, + "learning_rate": 0.0012210142682701488, + "loss": 0.9472, + "step": 6414 + }, + { + "epoch": 0.44627639222233817, + "grad_norm": 0.90234375, + "learning_rate": 0.0012207945020189473, + "loss": 0.8691, + "step": 6415 + }, + { + "epoch": 0.4463459598594734, + "grad_norm": 1.125, + "learning_rate": 0.0012205747245569135, + "loss": 0.7448, + "step": 6416 + }, + { + "epoch": 0.44641552749660857, + "grad_norm": 0.9296875, + "learning_rate": 0.0012203549358952076, + "loss": 1.11, + "step": 6417 + }, + { + "epoch": 0.4464850951337438, + "grad_norm": 1.140625, + "learning_rate": 0.001220135136044988, + "loss": 0.8819, + "step": 6418 + }, + { + "epoch": 0.44655466277087896, + "grad_norm": 0.95703125, + "learning_rate": 0.0012199153250174162, + "loss": 0.8179, + "step": 6419 + }, + { + "epoch": 0.4466242304080142, + "grad_norm": 1.015625, + "learning_rate": 0.0012196955028236523, + "loss": 0.9889, + "step": 6420 + }, + { + "epoch": 0.4466937980451494, + "grad_norm": 1.0625, + "learning_rate": 0.0012194756694748586, + "loss": 0.8049, + "step": 6421 + }, + { + "epoch": 0.4467633656822846, + "grad_norm": 1.03125, + "learning_rate": 0.0012192558249821963, + "loss": 0.9424, + "step": 6422 + }, + { + "epoch": 0.4468329333194198, + "grad_norm": 1.15625, + "learning_rate": 0.0012190359693568284, + "loss": 0.9192, + "step": 6423 + }, + { + "epoch": 0.44690250095655504, + "grad_norm": 1.1015625, + "learning_rate": 0.0012188161026099183, + "loss": 0.9867, + "step": 6424 + }, + { + "epoch": 0.4469720685936902, + "grad_norm": 1.3125, + "learning_rate": 0.0012185962247526288, + "loss": 0.8561, + "step": 6425 + }, + { + "epoch": 0.44704163623082543, + "grad_norm": 1.078125, + "learning_rate": 0.0012183763357961252, + "loss": 0.8188, + "step": 6426 + }, + { + "epoch": 0.4471112038679606, + "grad_norm": 1.25, + "learning_rate": 0.001218156435751572, + "loss": 0.7588, + "step": 6427 + }, + { + "epoch": 0.44718077150509583, + "grad_norm": 1.046875, + "learning_rate": 0.0012179365246301347, + "loss": 0.8457, + "step": 6428 + }, + { + "epoch": 0.44725033914223106, + "grad_norm": 1.3203125, + "learning_rate": 0.0012177166024429787, + "loss": 0.9289, + "step": 6429 + }, + { + "epoch": 0.4473199067793662, + "grad_norm": 0.9453125, + "learning_rate": 0.0012174966692012712, + "loss": 0.6494, + "step": 6430 + }, + { + "epoch": 0.44738947441650145, + "grad_norm": 1.25, + "learning_rate": 0.0012172767249161796, + "loss": 0.8693, + "step": 6431 + }, + { + "epoch": 0.4474590420536366, + "grad_norm": 1.1640625, + "learning_rate": 0.0012170567695988703, + "loss": 0.8021, + "step": 6432 + }, + { + "epoch": 0.44752860969077185, + "grad_norm": 1.109375, + "learning_rate": 0.0012168368032605128, + "loss": 0.7772, + "step": 6433 + }, + { + "epoch": 0.4475981773279071, + "grad_norm": 1.171875, + "learning_rate": 0.001216616825912275, + "loss": 0.9562, + "step": 6434 + }, + { + "epoch": 0.44766774496504225, + "grad_norm": 1.1484375, + "learning_rate": 0.001216396837565327, + "loss": 0.9181, + "step": 6435 + }, + { + "epoch": 0.4477373126021775, + "grad_norm": 1.046875, + "learning_rate": 0.001216176838230838, + "loss": 0.7663, + "step": 6436 + }, + { + "epoch": 0.4478068802393127, + "grad_norm": 1.078125, + "learning_rate": 0.001215956827919979, + "loss": 0.7702, + "step": 6437 + }, + { + "epoch": 0.44787644787644787, + "grad_norm": 1.09375, + "learning_rate": 0.0012157368066439207, + "loss": 0.7952, + "step": 6438 + }, + { + "epoch": 0.4479460155135831, + "grad_norm": 1.171875, + "learning_rate": 0.0012155167744138345, + "loss": 0.5708, + "step": 6439 + }, + { + "epoch": 0.44801558315071827, + "grad_norm": 1.0078125, + "learning_rate": 0.0012152967312408932, + "loss": 1.1361, + "step": 6440 + }, + { + "epoch": 0.4480851507878535, + "grad_norm": 1.25, + "learning_rate": 0.0012150766771362688, + "loss": 0.9889, + "step": 6441 + }, + { + "epoch": 0.4481547184249887, + "grad_norm": 1.0234375, + "learning_rate": 0.0012148566121111348, + "loss": 0.617, + "step": 6442 + }, + { + "epoch": 0.4482242860621239, + "grad_norm": 1.09375, + "learning_rate": 0.001214636536176665, + "loss": 1.0036, + "step": 6443 + }, + { + "epoch": 0.4482938536992591, + "grad_norm": 1.359375, + "learning_rate": 0.001214416449344034, + "loss": 0.8777, + "step": 6444 + }, + { + "epoch": 0.4483634213363943, + "grad_norm": 1.1328125, + "learning_rate": 0.001214196351624416, + "loss": 0.9089, + "step": 6445 + }, + { + "epoch": 0.4484329889735295, + "grad_norm": 1.90625, + "learning_rate": 0.0012139762430289872, + "loss": 1.0769, + "step": 6446 + }, + { + "epoch": 0.44850255661066474, + "grad_norm": 1.0859375, + "learning_rate": 0.0012137561235689234, + "loss": 0.8059, + "step": 6447 + }, + { + "epoch": 0.4485721242477999, + "grad_norm": 0.90625, + "learning_rate": 0.0012135359932554006, + "loss": 0.6083, + "step": 6448 + }, + { + "epoch": 0.44864169188493513, + "grad_norm": 1.078125, + "learning_rate": 0.001213315852099597, + "loss": 0.6459, + "step": 6449 + }, + { + "epoch": 0.44871125952207036, + "grad_norm": 1.21875, + "learning_rate": 0.001213095700112689, + "loss": 1.041, + "step": 6450 + }, + { + "epoch": 0.44878082715920553, + "grad_norm": 0.9765625, + "learning_rate": 0.001212875537305856, + "loss": 0.8277, + "step": 6451 + }, + { + "epoch": 0.44885039479634076, + "grad_norm": 1.21875, + "learning_rate": 0.0012126553636902758, + "loss": 0.7094, + "step": 6452 + }, + { + "epoch": 0.4489199624334759, + "grad_norm": 1.3515625, + "learning_rate": 0.001212435179277128, + "loss": 0.9776, + "step": 6453 + }, + { + "epoch": 0.44898953007061115, + "grad_norm": 1.390625, + "learning_rate": 0.0012122149840775932, + "loss": 0.9729, + "step": 6454 + }, + { + "epoch": 0.4490590977077464, + "grad_norm": 1.2265625, + "learning_rate": 0.0012119947781028503, + "loss": 0.7491, + "step": 6455 + }, + { + "epoch": 0.44912866534488155, + "grad_norm": 1.2265625, + "learning_rate": 0.0012117745613640816, + "loss": 0.8354, + "step": 6456 + }, + { + "epoch": 0.4491982329820168, + "grad_norm": 1.1796875, + "learning_rate": 0.001211554333872468, + "loss": 0.9233, + "step": 6457 + }, + { + "epoch": 0.44926780061915195, + "grad_norm": 0.9609375, + "learning_rate": 0.0012113340956391916, + "loss": 0.6918, + "step": 6458 + }, + { + "epoch": 0.4493373682562872, + "grad_norm": 0.98046875, + "learning_rate": 0.001211113846675435, + "loss": 0.7891, + "step": 6459 + }, + { + "epoch": 0.4494069358934224, + "grad_norm": 1.125, + "learning_rate": 0.0012108935869923813, + "loss": 0.679, + "step": 6460 + }, + { + "epoch": 0.44947650353055757, + "grad_norm": 0.890625, + "learning_rate": 0.0012106733166012144, + "loss": 0.6628, + "step": 6461 + }, + { + "epoch": 0.4495460711676928, + "grad_norm": 1.234375, + "learning_rate": 0.0012104530355131183, + "loss": 0.7719, + "step": 6462 + }, + { + "epoch": 0.449615638804828, + "grad_norm": 0.9765625, + "learning_rate": 0.001210232743739278, + "loss": 0.8797, + "step": 6463 + }, + { + "epoch": 0.4496852064419632, + "grad_norm": 1.078125, + "learning_rate": 0.001210012441290878, + "loss": 1.2471, + "step": 6464 + }, + { + "epoch": 0.4497547740790984, + "grad_norm": 0.9453125, + "learning_rate": 0.0012097921281791057, + "loss": 0.57, + "step": 6465 + }, + { + "epoch": 0.4498243417162336, + "grad_norm": 1.15625, + "learning_rate": 0.0012095718044151458, + "loss": 0.7033, + "step": 6466 + }, + { + "epoch": 0.4498939093533688, + "grad_norm": 1.1796875, + "learning_rate": 0.0012093514700101864, + "loss": 0.9363, + "step": 6467 + }, + { + "epoch": 0.44996347699050404, + "grad_norm": 1.1640625, + "learning_rate": 0.0012091311249754144, + "loss": 0.9462, + "step": 6468 + }, + { + "epoch": 0.4500330446276392, + "grad_norm": 1.03125, + "learning_rate": 0.001208910769322018, + "loss": 1.047, + "step": 6469 + }, + { + "epoch": 0.45010261226477444, + "grad_norm": 1.1640625, + "learning_rate": 0.0012086904030611859, + "loss": 0.7955, + "step": 6470 + }, + { + "epoch": 0.4501721799019096, + "grad_norm": 1.0703125, + "learning_rate": 0.0012084700262041067, + "loss": 0.7809, + "step": 6471 + }, + { + "epoch": 0.45024174753904483, + "grad_norm": 1.296875, + "learning_rate": 0.0012082496387619706, + "loss": 0.9051, + "step": 6472 + }, + { + "epoch": 0.45031131517618006, + "grad_norm": 0.94140625, + "learning_rate": 0.0012080292407459672, + "loss": 0.6128, + "step": 6473 + }, + { + "epoch": 0.45038088281331523, + "grad_norm": 0.84375, + "learning_rate": 0.0012078088321672874, + "loss": 0.6237, + "step": 6474 + }, + { + "epoch": 0.45045045045045046, + "grad_norm": 1.2109375, + "learning_rate": 0.001207588413037123, + "loss": 0.9595, + "step": 6475 + }, + { + "epoch": 0.4505200180875856, + "grad_norm": 1.3359375, + "learning_rate": 0.001207367983366665, + "loss": 0.9554, + "step": 6476 + }, + { + "epoch": 0.45058958572472085, + "grad_norm": 1.1796875, + "learning_rate": 0.0012071475431671066, + "loss": 0.8974, + "step": 6477 + }, + { + "epoch": 0.4506591533618561, + "grad_norm": 1.078125, + "learning_rate": 0.0012069270924496393, + "loss": 0.7054, + "step": 6478 + }, + { + "epoch": 0.45072872099899125, + "grad_norm": 1.015625, + "learning_rate": 0.0012067066312254579, + "loss": 0.6205, + "step": 6479 + }, + { + "epoch": 0.4507982886361265, + "grad_norm": 1.0859375, + "learning_rate": 0.0012064861595057548, + "loss": 1.0292, + "step": 6480 + }, + { + "epoch": 0.4508678562732617, + "grad_norm": 1.046875, + "learning_rate": 0.001206265677301726, + "loss": 0.7173, + "step": 6481 + }, + { + "epoch": 0.4509374239103969, + "grad_norm": 1.0234375, + "learning_rate": 0.0012060451846245654, + "loss": 0.8283, + "step": 6482 + }, + { + "epoch": 0.4510069915475321, + "grad_norm": 1.1328125, + "learning_rate": 0.001205824681485469, + "loss": 0.9758, + "step": 6483 + }, + { + "epoch": 0.45107655918466727, + "grad_norm": 1.0, + "learning_rate": 0.0012056041678956326, + "loss": 0.7103, + "step": 6484 + }, + { + "epoch": 0.4511461268218025, + "grad_norm": 1.25, + "learning_rate": 0.001205383643866253, + "loss": 0.904, + "step": 6485 + }, + { + "epoch": 0.4512156944589377, + "grad_norm": 0.8203125, + "learning_rate": 0.0012051631094085274, + "loss": 0.7688, + "step": 6486 + }, + { + "epoch": 0.4512852620960729, + "grad_norm": 0.8515625, + "learning_rate": 0.0012049425645336528, + "loss": 0.7455, + "step": 6487 + }, + { + "epoch": 0.4513548297332081, + "grad_norm": 0.94921875, + "learning_rate": 0.0012047220092528282, + "loss": 0.7169, + "step": 6488 + }, + { + "epoch": 0.4514243973703433, + "grad_norm": 1.046875, + "learning_rate": 0.0012045014435772513, + "loss": 0.9323, + "step": 6489 + }, + { + "epoch": 0.4514939650074785, + "grad_norm": 1.046875, + "learning_rate": 0.001204280867518122, + "loss": 0.9188, + "step": 6490 + }, + { + "epoch": 0.45156353264461374, + "grad_norm": 1.421875, + "learning_rate": 0.0012040602810866401, + "loss": 0.9239, + "step": 6491 + }, + { + "epoch": 0.4516331002817489, + "grad_norm": 0.99609375, + "learning_rate": 0.0012038396842940055, + "loss": 0.9611, + "step": 6492 + }, + { + "epoch": 0.45170266791888414, + "grad_norm": 1.3828125, + "learning_rate": 0.0012036190771514195, + "loss": 0.9009, + "step": 6493 + }, + { + "epoch": 0.45177223555601936, + "grad_norm": 1.09375, + "learning_rate": 0.0012033984596700827, + "loss": 0.6225, + "step": 6494 + }, + { + "epoch": 0.45184180319315453, + "grad_norm": 0.98046875, + "learning_rate": 0.0012031778318611977, + "loss": 0.8611, + "step": 6495 + }, + { + "epoch": 0.45191137083028976, + "grad_norm": 0.77734375, + "learning_rate": 0.001202957193735966, + "loss": 0.7015, + "step": 6496 + }, + { + "epoch": 0.45198093846742493, + "grad_norm": 0.9765625, + "learning_rate": 0.001202736545305591, + "loss": 0.8262, + "step": 6497 + }, + { + "epoch": 0.45205050610456016, + "grad_norm": 1.2265625, + "learning_rate": 0.0012025158865812764, + "loss": 0.8527, + "step": 6498 + }, + { + "epoch": 0.4521200737416954, + "grad_norm": 1.1015625, + "learning_rate": 0.001202295217574226, + "loss": 0.7397, + "step": 6499 + }, + { + "epoch": 0.45218964137883055, + "grad_norm": 1.1640625, + "learning_rate": 0.0012020745382956438, + "loss": 0.9292, + "step": 6500 + }, + { + "epoch": 0.4522592090159658, + "grad_norm": 1.421875, + "learning_rate": 0.001201853848756735, + "loss": 0.7857, + "step": 6501 + }, + { + "epoch": 0.45232877665310095, + "grad_norm": 1.09375, + "learning_rate": 0.0012016331489687056, + "loss": 0.8355, + "step": 6502 + }, + { + "epoch": 0.4523983442902362, + "grad_norm": 1.1640625, + "learning_rate": 0.0012014124389427606, + "loss": 0.9161, + "step": 6503 + }, + { + "epoch": 0.4524679119273714, + "grad_norm": 1.046875, + "learning_rate": 0.0012011917186901075, + "loss": 0.9408, + "step": 6504 + }, + { + "epoch": 0.4525374795645066, + "grad_norm": 1.1015625, + "learning_rate": 0.0012009709882219528, + "loss": 0.9124, + "step": 6505 + }, + { + "epoch": 0.4526070472016418, + "grad_norm": 1.0390625, + "learning_rate": 0.0012007502475495048, + "loss": 0.8368, + "step": 6506 + }, + { + "epoch": 0.452676614838777, + "grad_norm": 1.578125, + "learning_rate": 0.0012005294966839703, + "loss": 0.9683, + "step": 6507 + }, + { + "epoch": 0.4527461824759122, + "grad_norm": 1.4296875, + "learning_rate": 0.0012003087356365595, + "loss": 0.7981, + "step": 6508 + }, + { + "epoch": 0.4528157501130474, + "grad_norm": 1.0703125, + "learning_rate": 0.0012000879644184803, + "loss": 0.6341, + "step": 6509 + }, + { + "epoch": 0.4528853177501826, + "grad_norm": 1.1875, + "learning_rate": 0.0011998671830409427, + "loss": 0.6034, + "step": 6510 + }, + { + "epoch": 0.4529548853873178, + "grad_norm": 1.2578125, + "learning_rate": 0.0011996463915151573, + "loss": 0.6165, + "step": 6511 + }, + { + "epoch": 0.45302445302445304, + "grad_norm": 1.09375, + "learning_rate": 0.0011994255898523341, + "loss": 0.9816, + "step": 6512 + }, + { + "epoch": 0.4530940206615882, + "grad_norm": 1.234375, + "learning_rate": 0.0011992047780636848, + "loss": 0.7594, + "step": 6513 + }, + { + "epoch": 0.45316358829872344, + "grad_norm": 1.0078125, + "learning_rate": 0.0011989839561604208, + "loss": 0.7464, + "step": 6514 + }, + { + "epoch": 0.4532331559358586, + "grad_norm": 0.8671875, + "learning_rate": 0.0011987631241537546, + "loss": 0.883, + "step": 6515 + }, + { + "epoch": 0.45330272357299384, + "grad_norm": 1.265625, + "learning_rate": 0.0011985422820548989, + "loss": 0.776, + "step": 6516 + }, + { + "epoch": 0.45337229121012906, + "grad_norm": 1.4609375, + "learning_rate": 0.0011983214298750663, + "loss": 0.9282, + "step": 6517 + }, + { + "epoch": 0.45344185884726423, + "grad_norm": 0.92578125, + "learning_rate": 0.0011981005676254717, + "loss": 0.7438, + "step": 6518 + }, + { + "epoch": 0.45351142648439946, + "grad_norm": 1.1328125, + "learning_rate": 0.0011978796953173285, + "loss": 0.9153, + "step": 6519 + }, + { + "epoch": 0.4535809941215347, + "grad_norm": 0.95703125, + "learning_rate": 0.001197658812961852, + "loss": 0.7996, + "step": 6520 + }, + { + "epoch": 0.45365056175866986, + "grad_norm": 0.8984375, + "learning_rate": 0.001197437920570257, + "loss": 0.7393, + "step": 6521 + }, + { + "epoch": 0.4537201293958051, + "grad_norm": 1.3046875, + "learning_rate": 0.0011972170181537595, + "loss": 0.8687, + "step": 6522 + }, + { + "epoch": 0.45378969703294025, + "grad_norm": 1.4296875, + "learning_rate": 0.001196996105723576, + "loss": 0.7682, + "step": 6523 + }, + { + "epoch": 0.4538592646700755, + "grad_norm": 1.0703125, + "learning_rate": 0.0011967751832909232, + "loss": 0.7974, + "step": 6524 + }, + { + "epoch": 0.4539288323072107, + "grad_norm": 1.3125, + "learning_rate": 0.0011965542508670188, + "loss": 1.0709, + "step": 6525 + }, + { + "epoch": 0.4539983999443459, + "grad_norm": 0.921875, + "learning_rate": 0.0011963333084630797, + "loss": 0.8347, + "step": 6526 + }, + { + "epoch": 0.4540679675814811, + "grad_norm": 1.109375, + "learning_rate": 0.0011961123560903248, + "loss": 0.9493, + "step": 6527 + }, + { + "epoch": 0.4541375352186163, + "grad_norm": 1.1484375, + "learning_rate": 0.0011958913937599731, + "loss": 0.9302, + "step": 6528 + }, + { + "epoch": 0.4542071028557515, + "grad_norm": 1.0703125, + "learning_rate": 0.001195670421483244, + "loss": 0.8061, + "step": 6529 + }, + { + "epoch": 0.4542766704928867, + "grad_norm": 0.9453125, + "learning_rate": 0.0011954494392713566, + "loss": 0.8252, + "step": 6530 + }, + { + "epoch": 0.4543462381300219, + "grad_norm": 1.09375, + "learning_rate": 0.0011952284471355324, + "loss": 0.9535, + "step": 6531 + }, + { + "epoch": 0.4544158057671571, + "grad_norm": 1.1796875, + "learning_rate": 0.0011950074450869912, + "loss": 0.7242, + "step": 6532 + }, + { + "epoch": 0.45448537340429235, + "grad_norm": 1.1484375, + "learning_rate": 0.001194786433136955, + "loss": 1.0569, + "step": 6533 + }, + { + "epoch": 0.4545549410414275, + "grad_norm": 1.078125, + "learning_rate": 0.0011945654112966457, + "loss": 0.9092, + "step": 6534 + }, + { + "epoch": 0.45462450867856274, + "grad_norm": 0.98828125, + "learning_rate": 0.0011943443795772854, + "loss": 0.7104, + "step": 6535 + }, + { + "epoch": 0.4546940763156979, + "grad_norm": 1.0625, + "learning_rate": 0.0011941233379900971, + "loss": 0.8083, + "step": 6536 + }, + { + "epoch": 0.45476364395283314, + "grad_norm": 1.015625, + "learning_rate": 0.001193902286546304, + "loss": 0.7365, + "step": 6537 + }, + { + "epoch": 0.45483321158996837, + "grad_norm": 0.921875, + "learning_rate": 0.0011936812252571303, + "loss": 0.8173, + "step": 6538 + }, + { + "epoch": 0.45490277922710354, + "grad_norm": 1.0703125, + "learning_rate": 0.0011934601541338003, + "loss": 0.8049, + "step": 6539 + }, + { + "epoch": 0.45497234686423876, + "grad_norm": 1.4375, + "learning_rate": 0.0011932390731875385, + "loss": 0.983, + "step": 6540 + }, + { + "epoch": 0.45504191450137393, + "grad_norm": 1.2265625, + "learning_rate": 0.0011930179824295706, + "loss": 0.9033, + "step": 6541 + }, + { + "epoch": 0.45511148213850916, + "grad_norm": 0.93359375, + "learning_rate": 0.0011927968818711227, + "loss": 0.9472, + "step": 6542 + }, + { + "epoch": 0.4551810497756444, + "grad_norm": 0.98828125, + "learning_rate": 0.0011925757715234204, + "loss": 0.5104, + "step": 6543 + }, + { + "epoch": 0.45525061741277956, + "grad_norm": 1.0, + "learning_rate": 0.0011923546513976915, + "loss": 1.0062, + "step": 6544 + }, + { + "epoch": 0.4553201850499148, + "grad_norm": 1.109375, + "learning_rate": 0.001192133521505163, + "loss": 0.8641, + "step": 6545 + }, + { + "epoch": 0.45538975268705, + "grad_norm": 1.4140625, + "learning_rate": 0.0011919123818570625, + "loss": 0.911, + "step": 6546 + }, + { + "epoch": 0.4554593203241852, + "grad_norm": 1.234375, + "learning_rate": 0.0011916912324646184, + "loss": 0.7561, + "step": 6547 + }, + { + "epoch": 0.4555288879613204, + "grad_norm": 1.3046875, + "learning_rate": 0.00119147007333906, + "loss": 0.9323, + "step": 6548 + }, + { + "epoch": 0.4555984555984556, + "grad_norm": 1.1328125, + "learning_rate": 0.0011912489044916164, + "loss": 0.6791, + "step": 6549 + }, + { + "epoch": 0.4556680232355908, + "grad_norm": 1.0859375, + "learning_rate": 0.0011910277259335172, + "loss": 1.0677, + "step": 6550 + }, + { + "epoch": 0.45573759087272603, + "grad_norm": 0.99609375, + "learning_rate": 0.001190806537675993, + "loss": 0.9121, + "step": 6551 + }, + { + "epoch": 0.4558071585098612, + "grad_norm": 1.0859375, + "learning_rate": 0.0011905853397302746, + "loss": 0.9033, + "step": 6552 + }, + { + "epoch": 0.4558767261469964, + "grad_norm": 1.375, + "learning_rate": 0.001190364132107593, + "loss": 0.9323, + "step": 6553 + }, + { + "epoch": 0.4559462937841316, + "grad_norm": 1.0625, + "learning_rate": 0.0011901429148191806, + "loss": 0.6688, + "step": 6554 + }, + { + "epoch": 0.4560158614212668, + "grad_norm": 1.1015625, + "learning_rate": 0.0011899216878762692, + "loss": 0.7977, + "step": 6555 + }, + { + "epoch": 0.45608542905840205, + "grad_norm": 0.94921875, + "learning_rate": 0.001189700451290092, + "loss": 0.8965, + "step": 6556 + }, + { + "epoch": 0.4561549966955372, + "grad_norm": 1.0078125, + "learning_rate": 0.0011894792050718818, + "loss": 0.7538, + "step": 6557 + }, + { + "epoch": 0.45622456433267244, + "grad_norm": 1.453125, + "learning_rate": 0.0011892579492328728, + "loss": 0.8227, + "step": 6558 + }, + { + "epoch": 0.45629413196980767, + "grad_norm": 1.265625, + "learning_rate": 0.001189036683784299, + "loss": 1.0174, + "step": 6559 + }, + { + "epoch": 0.45636369960694284, + "grad_norm": 1.078125, + "learning_rate": 0.001188815408737395, + "loss": 0.7564, + "step": 6560 + }, + { + "epoch": 0.45643326724407807, + "grad_norm": 1.109375, + "learning_rate": 0.0011885941241033967, + "loss": 0.72, + "step": 6561 + }, + { + "epoch": 0.45650283488121324, + "grad_norm": 1.109375, + "learning_rate": 0.001188372829893539, + "loss": 0.7294, + "step": 6562 + }, + { + "epoch": 0.45657240251834846, + "grad_norm": 1.09375, + "learning_rate": 0.0011881515261190586, + "loss": 0.8049, + "step": 6563 + }, + { + "epoch": 0.4566419701554837, + "grad_norm": 1.421875, + "learning_rate": 0.001187930212791192, + "loss": 0.8503, + "step": 6564 + }, + { + "epoch": 0.45671153779261886, + "grad_norm": 1.1953125, + "learning_rate": 0.0011877088899211762, + "loss": 0.7304, + "step": 6565 + }, + { + "epoch": 0.4567811054297541, + "grad_norm": 1.0625, + "learning_rate": 0.0011874875575202495, + "loss": 0.6712, + "step": 6566 + }, + { + "epoch": 0.45685067306688926, + "grad_norm": 1.2734375, + "learning_rate": 0.0011872662155996494, + "loss": 0.7354, + "step": 6567 + }, + { + "epoch": 0.4569202407040245, + "grad_norm": 1.0703125, + "learning_rate": 0.0011870448641706148, + "loss": 0.8842, + "step": 6568 + }, + { + "epoch": 0.4569898083411597, + "grad_norm": 1.1484375, + "learning_rate": 0.0011868235032443848, + "loss": 0.8922, + "step": 6569 + }, + { + "epoch": 0.4570593759782949, + "grad_norm": 1.09375, + "learning_rate": 0.001186602132832199, + "loss": 0.9433, + "step": 6570 + }, + { + "epoch": 0.4571289436154301, + "grad_norm": 0.9765625, + "learning_rate": 0.0011863807529452974, + "loss": 1.005, + "step": 6571 + }, + { + "epoch": 0.45719851125256533, + "grad_norm": 0.9375, + "learning_rate": 0.0011861593635949207, + "loss": 0.7681, + "step": 6572 + }, + { + "epoch": 0.4572680788897005, + "grad_norm": 1.125, + "learning_rate": 0.0011859379647923096, + "loss": 0.8305, + "step": 6573 + }, + { + "epoch": 0.45733764652683573, + "grad_norm": 1.2890625, + "learning_rate": 0.001185716556548706, + "loss": 1.1047, + "step": 6574 + }, + { + "epoch": 0.4574072141639709, + "grad_norm": 0.81640625, + "learning_rate": 0.001185495138875352, + "loss": 0.7204, + "step": 6575 + }, + { + "epoch": 0.4574767818011061, + "grad_norm": 1.3359375, + "learning_rate": 0.0011852737117834893, + "loss": 0.8898, + "step": 6576 + }, + { + "epoch": 0.45754634943824135, + "grad_norm": 1.1640625, + "learning_rate": 0.0011850522752843615, + "loss": 0.945, + "step": 6577 + }, + { + "epoch": 0.4576159170753765, + "grad_norm": 1.15625, + "learning_rate": 0.001184830829389212, + "loss": 0.9251, + "step": 6578 + }, + { + "epoch": 0.45768548471251175, + "grad_norm": 1.0546875, + "learning_rate": 0.0011846093741092847, + "loss": 0.874, + "step": 6579 + }, + { + "epoch": 0.4577550523496469, + "grad_norm": 1.1171875, + "learning_rate": 0.0011843879094558239, + "loss": 0.9574, + "step": 6580 + }, + { + "epoch": 0.45782461998678214, + "grad_norm": 1.15625, + "learning_rate": 0.0011841664354400741, + "loss": 0.9693, + "step": 6581 + }, + { + "epoch": 0.45789418762391737, + "grad_norm": 1.15625, + "learning_rate": 0.0011839449520732812, + "loss": 0.9989, + "step": 6582 + }, + { + "epoch": 0.45796375526105254, + "grad_norm": 1.1171875, + "learning_rate": 0.0011837234593666908, + "loss": 0.7398, + "step": 6583 + }, + { + "epoch": 0.45803332289818777, + "grad_norm": 1.0234375, + "learning_rate": 0.0011835019573315493, + "loss": 0.7772, + "step": 6584 + }, + { + "epoch": 0.458102890535323, + "grad_norm": 1.3671875, + "learning_rate": 0.0011832804459791031, + "loss": 0.7726, + "step": 6585 + }, + { + "epoch": 0.45817245817245816, + "grad_norm": 1.1875, + "learning_rate": 0.0011830589253205997, + "loss": 0.9433, + "step": 6586 + }, + { + "epoch": 0.4582420258095934, + "grad_norm": 1.1953125, + "learning_rate": 0.0011828373953672868, + "loss": 0.765, + "step": 6587 + }, + { + "epoch": 0.45831159344672856, + "grad_norm": 1.1484375, + "learning_rate": 0.0011826158561304126, + "loss": 0.7674, + "step": 6588 + }, + { + "epoch": 0.4583811610838638, + "grad_norm": 1.3515625, + "learning_rate": 0.0011823943076212256, + "loss": 1.1343, + "step": 6589 + }, + { + "epoch": 0.458450728720999, + "grad_norm": 1.2734375, + "learning_rate": 0.001182172749850975, + "loss": 1.2282, + "step": 6590 + }, + { + "epoch": 0.4585202963581342, + "grad_norm": 1.1875, + "learning_rate": 0.0011819511828309102, + "loss": 0.9132, + "step": 6591 + }, + { + "epoch": 0.4585898639952694, + "grad_norm": 1.171875, + "learning_rate": 0.0011817296065722816, + "loss": 0.9758, + "step": 6592 + }, + { + "epoch": 0.4586594316324046, + "grad_norm": 0.921875, + "learning_rate": 0.0011815080210863397, + "loss": 0.8898, + "step": 6593 + }, + { + "epoch": 0.4587289992695398, + "grad_norm": 0.88671875, + "learning_rate": 0.0011812864263843353, + "loss": 0.6673, + "step": 6594 + }, + { + "epoch": 0.45879856690667503, + "grad_norm": 0.91796875, + "learning_rate": 0.0011810648224775198, + "loss": 0.727, + "step": 6595 + }, + { + "epoch": 0.4588681345438102, + "grad_norm": 0.8203125, + "learning_rate": 0.0011808432093771454, + "loss": 0.7471, + "step": 6596 + }, + { + "epoch": 0.45893770218094543, + "grad_norm": 1.296875, + "learning_rate": 0.0011806215870944642, + "loss": 0.7002, + "step": 6597 + }, + { + "epoch": 0.45900726981808065, + "grad_norm": 1.078125, + "learning_rate": 0.0011803999556407293, + "loss": 0.8018, + "step": 6598 + }, + { + "epoch": 0.4590768374552158, + "grad_norm": 0.87109375, + "learning_rate": 0.0011801783150271934, + "loss": 0.6078, + "step": 6599 + }, + { + "epoch": 0.45914640509235105, + "grad_norm": 1.234375, + "learning_rate": 0.0011799566652651117, + "loss": 0.8592, + "step": 6600 + }, + { + "epoch": 0.4592159727294862, + "grad_norm": 1.0703125, + "learning_rate": 0.001179735006365737, + "loss": 0.8635, + "step": 6601 + }, + { + "epoch": 0.45928554036662145, + "grad_norm": 1.359375, + "learning_rate": 0.001179513338340325, + "loss": 0.7542, + "step": 6602 + }, + { + "epoch": 0.4593551080037567, + "grad_norm": 0.9375, + "learning_rate": 0.0011792916612001303, + "loss": 0.7602, + "step": 6603 + }, + { + "epoch": 0.45942467564089184, + "grad_norm": 1.0859375, + "learning_rate": 0.0011790699749564086, + "loss": 0.7673, + "step": 6604 + }, + { + "epoch": 0.45949424327802707, + "grad_norm": 0.84765625, + "learning_rate": 0.0011788482796204164, + "loss": 0.6943, + "step": 6605 + }, + { + "epoch": 0.45956381091516224, + "grad_norm": 1.2109375, + "learning_rate": 0.0011786265752034098, + "loss": 0.7012, + "step": 6606 + }, + { + "epoch": 0.45963337855229747, + "grad_norm": 1.3515625, + "learning_rate": 0.0011784048617166463, + "loss": 0.9853, + "step": 6607 + }, + { + "epoch": 0.4597029461894327, + "grad_norm": 1.1953125, + "learning_rate": 0.001178183139171383, + "loss": 0.7788, + "step": 6608 + }, + { + "epoch": 0.45977251382656786, + "grad_norm": 1.3828125, + "learning_rate": 0.0011779614075788781, + "loss": 0.5682, + "step": 6609 + }, + { + "epoch": 0.4598420814637031, + "grad_norm": 1.0390625, + "learning_rate": 0.0011777396669503898, + "loss": 0.6482, + "step": 6610 + }, + { + "epoch": 0.4599116491008383, + "grad_norm": 1.046875, + "learning_rate": 0.0011775179172971771, + "loss": 0.7693, + "step": 6611 + }, + { + "epoch": 0.4599812167379735, + "grad_norm": 1.2734375, + "learning_rate": 0.0011772961586304993, + "loss": 0.7675, + "step": 6612 + }, + { + "epoch": 0.4600507843751087, + "grad_norm": 0.984375, + "learning_rate": 0.0011770743909616161, + "loss": 0.7296, + "step": 6613 + }, + { + "epoch": 0.4601203520122439, + "grad_norm": 1.1796875, + "learning_rate": 0.0011768526143017882, + "loss": 0.9234, + "step": 6614 + }, + { + "epoch": 0.4601899196493791, + "grad_norm": 0.97265625, + "learning_rate": 0.0011766308286622756, + "loss": 0.8889, + "step": 6615 + }, + { + "epoch": 0.46025948728651433, + "grad_norm": 0.85546875, + "learning_rate": 0.00117640903405434, + "loss": 0.756, + "step": 6616 + }, + { + "epoch": 0.4603290549236495, + "grad_norm": 0.9921875, + "learning_rate": 0.0011761872304892427, + "loss": 0.6366, + "step": 6617 + }, + { + "epoch": 0.46039862256078473, + "grad_norm": 1.34375, + "learning_rate": 0.001175965417978246, + "loss": 1.0019, + "step": 6618 + }, + { + "epoch": 0.4604681901979199, + "grad_norm": 1.03125, + "learning_rate": 0.0011757435965326123, + "loss": 0.7696, + "step": 6619 + }, + { + "epoch": 0.46053775783505513, + "grad_norm": 1.0078125, + "learning_rate": 0.0011755217661636047, + "loss": 0.7882, + "step": 6620 + }, + { + "epoch": 0.46060732547219035, + "grad_norm": 1.1171875, + "learning_rate": 0.0011752999268824862, + "loss": 0.8832, + "step": 6621 + }, + { + "epoch": 0.4606768931093255, + "grad_norm": 1.2734375, + "learning_rate": 0.001175078078700521, + "loss": 1.199, + "step": 6622 + }, + { + "epoch": 0.46074646074646075, + "grad_norm": 1.046875, + "learning_rate": 0.0011748562216289738, + "loss": 0.9617, + "step": 6623 + }, + { + "epoch": 0.460816028383596, + "grad_norm": 1.2421875, + "learning_rate": 0.0011746343556791085, + "loss": 0.8008, + "step": 6624 + }, + { + "epoch": 0.46088559602073115, + "grad_norm": 1.2578125, + "learning_rate": 0.001174412480862191, + "loss": 0.6703, + "step": 6625 + }, + { + "epoch": 0.4609551636578664, + "grad_norm": 1.265625, + "learning_rate": 0.0011741905971894872, + "loss": 0.9244, + "step": 6626 + }, + { + "epoch": 0.46102473129500154, + "grad_norm": 1.0390625, + "learning_rate": 0.0011739687046722627, + "loss": 0.6949, + "step": 6627 + }, + { + "epoch": 0.46109429893213677, + "grad_norm": 1.2734375, + "learning_rate": 0.001173746803321784, + "loss": 0.998, + "step": 6628 + }, + { + "epoch": 0.461163866569272, + "grad_norm": 1.2734375, + "learning_rate": 0.0011735248931493184, + "loss": 0.9241, + "step": 6629 + }, + { + "epoch": 0.46123343420640717, + "grad_norm": 1.3203125, + "learning_rate": 0.0011733029741661336, + "loss": 1.089, + "step": 6630 + }, + { + "epoch": 0.4613030018435424, + "grad_norm": 1.515625, + "learning_rate": 0.0011730810463834972, + "loss": 0.7909, + "step": 6631 + }, + { + "epoch": 0.46137256948067756, + "grad_norm": 0.96875, + "learning_rate": 0.0011728591098126775, + "loss": 0.6703, + "step": 6632 + }, + { + "epoch": 0.4614421371178128, + "grad_norm": 1.125, + "learning_rate": 0.0011726371644649436, + "loss": 0.8211, + "step": 6633 + }, + { + "epoch": 0.461511704754948, + "grad_norm": 1.046875, + "learning_rate": 0.0011724152103515647, + "loss": 0.9068, + "step": 6634 + }, + { + "epoch": 0.4615812723920832, + "grad_norm": 1.1640625, + "learning_rate": 0.0011721932474838103, + "loss": 0.813, + "step": 6635 + }, + { + "epoch": 0.4616508400292184, + "grad_norm": 1.234375, + "learning_rate": 0.0011719712758729505, + "loss": 0.7287, + "step": 6636 + }, + { + "epoch": 0.46172040766635364, + "grad_norm": 1.234375, + "learning_rate": 0.0011717492955302569, + "loss": 0.9651, + "step": 6637 + }, + { + "epoch": 0.4617899753034888, + "grad_norm": 0.96484375, + "learning_rate": 0.0011715273064669988, + "loss": 0.7242, + "step": 6638 + }, + { + "epoch": 0.46185954294062403, + "grad_norm": 1.1328125, + "learning_rate": 0.0011713053086944494, + "loss": 1.2099, + "step": 6639 + }, + { + "epoch": 0.4619291105777592, + "grad_norm": 1.109375, + "learning_rate": 0.0011710833022238797, + "loss": 0.6752, + "step": 6640 + }, + { + "epoch": 0.46199867821489443, + "grad_norm": 1.1328125, + "learning_rate": 0.001170861287066562, + "loss": 0.9003, + "step": 6641 + }, + { + "epoch": 0.46206824585202966, + "grad_norm": 1.125, + "learning_rate": 0.0011706392632337694, + "loss": 0.8056, + "step": 6642 + }, + { + "epoch": 0.46213781348916483, + "grad_norm": 0.9609375, + "learning_rate": 0.0011704172307367754, + "loss": 0.697, + "step": 6643 + }, + { + "epoch": 0.46220738112630005, + "grad_norm": 1.1015625, + "learning_rate": 0.001170195189586853, + "loss": 0.7523, + "step": 6644 + }, + { + "epoch": 0.4622769487634352, + "grad_norm": 1.1015625, + "learning_rate": 0.0011699731397952766, + "loss": 1.0126, + "step": 6645 + }, + { + "epoch": 0.46234651640057045, + "grad_norm": 1.28125, + "learning_rate": 0.0011697510813733214, + "loss": 0.869, + "step": 6646 + }, + { + "epoch": 0.4624160840377057, + "grad_norm": 1.171875, + "learning_rate": 0.0011695290143322616, + "loss": 0.8879, + "step": 6647 + }, + { + "epoch": 0.46248565167484085, + "grad_norm": 1.234375, + "learning_rate": 0.001169306938683373, + "loss": 0.8482, + "step": 6648 + }, + { + "epoch": 0.4625552193119761, + "grad_norm": 1.1796875, + "learning_rate": 0.0011690848544379316, + "loss": 0.6539, + "step": 6649 + }, + { + "epoch": 0.4626247869491113, + "grad_norm": 0.85546875, + "learning_rate": 0.0011688627616072132, + "loss": 0.732, + "step": 6650 + }, + { + "epoch": 0.46269435458624647, + "grad_norm": 1.4765625, + "learning_rate": 0.001168640660202495, + "loss": 1.0946, + "step": 6651 + }, + { + "epoch": 0.4627639222233817, + "grad_norm": 1.0078125, + "learning_rate": 0.001168418550235054, + "loss": 0.8689, + "step": 6652 + }, + { + "epoch": 0.46283348986051687, + "grad_norm": 1.15625, + "learning_rate": 0.0011681964317161685, + "loss": 0.9217, + "step": 6653 + }, + { + "epoch": 0.4629030574976521, + "grad_norm": 1.78125, + "learning_rate": 0.001167974304657115, + "loss": 0.93, + "step": 6654 + }, + { + "epoch": 0.4629726251347873, + "grad_norm": 1.140625, + "learning_rate": 0.001167752169069174, + "loss": 0.8417, + "step": 6655 + }, + { + "epoch": 0.4630421927719225, + "grad_norm": 1.15625, + "learning_rate": 0.0011675300249636227, + "loss": 0.8598, + "step": 6656 + }, + { + "epoch": 0.4631117604090577, + "grad_norm": 1.171875, + "learning_rate": 0.0011673078723517414, + "loss": 0.9799, + "step": 6657 + }, + { + "epoch": 0.4631813280461929, + "grad_norm": 1.1953125, + "learning_rate": 0.0011670857112448094, + "loss": 0.8178, + "step": 6658 + }, + { + "epoch": 0.4632508956833281, + "grad_norm": 1.0546875, + "learning_rate": 0.0011668635416541072, + "loss": 0.9696, + "step": 6659 + }, + { + "epoch": 0.46332046332046334, + "grad_norm": 1.109375, + "learning_rate": 0.0011666413635909156, + "loss": 0.8456, + "step": 6660 + }, + { + "epoch": 0.4633900309575985, + "grad_norm": 0.89453125, + "learning_rate": 0.0011664191770665154, + "loss": 0.6578, + "step": 6661 + }, + { + "epoch": 0.46345959859473373, + "grad_norm": 1.1171875, + "learning_rate": 0.0011661969820921884, + "loss": 0.742, + "step": 6662 + }, + { + "epoch": 0.46352916623186896, + "grad_norm": 1.0859375, + "learning_rate": 0.0011659747786792161, + "loss": 0.8278, + "step": 6663 + }, + { + "epoch": 0.46359873386900413, + "grad_norm": 0.91796875, + "learning_rate": 0.0011657525668388813, + "loss": 0.7629, + "step": 6664 + }, + { + "epoch": 0.46366830150613936, + "grad_norm": 1.0546875, + "learning_rate": 0.0011655303465824664, + "loss": 0.637, + "step": 6665 + }, + { + "epoch": 0.46373786914327453, + "grad_norm": 1.28125, + "learning_rate": 0.0011653081179212549, + "loss": 0.7785, + "step": 6666 + }, + { + "epoch": 0.46380743678040975, + "grad_norm": 1.203125, + "learning_rate": 0.0011650858808665303, + "loss": 0.9898, + "step": 6667 + }, + { + "epoch": 0.463877004417545, + "grad_norm": 1.0, + "learning_rate": 0.0011648636354295767, + "loss": 0.7137, + "step": 6668 + }, + { + "epoch": 0.46394657205468015, + "grad_norm": 0.890625, + "learning_rate": 0.0011646413816216792, + "loss": 0.5219, + "step": 6669 + }, + { + "epoch": 0.4640161396918154, + "grad_norm": 0.8359375, + "learning_rate": 0.0011644191194541216, + "loss": 0.6708, + "step": 6670 + }, + { + "epoch": 0.46408570732895055, + "grad_norm": 1.0390625, + "learning_rate": 0.0011641968489381903, + "loss": 0.7185, + "step": 6671 + }, + { + "epoch": 0.4641552749660858, + "grad_norm": 1.265625, + "learning_rate": 0.0011639745700851702, + "loss": 0.755, + "step": 6672 + }, + { + "epoch": 0.464224842603221, + "grad_norm": 1.5703125, + "learning_rate": 0.001163752282906348, + "loss": 0.7932, + "step": 6673 + }, + { + "epoch": 0.46429441024035617, + "grad_norm": 1.390625, + "learning_rate": 0.0011635299874130107, + "loss": 0.9895, + "step": 6674 + }, + { + "epoch": 0.4643639778774914, + "grad_norm": 1.2890625, + "learning_rate": 0.0011633076836164444, + "loss": 1.0151, + "step": 6675 + }, + { + "epoch": 0.4644335455146266, + "grad_norm": 1.25, + "learning_rate": 0.0011630853715279374, + "loss": 1.0777, + "step": 6676 + }, + { + "epoch": 0.4645031131517618, + "grad_norm": 1.2890625, + "learning_rate": 0.0011628630511587767, + "loss": 0.8239, + "step": 6677 + }, + { + "epoch": 0.464572680788897, + "grad_norm": 1.3046875, + "learning_rate": 0.001162640722520252, + "loss": 0.9758, + "step": 6678 + }, + { + "epoch": 0.4646422484260322, + "grad_norm": 1.1796875, + "learning_rate": 0.0011624183856236505, + "loss": 1.0418, + "step": 6679 + }, + { + "epoch": 0.4647118160631674, + "grad_norm": 0.984375, + "learning_rate": 0.0011621960404802623, + "loss": 0.8334, + "step": 6680 + }, + { + "epoch": 0.46478138370030264, + "grad_norm": 0.9375, + "learning_rate": 0.0011619736871013766, + "loss": 0.764, + "step": 6681 + }, + { + "epoch": 0.4648509513374378, + "grad_norm": 1.4453125, + "learning_rate": 0.0011617513254982834, + "loss": 1.0208, + "step": 6682 + }, + { + "epoch": 0.46492051897457304, + "grad_norm": 1.0703125, + "learning_rate": 0.0011615289556822735, + "loss": 0.8085, + "step": 6683 + }, + { + "epoch": 0.4649900866117082, + "grad_norm": 1.1484375, + "learning_rate": 0.001161306577664637, + "loss": 0.8879, + "step": 6684 + }, + { + "epoch": 0.46505965424884343, + "grad_norm": 1.0859375, + "learning_rate": 0.0011610841914566658, + "loss": 0.9558, + "step": 6685 + }, + { + "epoch": 0.46512922188597866, + "grad_norm": 0.89453125, + "learning_rate": 0.0011608617970696512, + "loss": 0.6572, + "step": 6686 + }, + { + "epoch": 0.46519878952311383, + "grad_norm": 1.2734375, + "learning_rate": 0.0011606393945148854, + "loss": 1.0288, + "step": 6687 + }, + { + "epoch": 0.46526835716024906, + "grad_norm": 1.21875, + "learning_rate": 0.0011604169838036608, + "loss": 0.9335, + "step": 6688 + }, + { + "epoch": 0.4653379247973843, + "grad_norm": 1.109375, + "learning_rate": 0.00116019456494727, + "loss": 0.8959, + "step": 6689 + }, + { + "epoch": 0.46540749243451945, + "grad_norm": 0.83984375, + "learning_rate": 0.0011599721379570071, + "loss": 0.6395, + "step": 6690 + }, + { + "epoch": 0.4654770600716547, + "grad_norm": 1.109375, + "learning_rate": 0.001159749702844165, + "loss": 1.0481, + "step": 6691 + }, + { + "epoch": 0.46554662770878985, + "grad_norm": 1.0859375, + "learning_rate": 0.0011595272596200386, + "loss": 0.792, + "step": 6692 + }, + { + "epoch": 0.4656161953459251, + "grad_norm": 1.0625, + "learning_rate": 0.0011593048082959216, + "loss": 0.714, + "step": 6693 + }, + { + "epoch": 0.4656857629830603, + "grad_norm": 1.1796875, + "learning_rate": 0.00115908234888311, + "loss": 0.845, + "step": 6694 + }, + { + "epoch": 0.4657553306201955, + "grad_norm": 1.125, + "learning_rate": 0.0011588598813928978, + "loss": 0.6841, + "step": 6695 + }, + { + "epoch": 0.4658248982573307, + "grad_norm": 1.0859375, + "learning_rate": 0.001158637405836582, + "loss": 0.7866, + "step": 6696 + }, + { + "epoch": 0.46589446589446587, + "grad_norm": 1.0078125, + "learning_rate": 0.0011584149222254583, + "loss": 0.7629, + "step": 6697 + }, + { + "epoch": 0.4659640335316011, + "grad_norm": 0.86328125, + "learning_rate": 0.0011581924305708229, + "loss": 0.6811, + "step": 6698 + }, + { + "epoch": 0.4660336011687363, + "grad_norm": 1.125, + "learning_rate": 0.0011579699308839739, + "loss": 0.9732, + "step": 6699 + }, + { + "epoch": 0.4661031688058715, + "grad_norm": 1.2421875, + "learning_rate": 0.0011577474231762076, + "loss": 0.7477, + "step": 6700 + }, + { + "epoch": 0.4661727364430067, + "grad_norm": 0.93359375, + "learning_rate": 0.0011575249074588223, + "loss": 0.8375, + "step": 6701 + }, + { + "epoch": 0.46624230408014194, + "grad_norm": 1.3203125, + "learning_rate": 0.0011573023837431163, + "loss": 0.6758, + "step": 6702 + }, + { + "epoch": 0.4663118717172771, + "grad_norm": 1.09375, + "learning_rate": 0.0011570798520403878, + "loss": 0.6657, + "step": 6703 + }, + { + "epoch": 0.46638143935441234, + "grad_norm": 1.046875, + "learning_rate": 0.0011568573123619367, + "loss": 0.7631, + "step": 6704 + }, + { + "epoch": 0.4664510069915475, + "grad_norm": 0.90625, + "learning_rate": 0.0011566347647190614, + "loss": 0.5423, + "step": 6705 + }, + { + "epoch": 0.46652057462868274, + "grad_norm": 0.97265625, + "learning_rate": 0.0011564122091230627, + "loss": 0.7526, + "step": 6706 + }, + { + "epoch": 0.46659014226581796, + "grad_norm": 1.1875, + "learning_rate": 0.00115618964558524, + "loss": 1.0364, + "step": 6707 + }, + { + "epoch": 0.46665970990295313, + "grad_norm": 1.3203125, + "learning_rate": 0.0011559670741168946, + "loss": 0.9639, + "step": 6708 + }, + { + "epoch": 0.46672927754008836, + "grad_norm": 1.203125, + "learning_rate": 0.001155744494729327, + "loss": 0.8695, + "step": 6709 + }, + { + "epoch": 0.46679884517722353, + "grad_norm": 1.328125, + "learning_rate": 0.0011555219074338393, + "loss": 0.9251, + "step": 6710 + }, + { + "epoch": 0.46686841281435876, + "grad_norm": 1.0234375, + "learning_rate": 0.001155299312241733, + "loss": 0.8074, + "step": 6711 + }, + { + "epoch": 0.466937980451494, + "grad_norm": 1.1875, + "learning_rate": 0.00115507670916431, + "loss": 0.8851, + "step": 6712 + }, + { + "epoch": 0.46700754808862915, + "grad_norm": 0.96875, + "learning_rate": 0.001154854098212874, + "loss": 0.7742, + "step": 6713 + }, + { + "epoch": 0.4670771157257644, + "grad_norm": 1.265625, + "learning_rate": 0.0011546314793987268, + "loss": 0.7902, + "step": 6714 + }, + { + "epoch": 0.4671466833628996, + "grad_norm": 1.1796875, + "learning_rate": 0.001154408852733173, + "loss": 1.073, + "step": 6715 + }, + { + "epoch": 0.4672162510000348, + "grad_norm": 0.9296875, + "learning_rate": 0.0011541862182275155, + "loss": 0.844, + "step": 6716 + }, + { + "epoch": 0.46728581863717, + "grad_norm": 1.015625, + "learning_rate": 0.0011539635758930592, + "loss": 0.6413, + "step": 6717 + }, + { + "epoch": 0.4673553862743052, + "grad_norm": 1.1875, + "learning_rate": 0.0011537409257411084, + "loss": 0.8827, + "step": 6718 + }, + { + "epoch": 0.4674249539114404, + "grad_norm": 1.234375, + "learning_rate": 0.0011535182677829684, + "loss": 0.7699, + "step": 6719 + }, + { + "epoch": 0.4674945215485756, + "grad_norm": 1.0078125, + "learning_rate": 0.0011532956020299447, + "loss": 0.8396, + "step": 6720 + }, + { + "epoch": 0.4675640891857108, + "grad_norm": 1.0390625, + "learning_rate": 0.0011530729284933428, + "loss": 0.8124, + "step": 6721 + }, + { + "epoch": 0.467633656822846, + "grad_norm": 1.2265625, + "learning_rate": 0.0011528502471844693, + "loss": 0.6586, + "step": 6722 + }, + { + "epoch": 0.4677032244599812, + "grad_norm": 1.2421875, + "learning_rate": 0.0011526275581146303, + "loss": 1.1657, + "step": 6723 + }, + { + "epoch": 0.4677727920971164, + "grad_norm": 0.96484375, + "learning_rate": 0.0011524048612951336, + "loss": 0.7377, + "step": 6724 + }, + { + "epoch": 0.46784235973425164, + "grad_norm": 1.0859375, + "learning_rate": 0.0011521821567372862, + "loss": 0.7954, + "step": 6725 + }, + { + "epoch": 0.4679119273713868, + "grad_norm": 1.2265625, + "learning_rate": 0.0011519594444523956, + "loss": 0.8808, + "step": 6726 + }, + { + "epoch": 0.46798149500852204, + "grad_norm": 1.09375, + "learning_rate": 0.001151736724451771, + "loss": 0.5918, + "step": 6727 + }, + { + "epoch": 0.46805106264565727, + "grad_norm": 1.265625, + "learning_rate": 0.0011515139967467195, + "loss": 0.9816, + "step": 6728 + }, + { + "epoch": 0.46812063028279244, + "grad_norm": 1.046875, + "learning_rate": 0.0011512912613485516, + "loss": 0.96, + "step": 6729 + }, + { + "epoch": 0.46819019791992766, + "grad_norm": 1.265625, + "learning_rate": 0.0011510685182685755, + "loss": 1.0691, + "step": 6730 + }, + { + "epoch": 0.46825976555706283, + "grad_norm": 0.89453125, + "learning_rate": 0.001150845767518102, + "loss": 0.7501, + "step": 6731 + }, + { + "epoch": 0.46832933319419806, + "grad_norm": 0.83203125, + "learning_rate": 0.0011506230091084403, + "loss": 0.6844, + "step": 6732 + }, + { + "epoch": 0.4683989008313333, + "grad_norm": 0.89453125, + "learning_rate": 0.0011504002430509014, + "loss": 0.7496, + "step": 6733 + }, + { + "epoch": 0.46846846846846846, + "grad_norm": 1.25, + "learning_rate": 0.0011501774693567968, + "loss": 0.9753, + "step": 6734 + }, + { + "epoch": 0.4685380361056037, + "grad_norm": 1.1640625, + "learning_rate": 0.0011499546880374366, + "loss": 0.9054, + "step": 6735 + }, + { + "epoch": 0.46860760374273885, + "grad_norm": 1.265625, + "learning_rate": 0.0011497318991041336, + "loss": 0.7376, + "step": 6736 + }, + { + "epoch": 0.4686771713798741, + "grad_norm": 0.953125, + "learning_rate": 0.001149509102568199, + "loss": 0.6246, + "step": 6737 + }, + { + "epoch": 0.4687467390170093, + "grad_norm": 1.1953125, + "learning_rate": 0.0011492862984409464, + "loss": 0.8417, + "step": 6738 + }, + { + "epoch": 0.4688163066541445, + "grad_norm": 1.15625, + "learning_rate": 0.0011490634867336875, + "loss": 1.0912, + "step": 6739 + }, + { + "epoch": 0.4688858742912797, + "grad_norm": 1.171875, + "learning_rate": 0.0011488406674577364, + "loss": 0.8998, + "step": 6740 + }, + { + "epoch": 0.46895544192841493, + "grad_norm": 1.140625, + "learning_rate": 0.001148617840624406, + "loss": 1.0196, + "step": 6741 + }, + { + "epoch": 0.4690250095655501, + "grad_norm": 1.328125, + "learning_rate": 0.0011483950062450112, + "loss": 0.981, + "step": 6742 + }, + { + "epoch": 0.4690945772026853, + "grad_norm": 0.96875, + "learning_rate": 0.001148172164330866, + "loss": 0.7669, + "step": 6743 + }, + { + "epoch": 0.4691641448398205, + "grad_norm": 1.3671875, + "learning_rate": 0.0011479493148932847, + "loss": 0.8624, + "step": 6744 + }, + { + "epoch": 0.4692337124769557, + "grad_norm": 0.953125, + "learning_rate": 0.0011477264579435834, + "loss": 0.6625, + "step": 6745 + }, + { + "epoch": 0.46930328011409095, + "grad_norm": 1.2421875, + "learning_rate": 0.0011475035934930768, + "loss": 0.9669, + "step": 6746 + }, + { + "epoch": 0.4693728477512261, + "grad_norm": 0.96484375, + "learning_rate": 0.0011472807215530813, + "loss": 0.7936, + "step": 6747 + }, + { + "epoch": 0.46944241538836134, + "grad_norm": 1.0, + "learning_rate": 0.001147057842134913, + "loss": 0.7275, + "step": 6748 + }, + { + "epoch": 0.4695119830254965, + "grad_norm": 1.140625, + "learning_rate": 0.0011468349552498887, + "loss": 0.7721, + "step": 6749 + }, + { + "epoch": 0.46958155066263174, + "grad_norm": 1.296875, + "learning_rate": 0.0011466120609093257, + "loss": 0.9905, + "step": 6750 + }, + { + "epoch": 0.46965111829976697, + "grad_norm": 1.4609375, + "learning_rate": 0.001146389159124541, + "loss": 0.9612, + "step": 6751 + }, + { + "epoch": 0.46972068593690214, + "grad_norm": 1.1171875, + "learning_rate": 0.0011461662499068527, + "loss": 0.8758, + "step": 6752 + }, + { + "epoch": 0.46979025357403736, + "grad_norm": 0.96484375, + "learning_rate": 0.001145943333267579, + "loss": 0.7701, + "step": 6753 + }, + { + "epoch": 0.4698598212111726, + "grad_norm": 1.0859375, + "learning_rate": 0.0011457204092180384, + "loss": 0.8275, + "step": 6754 + }, + { + "epoch": 0.46992938884830776, + "grad_norm": 0.9453125, + "learning_rate": 0.00114549747776955, + "loss": 0.6869, + "step": 6755 + }, + { + "epoch": 0.469998956485443, + "grad_norm": 0.984375, + "learning_rate": 0.001145274538933433, + "loss": 0.848, + "step": 6756 + }, + { + "epoch": 0.47006852412257816, + "grad_norm": 1.03125, + "learning_rate": 0.0011450515927210073, + "loss": 0.8047, + "step": 6757 + }, + { + "epoch": 0.4701380917597134, + "grad_norm": 1.03125, + "learning_rate": 0.0011448286391435925, + "loss": 0.7052, + "step": 6758 + }, + { + "epoch": 0.4702076593968486, + "grad_norm": 1.015625, + "learning_rate": 0.0011446056782125097, + "loss": 0.7012, + "step": 6759 + }, + { + "epoch": 0.4702772270339838, + "grad_norm": 1.5, + "learning_rate": 0.0011443827099390793, + "loss": 0.8022, + "step": 6760 + }, + { + "epoch": 0.470346794671119, + "grad_norm": 1.3359375, + "learning_rate": 0.001144159734334623, + "loss": 0.8481, + "step": 6761 + }, + { + "epoch": 0.4704163623082542, + "grad_norm": 1.1015625, + "learning_rate": 0.0011439367514104613, + "loss": 0.6478, + "step": 6762 + }, + { + "epoch": 0.4704859299453894, + "grad_norm": 1.0546875, + "learning_rate": 0.0011437137611779171, + "loss": 0.6574, + "step": 6763 + }, + { + "epoch": 0.47055549758252463, + "grad_norm": 1.0, + "learning_rate": 0.0011434907636483126, + "loss": 0.8813, + "step": 6764 + }, + { + "epoch": 0.4706250652196598, + "grad_norm": 1.3515625, + "learning_rate": 0.0011432677588329703, + "loss": 0.8534, + "step": 6765 + }, + { + "epoch": 0.470694632856795, + "grad_norm": 1.375, + "learning_rate": 0.0011430447467432137, + "loss": 1.1378, + "step": 6766 + }, + { + "epoch": 0.4707642004939302, + "grad_norm": 1.21875, + "learning_rate": 0.0011428217273903654, + "loss": 0.7197, + "step": 6767 + }, + { + "epoch": 0.4708337681310654, + "grad_norm": 1.28125, + "learning_rate": 0.0011425987007857498, + "loss": 1.1085, + "step": 6768 + }, + { + "epoch": 0.47090333576820065, + "grad_norm": 0.9453125, + "learning_rate": 0.0011423756669406908, + "loss": 0.6024, + "step": 6769 + }, + { + "epoch": 0.4709729034053358, + "grad_norm": 1.1796875, + "learning_rate": 0.0011421526258665131, + "loss": 0.6622, + "step": 6770 + }, + { + "epoch": 0.47104247104247104, + "grad_norm": 1.0, + "learning_rate": 0.0011419295775745417, + "loss": 0.8794, + "step": 6771 + }, + { + "epoch": 0.47111203867960627, + "grad_norm": 1.1796875, + "learning_rate": 0.001141706522076102, + "loss": 0.9826, + "step": 6772 + }, + { + "epoch": 0.47118160631674144, + "grad_norm": 1.1640625, + "learning_rate": 0.0011414834593825188, + "loss": 0.9709, + "step": 6773 + }, + { + "epoch": 0.47125117395387667, + "grad_norm": 1.078125, + "learning_rate": 0.001141260389505119, + "loss": 0.937, + "step": 6774 + }, + { + "epoch": 0.47132074159101184, + "grad_norm": 1.25, + "learning_rate": 0.0011410373124552287, + "loss": 0.962, + "step": 6775 + }, + { + "epoch": 0.47139030922814706, + "grad_norm": 1.2734375, + "learning_rate": 0.001140814228244174, + "loss": 0.797, + "step": 6776 + }, + { + "epoch": 0.4714598768652823, + "grad_norm": 1.3671875, + "learning_rate": 0.0011405911368832832, + "loss": 0.8643, + "step": 6777 + }, + { + "epoch": 0.47152944450241746, + "grad_norm": 1.28125, + "learning_rate": 0.0011403680383838828, + "loss": 0.7165, + "step": 6778 + }, + { + "epoch": 0.4715990121395527, + "grad_norm": 1.0546875, + "learning_rate": 0.0011401449327573007, + "loss": 0.9037, + "step": 6779 + }, + { + "epoch": 0.47166857977668786, + "grad_norm": 0.9296875, + "learning_rate": 0.0011399218200148658, + "loss": 0.726, + "step": 6780 + }, + { + "epoch": 0.4717381474138231, + "grad_norm": 1.421875, + "learning_rate": 0.0011396987001679058, + "loss": 1.0062, + "step": 6781 + }, + { + "epoch": 0.4718077150509583, + "grad_norm": 1.2109375, + "learning_rate": 0.0011394755732277502, + "loss": 0.9328, + "step": 6782 + }, + { + "epoch": 0.4718772826880935, + "grad_norm": 0.91796875, + "learning_rate": 0.0011392524392057277, + "loss": 0.6841, + "step": 6783 + }, + { + "epoch": 0.4719468503252287, + "grad_norm": 1.3203125, + "learning_rate": 0.0011390292981131682, + "loss": 0.7211, + "step": 6784 + }, + { + "epoch": 0.47201641796236393, + "grad_norm": 1.3828125, + "learning_rate": 0.001138806149961402, + "loss": 0.8626, + "step": 6785 + }, + { + "epoch": 0.4720859855994991, + "grad_norm": 1.3671875, + "learning_rate": 0.001138582994761759, + "loss": 0.5986, + "step": 6786 + }, + { + "epoch": 0.47215555323663433, + "grad_norm": 1.3203125, + "learning_rate": 0.00113835983252557, + "loss": 1.2783, + "step": 6787 + }, + { + "epoch": 0.4722251208737695, + "grad_norm": 1.5546875, + "learning_rate": 0.0011381366632641661, + "loss": 1.1105, + "step": 6788 + }, + { + "epoch": 0.4722946885109047, + "grad_norm": 1.1484375, + "learning_rate": 0.0011379134869888789, + "loss": 0.8647, + "step": 6789 + }, + { + "epoch": 0.47236425614803995, + "grad_norm": 1.0625, + "learning_rate": 0.0011376903037110396, + "loss": 0.8945, + "step": 6790 + }, + { + "epoch": 0.4724338237851751, + "grad_norm": 0.9140625, + "learning_rate": 0.0011374671134419807, + "loss": 0.6122, + "step": 6791 + }, + { + "epoch": 0.47250339142231035, + "grad_norm": 1.0390625, + "learning_rate": 0.001137243916193035, + "loss": 0.8256, + "step": 6792 + }, + { + "epoch": 0.4725729590594455, + "grad_norm": 1.2890625, + "learning_rate": 0.0011370207119755346, + "loss": 0.7419, + "step": 6793 + }, + { + "epoch": 0.47264252669658074, + "grad_norm": 0.98046875, + "learning_rate": 0.0011367975008008133, + "loss": 0.6636, + "step": 6794 + }, + { + "epoch": 0.47271209433371597, + "grad_norm": 0.8515625, + "learning_rate": 0.0011365742826802046, + "loss": 0.5432, + "step": 6795 + }, + { + "epoch": 0.47278166197085114, + "grad_norm": 1.2578125, + "learning_rate": 0.001136351057625042, + "loss": 0.9745, + "step": 6796 + }, + { + "epoch": 0.47285122960798637, + "grad_norm": 1.2421875, + "learning_rate": 0.00113612782564666, + "loss": 0.9833, + "step": 6797 + }, + { + "epoch": 0.4729207972451216, + "grad_norm": 1.1796875, + "learning_rate": 0.0011359045867563933, + "loss": 0.8484, + "step": 6798 + }, + { + "epoch": 0.47299036488225676, + "grad_norm": 1.0859375, + "learning_rate": 0.0011356813409655764, + "loss": 0.7557, + "step": 6799 + }, + { + "epoch": 0.473059932519392, + "grad_norm": 1.0546875, + "learning_rate": 0.0011354580882855449, + "loss": 0.949, + "step": 6800 + }, + { + "epoch": 0.47312950015652716, + "grad_norm": 1.015625, + "learning_rate": 0.0011352348287276346, + "loss": 0.8126, + "step": 6801 + }, + { + "epoch": 0.4731990677936624, + "grad_norm": 0.9765625, + "learning_rate": 0.0011350115623031815, + "loss": 0.6654, + "step": 6802 + }, + { + "epoch": 0.4732686354307976, + "grad_norm": 1.40625, + "learning_rate": 0.0011347882890235216, + "loss": 0.7928, + "step": 6803 + }, + { + "epoch": 0.4733382030679328, + "grad_norm": 1.3046875, + "learning_rate": 0.0011345650088999918, + "loss": 0.8891, + "step": 6804 + }, + { + "epoch": 0.473407770705068, + "grad_norm": 0.95703125, + "learning_rate": 0.0011343417219439292, + "loss": 0.6199, + "step": 6805 + }, + { + "epoch": 0.4734773383422032, + "grad_norm": 0.85546875, + "learning_rate": 0.0011341184281666705, + "loss": 0.6123, + "step": 6806 + }, + { + "epoch": 0.4735469059793384, + "grad_norm": 1.125, + "learning_rate": 0.0011338951275795546, + "loss": 0.9712, + "step": 6807 + }, + { + "epoch": 0.47361647361647363, + "grad_norm": 1.03125, + "learning_rate": 0.0011336718201939186, + "loss": 0.5731, + "step": 6808 + }, + { + "epoch": 0.4736860412536088, + "grad_norm": 1.296875, + "learning_rate": 0.0011334485060211018, + "loss": 0.932, + "step": 6809 + }, + { + "epoch": 0.47375560889074403, + "grad_norm": 1.3125, + "learning_rate": 0.0011332251850724423, + "loss": 1.0456, + "step": 6810 + }, + { + "epoch": 0.47382517652787925, + "grad_norm": 1.09375, + "learning_rate": 0.0011330018573592793, + "loss": 0.8988, + "step": 6811 + }, + { + "epoch": 0.4738947441650144, + "grad_norm": 1.4296875, + "learning_rate": 0.0011327785228929525, + "loss": 0.8061, + "step": 6812 + }, + { + "epoch": 0.47396431180214965, + "grad_norm": 1.1328125, + "learning_rate": 0.0011325551816848015, + "loss": 0.7523, + "step": 6813 + }, + { + "epoch": 0.4740338794392848, + "grad_norm": 1.3046875, + "learning_rate": 0.0011323318337461666, + "loss": 0.9292, + "step": 6814 + }, + { + "epoch": 0.47410344707642005, + "grad_norm": 1.1328125, + "learning_rate": 0.001132108479088388, + "loss": 0.6064, + "step": 6815 + }, + { + "epoch": 0.4741730147135553, + "grad_norm": 1.2578125, + "learning_rate": 0.001131885117722807, + "loss": 0.7639, + "step": 6816 + }, + { + "epoch": 0.47424258235069044, + "grad_norm": 1.0703125, + "learning_rate": 0.0011316617496607642, + "loss": 0.783, + "step": 6817 + }, + { + "epoch": 0.47431214998782567, + "grad_norm": 0.89453125, + "learning_rate": 0.0011314383749136015, + "loss": 0.6997, + "step": 6818 + }, + { + "epoch": 0.47438171762496084, + "grad_norm": 0.9453125, + "learning_rate": 0.0011312149934926605, + "loss": 0.6055, + "step": 6819 + }, + { + "epoch": 0.47445128526209607, + "grad_norm": 1.2734375, + "learning_rate": 0.0011309916054092835, + "loss": 0.9023, + "step": 6820 + }, + { + "epoch": 0.4745208528992313, + "grad_norm": 0.953125, + "learning_rate": 0.0011307682106748132, + "loss": 0.8706, + "step": 6821 + }, + { + "epoch": 0.47459042053636646, + "grad_norm": 1.125, + "learning_rate": 0.001130544809300592, + "loss": 0.8186, + "step": 6822 + }, + { + "epoch": 0.4746599881735017, + "grad_norm": 1.0546875, + "learning_rate": 0.0011303214012979637, + "loss": 0.9062, + "step": 6823 + }, + { + "epoch": 0.4747295558106369, + "grad_norm": 1.1015625, + "learning_rate": 0.0011300979866782715, + "loss": 0.8621, + "step": 6824 + }, + { + "epoch": 0.4747991234477721, + "grad_norm": 1.1875, + "learning_rate": 0.0011298745654528591, + "loss": 0.9045, + "step": 6825 + }, + { + "epoch": 0.4748686910849073, + "grad_norm": 1.2421875, + "learning_rate": 0.001129651137633071, + "loss": 0.9506, + "step": 6826 + }, + { + "epoch": 0.4749382587220425, + "grad_norm": 1.15625, + "learning_rate": 0.0011294277032302513, + "loss": 0.8287, + "step": 6827 + }, + { + "epoch": 0.4750078263591777, + "grad_norm": 1.328125, + "learning_rate": 0.0011292042622557457, + "loss": 1.0259, + "step": 6828 + }, + { + "epoch": 0.47507739399631294, + "grad_norm": 1.359375, + "learning_rate": 0.0011289808147208987, + "loss": 0.895, + "step": 6829 + }, + { + "epoch": 0.4751469616334481, + "grad_norm": 1.234375, + "learning_rate": 0.0011287573606370558, + "loss": 0.9821, + "step": 6830 + }, + { + "epoch": 0.47521652927058333, + "grad_norm": 0.90625, + "learning_rate": 0.0011285339000155635, + "loss": 0.8931, + "step": 6831 + }, + { + "epoch": 0.4752860969077185, + "grad_norm": 1.2890625, + "learning_rate": 0.0011283104328677674, + "loss": 0.815, + "step": 6832 + }, + { + "epoch": 0.47535566454485373, + "grad_norm": 0.92578125, + "learning_rate": 0.001128086959205014, + "loss": 0.7367, + "step": 6833 + }, + { + "epoch": 0.47542523218198895, + "grad_norm": 1.0234375, + "learning_rate": 0.0011278634790386508, + "loss": 0.8959, + "step": 6834 + }, + { + "epoch": 0.4754947998191241, + "grad_norm": 0.984375, + "learning_rate": 0.0011276399923800245, + "loss": 0.9691, + "step": 6835 + }, + { + "epoch": 0.47556436745625935, + "grad_norm": 0.97265625, + "learning_rate": 0.0011274164992404827, + "loss": 0.734, + "step": 6836 + }, + { + "epoch": 0.4756339350933946, + "grad_norm": 1.015625, + "learning_rate": 0.0011271929996313735, + "loss": 1.028, + "step": 6837 + }, + { + "epoch": 0.47570350273052975, + "grad_norm": 1.125, + "learning_rate": 0.0011269694935640447, + "loss": 0.7971, + "step": 6838 + }, + { + "epoch": 0.475773070367665, + "grad_norm": 0.9609375, + "learning_rate": 0.0011267459810498448, + "loss": 0.7243, + "step": 6839 + }, + { + "epoch": 0.47584263800480014, + "grad_norm": 1.34375, + "learning_rate": 0.0011265224621001232, + "loss": 0.8573, + "step": 6840 + }, + { + "epoch": 0.47591220564193537, + "grad_norm": 1.2109375, + "learning_rate": 0.0011262989367262285, + "loss": 0.782, + "step": 6841 + }, + { + "epoch": 0.4759817732790706, + "grad_norm": 1.109375, + "learning_rate": 0.0011260754049395103, + "loss": 0.9528, + "step": 6842 + }, + { + "epoch": 0.47605134091620577, + "grad_norm": 1.5, + "learning_rate": 0.0011258518667513187, + "loss": 1.1033, + "step": 6843 + }, + { + "epoch": 0.476120908553341, + "grad_norm": 1.140625, + "learning_rate": 0.0011256283221730036, + "loss": 0.894, + "step": 6844 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 1.109375, + "learning_rate": 0.0011254047712159156, + "loss": 0.7271, + "step": 6845 + }, + { + "epoch": 0.4762600438276114, + "grad_norm": 1.1796875, + "learning_rate": 0.0011251812138914053, + "loss": 0.905, + "step": 6846 + }, + { + "epoch": 0.4763296114647466, + "grad_norm": 0.9140625, + "learning_rate": 0.0011249576502108238, + "loss": 0.7876, + "step": 6847 + }, + { + "epoch": 0.4763991791018818, + "grad_norm": 1.1328125, + "learning_rate": 0.0011247340801855228, + "loss": 0.838, + "step": 6848 + }, + { + "epoch": 0.476468746739017, + "grad_norm": 1.03125, + "learning_rate": 0.001124510503826854, + "loss": 0.8328, + "step": 6849 + }, + { + "epoch": 0.47653831437615224, + "grad_norm": 1.0234375, + "learning_rate": 0.001124286921146169, + "loss": 0.9428, + "step": 6850 + }, + { + "epoch": 0.4766078820132874, + "grad_norm": 1.25, + "learning_rate": 0.001124063332154821, + "loss": 1.0431, + "step": 6851 + }, + { + "epoch": 0.47667744965042264, + "grad_norm": 0.99609375, + "learning_rate": 0.001123839736864162, + "loss": 0.607, + "step": 6852 + }, + { + "epoch": 0.4767470172875578, + "grad_norm": 1.140625, + "learning_rate": 0.0011236161352855456, + "loss": 0.7733, + "step": 6853 + }, + { + "epoch": 0.47681658492469303, + "grad_norm": 0.94140625, + "learning_rate": 0.0011233925274303249, + "loss": 0.7836, + "step": 6854 + }, + { + "epoch": 0.47688615256182826, + "grad_norm": 1.0, + "learning_rate": 0.0011231689133098537, + "loss": 0.9309, + "step": 6855 + }, + { + "epoch": 0.47695572019896343, + "grad_norm": 0.953125, + "learning_rate": 0.0011229452929354857, + "loss": 0.9256, + "step": 6856 + }, + { + "epoch": 0.47702528783609865, + "grad_norm": 0.88671875, + "learning_rate": 0.0011227216663185755, + "loss": 0.7528, + "step": 6857 + }, + { + "epoch": 0.4770948554732338, + "grad_norm": 0.9453125, + "learning_rate": 0.0011224980334704777, + "loss": 0.586, + "step": 6858 + }, + { + "epoch": 0.47716442311036905, + "grad_norm": 0.921875, + "learning_rate": 0.001122274394402547, + "loss": 0.821, + "step": 6859 + }, + { + "epoch": 0.4772339907475043, + "grad_norm": 1.0703125, + "learning_rate": 0.001122050749126139, + "loss": 1.0315, + "step": 6860 + }, + { + "epoch": 0.47730355838463945, + "grad_norm": 1.15625, + "learning_rate": 0.0011218270976526092, + "loss": 0.8908, + "step": 6861 + }, + { + "epoch": 0.4773731260217747, + "grad_norm": 1.1171875, + "learning_rate": 0.0011216034399933134, + "loss": 0.8264, + "step": 6862 + }, + { + "epoch": 0.4774426936589099, + "grad_norm": 1.015625, + "learning_rate": 0.0011213797761596078, + "loss": 0.8753, + "step": 6863 + }, + { + "epoch": 0.47751226129604507, + "grad_norm": 1.1953125, + "learning_rate": 0.001121156106162849, + "loss": 1.1162, + "step": 6864 + }, + { + "epoch": 0.4775818289331803, + "grad_norm": 0.9375, + "learning_rate": 0.0011209324300143937, + "loss": 0.6004, + "step": 6865 + }, + { + "epoch": 0.47765139657031547, + "grad_norm": 1.125, + "learning_rate": 0.0011207087477255993, + "loss": 0.7431, + "step": 6866 + }, + { + "epoch": 0.4777209642074507, + "grad_norm": 1.3125, + "learning_rate": 0.001120485059307823, + "loss": 1.0819, + "step": 6867 + }, + { + "epoch": 0.4777905318445859, + "grad_norm": 1.1875, + "learning_rate": 0.0011202613647724228, + "loss": 1.0026, + "step": 6868 + }, + { + "epoch": 0.4778600994817211, + "grad_norm": 1.2421875, + "learning_rate": 0.0011200376641307564, + "loss": 0.8103, + "step": 6869 + }, + { + "epoch": 0.4779296671188563, + "grad_norm": 1.0234375, + "learning_rate": 0.0011198139573941827, + "loss": 0.8203, + "step": 6870 + }, + { + "epoch": 0.4779992347559915, + "grad_norm": 1.3671875, + "learning_rate": 0.00111959024457406, + "loss": 0.9181, + "step": 6871 + }, + { + "epoch": 0.4780688023931267, + "grad_norm": 1.0, + "learning_rate": 0.0011193665256817476, + "loss": 0.8363, + "step": 6872 + }, + { + "epoch": 0.47813837003026194, + "grad_norm": 1.390625, + "learning_rate": 0.0011191428007286046, + "loss": 0.7834, + "step": 6873 + }, + { + "epoch": 0.4782079376673971, + "grad_norm": 1.125, + "learning_rate": 0.0011189190697259907, + "loss": 0.9514, + "step": 6874 + }, + { + "epoch": 0.47827750530453234, + "grad_norm": 1.2265625, + "learning_rate": 0.001118695332685266, + "loss": 1.0461, + "step": 6875 + }, + { + "epoch": 0.47834707294166756, + "grad_norm": 1.25, + "learning_rate": 0.0011184715896177901, + "loss": 0.9652, + "step": 6876 + }, + { + "epoch": 0.47841664057880273, + "grad_norm": 0.828125, + "learning_rate": 0.0011182478405349246, + "loss": 0.688, + "step": 6877 + }, + { + "epoch": 0.47848620821593796, + "grad_norm": 1.078125, + "learning_rate": 0.0011180240854480295, + "loss": 0.5792, + "step": 6878 + }, + { + "epoch": 0.47855577585307313, + "grad_norm": 1.0078125, + "learning_rate": 0.0011178003243684663, + "loss": 0.8728, + "step": 6879 + }, + { + "epoch": 0.47862534349020835, + "grad_norm": 1.21875, + "learning_rate": 0.0011175765573075962, + "loss": 0.7156, + "step": 6880 + }, + { + "epoch": 0.4786949111273436, + "grad_norm": 0.79296875, + "learning_rate": 0.0011173527842767812, + "loss": 0.7903, + "step": 6881 + }, + { + "epoch": 0.47876447876447875, + "grad_norm": 1.0546875, + "learning_rate": 0.0011171290052873835, + "loss": 0.734, + "step": 6882 + }, + { + "epoch": 0.478834046401614, + "grad_norm": 1.0, + "learning_rate": 0.0011169052203507653, + "loss": 0.8546, + "step": 6883 + }, + { + "epoch": 0.47890361403874915, + "grad_norm": 1.53125, + "learning_rate": 0.001116681429478289, + "loss": 0.7843, + "step": 6884 + }, + { + "epoch": 0.4789731816758844, + "grad_norm": 1.171875, + "learning_rate": 0.001116457632681318, + "loss": 0.8304, + "step": 6885 + }, + { + "epoch": 0.4790427493130196, + "grad_norm": 1.015625, + "learning_rate": 0.0011162338299712153, + "loss": 0.6309, + "step": 6886 + }, + { + "epoch": 0.47911231695015477, + "grad_norm": 1.2265625, + "learning_rate": 0.0011160100213593448, + "loss": 1.0741, + "step": 6887 + }, + { + "epoch": 0.47918188458729, + "grad_norm": 1.140625, + "learning_rate": 0.0011157862068570698, + "loss": 0.9332, + "step": 6888 + }, + { + "epoch": 0.4792514522244252, + "grad_norm": 1.0859375, + "learning_rate": 0.0011155623864757551, + "loss": 0.8916, + "step": 6889 + }, + { + "epoch": 0.4793210198615604, + "grad_norm": 1.09375, + "learning_rate": 0.0011153385602267647, + "loss": 0.98, + "step": 6890 + }, + { + "epoch": 0.4793905874986956, + "grad_norm": 1.4609375, + "learning_rate": 0.0011151147281214637, + "loss": 1.0965, + "step": 6891 + }, + { + "epoch": 0.4794601551358308, + "grad_norm": 0.85546875, + "learning_rate": 0.0011148908901712172, + "loss": 0.7172, + "step": 6892 + }, + { + "epoch": 0.479529722772966, + "grad_norm": 1.25, + "learning_rate": 0.00111466704638739, + "loss": 0.6087, + "step": 6893 + }, + { + "epoch": 0.47959929041010124, + "grad_norm": 1.0234375, + "learning_rate": 0.0011144431967813485, + "loss": 0.8725, + "step": 6894 + }, + { + "epoch": 0.4796688580472364, + "grad_norm": 1.0703125, + "learning_rate": 0.0011142193413644576, + "loss": 0.6584, + "step": 6895 + }, + { + "epoch": 0.47973842568437164, + "grad_norm": 1.3828125, + "learning_rate": 0.0011139954801480851, + "loss": 1.0956, + "step": 6896 + }, + { + "epoch": 0.4798079933215068, + "grad_norm": 0.84765625, + "learning_rate": 0.0011137716131435964, + "loss": 0.6688, + "step": 6897 + }, + { + "epoch": 0.47987756095864204, + "grad_norm": 1.1875, + "learning_rate": 0.0011135477403623585, + "loss": 0.9466, + "step": 6898 + }, + { + "epoch": 0.47994712859577726, + "grad_norm": 1.25, + "learning_rate": 0.001113323861815739, + "loss": 0.6891, + "step": 6899 + }, + { + "epoch": 0.48001669623291243, + "grad_norm": 0.921875, + "learning_rate": 0.0011130999775151047, + "loss": 0.705, + "step": 6900 + }, + { + "epoch": 0.48008626387004766, + "grad_norm": 0.953125, + "learning_rate": 0.0011128760874718237, + "loss": 0.8579, + "step": 6901 + }, + { + "epoch": 0.4801558315071829, + "grad_norm": 1.2734375, + "learning_rate": 0.0011126521916972637, + "loss": 0.8625, + "step": 6902 + }, + { + "epoch": 0.48022539914431805, + "grad_norm": 1.3125, + "learning_rate": 0.0011124282902027938, + "loss": 0.9706, + "step": 6903 + }, + { + "epoch": 0.4802949667814533, + "grad_norm": 1.09375, + "learning_rate": 0.0011122043829997815, + "loss": 0.706, + "step": 6904 + }, + { + "epoch": 0.48036453441858845, + "grad_norm": 1.3203125, + "learning_rate": 0.0011119804700995964, + "loss": 1.1041, + "step": 6905 + }, + { + "epoch": 0.4804341020557237, + "grad_norm": 1.1328125, + "learning_rate": 0.0011117565515136071, + "loss": 0.6875, + "step": 6906 + }, + { + "epoch": 0.4805036696928589, + "grad_norm": 1.140625, + "learning_rate": 0.0011115326272531838, + "loss": 0.8057, + "step": 6907 + }, + { + "epoch": 0.4805732373299941, + "grad_norm": 1.2265625, + "learning_rate": 0.0011113086973296958, + "loss": 0.8843, + "step": 6908 + }, + { + "epoch": 0.4806428049671293, + "grad_norm": 0.99609375, + "learning_rate": 0.0011110847617545128, + "loss": 0.6403, + "step": 6909 + }, + { + "epoch": 0.48071237260426447, + "grad_norm": 0.91796875, + "learning_rate": 0.001110860820539006, + "loss": 0.7008, + "step": 6910 + }, + { + "epoch": 0.4807819402413997, + "grad_norm": 0.9375, + "learning_rate": 0.0011106368736945452, + "loss": 0.6692, + "step": 6911 + }, + { + "epoch": 0.4808515078785349, + "grad_norm": 1.1953125, + "learning_rate": 0.001110412921232502, + "loss": 1.0763, + "step": 6912 + }, + { + "epoch": 0.4809210755156701, + "grad_norm": 0.92578125, + "learning_rate": 0.001110188963164247, + "loss": 0.75, + "step": 6913 + }, + { + "epoch": 0.4809906431528053, + "grad_norm": 1.0859375, + "learning_rate": 0.0011099649995011515, + "loss": 0.8156, + "step": 6914 + }, + { + "epoch": 0.48106021078994055, + "grad_norm": 1.0625, + "learning_rate": 0.0011097410302545881, + "loss": 0.8426, + "step": 6915 + }, + { + "epoch": 0.4811297784270757, + "grad_norm": 0.875, + "learning_rate": 0.001109517055435928, + "loss": 0.7825, + "step": 6916 + }, + { + "epoch": 0.48119934606421094, + "grad_norm": 1.28125, + "learning_rate": 0.001109293075056544, + "loss": 0.8922, + "step": 6917 + }, + { + "epoch": 0.4812689137013461, + "grad_norm": 1.2265625, + "learning_rate": 0.001109069089127808, + "loss": 0.9127, + "step": 6918 + }, + { + "epoch": 0.48133848133848134, + "grad_norm": 0.90625, + "learning_rate": 0.0011088450976610943, + "loss": 0.7225, + "step": 6919 + }, + { + "epoch": 0.48140804897561656, + "grad_norm": 1.4375, + "learning_rate": 0.0011086211006677744, + "loss": 0.8415, + "step": 6920 + }, + { + "epoch": 0.48147761661275174, + "grad_norm": 1.421875, + "learning_rate": 0.0011083970981592228, + "loss": 1.0372, + "step": 6921 + }, + { + "epoch": 0.48154718424988696, + "grad_norm": 1.0625, + "learning_rate": 0.001108173090146813, + "loss": 0.9047, + "step": 6922 + }, + { + "epoch": 0.48161675188702213, + "grad_norm": 1.3046875, + "learning_rate": 0.001107949076641919, + "loss": 0.791, + "step": 6923 + }, + { + "epoch": 0.48168631952415736, + "grad_norm": 1.2734375, + "learning_rate": 0.0011077250576559145, + "loss": 0.8391, + "step": 6924 + }, + { + "epoch": 0.4817558871612926, + "grad_norm": 0.98828125, + "learning_rate": 0.001107501033200175, + "loss": 0.5883, + "step": 6925 + }, + { + "epoch": 0.48182545479842775, + "grad_norm": 1.1953125, + "learning_rate": 0.0011072770032860748, + "loss": 0.8007, + "step": 6926 + }, + { + "epoch": 0.481895022435563, + "grad_norm": 0.890625, + "learning_rate": 0.0011070529679249887, + "loss": 0.6742, + "step": 6927 + }, + { + "epoch": 0.4819645900726982, + "grad_norm": 1.2578125, + "learning_rate": 0.0011068289271282932, + "loss": 0.8524, + "step": 6928 + }, + { + "epoch": 0.4820341577098334, + "grad_norm": 1.3828125, + "learning_rate": 0.0011066048809073629, + "loss": 0.9279, + "step": 6929 + }, + { + "epoch": 0.4821037253469686, + "grad_norm": 0.95703125, + "learning_rate": 0.001106380829273574, + "loss": 0.7533, + "step": 6930 + }, + { + "epoch": 0.4821732929841038, + "grad_norm": 1.09375, + "learning_rate": 0.0011061567722383029, + "loss": 1.0112, + "step": 6931 + }, + { + "epoch": 0.482242860621239, + "grad_norm": 1.015625, + "learning_rate": 0.0011059327098129255, + "loss": 0.7589, + "step": 6932 + }, + { + "epoch": 0.4823124282583742, + "grad_norm": 1.1015625, + "learning_rate": 0.0011057086420088195, + "loss": 0.7081, + "step": 6933 + }, + { + "epoch": 0.4823819958955094, + "grad_norm": 1.21875, + "learning_rate": 0.0011054845688373614, + "loss": 0.7104, + "step": 6934 + }, + { + "epoch": 0.4824515635326446, + "grad_norm": 1.2890625, + "learning_rate": 0.0011052604903099286, + "loss": 0.9116, + "step": 6935 + }, + { + "epoch": 0.4825211311697798, + "grad_norm": 1.171875, + "learning_rate": 0.0011050364064378985, + "loss": 1.0161, + "step": 6936 + }, + { + "epoch": 0.482590698806915, + "grad_norm": 1.0234375, + "learning_rate": 0.0011048123172326494, + "loss": 0.6631, + "step": 6937 + }, + { + "epoch": 0.48266026644405025, + "grad_norm": 1.0078125, + "learning_rate": 0.001104588222705559, + "loss": 0.8084, + "step": 6938 + }, + { + "epoch": 0.4827298340811854, + "grad_norm": 1.09375, + "learning_rate": 0.0011043641228680055, + "loss": 0.7227, + "step": 6939 + }, + { + "epoch": 0.48279940171832064, + "grad_norm": 1.078125, + "learning_rate": 0.0011041400177313682, + "loss": 0.9966, + "step": 6940 + }, + { + "epoch": 0.48286896935545587, + "grad_norm": 1.3671875, + "learning_rate": 0.0011039159073070258, + "loss": 0.9344, + "step": 6941 + }, + { + "epoch": 0.48293853699259104, + "grad_norm": 1.1171875, + "learning_rate": 0.0011036917916063572, + "loss": 0.6069, + "step": 6942 + }, + { + "epoch": 0.48300810462972626, + "grad_norm": 1.078125, + "learning_rate": 0.0011034676706407423, + "loss": 1.0173, + "step": 6943 + }, + { + "epoch": 0.48307767226686144, + "grad_norm": 0.8984375, + "learning_rate": 0.0011032435444215602, + "loss": 0.8079, + "step": 6944 + }, + { + "epoch": 0.48314723990399666, + "grad_norm": 1.1171875, + "learning_rate": 0.0011030194129601917, + "loss": 0.899, + "step": 6945 + }, + { + "epoch": 0.4832168075411319, + "grad_norm": 1.0078125, + "learning_rate": 0.0011027952762680162, + "loss": 0.8863, + "step": 6946 + }, + { + "epoch": 0.48328637517826706, + "grad_norm": 1.0859375, + "learning_rate": 0.001102571134356415, + "loss": 0.8912, + "step": 6947 + }, + { + "epoch": 0.4833559428154023, + "grad_norm": 0.8203125, + "learning_rate": 0.0011023469872367686, + "loss": 0.6625, + "step": 6948 + }, + { + "epoch": 0.48342551045253745, + "grad_norm": 0.9453125, + "learning_rate": 0.0011021228349204582, + "loss": 0.9037, + "step": 6949 + }, + { + "epoch": 0.4834950780896727, + "grad_norm": 1.1875, + "learning_rate": 0.0011018986774188645, + "loss": 0.8848, + "step": 6950 + }, + { + "epoch": 0.4835646457268079, + "grad_norm": 1.09375, + "learning_rate": 0.0011016745147433703, + "loss": 0.7961, + "step": 6951 + }, + { + "epoch": 0.4836342133639431, + "grad_norm": 1.109375, + "learning_rate": 0.0011014503469053563, + "loss": 0.9383, + "step": 6952 + }, + { + "epoch": 0.4837037810010783, + "grad_norm": 1.1640625, + "learning_rate": 0.0011012261739162049, + "loss": 0.7179, + "step": 6953 + }, + { + "epoch": 0.48377334863821353, + "grad_norm": 1.0546875, + "learning_rate": 0.0011010019957872989, + "loss": 1.0111, + "step": 6954 + }, + { + "epoch": 0.4838429162753487, + "grad_norm": 1.0625, + "learning_rate": 0.00110077781253002, + "loss": 0.8226, + "step": 6955 + }, + { + "epoch": 0.4839124839124839, + "grad_norm": 1.203125, + "learning_rate": 0.0011005536241557525, + "loss": 0.8878, + "step": 6956 + }, + { + "epoch": 0.4839820515496191, + "grad_norm": 1.09375, + "learning_rate": 0.0011003294306758781, + "loss": 1.0296, + "step": 6957 + }, + { + "epoch": 0.4840516191867543, + "grad_norm": 1.15625, + "learning_rate": 0.0011001052321017817, + "loss": 0.8059, + "step": 6958 + }, + { + "epoch": 0.48412118682388955, + "grad_norm": 1.09375, + "learning_rate": 0.001099881028444846, + "loss": 0.8771, + "step": 6959 + }, + { + "epoch": 0.4841907544610247, + "grad_norm": 1.1953125, + "learning_rate": 0.0010996568197164547, + "loss": 1.0343, + "step": 6960 + }, + { + "epoch": 0.48426032209815995, + "grad_norm": 1.21875, + "learning_rate": 0.0010994326059279927, + "loss": 0.6113, + "step": 6961 + }, + { + "epoch": 0.4843298897352951, + "grad_norm": 1.1015625, + "learning_rate": 0.0010992083870908437, + "loss": 0.8849, + "step": 6962 + }, + { + "epoch": 0.48439945737243034, + "grad_norm": 1.046875, + "learning_rate": 0.0010989841632163934, + "loss": 0.8569, + "step": 6963 + }, + { + "epoch": 0.48446902500956557, + "grad_norm": 1.0234375, + "learning_rate": 0.001098759934316026, + "loss": 0.8583, + "step": 6964 + }, + { + "epoch": 0.48453859264670074, + "grad_norm": 1.1796875, + "learning_rate": 0.0010985357004011272, + "loss": 0.8247, + "step": 6965 + }, + { + "epoch": 0.48460816028383596, + "grad_norm": 1.046875, + "learning_rate": 0.0010983114614830816, + "loss": 0.636, + "step": 6966 + }, + { + "epoch": 0.4846777279209712, + "grad_norm": 1.0234375, + "learning_rate": 0.0010980872175732762, + "loss": 0.7528, + "step": 6967 + }, + { + "epoch": 0.48474729555810636, + "grad_norm": 1.4140625, + "learning_rate": 0.0010978629686830958, + "loss": 0.874, + "step": 6968 + }, + { + "epoch": 0.4848168631952416, + "grad_norm": 1.21875, + "learning_rate": 0.001097638714823927, + "loss": 1.0633, + "step": 6969 + }, + { + "epoch": 0.48488643083237676, + "grad_norm": 1.09375, + "learning_rate": 0.0010974144560071568, + "loss": 0.7056, + "step": 6970 + }, + { + "epoch": 0.484955998469512, + "grad_norm": 1.1171875, + "learning_rate": 0.0010971901922441712, + "loss": 0.8113, + "step": 6971 + }, + { + "epoch": 0.4850255661066472, + "grad_norm": 1.3046875, + "learning_rate": 0.001096965923546358, + "loss": 0.9971, + "step": 6972 + }, + { + "epoch": 0.4850951337437824, + "grad_norm": 1.0546875, + "learning_rate": 0.0010967416499251034, + "loss": 0.9382, + "step": 6973 + }, + { + "epoch": 0.4851647013809176, + "grad_norm": 1.0859375, + "learning_rate": 0.0010965173713917958, + "loss": 0.6524, + "step": 6974 + }, + { + "epoch": 0.4852342690180528, + "grad_norm": 1.28125, + "learning_rate": 0.0010962930879578226, + "loss": 1.0498, + "step": 6975 + }, + { + "epoch": 0.485303836655188, + "grad_norm": 1.171875, + "learning_rate": 0.0010960687996345712, + "loss": 1.0655, + "step": 6976 + }, + { + "epoch": 0.48537340429232323, + "grad_norm": 0.83984375, + "learning_rate": 0.0010958445064334311, + "loss": 0.8055, + "step": 6977 + }, + { + "epoch": 0.4854429719294584, + "grad_norm": 1.4296875, + "learning_rate": 0.0010956202083657893, + "loss": 0.9072, + "step": 6978 + }, + { + "epoch": 0.4855125395665936, + "grad_norm": 1.3046875, + "learning_rate": 0.001095395905443036, + "loss": 0.8098, + "step": 6979 + }, + { + "epoch": 0.48558210720372885, + "grad_norm": 1.0859375, + "learning_rate": 0.0010951715976765589, + "loss": 0.7313, + "step": 6980 + }, + { + "epoch": 0.485651674840864, + "grad_norm": 1.359375, + "learning_rate": 0.0010949472850777483, + "loss": 0.8856, + "step": 6981 + }, + { + "epoch": 0.48572124247799925, + "grad_norm": 0.96484375, + "learning_rate": 0.0010947229676579926, + "loss": 0.7672, + "step": 6982 + }, + { + "epoch": 0.4857908101151344, + "grad_norm": 1.21875, + "learning_rate": 0.0010944986454286822, + "loss": 0.69, + "step": 6983 + }, + { + "epoch": 0.48586037775226965, + "grad_norm": 1.171875, + "learning_rate": 0.0010942743184012072, + "loss": 0.9117, + "step": 6984 + }, + { + "epoch": 0.48592994538940487, + "grad_norm": 1.2578125, + "learning_rate": 0.001094049986586957, + "loss": 0.9458, + "step": 6985 + }, + { + "epoch": 0.48599951302654004, + "grad_norm": 1.0078125, + "learning_rate": 0.0010938256499973232, + "loss": 0.9272, + "step": 6986 + }, + { + "epoch": 0.48606908066367527, + "grad_norm": 1.3125, + "learning_rate": 0.001093601308643695, + "loss": 0.8044, + "step": 6987 + }, + { + "epoch": 0.48613864830081044, + "grad_norm": 1.140625, + "learning_rate": 0.001093376962537465, + "loss": 0.6457, + "step": 6988 + }, + { + "epoch": 0.48620821593794566, + "grad_norm": 0.99609375, + "learning_rate": 0.0010931526116900229, + "loss": 0.827, + "step": 6989 + }, + { + "epoch": 0.4862777835750809, + "grad_norm": 1.1171875, + "learning_rate": 0.0010929282561127607, + "loss": 0.9153, + "step": 6990 + }, + { + "epoch": 0.48634735121221606, + "grad_norm": 1.171875, + "learning_rate": 0.0010927038958170703, + "loss": 0.8547, + "step": 6991 + }, + { + "epoch": 0.4864169188493513, + "grad_norm": 1.1875, + "learning_rate": 0.0010924795308143432, + "loss": 0.9355, + "step": 6992 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 0.83203125, + "learning_rate": 0.0010922551611159716, + "loss": 0.6463, + "step": 6993 + }, + { + "epoch": 0.4865560541236217, + "grad_norm": 0.99609375, + "learning_rate": 0.0010920307867333479, + "loss": 0.8102, + "step": 6994 + }, + { + "epoch": 0.4866256217607569, + "grad_norm": 1.1171875, + "learning_rate": 0.001091806407677865, + "loss": 0.5896, + "step": 6995 + }, + { + "epoch": 0.4866951893978921, + "grad_norm": 1.125, + "learning_rate": 0.001091582023960915, + "loss": 0.8311, + "step": 6996 + }, + { + "epoch": 0.4867647570350273, + "grad_norm": 1.25, + "learning_rate": 0.001091357635593892, + "loss": 1.1586, + "step": 6997 + }, + { + "epoch": 0.48683432467216253, + "grad_norm": 1.15625, + "learning_rate": 0.0010911332425881885, + "loss": 0.8111, + "step": 6998 + }, + { + "epoch": 0.4869038923092977, + "grad_norm": 1.0625, + "learning_rate": 0.001090908844955198, + "loss": 0.9535, + "step": 6999 + }, + { + "epoch": 0.48697345994643293, + "grad_norm": 0.90625, + "learning_rate": 0.001090684442706315, + "loss": 0.806, + "step": 7000 + }, + { + "epoch": 0.4870430275835681, + "grad_norm": 1.1484375, + "learning_rate": 0.0010904600358529327, + "loss": 0.7193, + "step": 7001 + }, + { + "epoch": 0.4871125952207033, + "grad_norm": 1.296875, + "learning_rate": 0.0010902356244064462, + "loss": 1.1411, + "step": 7002 + }, + { + "epoch": 0.48718216285783855, + "grad_norm": 1.296875, + "learning_rate": 0.001090011208378249, + "loss": 0.7308, + "step": 7003 + }, + { + "epoch": 0.4872517304949737, + "grad_norm": 1.0546875, + "learning_rate": 0.0010897867877797368, + "loss": 0.6471, + "step": 7004 + }, + { + "epoch": 0.48732129813210895, + "grad_norm": 0.953125, + "learning_rate": 0.0010895623626223034, + "loss": 0.7874, + "step": 7005 + }, + { + "epoch": 0.4873908657692442, + "grad_norm": 1.0859375, + "learning_rate": 0.0010893379329173453, + "loss": 0.7763, + "step": 7006 + }, + { + "epoch": 0.48746043340637935, + "grad_norm": 1.1328125, + "learning_rate": 0.0010891134986762572, + "loss": 0.9469, + "step": 7007 + }, + { + "epoch": 0.48753000104351457, + "grad_norm": 0.99609375, + "learning_rate": 0.0010888890599104345, + "loss": 0.896, + "step": 7008 + }, + { + "epoch": 0.48759956868064974, + "grad_norm": 1.359375, + "learning_rate": 0.0010886646166312736, + "loss": 0.9828, + "step": 7009 + }, + { + "epoch": 0.48766913631778497, + "grad_norm": 1.09375, + "learning_rate": 0.0010884401688501702, + "loss": 0.6623, + "step": 7010 + }, + { + "epoch": 0.4877387039549202, + "grad_norm": 0.86328125, + "learning_rate": 0.001088215716578521, + "loss": 0.6458, + "step": 7011 + }, + { + "epoch": 0.48780827159205536, + "grad_norm": 1.3125, + "learning_rate": 0.001087991259827722, + "loss": 0.9172, + "step": 7012 + }, + { + "epoch": 0.4878778392291906, + "grad_norm": 1.3203125, + "learning_rate": 0.0010877667986091705, + "loss": 0.8783, + "step": 7013 + }, + { + "epoch": 0.48794740686632576, + "grad_norm": 1.0234375, + "learning_rate": 0.0010875423329342634, + "loss": 0.5569, + "step": 7014 + }, + { + "epoch": 0.488016974503461, + "grad_norm": 0.93359375, + "learning_rate": 0.001087317862814398, + "loss": 0.7734, + "step": 7015 + }, + { + "epoch": 0.4880865421405962, + "grad_norm": 1.2578125, + "learning_rate": 0.0010870933882609717, + "loss": 0.9205, + "step": 7016 + }, + { + "epoch": 0.4881561097777314, + "grad_norm": 0.99609375, + "learning_rate": 0.0010868689092853817, + "loss": 0.9377, + "step": 7017 + }, + { + "epoch": 0.4882256774148666, + "grad_norm": 1.234375, + "learning_rate": 0.0010866444258990269, + "loss": 0.9297, + "step": 7018 + }, + { + "epoch": 0.48829524505200184, + "grad_norm": 1.7265625, + "learning_rate": 0.0010864199381133044, + "loss": 0.9733, + "step": 7019 + }, + { + "epoch": 0.488364812689137, + "grad_norm": 1.15625, + "learning_rate": 0.0010861954459396132, + "loss": 0.798, + "step": 7020 + }, + { + "epoch": 0.48843438032627223, + "grad_norm": 0.87109375, + "learning_rate": 0.0010859709493893518, + "loss": 0.9854, + "step": 7021 + }, + { + "epoch": 0.4885039479634074, + "grad_norm": 1.109375, + "learning_rate": 0.0010857464484739189, + "loss": 0.8518, + "step": 7022 + }, + { + "epoch": 0.48857351560054263, + "grad_norm": 1.125, + "learning_rate": 0.0010855219432047137, + "loss": 0.9385, + "step": 7023 + }, + { + "epoch": 0.48864308323767786, + "grad_norm": 1.1875, + "learning_rate": 0.0010852974335931347, + "loss": 0.8966, + "step": 7024 + }, + { + "epoch": 0.488712650874813, + "grad_norm": 1.1953125, + "learning_rate": 0.0010850729196505825, + "loss": 0.7525, + "step": 7025 + }, + { + "epoch": 0.48878221851194825, + "grad_norm": 1.2578125, + "learning_rate": 0.001084848401388456, + "loss": 0.7952, + "step": 7026 + }, + { + "epoch": 0.4888517861490834, + "grad_norm": 0.97265625, + "learning_rate": 0.0010846238788181552, + "loss": 0.7759, + "step": 7027 + }, + { + "epoch": 0.48892135378621865, + "grad_norm": 0.97265625, + "learning_rate": 0.0010843993519510807, + "loss": 0.9408, + "step": 7028 + }, + { + "epoch": 0.4889909214233539, + "grad_norm": 0.9296875, + "learning_rate": 0.0010841748207986324, + "loss": 0.6235, + "step": 7029 + }, + { + "epoch": 0.48906048906048905, + "grad_norm": 1.1484375, + "learning_rate": 0.001083950285372211, + "loss": 1.0157, + "step": 7030 + }, + { + "epoch": 0.48913005669762427, + "grad_norm": 1.4453125, + "learning_rate": 0.0010837257456832172, + "loss": 0.9633, + "step": 7031 + }, + { + "epoch": 0.4891996243347595, + "grad_norm": 0.91796875, + "learning_rate": 0.0010835012017430521, + "loss": 0.7513, + "step": 7032 + }, + { + "epoch": 0.48926919197189467, + "grad_norm": 1.0703125, + "learning_rate": 0.0010832766535631166, + "loss": 0.9007, + "step": 7033 + }, + { + "epoch": 0.4893387596090299, + "grad_norm": 1.140625, + "learning_rate": 0.001083052101154813, + "loss": 0.7177, + "step": 7034 + }, + { + "epoch": 0.48940832724616506, + "grad_norm": 1.15625, + "learning_rate": 0.0010828275445295414, + "loss": 0.8104, + "step": 7035 + }, + { + "epoch": 0.4894778948833003, + "grad_norm": 1.0, + "learning_rate": 0.0010826029836987052, + "loss": 0.8287, + "step": 7036 + }, + { + "epoch": 0.4895474625204355, + "grad_norm": 1.4296875, + "learning_rate": 0.0010823784186737059, + "loss": 1.0064, + "step": 7037 + }, + { + "epoch": 0.4896170301575707, + "grad_norm": 0.8671875, + "learning_rate": 0.0010821538494659453, + "loss": 0.6387, + "step": 7038 + }, + { + "epoch": 0.4896865977947059, + "grad_norm": 1.03125, + "learning_rate": 0.001081929276086827, + "loss": 0.8335, + "step": 7039 + }, + { + "epoch": 0.4897561654318411, + "grad_norm": 1.0703125, + "learning_rate": 0.0010817046985477522, + "loss": 0.8973, + "step": 7040 + }, + { + "epoch": 0.4898257330689763, + "grad_norm": 1.1953125, + "learning_rate": 0.0010814801168601252, + "loss": 0.957, + "step": 7041 + }, + { + "epoch": 0.48989530070611154, + "grad_norm": 1.03125, + "learning_rate": 0.001081255531035348, + "loss": 0.7339, + "step": 7042 + }, + { + "epoch": 0.4899648683432467, + "grad_norm": 1.0625, + "learning_rate": 0.0010810309410848248, + "loss": 0.8312, + "step": 7043 + }, + { + "epoch": 0.49003443598038193, + "grad_norm": 0.93359375, + "learning_rate": 0.001080806347019959, + "loss": 0.7359, + "step": 7044 + }, + { + "epoch": 0.4901040036175171, + "grad_norm": 1.2109375, + "learning_rate": 0.001080581748852154, + "loss": 0.9715, + "step": 7045 + }, + { + "epoch": 0.49017357125465233, + "grad_norm": 1.0703125, + "learning_rate": 0.0010803571465928142, + "loss": 0.7558, + "step": 7046 + }, + { + "epoch": 0.49024313889178756, + "grad_norm": 1.03125, + "learning_rate": 0.0010801325402533433, + "loss": 0.8256, + "step": 7047 + }, + { + "epoch": 0.4903127065289227, + "grad_norm": 1.0625, + "learning_rate": 0.001079907929845146, + "loss": 0.9817, + "step": 7048 + }, + { + "epoch": 0.49038227416605795, + "grad_norm": 1.0703125, + "learning_rate": 0.0010796833153796266, + "loss": 1.0321, + "step": 7049 + }, + { + "epoch": 0.4904518418031932, + "grad_norm": 1.1484375, + "learning_rate": 0.00107945869686819, + "loss": 0.6911, + "step": 7050 + }, + { + "epoch": 0.49052140944032835, + "grad_norm": 1.3359375, + "learning_rate": 0.0010792340743222418, + "loss": 1.0358, + "step": 7051 + }, + { + "epoch": 0.4905909770774636, + "grad_norm": 1.375, + "learning_rate": 0.0010790094477531862, + "loss": 0.7645, + "step": 7052 + }, + { + "epoch": 0.49066054471459875, + "grad_norm": 1.3125, + "learning_rate": 0.0010787848171724293, + "loss": 0.8814, + "step": 7053 + }, + { + "epoch": 0.49073011235173397, + "grad_norm": 1.3828125, + "learning_rate": 0.0010785601825913764, + "loss": 0.9904, + "step": 7054 + }, + { + "epoch": 0.4907996799888692, + "grad_norm": 1.21875, + "learning_rate": 0.0010783355440214335, + "loss": 0.9773, + "step": 7055 + }, + { + "epoch": 0.49086924762600437, + "grad_norm": 0.96875, + "learning_rate": 0.0010781109014740063, + "loss": 0.7507, + "step": 7056 + }, + { + "epoch": 0.4909388152631396, + "grad_norm": 1.0, + "learning_rate": 0.0010778862549605016, + "loss": 0.8323, + "step": 7057 + }, + { + "epoch": 0.49100838290027476, + "grad_norm": 1.203125, + "learning_rate": 0.001077661604492325, + "loss": 0.9502, + "step": 7058 + }, + { + "epoch": 0.49107795053741, + "grad_norm": 1.109375, + "learning_rate": 0.0010774369500808837, + "loss": 0.8934, + "step": 7059 + }, + { + "epoch": 0.4911475181745452, + "grad_norm": 1.2109375, + "learning_rate": 0.0010772122917375845, + "loss": 0.6439, + "step": 7060 + }, + { + "epoch": 0.4912170858116804, + "grad_norm": 1.1171875, + "learning_rate": 0.0010769876294738343, + "loss": 0.9425, + "step": 7061 + }, + { + "epoch": 0.4912866534488156, + "grad_norm": 1.109375, + "learning_rate": 0.0010767629633010407, + "loss": 0.7707, + "step": 7062 + }, + { + "epoch": 0.49135622108595084, + "grad_norm": 1.1640625, + "learning_rate": 0.00107653829323061, + "loss": 0.6929, + "step": 7063 + }, + { + "epoch": 0.491425788723086, + "grad_norm": 1.265625, + "learning_rate": 0.0010763136192739509, + "loss": 0.8058, + "step": 7064 + }, + { + "epoch": 0.49149535636022124, + "grad_norm": 0.96484375, + "learning_rate": 0.0010760889414424709, + "loss": 0.8957, + "step": 7065 + }, + { + "epoch": 0.4915649239973564, + "grad_norm": 0.8984375, + "learning_rate": 0.0010758642597475778, + "loss": 0.738, + "step": 7066 + }, + { + "epoch": 0.49163449163449163, + "grad_norm": 0.90234375, + "learning_rate": 0.0010756395742006803, + "loss": 0.6443, + "step": 7067 + }, + { + "epoch": 0.49170405927162686, + "grad_norm": 1.03125, + "learning_rate": 0.0010754148848131863, + "loss": 0.7924, + "step": 7068 + }, + { + "epoch": 0.49177362690876203, + "grad_norm": 1.0078125, + "learning_rate": 0.0010751901915965045, + "loss": 0.7784, + "step": 7069 + }, + { + "epoch": 0.49184319454589726, + "grad_norm": 1.2890625, + "learning_rate": 0.0010749654945620437, + "loss": 0.95, + "step": 7070 + }, + { + "epoch": 0.4919127621830324, + "grad_norm": 1.1015625, + "learning_rate": 0.0010747407937212133, + "loss": 0.7802, + "step": 7071 + }, + { + "epoch": 0.49198232982016765, + "grad_norm": 1.109375, + "learning_rate": 0.0010745160890854215, + "loss": 0.7512, + "step": 7072 + }, + { + "epoch": 0.4920518974573029, + "grad_norm": 1.0703125, + "learning_rate": 0.0010742913806660787, + "loss": 0.9372, + "step": 7073 + }, + { + "epoch": 0.49212146509443805, + "grad_norm": 0.71484375, + "learning_rate": 0.001074066668474594, + "loss": 0.677, + "step": 7074 + }, + { + "epoch": 0.4921910327315733, + "grad_norm": 1.2578125, + "learning_rate": 0.0010738419525223772, + "loss": 0.9019, + "step": 7075 + }, + { + "epoch": 0.4922606003687085, + "grad_norm": 1.421875, + "learning_rate": 0.0010736172328208381, + "loss": 0.8827, + "step": 7076 + }, + { + "epoch": 0.49233016800584367, + "grad_norm": 1.1015625, + "learning_rate": 0.0010733925093813872, + "loss": 0.7767, + "step": 7077 + }, + { + "epoch": 0.4923997356429789, + "grad_norm": 1.1796875, + "learning_rate": 0.0010731677822154349, + "loss": 0.7247, + "step": 7078 + }, + { + "epoch": 0.49246930328011407, + "grad_norm": 1.0546875, + "learning_rate": 0.0010729430513343908, + "loss": 0.8941, + "step": 7079 + }, + { + "epoch": 0.4925388709172493, + "grad_norm": 1.1953125, + "learning_rate": 0.0010727183167496663, + "loss": 0.899, + "step": 7080 + }, + { + "epoch": 0.4926084385543845, + "grad_norm": 0.90625, + "learning_rate": 0.0010724935784726724, + "loss": 0.5359, + "step": 7081 + }, + { + "epoch": 0.4926780061915197, + "grad_norm": 1.2578125, + "learning_rate": 0.0010722688365148198, + "loss": 0.9204, + "step": 7082 + }, + { + "epoch": 0.4927475738286549, + "grad_norm": 1.046875, + "learning_rate": 0.0010720440908875202, + "loss": 1.1598, + "step": 7083 + }, + { + "epoch": 0.4928171414657901, + "grad_norm": 1.1640625, + "learning_rate": 0.0010718193416021846, + "loss": 0.7563, + "step": 7084 + }, + { + "epoch": 0.4928867091029253, + "grad_norm": 1.1875, + "learning_rate": 0.0010715945886702247, + "loss": 0.9402, + "step": 7085 + }, + { + "epoch": 0.49295627674006054, + "grad_norm": 1.0703125, + "learning_rate": 0.0010713698321030527, + "loss": 0.8206, + "step": 7086 + }, + { + "epoch": 0.4930258443771957, + "grad_norm": 1.09375, + "learning_rate": 0.0010711450719120804, + "loss": 0.8456, + "step": 7087 + }, + { + "epoch": 0.49309541201433094, + "grad_norm": 1.203125, + "learning_rate": 0.0010709203081087197, + "loss": 0.6072, + "step": 7088 + }, + { + "epoch": 0.49316497965146616, + "grad_norm": 1.1171875, + "learning_rate": 0.0010706955407043834, + "loss": 0.8227, + "step": 7089 + }, + { + "epoch": 0.49323454728860133, + "grad_norm": 1.21875, + "learning_rate": 0.001070470769710484, + "loss": 0.9223, + "step": 7090 + }, + { + "epoch": 0.49330411492573656, + "grad_norm": 0.97265625, + "learning_rate": 0.0010702459951384337, + "loss": 0.8665, + "step": 7091 + }, + { + "epoch": 0.49337368256287173, + "grad_norm": 1.171875, + "learning_rate": 0.0010700212169996461, + "loss": 0.7868, + "step": 7092 + }, + { + "epoch": 0.49344325020000696, + "grad_norm": 1.0703125, + "learning_rate": 0.001069796435305534, + "loss": 0.9867, + "step": 7093 + }, + { + "epoch": 0.4935128178371422, + "grad_norm": 1.1171875, + "learning_rate": 0.0010695716500675107, + "loss": 0.8527, + "step": 7094 + }, + { + "epoch": 0.49358238547427735, + "grad_norm": 1.0625, + "learning_rate": 0.0010693468612969898, + "loss": 0.9523, + "step": 7095 + }, + { + "epoch": 0.4936519531114126, + "grad_norm": 0.984375, + "learning_rate": 0.0010691220690053846, + "loss": 0.8617, + "step": 7096 + }, + { + "epoch": 0.49372152074854775, + "grad_norm": 1.1015625, + "learning_rate": 0.0010688972732041094, + "loss": 0.5449, + "step": 7097 + }, + { + "epoch": 0.493791088385683, + "grad_norm": 1.203125, + "learning_rate": 0.0010686724739045776, + "loss": 0.5827, + "step": 7098 + }, + { + "epoch": 0.4938606560228182, + "grad_norm": 0.96875, + "learning_rate": 0.0010684476711182041, + "loss": 0.8263, + "step": 7099 + }, + { + "epoch": 0.49393022365995337, + "grad_norm": 1.265625, + "learning_rate": 0.0010682228648564026, + "loss": 0.7761, + "step": 7100 + }, + { + "epoch": 0.4939997912970886, + "grad_norm": 1.1484375, + "learning_rate": 0.001067998055130588, + "loss": 0.8542, + "step": 7101 + }, + { + "epoch": 0.4940693589342238, + "grad_norm": 1.0546875, + "learning_rate": 0.0010677732419521748, + "loss": 0.8381, + "step": 7102 + }, + { + "epoch": 0.494138926571359, + "grad_norm": 1.015625, + "learning_rate": 0.001067548425332578, + "loss": 0.7612, + "step": 7103 + }, + { + "epoch": 0.4942084942084942, + "grad_norm": 1.3671875, + "learning_rate": 0.0010673236052832127, + "loss": 1.0282, + "step": 7104 + }, + { + "epoch": 0.4942780618456294, + "grad_norm": 1.125, + "learning_rate": 0.0010670987818154941, + "loss": 0.9268, + "step": 7105 + }, + { + "epoch": 0.4943476294827646, + "grad_norm": 0.9921875, + "learning_rate": 0.0010668739549408372, + "loss": 0.8165, + "step": 7106 + }, + { + "epoch": 0.49441719711989984, + "grad_norm": 1.1796875, + "learning_rate": 0.0010666491246706584, + "loss": 0.8969, + "step": 7107 + }, + { + "epoch": 0.494486764757035, + "grad_norm": 1.1328125, + "learning_rate": 0.0010664242910163727, + "loss": 0.8662, + "step": 7108 + }, + { + "epoch": 0.49455633239417024, + "grad_norm": 1.0078125, + "learning_rate": 0.0010661994539893965, + "loss": 0.8012, + "step": 7109 + }, + { + "epoch": 0.4946259000313054, + "grad_norm": 1.765625, + "learning_rate": 0.0010659746136011457, + "loss": 1.2093, + "step": 7110 + }, + { + "epoch": 0.49469546766844064, + "grad_norm": 1.1171875, + "learning_rate": 0.0010657497698630363, + "loss": 0.8276, + "step": 7111 + }, + { + "epoch": 0.49476503530557586, + "grad_norm": 1.2265625, + "learning_rate": 0.0010655249227864852, + "loss": 0.7684, + "step": 7112 + }, + { + "epoch": 0.49483460294271103, + "grad_norm": 1.09375, + "learning_rate": 0.0010653000723829086, + "loss": 0.9557, + "step": 7113 + }, + { + "epoch": 0.49490417057984626, + "grad_norm": 0.921875, + "learning_rate": 0.0010650752186637238, + "loss": 0.6632, + "step": 7114 + }, + { + "epoch": 0.4949737382169815, + "grad_norm": 1.1171875, + "learning_rate": 0.001064850361640347, + "loss": 0.9204, + "step": 7115 + }, + { + "epoch": 0.49504330585411666, + "grad_norm": 1.6796875, + "learning_rate": 0.0010646255013241962, + "loss": 0.8105, + "step": 7116 + }, + { + "epoch": 0.4951128734912519, + "grad_norm": 1.3359375, + "learning_rate": 0.0010644006377266877, + "loss": 0.9422, + "step": 7117 + }, + { + "epoch": 0.49518244112838705, + "grad_norm": 1.4609375, + "learning_rate": 0.0010641757708592396, + "loss": 0.9256, + "step": 7118 + }, + { + "epoch": 0.4952520087655223, + "grad_norm": 1.421875, + "learning_rate": 0.0010639509007332694, + "loss": 0.9179, + "step": 7119 + }, + { + "epoch": 0.4953215764026575, + "grad_norm": 1.078125, + "learning_rate": 0.0010637260273601947, + "loss": 0.7678, + "step": 7120 + }, + { + "epoch": 0.4953911440397927, + "grad_norm": 1.0078125, + "learning_rate": 0.0010635011507514336, + "loss": 0.824, + "step": 7121 + }, + { + "epoch": 0.4954607116769279, + "grad_norm": 1.4765625, + "learning_rate": 0.001063276270918404, + "loss": 0.8793, + "step": 7122 + }, + { + "epoch": 0.49553027931406307, + "grad_norm": 0.96484375, + "learning_rate": 0.0010630513878725244, + "loss": 0.8169, + "step": 7123 + }, + { + "epoch": 0.4955998469511983, + "grad_norm": 1.1640625, + "learning_rate": 0.0010628265016252132, + "loss": 0.7137, + "step": 7124 + }, + { + "epoch": 0.4956694145883335, + "grad_norm": 1.46875, + "learning_rate": 0.0010626016121878887, + "loss": 1.0796, + "step": 7125 + }, + { + "epoch": 0.4957389822254687, + "grad_norm": 0.98046875, + "learning_rate": 0.00106237671957197, + "loss": 0.7397, + "step": 7126 + }, + { + "epoch": 0.4958085498626039, + "grad_norm": 0.84375, + "learning_rate": 0.0010621518237888762, + "loss": 0.7724, + "step": 7127 + }, + { + "epoch": 0.49587811749973915, + "grad_norm": 0.94921875, + "learning_rate": 0.0010619269248500257, + "loss": 0.8617, + "step": 7128 + }, + { + "epoch": 0.4959476851368743, + "grad_norm": 1.0625, + "learning_rate": 0.001061702022766838, + "loss": 0.9586, + "step": 7129 + }, + { + "epoch": 0.49601725277400954, + "grad_norm": 1.1328125, + "learning_rate": 0.0010614771175507327, + "loss": 0.8047, + "step": 7130 + }, + { + "epoch": 0.4960868204111447, + "grad_norm": 1.078125, + "learning_rate": 0.0010612522092131294, + "loss": 0.9814, + "step": 7131 + }, + { + "epoch": 0.49615638804827994, + "grad_norm": 0.81640625, + "learning_rate": 0.0010610272977654475, + "loss": 0.7223, + "step": 7132 + }, + { + "epoch": 0.49622595568541517, + "grad_norm": 1.2734375, + "learning_rate": 0.0010608023832191069, + "loss": 0.82, + "step": 7133 + }, + { + "epoch": 0.49629552332255034, + "grad_norm": 1.796875, + "learning_rate": 0.0010605774655855279, + "loss": 1.0853, + "step": 7134 + }, + { + "epoch": 0.49636509095968556, + "grad_norm": 1.171875, + "learning_rate": 0.0010603525448761304, + "loss": 0.7723, + "step": 7135 + }, + { + "epoch": 0.49643465859682073, + "grad_norm": 1.09375, + "learning_rate": 0.001060127621102335, + "loss": 0.8135, + "step": 7136 + }, + { + "epoch": 0.49650422623395596, + "grad_norm": 1.0859375, + "learning_rate": 0.001059902694275562, + "loss": 0.8008, + "step": 7137 + }, + { + "epoch": 0.4965737938710912, + "grad_norm": 1.203125, + "learning_rate": 0.0010596777644072321, + "loss": 0.7991, + "step": 7138 + }, + { + "epoch": 0.49664336150822636, + "grad_norm": 0.94921875, + "learning_rate": 0.0010594528315087664, + "loss": 0.7725, + "step": 7139 + }, + { + "epoch": 0.4967129291453616, + "grad_norm": 1.046875, + "learning_rate": 0.0010592278955915853, + "loss": 0.9814, + "step": 7140 + }, + { + "epoch": 0.4967824967824968, + "grad_norm": 1.5546875, + "learning_rate": 0.0010590029566671102, + "loss": 0.994, + "step": 7141 + }, + { + "epoch": 0.496852064419632, + "grad_norm": 1.0546875, + "learning_rate": 0.0010587780147467624, + "loss": 0.8298, + "step": 7142 + }, + { + "epoch": 0.4969216320567672, + "grad_norm": 1.0546875, + "learning_rate": 0.0010585530698419634, + "loss": 0.8363, + "step": 7143 + }, + { + "epoch": 0.4969911996939024, + "grad_norm": 1.140625, + "learning_rate": 0.0010583281219641346, + "loss": 0.807, + "step": 7144 + }, + { + "epoch": 0.4970607673310376, + "grad_norm": 1.109375, + "learning_rate": 0.0010581031711246977, + "loss": 0.9414, + "step": 7145 + }, + { + "epoch": 0.4971303349681728, + "grad_norm": 1.3828125, + "learning_rate": 0.0010578782173350746, + "loss": 0.6897, + "step": 7146 + }, + { + "epoch": 0.497199902605308, + "grad_norm": 1.09375, + "learning_rate": 0.0010576532606066873, + "loss": 0.9442, + "step": 7147 + }, + { + "epoch": 0.4972694702424432, + "grad_norm": 1.1015625, + "learning_rate": 0.001057428300950958, + "loss": 0.8061, + "step": 7148 + }, + { + "epoch": 0.4973390378795784, + "grad_norm": 1.2578125, + "learning_rate": 0.0010572033383793092, + "loss": 0.794, + "step": 7149 + }, + { + "epoch": 0.4974086055167136, + "grad_norm": 1.0, + "learning_rate": 0.0010569783729031633, + "loss": 0.8179, + "step": 7150 + }, + { + "epoch": 0.49747817315384885, + "grad_norm": 1.640625, + "learning_rate": 0.0010567534045339425, + "loss": 0.9908, + "step": 7151 + }, + { + "epoch": 0.497547740790984, + "grad_norm": 1.0546875, + "learning_rate": 0.00105652843328307, + "loss": 0.8828, + "step": 7152 + }, + { + "epoch": 0.49761730842811924, + "grad_norm": 1.75, + "learning_rate": 0.0010563034591619686, + "loss": 0.8516, + "step": 7153 + }, + { + "epoch": 0.49768687606525447, + "grad_norm": 1.3984375, + "learning_rate": 0.0010560784821820614, + "loss": 0.816, + "step": 7154 + }, + { + "epoch": 0.49775644370238964, + "grad_norm": 1.2578125, + "learning_rate": 0.0010558535023547715, + "loss": 0.7183, + "step": 7155 + }, + { + "epoch": 0.49782601133952487, + "grad_norm": 0.99609375, + "learning_rate": 0.0010556285196915223, + "loss": 0.7265, + "step": 7156 + }, + { + "epoch": 0.49789557897666004, + "grad_norm": 0.96875, + "learning_rate": 0.0010554035342037371, + "loss": 0.7026, + "step": 7157 + }, + { + "epoch": 0.49796514661379526, + "grad_norm": 0.9453125, + "learning_rate": 0.0010551785459028398, + "loss": 0.7944, + "step": 7158 + }, + { + "epoch": 0.4980347142509305, + "grad_norm": 0.92578125, + "learning_rate": 0.001054953554800254, + "loss": 0.7208, + "step": 7159 + }, + { + "epoch": 0.49810428188806566, + "grad_norm": 1.0234375, + "learning_rate": 0.0010547285609074039, + "loss": 0.9822, + "step": 7160 + }, + { + "epoch": 0.4981738495252009, + "grad_norm": 1.1328125, + "learning_rate": 0.001054503564235713, + "loss": 0.7046, + "step": 7161 + }, + { + "epoch": 0.49824341716233606, + "grad_norm": 1.21875, + "learning_rate": 0.001054278564796606, + "loss": 0.8731, + "step": 7162 + }, + { + "epoch": 0.4983129847994713, + "grad_norm": 1.109375, + "learning_rate": 0.0010540535626015072, + "loss": 0.7237, + "step": 7163 + }, + { + "epoch": 0.4983825524366065, + "grad_norm": 0.9609375, + "learning_rate": 0.0010538285576618407, + "loss": 0.7564, + "step": 7164 + }, + { + "epoch": 0.4984521200737417, + "grad_norm": 0.75, + "learning_rate": 0.0010536035499890315, + "loss": 0.5736, + "step": 7165 + }, + { + "epoch": 0.4985216877108769, + "grad_norm": 1.0078125, + "learning_rate": 0.001053378539594504, + "loss": 0.864, + "step": 7166 + }, + { + "epoch": 0.49859125534801213, + "grad_norm": 0.9140625, + "learning_rate": 0.0010531535264896837, + "loss": 0.8084, + "step": 7167 + }, + { + "epoch": 0.4986608229851473, + "grad_norm": 0.88671875, + "learning_rate": 0.0010529285106859949, + "loss": 0.6282, + "step": 7168 + }, + { + "epoch": 0.4987303906222825, + "grad_norm": 1.15625, + "learning_rate": 0.0010527034921948633, + "loss": 0.7866, + "step": 7169 + }, + { + "epoch": 0.4987999582594177, + "grad_norm": 1.0859375, + "learning_rate": 0.0010524784710277137, + "loss": 0.7971, + "step": 7170 + }, + { + "epoch": 0.4988695258965529, + "grad_norm": 1.3984375, + "learning_rate": 0.0010522534471959723, + "loss": 0.8345, + "step": 7171 + }, + { + "epoch": 0.49893909353368815, + "grad_norm": 1.3125, + "learning_rate": 0.001052028420711064, + "loss": 0.8096, + "step": 7172 + }, + { + "epoch": 0.4990086611708233, + "grad_norm": 1.2421875, + "learning_rate": 0.0010518033915844147, + "loss": 0.7644, + "step": 7173 + }, + { + "epoch": 0.49907822880795855, + "grad_norm": 1.09375, + "learning_rate": 0.0010515783598274502, + "loss": 0.9254, + "step": 7174 + }, + { + "epoch": 0.4991477964450937, + "grad_norm": 1.1171875, + "learning_rate": 0.0010513533254515965, + "loss": 0.8688, + "step": 7175 + }, + { + "epoch": 0.49921736408222894, + "grad_norm": 1.1796875, + "learning_rate": 0.0010511282884682802, + "loss": 0.8079, + "step": 7176 + }, + { + "epoch": 0.49928693171936417, + "grad_norm": 0.9765625, + "learning_rate": 0.001050903248888927, + "loss": 0.695, + "step": 7177 + }, + { + "epoch": 0.49935649935649934, + "grad_norm": 1.2265625, + "learning_rate": 0.001050678206724963, + "loss": 0.7909, + "step": 7178 + }, + { + "epoch": 0.49942606699363457, + "grad_norm": 1.2421875, + "learning_rate": 0.0010504531619878155, + "loss": 0.9214, + "step": 7179 + }, + { + "epoch": 0.4994956346307698, + "grad_norm": 1.1796875, + "learning_rate": 0.0010502281146889108, + "loss": 0.9654, + "step": 7180 + }, + { + "epoch": 0.49956520226790496, + "grad_norm": 1.09375, + "learning_rate": 0.0010500030648396752, + "loss": 0.8288, + "step": 7181 + }, + { + "epoch": 0.4996347699050402, + "grad_norm": 1.609375, + "learning_rate": 0.0010497780124515362, + "loss": 0.875, + "step": 7182 + }, + { + "epoch": 0.49970433754217536, + "grad_norm": 1.15625, + "learning_rate": 0.0010495529575359208, + "loss": 0.8785, + "step": 7183 + }, + { + "epoch": 0.4997739051793106, + "grad_norm": 1.03125, + "learning_rate": 0.0010493279001042557, + "loss": 0.7318, + "step": 7184 + }, + { + "epoch": 0.4998434728164458, + "grad_norm": 0.94140625, + "learning_rate": 0.0010491028401679687, + "loss": 0.5975, + "step": 7185 + }, + { + "epoch": 0.499913040453581, + "grad_norm": 0.890625, + "learning_rate": 0.0010488777777384868, + "loss": 0.9336, + "step": 7186 + }, + { + "epoch": 0.4999826080907162, + "grad_norm": 1.03125, + "learning_rate": 0.0010486527128272377, + "loss": 0.8408, + "step": 7187 + }, + { + "epoch": 0.5000521757278514, + "grad_norm": 1.1953125, + "learning_rate": 0.0010484276454456492, + "loss": 0.938, + "step": 7188 + }, + { + "epoch": 0.5001217433649866, + "grad_norm": 1.1171875, + "learning_rate": 0.001048202575605149, + "loss": 0.7174, + "step": 7189 + }, + { + "epoch": 0.5001913110021218, + "grad_norm": 1.2265625, + "learning_rate": 0.0010479775033171647, + "loss": 0.8057, + "step": 7190 + }, + { + "epoch": 0.5002608786392571, + "grad_norm": 1.046875, + "learning_rate": 0.0010477524285931246, + "loss": 0.7395, + "step": 7191 + }, + { + "epoch": 0.5003304462763922, + "grad_norm": 1.015625, + "learning_rate": 0.0010475273514444574, + "loss": 0.8219, + "step": 7192 + }, + { + "epoch": 0.5004000139135274, + "grad_norm": 0.9921875, + "learning_rate": 0.0010473022718825904, + "loss": 0.9962, + "step": 7193 + }, + { + "epoch": 0.5004695815506627, + "grad_norm": 1.171875, + "learning_rate": 0.0010470771899189525, + "loss": 0.9283, + "step": 7194 + }, + { + "epoch": 0.5005391491877978, + "grad_norm": 1.375, + "learning_rate": 0.0010468521055649722, + "loss": 1.09, + "step": 7195 + }, + { + "epoch": 0.500608716824933, + "grad_norm": 1.015625, + "learning_rate": 0.0010466270188320783, + "loss": 0.6323, + "step": 7196 + }, + { + "epoch": 0.5006782844620682, + "grad_norm": 1.1015625, + "learning_rate": 0.0010464019297316992, + "loss": 0.9214, + "step": 7197 + }, + { + "epoch": 0.5007478520992035, + "grad_norm": 1.59375, + "learning_rate": 0.0010461768382752639, + "loss": 1.1907, + "step": 7198 + }, + { + "epoch": 0.5008174197363386, + "grad_norm": 1.0078125, + "learning_rate": 0.001045951744474202, + "loss": 0.9154, + "step": 7199 + }, + { + "epoch": 0.5008869873734738, + "grad_norm": 1.15625, + "learning_rate": 0.0010457266483399417, + "loss": 0.623, + "step": 7200 + }, + { + "epoch": 0.5009565550106091, + "grad_norm": 1.2421875, + "learning_rate": 0.0010455015498839126, + "loss": 0.9534, + "step": 7201 + }, + { + "epoch": 0.5010261226477443, + "grad_norm": 0.97265625, + "learning_rate": 0.0010452764491175443, + "loss": 0.7288, + "step": 7202 + }, + { + "epoch": 0.5010956902848794, + "grad_norm": 0.96484375, + "learning_rate": 0.0010450513460522662, + "loss": 0.8695, + "step": 7203 + }, + { + "epoch": 0.5011652579220147, + "grad_norm": 1.3125, + "learning_rate": 0.0010448262406995076, + "loss": 1.0398, + "step": 7204 + }, + { + "epoch": 0.5012348255591499, + "grad_norm": 1.1171875, + "learning_rate": 0.0010446011330706986, + "loss": 0.6585, + "step": 7205 + }, + { + "epoch": 0.5013043931962851, + "grad_norm": 1.1953125, + "learning_rate": 0.001044376023177269, + "loss": 0.9344, + "step": 7206 + }, + { + "epoch": 0.5013739608334203, + "grad_norm": 0.9765625, + "learning_rate": 0.0010441509110306483, + "loss": 0.8191, + "step": 7207 + }, + { + "epoch": 0.5014435284705555, + "grad_norm": 1.015625, + "learning_rate": 0.0010439257966422674, + "loss": 0.8409, + "step": 7208 + }, + { + "epoch": 0.5015130961076907, + "grad_norm": 1.1953125, + "learning_rate": 0.0010437006800235553, + "loss": 0.6652, + "step": 7209 + }, + { + "epoch": 0.5015826637448259, + "grad_norm": 1.265625, + "learning_rate": 0.0010434755611859435, + "loss": 1.1023, + "step": 7210 + }, + { + "epoch": 0.5016522313819611, + "grad_norm": 1.1328125, + "learning_rate": 0.0010432504401408614, + "loss": 1.0829, + "step": 7211 + }, + { + "epoch": 0.5017217990190963, + "grad_norm": 1.171875, + "learning_rate": 0.00104302531689974, + "loss": 0.69, + "step": 7212 + }, + { + "epoch": 0.5017913666562315, + "grad_norm": 0.92578125, + "learning_rate": 0.0010428001914740102, + "loss": 0.6944, + "step": 7213 + }, + { + "epoch": 0.5018609342933668, + "grad_norm": 1.03125, + "learning_rate": 0.0010425750638751018, + "loss": 0.6269, + "step": 7214 + }, + { + "epoch": 0.5019305019305019, + "grad_norm": 1.15625, + "learning_rate": 0.001042349934114447, + "loss": 0.8376, + "step": 7215 + }, + { + "epoch": 0.5020000695676371, + "grad_norm": 0.9921875, + "learning_rate": 0.0010421248022034755, + "loss": 0.688, + "step": 7216 + }, + { + "epoch": 0.5020696372047724, + "grad_norm": 1.0546875, + "learning_rate": 0.001041899668153619, + "loss": 0.9963, + "step": 7217 + }, + { + "epoch": 0.5021392048419075, + "grad_norm": 1.0703125, + "learning_rate": 0.0010416745319763085, + "loss": 0.8619, + "step": 7218 + }, + { + "epoch": 0.5022087724790427, + "grad_norm": 1.03125, + "learning_rate": 0.0010414493936829754, + "loss": 0.8277, + "step": 7219 + }, + { + "epoch": 0.502278340116178, + "grad_norm": 1.09375, + "learning_rate": 0.001041224253285051, + "loss": 0.698, + "step": 7220 + }, + { + "epoch": 0.5023479077533132, + "grad_norm": 1.0078125, + "learning_rate": 0.0010409991107939668, + "loss": 0.9984, + "step": 7221 + }, + { + "epoch": 0.5024174753904483, + "grad_norm": 1.1484375, + "learning_rate": 0.0010407739662211546, + "loss": 0.9493, + "step": 7222 + }, + { + "epoch": 0.5024870430275835, + "grad_norm": 1.28125, + "learning_rate": 0.0010405488195780455, + "loss": 1.0066, + "step": 7223 + }, + { + "epoch": 0.5025566106647188, + "grad_norm": 1.2109375, + "learning_rate": 0.0010403236708760723, + "loss": 0.8073, + "step": 7224 + }, + { + "epoch": 0.502626178301854, + "grad_norm": 1.1328125, + "learning_rate": 0.0010400985201266656, + "loss": 1.0504, + "step": 7225 + }, + { + "epoch": 0.5026957459389891, + "grad_norm": 1.3125, + "learning_rate": 0.0010398733673412583, + "loss": 0.8966, + "step": 7226 + }, + { + "epoch": 0.5027653135761244, + "grad_norm": 1.25, + "learning_rate": 0.001039648212531283, + "loss": 0.8191, + "step": 7227 + }, + { + "epoch": 0.5028348812132596, + "grad_norm": 1.21875, + "learning_rate": 0.0010394230557081708, + "loss": 0.8162, + "step": 7228 + }, + { + "epoch": 0.5029044488503948, + "grad_norm": 1.0703125, + "learning_rate": 0.0010391978968833549, + "loss": 0.8613, + "step": 7229 + }, + { + "epoch": 0.50297401648753, + "grad_norm": 0.796875, + "learning_rate": 0.0010389727360682669, + "loss": 0.4959, + "step": 7230 + }, + { + "epoch": 0.5030435841246652, + "grad_norm": 0.9765625, + "learning_rate": 0.0010387475732743401, + "loss": 0.8968, + "step": 7231 + }, + { + "epoch": 0.5031131517618004, + "grad_norm": 1.3046875, + "learning_rate": 0.0010385224085130067, + "loss": 0.7869, + "step": 7232 + }, + { + "epoch": 0.5031827193989357, + "grad_norm": 1.265625, + "learning_rate": 0.0010382972417956997, + "loss": 0.9447, + "step": 7233 + }, + { + "epoch": 0.5032522870360708, + "grad_norm": 0.9765625, + "learning_rate": 0.0010380720731338517, + "loss": 0.6995, + "step": 7234 + }, + { + "epoch": 0.503321854673206, + "grad_norm": 1.03125, + "learning_rate": 0.0010378469025388954, + "loss": 1.0611, + "step": 7235 + }, + { + "epoch": 0.5033914223103412, + "grad_norm": 1.1328125, + "learning_rate": 0.0010376217300222647, + "loss": 0.8676, + "step": 7236 + }, + { + "epoch": 0.5034609899474765, + "grad_norm": 1.1953125, + "learning_rate": 0.0010373965555953919, + "loss": 0.751, + "step": 7237 + }, + { + "epoch": 0.5035305575846116, + "grad_norm": 0.90234375, + "learning_rate": 0.0010371713792697108, + "loss": 0.7965, + "step": 7238 + }, + { + "epoch": 0.5036001252217468, + "grad_norm": 1.1015625, + "learning_rate": 0.001036946201056654, + "loss": 0.8837, + "step": 7239 + }, + { + "epoch": 0.5036696928588821, + "grad_norm": 1.125, + "learning_rate": 0.0010367210209676556, + "loss": 0.9095, + "step": 7240 + }, + { + "epoch": 0.5037392604960172, + "grad_norm": 2.171875, + "learning_rate": 0.0010364958390141489, + "loss": 0.6597, + "step": 7241 + }, + { + "epoch": 0.5038088281331524, + "grad_norm": 1.1328125, + "learning_rate": 0.0010362706552075672, + "loss": 0.645, + "step": 7242 + }, + { + "epoch": 0.5038783957702877, + "grad_norm": 0.97265625, + "learning_rate": 0.0010360454695593447, + "loss": 0.9205, + "step": 7243 + }, + { + "epoch": 0.5039479634074229, + "grad_norm": 1.2109375, + "learning_rate": 0.0010358202820809146, + "loss": 1.0243, + "step": 7244 + }, + { + "epoch": 0.504017531044558, + "grad_norm": 1.0546875, + "learning_rate": 0.0010355950927837115, + "loss": 0.6335, + "step": 7245 + }, + { + "epoch": 0.5040870986816933, + "grad_norm": 1.1875, + "learning_rate": 0.0010353699016791684, + "loss": 0.7884, + "step": 7246 + }, + { + "epoch": 0.5041566663188285, + "grad_norm": 1.1328125, + "learning_rate": 0.0010351447087787206, + "loss": 0.9097, + "step": 7247 + }, + { + "epoch": 0.5042262339559637, + "grad_norm": 1.171875, + "learning_rate": 0.0010349195140938016, + "loss": 0.7768, + "step": 7248 + }, + { + "epoch": 0.5042958015930988, + "grad_norm": 1.3046875, + "learning_rate": 0.0010346943176358452, + "loss": 1.0664, + "step": 7249 + }, + { + "epoch": 0.5043653692302341, + "grad_norm": 1.421875, + "learning_rate": 0.0010344691194162866, + "loss": 0.9543, + "step": 7250 + }, + { + "epoch": 0.5044349368673693, + "grad_norm": 1.1171875, + "learning_rate": 0.00103424391944656, + "loss": 0.9053, + "step": 7251 + }, + { + "epoch": 0.5045045045045045, + "grad_norm": 1.140625, + "learning_rate": 0.0010340187177380995, + "loss": 0.8547, + "step": 7252 + }, + { + "epoch": 0.5045740721416397, + "grad_norm": 1.015625, + "learning_rate": 0.0010337935143023397, + "loss": 0.558, + "step": 7253 + }, + { + "epoch": 0.5046436397787749, + "grad_norm": 1.1484375, + "learning_rate": 0.0010335683091507162, + "loss": 0.7452, + "step": 7254 + }, + { + "epoch": 0.5047132074159101, + "grad_norm": 0.859375, + "learning_rate": 0.001033343102294663, + "loss": 0.5924, + "step": 7255 + }, + { + "epoch": 0.5047827750530454, + "grad_norm": 1.171875, + "learning_rate": 0.0010331178937456147, + "loss": 0.9584, + "step": 7256 + }, + { + "epoch": 0.5048523426901805, + "grad_norm": 1.0234375, + "learning_rate": 0.0010328926835150073, + "loss": 0.9278, + "step": 7257 + }, + { + "epoch": 0.5049219103273157, + "grad_norm": 1.15625, + "learning_rate": 0.0010326674716142745, + "loss": 0.9773, + "step": 7258 + }, + { + "epoch": 0.504991477964451, + "grad_norm": 1.1953125, + "learning_rate": 0.0010324422580548528, + "loss": 1.0611, + "step": 7259 + }, + { + "epoch": 0.5050610456015862, + "grad_norm": 1.1171875, + "learning_rate": 0.0010322170428481764, + "loss": 0.7346, + "step": 7260 + }, + { + "epoch": 0.5051306132387213, + "grad_norm": 1.375, + "learning_rate": 0.0010319918260056813, + "loss": 0.993, + "step": 7261 + }, + { + "epoch": 0.5052001808758565, + "grad_norm": 1.1015625, + "learning_rate": 0.001031766607538802, + "loss": 0.9258, + "step": 7262 + }, + { + "epoch": 0.5052697485129918, + "grad_norm": 1.125, + "learning_rate": 0.0010315413874589748, + "loss": 0.6496, + "step": 7263 + }, + { + "epoch": 0.505339316150127, + "grad_norm": 1.0078125, + "learning_rate": 0.0010313161657776351, + "loss": 0.669, + "step": 7264 + }, + { + "epoch": 0.5054088837872621, + "grad_norm": 1.2734375, + "learning_rate": 0.0010310909425062177, + "loss": 0.9657, + "step": 7265 + }, + { + "epoch": 0.5054784514243974, + "grad_norm": 1.53125, + "learning_rate": 0.0010308657176561597, + "loss": 1.0466, + "step": 7266 + }, + { + "epoch": 0.5055480190615326, + "grad_norm": 0.97265625, + "learning_rate": 0.0010306404912388957, + "loss": 0.9254, + "step": 7267 + }, + { + "epoch": 0.5056175866986677, + "grad_norm": 0.8125, + "learning_rate": 0.0010304152632658623, + "loss": 0.5554, + "step": 7268 + }, + { + "epoch": 0.505687154335803, + "grad_norm": 1.09375, + "learning_rate": 0.0010301900337484947, + "loss": 1.0306, + "step": 7269 + }, + { + "epoch": 0.5057567219729382, + "grad_norm": 1.3359375, + "learning_rate": 0.0010299648026982297, + "loss": 0.9134, + "step": 7270 + }, + { + "epoch": 0.5058262896100734, + "grad_norm": 1.015625, + "learning_rate": 0.001029739570126503, + "loss": 1.012, + "step": 7271 + }, + { + "epoch": 0.5058958572472086, + "grad_norm": 1.2265625, + "learning_rate": 0.0010295143360447507, + "loss": 0.717, + "step": 7272 + }, + { + "epoch": 0.5059654248843438, + "grad_norm": 1.25, + "learning_rate": 0.0010292891004644094, + "loss": 0.9266, + "step": 7273 + }, + { + "epoch": 0.506034992521479, + "grad_norm": 0.92578125, + "learning_rate": 0.001029063863396915, + "loss": 0.4913, + "step": 7274 + }, + { + "epoch": 0.5061045601586142, + "grad_norm": 0.796875, + "learning_rate": 0.0010288386248537042, + "loss": 0.5788, + "step": 7275 + }, + { + "epoch": 0.5061741277957494, + "grad_norm": 1.15625, + "learning_rate": 0.0010286133848462131, + "loss": 0.8754, + "step": 7276 + }, + { + "epoch": 0.5062436954328846, + "grad_norm": 0.9921875, + "learning_rate": 0.0010283881433858792, + "loss": 0.7639, + "step": 7277 + }, + { + "epoch": 0.5063132630700198, + "grad_norm": 1.1015625, + "learning_rate": 0.0010281629004841378, + "loss": 0.7923, + "step": 7278 + }, + { + "epoch": 0.5063828307071551, + "grad_norm": 1.0390625, + "learning_rate": 0.0010279376561524265, + "loss": 0.8669, + "step": 7279 + }, + { + "epoch": 0.5064523983442902, + "grad_norm": 1.1953125, + "learning_rate": 0.0010277124104021821, + "loss": 0.8788, + "step": 7280 + }, + { + "epoch": 0.5065219659814254, + "grad_norm": 1.0, + "learning_rate": 0.0010274871632448407, + "loss": 0.7445, + "step": 7281 + }, + { + "epoch": 0.5065915336185607, + "grad_norm": 1.09375, + "learning_rate": 0.0010272619146918403, + "loss": 0.9489, + "step": 7282 + }, + { + "epoch": 0.5066611012556959, + "grad_norm": 1.1875, + "learning_rate": 0.0010270366647546166, + "loss": 0.8435, + "step": 7283 + }, + { + "epoch": 0.506730668892831, + "grad_norm": 1.4921875, + "learning_rate": 0.001026811413444608, + "loss": 1.0238, + "step": 7284 + }, + { + "epoch": 0.5068002365299663, + "grad_norm": 1.1171875, + "learning_rate": 0.0010265861607732503, + "loss": 0.7611, + "step": 7285 + }, + { + "epoch": 0.5068698041671015, + "grad_norm": 1.171875, + "learning_rate": 0.0010263609067519817, + "loss": 0.8724, + "step": 7286 + }, + { + "epoch": 0.5069393718042366, + "grad_norm": 1.0234375, + "learning_rate": 0.0010261356513922393, + "loss": 0.9458, + "step": 7287 + }, + { + "epoch": 0.5070089394413718, + "grad_norm": 0.984375, + "learning_rate": 0.00102591039470546, + "loss": 0.7484, + "step": 7288 + }, + { + "epoch": 0.5070785070785071, + "grad_norm": 1.6015625, + "learning_rate": 0.0010256851367030817, + "loss": 1.2545, + "step": 7289 + }, + { + "epoch": 0.5071480747156423, + "grad_norm": 1.265625, + "learning_rate": 0.001025459877396541, + "loss": 0.9903, + "step": 7290 + }, + { + "epoch": 0.5072176423527774, + "grad_norm": 1.078125, + "learning_rate": 0.001025234616797277, + "loss": 0.8205, + "step": 7291 + }, + { + "epoch": 0.5072872099899127, + "grad_norm": 1.0234375, + "learning_rate": 0.0010250093549167257, + "loss": 0.8465, + "step": 7292 + }, + { + "epoch": 0.5073567776270479, + "grad_norm": 1.0859375, + "learning_rate": 0.0010247840917663254, + "loss": 0.8993, + "step": 7293 + }, + { + "epoch": 0.5074263452641831, + "grad_norm": 1.25, + "learning_rate": 0.0010245588273575142, + "loss": 0.9857, + "step": 7294 + }, + { + "epoch": 0.5074959129013183, + "grad_norm": 0.9765625, + "learning_rate": 0.001024333561701729, + "loss": 0.6849, + "step": 7295 + }, + { + "epoch": 0.5075654805384535, + "grad_norm": 1.3984375, + "learning_rate": 0.001024108294810409, + "loss": 0.9671, + "step": 7296 + }, + { + "epoch": 0.5076350481755887, + "grad_norm": 1.109375, + "learning_rate": 0.0010238830266949906, + "loss": 0.9284, + "step": 7297 + }, + { + "epoch": 0.507704615812724, + "grad_norm": 0.83203125, + "learning_rate": 0.0010236577573669128, + "loss": 0.6884, + "step": 7298 + }, + { + "epoch": 0.5077741834498591, + "grad_norm": 1.328125, + "learning_rate": 0.001023432486837613, + "loss": 0.7001, + "step": 7299 + }, + { + "epoch": 0.5078437510869943, + "grad_norm": 0.99609375, + "learning_rate": 0.00102320721511853, + "loss": 0.9619, + "step": 7300 + }, + { + "epoch": 0.5079133187241295, + "grad_norm": 1.3515625, + "learning_rate": 0.0010229819422211016, + "loss": 0.9597, + "step": 7301 + }, + { + "epoch": 0.5079828863612648, + "grad_norm": 1.6796875, + "learning_rate": 0.0010227566681567657, + "loss": 1.0313, + "step": 7302 + }, + { + "epoch": 0.5080524539983999, + "grad_norm": 0.96875, + "learning_rate": 0.0010225313929369613, + "loss": 0.7011, + "step": 7303 + }, + { + "epoch": 0.5081220216355351, + "grad_norm": 1.3046875, + "learning_rate": 0.0010223061165731257, + "loss": 0.8618, + "step": 7304 + }, + { + "epoch": 0.5081915892726704, + "grad_norm": 1.0390625, + "learning_rate": 0.0010220808390766986, + "loss": 0.7311, + "step": 7305 + }, + { + "epoch": 0.5082611569098056, + "grad_norm": 1.171875, + "learning_rate": 0.0010218555604591174, + "loss": 0.6457, + "step": 7306 + }, + { + "epoch": 0.5083307245469407, + "grad_norm": 1.109375, + "learning_rate": 0.0010216302807318214, + "loss": 1.0473, + "step": 7307 + }, + { + "epoch": 0.508400292184076, + "grad_norm": 1.1015625, + "learning_rate": 0.0010214049999062481, + "loss": 0.7673, + "step": 7308 + }, + { + "epoch": 0.5084698598212112, + "grad_norm": 1.0, + "learning_rate": 0.0010211797179938374, + "loss": 0.6966, + "step": 7309 + }, + { + "epoch": 0.5085394274583463, + "grad_norm": 0.83203125, + "learning_rate": 0.0010209544350060272, + "loss": 0.6101, + "step": 7310 + }, + { + "epoch": 0.5086089950954816, + "grad_norm": 1.03125, + "learning_rate": 0.0010207291509542562, + "loss": 0.6792, + "step": 7311 + }, + { + "epoch": 0.5086785627326168, + "grad_norm": 1.0078125, + "learning_rate": 0.0010205038658499636, + "loss": 0.6202, + "step": 7312 + }, + { + "epoch": 0.508748130369752, + "grad_norm": 1.0703125, + "learning_rate": 0.0010202785797045878, + "loss": 0.5167, + "step": 7313 + }, + { + "epoch": 0.5088176980068871, + "grad_norm": 1.1328125, + "learning_rate": 0.0010200532925295684, + "loss": 0.8961, + "step": 7314 + }, + { + "epoch": 0.5088872656440224, + "grad_norm": 1.421875, + "learning_rate": 0.0010198280043363435, + "loss": 0.8841, + "step": 7315 + }, + { + "epoch": 0.5089568332811576, + "grad_norm": 0.9140625, + "learning_rate": 0.0010196027151363526, + "loss": 0.8483, + "step": 7316 + }, + { + "epoch": 0.5090264009182928, + "grad_norm": 1.0546875, + "learning_rate": 0.0010193774249410345, + "loss": 0.6354, + "step": 7317 + }, + { + "epoch": 0.509095968555428, + "grad_norm": 1.1796875, + "learning_rate": 0.0010191521337618286, + "loss": 0.8367, + "step": 7318 + }, + { + "epoch": 0.5091655361925632, + "grad_norm": 1.0703125, + "learning_rate": 0.001018926841610174, + "loss": 0.6624, + "step": 7319 + }, + { + "epoch": 0.5092351038296984, + "grad_norm": 1.3359375, + "learning_rate": 0.0010187015484975095, + "loss": 0.7719, + "step": 7320 + }, + { + "epoch": 0.5093046714668337, + "grad_norm": 0.97265625, + "learning_rate": 0.001018476254435275, + "loss": 0.6867, + "step": 7321 + }, + { + "epoch": 0.5093742391039688, + "grad_norm": 1.265625, + "learning_rate": 0.001018250959434909, + "loss": 1.0151, + "step": 7322 + }, + { + "epoch": 0.509443806741104, + "grad_norm": 1.0078125, + "learning_rate": 0.0010180256635078514, + "loss": 0.6282, + "step": 7323 + }, + { + "epoch": 0.5095133743782393, + "grad_norm": 1.3828125, + "learning_rate": 0.0010178003666655416, + "loss": 0.895, + "step": 7324 + }, + { + "epoch": 0.5095829420153745, + "grad_norm": 1.1171875, + "learning_rate": 0.0010175750689194187, + "loss": 0.7366, + "step": 7325 + }, + { + "epoch": 0.5096525096525096, + "grad_norm": 1.2265625, + "learning_rate": 0.0010173497702809225, + "loss": 0.8923, + "step": 7326 + }, + { + "epoch": 0.5097220772896448, + "grad_norm": 1.1953125, + "learning_rate": 0.0010171244707614924, + "loss": 0.9015, + "step": 7327 + }, + { + "epoch": 0.5097916449267801, + "grad_norm": 1.03125, + "learning_rate": 0.0010168991703725682, + "loss": 0.8422, + "step": 7328 + }, + { + "epoch": 0.5098612125639153, + "grad_norm": 1.3828125, + "learning_rate": 0.001016673869125589, + "loss": 0.8664, + "step": 7329 + }, + { + "epoch": 0.5099307802010504, + "grad_norm": 1.2421875, + "learning_rate": 0.0010164485670319948, + "loss": 0.854, + "step": 7330 + }, + { + "epoch": 0.5100003478381857, + "grad_norm": 1.1328125, + "learning_rate": 0.0010162232641032253, + "loss": 0.6463, + "step": 7331 + }, + { + "epoch": 0.5100699154753209, + "grad_norm": 1.375, + "learning_rate": 0.0010159979603507204, + "loss": 0.9051, + "step": 7332 + }, + { + "epoch": 0.510139483112456, + "grad_norm": 0.91015625, + "learning_rate": 0.0010157726557859196, + "loss": 0.7682, + "step": 7333 + }, + { + "epoch": 0.5102090507495913, + "grad_norm": 1.109375, + "learning_rate": 0.0010155473504202626, + "loss": 0.7717, + "step": 7334 + }, + { + "epoch": 0.5102786183867265, + "grad_norm": 1.2109375, + "learning_rate": 0.00101532204426519, + "loss": 0.63, + "step": 7335 + }, + { + "epoch": 0.5103481860238617, + "grad_norm": 1.375, + "learning_rate": 0.0010150967373321407, + "loss": 0.8076, + "step": 7336 + }, + { + "epoch": 0.510417753660997, + "grad_norm": 0.87890625, + "learning_rate": 0.0010148714296325553, + "loss": 0.7188, + "step": 7337 + }, + { + "epoch": 0.5104873212981321, + "grad_norm": 1.2109375, + "learning_rate": 0.0010146461211778738, + "loss": 0.9532, + "step": 7338 + }, + { + "epoch": 0.5105568889352673, + "grad_norm": 1.3515625, + "learning_rate": 0.0010144208119795362, + "loss": 0.9941, + "step": 7339 + }, + { + "epoch": 0.5106264565724025, + "grad_norm": 0.8515625, + "learning_rate": 0.0010141955020489823, + "loss": 0.6594, + "step": 7340 + }, + { + "epoch": 0.5106960242095377, + "grad_norm": 1.125, + "learning_rate": 0.0010139701913976524, + "loss": 0.893, + "step": 7341 + }, + { + "epoch": 0.5107655918466729, + "grad_norm": 1.578125, + "learning_rate": 0.0010137448800369869, + "loss": 0.6618, + "step": 7342 + }, + { + "epoch": 0.5108351594838081, + "grad_norm": 1.0390625, + "learning_rate": 0.001013519567978425, + "loss": 0.6087, + "step": 7343 + }, + { + "epoch": 0.5109047271209434, + "grad_norm": 1.4921875, + "learning_rate": 0.0010132942552334078, + "loss": 0.9438, + "step": 7344 + }, + { + "epoch": 0.5109742947580785, + "grad_norm": 1.2265625, + "learning_rate": 0.0010130689418133755, + "loss": 0.6784, + "step": 7345 + }, + { + "epoch": 0.5110438623952137, + "grad_norm": 0.9453125, + "learning_rate": 0.0010128436277297684, + "loss": 0.6899, + "step": 7346 + }, + { + "epoch": 0.511113430032349, + "grad_norm": 1.296875, + "learning_rate": 0.0010126183129940264, + "loss": 1.0467, + "step": 7347 + }, + { + "epoch": 0.5111829976694842, + "grad_norm": 1.1328125, + "learning_rate": 0.0010123929976175899, + "loss": 0.8388, + "step": 7348 + }, + { + "epoch": 0.5112525653066193, + "grad_norm": 0.95703125, + "learning_rate": 0.0010121676816118997, + "loss": 0.7295, + "step": 7349 + }, + { + "epoch": 0.5113221329437546, + "grad_norm": 0.89453125, + "learning_rate": 0.0010119423649883957, + "loss": 0.6243, + "step": 7350 + }, + { + "epoch": 0.5113917005808898, + "grad_norm": 1.2109375, + "learning_rate": 0.001011717047758519, + "loss": 0.7465, + "step": 7351 + }, + { + "epoch": 0.511461268218025, + "grad_norm": 0.75390625, + "learning_rate": 0.0010114917299337092, + "loss": 0.7375, + "step": 7352 + }, + { + "epoch": 0.5115308358551601, + "grad_norm": 1.296875, + "learning_rate": 0.0010112664115254075, + "loss": 0.7938, + "step": 7353 + }, + { + "epoch": 0.5116004034922954, + "grad_norm": 1.25, + "learning_rate": 0.0010110410925450542, + "loss": 0.8754, + "step": 7354 + }, + { + "epoch": 0.5116699711294306, + "grad_norm": 0.98828125, + "learning_rate": 0.00101081577300409, + "loss": 1.0421, + "step": 7355 + }, + { + "epoch": 0.5117395387665657, + "grad_norm": 1.265625, + "learning_rate": 0.001010590452913955, + "loss": 0.9681, + "step": 7356 + }, + { + "epoch": 0.511809106403701, + "grad_norm": 1.1640625, + "learning_rate": 0.0010103651322860905, + "loss": 0.8657, + "step": 7357 + }, + { + "epoch": 0.5118786740408362, + "grad_norm": 1.46875, + "learning_rate": 0.0010101398111319372, + "loss": 0.9783, + "step": 7358 + }, + { + "epoch": 0.5119482416779714, + "grad_norm": 1.2421875, + "learning_rate": 0.0010099144894629346, + "loss": 0.6622, + "step": 7359 + }, + { + "epoch": 0.5120178093151067, + "grad_norm": 1.296875, + "learning_rate": 0.0010096891672905246, + "loss": 0.9833, + "step": 7360 + }, + { + "epoch": 0.5120873769522418, + "grad_norm": 1.484375, + "learning_rate": 0.0010094638446261474, + "loss": 0.8881, + "step": 7361 + }, + { + "epoch": 0.512156944589377, + "grad_norm": 1.125, + "learning_rate": 0.0010092385214812438, + "loss": 0.5359, + "step": 7362 + }, + { + "epoch": 0.5122265122265123, + "grad_norm": 1.4140625, + "learning_rate": 0.001009013197867255, + "loss": 0.9023, + "step": 7363 + }, + { + "epoch": 0.5122960798636474, + "grad_norm": 0.91796875, + "learning_rate": 0.001008787873795621, + "loss": 0.4285, + "step": 7364 + }, + { + "epoch": 0.5123656475007826, + "grad_norm": 1.015625, + "learning_rate": 0.0010085625492777834, + "loss": 0.8085, + "step": 7365 + }, + { + "epoch": 0.5124352151379178, + "grad_norm": 1.2890625, + "learning_rate": 0.0010083372243251828, + "loss": 1.0129, + "step": 7366 + }, + { + "epoch": 0.5125047827750531, + "grad_norm": 1.0625, + "learning_rate": 0.0010081118989492598, + "loss": 0.6876, + "step": 7367 + }, + { + "epoch": 0.5125743504121882, + "grad_norm": 1.4140625, + "learning_rate": 0.0010078865731614553, + "loss": 1.2099, + "step": 7368 + }, + { + "epoch": 0.5126439180493234, + "grad_norm": 1.25, + "learning_rate": 0.0010076612469732105, + "loss": 1.0681, + "step": 7369 + }, + { + "epoch": 0.5127134856864587, + "grad_norm": 1.4609375, + "learning_rate": 0.0010074359203959661, + "loss": 0.7904, + "step": 7370 + }, + { + "epoch": 0.5127830533235939, + "grad_norm": 1.5703125, + "learning_rate": 0.0010072105934411633, + "loss": 0.9934, + "step": 7371 + }, + { + "epoch": 0.512852620960729, + "grad_norm": 0.95703125, + "learning_rate": 0.0010069852661202428, + "loss": 0.8781, + "step": 7372 + }, + { + "epoch": 0.5129221885978643, + "grad_norm": 1.1015625, + "learning_rate": 0.0010067599384446456, + "loss": 0.884, + "step": 7373 + }, + { + "epoch": 0.5129917562349995, + "grad_norm": 1.046875, + "learning_rate": 0.001006534610425813, + "loss": 0.6635, + "step": 7374 + }, + { + "epoch": 0.5130613238721347, + "grad_norm": 1.0625, + "learning_rate": 0.0010063092820751858, + "loss": 0.9075, + "step": 7375 + }, + { + "epoch": 0.5131308915092699, + "grad_norm": 1.15625, + "learning_rate": 0.001006083953404205, + "loss": 0.6741, + "step": 7376 + }, + { + "epoch": 0.5132004591464051, + "grad_norm": 0.96484375, + "learning_rate": 0.0010058586244243118, + "loss": 0.6751, + "step": 7377 + }, + { + "epoch": 0.5132700267835403, + "grad_norm": 0.7890625, + "learning_rate": 0.0010056332951469472, + "loss": 0.6612, + "step": 7378 + }, + { + "epoch": 0.5133395944206754, + "grad_norm": 1.0, + "learning_rate": 0.001005407965583552, + "loss": 0.9126, + "step": 7379 + }, + { + "epoch": 0.5134091620578107, + "grad_norm": 0.953125, + "learning_rate": 0.0010051826357455678, + "loss": 0.676, + "step": 7380 + }, + { + "epoch": 0.5134787296949459, + "grad_norm": 1.25, + "learning_rate": 0.0010049573056444354, + "loss": 0.6661, + "step": 7381 + }, + { + "epoch": 0.5135482973320811, + "grad_norm": 1.3671875, + "learning_rate": 0.001004731975291596, + "loss": 0.7886, + "step": 7382 + }, + { + "epoch": 0.5136178649692164, + "grad_norm": 1.265625, + "learning_rate": 0.0010045066446984908, + "loss": 0.8526, + "step": 7383 + }, + { + "epoch": 0.5136874326063515, + "grad_norm": 1.46875, + "learning_rate": 0.0010042813138765607, + "loss": 1.2588, + "step": 7384 + }, + { + "epoch": 0.5137570002434867, + "grad_norm": 1.4296875, + "learning_rate": 0.0010040559828372474, + "loss": 0.7593, + "step": 7385 + }, + { + "epoch": 0.513826567880622, + "grad_norm": 0.92578125, + "learning_rate": 0.0010038306515919916, + "loss": 0.6358, + "step": 7386 + }, + { + "epoch": 0.5138961355177571, + "grad_norm": 1.375, + "learning_rate": 0.0010036053201522347, + "loss": 0.7824, + "step": 7387 + }, + { + "epoch": 0.5139657031548923, + "grad_norm": 1.2109375, + "learning_rate": 0.0010033799885294174, + "loss": 1.0388, + "step": 7388 + }, + { + "epoch": 0.5140352707920276, + "grad_norm": 1.03125, + "learning_rate": 0.0010031546567349815, + "loss": 0.8488, + "step": 7389 + }, + { + "epoch": 0.5141048384291628, + "grad_norm": 0.97265625, + "learning_rate": 0.0010029293247803685, + "loss": 0.6219, + "step": 7390 + }, + { + "epoch": 0.5141744060662979, + "grad_norm": 1.0625, + "learning_rate": 0.0010027039926770187, + "loss": 0.5945, + "step": 7391 + }, + { + "epoch": 0.5142439737034331, + "grad_norm": 1.015625, + "learning_rate": 0.0010024786604363738, + "loss": 0.8943, + "step": 7392 + }, + { + "epoch": 0.5143135413405684, + "grad_norm": 1.3359375, + "learning_rate": 0.0010022533280698751, + "loss": 0.8981, + "step": 7393 + }, + { + "epoch": 0.5143831089777036, + "grad_norm": 1.3203125, + "learning_rate": 0.0010020279955889637, + "loss": 0.7547, + "step": 7394 + }, + { + "epoch": 0.5144526766148387, + "grad_norm": 1.125, + "learning_rate": 0.001001802663005081, + "loss": 0.6436, + "step": 7395 + }, + { + "epoch": 0.514522244251974, + "grad_norm": 0.9921875, + "learning_rate": 0.0010015773303296682, + "loss": 0.6235, + "step": 7396 + }, + { + "epoch": 0.5145918118891092, + "grad_norm": 0.9375, + "learning_rate": 0.0010013519975741662, + "loss": 0.7985, + "step": 7397 + }, + { + "epoch": 0.5146613795262444, + "grad_norm": 1.2578125, + "learning_rate": 0.001001126664750017, + "loss": 0.9846, + "step": 7398 + }, + { + "epoch": 0.5147309471633796, + "grad_norm": 1.0390625, + "learning_rate": 0.0010009013318686612, + "loss": 0.8467, + "step": 7399 + }, + { + "epoch": 0.5148005148005148, + "grad_norm": 1.2265625, + "learning_rate": 0.0010006759989415403, + "loss": 0.8558, + "step": 7400 + }, + { + "epoch": 0.51487008243765, + "grad_norm": 1.015625, + "learning_rate": 0.0010004506659800959, + "loss": 0.7214, + "step": 7401 + }, + { + "epoch": 0.5149396500747853, + "grad_norm": 1.2578125, + "learning_rate": 0.0010002253329957685, + "loss": 1.0984, + "step": 7402 + }, + { + "epoch": 0.5150092177119204, + "grad_norm": 1.203125, + "learning_rate": 0.001, + "loss": 0.95, + "step": 7403 + }, + { + "epoch": 0.5150787853490556, + "grad_norm": 1.1875, + "learning_rate": 0.0009997746670042315, + "loss": 0.7794, + "step": 7404 + }, + { + "epoch": 0.5151483529861908, + "grad_norm": 0.9921875, + "learning_rate": 0.0009995493340199042, + "loss": 0.8167, + "step": 7405 + }, + { + "epoch": 0.515217920623326, + "grad_norm": 1.109375, + "learning_rate": 0.0009993240010584597, + "loss": 0.7491, + "step": 7406 + }, + { + "epoch": 0.5152874882604612, + "grad_norm": 1.1328125, + "learning_rate": 0.0009990986681313388, + "loss": 1.0262, + "step": 7407 + }, + { + "epoch": 0.5153570558975964, + "grad_norm": 0.9453125, + "learning_rate": 0.0009988733352499833, + "loss": 0.6585, + "step": 7408 + }, + { + "epoch": 0.5154266235347317, + "grad_norm": 1.296875, + "learning_rate": 0.0009986480024258338, + "loss": 0.925, + "step": 7409 + }, + { + "epoch": 0.5154961911718668, + "grad_norm": 1.2421875, + "learning_rate": 0.000998422669670332, + "loss": 0.8088, + "step": 7410 + }, + { + "epoch": 0.515565758809002, + "grad_norm": 0.953125, + "learning_rate": 0.000998197336994919, + "loss": 0.565, + "step": 7411 + }, + { + "epoch": 0.5156353264461373, + "grad_norm": 1.4296875, + "learning_rate": 0.0009979720044110362, + "loss": 0.9264, + "step": 7412 + }, + { + "epoch": 0.5157048940832725, + "grad_norm": 1.0390625, + "learning_rate": 0.0009977466719301251, + "loss": 0.8953, + "step": 7413 + }, + { + "epoch": 0.5157744617204076, + "grad_norm": 1.0390625, + "learning_rate": 0.0009975213395636263, + "loss": 0.769, + "step": 7414 + }, + { + "epoch": 0.5158440293575429, + "grad_norm": 1.140625, + "learning_rate": 0.0009972960073229818, + "loss": 0.8064, + "step": 7415 + }, + { + "epoch": 0.5159135969946781, + "grad_norm": 1.171875, + "learning_rate": 0.0009970706752196316, + "loss": 0.9126, + "step": 7416 + }, + { + "epoch": 0.5159831646318133, + "grad_norm": 1.265625, + "learning_rate": 0.0009968453432650185, + "loss": 0.9174, + "step": 7417 + }, + { + "epoch": 0.5160527322689484, + "grad_norm": 1.25, + "learning_rate": 0.0009966200114705827, + "loss": 0.8217, + "step": 7418 + }, + { + "epoch": 0.5161222999060837, + "grad_norm": 1.046875, + "learning_rate": 0.0009963946798477654, + "loss": 0.7123, + "step": 7419 + }, + { + "epoch": 0.5161918675432189, + "grad_norm": 1.21875, + "learning_rate": 0.0009961693484080087, + "loss": 1.0308, + "step": 7420 + }, + { + "epoch": 0.5162614351803541, + "grad_norm": 0.9609375, + "learning_rate": 0.000995944017162753, + "loss": 0.8344, + "step": 7421 + }, + { + "epoch": 0.5163310028174893, + "grad_norm": 1.0625, + "learning_rate": 0.0009957186861234396, + "loss": 0.6491, + "step": 7422 + }, + { + "epoch": 0.5164005704546245, + "grad_norm": 1.203125, + "learning_rate": 0.0009954933553015092, + "loss": 0.8135, + "step": 7423 + }, + { + "epoch": 0.5164701380917597, + "grad_norm": 1.2578125, + "learning_rate": 0.0009952680247084043, + "loss": 0.8808, + "step": 7424 + }, + { + "epoch": 0.516539705728895, + "grad_norm": 1.0078125, + "learning_rate": 0.0009950426943555648, + "loss": 0.6941, + "step": 7425 + }, + { + "epoch": 0.5166092733660301, + "grad_norm": 1.2421875, + "learning_rate": 0.0009948173642544322, + "loss": 0.7446, + "step": 7426 + }, + { + "epoch": 0.5166788410031653, + "grad_norm": 1.1015625, + "learning_rate": 0.000994592034416448, + "loss": 0.6119, + "step": 7427 + }, + { + "epoch": 0.5167484086403006, + "grad_norm": 1.140625, + "learning_rate": 0.000994366704853053, + "loss": 0.7891, + "step": 7428 + }, + { + "epoch": 0.5168179762774358, + "grad_norm": 0.921875, + "learning_rate": 0.0009941413755756886, + "loss": 0.6071, + "step": 7429 + }, + { + "epoch": 0.5168875439145709, + "grad_norm": 1.09375, + "learning_rate": 0.000993916046595795, + "loss": 0.751, + "step": 7430 + }, + { + "epoch": 0.5169571115517061, + "grad_norm": 0.8515625, + "learning_rate": 0.0009936907179248144, + "loss": 0.7383, + "step": 7431 + }, + { + "epoch": 0.5170266791888414, + "grad_norm": 1.2109375, + "learning_rate": 0.0009934653895741872, + "loss": 0.7809, + "step": 7432 + }, + { + "epoch": 0.5170962468259765, + "grad_norm": 0.96875, + "learning_rate": 0.0009932400615553542, + "loss": 0.9368, + "step": 7433 + }, + { + "epoch": 0.5171658144631117, + "grad_norm": 0.80859375, + "learning_rate": 0.0009930147338797573, + "loss": 0.6948, + "step": 7434 + }, + { + "epoch": 0.517235382100247, + "grad_norm": 1.0234375, + "learning_rate": 0.000992789406558837, + "loss": 0.8302, + "step": 7435 + }, + { + "epoch": 0.5173049497373822, + "grad_norm": 1.15625, + "learning_rate": 0.0009925640796040341, + "loss": 0.8594, + "step": 7436 + }, + { + "epoch": 0.5173745173745173, + "grad_norm": 1.1640625, + "learning_rate": 0.0009923387530267895, + "loss": 0.8291, + "step": 7437 + }, + { + "epoch": 0.5174440850116526, + "grad_norm": 1.3515625, + "learning_rate": 0.000992113426838545, + "loss": 0.9406, + "step": 7438 + }, + { + "epoch": 0.5175136526487878, + "grad_norm": 0.9609375, + "learning_rate": 0.0009918881010507405, + "loss": 0.8503, + "step": 7439 + }, + { + "epoch": 0.517583220285923, + "grad_norm": 0.9296875, + "learning_rate": 0.0009916627756748173, + "loss": 0.7722, + "step": 7440 + }, + { + "epoch": 0.5176527879230582, + "grad_norm": 1.1328125, + "learning_rate": 0.0009914374507222167, + "loss": 1.0453, + "step": 7441 + }, + { + "epoch": 0.5177223555601934, + "grad_norm": 1.0078125, + "learning_rate": 0.000991212126204379, + "loss": 0.8305, + "step": 7442 + }, + { + "epoch": 0.5177919231973286, + "grad_norm": 1.0234375, + "learning_rate": 0.0009909868021327451, + "loss": 0.9021, + "step": 7443 + }, + { + "epoch": 0.5178614908344638, + "grad_norm": 1.0625, + "learning_rate": 0.000990761478518756, + "loss": 0.8837, + "step": 7444 + }, + { + "epoch": 0.517931058471599, + "grad_norm": 1.1875, + "learning_rate": 0.0009905361553738529, + "loss": 0.6102, + "step": 7445 + }, + { + "epoch": 0.5180006261087342, + "grad_norm": 1.1875, + "learning_rate": 0.0009903108327094757, + "loss": 0.8095, + "step": 7446 + }, + { + "epoch": 0.5180701937458694, + "grad_norm": 1.09375, + "learning_rate": 0.0009900855105370657, + "loss": 0.8938, + "step": 7447 + }, + { + "epoch": 0.5181397613830047, + "grad_norm": 1.1328125, + "learning_rate": 0.000989860188868063, + "loss": 0.9159, + "step": 7448 + }, + { + "epoch": 0.5182093290201398, + "grad_norm": 1.140625, + "learning_rate": 0.0009896348677139095, + "loss": 0.6995, + "step": 7449 + }, + { + "epoch": 0.518278896657275, + "grad_norm": 0.9453125, + "learning_rate": 0.000989409547086045, + "loss": 0.779, + "step": 7450 + }, + { + "epoch": 0.5183484642944103, + "grad_norm": 1.21875, + "learning_rate": 0.00098918422699591, + "loss": 0.9001, + "step": 7451 + }, + { + "epoch": 0.5184180319315455, + "grad_norm": 1.0703125, + "learning_rate": 0.0009889589074549459, + "loss": 1.0165, + "step": 7452 + }, + { + "epoch": 0.5184875995686806, + "grad_norm": 1.34375, + "learning_rate": 0.0009887335884745925, + "loss": 0.8979, + "step": 7453 + }, + { + "epoch": 0.5185571672058159, + "grad_norm": 1.2578125, + "learning_rate": 0.000988508270066291, + "loss": 0.9494, + "step": 7454 + }, + { + "epoch": 0.5186267348429511, + "grad_norm": 1.125, + "learning_rate": 0.000988282952241481, + "loss": 0.9349, + "step": 7455 + }, + { + "epoch": 0.5186963024800862, + "grad_norm": 1.6171875, + "learning_rate": 0.0009880576350116044, + "loss": 0.9685, + "step": 7456 + }, + { + "epoch": 0.5187658701172214, + "grad_norm": 1.3828125, + "learning_rate": 0.0009878323183881005, + "loss": 1.3639, + "step": 7457 + }, + { + "epoch": 0.5188354377543567, + "grad_norm": 1.03125, + "learning_rate": 0.0009876070023824102, + "loss": 0.8812, + "step": 7458 + }, + { + "epoch": 0.5189050053914919, + "grad_norm": 1.109375, + "learning_rate": 0.0009873816870059739, + "loss": 0.9826, + "step": 7459 + }, + { + "epoch": 0.518974573028627, + "grad_norm": 1.5390625, + "learning_rate": 0.0009871563722702319, + "loss": 0.8891, + "step": 7460 + }, + { + "epoch": 0.5190441406657623, + "grad_norm": 1.046875, + "learning_rate": 0.0009869310581866247, + "loss": 0.8919, + "step": 7461 + }, + { + "epoch": 0.5191137083028975, + "grad_norm": 1.125, + "learning_rate": 0.000986705744766592, + "loss": 0.8724, + "step": 7462 + }, + { + "epoch": 0.5191832759400327, + "grad_norm": 0.91796875, + "learning_rate": 0.000986480432021575, + "loss": 0.6715, + "step": 7463 + }, + { + "epoch": 0.5192528435771679, + "grad_norm": 1.234375, + "learning_rate": 0.0009862551199630136, + "loss": 0.8011, + "step": 7464 + }, + { + "epoch": 0.5193224112143031, + "grad_norm": 1.1015625, + "learning_rate": 0.0009860298086023474, + "loss": 0.8478, + "step": 7465 + }, + { + "epoch": 0.5193919788514383, + "grad_norm": 1.265625, + "learning_rate": 0.0009858044979510177, + "loss": 0.7981, + "step": 7466 + }, + { + "epoch": 0.5194615464885736, + "grad_norm": 1.2421875, + "learning_rate": 0.0009855791880204639, + "loss": 0.9811, + "step": 7467 + }, + { + "epoch": 0.5195311141257087, + "grad_norm": 1.296875, + "learning_rate": 0.0009853538788221262, + "loss": 0.9317, + "step": 7468 + }, + { + "epoch": 0.5196006817628439, + "grad_norm": 1.9609375, + "learning_rate": 0.0009851285703674445, + "loss": 1.1164, + "step": 7469 + }, + { + "epoch": 0.5196702493999791, + "grad_norm": 1.109375, + "learning_rate": 0.0009849032626678595, + "loss": 0.7334, + "step": 7470 + }, + { + "epoch": 0.5197398170371144, + "grad_norm": 0.87890625, + "learning_rate": 0.0009846779557348103, + "loss": 0.5433, + "step": 7471 + }, + { + "epoch": 0.5198093846742495, + "grad_norm": 1.140625, + "learning_rate": 0.0009844526495797372, + "loss": 0.8945, + "step": 7472 + }, + { + "epoch": 0.5198789523113847, + "grad_norm": 1.1796875, + "learning_rate": 0.0009842273442140807, + "loss": 0.7181, + "step": 7473 + }, + { + "epoch": 0.51994851994852, + "grad_norm": 1.3203125, + "learning_rate": 0.0009840020396492798, + "loss": 0.9351, + "step": 7474 + }, + { + "epoch": 0.5200180875856552, + "grad_norm": 1.484375, + "learning_rate": 0.000983776735896775, + "loss": 0.648, + "step": 7475 + }, + { + "epoch": 0.5200876552227903, + "grad_norm": 0.984375, + "learning_rate": 0.0009835514329680052, + "loss": 0.8321, + "step": 7476 + }, + { + "epoch": 0.5201572228599256, + "grad_norm": 1.09375, + "learning_rate": 0.0009833261308744112, + "loss": 0.8325, + "step": 7477 + }, + { + "epoch": 0.5202267904970608, + "grad_norm": 1.3203125, + "learning_rate": 0.0009831008296274323, + "loss": 0.8579, + "step": 7478 + }, + { + "epoch": 0.520296358134196, + "grad_norm": 1.3515625, + "learning_rate": 0.0009828755292385076, + "loss": 1.0932, + "step": 7479 + }, + { + "epoch": 0.5203659257713312, + "grad_norm": 0.97265625, + "learning_rate": 0.0009826502297190776, + "loss": 0.8288, + "step": 7480 + }, + { + "epoch": 0.5204354934084664, + "grad_norm": 1.1796875, + "learning_rate": 0.0009824249310805815, + "loss": 0.9406, + "step": 7481 + }, + { + "epoch": 0.5205050610456016, + "grad_norm": 1.203125, + "learning_rate": 0.0009821996333344587, + "loss": 1.028, + "step": 7482 + }, + { + "epoch": 0.5205746286827367, + "grad_norm": 1.0234375, + "learning_rate": 0.0009819743364921484, + "loss": 0.7697, + "step": 7483 + }, + { + "epoch": 0.520644196319872, + "grad_norm": 1.2421875, + "learning_rate": 0.000981749040565091, + "loss": 0.7308, + "step": 7484 + }, + { + "epoch": 0.5207137639570072, + "grad_norm": 1.015625, + "learning_rate": 0.0009815237455647254, + "loss": 0.8124, + "step": 7485 + }, + { + "epoch": 0.5207833315941424, + "grad_norm": 1.1171875, + "learning_rate": 0.0009812984515024904, + "loss": 0.5755, + "step": 7486 + }, + { + "epoch": 0.5208528992312776, + "grad_norm": 1.046875, + "learning_rate": 0.000981073158389826, + "loss": 0.9276, + "step": 7487 + }, + { + "epoch": 0.5209224668684128, + "grad_norm": 0.78515625, + "learning_rate": 0.0009808478662381714, + "loss": 0.6658, + "step": 7488 + }, + { + "epoch": 0.520992034505548, + "grad_norm": 1.0703125, + "learning_rate": 0.0009806225750589655, + "loss": 0.8713, + "step": 7489 + }, + { + "epoch": 0.5210616021426833, + "grad_norm": 0.84375, + "learning_rate": 0.0009803972848636473, + "loss": 0.7039, + "step": 7490 + }, + { + "epoch": 0.5211311697798184, + "grad_norm": 1.234375, + "learning_rate": 0.0009801719956636567, + "loss": 0.9872, + "step": 7491 + }, + { + "epoch": 0.5212007374169536, + "grad_norm": 1.1328125, + "learning_rate": 0.0009799467074704318, + "loss": 0.6726, + "step": 7492 + }, + { + "epoch": 0.5212703050540889, + "grad_norm": 1.0390625, + "learning_rate": 0.000979721420295412, + "loss": 0.7117, + "step": 7493 + }, + { + "epoch": 0.5213398726912241, + "grad_norm": 1.4375, + "learning_rate": 0.0009794961341500364, + "loss": 0.7386, + "step": 7494 + }, + { + "epoch": 0.5214094403283592, + "grad_norm": 1.0859375, + "learning_rate": 0.0009792708490457438, + "loss": 1.0354, + "step": 7495 + }, + { + "epoch": 0.5214790079654944, + "grad_norm": 0.875, + "learning_rate": 0.000979045564993973, + "loss": 0.6982, + "step": 7496 + }, + { + "epoch": 0.5215485756026297, + "grad_norm": 0.92578125, + "learning_rate": 0.0009788202820061626, + "loss": 0.8027, + "step": 7497 + }, + { + "epoch": 0.5216181432397649, + "grad_norm": 0.86328125, + "learning_rate": 0.000978595000093752, + "loss": 0.8026, + "step": 7498 + }, + { + "epoch": 0.5216877108769, + "grad_norm": 0.95703125, + "learning_rate": 0.000978369719268179, + "loss": 0.7331, + "step": 7499 + }, + { + "epoch": 0.5217572785140353, + "grad_norm": 1.1015625, + "learning_rate": 0.0009781444395408824, + "loss": 1.0517, + "step": 7500 + }, + { + "epoch": 0.5218268461511705, + "grad_norm": 1.046875, + "learning_rate": 0.0009779191609233014, + "loss": 0.7587, + "step": 7501 + }, + { + "epoch": 0.5218964137883056, + "grad_norm": 1.34375, + "learning_rate": 0.0009776938834268744, + "loss": 1.1129, + "step": 7502 + }, + { + "epoch": 0.5219659814254409, + "grad_norm": 1.078125, + "learning_rate": 0.0009774686070630392, + "loss": 0.605, + "step": 7503 + }, + { + "epoch": 0.5220355490625761, + "grad_norm": 0.9453125, + "learning_rate": 0.0009772433318432341, + "loss": 0.6579, + "step": 7504 + }, + { + "epoch": 0.5221051166997113, + "grad_norm": 1.125, + "learning_rate": 0.0009770180577788987, + "loss": 0.7113, + "step": 7505 + }, + { + "epoch": 0.5221746843368466, + "grad_norm": 1.140625, + "learning_rate": 0.0009767927848814701, + "loss": 1.2031, + "step": 7506 + }, + { + "epoch": 0.5222442519739817, + "grad_norm": 0.9765625, + "learning_rate": 0.0009765675131623867, + "loss": 0.8527, + "step": 7507 + }, + { + "epoch": 0.5223138196111169, + "grad_norm": 0.98046875, + "learning_rate": 0.0009763422426330873, + "loss": 0.9367, + "step": 7508 + }, + { + "epoch": 0.5223833872482521, + "grad_norm": 1.0703125, + "learning_rate": 0.0009761169733050096, + "loss": 0.8686, + "step": 7509 + }, + { + "epoch": 0.5224529548853873, + "grad_norm": 0.85546875, + "learning_rate": 0.0009758917051895915, + "loss": 0.5321, + "step": 7510 + }, + { + "epoch": 0.5225225225225225, + "grad_norm": 0.96875, + "learning_rate": 0.0009756664382982708, + "loss": 0.8484, + "step": 7511 + }, + { + "epoch": 0.5225920901596577, + "grad_norm": 1.1328125, + "learning_rate": 0.0009754411726424861, + "loss": 0.7063, + "step": 7512 + }, + { + "epoch": 0.522661657796793, + "grad_norm": 1.1171875, + "learning_rate": 0.0009752159082336747, + "loss": 0.825, + "step": 7513 + }, + { + "epoch": 0.5227312254339281, + "grad_norm": 0.8203125, + "learning_rate": 0.0009749906450832744, + "loss": 0.6291, + "step": 7514 + }, + { + "epoch": 0.5228007930710633, + "grad_norm": 1.21875, + "learning_rate": 0.0009747653832027232, + "loss": 0.7437, + "step": 7515 + }, + { + "epoch": 0.5228703607081986, + "grad_norm": 0.88671875, + "learning_rate": 0.0009745401226034589, + "loss": 0.7518, + "step": 7516 + }, + { + "epoch": 0.5229399283453338, + "grad_norm": 1.3046875, + "learning_rate": 0.0009743148632969186, + "loss": 0.8273, + "step": 7517 + }, + { + "epoch": 0.5230094959824689, + "grad_norm": 1.21875, + "learning_rate": 0.00097408960529454, + "loss": 0.7889, + "step": 7518 + }, + { + "epoch": 0.5230790636196042, + "grad_norm": 1.109375, + "learning_rate": 0.0009738643486077608, + "loss": 0.7051, + "step": 7519 + }, + { + "epoch": 0.5231486312567394, + "grad_norm": 1.25, + "learning_rate": 0.0009736390932480183, + "loss": 0.7936, + "step": 7520 + }, + { + "epoch": 0.5232181988938746, + "grad_norm": 1.25, + "learning_rate": 0.0009734138392267497, + "loss": 0.9106, + "step": 7521 + }, + { + "epoch": 0.5232877665310097, + "grad_norm": 1.046875, + "learning_rate": 0.0009731885865553922, + "loss": 0.7028, + "step": 7522 + }, + { + "epoch": 0.523357334168145, + "grad_norm": 1.09375, + "learning_rate": 0.0009729633352453835, + "loss": 0.7533, + "step": 7523 + }, + { + "epoch": 0.5234269018052802, + "grad_norm": 1.3125, + "learning_rate": 0.0009727380853081601, + "loss": 0.8453, + "step": 7524 + }, + { + "epoch": 0.5234964694424153, + "grad_norm": 0.91796875, + "learning_rate": 0.0009725128367551592, + "loss": 0.6497, + "step": 7525 + }, + { + "epoch": 0.5235660370795506, + "grad_norm": 1.140625, + "learning_rate": 0.000972287589597818, + "loss": 0.9773, + "step": 7526 + }, + { + "epoch": 0.5236356047166858, + "grad_norm": 1.21875, + "learning_rate": 0.0009720623438475737, + "loss": 0.7903, + "step": 7527 + }, + { + "epoch": 0.523705172353821, + "grad_norm": 0.95703125, + "learning_rate": 0.0009718370995158623, + "loss": 0.5697, + "step": 7528 + }, + { + "epoch": 0.5237747399909563, + "grad_norm": 1.0390625, + "learning_rate": 0.000971611856614121, + "loss": 0.7219, + "step": 7529 + }, + { + "epoch": 0.5238443076280914, + "grad_norm": 0.82421875, + "learning_rate": 0.0009713866151537869, + "loss": 0.7004, + "step": 7530 + }, + { + "epoch": 0.5239138752652266, + "grad_norm": 1.1328125, + "learning_rate": 0.0009711613751462961, + "loss": 0.7109, + "step": 7531 + }, + { + "epoch": 0.5239834429023619, + "grad_norm": 0.96484375, + "learning_rate": 0.000970936136603085, + "loss": 0.6683, + "step": 7532 + }, + { + "epoch": 0.524053010539497, + "grad_norm": 1.1328125, + "learning_rate": 0.0009707108995355907, + "loss": 0.8626, + "step": 7533 + }, + { + "epoch": 0.5241225781766322, + "grad_norm": 1.2265625, + "learning_rate": 0.0009704856639552495, + "loss": 0.7987, + "step": 7534 + }, + { + "epoch": 0.5241921458137674, + "grad_norm": 1.171875, + "learning_rate": 0.0009702604298734973, + "loss": 0.8395, + "step": 7535 + }, + { + "epoch": 0.5242617134509027, + "grad_norm": 0.9296875, + "learning_rate": 0.0009700351973017704, + "loss": 0.7056, + "step": 7536 + }, + { + "epoch": 0.5243312810880378, + "grad_norm": 0.95703125, + "learning_rate": 0.0009698099662515054, + "loss": 0.7448, + "step": 7537 + }, + { + "epoch": 0.524400848725173, + "grad_norm": 1.40625, + "learning_rate": 0.000969584736734138, + "loss": 0.8765, + "step": 7538 + }, + { + "epoch": 0.5244704163623083, + "grad_norm": 1.1015625, + "learning_rate": 0.0009693595087611042, + "loss": 0.6362, + "step": 7539 + }, + { + "epoch": 0.5245399839994435, + "grad_norm": 1.3203125, + "learning_rate": 0.0009691342823438403, + "loss": 0.8538, + "step": 7540 + }, + { + "epoch": 0.5246095516365786, + "grad_norm": 1.125, + "learning_rate": 0.0009689090574937823, + "loss": 0.6046, + "step": 7541 + }, + { + "epoch": 0.5246791192737139, + "grad_norm": 0.9921875, + "learning_rate": 0.0009686838342223654, + "loss": 0.8277, + "step": 7542 + }, + { + "epoch": 0.5247486869108491, + "grad_norm": 1.234375, + "learning_rate": 0.0009684586125410252, + "loss": 0.9067, + "step": 7543 + }, + { + "epoch": 0.5248182545479843, + "grad_norm": 1.0234375, + "learning_rate": 0.0009682333924611983, + "loss": 0.5895, + "step": 7544 + }, + { + "epoch": 0.5248878221851195, + "grad_norm": 0.70703125, + "learning_rate": 0.0009680081739943192, + "loss": 0.6634, + "step": 7545 + }, + { + "epoch": 0.5249573898222547, + "grad_norm": 1.0625, + "learning_rate": 0.0009677829571518237, + "loss": 0.684, + "step": 7546 + }, + { + "epoch": 0.5250269574593899, + "grad_norm": 1.0, + "learning_rate": 0.0009675577419451473, + "loss": 0.8417, + "step": 7547 + }, + { + "epoch": 0.525096525096525, + "grad_norm": 1.6171875, + "learning_rate": 0.0009673325283857256, + "loss": 0.9912, + "step": 7548 + }, + { + "epoch": 0.5251660927336603, + "grad_norm": 1.171875, + "learning_rate": 0.0009671073164849932, + "loss": 0.9259, + "step": 7549 + }, + { + "epoch": 0.5252356603707955, + "grad_norm": 1.125, + "learning_rate": 0.0009668821062543852, + "loss": 0.6655, + "step": 7550 + }, + { + "epoch": 0.5253052280079307, + "grad_norm": 0.95703125, + "learning_rate": 0.0009666568977053371, + "loss": 1.0478, + "step": 7551 + }, + { + "epoch": 0.525374795645066, + "grad_norm": 1.4140625, + "learning_rate": 0.000966431690849284, + "loss": 0.6806, + "step": 7552 + }, + { + "epoch": 0.5254443632822011, + "grad_norm": 1.21875, + "learning_rate": 0.0009662064856976601, + "loss": 0.8352, + "step": 7553 + }, + { + "epoch": 0.5255139309193363, + "grad_norm": 1.1875, + "learning_rate": 0.0009659812822619007, + "loss": 0.7596, + "step": 7554 + }, + { + "epoch": 0.5255834985564716, + "grad_norm": 1.34375, + "learning_rate": 0.0009657560805534405, + "loss": 0.9609, + "step": 7555 + }, + { + "epoch": 0.5256530661936067, + "grad_norm": 1.4375, + "learning_rate": 0.0009655308805837135, + "loss": 1.0609, + "step": 7556 + }, + { + "epoch": 0.5257226338307419, + "grad_norm": 0.87109375, + "learning_rate": 0.0009653056823641546, + "loss": 0.7534, + "step": 7557 + }, + { + "epoch": 0.5257922014678772, + "grad_norm": 1.0703125, + "learning_rate": 0.0009650804859061985, + "loss": 0.8217, + "step": 7558 + }, + { + "epoch": 0.5258617691050124, + "grad_norm": 1.234375, + "learning_rate": 0.0009648552912212795, + "loss": 0.8723, + "step": 7559 + }, + { + "epoch": 0.5259313367421475, + "grad_norm": 1.0546875, + "learning_rate": 0.0009646300983208314, + "loss": 0.7671, + "step": 7560 + }, + { + "epoch": 0.5260009043792827, + "grad_norm": 1.109375, + "learning_rate": 0.0009644049072162887, + "loss": 0.8247, + "step": 7561 + }, + { + "epoch": 0.526070472016418, + "grad_norm": 1.1953125, + "learning_rate": 0.0009641797179190856, + "loss": 0.937, + "step": 7562 + }, + { + "epoch": 0.5261400396535532, + "grad_norm": 1.359375, + "learning_rate": 0.0009639545304406557, + "loss": 0.9648, + "step": 7563 + }, + { + "epoch": 0.5262096072906883, + "grad_norm": 1.1640625, + "learning_rate": 0.0009637293447924329, + "loss": 0.9806, + "step": 7564 + }, + { + "epoch": 0.5262791749278236, + "grad_norm": 1.265625, + "learning_rate": 0.0009635041609858513, + "loss": 1.0049, + "step": 7565 + }, + { + "epoch": 0.5263487425649588, + "grad_norm": 0.984375, + "learning_rate": 0.0009632789790323446, + "loss": 0.6768, + "step": 7566 + }, + { + "epoch": 0.526418310202094, + "grad_norm": 1.078125, + "learning_rate": 0.000963053798943346, + "loss": 0.7095, + "step": 7567 + }, + { + "epoch": 0.5264878778392292, + "grad_norm": 1.421875, + "learning_rate": 0.0009628286207302893, + "loss": 0.8472, + "step": 7568 + }, + { + "epoch": 0.5265574454763644, + "grad_norm": 1.2578125, + "learning_rate": 0.0009626034444046082, + "loss": 1.0541, + "step": 7569 + }, + { + "epoch": 0.5266270131134996, + "grad_norm": 1.0, + "learning_rate": 0.0009623782699777354, + "loss": 0.7968, + "step": 7570 + }, + { + "epoch": 0.5266965807506349, + "grad_norm": 1.15625, + "learning_rate": 0.0009621530974611044, + "loss": 0.6698, + "step": 7571 + }, + { + "epoch": 0.52676614838777, + "grad_norm": 1.078125, + "learning_rate": 0.0009619279268661484, + "loss": 1.0106, + "step": 7572 + }, + { + "epoch": 0.5268357160249052, + "grad_norm": 1.1796875, + "learning_rate": 0.0009617027582043006, + "loss": 1.1062, + "step": 7573 + }, + { + "epoch": 0.5269052836620404, + "grad_norm": 1.1171875, + "learning_rate": 0.0009614775914869934, + "loss": 0.7848, + "step": 7574 + }, + { + "epoch": 0.5269748512991757, + "grad_norm": 1.15625, + "learning_rate": 0.00096125242672566, + "loss": 0.9292, + "step": 7575 + }, + { + "epoch": 0.5270444189363108, + "grad_norm": 1.1171875, + "learning_rate": 0.0009610272639317334, + "loss": 0.8652, + "step": 7576 + }, + { + "epoch": 0.527113986573446, + "grad_norm": 1.171875, + "learning_rate": 0.0009608021031166456, + "loss": 1.0807, + "step": 7577 + }, + { + "epoch": 0.5271835542105813, + "grad_norm": 1.140625, + "learning_rate": 0.0009605769442918293, + "loss": 0.9035, + "step": 7578 + }, + { + "epoch": 0.5272531218477164, + "grad_norm": 1.0703125, + "learning_rate": 0.0009603517874687172, + "loss": 0.6871, + "step": 7579 + }, + { + "epoch": 0.5273226894848516, + "grad_norm": 1.265625, + "learning_rate": 0.0009601266326587416, + "loss": 0.9676, + "step": 7580 + }, + { + "epoch": 0.5273922571219869, + "grad_norm": 1.28125, + "learning_rate": 0.0009599014798733344, + "loss": 0.8892, + "step": 7581 + }, + { + "epoch": 0.5274618247591221, + "grad_norm": 0.89453125, + "learning_rate": 0.0009596763291239281, + "loss": 0.6882, + "step": 7582 + }, + { + "epoch": 0.5275313923962572, + "grad_norm": 1.1484375, + "learning_rate": 0.0009594511804219548, + "loss": 0.8788, + "step": 7583 + }, + { + "epoch": 0.5276009600333925, + "grad_norm": 1.0859375, + "learning_rate": 0.0009592260337788459, + "loss": 1.0275, + "step": 7584 + }, + { + "epoch": 0.5276705276705277, + "grad_norm": 1.1953125, + "learning_rate": 0.0009590008892060332, + "loss": 0.7101, + "step": 7585 + }, + { + "epoch": 0.5277400953076629, + "grad_norm": 1.53125, + "learning_rate": 0.000958775746714949, + "loss": 0.7943, + "step": 7586 + }, + { + "epoch": 0.527809662944798, + "grad_norm": 1.265625, + "learning_rate": 0.0009585506063170249, + "loss": 0.9239, + "step": 7587 + }, + { + "epoch": 0.5278792305819333, + "grad_norm": 0.88671875, + "learning_rate": 0.0009583254680236915, + "loss": 0.6241, + "step": 7588 + }, + { + "epoch": 0.5279487982190685, + "grad_norm": 1.0234375, + "learning_rate": 0.000958100331846381, + "loss": 0.8665, + "step": 7589 + }, + { + "epoch": 0.5280183658562037, + "grad_norm": 1.078125, + "learning_rate": 0.0009578751977965246, + "loss": 0.6914, + "step": 7590 + }, + { + "epoch": 0.5280879334933389, + "grad_norm": 1.1640625, + "learning_rate": 0.0009576500658855535, + "loss": 0.7779, + "step": 7591 + }, + { + "epoch": 0.5281575011304741, + "grad_norm": 1.1796875, + "learning_rate": 0.0009574249361248981, + "loss": 0.9939, + "step": 7592 + }, + { + "epoch": 0.5282270687676093, + "grad_norm": 1.234375, + "learning_rate": 0.0009571998085259901, + "loss": 0.7421, + "step": 7593 + }, + { + "epoch": 0.5282966364047446, + "grad_norm": 0.9765625, + "learning_rate": 0.0009569746831002603, + "loss": 0.7919, + "step": 7594 + }, + { + "epoch": 0.5283662040418797, + "grad_norm": 0.9296875, + "learning_rate": 0.0009567495598591387, + "loss": 0.5853, + "step": 7595 + }, + { + "epoch": 0.5284357716790149, + "grad_norm": 1.1875, + "learning_rate": 0.0009565244388140569, + "loss": 0.7668, + "step": 7596 + }, + { + "epoch": 0.5285053393161502, + "grad_norm": 1.171875, + "learning_rate": 0.0009562993199764447, + "loss": 0.7021, + "step": 7597 + }, + { + "epoch": 0.5285749069532854, + "grad_norm": 0.9296875, + "learning_rate": 0.0009560742033577332, + "loss": 0.7556, + "step": 7598 + }, + { + "epoch": 0.5286444745904205, + "grad_norm": 1.1640625, + "learning_rate": 0.0009558490889693518, + "loss": 0.8799, + "step": 7599 + }, + { + "epoch": 0.5287140422275557, + "grad_norm": 1.34375, + "learning_rate": 0.0009556239768227312, + "loss": 0.9766, + "step": 7600 + }, + { + "epoch": 0.528783609864691, + "grad_norm": 1.0703125, + "learning_rate": 0.0009553988669293017, + "loss": 0.7397, + "step": 7601 + }, + { + "epoch": 0.5288531775018261, + "grad_norm": 0.9375, + "learning_rate": 0.0009551737593004926, + "loss": 0.6558, + "step": 7602 + }, + { + "epoch": 0.5289227451389613, + "grad_norm": 0.96875, + "learning_rate": 0.000954948653947734, + "loss": 0.8443, + "step": 7603 + }, + { + "epoch": 0.5289923127760966, + "grad_norm": 0.953125, + "learning_rate": 0.0009547235508824557, + "loss": 0.6416, + "step": 7604 + }, + { + "epoch": 0.5290618804132318, + "grad_norm": 1.1171875, + "learning_rate": 0.0009544984501160878, + "loss": 0.9741, + "step": 7605 + }, + { + "epoch": 0.5291314480503669, + "grad_norm": 0.97265625, + "learning_rate": 0.0009542733516600586, + "loss": 0.9167, + "step": 7606 + }, + { + "epoch": 0.5292010156875022, + "grad_norm": 1.140625, + "learning_rate": 0.0009540482555257983, + "loss": 1.1045, + "step": 7607 + }, + { + "epoch": 0.5292705833246374, + "grad_norm": 1.140625, + "learning_rate": 0.0009538231617247363, + "loss": 1.1215, + "step": 7608 + }, + { + "epoch": 0.5293401509617726, + "grad_norm": 1.1796875, + "learning_rate": 0.0009535980702683011, + "loss": 0.7904, + "step": 7609 + }, + { + "epoch": 0.5294097185989078, + "grad_norm": 0.74609375, + "learning_rate": 0.0009533729811679219, + "loss": 0.5266, + "step": 7610 + }, + { + "epoch": 0.529479286236043, + "grad_norm": 1.21875, + "learning_rate": 0.0009531478944350278, + "loss": 0.9714, + "step": 7611 + }, + { + "epoch": 0.5295488538731782, + "grad_norm": 1.0, + "learning_rate": 0.0009529228100810479, + "loss": 0.7552, + "step": 7612 + }, + { + "epoch": 0.5296184215103134, + "grad_norm": 1.140625, + "learning_rate": 0.0009526977281174098, + "loss": 0.7659, + "step": 7613 + }, + { + "epoch": 0.5296879891474486, + "grad_norm": 1.0, + "learning_rate": 0.0009524726485555428, + "loss": 0.7575, + "step": 7614 + }, + { + "epoch": 0.5297575567845838, + "grad_norm": 1.3125, + "learning_rate": 0.0009522475714068754, + "loss": 0.9411, + "step": 7615 + }, + { + "epoch": 0.529827124421719, + "grad_norm": 1.265625, + "learning_rate": 0.0009520224966828356, + "loss": 0.9846, + "step": 7616 + }, + { + "epoch": 0.5298966920588543, + "grad_norm": 1.0078125, + "learning_rate": 0.0009517974243948512, + "loss": 0.7321, + "step": 7617 + }, + { + "epoch": 0.5299662596959894, + "grad_norm": 1.03125, + "learning_rate": 0.0009515723545543509, + "loss": 0.7617, + "step": 7618 + }, + { + "epoch": 0.5300358273331246, + "grad_norm": 1.046875, + "learning_rate": 0.0009513472871727625, + "loss": 0.8585, + "step": 7619 + }, + { + "epoch": 0.5301053949702599, + "grad_norm": 1.1171875, + "learning_rate": 0.0009511222222615133, + "loss": 0.772, + "step": 7620 + }, + { + "epoch": 0.530174962607395, + "grad_norm": 1.359375, + "learning_rate": 0.0009508971598320315, + "loss": 1.0156, + "step": 7621 + }, + { + "epoch": 0.5302445302445302, + "grad_norm": 1.0703125, + "learning_rate": 0.0009506720998957443, + "loss": 0.8706, + "step": 7622 + }, + { + "epoch": 0.5303140978816655, + "grad_norm": 1.0390625, + "learning_rate": 0.0009504470424640797, + "loss": 0.7963, + "step": 7623 + }, + { + "epoch": 0.5303836655188007, + "grad_norm": 1.1171875, + "learning_rate": 0.0009502219875484639, + "loss": 0.7009, + "step": 7624 + }, + { + "epoch": 0.5304532331559358, + "grad_norm": 1.46875, + "learning_rate": 0.0009499969351603248, + "loss": 0.8547, + "step": 7625 + }, + { + "epoch": 0.530522800793071, + "grad_norm": 1.4375, + "learning_rate": 0.0009497718853110897, + "loss": 1.025, + "step": 7626 + }, + { + "epoch": 0.5305923684302063, + "grad_norm": 1.109375, + "learning_rate": 0.0009495468380121846, + "loss": 0.8212, + "step": 7627 + }, + { + "epoch": 0.5306619360673415, + "grad_norm": 1.2890625, + "learning_rate": 0.000949321793275037, + "loss": 1.0669, + "step": 7628 + }, + { + "epoch": 0.5307315037044766, + "grad_norm": 1.046875, + "learning_rate": 0.0009490967511110733, + "loss": 1.0157, + "step": 7629 + }, + { + "epoch": 0.5308010713416119, + "grad_norm": 1.234375, + "learning_rate": 0.0009488717115317202, + "loss": 0.9015, + "step": 7630 + }, + { + "epoch": 0.5308706389787471, + "grad_norm": 1.296875, + "learning_rate": 0.0009486466745484034, + "loss": 0.859, + "step": 7631 + }, + { + "epoch": 0.5309402066158823, + "grad_norm": 1.15625, + "learning_rate": 0.0009484216401725498, + "loss": 0.6554, + "step": 7632 + }, + { + "epoch": 0.5310097742530175, + "grad_norm": 1.15625, + "learning_rate": 0.0009481966084155857, + "loss": 0.8968, + "step": 7633 + }, + { + "epoch": 0.5310793418901527, + "grad_norm": 1.453125, + "learning_rate": 0.0009479715792889363, + "loss": 1.0967, + "step": 7634 + }, + { + "epoch": 0.5311489095272879, + "grad_norm": 1.1171875, + "learning_rate": 0.000947746552804028, + "loss": 0.7383, + "step": 7635 + }, + { + "epoch": 0.5312184771644232, + "grad_norm": 1.0625, + "learning_rate": 0.0009475215289722864, + "loss": 0.7721, + "step": 7636 + }, + { + "epoch": 0.5312880448015583, + "grad_norm": 0.9609375, + "learning_rate": 0.0009472965078051372, + "loss": 0.7587, + "step": 7637 + }, + { + "epoch": 0.5313576124386935, + "grad_norm": 1.015625, + "learning_rate": 0.0009470714893140053, + "loss": 0.7739, + "step": 7638 + }, + { + "epoch": 0.5314271800758287, + "grad_norm": 1.1953125, + "learning_rate": 0.0009468464735103166, + "loss": 0.949, + "step": 7639 + }, + { + "epoch": 0.531496747712964, + "grad_norm": 1.4765625, + "learning_rate": 0.0009466214604054962, + "loss": 0.8655, + "step": 7640 + }, + { + "epoch": 0.5315663153500991, + "grad_norm": 1.2734375, + "learning_rate": 0.0009463964500109685, + "loss": 0.8699, + "step": 7641 + }, + { + "epoch": 0.5316358829872343, + "grad_norm": 1.125, + "learning_rate": 0.0009461714423381595, + "loss": 0.931, + "step": 7642 + }, + { + "epoch": 0.5317054506243696, + "grad_norm": 1.265625, + "learning_rate": 0.0009459464373984931, + "loss": 1.0457, + "step": 7643 + }, + { + "epoch": 0.5317750182615048, + "grad_norm": 1.296875, + "learning_rate": 0.0009457214352033943, + "loss": 0.8442, + "step": 7644 + }, + { + "epoch": 0.5318445858986399, + "grad_norm": 1.1640625, + "learning_rate": 0.0009454964357642872, + "loss": 0.699, + "step": 7645 + }, + { + "epoch": 0.5319141535357752, + "grad_norm": 0.88671875, + "learning_rate": 0.0009452714390925964, + "loss": 0.8539, + "step": 7646 + }, + { + "epoch": 0.5319837211729104, + "grad_norm": 1.09375, + "learning_rate": 0.0009450464451997463, + "loss": 0.8525, + "step": 7647 + }, + { + "epoch": 0.5320532888100455, + "grad_norm": 1.125, + "learning_rate": 0.0009448214540971601, + "loss": 1.0032, + "step": 7648 + }, + { + "epoch": 0.5321228564471808, + "grad_norm": 0.9921875, + "learning_rate": 0.000944596465796263, + "loss": 0.8292, + "step": 7649 + }, + { + "epoch": 0.532192424084316, + "grad_norm": 0.8671875, + "learning_rate": 0.0009443714803084779, + "loss": 0.5984, + "step": 7650 + }, + { + "epoch": 0.5322619917214512, + "grad_norm": 1.0703125, + "learning_rate": 0.0009441464976452288, + "loss": 0.7322, + "step": 7651 + }, + { + "epoch": 0.5323315593585863, + "grad_norm": 1.140625, + "learning_rate": 0.0009439215178179388, + "loss": 0.8294, + "step": 7652 + }, + { + "epoch": 0.5324011269957216, + "grad_norm": 1.140625, + "learning_rate": 0.0009436965408380314, + "loss": 0.8577, + "step": 7653 + }, + { + "epoch": 0.5324706946328568, + "grad_norm": 0.84375, + "learning_rate": 0.0009434715667169303, + "loss": 0.6898, + "step": 7654 + }, + { + "epoch": 0.532540262269992, + "grad_norm": 1.1171875, + "learning_rate": 0.0009432465954660574, + "loss": 0.9053, + "step": 7655 + }, + { + "epoch": 0.5326098299071272, + "grad_norm": 1.09375, + "learning_rate": 0.0009430216270968371, + "loss": 0.673, + "step": 7656 + }, + { + "epoch": 0.5326793975442624, + "grad_norm": 0.87890625, + "learning_rate": 0.0009427966616206909, + "loss": 0.8517, + "step": 7657 + }, + { + "epoch": 0.5327489651813976, + "grad_norm": 1.1171875, + "learning_rate": 0.0009425716990490423, + "loss": 0.8912, + "step": 7658 + }, + { + "epoch": 0.5328185328185329, + "grad_norm": 0.83984375, + "learning_rate": 0.0009423467393933128, + "loss": 0.7184, + "step": 7659 + }, + { + "epoch": 0.532888100455668, + "grad_norm": 1.203125, + "learning_rate": 0.0009421217826649257, + "loss": 0.9315, + "step": 7660 + }, + { + "epoch": 0.5329576680928032, + "grad_norm": 0.85546875, + "learning_rate": 0.0009418968288753026, + "loss": 0.7263, + "step": 7661 + }, + { + "epoch": 0.5330272357299385, + "grad_norm": 0.98046875, + "learning_rate": 0.0009416718780358654, + "loss": 0.7388, + "step": 7662 + }, + { + "epoch": 0.5330968033670737, + "grad_norm": 1.265625, + "learning_rate": 0.0009414469301580368, + "loss": 0.9644, + "step": 7663 + }, + { + "epoch": 0.5331663710042088, + "grad_norm": 1.4375, + "learning_rate": 0.0009412219852532376, + "loss": 0.9148, + "step": 7664 + }, + { + "epoch": 0.533235938641344, + "grad_norm": 1.125, + "learning_rate": 0.0009409970433328902, + "loss": 0.777, + "step": 7665 + }, + { + "epoch": 0.5333055062784793, + "grad_norm": 1.1640625, + "learning_rate": 0.0009407721044084148, + "loss": 0.8419, + "step": 7666 + }, + { + "epoch": 0.5333750739156145, + "grad_norm": 1.1015625, + "learning_rate": 0.0009405471684912338, + "loss": 0.9105, + "step": 7667 + }, + { + "epoch": 0.5334446415527496, + "grad_norm": 1.1171875, + "learning_rate": 0.0009403222355927679, + "loss": 0.8591, + "step": 7668 + }, + { + "epoch": 0.5335142091898849, + "grad_norm": 1.015625, + "learning_rate": 0.0009400973057244378, + "loss": 0.7017, + "step": 7669 + }, + { + "epoch": 0.5335837768270201, + "grad_norm": 1.078125, + "learning_rate": 0.0009398723788976651, + "loss": 0.9882, + "step": 7670 + }, + { + "epoch": 0.5336533444641552, + "grad_norm": 0.890625, + "learning_rate": 0.0009396474551238696, + "loss": 0.6953, + "step": 7671 + }, + { + "epoch": 0.5337229121012905, + "grad_norm": 1.0390625, + "learning_rate": 0.0009394225344144725, + "loss": 0.9536, + "step": 7672 + }, + { + "epoch": 0.5337924797384257, + "grad_norm": 1.1328125, + "learning_rate": 0.000939197616780893, + "loss": 0.6909, + "step": 7673 + }, + { + "epoch": 0.5338620473755609, + "grad_norm": 1.2265625, + "learning_rate": 0.0009389727022345528, + "loss": 0.9209, + "step": 7674 + }, + { + "epoch": 0.5339316150126961, + "grad_norm": 1.1953125, + "learning_rate": 0.0009387477907868709, + "loss": 0.8242, + "step": 7675 + }, + { + "epoch": 0.5340011826498313, + "grad_norm": 1.25, + "learning_rate": 0.0009385228824492672, + "loss": 1.1437, + "step": 7676 + }, + { + "epoch": 0.5340707502869665, + "grad_norm": 0.91015625, + "learning_rate": 0.0009382979772331622, + "loss": 0.8774, + "step": 7677 + }, + { + "epoch": 0.5341403179241017, + "grad_norm": 1.234375, + "learning_rate": 0.0009380730751499747, + "loss": 0.8611, + "step": 7678 + }, + { + "epoch": 0.5342098855612369, + "grad_norm": 1.4140625, + "learning_rate": 0.0009378481762111244, + "loss": 0.8717, + "step": 7679 + }, + { + "epoch": 0.5342794531983721, + "grad_norm": 0.99609375, + "learning_rate": 0.0009376232804280298, + "loss": 0.8216, + "step": 7680 + }, + { + "epoch": 0.5343490208355073, + "grad_norm": 1.125, + "learning_rate": 0.0009373983878121113, + "loss": 0.8085, + "step": 7681 + }, + { + "epoch": 0.5344185884726426, + "grad_norm": 1.1953125, + "learning_rate": 0.000937173498374787, + "loss": 0.8056, + "step": 7682 + }, + { + "epoch": 0.5344881561097777, + "grad_norm": 1.1640625, + "learning_rate": 0.0009369486121274759, + "loss": 0.8503, + "step": 7683 + }, + { + "epoch": 0.5345577237469129, + "grad_norm": 1.09375, + "learning_rate": 0.0009367237290815961, + "loss": 0.8109, + "step": 7684 + }, + { + "epoch": 0.5346272913840482, + "grad_norm": 0.78125, + "learning_rate": 0.0009364988492485667, + "loss": 0.5855, + "step": 7685 + }, + { + "epoch": 0.5346968590211834, + "grad_norm": 1.296875, + "learning_rate": 0.0009362739726398058, + "loss": 1.041, + "step": 7686 + }, + { + "epoch": 0.5347664266583185, + "grad_norm": 1.0703125, + "learning_rate": 0.0009360490992667306, + "loss": 0.8213, + "step": 7687 + }, + { + "epoch": 0.5348359942954538, + "grad_norm": 1.234375, + "learning_rate": 0.0009358242291407604, + "loss": 1.0044, + "step": 7688 + }, + { + "epoch": 0.534905561932589, + "grad_norm": 0.97265625, + "learning_rate": 0.0009355993622733124, + "loss": 0.6749, + "step": 7689 + }, + { + "epoch": 0.5349751295697242, + "grad_norm": 1.0859375, + "learning_rate": 0.0009353744986758044, + "loss": 0.7504, + "step": 7690 + }, + { + "epoch": 0.5350446972068593, + "grad_norm": 1.1640625, + "learning_rate": 0.000935149638359653, + "loss": 0.9699, + "step": 7691 + }, + { + "epoch": 0.5351142648439946, + "grad_norm": 0.8828125, + "learning_rate": 0.0009349247813362764, + "loss": 0.7765, + "step": 7692 + }, + { + "epoch": 0.5351838324811298, + "grad_norm": 1.2109375, + "learning_rate": 0.0009346999276170914, + "loss": 1.1291, + "step": 7693 + }, + { + "epoch": 0.5352534001182649, + "grad_norm": 1.3671875, + "learning_rate": 0.0009344750772135148, + "loss": 0.8869, + "step": 7694 + }, + { + "epoch": 0.5353229677554002, + "grad_norm": 1.2109375, + "learning_rate": 0.0009342502301369637, + "loss": 0.8624, + "step": 7695 + }, + { + "epoch": 0.5353925353925354, + "grad_norm": 0.765625, + "learning_rate": 0.0009340253863988545, + "loss": 0.6484, + "step": 7696 + }, + { + "epoch": 0.5354621030296706, + "grad_norm": 1.109375, + "learning_rate": 0.000933800546010604, + "loss": 0.7179, + "step": 7697 + }, + { + "epoch": 0.5355316706668058, + "grad_norm": 1.1796875, + "learning_rate": 0.0009335757089836274, + "loss": 0.6628, + "step": 7698 + }, + { + "epoch": 0.535601238303941, + "grad_norm": 1.265625, + "learning_rate": 0.0009333508753293418, + "loss": 0.9152, + "step": 7699 + }, + { + "epoch": 0.5356708059410762, + "grad_norm": 1.328125, + "learning_rate": 0.0009331260450591627, + "loss": 1.0213, + "step": 7700 + }, + { + "epoch": 0.5357403735782115, + "grad_norm": 1.1640625, + "learning_rate": 0.0009329012181845059, + "loss": 0.7104, + "step": 7701 + }, + { + "epoch": 0.5358099412153466, + "grad_norm": 1.1171875, + "learning_rate": 0.0009326763947167875, + "loss": 0.9936, + "step": 7702 + }, + { + "epoch": 0.5358795088524818, + "grad_norm": 1.375, + "learning_rate": 0.0009324515746674221, + "loss": 0.8168, + "step": 7703 + }, + { + "epoch": 0.535949076489617, + "grad_norm": 0.98046875, + "learning_rate": 0.0009322267580478255, + "loss": 0.7997, + "step": 7704 + }, + { + "epoch": 0.5360186441267523, + "grad_norm": 1.359375, + "learning_rate": 0.0009320019448694121, + "loss": 0.8153, + "step": 7705 + }, + { + "epoch": 0.5360882117638874, + "grad_norm": 0.8828125, + "learning_rate": 0.0009317771351435975, + "loss": 1.037, + "step": 7706 + }, + { + "epoch": 0.5361577794010226, + "grad_norm": 1.2265625, + "learning_rate": 0.0009315523288817961, + "loss": 0.7164, + "step": 7707 + }, + { + "epoch": 0.5362273470381579, + "grad_norm": 1.2109375, + "learning_rate": 0.0009313275260954221, + "loss": 0.8784, + "step": 7708 + }, + { + "epoch": 0.5362969146752931, + "grad_norm": 1.109375, + "learning_rate": 0.0009311027267958908, + "loss": 0.6885, + "step": 7709 + }, + { + "epoch": 0.5363664823124282, + "grad_norm": 0.91015625, + "learning_rate": 0.0009308779309946155, + "loss": 0.5003, + "step": 7710 + }, + { + "epoch": 0.5364360499495635, + "grad_norm": 1.015625, + "learning_rate": 0.0009306531387030106, + "loss": 0.8354, + "step": 7711 + }, + { + "epoch": 0.5365056175866987, + "grad_norm": 1.1015625, + "learning_rate": 0.0009304283499324892, + "loss": 0.869, + "step": 7712 + }, + { + "epoch": 0.5365751852238339, + "grad_norm": 0.98828125, + "learning_rate": 0.0009302035646944661, + "loss": 0.8773, + "step": 7713 + }, + { + "epoch": 0.5366447528609691, + "grad_norm": 1.0859375, + "learning_rate": 0.000929978783000354, + "loss": 0.6557, + "step": 7714 + }, + { + "epoch": 0.5367143204981043, + "grad_norm": 0.9140625, + "learning_rate": 0.0009297540048615661, + "loss": 0.6123, + "step": 7715 + }, + { + "epoch": 0.5367838881352395, + "grad_norm": 1.03125, + "learning_rate": 0.0009295292302895163, + "loss": 0.9011, + "step": 7716 + }, + { + "epoch": 0.5368534557723746, + "grad_norm": 1.1640625, + "learning_rate": 0.0009293044592956167, + "loss": 0.7213, + "step": 7717 + }, + { + "epoch": 0.5369230234095099, + "grad_norm": 1.2109375, + "learning_rate": 0.0009290796918912806, + "loss": 0.7583, + "step": 7718 + }, + { + "epoch": 0.5369925910466451, + "grad_norm": 1.1953125, + "learning_rate": 0.0009288549280879196, + "loss": 1.0099, + "step": 7719 + }, + { + "epoch": 0.5370621586837803, + "grad_norm": 1.046875, + "learning_rate": 0.0009286301678969474, + "loss": 0.8666, + "step": 7720 + }, + { + "epoch": 0.5371317263209155, + "grad_norm": 1.3046875, + "learning_rate": 0.0009284054113297753, + "loss": 0.6667, + "step": 7721 + }, + { + "epoch": 0.5372012939580507, + "grad_norm": 1.1796875, + "learning_rate": 0.0009281806583978155, + "loss": 0.8413, + "step": 7722 + }, + { + "epoch": 0.5372708615951859, + "grad_norm": 1.0546875, + "learning_rate": 0.00092795590911248, + "loss": 0.7475, + "step": 7723 + }, + { + "epoch": 0.5373404292323212, + "grad_norm": 1.1484375, + "learning_rate": 0.0009277311634851803, + "loss": 1.0075, + "step": 7724 + }, + { + "epoch": 0.5374099968694563, + "grad_norm": 1.125, + "learning_rate": 0.0009275064215273278, + "loss": 0.7693, + "step": 7725 + }, + { + "epoch": 0.5374795645065915, + "grad_norm": 1.140625, + "learning_rate": 0.0009272816832503335, + "loss": 0.8396, + "step": 7726 + }, + { + "epoch": 0.5375491321437268, + "grad_norm": 1.28125, + "learning_rate": 0.0009270569486656095, + "loss": 0.9725, + "step": 7727 + }, + { + "epoch": 0.537618699780862, + "grad_norm": 1.3203125, + "learning_rate": 0.0009268322177845656, + "loss": 0.9665, + "step": 7728 + }, + { + "epoch": 0.5376882674179971, + "grad_norm": 1.40625, + "learning_rate": 0.0009266074906186125, + "loss": 0.9539, + "step": 7729 + }, + { + "epoch": 0.5377578350551323, + "grad_norm": 1.1953125, + "learning_rate": 0.0009263827671791619, + "loss": 0.9783, + "step": 7730 + }, + { + "epoch": 0.5378274026922676, + "grad_norm": 1.171875, + "learning_rate": 0.0009261580474776229, + "loss": 0.9241, + "step": 7731 + }, + { + "epoch": 0.5378969703294028, + "grad_norm": 1.1171875, + "learning_rate": 0.0009259333315254062, + "loss": 0.6221, + "step": 7732 + }, + { + "epoch": 0.5379665379665379, + "grad_norm": 0.9765625, + "learning_rate": 0.0009257086193339212, + "loss": 0.9105, + "step": 7733 + }, + { + "epoch": 0.5380361056036732, + "grad_norm": 1.1328125, + "learning_rate": 0.0009254839109145785, + "loss": 0.8122, + "step": 7734 + }, + { + "epoch": 0.5381056732408084, + "grad_norm": 0.9765625, + "learning_rate": 0.0009252592062787871, + "loss": 0.9319, + "step": 7735 + }, + { + "epoch": 0.5381752408779436, + "grad_norm": 1.015625, + "learning_rate": 0.0009250345054379562, + "loss": 0.6494, + "step": 7736 + }, + { + "epoch": 0.5382448085150788, + "grad_norm": 1.2734375, + "learning_rate": 0.0009248098084034957, + "loss": 0.9644, + "step": 7737 + }, + { + "epoch": 0.538314376152214, + "grad_norm": 1.078125, + "learning_rate": 0.000924585115186814, + "loss": 0.87, + "step": 7738 + }, + { + "epoch": 0.5383839437893492, + "grad_norm": 1.078125, + "learning_rate": 0.0009243604257993199, + "loss": 0.8682, + "step": 7739 + }, + { + "epoch": 0.5384535114264845, + "grad_norm": 1.1953125, + "learning_rate": 0.0009241357402524219, + "loss": 0.822, + "step": 7740 + }, + { + "epoch": 0.5385230790636196, + "grad_norm": 1.3203125, + "learning_rate": 0.0009239110585575292, + "loss": 0.666, + "step": 7741 + }, + { + "epoch": 0.5385926467007548, + "grad_norm": 0.859375, + "learning_rate": 0.0009236863807260493, + "loss": 0.8514, + "step": 7742 + }, + { + "epoch": 0.53866221433789, + "grad_norm": 0.9375, + "learning_rate": 0.0009234617067693899, + "loss": 0.7054, + "step": 7743 + }, + { + "epoch": 0.5387317819750252, + "grad_norm": 0.890625, + "learning_rate": 0.0009232370366989596, + "loss": 0.7784, + "step": 7744 + }, + { + "epoch": 0.5388013496121604, + "grad_norm": 0.9296875, + "learning_rate": 0.0009230123705261657, + "loss": 0.8373, + "step": 7745 + }, + { + "epoch": 0.5388709172492956, + "grad_norm": 1.390625, + "learning_rate": 0.0009227877082624155, + "loss": 0.9933, + "step": 7746 + }, + { + "epoch": 0.5389404848864309, + "grad_norm": 1.1171875, + "learning_rate": 0.0009225630499191161, + "loss": 1.0417, + "step": 7747 + }, + { + "epoch": 0.539010052523566, + "grad_norm": 0.9375, + "learning_rate": 0.0009223383955076752, + "loss": 0.7612, + "step": 7748 + }, + { + "epoch": 0.5390796201607012, + "grad_norm": 1.2578125, + "learning_rate": 0.0009221137450394987, + "loss": 0.7888, + "step": 7749 + }, + { + "epoch": 0.5391491877978365, + "grad_norm": 1.171875, + "learning_rate": 0.0009218890985259935, + "loss": 0.9712, + "step": 7750 + }, + { + "epoch": 0.5392187554349717, + "grad_norm": 1.125, + "learning_rate": 0.0009216644559785665, + "loss": 0.9912, + "step": 7751 + }, + { + "epoch": 0.5392883230721068, + "grad_norm": 0.92578125, + "learning_rate": 0.0009214398174086238, + "loss": 0.6542, + "step": 7752 + }, + { + "epoch": 0.5393578907092421, + "grad_norm": 1.078125, + "learning_rate": 0.0009212151828275709, + "loss": 0.8197, + "step": 7753 + }, + { + "epoch": 0.5394274583463773, + "grad_norm": 1.0703125, + "learning_rate": 0.0009209905522468137, + "loss": 0.9404, + "step": 7754 + }, + { + "epoch": 0.5394970259835125, + "grad_norm": 0.87109375, + "learning_rate": 0.0009207659256777586, + "loss": 0.64, + "step": 7755 + }, + { + "epoch": 0.5395665936206476, + "grad_norm": 1.09375, + "learning_rate": 0.00092054130313181, + "loss": 0.6587, + "step": 7756 + }, + { + "epoch": 0.5396361612577829, + "grad_norm": 1.09375, + "learning_rate": 0.0009203166846203739, + "loss": 0.8482, + "step": 7757 + }, + { + "epoch": 0.5397057288949181, + "grad_norm": 1.1640625, + "learning_rate": 0.0009200920701548541, + "loss": 0.8016, + "step": 7758 + }, + { + "epoch": 0.5397752965320533, + "grad_norm": 0.94140625, + "learning_rate": 0.000919867459746657, + "loss": 0.7776, + "step": 7759 + }, + { + "epoch": 0.5398448641691885, + "grad_norm": 1.5546875, + "learning_rate": 0.0009196428534071861, + "loss": 1.1175, + "step": 7760 + }, + { + "epoch": 0.5399144318063237, + "grad_norm": 1.2265625, + "learning_rate": 0.000919418251147846, + "loss": 0.7893, + "step": 7761 + }, + { + "epoch": 0.5399839994434589, + "grad_norm": 1.2109375, + "learning_rate": 0.0009191936529800412, + "loss": 0.6078, + "step": 7762 + }, + { + "epoch": 0.5400535670805942, + "grad_norm": 1.5078125, + "learning_rate": 0.0009189690589151752, + "loss": 0.689, + "step": 7763 + }, + { + "epoch": 0.5401231347177293, + "grad_norm": 0.88671875, + "learning_rate": 0.0009187444689646521, + "loss": 0.5132, + "step": 7764 + }, + { + "epoch": 0.5401927023548645, + "grad_norm": 1.21875, + "learning_rate": 0.000918519883139875, + "loss": 0.778, + "step": 7765 + }, + { + "epoch": 0.5402622699919997, + "grad_norm": 1.59375, + "learning_rate": 0.000918295301452248, + "loss": 1.479, + "step": 7766 + }, + { + "epoch": 0.540331837629135, + "grad_norm": 1.171875, + "learning_rate": 0.0009180707239131735, + "loss": 0.9314, + "step": 7767 + }, + { + "epoch": 0.5404014052662701, + "grad_norm": 1.046875, + "learning_rate": 0.0009178461505340546, + "loss": 0.8695, + "step": 7768 + }, + { + "epoch": 0.5404709729034053, + "grad_norm": 1.28125, + "learning_rate": 0.0009176215813262944, + "loss": 0.8224, + "step": 7769 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 1.25, + "learning_rate": 0.0009173970163012949, + "loss": 0.6287, + "step": 7770 + }, + { + "epoch": 0.5406101081776757, + "grad_norm": 1.296875, + "learning_rate": 0.0009171724554704586, + "loss": 1.0836, + "step": 7771 + }, + { + "epoch": 0.5406796758148109, + "grad_norm": 1.15625, + "learning_rate": 0.0009169478988451873, + "loss": 0.8674, + "step": 7772 + }, + { + "epoch": 0.5407492434519462, + "grad_norm": 1.171875, + "learning_rate": 0.0009167233464368835, + "loss": 0.6791, + "step": 7773 + }, + { + "epoch": 0.5408188110890814, + "grad_norm": 1.046875, + "learning_rate": 0.0009164987982569481, + "loss": 0.7213, + "step": 7774 + }, + { + "epoch": 0.5408883787262165, + "grad_norm": 1.03125, + "learning_rate": 0.0009162742543167828, + "loss": 0.6782, + "step": 7775 + }, + { + "epoch": 0.5409579463633518, + "grad_norm": 0.9296875, + "learning_rate": 0.000916049714627789, + "loss": 0.7157, + "step": 7776 + }, + { + "epoch": 0.541027514000487, + "grad_norm": 0.97265625, + "learning_rate": 0.0009158251792013677, + "loss": 0.8124, + "step": 7777 + }, + { + "epoch": 0.5410970816376222, + "grad_norm": 0.96875, + "learning_rate": 0.0009156006480489196, + "loss": 0.642, + "step": 7778 + }, + { + "epoch": 0.5411666492747573, + "grad_norm": 1.0234375, + "learning_rate": 0.0009153761211818447, + "loss": 0.5892, + "step": 7779 + }, + { + "epoch": 0.5412362169118926, + "grad_norm": 0.9765625, + "learning_rate": 0.0009151515986115442, + "loss": 0.6871, + "step": 7780 + }, + { + "epoch": 0.5413057845490278, + "grad_norm": 1.359375, + "learning_rate": 0.0009149270803494178, + "loss": 0.9627, + "step": 7781 + }, + { + "epoch": 0.541375352186163, + "grad_norm": 1.140625, + "learning_rate": 0.0009147025664068652, + "loss": 0.8162, + "step": 7782 + }, + { + "epoch": 0.5414449198232982, + "grad_norm": 1.0, + "learning_rate": 0.0009144780567952866, + "loss": 0.6913, + "step": 7783 + }, + { + "epoch": 0.5415144874604334, + "grad_norm": 1.265625, + "learning_rate": 0.0009142535515260814, + "loss": 0.9963, + "step": 7784 + }, + { + "epoch": 0.5415840550975686, + "grad_norm": 1.3984375, + "learning_rate": 0.0009140290506106485, + "loss": 1.0329, + "step": 7785 + }, + { + "epoch": 0.5416536227347039, + "grad_norm": 1.25, + "learning_rate": 0.0009138045540603868, + "loss": 0.8808, + "step": 7786 + }, + { + "epoch": 0.541723190371839, + "grad_norm": 0.7578125, + "learning_rate": 0.0009135800618866957, + "loss": 0.6811, + "step": 7787 + }, + { + "epoch": 0.5417927580089742, + "grad_norm": 1.046875, + "learning_rate": 0.0009133555741009735, + "loss": 0.8626, + "step": 7788 + }, + { + "epoch": 0.5418623256461095, + "grad_norm": 1.0, + "learning_rate": 0.0009131310907146181, + "loss": 0.8456, + "step": 7789 + }, + { + "epoch": 0.5419318932832446, + "grad_norm": 1.0, + "learning_rate": 0.0009129066117390284, + "loss": 0.7679, + "step": 7790 + }, + { + "epoch": 0.5420014609203798, + "grad_norm": 0.9609375, + "learning_rate": 0.0009126821371856021, + "loss": 0.7074, + "step": 7791 + }, + { + "epoch": 0.542071028557515, + "grad_norm": 1.046875, + "learning_rate": 0.0009124576670657366, + "loss": 0.8705, + "step": 7792 + }, + { + "epoch": 0.5421405961946503, + "grad_norm": 1.265625, + "learning_rate": 0.0009122332013908293, + "loss": 0.8058, + "step": 7793 + }, + { + "epoch": 0.5422101638317854, + "grad_norm": 0.96875, + "learning_rate": 0.0009120087401722782, + "loss": 0.7583, + "step": 7794 + }, + { + "epoch": 0.5422797314689206, + "grad_norm": 1.2109375, + "learning_rate": 0.0009117842834214793, + "loss": 0.781, + "step": 7795 + }, + { + "epoch": 0.5423492991060559, + "grad_norm": 1.265625, + "learning_rate": 0.0009115598311498299, + "loss": 0.8502, + "step": 7796 + }, + { + "epoch": 0.5424188667431911, + "grad_norm": 1.1796875, + "learning_rate": 0.0009113353833687266, + "loss": 0.7159, + "step": 7797 + }, + { + "epoch": 0.5424884343803262, + "grad_norm": 0.90625, + "learning_rate": 0.0009111109400895659, + "loss": 0.5067, + "step": 7798 + }, + { + "epoch": 0.5425580020174615, + "grad_norm": 1.375, + "learning_rate": 0.0009108865013237433, + "loss": 0.7159, + "step": 7799 + }, + { + "epoch": 0.5426275696545967, + "grad_norm": 0.85546875, + "learning_rate": 0.0009106620670826548, + "loss": 0.6929, + "step": 7800 + }, + { + "epoch": 0.5426971372917319, + "grad_norm": 1.90625, + "learning_rate": 0.0009104376373776967, + "loss": 0.7217, + "step": 7801 + }, + { + "epoch": 0.5427667049288671, + "grad_norm": 1.109375, + "learning_rate": 0.0009102132122202638, + "loss": 0.8091, + "step": 7802 + }, + { + "epoch": 0.5428362725660023, + "grad_norm": 1.6171875, + "learning_rate": 0.000909988791621751, + "loss": 0.9184, + "step": 7803 + }, + { + "epoch": 0.5429058402031375, + "grad_norm": 1.1015625, + "learning_rate": 0.0009097643755935541, + "loss": 0.9267, + "step": 7804 + }, + { + "epoch": 0.5429754078402727, + "grad_norm": 1.1328125, + "learning_rate": 0.0009095399641470675, + "loss": 0.8922, + "step": 7805 + }, + { + "epoch": 0.5430449754774079, + "grad_norm": 0.98828125, + "learning_rate": 0.0009093155572936854, + "loss": 0.886, + "step": 7806 + }, + { + "epoch": 0.5431145431145431, + "grad_norm": 1.0390625, + "learning_rate": 0.000909091155044802, + "loss": 0.8428, + "step": 7807 + }, + { + "epoch": 0.5431841107516783, + "grad_norm": 0.94140625, + "learning_rate": 0.0009088667574118119, + "loss": 0.8639, + "step": 7808 + }, + { + "epoch": 0.5432536783888136, + "grad_norm": 1.1484375, + "learning_rate": 0.0009086423644061083, + "loss": 0.7444, + "step": 7809 + }, + { + "epoch": 0.5433232460259487, + "grad_norm": 1.015625, + "learning_rate": 0.0009084179760390849, + "loss": 0.7876, + "step": 7810 + }, + { + "epoch": 0.5433928136630839, + "grad_norm": 1.0078125, + "learning_rate": 0.0009081935923221352, + "loss": 0.4759, + "step": 7811 + }, + { + "epoch": 0.5434623813002192, + "grad_norm": 1.1953125, + "learning_rate": 0.0009079692132666523, + "loss": 0.6489, + "step": 7812 + }, + { + "epoch": 0.5435319489373543, + "grad_norm": 0.94140625, + "learning_rate": 0.0009077448388840286, + "loss": 0.6377, + "step": 7813 + }, + { + "epoch": 0.5436015165744895, + "grad_norm": 1.1015625, + "learning_rate": 0.0009075204691856569, + "loss": 0.9348, + "step": 7814 + }, + { + "epoch": 0.5436710842116248, + "grad_norm": 0.921875, + "learning_rate": 0.0009072961041829299, + "loss": 0.7295, + "step": 7815 + }, + { + "epoch": 0.54374065184876, + "grad_norm": 0.93359375, + "learning_rate": 0.0009070717438872395, + "loss": 0.6259, + "step": 7816 + }, + { + "epoch": 0.5438102194858951, + "grad_norm": 1.1875, + "learning_rate": 0.0009068473883099773, + "loss": 0.7253, + "step": 7817 + }, + { + "epoch": 0.5438797871230303, + "grad_norm": 1.203125, + "learning_rate": 0.0009066230374625353, + "loss": 0.6697, + "step": 7818 + }, + { + "epoch": 0.5439493547601656, + "grad_norm": 1.03125, + "learning_rate": 0.000906398691356305, + "loss": 0.6115, + "step": 7819 + }, + { + "epoch": 0.5440189223973008, + "grad_norm": 1.2734375, + "learning_rate": 0.0009061743500026773, + "loss": 0.9162, + "step": 7820 + }, + { + "epoch": 0.5440884900344359, + "grad_norm": 1.1484375, + "learning_rate": 0.0009059500134130428, + "loss": 1.1168, + "step": 7821 + }, + { + "epoch": 0.5441580576715712, + "grad_norm": 1.296875, + "learning_rate": 0.0009057256815987928, + "loss": 0.9183, + "step": 7822 + }, + { + "epoch": 0.5442276253087064, + "grad_norm": 1.1015625, + "learning_rate": 0.0009055013545713179, + "loss": 0.9001, + "step": 7823 + }, + { + "epoch": 0.5442971929458416, + "grad_norm": 1.3828125, + "learning_rate": 0.0009052770323420074, + "loss": 0.9196, + "step": 7824 + }, + { + "epoch": 0.5443667605829768, + "grad_norm": 1.0859375, + "learning_rate": 0.000905052714922252, + "loss": 0.8726, + "step": 7825 + }, + { + "epoch": 0.544436328220112, + "grad_norm": 1.03125, + "learning_rate": 0.0009048284023234413, + "loss": 0.558, + "step": 7826 + }, + { + "epoch": 0.5445058958572472, + "grad_norm": 1.1484375, + "learning_rate": 0.0009046040945569644, + "loss": 0.915, + "step": 7827 + }, + { + "epoch": 0.5445754634943825, + "grad_norm": 1.28125, + "learning_rate": 0.0009043797916342106, + "loss": 0.9056, + "step": 7828 + }, + { + "epoch": 0.5446450311315176, + "grad_norm": 1.2890625, + "learning_rate": 0.0009041554935665691, + "loss": 0.8979, + "step": 7829 + }, + { + "epoch": 0.5447145987686528, + "grad_norm": 1.15625, + "learning_rate": 0.000903931200365429, + "loss": 0.8202, + "step": 7830 + }, + { + "epoch": 0.544784166405788, + "grad_norm": 1.1796875, + "learning_rate": 0.0009037069120421777, + "loss": 0.8047, + "step": 7831 + }, + { + "epoch": 0.5448537340429233, + "grad_norm": 1.0546875, + "learning_rate": 0.0009034826286082043, + "loss": 0.6387, + "step": 7832 + }, + { + "epoch": 0.5449233016800584, + "grad_norm": 1.2890625, + "learning_rate": 0.0009032583500748968, + "loss": 0.8866, + "step": 7833 + }, + { + "epoch": 0.5449928693171936, + "grad_norm": 1.171875, + "learning_rate": 0.0009030340764536424, + "loss": 0.995, + "step": 7834 + }, + { + "epoch": 0.5450624369543289, + "grad_norm": 1.2421875, + "learning_rate": 0.0009028098077558287, + "loss": 0.76, + "step": 7835 + }, + { + "epoch": 0.545132004591464, + "grad_norm": 1.1796875, + "learning_rate": 0.0009025855439928433, + "loss": 0.7401, + "step": 7836 + }, + { + "epoch": 0.5452015722285992, + "grad_norm": 1.0625, + "learning_rate": 0.0009023612851760731, + "loss": 0.8346, + "step": 7837 + }, + { + "epoch": 0.5452711398657345, + "grad_norm": 0.85546875, + "learning_rate": 0.0009021370313169046, + "loss": 0.596, + "step": 7838 + }, + { + "epoch": 0.5453407075028697, + "grad_norm": 0.84765625, + "learning_rate": 0.0009019127824267242, + "loss": 0.6525, + "step": 7839 + }, + { + "epoch": 0.5454102751400048, + "grad_norm": 1.046875, + "learning_rate": 0.0009016885385169185, + "loss": 1.0252, + "step": 7840 + }, + { + "epoch": 0.5454798427771401, + "grad_norm": 1.234375, + "learning_rate": 0.0009014642995988733, + "loss": 0.801, + "step": 7841 + }, + { + "epoch": 0.5455494104142753, + "grad_norm": 0.88671875, + "learning_rate": 0.000901240065683974, + "loss": 0.5813, + "step": 7842 + }, + { + "epoch": 0.5456189780514105, + "grad_norm": 1.0703125, + "learning_rate": 0.0009010158367836066, + "loss": 0.8475, + "step": 7843 + }, + { + "epoch": 0.5456885456885456, + "grad_norm": 1.2578125, + "learning_rate": 0.0009007916129091563, + "loss": 0.8293, + "step": 7844 + }, + { + "epoch": 0.5457581133256809, + "grad_norm": 0.86328125, + "learning_rate": 0.0009005673940720077, + "loss": 0.765, + "step": 7845 + }, + { + "epoch": 0.5458276809628161, + "grad_norm": 1.0546875, + "learning_rate": 0.0009003431802835454, + "loss": 0.6574, + "step": 7846 + }, + { + "epoch": 0.5458972485999513, + "grad_norm": 1.28125, + "learning_rate": 0.0009001189715551544, + "loss": 1.1601, + "step": 7847 + }, + { + "epoch": 0.5459668162370865, + "grad_norm": 1.2890625, + "learning_rate": 0.0008998947678982187, + "loss": 0.5843, + "step": 7848 + }, + { + "epoch": 0.5460363838742217, + "grad_norm": 1.046875, + "learning_rate": 0.0008996705693241216, + "loss": 0.723, + "step": 7849 + }, + { + "epoch": 0.5461059515113569, + "grad_norm": 1.3046875, + "learning_rate": 0.0008994463758442476, + "loss": 0.848, + "step": 7850 + }, + { + "epoch": 0.5461755191484922, + "grad_norm": 1.359375, + "learning_rate": 0.0008992221874699801, + "loss": 1.0163, + "step": 7851 + }, + { + "epoch": 0.5462450867856273, + "grad_norm": 1.09375, + "learning_rate": 0.0008989980042127016, + "loss": 0.6162, + "step": 7852 + }, + { + "epoch": 0.5463146544227625, + "grad_norm": 1.234375, + "learning_rate": 0.0008987738260837952, + "loss": 0.8456, + "step": 7853 + }, + { + "epoch": 0.5463842220598978, + "grad_norm": 1.203125, + "learning_rate": 0.000898549653094644, + "loss": 0.8359, + "step": 7854 + }, + { + "epoch": 0.546453789697033, + "grad_norm": 1.2421875, + "learning_rate": 0.0008983254852566303, + "loss": 0.8284, + "step": 7855 + }, + { + "epoch": 0.5465233573341681, + "grad_norm": 0.91015625, + "learning_rate": 0.0008981013225811354, + "loss": 0.6926, + "step": 7856 + }, + { + "epoch": 0.5465929249713033, + "grad_norm": 1.4765625, + "learning_rate": 0.000897877165079542, + "loss": 0.8208, + "step": 7857 + }, + { + "epoch": 0.5466624926084386, + "grad_norm": 1.015625, + "learning_rate": 0.0008976530127632317, + "loss": 0.7933, + "step": 7858 + }, + { + "epoch": 0.5467320602455737, + "grad_norm": 1.046875, + "learning_rate": 0.0008974288656435852, + "loss": 0.7539, + "step": 7859 + }, + { + "epoch": 0.5468016278827089, + "grad_norm": 1.2265625, + "learning_rate": 0.0008972047237319838, + "loss": 0.7873, + "step": 7860 + }, + { + "epoch": 0.5468711955198442, + "grad_norm": 1.140625, + "learning_rate": 0.0008969805870398086, + "loss": 0.8811, + "step": 7861 + }, + { + "epoch": 0.5469407631569794, + "grad_norm": 1.3359375, + "learning_rate": 0.0008967564555784401, + "loss": 0.9331, + "step": 7862 + }, + { + "epoch": 0.5470103307941145, + "grad_norm": 1.234375, + "learning_rate": 0.000896532329359258, + "loss": 0.8192, + "step": 7863 + }, + { + "epoch": 0.5470798984312498, + "grad_norm": 1.203125, + "learning_rate": 0.0008963082083936429, + "loss": 0.9081, + "step": 7864 + }, + { + "epoch": 0.547149466068385, + "grad_norm": 1.375, + "learning_rate": 0.0008960840926929745, + "loss": 1.0103, + "step": 7865 + }, + { + "epoch": 0.5472190337055202, + "grad_norm": 1.1328125, + "learning_rate": 0.0008958599822686319, + "loss": 0.8191, + "step": 7866 + }, + { + "epoch": 0.5472886013426554, + "grad_norm": 1.1484375, + "learning_rate": 0.0008956358771319943, + "loss": 0.755, + "step": 7867 + }, + { + "epoch": 0.5473581689797906, + "grad_norm": 1.0, + "learning_rate": 0.0008954117772944412, + "loss": 0.8025, + "step": 7868 + }, + { + "epoch": 0.5474277366169258, + "grad_norm": 0.80078125, + "learning_rate": 0.000895187682767351, + "loss": 0.7683, + "step": 7869 + }, + { + "epoch": 0.547497304254061, + "grad_norm": 1.2890625, + "learning_rate": 0.0008949635935621014, + "loss": 0.9424, + "step": 7870 + }, + { + "epoch": 0.5475668718911962, + "grad_norm": 1.015625, + "learning_rate": 0.0008947395096900715, + "loss": 0.9196, + "step": 7871 + }, + { + "epoch": 0.5476364395283314, + "grad_norm": 0.95703125, + "learning_rate": 0.0008945154311626389, + "loss": 0.7907, + "step": 7872 + }, + { + "epoch": 0.5477060071654666, + "grad_norm": 1.234375, + "learning_rate": 0.0008942913579911808, + "loss": 0.8883, + "step": 7873 + }, + { + "epoch": 0.5477755748026019, + "grad_norm": 1.15625, + "learning_rate": 0.0008940672901870745, + "loss": 0.7396, + "step": 7874 + }, + { + "epoch": 0.547845142439737, + "grad_norm": 1.15625, + "learning_rate": 0.0008938432277616975, + "loss": 0.7292, + "step": 7875 + }, + { + "epoch": 0.5479147100768722, + "grad_norm": 1.15625, + "learning_rate": 0.0008936191707264265, + "loss": 0.814, + "step": 7876 + }, + { + "epoch": 0.5479842777140075, + "grad_norm": 1.0234375, + "learning_rate": 0.0008933951190926374, + "loss": 0.8076, + "step": 7877 + }, + { + "epoch": 0.5480538453511427, + "grad_norm": 1.328125, + "learning_rate": 0.000893171072871707, + "loss": 0.8851, + "step": 7878 + }, + { + "epoch": 0.5481234129882778, + "grad_norm": 1.0625, + "learning_rate": 0.0008929470320750114, + "loss": 1.0926, + "step": 7879 + }, + { + "epoch": 0.5481929806254131, + "grad_norm": 1.515625, + "learning_rate": 0.0008927229967139256, + "loss": 1.0434, + "step": 7880 + }, + { + "epoch": 0.5482625482625483, + "grad_norm": 1.2734375, + "learning_rate": 0.0008924989667998251, + "loss": 0.9311, + "step": 7881 + }, + { + "epoch": 0.5483321158996834, + "grad_norm": 1.015625, + "learning_rate": 0.0008922749423440854, + "loss": 0.8518, + "step": 7882 + }, + { + "epoch": 0.5484016835368186, + "grad_norm": 1.3828125, + "learning_rate": 0.0008920509233580814, + "loss": 0.8492, + "step": 7883 + }, + { + "epoch": 0.5484712511739539, + "grad_norm": 1.03125, + "learning_rate": 0.0008918269098531871, + "loss": 0.8881, + "step": 7884 + }, + { + "epoch": 0.5485408188110891, + "grad_norm": 1.1015625, + "learning_rate": 0.0008916029018407772, + "loss": 0.8287, + "step": 7885 + }, + { + "epoch": 0.5486103864482242, + "grad_norm": 1.2265625, + "learning_rate": 0.0008913788993322256, + "loss": 0.7321, + "step": 7886 + }, + { + "epoch": 0.5486799540853595, + "grad_norm": 1.1328125, + "learning_rate": 0.0008911549023389063, + "loss": 0.9308, + "step": 7887 + }, + { + "epoch": 0.5487495217224947, + "grad_norm": 0.98828125, + "learning_rate": 0.0008909309108721918, + "loss": 0.6023, + "step": 7888 + }, + { + "epoch": 0.5488190893596299, + "grad_norm": 1.46875, + "learning_rate": 0.0008907069249434563, + "loss": 0.9637, + "step": 7889 + }, + { + "epoch": 0.5488886569967651, + "grad_norm": 1.0234375, + "learning_rate": 0.0008904829445640724, + "loss": 0.7305, + "step": 7890 + }, + { + "epoch": 0.5489582246339003, + "grad_norm": 1.2421875, + "learning_rate": 0.0008902589697454122, + "loss": 0.9317, + "step": 7891 + }, + { + "epoch": 0.5490277922710355, + "grad_norm": 0.875, + "learning_rate": 0.0008900350004988484, + "loss": 0.66, + "step": 7892 + }, + { + "epoch": 0.5490973599081708, + "grad_norm": 1.359375, + "learning_rate": 0.0008898110368357533, + "loss": 0.9249, + "step": 7893 + }, + { + "epoch": 0.5491669275453059, + "grad_norm": 0.73828125, + "learning_rate": 0.0008895870787674984, + "loss": 0.6496, + "step": 7894 + }, + { + "epoch": 0.5492364951824411, + "grad_norm": 1.2578125, + "learning_rate": 0.0008893631263054547, + "loss": 0.9521, + "step": 7895 + }, + { + "epoch": 0.5493060628195763, + "grad_norm": 1.171875, + "learning_rate": 0.0008891391794609941, + "loss": 0.9686, + "step": 7896 + }, + { + "epoch": 0.5493756304567116, + "grad_norm": 0.9375, + "learning_rate": 0.0008889152382454872, + "loss": 0.7132, + "step": 7897 + }, + { + "epoch": 0.5494451980938467, + "grad_norm": 1.25, + "learning_rate": 0.0008886913026703042, + "loss": 0.5568, + "step": 7898 + }, + { + "epoch": 0.5495147657309819, + "grad_norm": 1.0390625, + "learning_rate": 0.0008884673727468164, + "loss": 0.8192, + "step": 7899 + }, + { + "epoch": 0.5495843333681172, + "grad_norm": 1.203125, + "learning_rate": 0.0008882434484863928, + "loss": 0.7144, + "step": 7900 + }, + { + "epoch": 0.5496539010052524, + "grad_norm": 1.296875, + "learning_rate": 0.000888019529900404, + "loss": 0.9487, + "step": 7901 + }, + { + "epoch": 0.5497234686423875, + "grad_norm": 0.953125, + "learning_rate": 0.0008877956170002186, + "loss": 0.5608, + "step": 7902 + }, + { + "epoch": 0.5497930362795228, + "grad_norm": 1.25, + "learning_rate": 0.0008875717097972064, + "loss": 0.7444, + "step": 7903 + }, + { + "epoch": 0.549862603916658, + "grad_norm": 1.0703125, + "learning_rate": 0.0008873478083027364, + "loss": 0.5599, + "step": 7904 + }, + { + "epoch": 0.5499321715537931, + "grad_norm": 0.921875, + "learning_rate": 0.0008871239125281761, + "loss": 0.7114, + "step": 7905 + }, + { + "epoch": 0.5500017391909284, + "grad_norm": 1.1953125, + "learning_rate": 0.0008869000224848954, + "loss": 0.9034, + "step": 7906 + }, + { + "epoch": 0.5500713068280636, + "grad_norm": 0.8828125, + "learning_rate": 0.0008866761381842612, + "loss": 0.7924, + "step": 7907 + }, + { + "epoch": 0.5501408744651988, + "grad_norm": 1.1328125, + "learning_rate": 0.0008864522596376416, + "loss": 0.585, + "step": 7908 + }, + { + "epoch": 0.5502104421023339, + "grad_norm": 1.3203125, + "learning_rate": 0.0008862283868564038, + "loss": 0.9106, + "step": 7909 + }, + { + "epoch": 0.5502800097394692, + "grad_norm": 0.91796875, + "learning_rate": 0.000886004519851915, + "loss": 0.8078, + "step": 7910 + }, + { + "epoch": 0.5503495773766044, + "grad_norm": 1.0625, + "learning_rate": 0.0008857806586355423, + "loss": 0.7807, + "step": 7911 + }, + { + "epoch": 0.5504191450137396, + "grad_norm": 1.125, + "learning_rate": 0.0008855568032186517, + "loss": 0.6597, + "step": 7912 + }, + { + "epoch": 0.5504887126508748, + "grad_norm": 1.3515625, + "learning_rate": 0.0008853329536126102, + "loss": 1.0829, + "step": 7913 + }, + { + "epoch": 0.55055828028801, + "grad_norm": 1.3046875, + "learning_rate": 0.0008851091098287831, + "loss": 0.8936, + "step": 7914 + }, + { + "epoch": 0.5506278479251452, + "grad_norm": 1.046875, + "learning_rate": 0.0008848852718785366, + "loss": 0.7169, + "step": 7915 + }, + { + "epoch": 0.5506974155622805, + "grad_norm": 0.87890625, + "learning_rate": 0.0008846614397732354, + "loss": 0.6306, + "step": 7916 + }, + { + "epoch": 0.5507669831994156, + "grad_norm": 1.1171875, + "learning_rate": 0.0008844376135242451, + "loss": 0.7502, + "step": 7917 + }, + { + "epoch": 0.5508365508365508, + "grad_norm": 1.1953125, + "learning_rate": 0.0008842137931429303, + "loss": 0.8823, + "step": 7918 + }, + { + "epoch": 0.5509061184736861, + "grad_norm": 1.2421875, + "learning_rate": 0.0008839899786406558, + "loss": 0.8811, + "step": 7919 + }, + { + "epoch": 0.5509756861108213, + "grad_norm": 1.0234375, + "learning_rate": 0.0008837661700287849, + "loss": 0.9098, + "step": 7920 + }, + { + "epoch": 0.5510452537479564, + "grad_norm": 1.390625, + "learning_rate": 0.0008835423673186822, + "loss": 0.805, + "step": 7921 + }, + { + "epoch": 0.5511148213850916, + "grad_norm": 1.15625, + "learning_rate": 0.0008833185705217114, + "loss": 0.8195, + "step": 7922 + }, + { + "epoch": 0.5511843890222269, + "grad_norm": 1.171875, + "learning_rate": 0.000883094779649235, + "loss": 0.9827, + "step": 7923 + }, + { + "epoch": 0.551253956659362, + "grad_norm": 1.1171875, + "learning_rate": 0.0008828709947126166, + "loss": 0.648, + "step": 7924 + }, + { + "epoch": 0.5513235242964972, + "grad_norm": 1.203125, + "learning_rate": 0.0008826472157232188, + "loss": 0.8791, + "step": 7925 + }, + { + "epoch": 0.5513930919336325, + "grad_norm": 1.2265625, + "learning_rate": 0.0008824234426924041, + "loss": 0.9768, + "step": 7926 + }, + { + "epoch": 0.5514626595707677, + "grad_norm": 1.15625, + "learning_rate": 0.0008821996756315341, + "loss": 0.6314, + "step": 7927 + }, + { + "epoch": 0.5515322272079028, + "grad_norm": 1.0546875, + "learning_rate": 0.0008819759145519707, + "loss": 0.7367, + "step": 7928 + }, + { + "epoch": 0.5516017948450381, + "grad_norm": 1.0234375, + "learning_rate": 0.0008817521594650759, + "loss": 0.8871, + "step": 7929 + }, + { + "epoch": 0.5516713624821733, + "grad_norm": 1.3828125, + "learning_rate": 0.0008815284103822097, + "loss": 0.9596, + "step": 7930 + }, + { + "epoch": 0.5517409301193085, + "grad_norm": 1.5234375, + "learning_rate": 0.0008813046673147344, + "loss": 0.9696, + "step": 7931 + }, + { + "epoch": 0.5518104977564438, + "grad_norm": 0.9921875, + "learning_rate": 0.0008810809302740095, + "loss": 0.9666, + "step": 7932 + }, + { + "epoch": 0.5518800653935789, + "grad_norm": 0.94140625, + "learning_rate": 0.0008808571992713958, + "loss": 0.8094, + "step": 7933 + }, + { + "epoch": 0.5519496330307141, + "grad_norm": 1.203125, + "learning_rate": 0.0008806334743182526, + "loss": 0.8582, + "step": 7934 + }, + { + "epoch": 0.5520192006678493, + "grad_norm": 1.109375, + "learning_rate": 0.0008804097554259402, + "loss": 1.0506, + "step": 7935 + }, + { + "epoch": 0.5520887683049845, + "grad_norm": 1.1640625, + "learning_rate": 0.0008801860426058177, + "loss": 0.9523, + "step": 7936 + }, + { + "epoch": 0.5521583359421197, + "grad_norm": 1.2890625, + "learning_rate": 0.0008799623358692434, + "loss": 1.1695, + "step": 7937 + }, + { + "epoch": 0.5522279035792549, + "grad_norm": 1.25, + "learning_rate": 0.0008797386352275775, + "loss": 0.9633, + "step": 7938 + }, + { + "epoch": 0.5522974712163902, + "grad_norm": 1.140625, + "learning_rate": 0.0008795149406921772, + "loss": 0.7446, + "step": 7939 + }, + { + "epoch": 0.5523670388535253, + "grad_norm": 1.0625, + "learning_rate": 0.0008792912522744011, + "loss": 0.9321, + "step": 7940 + }, + { + "epoch": 0.5524366064906605, + "grad_norm": 1.4921875, + "learning_rate": 0.0008790675699856064, + "loss": 0.8671, + "step": 7941 + }, + { + "epoch": 0.5525061741277958, + "grad_norm": 1.0625, + "learning_rate": 0.0008788438938371512, + "loss": 0.7517, + "step": 7942 + }, + { + "epoch": 0.552575741764931, + "grad_norm": 0.93359375, + "learning_rate": 0.0008786202238403926, + "loss": 0.6515, + "step": 7943 + }, + { + "epoch": 0.5526453094020661, + "grad_norm": 1.0546875, + "learning_rate": 0.0008783965600066866, + "loss": 0.7545, + "step": 7944 + }, + { + "epoch": 0.5527148770392014, + "grad_norm": 1.125, + "learning_rate": 0.000878172902347391, + "loss": 0.7001, + "step": 7945 + }, + { + "epoch": 0.5527844446763366, + "grad_norm": 1.109375, + "learning_rate": 0.0008779492508738611, + "loss": 0.9313, + "step": 7946 + }, + { + "epoch": 0.5528540123134718, + "grad_norm": 1.125, + "learning_rate": 0.0008777256055974533, + "loss": 0.82, + "step": 7947 + }, + { + "epoch": 0.5529235799506069, + "grad_norm": 1.1171875, + "learning_rate": 0.0008775019665295225, + "loss": 0.8616, + "step": 7948 + }, + { + "epoch": 0.5529931475877422, + "grad_norm": 1.171875, + "learning_rate": 0.0008772783336814246, + "loss": 0.8757, + "step": 7949 + }, + { + "epoch": 0.5530627152248774, + "grad_norm": 1.1015625, + "learning_rate": 0.0008770547070645145, + "loss": 0.9859, + "step": 7950 + }, + { + "epoch": 0.5531322828620125, + "grad_norm": 0.8671875, + "learning_rate": 0.0008768310866901463, + "loss": 0.7146, + "step": 7951 + }, + { + "epoch": 0.5532018504991478, + "grad_norm": 0.875, + "learning_rate": 0.0008766074725696752, + "loss": 0.7746, + "step": 7952 + }, + { + "epoch": 0.553271418136283, + "grad_norm": 1.2578125, + "learning_rate": 0.0008763838647144544, + "loss": 0.822, + "step": 7953 + }, + { + "epoch": 0.5533409857734182, + "grad_norm": 1.046875, + "learning_rate": 0.0008761602631358382, + "loss": 0.692, + "step": 7954 + }, + { + "epoch": 0.5534105534105535, + "grad_norm": 1.015625, + "learning_rate": 0.0008759366678451792, + "loss": 0.6741, + "step": 7955 + }, + { + "epoch": 0.5534801210476886, + "grad_norm": 1.0234375, + "learning_rate": 0.0008757130788538311, + "loss": 0.6768, + "step": 7956 + }, + { + "epoch": 0.5535496886848238, + "grad_norm": 0.9609375, + "learning_rate": 0.0008754894961731463, + "loss": 0.63, + "step": 7957 + }, + { + "epoch": 0.5536192563219591, + "grad_norm": 0.9296875, + "learning_rate": 0.0008752659198144773, + "loss": 0.7849, + "step": 7958 + }, + { + "epoch": 0.5536888239590942, + "grad_norm": 1.0078125, + "learning_rate": 0.0008750423497891764, + "loss": 0.8418, + "step": 7959 + }, + { + "epoch": 0.5537583915962294, + "grad_norm": 0.91015625, + "learning_rate": 0.000874818786108595, + "loss": 0.6265, + "step": 7960 + }, + { + "epoch": 0.5538279592333646, + "grad_norm": 1.1875, + "learning_rate": 0.0008745952287840849, + "loss": 0.9863, + "step": 7961 + }, + { + "epoch": 0.5538975268704999, + "grad_norm": 1.1875, + "learning_rate": 0.0008743716778269966, + "loss": 0.7077, + "step": 7962 + }, + { + "epoch": 0.553967094507635, + "grad_norm": 1.046875, + "learning_rate": 0.0008741481332486813, + "loss": 0.9039, + "step": 7963 + }, + { + "epoch": 0.5540366621447702, + "grad_norm": 0.93359375, + "learning_rate": 0.0008739245950604897, + "loss": 0.6563, + "step": 7964 + }, + { + "epoch": 0.5541062297819055, + "grad_norm": 1.1015625, + "learning_rate": 0.0008737010632737714, + "loss": 0.9993, + "step": 7965 + }, + { + "epoch": 0.5541757974190407, + "grad_norm": 1.0234375, + "learning_rate": 0.0008734775378998771, + "loss": 0.6714, + "step": 7966 + }, + { + "epoch": 0.5542453650561758, + "grad_norm": 1.4296875, + "learning_rate": 0.0008732540189501552, + "loss": 1.1504, + "step": 7967 + }, + { + "epoch": 0.5543149326933111, + "grad_norm": 1.046875, + "learning_rate": 0.0008730305064359558, + "loss": 0.7834, + "step": 7968 + }, + { + "epoch": 0.5543845003304463, + "grad_norm": 1.0390625, + "learning_rate": 0.0008728070003686266, + "loss": 0.7097, + "step": 7969 + }, + { + "epoch": 0.5544540679675815, + "grad_norm": 1.0546875, + "learning_rate": 0.0008725835007595174, + "loss": 0.7942, + "step": 7970 + }, + { + "epoch": 0.5545236356047167, + "grad_norm": 1.3203125, + "learning_rate": 0.0008723600076199757, + "loss": 0.8154, + "step": 7971 + }, + { + "epoch": 0.5545932032418519, + "grad_norm": 1.0390625, + "learning_rate": 0.0008721365209613491, + "loss": 0.8025, + "step": 7972 + }, + { + "epoch": 0.5546627708789871, + "grad_norm": 0.984375, + "learning_rate": 0.000871913040794986, + "loss": 0.6833, + "step": 7973 + }, + { + "epoch": 0.5547323385161222, + "grad_norm": 1.1484375, + "learning_rate": 0.0008716895671322329, + "loss": 0.9714, + "step": 7974 + }, + { + "epoch": 0.5548019061532575, + "grad_norm": 1.328125, + "learning_rate": 0.0008714660999844371, + "loss": 1.101, + "step": 7975 + }, + { + "epoch": 0.5548714737903927, + "grad_norm": 1.03125, + "learning_rate": 0.0008712426393629441, + "loss": 0.8226, + "step": 7976 + }, + { + "epoch": 0.5549410414275279, + "grad_norm": 1.1015625, + "learning_rate": 0.0008710191852791016, + "loss": 0.8727, + "step": 7977 + }, + { + "epoch": 0.5550106090646632, + "grad_norm": 1.359375, + "learning_rate": 0.0008707957377442546, + "loss": 0.8996, + "step": 7978 + }, + { + "epoch": 0.5550801767017983, + "grad_norm": 0.96875, + "learning_rate": 0.0008705722967697484, + "loss": 0.8693, + "step": 7979 + }, + { + "epoch": 0.5551497443389335, + "grad_norm": 1.1875, + "learning_rate": 0.0008703488623669293, + "loss": 0.768, + "step": 7980 + }, + { + "epoch": 0.5552193119760688, + "grad_norm": 1.0859375, + "learning_rate": 0.0008701254345471411, + "loss": 0.9451, + "step": 7981 + }, + { + "epoch": 0.555288879613204, + "grad_norm": 1.2265625, + "learning_rate": 0.000869902013321729, + "loss": 0.6398, + "step": 7982 + }, + { + "epoch": 0.5553584472503391, + "grad_norm": 1.171875, + "learning_rate": 0.0008696785987020362, + "loss": 0.932, + "step": 7983 + }, + { + "epoch": 0.5554280148874744, + "grad_norm": 0.97265625, + "learning_rate": 0.0008694551906994081, + "loss": 0.7843, + "step": 7984 + }, + { + "epoch": 0.5554975825246096, + "grad_norm": 1.21875, + "learning_rate": 0.000869231789325187, + "loss": 0.8724, + "step": 7985 + }, + { + "epoch": 0.5555671501617447, + "grad_norm": 0.97265625, + "learning_rate": 0.0008690083945907163, + "loss": 0.8195, + "step": 7986 + }, + { + "epoch": 0.5556367177988799, + "grad_norm": 1.6484375, + "learning_rate": 0.0008687850065073398, + "loss": 0.9577, + "step": 7987 + }, + { + "epoch": 0.5557062854360152, + "grad_norm": 0.82421875, + "learning_rate": 0.0008685616250863988, + "loss": 0.5295, + "step": 7988 + }, + { + "epoch": 0.5557758530731504, + "grad_norm": 1.09375, + "learning_rate": 0.0008683382503392361, + "loss": 0.613, + "step": 7989 + }, + { + "epoch": 0.5558454207102855, + "grad_norm": 1.09375, + "learning_rate": 0.0008681148822771932, + "loss": 0.8027, + "step": 7990 + }, + { + "epoch": 0.5559149883474208, + "grad_norm": 1.1015625, + "learning_rate": 0.0008678915209116121, + "loss": 0.7651, + "step": 7991 + }, + { + "epoch": 0.555984555984556, + "grad_norm": 0.95703125, + "learning_rate": 0.0008676681662538335, + "loss": 0.9081, + "step": 7992 + }, + { + "epoch": 0.5560541236216912, + "grad_norm": 1.1171875, + "learning_rate": 0.0008674448183151988, + "loss": 0.9049, + "step": 7993 + }, + { + "epoch": 0.5561236912588264, + "grad_norm": 0.90234375, + "learning_rate": 0.0008672214771070477, + "loss": 0.6433, + "step": 7994 + }, + { + "epoch": 0.5561932588959616, + "grad_norm": 0.984375, + "learning_rate": 0.0008669981426407208, + "loss": 0.6013, + "step": 7995 + }, + { + "epoch": 0.5562628265330968, + "grad_norm": 1.125, + "learning_rate": 0.0008667748149275578, + "loss": 0.7064, + "step": 7996 + }, + { + "epoch": 0.5563323941702321, + "grad_norm": 0.9921875, + "learning_rate": 0.0008665514939788981, + "loss": 0.799, + "step": 7997 + }, + { + "epoch": 0.5564019618073672, + "grad_norm": 1.375, + "learning_rate": 0.0008663281798060814, + "loss": 0.9057, + "step": 7998 + }, + { + "epoch": 0.5564715294445024, + "grad_norm": 1.0703125, + "learning_rate": 0.0008661048724204457, + "loss": 0.8069, + "step": 7999 + }, + { + "epoch": 0.5565410970816376, + "grad_norm": 1.0390625, + "learning_rate": 0.0008658815718333298, + "loss": 0.9088, + "step": 8000 + }, + { + "epoch": 0.5566106647187729, + "grad_norm": 1.0, + "learning_rate": 0.0008656582780560712, + "loss": 0.7117, + "step": 8001 + }, + { + "epoch": 0.556680232355908, + "grad_norm": 0.953125, + "learning_rate": 0.0008654349911000086, + "loss": 0.6986, + "step": 8002 + }, + { + "epoch": 0.5567497999930432, + "grad_norm": 0.9296875, + "learning_rate": 0.0008652117109764787, + "loss": 0.7315, + "step": 8003 + }, + { + "epoch": 0.5568193676301785, + "grad_norm": 0.91015625, + "learning_rate": 0.0008649884376968186, + "loss": 0.6983, + "step": 8004 + }, + { + "epoch": 0.5568889352673136, + "grad_norm": 1.0078125, + "learning_rate": 0.0008647651712723654, + "loss": 0.5391, + "step": 8005 + }, + { + "epoch": 0.5569585029044488, + "grad_norm": 1.4453125, + "learning_rate": 0.000864541911714455, + "loss": 0.9969, + "step": 8006 + }, + { + "epoch": 0.5570280705415841, + "grad_norm": 1.0546875, + "learning_rate": 0.0008643186590344239, + "loss": 0.6933, + "step": 8007 + }, + { + "epoch": 0.5570976381787193, + "grad_norm": 1.453125, + "learning_rate": 0.0008640954132436067, + "loss": 0.7267, + "step": 8008 + }, + { + "epoch": 0.5571672058158544, + "grad_norm": 1.0625, + "learning_rate": 0.0008638721743533402, + "loss": 0.7014, + "step": 8009 + }, + { + "epoch": 0.5572367734529897, + "grad_norm": 0.9296875, + "learning_rate": 0.0008636489423749581, + "loss": 0.8283, + "step": 8010 + }, + { + "epoch": 0.5573063410901249, + "grad_norm": 1.1328125, + "learning_rate": 0.0008634257173197954, + "loss": 1.0623, + "step": 8011 + }, + { + "epoch": 0.5573759087272601, + "grad_norm": 1.3671875, + "learning_rate": 0.0008632024991991867, + "loss": 0.8609, + "step": 8012 + }, + { + "epoch": 0.5574454763643952, + "grad_norm": 0.98828125, + "learning_rate": 0.0008629792880244653, + "loss": 0.8745, + "step": 8013 + }, + { + "epoch": 0.5575150440015305, + "grad_norm": 1.28125, + "learning_rate": 0.0008627560838069655, + "loss": 0.7606, + "step": 8014 + }, + { + "epoch": 0.5575846116386657, + "grad_norm": 0.91796875, + "learning_rate": 0.0008625328865580191, + "loss": 0.7748, + "step": 8015 + }, + { + "epoch": 0.5576541792758009, + "grad_norm": 1.125, + "learning_rate": 0.0008623096962889606, + "loss": 0.9064, + "step": 8016 + }, + { + "epoch": 0.5577237469129361, + "grad_norm": 1.0390625, + "learning_rate": 0.0008620865130111215, + "loss": 0.8291, + "step": 8017 + }, + { + "epoch": 0.5577933145500713, + "grad_norm": 1.6953125, + "learning_rate": 0.0008618633367358339, + "loss": 1.0929, + "step": 8018 + }, + { + "epoch": 0.5578628821872065, + "grad_norm": 1.125, + "learning_rate": 0.0008616401674744303, + "loss": 0.7828, + "step": 8019 + }, + { + "epoch": 0.5579324498243418, + "grad_norm": 1.15625, + "learning_rate": 0.0008614170052382413, + "loss": 0.9277, + "step": 8020 + }, + { + "epoch": 0.5580020174614769, + "grad_norm": 1.2578125, + "learning_rate": 0.0008611938500385983, + "loss": 0.9666, + "step": 8021 + }, + { + "epoch": 0.5580715850986121, + "grad_norm": 0.9140625, + "learning_rate": 0.0008609707018868317, + "loss": 0.4326, + "step": 8022 + }, + { + "epoch": 0.5581411527357474, + "grad_norm": 1.015625, + "learning_rate": 0.0008607475607942725, + "loss": 0.5866, + "step": 8023 + }, + { + "epoch": 0.5582107203728826, + "grad_norm": 1.640625, + "learning_rate": 0.0008605244267722502, + "loss": 0.9919, + "step": 8024 + }, + { + "epoch": 0.5582802880100177, + "grad_norm": 0.96484375, + "learning_rate": 0.0008603012998320941, + "loss": 0.8603, + "step": 8025 + }, + { + "epoch": 0.5583498556471529, + "grad_norm": 1.125, + "learning_rate": 0.0008600781799851344, + "loss": 0.8032, + "step": 8026 + }, + { + "epoch": 0.5584194232842882, + "grad_norm": 1.1875, + "learning_rate": 0.0008598550672426993, + "loss": 1.0323, + "step": 8027 + }, + { + "epoch": 0.5584889909214233, + "grad_norm": 0.94140625, + "learning_rate": 0.0008596319616161175, + "loss": 0.6604, + "step": 8028 + }, + { + "epoch": 0.5585585585585585, + "grad_norm": 1.1015625, + "learning_rate": 0.0008594088631167169, + "loss": 0.8073, + "step": 8029 + }, + { + "epoch": 0.5586281261956938, + "grad_norm": 1.15625, + "learning_rate": 0.0008591857717558261, + "loss": 0.8021, + "step": 8030 + }, + { + "epoch": 0.558697693832829, + "grad_norm": 1.0078125, + "learning_rate": 0.0008589626875447717, + "loss": 0.8974, + "step": 8031 + }, + { + "epoch": 0.5587672614699641, + "grad_norm": 1.0, + "learning_rate": 0.0008587396104948811, + "loss": 0.8931, + "step": 8032 + }, + { + "epoch": 0.5588368291070994, + "grad_norm": 1.3515625, + "learning_rate": 0.0008585165406174813, + "loss": 0.894, + "step": 8033 + }, + { + "epoch": 0.5589063967442346, + "grad_norm": 1.203125, + "learning_rate": 0.0008582934779238985, + "loss": 0.6545, + "step": 8034 + }, + { + "epoch": 0.5589759643813698, + "grad_norm": 1.0546875, + "learning_rate": 0.0008580704224254583, + "loss": 0.7657, + "step": 8035 + }, + { + "epoch": 0.559045532018505, + "grad_norm": 1.1328125, + "learning_rate": 0.0008578473741334867, + "loss": 0.7614, + "step": 8036 + }, + { + "epoch": 0.5591150996556402, + "grad_norm": 1.015625, + "learning_rate": 0.0008576243330593093, + "loss": 0.8003, + "step": 8037 + }, + { + "epoch": 0.5591846672927754, + "grad_norm": 1.046875, + "learning_rate": 0.0008574012992142504, + "loss": 1.0983, + "step": 8038 + }, + { + "epoch": 0.5592542349299106, + "grad_norm": 1.21875, + "learning_rate": 0.0008571782726096346, + "loss": 1.0023, + "step": 8039 + }, + { + "epoch": 0.5593238025670458, + "grad_norm": 1.125, + "learning_rate": 0.0008569552532567865, + "loss": 0.8936, + "step": 8040 + }, + { + "epoch": 0.559393370204181, + "grad_norm": 1.109375, + "learning_rate": 0.0008567322411670297, + "loss": 0.7366, + "step": 8041 + }, + { + "epoch": 0.5594629378413162, + "grad_norm": 1.046875, + "learning_rate": 0.0008565092363516876, + "loss": 0.7664, + "step": 8042 + }, + { + "epoch": 0.5595325054784515, + "grad_norm": 1.1640625, + "learning_rate": 0.0008562862388220828, + "loss": 0.7319, + "step": 8043 + }, + { + "epoch": 0.5596020731155866, + "grad_norm": 1.1328125, + "learning_rate": 0.000856063248589539, + "loss": 0.8718, + "step": 8044 + }, + { + "epoch": 0.5596716407527218, + "grad_norm": 1.3671875, + "learning_rate": 0.0008558402656653777, + "loss": 0.8213, + "step": 8045 + }, + { + "epoch": 0.5597412083898571, + "grad_norm": 1.53125, + "learning_rate": 0.0008556172900609207, + "loss": 0.7882, + "step": 8046 + }, + { + "epoch": 0.5598107760269923, + "grad_norm": 0.90234375, + "learning_rate": 0.0008553943217874903, + "loss": 0.768, + "step": 8047 + }, + { + "epoch": 0.5598803436641274, + "grad_norm": 1.2578125, + "learning_rate": 0.0008551713608564075, + "loss": 1.0058, + "step": 8048 + }, + { + "epoch": 0.5599499113012627, + "grad_norm": 1.0, + "learning_rate": 0.000854948407278993, + "loss": 0.8091, + "step": 8049 + }, + { + "epoch": 0.5600194789383979, + "grad_norm": 1.125, + "learning_rate": 0.000854725461066567, + "loss": 0.9949, + "step": 8050 + }, + { + "epoch": 0.560089046575533, + "grad_norm": 1.046875, + "learning_rate": 0.0008545025222304501, + "loss": 0.7258, + "step": 8051 + }, + { + "epoch": 0.5601586142126682, + "grad_norm": 0.97265625, + "learning_rate": 0.0008542795907819618, + "loss": 0.6841, + "step": 8052 + }, + { + "epoch": 0.5602281818498035, + "grad_norm": 1.125, + "learning_rate": 0.000854056666732421, + "loss": 0.6304, + "step": 8053 + }, + { + "epoch": 0.5602977494869387, + "grad_norm": 0.81640625, + "learning_rate": 0.0008538337500931472, + "loss": 0.6101, + "step": 8054 + }, + { + "epoch": 0.5603673171240738, + "grad_norm": 1.34375, + "learning_rate": 0.0008536108408754593, + "loss": 1.0405, + "step": 8055 + }, + { + "epoch": 0.5604368847612091, + "grad_norm": 1.3359375, + "learning_rate": 0.0008533879390906747, + "loss": 0.9695, + "step": 8056 + }, + { + "epoch": 0.5605064523983443, + "grad_norm": 1.3984375, + "learning_rate": 0.0008531650447501114, + "loss": 1.005, + "step": 8057 + }, + { + "epoch": 0.5605760200354795, + "grad_norm": 1.1328125, + "learning_rate": 0.0008529421578650873, + "loss": 0.8173, + "step": 8058 + }, + { + "epoch": 0.5606455876726147, + "grad_norm": 1.25, + "learning_rate": 0.0008527192784469191, + "loss": 0.9021, + "step": 8059 + }, + { + "epoch": 0.5607151553097499, + "grad_norm": 1.2265625, + "learning_rate": 0.0008524964065069234, + "loss": 0.756, + "step": 8060 + }, + { + "epoch": 0.5607847229468851, + "grad_norm": 0.92578125, + "learning_rate": 0.0008522735420564169, + "loss": 0.8285, + "step": 8061 + }, + { + "epoch": 0.5608542905840204, + "grad_norm": 1.1796875, + "learning_rate": 0.0008520506851067154, + "loss": 0.9753, + "step": 8062 + }, + { + "epoch": 0.5609238582211555, + "grad_norm": 0.99609375, + "learning_rate": 0.0008518278356691344, + "loss": 0.8214, + "step": 8063 + }, + { + "epoch": 0.5609934258582907, + "grad_norm": 1.59375, + "learning_rate": 0.0008516049937549888, + "loss": 0.8445, + "step": 8064 + }, + { + "epoch": 0.5610629934954259, + "grad_norm": 1.1796875, + "learning_rate": 0.0008513821593755939, + "loss": 1.078, + "step": 8065 + }, + { + "epoch": 0.5611325611325612, + "grad_norm": 1.109375, + "learning_rate": 0.0008511593325422639, + "loss": 0.7691, + "step": 8066 + }, + { + "epoch": 0.5612021287696963, + "grad_norm": 1.0234375, + "learning_rate": 0.0008509365132663124, + "loss": 1.0448, + "step": 8067 + }, + { + "epoch": 0.5612716964068315, + "grad_norm": 0.91796875, + "learning_rate": 0.0008507137015590537, + "loss": 0.5083, + "step": 8068 + }, + { + "epoch": 0.5613412640439668, + "grad_norm": 0.98828125, + "learning_rate": 0.0008504908974318009, + "loss": 0.884, + "step": 8069 + }, + { + "epoch": 0.561410831681102, + "grad_norm": 1.21875, + "learning_rate": 0.0008502681008958667, + "loss": 0.9151, + "step": 8070 + }, + { + "epoch": 0.5614803993182371, + "grad_norm": 0.9609375, + "learning_rate": 0.0008500453119625633, + "loss": 0.807, + "step": 8071 + }, + { + "epoch": 0.5615499669553724, + "grad_norm": 1.234375, + "learning_rate": 0.0008498225306432034, + "loss": 0.8662, + "step": 8072 + }, + { + "epoch": 0.5616195345925076, + "grad_norm": 1.0, + "learning_rate": 0.0008495997569490986, + "loss": 0.872, + "step": 8073 + }, + { + "epoch": 0.5616891022296427, + "grad_norm": 1.0234375, + "learning_rate": 0.0008493769908915599, + "loss": 0.8244, + "step": 8074 + }, + { + "epoch": 0.561758669866778, + "grad_norm": 1.140625, + "learning_rate": 0.0008491542324818982, + "loss": 0.9894, + "step": 8075 + }, + { + "epoch": 0.5618282375039132, + "grad_norm": 1.0390625, + "learning_rate": 0.0008489314817314246, + "loss": 0.8056, + "step": 8076 + }, + { + "epoch": 0.5618978051410484, + "grad_norm": 1.15625, + "learning_rate": 0.0008487087386514488, + "loss": 0.7569, + "step": 8077 + }, + { + "epoch": 0.5619673727781835, + "grad_norm": 1.1953125, + "learning_rate": 0.0008484860032532804, + "loss": 0.9542, + "step": 8078 + }, + { + "epoch": 0.5620369404153188, + "grad_norm": 1.0078125, + "learning_rate": 0.0008482632755482293, + "loss": 0.8073, + "step": 8079 + }, + { + "epoch": 0.562106508052454, + "grad_norm": 1.140625, + "learning_rate": 0.0008480405555476045, + "loss": 0.8613, + "step": 8080 + }, + { + "epoch": 0.5621760756895892, + "grad_norm": 1.296875, + "learning_rate": 0.0008478178432627142, + "loss": 0.9116, + "step": 8081 + }, + { + "epoch": 0.5622456433267244, + "grad_norm": 1.3125, + "learning_rate": 0.0008475951387048664, + "loss": 0.7866, + "step": 8082 + }, + { + "epoch": 0.5623152109638596, + "grad_norm": 0.93359375, + "learning_rate": 0.0008473724418853698, + "loss": 0.8013, + "step": 8083 + }, + { + "epoch": 0.5623847786009948, + "grad_norm": 1.1171875, + "learning_rate": 0.0008471497528155311, + "loss": 0.8295, + "step": 8084 + }, + { + "epoch": 0.5624543462381301, + "grad_norm": 1.109375, + "learning_rate": 0.0008469270715066573, + "loss": 0.9016, + "step": 8085 + }, + { + "epoch": 0.5625239138752652, + "grad_norm": 0.96484375, + "learning_rate": 0.0008467043979700554, + "loss": 0.6481, + "step": 8086 + }, + { + "epoch": 0.5625934815124004, + "grad_norm": 1.0859375, + "learning_rate": 0.0008464817322170319, + "loss": 0.8493, + "step": 8087 + }, + { + "epoch": 0.5626630491495357, + "grad_norm": 1.328125, + "learning_rate": 0.0008462590742588918, + "loss": 0.7085, + "step": 8088 + }, + { + "epoch": 0.5627326167866709, + "grad_norm": 1.125, + "learning_rate": 0.000846036424106941, + "loss": 0.8964, + "step": 8089 + }, + { + "epoch": 0.562802184423806, + "grad_norm": 1.28125, + "learning_rate": 0.0008458137817724848, + "loss": 0.9103, + "step": 8090 + }, + { + "epoch": 0.5628717520609412, + "grad_norm": 1.1796875, + "learning_rate": 0.0008455911472668276, + "loss": 0.8857, + "step": 8091 + }, + { + "epoch": 0.5629413196980765, + "grad_norm": 1.2578125, + "learning_rate": 0.0008453685206012732, + "loss": 0.8084, + "step": 8092 + }, + { + "epoch": 0.5630108873352117, + "grad_norm": 1.078125, + "learning_rate": 0.0008451459017871263, + "loss": 0.7816, + "step": 8093 + }, + { + "epoch": 0.5630804549723468, + "grad_norm": 0.890625, + "learning_rate": 0.0008449232908356901, + "loss": 0.6622, + "step": 8094 + }, + { + "epoch": 0.5631500226094821, + "grad_norm": 1.046875, + "learning_rate": 0.0008447006877582674, + "loss": 0.8106, + "step": 8095 + }, + { + "epoch": 0.5632195902466173, + "grad_norm": 1.25, + "learning_rate": 0.0008444780925661609, + "loss": 0.9192, + "step": 8096 + }, + { + "epoch": 0.5632891578837524, + "grad_norm": 1.1640625, + "learning_rate": 0.0008442555052706732, + "loss": 0.8015, + "step": 8097 + }, + { + "epoch": 0.5633587255208877, + "grad_norm": 1.1796875, + "learning_rate": 0.0008440329258831057, + "loss": 0.557, + "step": 8098 + }, + { + "epoch": 0.5634282931580229, + "grad_norm": 1.0703125, + "learning_rate": 0.0008438103544147601, + "loss": 0.8574, + "step": 8099 + }, + { + "epoch": 0.5634978607951581, + "grad_norm": 1.171875, + "learning_rate": 0.0008435877908769375, + "loss": 0.6947, + "step": 8100 + }, + { + "epoch": 0.5635674284322933, + "grad_norm": 1.4609375, + "learning_rate": 0.0008433652352809388, + "loss": 1.0825, + "step": 8101 + }, + { + "epoch": 0.5636369960694285, + "grad_norm": 1.2421875, + "learning_rate": 0.0008431426876380636, + "loss": 0.9564, + "step": 8102 + }, + { + "epoch": 0.5637065637065637, + "grad_norm": 1.5078125, + "learning_rate": 0.000842920147959612, + "loss": 1.0568, + "step": 8103 + }, + { + "epoch": 0.5637761313436989, + "grad_norm": 1.140625, + "learning_rate": 0.0008426976162568837, + "loss": 0.7129, + "step": 8104 + }, + { + "epoch": 0.5638456989808341, + "grad_norm": 1.1328125, + "learning_rate": 0.0008424750925411779, + "loss": 1.0349, + "step": 8105 + }, + { + "epoch": 0.5639152666179693, + "grad_norm": 1.296875, + "learning_rate": 0.0008422525768237925, + "loss": 0.6118, + "step": 8106 + }, + { + "epoch": 0.5639848342551045, + "grad_norm": 0.93359375, + "learning_rate": 0.0008420300691160263, + "loss": 0.7699, + "step": 8107 + }, + { + "epoch": 0.5640544018922398, + "grad_norm": 1.171875, + "learning_rate": 0.0008418075694291772, + "loss": 1.049, + "step": 8108 + }, + { + "epoch": 0.5641239695293749, + "grad_norm": 1.375, + "learning_rate": 0.0008415850777745421, + "loss": 0.7753, + "step": 8109 + }, + { + "epoch": 0.5641935371665101, + "grad_norm": 1.453125, + "learning_rate": 0.0008413625941634181, + "loss": 0.7612, + "step": 8110 + }, + { + "epoch": 0.5642631048036454, + "grad_norm": 1.21875, + "learning_rate": 0.0008411401186071022, + "loss": 0.7208, + "step": 8111 + }, + { + "epoch": 0.5643326724407806, + "grad_norm": 0.921875, + "learning_rate": 0.0008409176511168906, + "loss": 0.76, + "step": 8112 + }, + { + "epoch": 0.5644022400779157, + "grad_norm": 0.9765625, + "learning_rate": 0.0008406951917040784, + "loss": 0.6868, + "step": 8113 + }, + { + "epoch": 0.564471807715051, + "grad_norm": 0.98828125, + "learning_rate": 0.0008404727403799614, + "loss": 0.8213, + "step": 8114 + }, + { + "epoch": 0.5645413753521862, + "grad_norm": 0.9921875, + "learning_rate": 0.0008402502971558352, + "loss": 0.7647, + "step": 8115 + }, + { + "epoch": 0.5646109429893214, + "grad_norm": 1.3125, + "learning_rate": 0.0008400278620429932, + "loss": 0.7873, + "step": 8116 + }, + { + "epoch": 0.5646805106264565, + "grad_norm": 1.2421875, + "learning_rate": 0.0008398054350527298, + "loss": 0.9852, + "step": 8117 + }, + { + "epoch": 0.5647500782635918, + "grad_norm": 1.0625, + "learning_rate": 0.0008395830161963394, + "loss": 0.7469, + "step": 8118 + }, + { + "epoch": 0.564819645900727, + "grad_norm": 1.2890625, + "learning_rate": 0.000839360605485115, + "loss": 1.0719, + "step": 8119 + }, + { + "epoch": 0.5648892135378621, + "grad_norm": 0.953125, + "learning_rate": 0.000839138202930349, + "loss": 0.7244, + "step": 8120 + }, + { + "epoch": 0.5649587811749974, + "grad_norm": 1.28125, + "learning_rate": 0.0008389158085433343, + "loss": 1.0414, + "step": 8121 + }, + { + "epoch": 0.5650283488121326, + "grad_norm": 1.171875, + "learning_rate": 0.0008386934223353632, + "loss": 0.9417, + "step": 8122 + }, + { + "epoch": 0.5650979164492678, + "grad_norm": 1.2421875, + "learning_rate": 0.0008384710443177269, + "loss": 0.8437, + "step": 8123 + }, + { + "epoch": 0.565167484086403, + "grad_norm": 1.1328125, + "learning_rate": 0.0008382486745017166, + "loss": 0.8869, + "step": 8124 + }, + { + "epoch": 0.5652370517235382, + "grad_norm": 1.2109375, + "learning_rate": 0.0008380263128986235, + "loss": 1.0772, + "step": 8125 + }, + { + "epoch": 0.5653066193606734, + "grad_norm": 1.1328125, + "learning_rate": 0.000837803959519738, + "loss": 0.5437, + "step": 8126 + }, + { + "epoch": 0.5653761869978087, + "grad_norm": 0.93359375, + "learning_rate": 0.0008375816143763495, + "loss": 0.6332, + "step": 8127 + }, + { + "epoch": 0.5654457546349438, + "grad_norm": 1.1796875, + "learning_rate": 0.0008373592774797482, + "loss": 0.8213, + "step": 8128 + }, + { + "epoch": 0.565515322272079, + "grad_norm": 1.3046875, + "learning_rate": 0.0008371369488412233, + "loss": 0.8402, + "step": 8129 + }, + { + "epoch": 0.5655848899092142, + "grad_norm": 0.81640625, + "learning_rate": 0.000836914628472063, + "loss": 0.6205, + "step": 8130 + }, + { + "epoch": 0.5656544575463495, + "grad_norm": 1.5625, + "learning_rate": 0.0008366923163835556, + "loss": 0.9671, + "step": 8131 + }, + { + "epoch": 0.5657240251834846, + "grad_norm": 1.0234375, + "learning_rate": 0.0008364700125869895, + "loss": 0.5901, + "step": 8132 + }, + { + "epoch": 0.5657935928206198, + "grad_norm": 1.0703125, + "learning_rate": 0.000836247717093652, + "loss": 0.9082, + "step": 8133 + }, + { + "epoch": 0.5658631604577551, + "grad_norm": 1.421875, + "learning_rate": 0.0008360254299148298, + "loss": 0.98, + "step": 8134 + }, + { + "epoch": 0.5659327280948903, + "grad_norm": 1.25, + "learning_rate": 0.0008358031510618099, + "loss": 0.8122, + "step": 8135 + }, + { + "epoch": 0.5660022957320254, + "grad_norm": 1.0234375, + "learning_rate": 0.0008355808805458786, + "loss": 0.7831, + "step": 8136 + }, + { + "epoch": 0.5660718633691607, + "grad_norm": 0.9609375, + "learning_rate": 0.0008353586183783212, + "loss": 0.8124, + "step": 8137 + }, + { + "epoch": 0.5661414310062959, + "grad_norm": 1.0, + "learning_rate": 0.0008351363645704231, + "loss": 0.7187, + "step": 8138 + }, + { + "epoch": 0.566210998643431, + "grad_norm": 1.890625, + "learning_rate": 0.0008349141191334697, + "loss": 0.5892, + "step": 8139 + }, + { + "epoch": 0.5662805662805663, + "grad_norm": 1.09375, + "learning_rate": 0.0008346918820787455, + "loss": 0.7405, + "step": 8140 + }, + { + "epoch": 0.5663501339177015, + "grad_norm": 1.5546875, + "learning_rate": 0.0008344696534175337, + "loss": 0.7435, + "step": 8141 + }, + { + "epoch": 0.5664197015548367, + "grad_norm": 0.98828125, + "learning_rate": 0.0008342474331611189, + "loss": 0.726, + "step": 8142 + }, + { + "epoch": 0.5664892691919718, + "grad_norm": 1.0078125, + "learning_rate": 0.0008340252213207839, + "loss": 0.6987, + "step": 8143 + }, + { + "epoch": 0.5665588368291071, + "grad_norm": 0.734375, + "learning_rate": 0.000833803017907812, + "loss": 0.6203, + "step": 8144 + }, + { + "epoch": 0.5666284044662423, + "grad_norm": 0.92578125, + "learning_rate": 0.0008335808229334846, + "loss": 0.7581, + "step": 8145 + }, + { + "epoch": 0.5666979721033775, + "grad_norm": 1.1171875, + "learning_rate": 0.0008333586364090844, + "loss": 0.9403, + "step": 8146 + }, + { + "epoch": 0.5667675397405127, + "grad_norm": 1.015625, + "learning_rate": 0.0008331364583458929, + "loss": 0.7753, + "step": 8147 + }, + { + "epoch": 0.5668371073776479, + "grad_norm": 1.0, + "learning_rate": 0.0008329142887551908, + "loss": 0.9515, + "step": 8148 + }, + { + "epoch": 0.5669066750147831, + "grad_norm": 1.28125, + "learning_rate": 0.0008326921276482588, + "loss": 1.0615, + "step": 8149 + }, + { + "epoch": 0.5669762426519184, + "grad_norm": 1.1328125, + "learning_rate": 0.0008324699750363774, + "loss": 0.702, + "step": 8150 + }, + { + "epoch": 0.5670458102890535, + "grad_norm": 1.609375, + "learning_rate": 0.0008322478309308266, + "loss": 0.8821, + "step": 8151 + }, + { + "epoch": 0.5671153779261887, + "grad_norm": 1.2109375, + "learning_rate": 0.0008320256953428849, + "loss": 0.7465, + "step": 8152 + }, + { + "epoch": 0.567184945563324, + "grad_norm": 1.015625, + "learning_rate": 0.0008318035682838319, + "loss": 0.9299, + "step": 8153 + }, + { + "epoch": 0.5672545132004592, + "grad_norm": 1.21875, + "learning_rate": 0.0008315814497649461, + "loss": 0.8122, + "step": 8154 + }, + { + "epoch": 0.5673240808375943, + "grad_norm": 1.0625, + "learning_rate": 0.0008313593397975052, + "loss": 0.9976, + "step": 8155 + }, + { + "epoch": 0.5673936484747295, + "grad_norm": 1.0625, + "learning_rate": 0.0008311372383927869, + "loss": 0.743, + "step": 8156 + }, + { + "epoch": 0.5674632161118648, + "grad_norm": 1.328125, + "learning_rate": 0.0008309151455620687, + "loss": 0.9827, + "step": 8157 + }, + { + "epoch": 0.567532783749, + "grad_norm": 1.03125, + "learning_rate": 0.0008306930613166272, + "loss": 0.9007, + "step": 8158 + }, + { + "epoch": 0.5676023513861351, + "grad_norm": 0.97265625, + "learning_rate": 0.0008304709856677384, + "loss": 0.952, + "step": 8159 + }, + { + "epoch": 0.5676719190232704, + "grad_norm": 0.87890625, + "learning_rate": 0.0008302489186266788, + "loss": 0.575, + "step": 8160 + }, + { + "epoch": 0.5677414866604056, + "grad_norm": 1.1640625, + "learning_rate": 0.0008300268602047235, + "loss": 0.7995, + "step": 8161 + }, + { + "epoch": 0.5678110542975408, + "grad_norm": 0.98828125, + "learning_rate": 0.0008298048104131474, + "loss": 0.5987, + "step": 8162 + }, + { + "epoch": 0.567880621934676, + "grad_norm": 0.8671875, + "learning_rate": 0.0008295827692632249, + "loss": 0.6065, + "step": 8163 + }, + { + "epoch": 0.5679501895718112, + "grad_norm": 0.84375, + "learning_rate": 0.0008293607367662306, + "loss": 0.7019, + "step": 8164 + }, + { + "epoch": 0.5680197572089464, + "grad_norm": 0.921875, + "learning_rate": 0.0008291387129334383, + "loss": 0.4507, + "step": 8165 + }, + { + "epoch": 0.5680893248460817, + "grad_norm": 1.1484375, + "learning_rate": 0.0008289166977761205, + "loss": 0.8238, + "step": 8166 + }, + { + "epoch": 0.5681588924832168, + "grad_norm": 1.0625, + "learning_rate": 0.0008286946913055506, + "loss": 0.7219, + "step": 8167 + }, + { + "epoch": 0.568228460120352, + "grad_norm": 1.7265625, + "learning_rate": 0.0008284726935330011, + "loss": 0.7893, + "step": 8168 + }, + { + "epoch": 0.5682980277574872, + "grad_norm": 1.0703125, + "learning_rate": 0.0008282507044697436, + "loss": 0.5606, + "step": 8169 + }, + { + "epoch": 0.5683675953946224, + "grad_norm": 1.1640625, + "learning_rate": 0.0008280287241270492, + "loss": 0.9674, + "step": 8170 + }, + { + "epoch": 0.5684371630317576, + "grad_norm": 1.1875, + "learning_rate": 0.0008278067525161897, + "loss": 0.8512, + "step": 8171 + }, + { + "epoch": 0.5685067306688928, + "grad_norm": 1.125, + "learning_rate": 0.0008275847896484356, + "loss": 0.8297, + "step": 8172 + }, + { + "epoch": 0.5685762983060281, + "grad_norm": 1.265625, + "learning_rate": 0.0008273628355350564, + "loss": 0.8624, + "step": 8173 + }, + { + "epoch": 0.5686458659431632, + "grad_norm": 1.2109375, + "learning_rate": 0.0008271408901873225, + "loss": 0.7673, + "step": 8174 + }, + { + "epoch": 0.5687154335802984, + "grad_norm": 0.984375, + "learning_rate": 0.000826918953616503, + "loss": 0.7336, + "step": 8175 + }, + { + "epoch": 0.5687850012174337, + "grad_norm": 1.109375, + "learning_rate": 0.0008266970258338668, + "loss": 0.6666, + "step": 8176 + }, + { + "epoch": 0.5688545688545689, + "grad_norm": 1.3125, + "learning_rate": 0.0008264751068506816, + "loss": 0.9051, + "step": 8177 + }, + { + "epoch": 0.568924136491704, + "grad_norm": 1.0, + "learning_rate": 0.0008262531966782161, + "loss": 0.6647, + "step": 8178 + }, + { + "epoch": 0.5689937041288393, + "grad_norm": 1.0625, + "learning_rate": 0.0008260312953277378, + "loss": 0.7353, + "step": 8179 + }, + { + "epoch": 0.5690632717659745, + "grad_norm": 1.0703125, + "learning_rate": 0.000825809402810513, + "loss": 0.9339, + "step": 8180 + }, + { + "epoch": 0.5691328394031097, + "grad_norm": 1.1953125, + "learning_rate": 0.0008255875191378089, + "loss": 0.7516, + "step": 8181 + }, + { + "epoch": 0.5692024070402448, + "grad_norm": 1.0703125, + "learning_rate": 0.0008253656443208915, + "loss": 0.8792, + "step": 8182 + }, + { + "epoch": 0.5692719746773801, + "grad_norm": 1.4140625, + "learning_rate": 0.0008251437783710267, + "loss": 0.9131, + "step": 8183 + }, + { + "epoch": 0.5693415423145153, + "grad_norm": 1.3046875, + "learning_rate": 0.000824921921299479, + "loss": 0.8456, + "step": 8184 + }, + { + "epoch": 0.5694111099516505, + "grad_norm": 0.859375, + "learning_rate": 0.0008247000731175139, + "loss": 0.668, + "step": 8185 + }, + { + "epoch": 0.5694806775887857, + "grad_norm": 1.2890625, + "learning_rate": 0.0008244782338363959, + "loss": 1.0285, + "step": 8186 + }, + { + "epoch": 0.5695502452259209, + "grad_norm": 1.25, + "learning_rate": 0.0008242564034673879, + "loss": 0.876, + "step": 8187 + }, + { + "epoch": 0.5696198128630561, + "grad_norm": 0.81640625, + "learning_rate": 0.0008240345820217541, + "loss": 0.5813, + "step": 8188 + }, + { + "epoch": 0.5696893805001914, + "grad_norm": 1.3203125, + "learning_rate": 0.0008238127695107574, + "loss": 0.7834, + "step": 8189 + }, + { + "epoch": 0.5697589481373265, + "grad_norm": 1.0625, + "learning_rate": 0.0008235909659456604, + "loss": 0.8144, + "step": 8190 + }, + { + "epoch": 0.5698285157744617, + "grad_norm": 0.9453125, + "learning_rate": 0.0008233691713377245, + "loss": 0.6035, + "step": 8191 + }, + { + "epoch": 0.569898083411597, + "grad_norm": 1.296875, + "learning_rate": 0.0008231473856982121, + "loss": 1.0587, + "step": 8192 + }, + { + "epoch": 0.5699676510487321, + "grad_norm": 1.125, + "learning_rate": 0.0008229256090383841, + "loss": 0.7681, + "step": 8193 + }, + { + "epoch": 0.5700372186858673, + "grad_norm": 1.390625, + "learning_rate": 0.0008227038413695007, + "loss": 0.9053, + "step": 8194 + }, + { + "epoch": 0.5701067863230025, + "grad_norm": 1.21875, + "learning_rate": 0.0008224820827028231, + "loss": 0.9224, + "step": 8195 + }, + { + "epoch": 0.5701763539601378, + "grad_norm": 1.1015625, + "learning_rate": 0.0008222603330496105, + "loss": 0.7537, + "step": 8196 + }, + { + "epoch": 0.5702459215972729, + "grad_norm": 1.0703125, + "learning_rate": 0.0008220385924211224, + "loss": 0.6989, + "step": 8197 + }, + { + "epoch": 0.5703154892344081, + "grad_norm": 1.2578125, + "learning_rate": 0.0008218168608286172, + "loss": 0.7965, + "step": 8198 + }, + { + "epoch": 0.5703850568715434, + "grad_norm": 1.34375, + "learning_rate": 0.000821595138283354, + "loss": 0.7993, + "step": 8199 + }, + { + "epoch": 0.5704546245086786, + "grad_norm": 1.03125, + "learning_rate": 0.0008213734247965905, + "loss": 0.8572, + "step": 8200 + }, + { + "epoch": 0.5705241921458137, + "grad_norm": 1.1640625, + "learning_rate": 0.0008211517203795837, + "loss": 0.7204, + "step": 8201 + }, + { + "epoch": 0.570593759782949, + "grad_norm": 0.87890625, + "learning_rate": 0.0008209300250435915, + "loss": 0.7224, + "step": 8202 + }, + { + "epoch": 0.5706633274200842, + "grad_norm": 0.98046875, + "learning_rate": 0.00082070833879987, + "loss": 0.7471, + "step": 8203 + }, + { + "epoch": 0.5707328950572194, + "grad_norm": 1.3125, + "learning_rate": 0.0008204866616596754, + "loss": 0.9107, + "step": 8204 + }, + { + "epoch": 0.5708024626943546, + "grad_norm": 0.93359375, + "learning_rate": 0.0008202649936342631, + "loss": 0.8132, + "step": 8205 + }, + { + "epoch": 0.5708720303314898, + "grad_norm": 1.0859375, + "learning_rate": 0.0008200433347348886, + "loss": 0.8231, + "step": 8206 + }, + { + "epoch": 0.570941597968625, + "grad_norm": 0.8984375, + "learning_rate": 0.0008198216849728068, + "loss": 0.7626, + "step": 8207 + }, + { + "epoch": 0.5710111656057602, + "grad_norm": 1.1171875, + "learning_rate": 0.0008196000443592708, + "loss": 0.8453, + "step": 8208 + }, + { + "epoch": 0.5710807332428954, + "grad_norm": 1.1640625, + "learning_rate": 0.0008193784129055362, + "loss": 0.9886, + "step": 8209 + }, + { + "epoch": 0.5711503008800306, + "grad_norm": 1.0, + "learning_rate": 0.000819156790622855, + "loss": 0.7544, + "step": 8210 + }, + { + "epoch": 0.5712198685171658, + "grad_norm": 1.21875, + "learning_rate": 0.0008189351775224807, + "loss": 1.0287, + "step": 8211 + }, + { + "epoch": 0.5712894361543011, + "grad_norm": 1.234375, + "learning_rate": 0.000818713573615665, + "loss": 0.9659, + "step": 8212 + }, + { + "epoch": 0.5713590037914362, + "grad_norm": 0.9296875, + "learning_rate": 0.0008184919789136606, + "loss": 0.7124, + "step": 8213 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.0625, + "learning_rate": 0.0008182703934277184, + "loss": 0.831, + "step": 8214 + }, + { + "epoch": 0.5714981390657067, + "grad_norm": 1.1171875, + "learning_rate": 0.0008180488171690896, + "loss": 0.7193, + "step": 8215 + }, + { + "epoch": 0.5715677067028418, + "grad_norm": 1.2265625, + "learning_rate": 0.0008178272501490252, + "loss": 0.838, + "step": 8216 + }, + { + "epoch": 0.571637274339977, + "grad_norm": 1.5078125, + "learning_rate": 0.0008176056923787747, + "loss": 0.7169, + "step": 8217 + }, + { + "epoch": 0.5717068419771123, + "grad_norm": 0.90625, + "learning_rate": 0.0008173841438695879, + "loss": 0.7327, + "step": 8218 + }, + { + "epoch": 0.5717764096142475, + "grad_norm": 1.0625, + "learning_rate": 0.0008171626046327134, + "loss": 0.7963, + "step": 8219 + }, + { + "epoch": 0.5718459772513826, + "grad_norm": 1.109375, + "learning_rate": 0.0008169410746794005, + "loss": 0.9955, + "step": 8220 + }, + { + "epoch": 0.5719155448885178, + "grad_norm": 1.171875, + "learning_rate": 0.000816719554020897, + "loss": 0.8668, + "step": 8221 + }, + { + "epoch": 0.5719851125256531, + "grad_norm": 1.09375, + "learning_rate": 0.0008164980426684507, + "loss": 1.036, + "step": 8222 + }, + { + "epoch": 0.5720546801627883, + "grad_norm": 1.0078125, + "learning_rate": 0.0008162765406333093, + "loss": 0.9154, + "step": 8223 + }, + { + "epoch": 0.5721242477999234, + "grad_norm": 1.078125, + "learning_rate": 0.0008160550479267188, + "loss": 0.6411, + "step": 8224 + }, + { + "epoch": 0.5721938154370587, + "grad_norm": 1.078125, + "learning_rate": 0.0008158335645599262, + "loss": 0.7755, + "step": 8225 + }, + { + "epoch": 0.5722633830741939, + "grad_norm": 1.125, + "learning_rate": 0.0008156120905441762, + "loss": 0.941, + "step": 8226 + }, + { + "epoch": 0.5723329507113291, + "grad_norm": 0.95703125, + "learning_rate": 0.0008153906258907155, + "loss": 0.8045, + "step": 8227 + }, + { + "epoch": 0.5724025183484643, + "grad_norm": 1.09375, + "learning_rate": 0.000815169170610788, + "loss": 0.7237, + "step": 8228 + }, + { + "epoch": 0.5724720859855995, + "grad_norm": 1.09375, + "learning_rate": 0.0008149477247156387, + "loss": 0.8402, + "step": 8229 + }, + { + "epoch": 0.5725416536227347, + "grad_norm": 0.97265625, + "learning_rate": 0.0008147262882165109, + "loss": 0.7556, + "step": 8230 + }, + { + "epoch": 0.57261122125987, + "grad_norm": 0.984375, + "learning_rate": 0.0008145048611246484, + "loss": 0.8533, + "step": 8231 + }, + { + "epoch": 0.5726807888970051, + "grad_norm": 1.03125, + "learning_rate": 0.0008142834434512943, + "loss": 0.7094, + "step": 8232 + }, + { + "epoch": 0.5727503565341403, + "grad_norm": 1.0, + "learning_rate": 0.0008140620352076903, + "loss": 0.6124, + "step": 8233 + }, + { + "epoch": 0.5728199241712755, + "grad_norm": 1.3984375, + "learning_rate": 0.0008138406364050796, + "loss": 0.805, + "step": 8234 + }, + { + "epoch": 0.5728894918084108, + "grad_norm": 0.96484375, + "learning_rate": 0.0008136192470547027, + "loss": 0.6793, + "step": 8235 + }, + { + "epoch": 0.5729590594455459, + "grad_norm": 1.25, + "learning_rate": 0.0008133978671678013, + "loss": 1.1407, + "step": 8236 + }, + { + "epoch": 0.5730286270826811, + "grad_norm": 0.94921875, + "learning_rate": 0.0008131764967556154, + "loss": 0.6932, + "step": 8237 + }, + { + "epoch": 0.5730981947198164, + "grad_norm": 1.1484375, + "learning_rate": 0.0008129551358293853, + "loss": 0.6433, + "step": 8238 + }, + { + "epoch": 0.5731677623569515, + "grad_norm": 1.09375, + "learning_rate": 0.0008127337844003509, + "loss": 0.9065, + "step": 8239 + }, + { + "epoch": 0.5732373299940867, + "grad_norm": 1.40625, + "learning_rate": 0.0008125124424797506, + "loss": 0.8267, + "step": 8240 + }, + { + "epoch": 0.573306897631222, + "grad_norm": 1.3515625, + "learning_rate": 0.0008122911100788238, + "loss": 0.9824, + "step": 8241 + }, + { + "epoch": 0.5733764652683572, + "grad_norm": 1.1953125, + "learning_rate": 0.0008120697872088083, + "loss": 0.729, + "step": 8242 + }, + { + "epoch": 0.5734460329054923, + "grad_norm": 1.171875, + "learning_rate": 0.000811848473880942, + "loss": 0.7309, + "step": 8243 + }, + { + "epoch": 0.5735156005426276, + "grad_norm": 1.0703125, + "learning_rate": 0.0008116271701064612, + "loss": 0.8049, + "step": 8244 + }, + { + "epoch": 0.5735851681797628, + "grad_norm": 1.2421875, + "learning_rate": 0.0008114058758966037, + "loss": 1.1414, + "step": 8245 + }, + { + "epoch": 0.573654735816898, + "grad_norm": 1.4296875, + "learning_rate": 0.000811184591262605, + "loss": 0.9099, + "step": 8246 + }, + { + "epoch": 0.5737243034540331, + "grad_norm": 1.2578125, + "learning_rate": 0.000810963316215701, + "loss": 0.9187, + "step": 8247 + }, + { + "epoch": 0.5737938710911684, + "grad_norm": 0.86328125, + "learning_rate": 0.0008107420507671275, + "loss": 0.707, + "step": 8248 + }, + { + "epoch": 0.5738634387283036, + "grad_norm": 1.4296875, + "learning_rate": 0.0008105207949281184, + "loss": 0.9479, + "step": 8249 + }, + { + "epoch": 0.5739330063654388, + "grad_norm": 1.125, + "learning_rate": 0.0008102995487099085, + "loss": 0.6448, + "step": 8250 + }, + { + "epoch": 0.574002574002574, + "grad_norm": 1.859375, + "learning_rate": 0.0008100783121237308, + "loss": 0.7715, + "step": 8251 + }, + { + "epoch": 0.5740721416397092, + "grad_norm": 1.4453125, + "learning_rate": 0.0008098570851808194, + "loss": 1.0675, + "step": 8252 + }, + { + "epoch": 0.5741417092768444, + "grad_norm": 0.87890625, + "learning_rate": 0.000809635867892407, + "loss": 0.5275, + "step": 8253 + }, + { + "epoch": 0.5742112769139797, + "grad_norm": 1.1328125, + "learning_rate": 0.0008094146602697254, + "loss": 0.8431, + "step": 8254 + }, + { + "epoch": 0.5742808445511148, + "grad_norm": 0.8984375, + "learning_rate": 0.0008091934623240071, + "loss": 0.5103, + "step": 8255 + }, + { + "epoch": 0.57435041218825, + "grad_norm": 1.09375, + "learning_rate": 0.000808972274066483, + "loss": 0.8274, + "step": 8256 + }, + { + "epoch": 0.5744199798253853, + "grad_norm": 1.078125, + "learning_rate": 0.0008087510955083841, + "loss": 0.7134, + "step": 8257 + }, + { + "epoch": 0.5744895474625205, + "grad_norm": 1.1015625, + "learning_rate": 0.00080852992666094, + "loss": 0.9269, + "step": 8258 + }, + { + "epoch": 0.5745591150996556, + "grad_norm": 0.9921875, + "learning_rate": 0.0008083087675353816, + "loss": 0.7844, + "step": 8259 + }, + { + "epoch": 0.5746286827367908, + "grad_norm": 0.765625, + "learning_rate": 0.0008080876181429377, + "loss": 0.4759, + "step": 8260 + }, + { + "epoch": 0.5746982503739261, + "grad_norm": 0.8984375, + "learning_rate": 0.000807866478494837, + "loss": 0.9786, + "step": 8261 + }, + { + "epoch": 0.5747678180110612, + "grad_norm": 1.1953125, + "learning_rate": 0.0008076453486023087, + "loss": 0.7311, + "step": 8262 + }, + { + "epoch": 0.5748373856481964, + "grad_norm": 1.0859375, + "learning_rate": 0.0008074242284765796, + "loss": 0.6925, + "step": 8263 + }, + { + "epoch": 0.5749069532853317, + "grad_norm": 0.828125, + "learning_rate": 0.0008072031181288779, + "loss": 0.7691, + "step": 8264 + }, + { + "epoch": 0.5749765209224669, + "grad_norm": 1.265625, + "learning_rate": 0.0008069820175704293, + "loss": 0.8411, + "step": 8265 + }, + { + "epoch": 0.575046088559602, + "grad_norm": 0.98828125, + "learning_rate": 0.0008067609268124617, + "loss": 0.5944, + "step": 8266 + }, + { + "epoch": 0.5751156561967373, + "grad_norm": 1.171875, + "learning_rate": 0.0008065398458662001, + "loss": 1.019, + "step": 8267 + }, + { + "epoch": 0.5751852238338725, + "grad_norm": 1.2109375, + "learning_rate": 0.0008063187747428698, + "loss": 0.8377, + "step": 8268 + }, + { + "epoch": 0.5752547914710077, + "grad_norm": 1.0703125, + "learning_rate": 0.0008060977134536961, + "loss": 0.6803, + "step": 8269 + }, + { + "epoch": 0.575324359108143, + "grad_norm": 1.0078125, + "learning_rate": 0.0008058766620099031, + "loss": 0.7064, + "step": 8270 + }, + { + "epoch": 0.5753939267452781, + "grad_norm": 1.1328125, + "learning_rate": 0.000805655620422715, + "loss": 0.9263, + "step": 8271 + }, + { + "epoch": 0.5754634943824133, + "grad_norm": 1.390625, + "learning_rate": 0.0008054345887033542, + "loss": 0.8418, + "step": 8272 + }, + { + "epoch": 0.5755330620195485, + "grad_norm": 1.078125, + "learning_rate": 0.000805213566863045, + "loss": 0.7247, + "step": 8273 + }, + { + "epoch": 0.5756026296566837, + "grad_norm": 1.0859375, + "learning_rate": 0.0008049925549130089, + "loss": 0.8379, + "step": 8274 + }, + { + "epoch": 0.5756721972938189, + "grad_norm": 1.09375, + "learning_rate": 0.0008047715528644677, + "loss": 0.8124, + "step": 8275 + }, + { + "epoch": 0.5757417649309541, + "grad_norm": 1.0859375, + "learning_rate": 0.0008045505607286434, + "loss": 0.7437, + "step": 8276 + }, + { + "epoch": 0.5758113325680894, + "grad_norm": 1.0703125, + "learning_rate": 0.0008043295785167563, + "loss": 0.7772, + "step": 8277 + }, + { + "epoch": 0.5758809002052245, + "grad_norm": 0.84375, + "learning_rate": 0.000804108606240027, + "loss": 0.5885, + "step": 8278 + }, + { + "epoch": 0.5759504678423597, + "grad_norm": 1.0625, + "learning_rate": 0.000803887643909675, + "loss": 0.7219, + "step": 8279 + }, + { + "epoch": 0.576020035479495, + "grad_norm": 1.03125, + "learning_rate": 0.0008036666915369205, + "loss": 0.6723, + "step": 8280 + }, + { + "epoch": 0.5760896031166302, + "grad_norm": 1.03125, + "learning_rate": 0.0008034457491329816, + "loss": 0.5599, + "step": 8281 + }, + { + "epoch": 0.5761591707537653, + "grad_norm": 1.140625, + "learning_rate": 0.0008032248167090765, + "loss": 1.0868, + "step": 8282 + }, + { + "epoch": 0.5762287383909006, + "grad_norm": 1.078125, + "learning_rate": 0.0008030038942764239, + "loss": 0.9291, + "step": 8283 + }, + { + "epoch": 0.5762983060280358, + "grad_norm": 1.46875, + "learning_rate": 0.0008027829818462405, + "loss": 0.8722, + "step": 8284 + }, + { + "epoch": 0.576367873665171, + "grad_norm": 1.140625, + "learning_rate": 0.0008025620794297431, + "loss": 0.8945, + "step": 8285 + }, + { + "epoch": 0.5764374413023061, + "grad_norm": 1.3125, + "learning_rate": 0.000802341187038148, + "loss": 0.783, + "step": 8286 + }, + { + "epoch": 0.5765070089394414, + "grad_norm": 1.15625, + "learning_rate": 0.0008021203046826716, + "loss": 1.0315, + "step": 8287 + }, + { + "epoch": 0.5765765765765766, + "grad_norm": 1.078125, + "learning_rate": 0.0008018994323745284, + "loss": 0.8479, + "step": 8288 + }, + { + "epoch": 0.5766461442137117, + "grad_norm": 1.046875, + "learning_rate": 0.0008016785701249334, + "loss": 0.8675, + "step": 8289 + }, + { + "epoch": 0.576715711850847, + "grad_norm": 1.234375, + "learning_rate": 0.0008014577179451015, + "loss": 0.718, + "step": 8290 + }, + { + "epoch": 0.5767852794879822, + "grad_norm": 1.046875, + "learning_rate": 0.0008012368758462456, + "loss": 0.8693, + "step": 8291 + }, + { + "epoch": 0.5768548471251174, + "grad_norm": 0.99609375, + "learning_rate": 0.0008010160438395794, + "loss": 0.7722, + "step": 8292 + }, + { + "epoch": 0.5769244147622526, + "grad_norm": 1.03125, + "learning_rate": 0.0008007952219363152, + "loss": 0.6732, + "step": 8293 + }, + { + "epoch": 0.5769939823993878, + "grad_norm": 1.2265625, + "learning_rate": 0.0008005744101476661, + "loss": 0.6754, + "step": 8294 + }, + { + "epoch": 0.577063550036523, + "grad_norm": 1.25, + "learning_rate": 0.0008003536084848431, + "loss": 0.8421, + "step": 8295 + }, + { + "epoch": 0.5771331176736583, + "grad_norm": 1.234375, + "learning_rate": 0.0008001328169590571, + "loss": 0.9898, + "step": 8296 + }, + { + "epoch": 0.5772026853107934, + "grad_norm": 1.109375, + "learning_rate": 0.0007999120355815197, + "loss": 0.9763, + "step": 8297 + }, + { + "epoch": 0.5772722529479286, + "grad_norm": 1.171875, + "learning_rate": 0.0007996912643634409, + "loss": 0.7011, + "step": 8298 + }, + { + "epoch": 0.5773418205850638, + "grad_norm": 1.3046875, + "learning_rate": 0.0007994705033160296, + "loss": 0.9451, + "step": 8299 + }, + { + "epoch": 0.5774113882221991, + "grad_norm": 1.21875, + "learning_rate": 0.0007992497524504954, + "loss": 1.1057, + "step": 8300 + }, + { + "epoch": 0.5774809558593342, + "grad_norm": 1.1796875, + "learning_rate": 0.0007990290117780472, + "loss": 0.9615, + "step": 8301 + }, + { + "epoch": 0.5775505234964694, + "grad_norm": 1.3984375, + "learning_rate": 0.0007988082813098927, + "loss": 1.1864, + "step": 8302 + }, + { + "epoch": 0.5776200911336047, + "grad_norm": 0.828125, + "learning_rate": 0.0007985875610572393, + "loss": 0.6633, + "step": 8303 + }, + { + "epoch": 0.5776896587707399, + "grad_norm": 0.92578125, + "learning_rate": 0.0007983668510312947, + "loss": 0.543, + "step": 8304 + }, + { + "epoch": 0.577759226407875, + "grad_norm": 0.9609375, + "learning_rate": 0.0007981461512432652, + "loss": 0.4702, + "step": 8305 + }, + { + "epoch": 0.5778287940450103, + "grad_norm": 1.2578125, + "learning_rate": 0.0007979254617043565, + "loss": 1.0051, + "step": 8306 + }, + { + "epoch": 0.5778983616821455, + "grad_norm": 1.1640625, + "learning_rate": 0.0007977047824257741, + "loss": 0.9822, + "step": 8307 + }, + { + "epoch": 0.5779679293192806, + "grad_norm": 1.0390625, + "learning_rate": 0.0007974841134187236, + "loss": 0.8998, + "step": 8308 + }, + { + "epoch": 0.5780374969564159, + "grad_norm": 1.109375, + "learning_rate": 0.000797263454694409, + "loss": 0.7201, + "step": 8309 + }, + { + "epoch": 0.5781070645935511, + "grad_norm": 1.0390625, + "learning_rate": 0.0007970428062640345, + "loss": 0.6195, + "step": 8310 + }, + { + "epoch": 0.5781766322306863, + "grad_norm": 1.1640625, + "learning_rate": 0.0007968221681388026, + "loss": 0.9949, + "step": 8311 + }, + { + "epoch": 0.5782461998678214, + "grad_norm": 1.1640625, + "learning_rate": 0.0007966015403299175, + "loss": 0.9154, + "step": 8312 + }, + { + "epoch": 0.5783157675049567, + "grad_norm": 1.3046875, + "learning_rate": 0.0007963809228485807, + "loss": 0.7892, + "step": 8313 + }, + { + "epoch": 0.5783853351420919, + "grad_norm": 1.21875, + "learning_rate": 0.0007961603157059943, + "loss": 0.6663, + "step": 8314 + }, + { + "epoch": 0.5784549027792271, + "grad_norm": 0.984375, + "learning_rate": 0.00079593971891336, + "loss": 0.5892, + "step": 8315 + }, + { + "epoch": 0.5785244704163623, + "grad_norm": 1.125, + "learning_rate": 0.0007957191324818781, + "loss": 0.8504, + "step": 8316 + }, + { + "epoch": 0.5785940380534975, + "grad_norm": 1.0234375, + "learning_rate": 0.0007954985564227489, + "loss": 0.7952, + "step": 8317 + }, + { + "epoch": 0.5786636056906327, + "grad_norm": 1.3203125, + "learning_rate": 0.000795277990747172, + "loss": 0.8688, + "step": 8318 + }, + { + "epoch": 0.578733173327768, + "grad_norm": 1.4609375, + "learning_rate": 0.0007950574354663474, + "loss": 1.0818, + "step": 8319 + }, + { + "epoch": 0.5788027409649031, + "grad_norm": 1.1171875, + "learning_rate": 0.0007948368905914729, + "loss": 0.8581, + "step": 8320 + }, + { + "epoch": 0.5788723086020383, + "grad_norm": 1.1875, + "learning_rate": 0.0007946163561337468, + "loss": 0.6678, + "step": 8321 + }, + { + "epoch": 0.5789418762391736, + "grad_norm": 1.171875, + "learning_rate": 0.0007943958321043674, + "loss": 0.8144, + "step": 8322 + }, + { + "epoch": 0.5790114438763088, + "grad_norm": 1.234375, + "learning_rate": 0.0007941753185145312, + "loss": 0.7094, + "step": 8323 + }, + { + "epoch": 0.5790810115134439, + "grad_norm": 0.94921875, + "learning_rate": 0.0007939548153754347, + "loss": 0.7981, + "step": 8324 + }, + { + "epoch": 0.5791505791505791, + "grad_norm": 1.0078125, + "learning_rate": 0.0007937343226982741, + "loss": 0.8511, + "step": 8325 + }, + { + "epoch": 0.5792201467877144, + "grad_norm": 1.34375, + "learning_rate": 0.0007935138404942452, + "loss": 0.8673, + "step": 8326 + }, + { + "epoch": 0.5792897144248496, + "grad_norm": 1.28125, + "learning_rate": 0.0007932933687745426, + "loss": 0.8772, + "step": 8327 + }, + { + "epoch": 0.5793592820619847, + "grad_norm": 1.0078125, + "learning_rate": 0.0007930729075503606, + "loss": 0.6199, + "step": 8328 + }, + { + "epoch": 0.57942884969912, + "grad_norm": 1.171875, + "learning_rate": 0.0007928524568328936, + "loss": 0.8623, + "step": 8329 + }, + { + "epoch": 0.5794984173362552, + "grad_norm": 1.203125, + "learning_rate": 0.0007926320166333349, + "loss": 1.0028, + "step": 8330 + }, + { + "epoch": 0.5795679849733903, + "grad_norm": 1.1484375, + "learning_rate": 0.0007924115869628771, + "loss": 0.7935, + "step": 8331 + }, + { + "epoch": 0.5796375526105256, + "grad_norm": 1.234375, + "learning_rate": 0.0007921911678327123, + "loss": 0.9247, + "step": 8332 + }, + { + "epoch": 0.5797071202476608, + "grad_norm": 1.2109375, + "learning_rate": 0.0007919707592540329, + "loss": 0.8797, + "step": 8333 + }, + { + "epoch": 0.579776687884796, + "grad_norm": 1.2109375, + "learning_rate": 0.0007917503612380298, + "loss": 0.8182, + "step": 8334 + }, + { + "epoch": 0.5798462555219313, + "grad_norm": 1.046875, + "learning_rate": 0.0007915299737958933, + "loss": 0.8666, + "step": 8335 + }, + { + "epoch": 0.5799158231590664, + "grad_norm": 1.6953125, + "learning_rate": 0.0007913095969388143, + "loss": 0.9228, + "step": 8336 + }, + { + "epoch": 0.5799853907962016, + "grad_norm": 1.53125, + "learning_rate": 0.0007910892306779822, + "loss": 0.9564, + "step": 8337 + }, + { + "epoch": 0.5800549584333368, + "grad_norm": 1.0703125, + "learning_rate": 0.0007908688750245858, + "loss": 0.8895, + "step": 8338 + }, + { + "epoch": 0.580124526070472, + "grad_norm": 1.390625, + "learning_rate": 0.0007906485299898137, + "loss": 0.8285, + "step": 8339 + }, + { + "epoch": 0.5801940937076072, + "grad_norm": 1.3046875, + "learning_rate": 0.0007904281955848543, + "loss": 0.8271, + "step": 8340 + }, + { + "epoch": 0.5802636613447424, + "grad_norm": 1.1328125, + "learning_rate": 0.0007902078718208947, + "loss": 0.6401, + "step": 8341 + }, + { + "epoch": 0.5803332289818777, + "grad_norm": 1.0078125, + "learning_rate": 0.0007899875587091216, + "loss": 0.8646, + "step": 8342 + }, + { + "epoch": 0.5804027966190128, + "grad_norm": 1.3515625, + "learning_rate": 0.0007897672562607221, + "loss": 0.8125, + "step": 8343 + }, + { + "epoch": 0.580472364256148, + "grad_norm": 1.0859375, + "learning_rate": 0.0007895469644868819, + "loss": 0.7378, + "step": 8344 + }, + { + "epoch": 0.5805419318932833, + "grad_norm": 1.328125, + "learning_rate": 0.0007893266833987857, + "loss": 1.0015, + "step": 8345 + }, + { + "epoch": 0.5806114995304185, + "grad_norm": 1.0625, + "learning_rate": 0.0007891064130076187, + "loss": 0.6859, + "step": 8346 + }, + { + "epoch": 0.5806810671675536, + "grad_norm": 1.4375, + "learning_rate": 0.0007888861533245652, + "loss": 0.9137, + "step": 8347 + }, + { + "epoch": 0.5807506348046888, + "grad_norm": 1.265625, + "learning_rate": 0.0007886659043608086, + "loss": 0.912, + "step": 8348 + }, + { + "epoch": 0.5808202024418241, + "grad_norm": 1.09375, + "learning_rate": 0.0007884456661275321, + "loss": 0.8555, + "step": 8349 + }, + { + "epoch": 0.5808897700789593, + "grad_norm": 1.234375, + "learning_rate": 0.0007882254386359184, + "loss": 1.044, + "step": 8350 + }, + { + "epoch": 0.5809593377160944, + "grad_norm": 1.0, + "learning_rate": 0.0007880052218971499, + "loss": 0.5766, + "step": 8351 + }, + { + "epoch": 0.5810289053532297, + "grad_norm": 1.0546875, + "learning_rate": 0.0007877850159224073, + "loss": 0.6393, + "step": 8352 + }, + { + "epoch": 0.5810984729903649, + "grad_norm": 1.0859375, + "learning_rate": 0.0007875648207228719, + "loss": 0.8741, + "step": 8353 + }, + { + "epoch": 0.5811680406275, + "grad_norm": 0.83984375, + "learning_rate": 0.0007873446363097246, + "loss": 0.7753, + "step": 8354 + }, + { + "epoch": 0.5812376082646353, + "grad_norm": 1.03125, + "learning_rate": 0.0007871244626941444, + "loss": 0.9385, + "step": 8355 + }, + { + "epoch": 0.5813071759017705, + "grad_norm": 1.0859375, + "learning_rate": 0.0007869042998873108, + "loss": 0.9722, + "step": 8356 + }, + { + "epoch": 0.5813767435389057, + "grad_norm": 1.0859375, + "learning_rate": 0.0007866841479004032, + "loss": 1.0156, + "step": 8357 + }, + { + "epoch": 0.581446311176041, + "grad_norm": 1.0078125, + "learning_rate": 0.0007864640067445994, + "loss": 1.0329, + "step": 8358 + }, + { + "epoch": 0.5815158788131761, + "grad_norm": 1.4296875, + "learning_rate": 0.0007862438764310769, + "loss": 0.8019, + "step": 8359 + }, + { + "epoch": 0.5815854464503113, + "grad_norm": 1.1796875, + "learning_rate": 0.0007860237569710127, + "loss": 0.8743, + "step": 8360 + }, + { + "epoch": 0.5816550140874465, + "grad_norm": 0.859375, + "learning_rate": 0.0007858036483755842, + "loss": 0.5349, + "step": 8361 + }, + { + "epoch": 0.5817245817245817, + "grad_norm": 0.98046875, + "learning_rate": 0.0007855835506559663, + "loss": 0.9931, + "step": 8362 + }, + { + "epoch": 0.5817941493617169, + "grad_norm": 0.96484375, + "learning_rate": 0.0007853634638233349, + "loss": 0.8545, + "step": 8363 + }, + { + "epoch": 0.5818637169988521, + "grad_norm": 1.2265625, + "learning_rate": 0.0007851433878888652, + "loss": 0.899, + "step": 8364 + }, + { + "epoch": 0.5819332846359874, + "grad_norm": 0.92578125, + "learning_rate": 0.0007849233228637315, + "loss": 0.7024, + "step": 8365 + }, + { + "epoch": 0.5820028522731225, + "grad_norm": 1.03125, + "learning_rate": 0.0007847032687591072, + "loss": 0.8722, + "step": 8366 + }, + { + "epoch": 0.5820724199102577, + "grad_norm": 1.09375, + "learning_rate": 0.0007844832255861654, + "loss": 1.013, + "step": 8367 + }, + { + "epoch": 0.582141987547393, + "grad_norm": 0.80859375, + "learning_rate": 0.0007842631933560794, + "loss": 0.7286, + "step": 8368 + }, + { + "epoch": 0.5822115551845282, + "grad_norm": 1.140625, + "learning_rate": 0.0007840431720800212, + "loss": 0.8038, + "step": 8369 + }, + { + "epoch": 0.5822811228216633, + "grad_norm": 1.34375, + "learning_rate": 0.000783823161769162, + "loss": 0.8539, + "step": 8370 + }, + { + "epoch": 0.5823506904587986, + "grad_norm": 1.4140625, + "learning_rate": 0.0007836031624346731, + "loss": 0.7812, + "step": 8371 + }, + { + "epoch": 0.5824202580959338, + "grad_norm": 1.2109375, + "learning_rate": 0.000783383174087725, + "loss": 0.901, + "step": 8372 + }, + { + "epoch": 0.582489825733069, + "grad_norm": 1.265625, + "learning_rate": 0.0007831631967394876, + "loss": 0.7741, + "step": 8373 + }, + { + "epoch": 0.5825593933702041, + "grad_norm": 1.09375, + "learning_rate": 0.0007829432304011297, + "loss": 0.886, + "step": 8374 + }, + { + "epoch": 0.5826289610073394, + "grad_norm": 1.203125, + "learning_rate": 0.0007827232750838207, + "loss": 0.8501, + "step": 8375 + }, + { + "epoch": 0.5826985286444746, + "grad_norm": 1.1484375, + "learning_rate": 0.0007825033307987289, + "loss": 0.9736, + "step": 8376 + }, + { + "epoch": 0.5827680962816097, + "grad_norm": 1.1171875, + "learning_rate": 0.0007822833975570213, + "loss": 0.9093, + "step": 8377 + }, + { + "epoch": 0.582837663918745, + "grad_norm": 1.0234375, + "learning_rate": 0.0007820634753698656, + "loss": 1.0385, + "step": 8378 + }, + { + "epoch": 0.5829072315558802, + "grad_norm": 0.99609375, + "learning_rate": 0.0007818435642484283, + "loss": 0.8872, + "step": 8379 + }, + { + "epoch": 0.5829767991930154, + "grad_norm": 0.9765625, + "learning_rate": 0.000781623664203875, + "loss": 0.7607, + "step": 8380 + }, + { + "epoch": 0.5830463668301507, + "grad_norm": 0.9140625, + "learning_rate": 0.0007814037752473711, + "loss": 0.6788, + "step": 8381 + }, + { + "epoch": 0.5831159344672858, + "grad_norm": 1.0078125, + "learning_rate": 0.000781183897390082, + "loss": 0.7802, + "step": 8382 + }, + { + "epoch": 0.583185502104421, + "grad_norm": 1.1015625, + "learning_rate": 0.0007809640306431718, + "loss": 0.699, + "step": 8383 + }, + { + "epoch": 0.5832550697415563, + "grad_norm": 0.93359375, + "learning_rate": 0.000780744175017804, + "loss": 0.688, + "step": 8384 + }, + { + "epoch": 0.5833246373786914, + "grad_norm": 1.0859375, + "learning_rate": 0.0007805243305251415, + "loss": 0.8424, + "step": 8385 + }, + { + "epoch": 0.5833942050158266, + "grad_norm": 1.453125, + "learning_rate": 0.0007803044971763477, + "loss": 0.9309, + "step": 8386 + }, + { + "epoch": 0.5834637726529618, + "grad_norm": 1.0625, + "learning_rate": 0.0007800846749825842, + "loss": 0.7958, + "step": 8387 + }, + { + "epoch": 0.5835333402900971, + "grad_norm": 1.03125, + "learning_rate": 0.000779864863955012, + "loss": 0.7098, + "step": 8388 + }, + { + "epoch": 0.5836029079272322, + "grad_norm": 1.0625, + "learning_rate": 0.0007796450641047928, + "loss": 0.9335, + "step": 8389 + }, + { + "epoch": 0.5836724755643674, + "grad_norm": 1.078125, + "learning_rate": 0.0007794252754430866, + "loss": 1.0127, + "step": 8390 + }, + { + "epoch": 0.5837420432015027, + "grad_norm": 1.078125, + "learning_rate": 0.0007792054979810531, + "loss": 0.8231, + "step": 8391 + }, + { + "epoch": 0.5838116108386379, + "grad_norm": 1.359375, + "learning_rate": 0.0007789857317298512, + "loss": 0.829, + "step": 8392 + }, + { + "epoch": 0.583881178475773, + "grad_norm": 1.171875, + "learning_rate": 0.0007787659767006403, + "loss": 0.9883, + "step": 8393 + }, + { + "epoch": 0.5839507461129083, + "grad_norm": 0.9921875, + "learning_rate": 0.0007785462329045779, + "loss": 0.8823, + "step": 8394 + }, + { + "epoch": 0.5840203137500435, + "grad_norm": 0.859375, + "learning_rate": 0.0007783265003528212, + "loss": 0.6567, + "step": 8395 + }, + { + "epoch": 0.5840898813871787, + "grad_norm": 1.015625, + "learning_rate": 0.0007781067790565278, + "loss": 0.6483, + "step": 8396 + }, + { + "epoch": 0.5841594490243139, + "grad_norm": 0.9921875, + "learning_rate": 0.000777887069026854, + "loss": 0.8813, + "step": 8397 + }, + { + "epoch": 0.5842290166614491, + "grad_norm": 1.0703125, + "learning_rate": 0.000777667370274955, + "loss": 0.9875, + "step": 8398 + }, + { + "epoch": 0.5842985842985843, + "grad_norm": 0.9453125, + "learning_rate": 0.0007774476828119861, + "loss": 0.554, + "step": 8399 + }, + { + "epoch": 0.5843681519357194, + "grad_norm": 1.1640625, + "learning_rate": 0.0007772280066491024, + "loss": 1.1132, + "step": 8400 + }, + { + "epoch": 0.5844377195728547, + "grad_norm": 1.3359375, + "learning_rate": 0.0007770083417974578, + "loss": 0.9085, + "step": 8401 + }, + { + "epoch": 0.5845072872099899, + "grad_norm": 0.93359375, + "learning_rate": 0.0007767886882682053, + "loss": 0.6959, + "step": 8402 + }, + { + "epoch": 0.5845768548471251, + "grad_norm": 1.0703125, + "learning_rate": 0.0007765690460724982, + "loss": 0.5727, + "step": 8403 + }, + { + "epoch": 0.5846464224842604, + "grad_norm": 1.2421875, + "learning_rate": 0.0007763494152214892, + "loss": 0.8546, + "step": 8404 + }, + { + "epoch": 0.5847159901213955, + "grad_norm": 0.9140625, + "learning_rate": 0.0007761297957263291, + "loss": 0.8025, + "step": 8405 + }, + { + "epoch": 0.5847855577585307, + "grad_norm": 1.640625, + "learning_rate": 0.0007759101875981695, + "loss": 1.1985, + "step": 8406 + }, + { + "epoch": 0.584855125395666, + "grad_norm": 1.4140625, + "learning_rate": 0.0007756905908481615, + "loss": 0.8312, + "step": 8407 + }, + { + "epoch": 0.5849246930328011, + "grad_norm": 1.2734375, + "learning_rate": 0.0007754710054874548, + "loss": 0.8647, + "step": 8408 + }, + { + "epoch": 0.5849942606699363, + "grad_norm": 1.0546875, + "learning_rate": 0.0007752514315271981, + "loss": 0.8611, + "step": 8409 + }, + { + "epoch": 0.5850638283070716, + "grad_norm": 1.1484375, + "learning_rate": 0.0007750318689785413, + "loss": 0.7379, + "step": 8410 + }, + { + "epoch": 0.5851333959442068, + "grad_norm": 1.2421875, + "learning_rate": 0.0007748123178526324, + "loss": 0.6662, + "step": 8411 + }, + { + "epoch": 0.5852029635813419, + "grad_norm": 1.390625, + "learning_rate": 0.0007745927781606188, + "loss": 1.269, + "step": 8412 + }, + { + "epoch": 0.5852725312184771, + "grad_norm": 0.99609375, + "learning_rate": 0.0007743732499136476, + "loss": 0.7875, + "step": 8413 + }, + { + "epoch": 0.5853420988556124, + "grad_norm": 1.015625, + "learning_rate": 0.0007741537331228657, + "loss": 0.6835, + "step": 8414 + }, + { + "epoch": 0.5854116664927476, + "grad_norm": 1.25, + "learning_rate": 0.000773934227799419, + "loss": 0.877, + "step": 8415 + }, + { + "epoch": 0.5854812341298827, + "grad_norm": 1.140625, + "learning_rate": 0.0007737147339544526, + "loss": 0.9002, + "step": 8416 + }, + { + "epoch": 0.585550801767018, + "grad_norm": 0.9765625, + "learning_rate": 0.0007734952515991114, + "loss": 0.918, + "step": 8417 + }, + { + "epoch": 0.5856203694041532, + "grad_norm": 1.203125, + "learning_rate": 0.00077327578074454, + "loss": 0.9253, + "step": 8418 + }, + { + "epoch": 0.5856899370412884, + "grad_norm": 1.2421875, + "learning_rate": 0.0007730563214018814, + "loss": 1.0449, + "step": 8419 + }, + { + "epoch": 0.5857595046784236, + "grad_norm": 1.5, + "learning_rate": 0.0007728368735822787, + "loss": 0.6915, + "step": 8420 + }, + { + "epoch": 0.5858290723155588, + "grad_norm": 1.328125, + "learning_rate": 0.0007726174372968748, + "loss": 1.0598, + "step": 8421 + }, + { + "epoch": 0.585898639952694, + "grad_norm": 0.91015625, + "learning_rate": 0.0007723980125568116, + "loss": 0.7353, + "step": 8422 + }, + { + "epoch": 0.5859682075898293, + "grad_norm": 1.0390625, + "learning_rate": 0.0007721785993732296, + "loss": 0.8202, + "step": 8423 + }, + { + "epoch": 0.5860377752269644, + "grad_norm": 1.109375, + "learning_rate": 0.0007719591977572704, + "loss": 0.7018, + "step": 8424 + }, + { + "epoch": 0.5861073428640996, + "grad_norm": 1.0, + "learning_rate": 0.0007717398077200738, + "loss": 0.6593, + "step": 8425 + }, + { + "epoch": 0.5861769105012348, + "grad_norm": 1.1171875, + "learning_rate": 0.0007715204292727791, + "loss": 0.962, + "step": 8426 + }, + { + "epoch": 0.58624647813837, + "grad_norm": 1.125, + "learning_rate": 0.0007713010624265251, + "loss": 0.7747, + "step": 8427 + }, + { + "epoch": 0.5863160457755052, + "grad_norm": 1.203125, + "learning_rate": 0.0007710817071924507, + "loss": 0.7734, + "step": 8428 + }, + { + "epoch": 0.5863856134126404, + "grad_norm": 1.109375, + "learning_rate": 0.0007708623635816936, + "loss": 0.7227, + "step": 8429 + }, + { + "epoch": 0.5864551810497757, + "grad_norm": 1.0390625, + "learning_rate": 0.0007706430316053903, + "loss": 0.841, + "step": 8430 + }, + { + "epoch": 0.5865247486869108, + "grad_norm": 0.90625, + "learning_rate": 0.0007704237112746779, + "loss": 0.5351, + "step": 8431 + }, + { + "epoch": 0.586594316324046, + "grad_norm": 1.171875, + "learning_rate": 0.0007702044026006927, + "loss": 0.6932, + "step": 8432 + }, + { + "epoch": 0.5866638839611813, + "grad_norm": 1.1171875, + "learning_rate": 0.0007699851055945693, + "loss": 0.8046, + "step": 8433 + }, + { + "epoch": 0.5867334515983165, + "grad_norm": 0.96875, + "learning_rate": 0.0007697658202674427, + "loss": 0.8083, + "step": 8434 + }, + { + "epoch": 0.5868030192354516, + "grad_norm": 0.84765625, + "learning_rate": 0.0007695465466304476, + "loss": 0.7808, + "step": 8435 + }, + { + "epoch": 0.5868725868725869, + "grad_norm": 1.2890625, + "learning_rate": 0.0007693272846947173, + "loss": 1.0373, + "step": 8436 + }, + { + "epoch": 0.5869421545097221, + "grad_norm": 1.1640625, + "learning_rate": 0.0007691080344713845, + "loss": 0.923, + "step": 8437 + }, + { + "epoch": 0.5870117221468573, + "grad_norm": 1.2578125, + "learning_rate": 0.0007688887959715823, + "loss": 0.8264, + "step": 8438 + }, + { + "epoch": 0.5870812897839924, + "grad_norm": 1.0, + "learning_rate": 0.0007686695692064419, + "loss": 0.9924, + "step": 8439 + }, + { + "epoch": 0.5871508574211277, + "grad_norm": 1.03125, + "learning_rate": 0.0007684503541870952, + "loss": 0.7213, + "step": 8440 + }, + { + "epoch": 0.5872204250582629, + "grad_norm": 1.625, + "learning_rate": 0.0007682311509246719, + "loss": 0.9749, + "step": 8441 + }, + { + "epoch": 0.587289992695398, + "grad_norm": 1.453125, + "learning_rate": 0.0007680119594303028, + "loss": 0.9688, + "step": 8442 + }, + { + "epoch": 0.5873595603325333, + "grad_norm": 1.0859375, + "learning_rate": 0.0007677927797151172, + "loss": 0.7979, + "step": 8443 + }, + { + "epoch": 0.5874291279696685, + "grad_norm": 1.3515625, + "learning_rate": 0.0007675736117902435, + "loss": 0.8144, + "step": 8444 + }, + { + "epoch": 0.5874986956068037, + "grad_norm": 1.109375, + "learning_rate": 0.0007673544556668104, + "loss": 0.937, + "step": 8445 + }, + { + "epoch": 0.587568263243939, + "grad_norm": 1.0546875, + "learning_rate": 0.0007671353113559455, + "loss": 0.8635, + "step": 8446 + }, + { + "epoch": 0.5876378308810741, + "grad_norm": 1.203125, + "learning_rate": 0.000766916178868776, + "loss": 0.7995, + "step": 8447 + }, + { + "epoch": 0.5877073985182093, + "grad_norm": 1.15625, + "learning_rate": 0.0007666970582164277, + "loss": 1.0202, + "step": 8448 + }, + { + "epoch": 0.5877769661553446, + "grad_norm": 0.984375, + "learning_rate": 0.0007664779494100269, + "loss": 0.8607, + "step": 8449 + }, + { + "epoch": 0.5878465337924798, + "grad_norm": 1.0, + "learning_rate": 0.0007662588524606992, + "loss": 0.8812, + "step": 8450 + }, + { + "epoch": 0.5879161014296149, + "grad_norm": 1.5390625, + "learning_rate": 0.000766039767379568, + "loss": 0.9578, + "step": 8451 + }, + { + "epoch": 0.5879856690667501, + "grad_norm": 1.234375, + "learning_rate": 0.0007658206941777591, + "loss": 0.663, + "step": 8452 + }, + { + "epoch": 0.5880552367038854, + "grad_norm": 1.296875, + "learning_rate": 0.0007656016328663944, + "loss": 0.7829, + "step": 8453 + }, + { + "epoch": 0.5881248043410205, + "grad_norm": 1.1328125, + "learning_rate": 0.0007653825834565977, + "loss": 0.6719, + "step": 8454 + }, + { + "epoch": 0.5881943719781557, + "grad_norm": 0.7734375, + "learning_rate": 0.0007651635459594905, + "loss": 0.6089, + "step": 8455 + }, + { + "epoch": 0.588263939615291, + "grad_norm": 1.1484375, + "learning_rate": 0.000764944520386195, + "loss": 0.8261, + "step": 8456 + }, + { + "epoch": 0.5883335072524262, + "grad_norm": 1.203125, + "learning_rate": 0.0007647255067478321, + "loss": 0.7335, + "step": 8457 + }, + { + "epoch": 0.5884030748895613, + "grad_norm": 1.21875, + "learning_rate": 0.0007645065050555216, + "loss": 0.6148, + "step": 8458 + }, + { + "epoch": 0.5884726425266966, + "grad_norm": 1.3046875, + "learning_rate": 0.0007642875153203843, + "loss": 1.1661, + "step": 8459 + }, + { + "epoch": 0.5885422101638318, + "grad_norm": 1.21875, + "learning_rate": 0.0007640685375535388, + "loss": 0.7649, + "step": 8460 + }, + { + "epoch": 0.588611777800967, + "grad_norm": 1.0234375, + "learning_rate": 0.0007638495717661038, + "loss": 0.7276, + "step": 8461 + }, + { + "epoch": 0.5886813454381022, + "grad_norm": 1.0546875, + "learning_rate": 0.0007636306179691969, + "loss": 0.8225, + "step": 8462 + }, + { + "epoch": 0.5887509130752374, + "grad_norm": 0.94921875, + "learning_rate": 0.0007634116761739362, + "loss": 0.6969, + "step": 8463 + }, + { + "epoch": 0.5888204807123726, + "grad_norm": 1.109375, + "learning_rate": 0.0007631927463914382, + "loss": 0.7609, + "step": 8464 + }, + { + "epoch": 0.5888900483495078, + "grad_norm": 1.234375, + "learning_rate": 0.0007629738286328187, + "loss": 0.6378, + "step": 8465 + }, + { + "epoch": 0.588959615986643, + "grad_norm": 1.0390625, + "learning_rate": 0.0007627549229091932, + "loss": 0.9368, + "step": 8466 + }, + { + "epoch": 0.5890291836237782, + "grad_norm": 1.0234375, + "learning_rate": 0.0007625360292316773, + "loss": 0.7146, + "step": 8467 + }, + { + "epoch": 0.5890987512609134, + "grad_norm": 1.1640625, + "learning_rate": 0.000762317147611385, + "loss": 0.8827, + "step": 8468 + }, + { + "epoch": 0.5891683188980487, + "grad_norm": 0.9609375, + "learning_rate": 0.0007620982780594297, + "loss": 0.7071, + "step": 8469 + }, + { + "epoch": 0.5892378865351838, + "grad_norm": 1.140625, + "learning_rate": 0.0007618794205869247, + "loss": 0.8249, + "step": 8470 + }, + { + "epoch": 0.589307454172319, + "grad_norm": 1.015625, + "learning_rate": 0.0007616605752049827, + "loss": 0.5886, + "step": 8471 + }, + { + "epoch": 0.5893770218094543, + "grad_norm": 0.8984375, + "learning_rate": 0.0007614417419247155, + "loss": 0.6516, + "step": 8472 + }, + { + "epoch": 0.5894465894465895, + "grad_norm": 1.0546875, + "learning_rate": 0.0007612229207572337, + "loss": 0.7519, + "step": 8473 + }, + { + "epoch": 0.5895161570837246, + "grad_norm": 1.0, + "learning_rate": 0.0007610041117136488, + "loss": 0.6568, + "step": 8474 + }, + { + "epoch": 0.5895857247208599, + "grad_norm": 1.046875, + "learning_rate": 0.0007607853148050706, + "loss": 0.7215, + "step": 8475 + }, + { + "epoch": 0.5896552923579951, + "grad_norm": 0.95703125, + "learning_rate": 0.000760566530042608, + "loss": 0.7886, + "step": 8476 + }, + { + "epoch": 0.5897248599951302, + "grad_norm": 1.0390625, + "learning_rate": 0.0007603477574373705, + "loss": 0.7763, + "step": 8477 + }, + { + "epoch": 0.5897944276322654, + "grad_norm": 0.85546875, + "learning_rate": 0.0007601289970004658, + "loss": 0.641, + "step": 8478 + }, + { + "epoch": 0.5898639952694007, + "grad_norm": 1.421875, + "learning_rate": 0.0007599102487430018, + "loss": 0.6329, + "step": 8479 + }, + { + "epoch": 0.5899335629065359, + "grad_norm": 1.3828125, + "learning_rate": 0.0007596915126760848, + "loss": 0.9425, + "step": 8480 + }, + { + "epoch": 0.590003130543671, + "grad_norm": 1.1015625, + "learning_rate": 0.0007594727888108219, + "loss": 1.0997, + "step": 8481 + }, + { + "epoch": 0.5900726981808063, + "grad_norm": 0.98828125, + "learning_rate": 0.0007592540771583185, + "loss": 0.8175, + "step": 8482 + }, + { + "epoch": 0.5901422658179415, + "grad_norm": 1.171875, + "learning_rate": 0.0007590353777296793, + "loss": 0.8869, + "step": 8483 + }, + { + "epoch": 0.5902118334550767, + "grad_norm": 0.92578125, + "learning_rate": 0.0007588166905360091, + "loss": 0.7391, + "step": 8484 + }, + { + "epoch": 0.5902814010922119, + "grad_norm": 1.2578125, + "learning_rate": 0.0007585980155884118, + "loss": 0.9174, + "step": 8485 + }, + { + "epoch": 0.5903509687293471, + "grad_norm": 1.171875, + "learning_rate": 0.0007583793528979908, + "loss": 0.8217, + "step": 8486 + }, + { + "epoch": 0.5904205363664823, + "grad_norm": 1.328125, + "learning_rate": 0.0007581607024758479, + "loss": 0.8131, + "step": 8487 + }, + { + "epoch": 0.5904901040036176, + "grad_norm": 1.2734375, + "learning_rate": 0.0007579420643330858, + "loss": 0.6882, + "step": 8488 + }, + { + "epoch": 0.5905596716407527, + "grad_norm": 1.2890625, + "learning_rate": 0.0007577234384808058, + "loss": 0.7639, + "step": 8489 + }, + { + "epoch": 0.5906292392778879, + "grad_norm": 1.015625, + "learning_rate": 0.0007575048249301078, + "loss": 0.5981, + "step": 8490 + }, + { + "epoch": 0.5906988069150231, + "grad_norm": 1.0703125, + "learning_rate": 0.0007572862236920932, + "loss": 0.677, + "step": 8491 + }, + { + "epoch": 0.5907683745521584, + "grad_norm": 0.8984375, + "learning_rate": 0.0007570676347778605, + "loss": 0.6852, + "step": 8492 + }, + { + "epoch": 0.5908379421892935, + "grad_norm": 1.0234375, + "learning_rate": 0.0007568490581985091, + "loss": 0.8176, + "step": 8493 + }, + { + "epoch": 0.5909075098264287, + "grad_norm": 1.0625, + "learning_rate": 0.0007566304939651366, + "loss": 1.033, + "step": 8494 + }, + { + "epoch": 0.590977077463564, + "grad_norm": 1.0703125, + "learning_rate": 0.0007564119420888411, + "loss": 0.9066, + "step": 8495 + }, + { + "epoch": 0.5910466451006992, + "grad_norm": 1.265625, + "learning_rate": 0.0007561934025807196, + "loss": 0.5497, + "step": 8496 + }, + { + "epoch": 0.5911162127378343, + "grad_norm": 0.796875, + "learning_rate": 0.0007559748754518677, + "loss": 0.6161, + "step": 8497 + }, + { + "epoch": 0.5911857803749696, + "grad_norm": 1.125, + "learning_rate": 0.000755756360713382, + "loss": 0.746, + "step": 8498 + }, + { + "epoch": 0.5912553480121048, + "grad_norm": 1.03125, + "learning_rate": 0.0007555378583763572, + "loss": 1.0243, + "step": 8499 + }, + { + "epoch": 0.59132491564924, + "grad_norm": 1.0625, + "learning_rate": 0.0007553193684518881, + "loss": 0.9379, + "step": 8500 + }, + { + "epoch": 0.5913944832863752, + "grad_norm": 1.1953125, + "learning_rate": 0.0007551008909510676, + "loss": 0.7344, + "step": 8501 + }, + { + "epoch": 0.5914640509235104, + "grad_norm": 0.9375, + "learning_rate": 0.0007548824258849898, + "loss": 0.6471, + "step": 8502 + }, + { + "epoch": 0.5915336185606456, + "grad_norm": 1.3359375, + "learning_rate": 0.0007546639732647468, + "loss": 1.029, + "step": 8503 + }, + { + "epoch": 0.5916031861977807, + "grad_norm": 0.9609375, + "learning_rate": 0.0007544455331014305, + "loss": 0.6613, + "step": 8504 + }, + { + "epoch": 0.591672753834916, + "grad_norm": 1.2421875, + "learning_rate": 0.0007542271054061328, + "loss": 0.9418, + "step": 8505 + }, + { + "epoch": 0.5917423214720512, + "grad_norm": 1.0625, + "learning_rate": 0.0007540086901899436, + "loss": 0.8629, + "step": 8506 + }, + { + "epoch": 0.5918118891091864, + "grad_norm": 1.15625, + "learning_rate": 0.0007537902874639535, + "loss": 0.8829, + "step": 8507 + }, + { + "epoch": 0.5918814567463216, + "grad_norm": 1.8515625, + "learning_rate": 0.0007535718972392512, + "loss": 0.9109, + "step": 8508 + }, + { + "epoch": 0.5919510243834568, + "grad_norm": 1.453125, + "learning_rate": 0.0007533535195269262, + "loss": 0.847, + "step": 8509 + }, + { + "epoch": 0.592020592020592, + "grad_norm": 0.890625, + "learning_rate": 0.000753135154338066, + "loss": 0.7576, + "step": 8510 + }, + { + "epoch": 0.5920901596577273, + "grad_norm": 0.96875, + "learning_rate": 0.0007529168016837584, + "loss": 0.5772, + "step": 8511 + }, + { + "epoch": 0.5921597272948624, + "grad_norm": 1.296875, + "learning_rate": 0.0007526984615750904, + "loss": 1.1715, + "step": 8512 + }, + { + "epoch": 0.5922292949319976, + "grad_norm": 1.21875, + "learning_rate": 0.0007524801340231481, + "loss": 1.0781, + "step": 8513 + }, + { + "epoch": 0.5922988625691329, + "grad_norm": 1.2421875, + "learning_rate": 0.0007522618190390171, + "loss": 0.8278, + "step": 8514 + }, + { + "epoch": 0.5923684302062681, + "grad_norm": 1.1640625, + "learning_rate": 0.0007520435166337817, + "loss": 0.6436, + "step": 8515 + }, + { + "epoch": 0.5924379978434032, + "grad_norm": 1.0234375, + "learning_rate": 0.0007518252268185272, + "loss": 0.7351, + "step": 8516 + }, + { + "epoch": 0.5925075654805384, + "grad_norm": 1.2109375, + "learning_rate": 0.0007516069496043365, + "loss": 0.913, + "step": 8517 + }, + { + "epoch": 0.5925771331176737, + "grad_norm": 1.1640625, + "learning_rate": 0.0007513886850022928, + "loss": 1.0798, + "step": 8518 + }, + { + "epoch": 0.5926467007548089, + "grad_norm": 0.8828125, + "learning_rate": 0.0007511704330234791, + "loss": 0.6147, + "step": 8519 + }, + { + "epoch": 0.592716268391944, + "grad_norm": 1.0078125, + "learning_rate": 0.0007509521936789763, + "loss": 0.6515, + "step": 8520 + }, + { + "epoch": 0.5927858360290793, + "grad_norm": 1.328125, + "learning_rate": 0.000750733966979866, + "loss": 0.8566, + "step": 8521 + }, + { + "epoch": 0.5928554036662145, + "grad_norm": 1.3671875, + "learning_rate": 0.000750515752937228, + "loss": 1.1151, + "step": 8522 + }, + { + "epoch": 0.5929249713033496, + "grad_norm": 1.3203125, + "learning_rate": 0.0007502975515621431, + "loss": 0.803, + "step": 8523 + }, + { + "epoch": 0.5929945389404849, + "grad_norm": 1.125, + "learning_rate": 0.0007500793628656897, + "loss": 0.9002, + "step": 8524 + }, + { + "epoch": 0.5930641065776201, + "grad_norm": 1.0234375, + "learning_rate": 0.0007498611868589464, + "loss": 0.8477, + "step": 8525 + }, + { + "epoch": 0.5931336742147553, + "grad_norm": 1.1796875, + "learning_rate": 0.0007496430235529916, + "loss": 0.8164, + "step": 8526 + }, + { + "epoch": 0.5932032418518906, + "grad_norm": 0.8671875, + "learning_rate": 0.000749424872958902, + "loss": 0.6571, + "step": 8527 + }, + { + "epoch": 0.5932728094890257, + "grad_norm": 1.1171875, + "learning_rate": 0.0007492067350877546, + "loss": 0.7534, + "step": 8528 + }, + { + "epoch": 0.5933423771261609, + "grad_norm": 1.3046875, + "learning_rate": 0.0007489886099506244, + "loss": 0.8796, + "step": 8529 + }, + { + "epoch": 0.5934119447632961, + "grad_norm": 0.85546875, + "learning_rate": 0.000748770497558588, + "loss": 0.8716, + "step": 8530 + }, + { + "epoch": 0.5934815124004313, + "grad_norm": 1.265625, + "learning_rate": 0.0007485523979227194, + "loss": 0.6666, + "step": 8531 + }, + { + "epoch": 0.5935510800375665, + "grad_norm": 0.9453125, + "learning_rate": 0.0007483343110540923, + "loss": 0.6986, + "step": 8532 + }, + { + "epoch": 0.5936206476747017, + "grad_norm": 1.0234375, + "learning_rate": 0.0007481162369637808, + "loss": 0.6915, + "step": 8533 + }, + { + "epoch": 0.593690215311837, + "grad_norm": 1.234375, + "learning_rate": 0.0007478981756628571, + "loss": 1.1911, + "step": 8534 + }, + { + "epoch": 0.5937597829489721, + "grad_norm": 1.1171875, + "learning_rate": 0.0007476801271623934, + "loss": 0.7592, + "step": 8535 + }, + { + "epoch": 0.5938293505861073, + "grad_norm": 1.0625, + "learning_rate": 0.0007474620914734606, + "loss": 0.7574, + "step": 8536 + }, + { + "epoch": 0.5938989182232426, + "grad_norm": 1.0546875, + "learning_rate": 0.0007472440686071305, + "loss": 0.6431, + "step": 8537 + }, + { + "epoch": 0.5939684858603778, + "grad_norm": 1.3046875, + "learning_rate": 0.0007470260585744722, + "loss": 0.8533, + "step": 8538 + }, + { + "epoch": 0.5940380534975129, + "grad_norm": 1.0625, + "learning_rate": 0.000746808061386556, + "loss": 0.9223, + "step": 8539 + }, + { + "epoch": 0.5941076211346482, + "grad_norm": 1.09375, + "learning_rate": 0.0007465900770544498, + "loss": 0.7645, + "step": 8540 + }, + { + "epoch": 0.5941771887717834, + "grad_norm": 0.9296875, + "learning_rate": 0.0007463721055892223, + "loss": 0.7114, + "step": 8541 + }, + { + "epoch": 0.5942467564089186, + "grad_norm": 1.171875, + "learning_rate": 0.0007461541470019411, + "loss": 0.9832, + "step": 8542 + }, + { + "epoch": 0.5943163240460537, + "grad_norm": 1.1328125, + "learning_rate": 0.0007459362013036725, + "loss": 0.7882, + "step": 8543 + }, + { + "epoch": 0.594385891683189, + "grad_norm": 0.9609375, + "learning_rate": 0.0007457182685054834, + "loss": 0.7626, + "step": 8544 + }, + { + "epoch": 0.5944554593203242, + "grad_norm": 0.98828125, + "learning_rate": 0.0007455003486184389, + "loss": 0.5522, + "step": 8545 + }, + { + "epoch": 0.5945250269574593, + "grad_norm": 1.4140625, + "learning_rate": 0.0007452824416536039, + "loss": 1.0521, + "step": 8546 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 1.171875, + "learning_rate": 0.0007450645476220424, + "loss": 0.904, + "step": 8547 + }, + { + "epoch": 0.5946641622317298, + "grad_norm": 1.109375, + "learning_rate": 0.0007448466665348184, + "loss": 0.9891, + "step": 8548 + }, + { + "epoch": 0.594733729868865, + "grad_norm": 1.078125, + "learning_rate": 0.0007446287984029944, + "loss": 0.6378, + "step": 8549 + }, + { + "epoch": 0.5948032975060003, + "grad_norm": 1.21875, + "learning_rate": 0.0007444109432376329, + "loss": 1.0443, + "step": 8550 + }, + { + "epoch": 0.5948728651431354, + "grad_norm": 0.9140625, + "learning_rate": 0.0007441931010497958, + "loss": 0.768, + "step": 8551 + }, + { + "epoch": 0.5949424327802706, + "grad_norm": 1.1015625, + "learning_rate": 0.0007439752718505435, + "loss": 0.7619, + "step": 8552 + }, + { + "epoch": 0.5950120004174059, + "grad_norm": 1.21875, + "learning_rate": 0.0007437574556509365, + "loss": 0.9104, + "step": 8553 + }, + { + "epoch": 0.595081568054541, + "grad_norm": 1.0546875, + "learning_rate": 0.0007435396524620338, + "loss": 0.7708, + "step": 8554 + }, + { + "epoch": 0.5951511356916762, + "grad_norm": 1.0859375, + "learning_rate": 0.0007433218622948956, + "loss": 0.7666, + "step": 8555 + }, + { + "epoch": 0.5952207033288114, + "grad_norm": 1.125, + "learning_rate": 0.0007431040851605791, + "loss": 0.8577, + "step": 8556 + }, + { + "epoch": 0.5952902709659467, + "grad_norm": 1.1640625, + "learning_rate": 0.0007428863210701422, + "loss": 0.7772, + "step": 8557 + }, + { + "epoch": 0.5953598386030818, + "grad_norm": 1.2421875, + "learning_rate": 0.0007426685700346422, + "loss": 0.6987, + "step": 8558 + }, + { + "epoch": 0.595429406240217, + "grad_norm": 0.97265625, + "learning_rate": 0.0007424508320651352, + "loss": 0.7198, + "step": 8559 + }, + { + "epoch": 0.5954989738773523, + "grad_norm": 0.95703125, + "learning_rate": 0.0007422331071726769, + "loss": 0.5827, + "step": 8560 + }, + { + "epoch": 0.5955685415144875, + "grad_norm": 0.953125, + "learning_rate": 0.0007420153953683215, + "loss": 0.7368, + "step": 8561 + }, + { + "epoch": 0.5956381091516226, + "grad_norm": 1.171875, + "learning_rate": 0.0007417976966631249, + "loss": 0.7991, + "step": 8562 + }, + { + "epoch": 0.5957076767887579, + "grad_norm": 1.1953125, + "learning_rate": 0.0007415800110681392, + "loss": 0.87, + "step": 8563 + }, + { + "epoch": 0.5957772444258931, + "grad_norm": 1.3671875, + "learning_rate": 0.0007413623385944182, + "loss": 0.7575, + "step": 8564 + }, + { + "epoch": 0.5958468120630283, + "grad_norm": 1.0234375, + "learning_rate": 0.0007411446792530141, + "loss": 0.8132, + "step": 8565 + }, + { + "epoch": 0.5959163797001635, + "grad_norm": 1.4140625, + "learning_rate": 0.0007409270330549784, + "loss": 0.8191, + "step": 8566 + }, + { + "epoch": 0.5959859473372987, + "grad_norm": 1.296875, + "learning_rate": 0.0007407094000113623, + "loss": 0.7915, + "step": 8567 + }, + { + "epoch": 0.5960555149744339, + "grad_norm": 1.3125, + "learning_rate": 0.0007404917801332154, + "loss": 0.8721, + "step": 8568 + }, + { + "epoch": 0.596125082611569, + "grad_norm": 0.9453125, + "learning_rate": 0.0007402741734315885, + "loss": 0.7433, + "step": 8569 + }, + { + "epoch": 0.5961946502487043, + "grad_norm": 0.96875, + "learning_rate": 0.0007400565799175296, + "loss": 0.517, + "step": 8570 + }, + { + "epoch": 0.5962642178858395, + "grad_norm": 1.09375, + "learning_rate": 0.0007398389996020873, + "loss": 0.7589, + "step": 8571 + }, + { + "epoch": 0.5963337855229747, + "grad_norm": 1.046875, + "learning_rate": 0.0007396214324963098, + "loss": 0.8574, + "step": 8572 + }, + { + "epoch": 0.59640335316011, + "grad_norm": 0.96484375, + "learning_rate": 0.0007394038786112431, + "loss": 0.6718, + "step": 8573 + }, + { + "epoch": 0.5964729207972451, + "grad_norm": 1.15625, + "learning_rate": 0.000739186337957934, + "loss": 0.796, + "step": 8574 + }, + { + "epoch": 0.5965424884343803, + "grad_norm": 1.0234375, + "learning_rate": 0.0007389688105474279, + "loss": 0.9065, + "step": 8575 + }, + { + "epoch": 0.5966120560715156, + "grad_norm": 1.1875, + "learning_rate": 0.0007387512963907704, + "loss": 0.7815, + "step": 8576 + }, + { + "epoch": 0.5966816237086507, + "grad_norm": 0.84765625, + "learning_rate": 0.000738533795499005, + "loss": 0.7542, + "step": 8577 + }, + { + "epoch": 0.5967511913457859, + "grad_norm": 1.2578125, + "learning_rate": 0.0007383163078831754, + "loss": 0.832, + "step": 8578 + }, + { + "epoch": 0.5968207589829212, + "grad_norm": 1.4765625, + "learning_rate": 0.000738098833554325, + "loss": 0.9228, + "step": 8579 + }, + { + "epoch": 0.5968903266200564, + "grad_norm": 1.1171875, + "learning_rate": 0.0007378813725234958, + "loss": 0.784, + "step": 8580 + }, + { + "epoch": 0.5969598942571915, + "grad_norm": 0.9296875, + "learning_rate": 0.0007376639248017291, + "loss": 0.5069, + "step": 8581 + }, + { + "epoch": 0.5970294618943267, + "grad_norm": 1.2109375, + "learning_rate": 0.0007374464904000658, + "loss": 0.6602, + "step": 8582 + }, + { + "epoch": 0.597099029531462, + "grad_norm": 1.234375, + "learning_rate": 0.0007372290693295469, + "loss": 0.8191, + "step": 8583 + }, + { + "epoch": 0.5971685971685972, + "grad_norm": 1.0703125, + "learning_rate": 0.0007370116616012112, + "loss": 0.8624, + "step": 8584 + }, + { + "epoch": 0.5972381648057323, + "grad_norm": 1.078125, + "learning_rate": 0.0007367942672260974, + "loss": 0.9326, + "step": 8585 + }, + { + "epoch": 0.5973077324428676, + "grad_norm": 1.1328125, + "learning_rate": 0.0007365768862152447, + "loss": 0.9081, + "step": 8586 + }, + { + "epoch": 0.5973773000800028, + "grad_norm": 0.92578125, + "learning_rate": 0.0007363595185796895, + "loss": 0.837, + "step": 8587 + }, + { + "epoch": 0.597446867717138, + "grad_norm": 1.3828125, + "learning_rate": 0.0007361421643304692, + "loss": 1.1247, + "step": 8588 + }, + { + "epoch": 0.5975164353542732, + "grad_norm": 0.97265625, + "learning_rate": 0.0007359248234786198, + "loss": 0.7298, + "step": 8589 + }, + { + "epoch": 0.5975860029914084, + "grad_norm": 1.3515625, + "learning_rate": 0.0007357074960351771, + "loss": 0.8085, + "step": 8590 + }, + { + "epoch": 0.5976555706285436, + "grad_norm": 1.0546875, + "learning_rate": 0.0007354901820111753, + "loss": 0.9944, + "step": 8591 + }, + { + "epoch": 0.5977251382656789, + "grad_norm": 1.25, + "learning_rate": 0.0007352728814176489, + "loss": 0.8763, + "step": 8592 + }, + { + "epoch": 0.597794705902814, + "grad_norm": 1.1015625, + "learning_rate": 0.0007350555942656311, + "loss": 0.9391, + "step": 8593 + }, + { + "epoch": 0.5978642735399492, + "grad_norm": 1.1953125, + "learning_rate": 0.0007348383205661552, + "loss": 0.8523, + "step": 8594 + }, + { + "epoch": 0.5979338411770844, + "grad_norm": 1.0859375, + "learning_rate": 0.0007346210603302528, + "loss": 0.8748, + "step": 8595 + }, + { + "epoch": 0.5980034088142197, + "grad_norm": 1.1171875, + "learning_rate": 0.000734403813568955, + "loss": 0.8543, + "step": 8596 + }, + { + "epoch": 0.5980729764513548, + "grad_norm": 1.0390625, + "learning_rate": 0.0007341865802932932, + "loss": 0.812, + "step": 8597 + }, + { + "epoch": 0.59814254408849, + "grad_norm": 1.15625, + "learning_rate": 0.0007339693605142969, + "loss": 0.7481, + "step": 8598 + }, + { + "epoch": 0.5982121117256253, + "grad_norm": 0.9375, + "learning_rate": 0.0007337521542429955, + "loss": 0.6395, + "step": 8599 + }, + { + "epoch": 0.5982816793627604, + "grad_norm": 1.078125, + "learning_rate": 0.0007335349614904179, + "loss": 0.7155, + "step": 8600 + }, + { + "epoch": 0.5983512469998956, + "grad_norm": 1.09375, + "learning_rate": 0.0007333177822675918, + "loss": 0.8966, + "step": 8601 + }, + { + "epoch": 0.5984208146370309, + "grad_norm": 0.84765625, + "learning_rate": 0.0007331006165855448, + "loss": 0.7555, + "step": 8602 + }, + { + "epoch": 0.5984903822741661, + "grad_norm": 1.1328125, + "learning_rate": 0.0007328834644553026, + "loss": 0.8706, + "step": 8603 + }, + { + "epoch": 0.5985599499113012, + "grad_norm": 0.96484375, + "learning_rate": 0.0007326663258878923, + "loss": 0.8422, + "step": 8604 + }, + { + "epoch": 0.5986295175484365, + "grad_norm": 1.078125, + "learning_rate": 0.0007324492008943382, + "loss": 0.9008, + "step": 8605 + }, + { + "epoch": 0.5986990851855717, + "grad_norm": 1.09375, + "learning_rate": 0.000732232089485665, + "loss": 0.7004, + "step": 8606 + }, + { + "epoch": 0.5987686528227069, + "grad_norm": 0.92578125, + "learning_rate": 0.0007320149916728969, + "loss": 0.6937, + "step": 8607 + }, + { + "epoch": 0.598838220459842, + "grad_norm": 1.1171875, + "learning_rate": 0.0007317979074670569, + "loss": 0.8367, + "step": 8608 + }, + { + "epoch": 0.5989077880969773, + "grad_norm": 1.171875, + "learning_rate": 0.0007315808368791671, + "loss": 0.8127, + "step": 8609 + }, + { + "epoch": 0.5989773557341125, + "grad_norm": 0.96875, + "learning_rate": 0.0007313637799202493, + "loss": 0.8375, + "step": 8610 + }, + { + "epoch": 0.5990469233712477, + "grad_norm": 1.1953125, + "learning_rate": 0.0007311467366013251, + "loss": 0.7361, + "step": 8611 + }, + { + "epoch": 0.5991164910083829, + "grad_norm": 1.03125, + "learning_rate": 0.0007309297069334143, + "loss": 0.7678, + "step": 8612 + }, + { + "epoch": 0.5991860586455181, + "grad_norm": 1.3046875, + "learning_rate": 0.0007307126909275365, + "loss": 0.8631, + "step": 8613 + }, + { + "epoch": 0.5992556262826533, + "grad_norm": 1.3359375, + "learning_rate": 0.0007304956885947114, + "loss": 1.013, + "step": 8614 + }, + { + "epoch": 0.5993251939197886, + "grad_norm": 1.296875, + "learning_rate": 0.0007302786999459569, + "loss": 0.6568, + "step": 8615 + }, + { + "epoch": 0.5993947615569237, + "grad_norm": 1.28125, + "learning_rate": 0.0007300617249922903, + "loss": 0.8126, + "step": 8616 + }, + { + "epoch": 0.5994643291940589, + "grad_norm": 0.97265625, + "learning_rate": 0.0007298447637447284, + "loss": 0.5406, + "step": 8617 + }, + { + "epoch": 0.5995338968311942, + "grad_norm": 1.171875, + "learning_rate": 0.0007296278162142882, + "loss": 0.6937, + "step": 8618 + }, + { + "epoch": 0.5996034644683294, + "grad_norm": 1.1953125, + "learning_rate": 0.0007294108824119846, + "loss": 0.8836, + "step": 8619 + }, + { + "epoch": 0.5996730321054645, + "grad_norm": 1.109375, + "learning_rate": 0.0007291939623488324, + "loss": 1.1188, + "step": 8620 + }, + { + "epoch": 0.5997425997425997, + "grad_norm": 1.0390625, + "learning_rate": 0.0007289770560358458, + "loss": 0.8996, + "step": 8621 + }, + { + "epoch": 0.599812167379735, + "grad_norm": 1.3046875, + "learning_rate": 0.0007287601634840384, + "loss": 0.8006, + "step": 8622 + }, + { + "epoch": 0.5998817350168701, + "grad_norm": 0.984375, + "learning_rate": 0.0007285432847044227, + "loss": 0.7099, + "step": 8623 + }, + { + "epoch": 0.5999513026540053, + "grad_norm": 1.203125, + "learning_rate": 0.0007283264197080106, + "loss": 0.9949, + "step": 8624 + }, + { + "epoch": 0.6000208702911406, + "grad_norm": 1.0, + "learning_rate": 0.0007281095685058137, + "loss": 1.1107, + "step": 8625 + }, + { + "epoch": 0.6000904379282758, + "grad_norm": 1.2890625, + "learning_rate": 0.0007278927311088426, + "loss": 0.8135, + "step": 8626 + }, + { + "epoch": 0.6001600055654109, + "grad_norm": 1.25, + "learning_rate": 0.0007276759075281069, + "loss": 0.905, + "step": 8627 + }, + { + "epoch": 0.6002295732025462, + "grad_norm": 1.3671875, + "learning_rate": 0.0007274590977746161, + "loss": 0.868, + "step": 8628 + }, + { + "epoch": 0.6002991408396814, + "grad_norm": 1.109375, + "learning_rate": 0.0007272423018593787, + "loss": 0.7799, + "step": 8629 + }, + { + "epoch": 0.6003687084768166, + "grad_norm": 0.984375, + "learning_rate": 0.0007270255197934024, + "loss": 0.8145, + "step": 8630 + }, + { + "epoch": 0.6004382761139518, + "grad_norm": 1.1171875, + "learning_rate": 0.0007268087515876939, + "loss": 0.8468, + "step": 8631 + }, + { + "epoch": 0.600507843751087, + "grad_norm": 0.9609375, + "learning_rate": 0.0007265919972532603, + "loss": 0.5068, + "step": 8632 + }, + { + "epoch": 0.6005774113882222, + "grad_norm": 1.0390625, + "learning_rate": 0.0007263752568011073, + "loss": 0.8017, + "step": 8633 + }, + { + "epoch": 0.6006469790253574, + "grad_norm": 1.125, + "learning_rate": 0.0007261585302422392, + "loss": 0.7295, + "step": 8634 + }, + { + "epoch": 0.6007165466624926, + "grad_norm": 1.234375, + "learning_rate": 0.0007259418175876607, + "loss": 0.9748, + "step": 8635 + }, + { + "epoch": 0.6007861142996278, + "grad_norm": 1.234375, + "learning_rate": 0.0007257251188483756, + "loss": 0.9806, + "step": 8636 + }, + { + "epoch": 0.600855681936763, + "grad_norm": 0.9453125, + "learning_rate": 0.0007255084340353862, + "loss": 0.6462, + "step": 8637 + }, + { + "epoch": 0.6009252495738983, + "grad_norm": 1.1015625, + "learning_rate": 0.0007252917631596949, + "loss": 0.8728, + "step": 8638 + }, + { + "epoch": 0.6009948172110334, + "grad_norm": 1.453125, + "learning_rate": 0.0007250751062323036, + "loss": 1.0304, + "step": 8639 + }, + { + "epoch": 0.6010643848481686, + "grad_norm": 1.171875, + "learning_rate": 0.0007248584632642127, + "loss": 0.8004, + "step": 8640 + }, + { + "epoch": 0.6011339524853039, + "grad_norm": 0.94921875, + "learning_rate": 0.000724641834266422, + "loss": 0.6333, + "step": 8641 + }, + { + "epoch": 0.601203520122439, + "grad_norm": 1.203125, + "learning_rate": 0.000724425219249931, + "loss": 0.8913, + "step": 8642 + }, + { + "epoch": 0.6012730877595742, + "grad_norm": 0.9296875, + "learning_rate": 0.0007242086182257386, + "loss": 0.9301, + "step": 8643 + }, + { + "epoch": 0.6013426553967095, + "grad_norm": 1.25, + "learning_rate": 0.0007239920312048423, + "loss": 0.7183, + "step": 8644 + }, + { + "epoch": 0.6014122230338447, + "grad_norm": 0.95703125, + "learning_rate": 0.0007237754581982394, + "loss": 0.7616, + "step": 8645 + }, + { + "epoch": 0.6014817906709798, + "grad_norm": 1.21875, + "learning_rate": 0.0007235588992169265, + "loss": 0.6698, + "step": 8646 + }, + { + "epoch": 0.601551358308115, + "grad_norm": 1.484375, + "learning_rate": 0.0007233423542718997, + "loss": 0.8227, + "step": 8647 + }, + { + "epoch": 0.6016209259452503, + "grad_norm": 1.078125, + "learning_rate": 0.0007231258233741533, + "loss": 0.8004, + "step": 8648 + }, + { + "epoch": 0.6016904935823855, + "grad_norm": 1.2265625, + "learning_rate": 0.0007229093065346818, + "loss": 0.9062, + "step": 8649 + }, + { + "epoch": 0.6017600612195206, + "grad_norm": 1.0703125, + "learning_rate": 0.0007226928037644798, + "loss": 0.9083, + "step": 8650 + }, + { + "epoch": 0.6018296288566559, + "grad_norm": 1.125, + "learning_rate": 0.000722476315074539, + "loss": 0.6559, + "step": 8651 + }, + { + "epoch": 0.6018991964937911, + "grad_norm": 1.1015625, + "learning_rate": 0.000722259840475852, + "loss": 0.7909, + "step": 8652 + }, + { + "epoch": 0.6019687641309263, + "grad_norm": 1.234375, + "learning_rate": 0.0007220433799794106, + "loss": 0.6338, + "step": 8653 + }, + { + "epoch": 0.6020383317680615, + "grad_norm": 0.91796875, + "learning_rate": 0.0007218269335962055, + "loss": 0.97, + "step": 8654 + }, + { + "epoch": 0.6021078994051967, + "grad_norm": 1.0234375, + "learning_rate": 0.0007216105013372266, + "loss": 0.8516, + "step": 8655 + }, + { + "epoch": 0.6021774670423319, + "grad_norm": 1.1953125, + "learning_rate": 0.0007213940832134629, + "loss": 0.913, + "step": 8656 + }, + { + "epoch": 0.6022470346794672, + "grad_norm": 1.546875, + "learning_rate": 0.0007211776792359038, + "loss": 0.9543, + "step": 8657 + }, + { + "epoch": 0.6023166023166023, + "grad_norm": 1.3046875, + "learning_rate": 0.0007209612894155367, + "loss": 0.8336, + "step": 8658 + }, + { + "epoch": 0.6023861699537375, + "grad_norm": 1.21875, + "learning_rate": 0.0007207449137633483, + "loss": 0.9429, + "step": 8659 + }, + { + "epoch": 0.6024557375908727, + "grad_norm": 1.0234375, + "learning_rate": 0.0007205285522903262, + "loss": 0.7073, + "step": 8660 + }, + { + "epoch": 0.602525305228008, + "grad_norm": 0.796875, + "learning_rate": 0.0007203122050074556, + "loss": 0.468, + "step": 8661 + }, + { + "epoch": 0.6025948728651431, + "grad_norm": 1.1796875, + "learning_rate": 0.0007200958719257213, + "loss": 0.9161, + "step": 8662 + }, + { + "epoch": 0.6026644405022783, + "grad_norm": 1.140625, + "learning_rate": 0.0007198795530561077, + "loss": 0.9724, + "step": 8663 + }, + { + "epoch": 0.6027340081394136, + "grad_norm": 1.6171875, + "learning_rate": 0.0007196632484095986, + "loss": 0.6824, + "step": 8664 + }, + { + "epoch": 0.6028035757765488, + "grad_norm": 0.953125, + "learning_rate": 0.0007194469579971769, + "loss": 0.8068, + "step": 8665 + }, + { + "epoch": 0.6028731434136839, + "grad_norm": 1.2265625, + "learning_rate": 0.0007192306818298244, + "loss": 0.8191, + "step": 8666 + }, + { + "epoch": 0.6029427110508192, + "grad_norm": 1.1484375, + "learning_rate": 0.0007190144199185227, + "loss": 0.8178, + "step": 8667 + }, + { + "epoch": 0.6030122786879544, + "grad_norm": 0.86328125, + "learning_rate": 0.0007187981722742527, + "loss": 0.6776, + "step": 8668 + }, + { + "epoch": 0.6030818463250895, + "grad_norm": 1.046875, + "learning_rate": 0.0007185819389079939, + "loss": 0.8442, + "step": 8669 + }, + { + "epoch": 0.6031514139622248, + "grad_norm": 1.203125, + "learning_rate": 0.0007183657198307258, + "loss": 0.6645, + "step": 8670 + }, + { + "epoch": 0.60322098159936, + "grad_norm": 1.3359375, + "learning_rate": 0.000718149515053427, + "loss": 0.7999, + "step": 8671 + }, + { + "epoch": 0.6032905492364952, + "grad_norm": 1.65625, + "learning_rate": 0.0007179333245870753, + "loss": 0.7067, + "step": 8672 + }, + { + "epoch": 0.6033601168736303, + "grad_norm": 0.98046875, + "learning_rate": 0.0007177171484426474, + "loss": 0.6579, + "step": 8673 + }, + { + "epoch": 0.6034296845107656, + "grad_norm": 1.015625, + "learning_rate": 0.0007175009866311199, + "loss": 0.7907, + "step": 8674 + }, + { + "epoch": 0.6034992521479008, + "grad_norm": 1.0234375, + "learning_rate": 0.0007172848391634687, + "loss": 0.8546, + "step": 8675 + }, + { + "epoch": 0.603568819785036, + "grad_norm": 1.3515625, + "learning_rate": 0.0007170687060506682, + "loss": 0.9074, + "step": 8676 + }, + { + "epoch": 0.6036383874221712, + "grad_norm": 1.0234375, + "learning_rate": 0.0007168525873036926, + "loss": 0.8058, + "step": 8677 + }, + { + "epoch": 0.6037079550593064, + "grad_norm": 1.171875, + "learning_rate": 0.0007166364829335155, + "loss": 0.8768, + "step": 8678 + }, + { + "epoch": 0.6037775226964416, + "grad_norm": 0.83203125, + "learning_rate": 0.00071642039295111, + "loss": 0.8745, + "step": 8679 + }, + { + "epoch": 0.6038470903335769, + "grad_norm": 1.328125, + "learning_rate": 0.0007162043173674468, + "loss": 0.7824, + "step": 8680 + }, + { + "epoch": 0.603916657970712, + "grad_norm": 1.0234375, + "learning_rate": 0.0007159882561934984, + "loss": 0.9104, + "step": 8681 + }, + { + "epoch": 0.6039862256078472, + "grad_norm": 1.3359375, + "learning_rate": 0.0007157722094402351, + "loss": 0.6902, + "step": 8682 + }, + { + "epoch": 0.6040557932449825, + "grad_norm": 1.3671875, + "learning_rate": 0.0007155561771186259, + "loss": 0.839, + "step": 8683 + }, + { + "epoch": 0.6041253608821177, + "grad_norm": 1.6875, + "learning_rate": 0.0007153401592396402, + "loss": 0.7838, + "step": 8684 + }, + { + "epoch": 0.6041949285192528, + "grad_norm": 1.125, + "learning_rate": 0.0007151241558142467, + "loss": 0.8241, + "step": 8685 + }, + { + "epoch": 0.604264496156388, + "grad_norm": 1.15625, + "learning_rate": 0.000714908166853413, + "loss": 0.745, + "step": 8686 + }, + { + "epoch": 0.6043340637935233, + "grad_norm": 0.98046875, + "learning_rate": 0.0007146921923681051, + "loss": 0.7767, + "step": 8687 + }, + { + "epoch": 0.6044036314306585, + "grad_norm": 1.109375, + "learning_rate": 0.0007144762323692897, + "loss": 0.7174, + "step": 8688 + }, + { + "epoch": 0.6044731990677936, + "grad_norm": 0.83203125, + "learning_rate": 0.0007142602868679324, + "loss": 0.6602, + "step": 8689 + }, + { + "epoch": 0.6045427667049289, + "grad_norm": 1.0390625, + "learning_rate": 0.0007140443558749974, + "loss": 0.6507, + "step": 8690 + }, + { + "epoch": 0.6046123343420641, + "grad_norm": 1.21875, + "learning_rate": 0.0007138284394014483, + "loss": 0.79, + "step": 8691 + }, + { + "epoch": 0.6046819019791992, + "grad_norm": 0.890625, + "learning_rate": 0.000713612537458249, + "loss": 0.7754, + "step": 8692 + }, + { + "epoch": 0.6047514696163345, + "grad_norm": 1.1796875, + "learning_rate": 0.0007133966500563615, + "loss": 0.9146, + "step": 8693 + }, + { + "epoch": 0.6048210372534697, + "grad_norm": 1.109375, + "learning_rate": 0.0007131807772067473, + "loss": 0.6198, + "step": 8694 + }, + { + "epoch": 0.6048906048906049, + "grad_norm": 1.0625, + "learning_rate": 0.0007129649189203677, + "loss": 0.7704, + "step": 8695 + }, + { + "epoch": 0.6049601725277401, + "grad_norm": 1.0703125, + "learning_rate": 0.0007127490752081829, + "loss": 0.7555, + "step": 8696 + }, + { + "epoch": 0.6050297401648753, + "grad_norm": 1.203125, + "learning_rate": 0.0007125332460811522, + "loss": 0.9109, + "step": 8697 + }, + { + "epoch": 0.6050993078020105, + "grad_norm": 1.375, + "learning_rate": 0.0007123174315502341, + "loss": 0.7943, + "step": 8698 + }, + { + "epoch": 0.6051688754391457, + "grad_norm": 0.96875, + "learning_rate": 0.0007121016316263869, + "loss": 0.7806, + "step": 8699 + }, + { + "epoch": 0.6052384430762809, + "grad_norm": 1.140625, + "learning_rate": 0.000711885846320568, + "loss": 0.7104, + "step": 8700 + }, + { + "epoch": 0.6053080107134161, + "grad_norm": 1.1484375, + "learning_rate": 0.0007116700756437333, + "loss": 0.7789, + "step": 8701 + }, + { + "epoch": 0.6053775783505513, + "grad_norm": 1.40625, + "learning_rate": 0.0007114543196068389, + "loss": 0.7917, + "step": 8702 + }, + { + "epoch": 0.6054471459876866, + "grad_norm": 1.03125, + "learning_rate": 0.0007112385782208397, + "loss": 0.8465, + "step": 8703 + }, + { + "epoch": 0.6055167136248217, + "grad_norm": 1.078125, + "learning_rate": 0.0007110228514966903, + "loss": 1.0022, + "step": 8704 + }, + { + "epoch": 0.6055862812619569, + "grad_norm": 0.96875, + "learning_rate": 0.0007108071394453436, + "loss": 0.5816, + "step": 8705 + }, + { + "epoch": 0.6056558488990922, + "grad_norm": 1.0625, + "learning_rate": 0.0007105914420777529, + "loss": 0.6953, + "step": 8706 + }, + { + "epoch": 0.6057254165362274, + "grad_norm": 1.0234375, + "learning_rate": 0.0007103757594048703, + "loss": 0.6059, + "step": 8707 + }, + { + "epoch": 0.6057949841733625, + "grad_norm": 1.765625, + "learning_rate": 0.0007101600914376465, + "loss": 0.8081, + "step": 8708 + }, + { + "epoch": 0.6058645518104978, + "grad_norm": 1.125, + "learning_rate": 0.0007099444381870322, + "loss": 0.8674, + "step": 8709 + }, + { + "epoch": 0.605934119447633, + "grad_norm": 0.9921875, + "learning_rate": 0.0007097287996639776, + "loss": 0.7671, + "step": 8710 + }, + { + "epoch": 0.6060036870847682, + "grad_norm": 1.1796875, + "learning_rate": 0.0007095131758794317, + "loss": 0.6164, + "step": 8711 + }, + { + "epoch": 0.6060732547219033, + "grad_norm": 1.1171875, + "learning_rate": 0.0007092975668443421, + "loss": 0.9779, + "step": 8712 + }, + { + "epoch": 0.6061428223590386, + "grad_norm": 0.9609375, + "learning_rate": 0.000709081972569657, + "loss": 0.7917, + "step": 8713 + }, + { + "epoch": 0.6062123899961738, + "grad_norm": 1.0546875, + "learning_rate": 0.0007088663930663232, + "loss": 0.967, + "step": 8714 + }, + { + "epoch": 0.6062819576333089, + "grad_norm": 1.015625, + "learning_rate": 0.0007086508283452864, + "loss": 0.6636, + "step": 8715 + }, + { + "epoch": 0.6063515252704442, + "grad_norm": 1.0859375, + "learning_rate": 0.0007084352784174917, + "loss": 0.8809, + "step": 8716 + }, + { + "epoch": 0.6064210929075794, + "grad_norm": 1.1171875, + "learning_rate": 0.0007082197432938844, + "loss": 0.8034, + "step": 8717 + }, + { + "epoch": 0.6064906605447146, + "grad_norm": 1.125, + "learning_rate": 0.0007080042229854077, + "loss": 0.5958, + "step": 8718 + }, + { + "epoch": 0.6065602281818498, + "grad_norm": 0.97265625, + "learning_rate": 0.0007077887175030047, + "loss": 0.7012, + "step": 8719 + }, + { + "epoch": 0.606629795818985, + "grad_norm": 1.1015625, + "learning_rate": 0.000707573226857618, + "loss": 0.7884, + "step": 8720 + }, + { + "epoch": 0.6066993634561202, + "grad_norm": 1.140625, + "learning_rate": 0.0007073577510601889, + "loss": 0.6782, + "step": 8721 + }, + { + "epoch": 0.6067689310932555, + "grad_norm": 2.328125, + "learning_rate": 0.0007071422901216579, + "loss": 0.8939, + "step": 8722 + }, + { + "epoch": 0.6068384987303906, + "grad_norm": 1.03125, + "learning_rate": 0.0007069268440529654, + "loss": 0.5865, + "step": 8723 + }, + { + "epoch": 0.6069080663675258, + "grad_norm": 1.0234375, + "learning_rate": 0.0007067114128650506, + "loss": 0.6065, + "step": 8724 + }, + { + "epoch": 0.606977634004661, + "grad_norm": 1.015625, + "learning_rate": 0.0007064959965688522, + "loss": 0.6358, + "step": 8725 + }, + { + "epoch": 0.6070472016417963, + "grad_norm": 1.4765625, + "learning_rate": 0.0007062805951753073, + "loss": 1.11, + "step": 8726 + }, + { + "epoch": 0.6071167692789314, + "grad_norm": 1.125, + "learning_rate": 0.0007060652086953534, + "loss": 0.9128, + "step": 8727 + }, + { + "epoch": 0.6071863369160666, + "grad_norm": 0.78125, + "learning_rate": 0.0007058498371399269, + "loss": 0.5899, + "step": 8728 + }, + { + "epoch": 0.6072559045532019, + "grad_norm": 0.99609375, + "learning_rate": 0.000705634480519963, + "loss": 0.6378, + "step": 8729 + }, + { + "epoch": 0.6073254721903371, + "grad_norm": 1.140625, + "learning_rate": 0.0007054191388463962, + "loss": 0.958, + "step": 8730 + }, + { + "epoch": 0.6073950398274722, + "grad_norm": 1.171875, + "learning_rate": 0.0007052038121301609, + "loss": 0.9299, + "step": 8731 + }, + { + "epoch": 0.6074646074646075, + "grad_norm": 1.5703125, + "learning_rate": 0.0007049885003821905, + "loss": 1.0552, + "step": 8732 + }, + { + "epoch": 0.6075341751017427, + "grad_norm": 0.9140625, + "learning_rate": 0.0007047732036134165, + "loss": 0.6015, + "step": 8733 + }, + { + "epoch": 0.6076037427388779, + "grad_norm": 0.9140625, + "learning_rate": 0.0007045579218347712, + "loss": 0.7952, + "step": 8734 + }, + { + "epoch": 0.6076733103760131, + "grad_norm": 1.1953125, + "learning_rate": 0.0007043426550571858, + "loss": 1.0644, + "step": 8735 + }, + { + "epoch": 0.6077428780131483, + "grad_norm": 1.28125, + "learning_rate": 0.0007041274032915903, + "loss": 0.7886, + "step": 8736 + }, + { + "epoch": 0.6078124456502835, + "grad_norm": 1.1796875, + "learning_rate": 0.0007039121665489134, + "loss": 1.0829, + "step": 8737 + }, + { + "epoch": 0.6078820132874186, + "grad_norm": 1.3984375, + "learning_rate": 0.0007036969448400847, + "loss": 0.9535, + "step": 8738 + }, + { + "epoch": 0.6079515809245539, + "grad_norm": 1.2421875, + "learning_rate": 0.0007034817381760317, + "loss": 0.8826, + "step": 8739 + }, + { + "epoch": 0.6080211485616891, + "grad_norm": 1.2265625, + "learning_rate": 0.0007032665465676812, + "loss": 0.6847, + "step": 8740 + }, + { + "epoch": 0.6080907161988243, + "grad_norm": 1.1015625, + "learning_rate": 0.00070305137002596, + "loss": 0.9334, + "step": 8741 + }, + { + "epoch": 0.6081602838359595, + "grad_norm": 0.8359375, + "learning_rate": 0.0007028362085617935, + "loss": 0.7175, + "step": 8742 + }, + { + "epoch": 0.6082298514730947, + "grad_norm": 1.0625, + "learning_rate": 0.0007026210621861066, + "loss": 0.8694, + "step": 8743 + }, + { + "epoch": 0.6082994191102299, + "grad_norm": 1.2421875, + "learning_rate": 0.0007024059309098229, + "loss": 0.9104, + "step": 8744 + }, + { + "epoch": 0.6083689867473652, + "grad_norm": 1.046875, + "learning_rate": 0.0007021908147438662, + "loss": 0.8699, + "step": 8745 + }, + { + "epoch": 0.6084385543845003, + "grad_norm": 1.5234375, + "learning_rate": 0.0007019757136991591, + "loss": 0.6986, + "step": 8746 + }, + { + "epoch": 0.6085081220216355, + "grad_norm": 1.0390625, + "learning_rate": 0.0007017606277866225, + "loss": 0.7712, + "step": 8747 + }, + { + "epoch": 0.6085776896587708, + "grad_norm": 1.1953125, + "learning_rate": 0.0007015455570171787, + "loss": 1.0519, + "step": 8748 + }, + { + "epoch": 0.608647257295906, + "grad_norm": 1.2734375, + "learning_rate": 0.0007013305014017468, + "loss": 0.8171, + "step": 8749 + }, + { + "epoch": 0.6087168249330411, + "grad_norm": 1.0546875, + "learning_rate": 0.000701115460951247, + "loss": 0.8163, + "step": 8750 + }, + { + "epoch": 0.6087863925701763, + "grad_norm": 1.5859375, + "learning_rate": 0.0007009004356765971, + "loss": 0.8739, + "step": 8751 + }, + { + "epoch": 0.6088559602073116, + "grad_norm": 1.03125, + "learning_rate": 0.0007006854255887157, + "loss": 0.7692, + "step": 8752 + }, + { + "epoch": 0.6089255278444468, + "grad_norm": 1.2109375, + "learning_rate": 0.0007004704306985201, + "loss": 0.9266, + "step": 8753 + }, + { + "epoch": 0.6089950954815819, + "grad_norm": 0.96484375, + "learning_rate": 0.0007002554510169254, + "loss": 0.7674, + "step": 8754 + }, + { + "epoch": 0.6090646631187172, + "grad_norm": 1.109375, + "learning_rate": 0.0007000404865548489, + "loss": 0.8297, + "step": 8755 + }, + { + "epoch": 0.6091342307558524, + "grad_norm": 0.96875, + "learning_rate": 0.0006998255373232043, + "loss": 0.7031, + "step": 8756 + }, + { + "epoch": 0.6092037983929875, + "grad_norm": 1.375, + "learning_rate": 0.0006996106033329061, + "loss": 0.7895, + "step": 8757 + }, + { + "epoch": 0.6092733660301228, + "grad_norm": 1.078125, + "learning_rate": 0.000699395684594867, + "loss": 0.7787, + "step": 8758 + }, + { + "epoch": 0.609342933667258, + "grad_norm": 1.078125, + "learning_rate": 0.0006991807811200002, + "loss": 0.8288, + "step": 8759 + }, + { + "epoch": 0.6094125013043932, + "grad_norm": 1.328125, + "learning_rate": 0.0006989658929192171, + "loss": 0.9346, + "step": 8760 + }, + { + "epoch": 0.6094820689415285, + "grad_norm": 1.03125, + "learning_rate": 0.0006987510200034281, + "loss": 0.8066, + "step": 8761 + }, + { + "epoch": 0.6095516365786636, + "grad_norm": 0.9453125, + "learning_rate": 0.0006985361623835447, + "loss": 0.6906, + "step": 8762 + }, + { + "epoch": 0.6096212042157988, + "grad_norm": 0.8984375, + "learning_rate": 0.000698321320070475, + "loss": 0.7455, + "step": 8763 + }, + { + "epoch": 0.609690771852934, + "grad_norm": 1.296875, + "learning_rate": 0.0006981064930751285, + "loss": 0.8661, + "step": 8764 + }, + { + "epoch": 0.6097603394900692, + "grad_norm": 1.0703125, + "learning_rate": 0.0006978916814084121, + "loss": 0.7087, + "step": 8765 + }, + { + "epoch": 0.6098299071272044, + "grad_norm": 0.83203125, + "learning_rate": 0.0006976768850812336, + "loss": 0.6889, + "step": 8766 + }, + { + "epoch": 0.6098994747643396, + "grad_norm": 1.09375, + "learning_rate": 0.000697462104104499, + "loss": 0.8307, + "step": 8767 + }, + { + "epoch": 0.6099690424014749, + "grad_norm": 1.515625, + "learning_rate": 0.0006972473384891138, + "loss": 0.9023, + "step": 8768 + }, + { + "epoch": 0.61003861003861, + "grad_norm": 0.92578125, + "learning_rate": 0.0006970325882459832, + "loss": 0.7498, + "step": 8769 + }, + { + "epoch": 0.6101081776757452, + "grad_norm": 0.984375, + "learning_rate": 0.0006968178533860103, + "loss": 0.4831, + "step": 8770 + }, + { + "epoch": 0.6101777453128805, + "grad_norm": 1.0859375, + "learning_rate": 0.0006966031339200989, + "loss": 0.664, + "step": 8771 + }, + { + "epoch": 0.6102473129500157, + "grad_norm": 0.9609375, + "learning_rate": 0.0006963884298591507, + "loss": 0.8983, + "step": 8772 + }, + { + "epoch": 0.6103168805871508, + "grad_norm": 0.9140625, + "learning_rate": 0.0006961737412140681, + "loss": 0.8764, + "step": 8773 + }, + { + "epoch": 0.6103864482242861, + "grad_norm": 1.3671875, + "learning_rate": 0.0006959590679957513, + "loss": 0.8993, + "step": 8774 + }, + { + "epoch": 0.6104560158614213, + "grad_norm": 0.7890625, + "learning_rate": 0.0006957444102151009, + "loss": 0.7429, + "step": 8775 + }, + { + "epoch": 0.6105255834985565, + "grad_norm": 0.93359375, + "learning_rate": 0.0006955297678830153, + "loss": 0.8008, + "step": 8776 + }, + { + "epoch": 0.6105951511356916, + "grad_norm": 1.0703125, + "learning_rate": 0.0006953151410103937, + "loss": 0.8046, + "step": 8777 + }, + { + "epoch": 0.6106647187728269, + "grad_norm": 0.97265625, + "learning_rate": 0.0006951005296081336, + "loss": 0.7889, + "step": 8778 + }, + { + "epoch": 0.6107342864099621, + "grad_norm": 1.1796875, + "learning_rate": 0.0006948859336871314, + "loss": 0.8505, + "step": 8779 + }, + { + "epoch": 0.6108038540470972, + "grad_norm": 1.0, + "learning_rate": 0.0006946713532582841, + "loss": 0.6689, + "step": 8780 + }, + { + "epoch": 0.6108734216842325, + "grad_norm": 1.125, + "learning_rate": 0.0006944567883324863, + "loss": 0.8853, + "step": 8781 + }, + { + "epoch": 0.6109429893213677, + "grad_norm": 1.15625, + "learning_rate": 0.0006942422389206329, + "loss": 0.8171, + "step": 8782 + }, + { + "epoch": 0.6110125569585029, + "grad_norm": 1.09375, + "learning_rate": 0.0006940277050336172, + "loss": 1.0236, + "step": 8783 + }, + { + "epoch": 0.6110821245956382, + "grad_norm": 1.140625, + "learning_rate": 0.0006938131866823324, + "loss": 0.8976, + "step": 8784 + }, + { + "epoch": 0.6111516922327733, + "grad_norm": 1.2734375, + "learning_rate": 0.0006935986838776711, + "loss": 0.8862, + "step": 8785 + }, + { + "epoch": 0.6112212598699085, + "grad_norm": 1.859375, + "learning_rate": 0.0006933841966305234, + "loss": 1.2325, + "step": 8786 + }, + { + "epoch": 0.6112908275070438, + "grad_norm": 0.95703125, + "learning_rate": 0.0006931697249517816, + "loss": 0.8163, + "step": 8787 + }, + { + "epoch": 0.611360395144179, + "grad_norm": 0.98828125, + "learning_rate": 0.0006929552688523344, + "loss": 0.9309, + "step": 8788 + }, + { + "epoch": 0.6114299627813141, + "grad_norm": 1.0390625, + "learning_rate": 0.0006927408283430712, + "loss": 0.8401, + "step": 8789 + }, + { + "epoch": 0.6114995304184493, + "grad_norm": 1.15625, + "learning_rate": 0.0006925264034348795, + "loss": 0.7555, + "step": 8790 + }, + { + "epoch": 0.6115690980555846, + "grad_norm": 0.953125, + "learning_rate": 0.0006923119941386475, + "loss": 0.7396, + "step": 8791 + }, + { + "epoch": 0.6116386656927197, + "grad_norm": 1.1484375, + "learning_rate": 0.000692097600465262, + "loss": 0.6304, + "step": 8792 + }, + { + "epoch": 0.6117082333298549, + "grad_norm": 1.1171875, + "learning_rate": 0.0006918832224256076, + "loss": 0.962, + "step": 8793 + }, + { + "epoch": 0.6117778009669902, + "grad_norm": 1.1015625, + "learning_rate": 0.0006916688600305707, + "loss": 0.9029, + "step": 8794 + }, + { + "epoch": 0.6118473686041254, + "grad_norm": 1.015625, + "learning_rate": 0.0006914545132910348, + "loss": 0.8357, + "step": 8795 + }, + { + "epoch": 0.6119169362412605, + "grad_norm": 1.359375, + "learning_rate": 0.0006912401822178839, + "loss": 0.945, + "step": 8796 + }, + { + "epoch": 0.6119865038783958, + "grad_norm": 1.015625, + "learning_rate": 0.0006910258668219998, + "loss": 0.7049, + "step": 8797 + }, + { + "epoch": 0.612056071515531, + "grad_norm": 0.91796875, + "learning_rate": 0.000690811567114265, + "loss": 0.715, + "step": 8798 + }, + { + "epoch": 0.6121256391526662, + "grad_norm": 1.046875, + "learning_rate": 0.0006905972831055604, + "loss": 0.7373, + "step": 8799 + }, + { + "epoch": 0.6121952067898014, + "grad_norm": 1.015625, + "learning_rate": 0.000690383014806766, + "loss": 0.9297, + "step": 8800 + }, + { + "epoch": 0.6122647744269366, + "grad_norm": 1.078125, + "learning_rate": 0.000690168762228762, + "loss": 0.8317, + "step": 8801 + }, + { + "epoch": 0.6123343420640718, + "grad_norm": 1.1640625, + "learning_rate": 0.0006899545253824265, + "loss": 0.6979, + "step": 8802 + }, + { + "epoch": 0.612403909701207, + "grad_norm": 1.15625, + "learning_rate": 0.0006897403042786374, + "loss": 0.9181, + "step": 8803 + }, + { + "epoch": 0.6124734773383422, + "grad_norm": 1.421875, + "learning_rate": 0.0006895260989282717, + "loss": 0.7358, + "step": 8804 + }, + { + "epoch": 0.6125430449754774, + "grad_norm": 1.5234375, + "learning_rate": 0.0006893119093422058, + "loss": 0.7538, + "step": 8805 + }, + { + "epoch": 0.6126126126126126, + "grad_norm": 1.375, + "learning_rate": 0.0006890977355313152, + "loss": 0.8676, + "step": 8806 + }, + { + "epoch": 0.6126821802497479, + "grad_norm": 1.15625, + "learning_rate": 0.0006888835775064743, + "loss": 0.5304, + "step": 8807 + }, + { + "epoch": 0.612751747886883, + "grad_norm": 0.92578125, + "learning_rate": 0.0006886694352785576, + "loss": 0.5638, + "step": 8808 + }, + { + "epoch": 0.6128213155240182, + "grad_norm": 0.9453125, + "learning_rate": 0.0006884553088584376, + "loss": 0.6907, + "step": 8809 + }, + { + "epoch": 0.6128908831611535, + "grad_norm": 1.421875, + "learning_rate": 0.0006882411982569869, + "loss": 1.1476, + "step": 8810 + }, + { + "epoch": 0.6129604507982886, + "grad_norm": 0.92578125, + "learning_rate": 0.0006880271034850763, + "loss": 0.7789, + "step": 8811 + }, + { + "epoch": 0.6130300184354238, + "grad_norm": 1.203125, + "learning_rate": 0.0006878130245535772, + "loss": 0.9118, + "step": 8812 + }, + { + "epoch": 0.6130995860725591, + "grad_norm": 1.2109375, + "learning_rate": 0.0006875989614733592, + "loss": 0.9038, + "step": 8813 + }, + { + "epoch": 0.6131691537096943, + "grad_norm": 1.1640625, + "learning_rate": 0.000687384914255291, + "loss": 0.7975, + "step": 8814 + }, + { + "epoch": 0.6132387213468294, + "grad_norm": 1.28125, + "learning_rate": 0.0006871708829102417, + "loss": 0.9838, + "step": 8815 + }, + { + "epoch": 0.6133082889839646, + "grad_norm": 1.0234375, + "learning_rate": 0.000686956867449078, + "loss": 0.7255, + "step": 8816 + }, + { + "epoch": 0.6133778566210999, + "grad_norm": 1.0, + "learning_rate": 0.0006867428678826668, + "loss": 0.8206, + "step": 8817 + }, + { + "epoch": 0.6134474242582351, + "grad_norm": 1.03125, + "learning_rate": 0.0006865288842218733, + "loss": 0.6303, + "step": 8818 + }, + { + "epoch": 0.6135169918953702, + "grad_norm": 1.0546875, + "learning_rate": 0.0006863149164775637, + "loss": 0.5099, + "step": 8819 + }, + { + "epoch": 0.6135865595325055, + "grad_norm": 0.859375, + "learning_rate": 0.0006861009646606012, + "loss": 0.706, + "step": 8820 + }, + { + "epoch": 0.6136561271696407, + "grad_norm": 1.0390625, + "learning_rate": 0.0006858870287818494, + "loss": 0.7001, + "step": 8821 + }, + { + "epoch": 0.6137256948067759, + "grad_norm": 1.34375, + "learning_rate": 0.0006856731088521715, + "loss": 0.6741, + "step": 8822 + }, + { + "epoch": 0.6137952624439111, + "grad_norm": 0.93359375, + "learning_rate": 0.0006854592048824286, + "loss": 0.8331, + "step": 8823 + }, + { + "epoch": 0.6138648300810463, + "grad_norm": 1.1875, + "learning_rate": 0.000685245316883482, + "loss": 0.7475, + "step": 8824 + }, + { + "epoch": 0.6139343977181815, + "grad_norm": 1.1484375, + "learning_rate": 0.0006850314448661912, + "loss": 0.6212, + "step": 8825 + }, + { + "epoch": 0.6140039653553168, + "grad_norm": 1.3671875, + "learning_rate": 0.0006848175888414166, + "loss": 0.9171, + "step": 8826 + }, + { + "epoch": 0.6140735329924519, + "grad_norm": 1.28125, + "learning_rate": 0.0006846037488200161, + "loss": 0.9713, + "step": 8827 + }, + { + "epoch": 0.6141431006295871, + "grad_norm": 1.078125, + "learning_rate": 0.0006843899248128473, + "loss": 0.7993, + "step": 8828 + }, + { + "epoch": 0.6142126682667223, + "grad_norm": 1.2421875, + "learning_rate": 0.0006841761168307676, + "loss": 0.8893, + "step": 8829 + }, + { + "epoch": 0.6142822359038576, + "grad_norm": 1.234375, + "learning_rate": 0.0006839623248846327, + "loss": 0.8157, + "step": 8830 + }, + { + "epoch": 0.6143518035409927, + "grad_norm": 1.078125, + "learning_rate": 0.0006837485489852983, + "loss": 0.8176, + "step": 8831 + }, + { + "epoch": 0.6144213711781279, + "grad_norm": 1.1796875, + "learning_rate": 0.0006835347891436178, + "loss": 1.0037, + "step": 8832 + }, + { + "epoch": 0.6144909388152632, + "grad_norm": 1.15625, + "learning_rate": 0.0006833210453704463, + "loss": 0.9301, + "step": 8833 + }, + { + "epoch": 0.6145605064523983, + "grad_norm": 0.953125, + "learning_rate": 0.0006831073176766356, + "loss": 0.9012, + "step": 8834 + }, + { + "epoch": 0.6146300740895335, + "grad_norm": 1.1484375, + "learning_rate": 0.000682893606073038, + "loss": 0.8061, + "step": 8835 + }, + { + "epoch": 0.6146996417266688, + "grad_norm": 1.0859375, + "learning_rate": 0.000682679910570505, + "loss": 0.9522, + "step": 8836 + }, + { + "epoch": 0.614769209363804, + "grad_norm": 1.2265625, + "learning_rate": 0.0006824662311798867, + "loss": 0.7811, + "step": 8837 + }, + { + "epoch": 0.6148387770009391, + "grad_norm": 1.296875, + "learning_rate": 0.0006822525679120326, + "loss": 1.0802, + "step": 8838 + }, + { + "epoch": 0.6149083446380744, + "grad_norm": 1.2421875, + "learning_rate": 0.0006820389207777914, + "loss": 1.1608, + "step": 8839 + }, + { + "epoch": 0.6149779122752096, + "grad_norm": 0.921875, + "learning_rate": 0.0006818252897880115, + "loss": 0.5822, + "step": 8840 + }, + { + "epoch": 0.6150474799123448, + "grad_norm": 0.94140625, + "learning_rate": 0.0006816116749535395, + "loss": 0.684, + "step": 8841 + }, + { + "epoch": 0.6151170475494799, + "grad_norm": 1.1328125, + "learning_rate": 0.0006813980762852217, + "loss": 0.6828, + "step": 8842 + }, + { + "epoch": 0.6151866151866152, + "grad_norm": 1.1328125, + "learning_rate": 0.000681184493793904, + "loss": 0.9205, + "step": 8843 + }, + { + "epoch": 0.6152561828237504, + "grad_norm": 1.0078125, + "learning_rate": 0.0006809709274904305, + "loss": 0.7127, + "step": 8844 + }, + { + "epoch": 0.6153257504608856, + "grad_norm": 1.578125, + "learning_rate": 0.0006807573773856455, + "loss": 0.9358, + "step": 8845 + }, + { + "epoch": 0.6153953180980208, + "grad_norm": 0.78515625, + "learning_rate": 0.0006805438434903915, + "loss": 0.8105, + "step": 8846 + }, + { + "epoch": 0.615464885735156, + "grad_norm": 1.375, + "learning_rate": 0.0006803303258155111, + "loss": 0.9114, + "step": 8847 + }, + { + "epoch": 0.6155344533722912, + "grad_norm": 1.390625, + "learning_rate": 0.0006801168243718457, + "loss": 0.9992, + "step": 8848 + }, + { + "epoch": 0.6156040210094265, + "grad_norm": 1.2109375, + "learning_rate": 0.0006799033391702351, + "loss": 0.8461, + "step": 8849 + }, + { + "epoch": 0.6156735886465616, + "grad_norm": 1.296875, + "learning_rate": 0.0006796898702215199, + "loss": 0.6119, + "step": 8850 + }, + { + "epoch": 0.6157431562836968, + "grad_norm": 1.1875, + "learning_rate": 0.0006794764175365387, + "loss": 0.8069, + "step": 8851 + }, + { + "epoch": 0.6158127239208321, + "grad_norm": 0.92578125, + "learning_rate": 0.0006792629811261293, + "loss": 0.7707, + "step": 8852 + }, + { + "epoch": 0.6158822915579673, + "grad_norm": 1.3125, + "learning_rate": 0.0006790495610011289, + "loss": 0.5123, + "step": 8853 + }, + { + "epoch": 0.6159518591951024, + "grad_norm": 1.0625, + "learning_rate": 0.0006788361571723744, + "loss": 0.7792, + "step": 8854 + }, + { + "epoch": 0.6160214268322376, + "grad_norm": 1.03125, + "learning_rate": 0.0006786227696507011, + "loss": 1.1437, + "step": 8855 + }, + { + "epoch": 0.6160909944693729, + "grad_norm": 0.8984375, + "learning_rate": 0.0006784093984469437, + "loss": 0.7059, + "step": 8856 + }, + { + "epoch": 0.616160562106508, + "grad_norm": 0.92578125, + "learning_rate": 0.0006781960435719355, + "loss": 0.6805, + "step": 8857 + }, + { + "epoch": 0.6162301297436432, + "grad_norm": 0.7578125, + "learning_rate": 0.0006779827050365109, + "loss": 0.5224, + "step": 8858 + }, + { + "epoch": 0.6162996973807785, + "grad_norm": 1.0390625, + "learning_rate": 0.0006777693828515012, + "loss": 0.715, + "step": 8859 + }, + { + "epoch": 0.6163692650179137, + "grad_norm": 1.6171875, + "learning_rate": 0.0006775560770277378, + "loss": 0.6267, + "step": 8860 + }, + { + "epoch": 0.6164388326550488, + "grad_norm": 1.125, + "learning_rate": 0.0006773427875760521, + "loss": 0.6734, + "step": 8861 + }, + { + "epoch": 0.6165084002921841, + "grad_norm": 1.0625, + "learning_rate": 0.0006771295145072731, + "loss": 0.7742, + "step": 8862 + }, + { + "epoch": 0.6165779679293193, + "grad_norm": 1.3203125, + "learning_rate": 0.0006769162578322301, + "loss": 0.9444, + "step": 8863 + }, + { + "epoch": 0.6166475355664545, + "grad_norm": 1.203125, + "learning_rate": 0.0006767030175617505, + "loss": 0.7557, + "step": 8864 + }, + { + "epoch": 0.6167171032035897, + "grad_norm": 1.015625, + "learning_rate": 0.0006764897937066627, + "loss": 0.7259, + "step": 8865 + }, + { + "epoch": 0.6167866708407249, + "grad_norm": 1.2421875, + "learning_rate": 0.0006762765862777924, + "loss": 0.9274, + "step": 8866 + }, + { + "epoch": 0.6168562384778601, + "grad_norm": 1.1640625, + "learning_rate": 0.0006760633952859652, + "loss": 0.86, + "step": 8867 + }, + { + "epoch": 0.6169258061149953, + "grad_norm": 1.375, + "learning_rate": 0.0006758502207420065, + "loss": 0.939, + "step": 8868 + }, + { + "epoch": 0.6169953737521305, + "grad_norm": 1.4296875, + "learning_rate": 0.0006756370626567394, + "loss": 0.8592, + "step": 8869 + }, + { + "epoch": 0.6170649413892657, + "grad_norm": 1.171875, + "learning_rate": 0.0006754239210409874, + "loss": 0.803, + "step": 8870 + }, + { + "epoch": 0.6171345090264009, + "grad_norm": 1.1953125, + "learning_rate": 0.0006752107959055724, + "loss": 0.7991, + "step": 8871 + }, + { + "epoch": 0.6172040766635362, + "grad_norm": 1.03125, + "learning_rate": 0.0006749976872613166, + "loss": 0.8255, + "step": 8872 + }, + { + "epoch": 0.6172736443006713, + "grad_norm": 1.3203125, + "learning_rate": 0.00067478459511904, + "loss": 0.8896, + "step": 8873 + }, + { + "epoch": 0.6173432119378065, + "grad_norm": 1.0078125, + "learning_rate": 0.0006745715194895622, + "loss": 0.7888, + "step": 8874 + }, + { + "epoch": 0.6174127795749418, + "grad_norm": 0.8984375, + "learning_rate": 0.0006743584603837027, + "loss": 0.7649, + "step": 8875 + }, + { + "epoch": 0.617482347212077, + "grad_norm": 1.03125, + "learning_rate": 0.000674145417812279, + "loss": 0.8674, + "step": 8876 + }, + { + "epoch": 0.6175519148492121, + "grad_norm": 0.98828125, + "learning_rate": 0.0006739323917861087, + "loss": 0.7114, + "step": 8877 + }, + { + "epoch": 0.6176214824863474, + "grad_norm": 1.0625, + "learning_rate": 0.0006737193823160077, + "loss": 0.8836, + "step": 8878 + }, + { + "epoch": 0.6176910501234826, + "grad_norm": 1.1796875, + "learning_rate": 0.0006735063894127924, + "loss": 0.6029, + "step": 8879 + }, + { + "epoch": 0.6177606177606177, + "grad_norm": 1.1328125, + "learning_rate": 0.0006732934130872768, + "loss": 0.7293, + "step": 8880 + }, + { + "epoch": 0.6178301853977529, + "grad_norm": 1.1171875, + "learning_rate": 0.0006730804533502747, + "loss": 0.7565, + "step": 8881 + }, + { + "epoch": 0.6178997530348882, + "grad_norm": 0.984375, + "learning_rate": 0.0006728675102125997, + "loss": 0.8098, + "step": 8882 + }, + { + "epoch": 0.6179693206720234, + "grad_norm": 0.90625, + "learning_rate": 0.0006726545836850636, + "loss": 0.7885, + "step": 8883 + }, + { + "epoch": 0.6180388883091585, + "grad_norm": 1.265625, + "learning_rate": 0.0006724416737784777, + "loss": 0.8904, + "step": 8884 + }, + { + "epoch": 0.6181084559462938, + "grad_norm": 1.078125, + "learning_rate": 0.0006722287805036525, + "loss": 0.9974, + "step": 8885 + }, + { + "epoch": 0.618178023583429, + "grad_norm": 1.3359375, + "learning_rate": 0.0006720159038713981, + "loss": 0.681, + "step": 8886 + }, + { + "epoch": 0.6182475912205642, + "grad_norm": 1.015625, + "learning_rate": 0.0006718030438925227, + "loss": 0.6932, + "step": 8887 + }, + { + "epoch": 0.6183171588576994, + "grad_norm": 0.93359375, + "learning_rate": 0.0006715902005778343, + "loss": 0.7525, + "step": 8888 + }, + { + "epoch": 0.6183867264948346, + "grad_norm": 1.1953125, + "learning_rate": 0.0006713773739381403, + "loss": 0.6183, + "step": 8889 + }, + { + "epoch": 0.6184562941319698, + "grad_norm": 1.15625, + "learning_rate": 0.0006711645639842474, + "loss": 0.8117, + "step": 8890 + }, + { + "epoch": 0.6185258617691051, + "grad_norm": 1.1484375, + "learning_rate": 0.00067095177072696, + "loss": 0.736, + "step": 8891 + }, + { + "epoch": 0.6185954294062402, + "grad_norm": 1.453125, + "learning_rate": 0.0006707389941770829, + "loss": 0.9317, + "step": 8892 + }, + { + "epoch": 0.6186649970433754, + "grad_norm": 1.3203125, + "learning_rate": 0.0006705262343454208, + "loss": 0.8924, + "step": 8893 + }, + { + "epoch": 0.6187345646805106, + "grad_norm": 1.0546875, + "learning_rate": 0.0006703134912427754, + "loss": 0.6364, + "step": 8894 + }, + { + "epoch": 0.6188041323176459, + "grad_norm": 1.0078125, + "learning_rate": 0.0006701007648799491, + "loss": 0.779, + "step": 8895 + }, + { + "epoch": 0.618873699954781, + "grad_norm": 0.9296875, + "learning_rate": 0.0006698880552677432, + "loss": 0.6551, + "step": 8896 + }, + { + "epoch": 0.6189432675919162, + "grad_norm": 1.0625, + "learning_rate": 0.0006696753624169582, + "loss": 0.9548, + "step": 8897 + }, + { + "epoch": 0.6190128352290515, + "grad_norm": 1.03125, + "learning_rate": 0.0006694626863383932, + "loss": 0.7592, + "step": 8898 + }, + { + "epoch": 0.6190824028661867, + "grad_norm": 1.296875, + "learning_rate": 0.0006692500270428467, + "loss": 1.0177, + "step": 8899 + }, + { + "epoch": 0.6191519705033218, + "grad_norm": 1.2265625, + "learning_rate": 0.0006690373845411173, + "loss": 0.8578, + "step": 8900 + }, + { + "epoch": 0.6192215381404571, + "grad_norm": 1.015625, + "learning_rate": 0.0006688247588440008, + "loss": 0.7239, + "step": 8901 + }, + { + "epoch": 0.6192911057775923, + "grad_norm": 1.046875, + "learning_rate": 0.000668612149962294, + "loss": 0.6681, + "step": 8902 + }, + { + "epoch": 0.6193606734147274, + "grad_norm": 1.2890625, + "learning_rate": 0.0006683995579067918, + "loss": 0.8568, + "step": 8903 + }, + { + "epoch": 0.6194302410518627, + "grad_norm": 1.046875, + "learning_rate": 0.0006681869826882889, + "loss": 0.6597, + "step": 8904 + }, + { + "epoch": 0.6194998086889979, + "grad_norm": 0.9453125, + "learning_rate": 0.0006679744243175785, + "loss": 0.76, + "step": 8905 + }, + { + "epoch": 0.6195693763261331, + "grad_norm": 1.328125, + "learning_rate": 0.000667761882805453, + "loss": 1.2157, + "step": 8906 + }, + { + "epoch": 0.6196389439632682, + "grad_norm": 1.0859375, + "learning_rate": 0.0006675493581627049, + "loss": 0.7298, + "step": 8907 + }, + { + "epoch": 0.6197085116004035, + "grad_norm": 1.296875, + "learning_rate": 0.0006673368504001245, + "loss": 0.8582, + "step": 8908 + }, + { + "epoch": 0.6197780792375387, + "grad_norm": 1.234375, + "learning_rate": 0.000667124359528502, + "loss": 1.0202, + "step": 8909 + }, + { + "epoch": 0.6198476468746739, + "grad_norm": 1.0234375, + "learning_rate": 0.0006669118855586267, + "loss": 0.8024, + "step": 8910 + }, + { + "epoch": 0.6199172145118091, + "grad_norm": 1.140625, + "learning_rate": 0.0006666994285012873, + "loss": 0.8253, + "step": 8911 + }, + { + "epoch": 0.6199867821489443, + "grad_norm": 1.3125, + "learning_rate": 0.0006664869883672708, + "loss": 0.8428, + "step": 8912 + }, + { + "epoch": 0.6200563497860795, + "grad_norm": 0.83984375, + "learning_rate": 0.0006662745651673638, + "loss": 0.8329, + "step": 8913 + }, + { + "epoch": 0.6201259174232148, + "grad_norm": 1.0, + "learning_rate": 0.0006660621589123526, + "loss": 0.7488, + "step": 8914 + }, + { + "epoch": 0.6201954850603499, + "grad_norm": 1.1328125, + "learning_rate": 0.0006658497696130216, + "loss": 0.7186, + "step": 8915 + }, + { + "epoch": 0.6202650526974851, + "grad_norm": 1.390625, + "learning_rate": 0.0006656373972801548, + "loss": 0.81, + "step": 8916 + }, + { + "epoch": 0.6203346203346203, + "grad_norm": 0.6484375, + "learning_rate": 0.000665425041924536, + "loss": 0.5148, + "step": 8917 + }, + { + "epoch": 0.6204041879717556, + "grad_norm": 0.9375, + "learning_rate": 0.0006652127035569473, + "loss": 0.8071, + "step": 8918 + }, + { + "epoch": 0.6204737556088907, + "grad_norm": 1.0625, + "learning_rate": 0.0006650003821881698, + "loss": 0.6649, + "step": 8919 + }, + { + "epoch": 0.6205433232460259, + "grad_norm": 1.328125, + "learning_rate": 0.0006647880778289843, + "loss": 0.9216, + "step": 8920 + }, + { + "epoch": 0.6206128908831612, + "grad_norm": 1.2734375, + "learning_rate": 0.0006645757904901708, + "loss": 0.7633, + "step": 8921 + }, + { + "epoch": 0.6206824585202964, + "grad_norm": 1.0859375, + "learning_rate": 0.0006643635201825081, + "loss": 0.5849, + "step": 8922 + }, + { + "epoch": 0.6207520261574315, + "grad_norm": 1.0390625, + "learning_rate": 0.0006641512669167737, + "loss": 0.8264, + "step": 8923 + }, + { + "epoch": 0.6208215937945668, + "grad_norm": 1.1875, + "learning_rate": 0.0006639390307037456, + "loss": 0.8133, + "step": 8924 + }, + { + "epoch": 0.620891161431702, + "grad_norm": 1.046875, + "learning_rate": 0.0006637268115541997, + "loss": 0.6556, + "step": 8925 + }, + { + "epoch": 0.6209607290688371, + "grad_norm": 1.0546875, + "learning_rate": 0.0006635146094789111, + "loss": 0.7816, + "step": 8926 + }, + { + "epoch": 0.6210302967059724, + "grad_norm": 1.0625, + "learning_rate": 0.0006633024244886546, + "loss": 0.9437, + "step": 8927 + }, + { + "epoch": 0.6210998643431076, + "grad_norm": 0.89453125, + "learning_rate": 0.0006630902565942039, + "loss": 0.6375, + "step": 8928 + }, + { + "epoch": 0.6211694319802428, + "grad_norm": 1.3359375, + "learning_rate": 0.000662878105806332, + "loss": 0.8066, + "step": 8929 + }, + { + "epoch": 0.6212389996173779, + "grad_norm": 1.296875, + "learning_rate": 0.0006626659721358103, + "loss": 0.7373, + "step": 8930 + }, + { + "epoch": 0.6213085672545132, + "grad_norm": 1.2265625, + "learning_rate": 0.0006624538555934105, + "loss": 0.8738, + "step": 8931 + }, + { + "epoch": 0.6213781348916484, + "grad_norm": 1.3125, + "learning_rate": 0.0006622417561899028, + "loss": 0.9945, + "step": 8932 + }, + { + "epoch": 0.6214477025287836, + "grad_norm": 1.4375, + "learning_rate": 0.0006620296739360561, + "loss": 0.6965, + "step": 8933 + }, + { + "epoch": 0.6215172701659188, + "grad_norm": 0.87109375, + "learning_rate": 0.0006618176088426388, + "loss": 0.6123, + "step": 8934 + }, + { + "epoch": 0.621586837803054, + "grad_norm": 1.09375, + "learning_rate": 0.0006616055609204191, + "loss": 0.6154, + "step": 8935 + }, + { + "epoch": 0.6216564054401892, + "grad_norm": 1.0703125, + "learning_rate": 0.0006613935301801633, + "loss": 0.6419, + "step": 8936 + }, + { + "epoch": 0.6217259730773245, + "grad_norm": 0.85546875, + "learning_rate": 0.0006611815166326373, + "loss": 0.6474, + "step": 8937 + }, + { + "epoch": 0.6217955407144596, + "grad_norm": 1.6796875, + "learning_rate": 0.0006609695202886059, + "loss": 0.8672, + "step": 8938 + }, + { + "epoch": 0.6218651083515948, + "grad_norm": 0.9609375, + "learning_rate": 0.0006607575411588338, + "loss": 0.7448, + "step": 8939 + }, + { + "epoch": 0.6219346759887301, + "grad_norm": 0.80078125, + "learning_rate": 0.0006605455792540837, + "loss": 0.6578, + "step": 8940 + }, + { + "epoch": 0.6220042436258653, + "grad_norm": 1.140625, + "learning_rate": 0.0006603336345851179, + "loss": 0.9665, + "step": 8941 + }, + { + "epoch": 0.6220738112630004, + "grad_norm": 1.0546875, + "learning_rate": 0.0006601217071626981, + "loss": 0.7588, + "step": 8942 + }, + { + "epoch": 0.6221433789001356, + "grad_norm": 1.4140625, + "learning_rate": 0.0006599097969975853, + "loss": 0.9968, + "step": 8943 + }, + { + "epoch": 0.6222129465372709, + "grad_norm": 1.28125, + "learning_rate": 0.0006596979041005387, + "loss": 0.8707, + "step": 8944 + }, + { + "epoch": 0.622282514174406, + "grad_norm": 1.234375, + "learning_rate": 0.000659486028482317, + "loss": 1.1097, + "step": 8945 + }, + { + "epoch": 0.6223520818115412, + "grad_norm": 0.984375, + "learning_rate": 0.0006592741701536789, + "loss": 0.6772, + "step": 8946 + }, + { + "epoch": 0.6224216494486765, + "grad_norm": 1.296875, + "learning_rate": 0.0006590623291253807, + "loss": 0.8832, + "step": 8947 + }, + { + "epoch": 0.6224912170858117, + "grad_norm": 1.1015625, + "learning_rate": 0.0006588505054081788, + "loss": 0.826, + "step": 8948 + }, + { + "epoch": 0.6225607847229468, + "grad_norm": 1.0390625, + "learning_rate": 0.000658638699012829, + "loss": 0.6682, + "step": 8949 + }, + { + "epoch": 0.6226303523600821, + "grad_norm": 1.1875, + "learning_rate": 0.0006584269099500857, + "loss": 0.7514, + "step": 8950 + }, + { + "epoch": 0.6226999199972173, + "grad_norm": 1.0703125, + "learning_rate": 0.000658215138230702, + "loss": 0.7864, + "step": 8951 + }, + { + "epoch": 0.6227694876343525, + "grad_norm": 0.95703125, + "learning_rate": 0.0006580033838654305, + "loss": 0.585, + "step": 8952 + }, + { + "epoch": 0.6228390552714878, + "grad_norm": 1.203125, + "learning_rate": 0.0006577916468650238, + "loss": 0.9401, + "step": 8953 + }, + { + "epoch": 0.6229086229086229, + "grad_norm": 1.4765625, + "learning_rate": 0.0006575799272402326, + "loss": 1.1381, + "step": 8954 + }, + { + "epoch": 0.6229781905457581, + "grad_norm": 1.1796875, + "learning_rate": 0.0006573682250018062, + "loss": 0.8025, + "step": 8955 + }, + { + "epoch": 0.6230477581828933, + "grad_norm": 1.3203125, + "learning_rate": 0.0006571565401604948, + "loss": 0.8385, + "step": 8956 + }, + { + "epoch": 0.6231173258200285, + "grad_norm": 1.078125, + "learning_rate": 0.0006569448727270462, + "loss": 0.7887, + "step": 8957 + }, + { + "epoch": 0.6231868934571637, + "grad_norm": 1.15625, + "learning_rate": 0.0006567332227122078, + "loss": 1.0143, + "step": 8958 + }, + { + "epoch": 0.6232564610942989, + "grad_norm": 1.0, + "learning_rate": 0.0006565215901267259, + "loss": 0.6021, + "step": 8959 + }, + { + "epoch": 0.6233260287314342, + "grad_norm": 1.1328125, + "learning_rate": 0.0006563099749813466, + "loss": 0.7746, + "step": 8960 + }, + { + "epoch": 0.6233955963685693, + "grad_norm": 1.2734375, + "learning_rate": 0.0006560983772868146, + "loss": 0.9201, + "step": 8961 + }, + { + "epoch": 0.6234651640057045, + "grad_norm": 0.92578125, + "learning_rate": 0.0006558867970538733, + "loss": 0.7459, + "step": 8962 + }, + { + "epoch": 0.6235347316428398, + "grad_norm": 1.125, + "learning_rate": 0.000655675234293266, + "loss": 0.7632, + "step": 8963 + }, + { + "epoch": 0.623604299279975, + "grad_norm": 1.5, + "learning_rate": 0.0006554636890157352, + "loss": 0.6866, + "step": 8964 + }, + { + "epoch": 0.6236738669171101, + "grad_norm": 0.90234375, + "learning_rate": 0.0006552521612320214, + "loss": 0.6747, + "step": 8965 + }, + { + "epoch": 0.6237434345542454, + "grad_norm": 1.0859375, + "learning_rate": 0.0006550406509528649, + "loss": 0.8112, + "step": 8966 + }, + { + "epoch": 0.6238130021913806, + "grad_norm": 0.99609375, + "learning_rate": 0.0006548291581890057, + "loss": 0.8227, + "step": 8967 + }, + { + "epoch": 0.6238825698285158, + "grad_norm": 1.0546875, + "learning_rate": 0.0006546176829511823, + "loss": 0.6668, + "step": 8968 + }, + { + "epoch": 0.6239521374656509, + "grad_norm": 0.9921875, + "learning_rate": 0.0006544062252501317, + "loss": 0.7372, + "step": 8969 + }, + { + "epoch": 0.6240217051027862, + "grad_norm": 1.1328125, + "learning_rate": 0.0006541947850965911, + "loss": 0.9286, + "step": 8970 + }, + { + "epoch": 0.6240912727399214, + "grad_norm": 1.3125, + "learning_rate": 0.0006539833625012968, + "loss": 1.1129, + "step": 8971 + }, + { + "epoch": 0.6241608403770565, + "grad_norm": 1.0625, + "learning_rate": 0.0006537719574749828, + "loss": 0.6248, + "step": 8972 + }, + { + "epoch": 0.6242304080141918, + "grad_norm": 1.046875, + "learning_rate": 0.0006535605700283836, + "loss": 0.6675, + "step": 8973 + }, + { + "epoch": 0.624299975651327, + "grad_norm": 1.328125, + "learning_rate": 0.0006533492001722327, + "loss": 0.8368, + "step": 8974 + }, + { + "epoch": 0.6243695432884622, + "grad_norm": 1.3359375, + "learning_rate": 0.0006531378479172624, + "loss": 0.6613, + "step": 8975 + }, + { + "epoch": 0.6244391109255975, + "grad_norm": 1.046875, + "learning_rate": 0.0006529265132742035, + "loss": 0.78, + "step": 8976 + }, + { + "epoch": 0.6245086785627326, + "grad_norm": 1.203125, + "learning_rate": 0.000652715196253787, + "loss": 0.8276, + "step": 8977 + }, + { + "epoch": 0.6245782461998678, + "grad_norm": 1.171875, + "learning_rate": 0.0006525038968667425, + "loss": 0.8103, + "step": 8978 + }, + { + "epoch": 0.6246478138370031, + "grad_norm": 1.5625, + "learning_rate": 0.0006522926151237984, + "loss": 0.7788, + "step": 8979 + }, + { + "epoch": 0.6247173814741382, + "grad_norm": 1.125, + "learning_rate": 0.0006520813510356827, + "loss": 0.6742, + "step": 8980 + }, + { + "epoch": 0.6247869491112734, + "grad_norm": 1.2578125, + "learning_rate": 0.0006518701046131226, + "loss": 1.0233, + "step": 8981 + }, + { + "epoch": 0.6248565167484086, + "grad_norm": 1.1171875, + "learning_rate": 0.000651658875866844, + "loss": 0.7299, + "step": 8982 + }, + { + "epoch": 0.6249260843855439, + "grad_norm": 1.4453125, + "learning_rate": 0.0006514476648075714, + "loss": 0.906, + "step": 8983 + }, + { + "epoch": 0.624995652022679, + "grad_norm": 1.0625, + "learning_rate": 0.0006512364714460297, + "loss": 0.8938, + "step": 8984 + }, + { + "epoch": 0.6250652196598142, + "grad_norm": 1.1015625, + "learning_rate": 0.0006510252957929426, + "loss": 1.0569, + "step": 8985 + }, + { + "epoch": 0.6251347872969495, + "grad_norm": 1.046875, + "learning_rate": 0.0006508141378590316, + "loss": 0.7806, + "step": 8986 + }, + { + "epoch": 0.6252043549340847, + "grad_norm": 1.3125, + "learning_rate": 0.0006506029976550184, + "loss": 0.7813, + "step": 8987 + }, + { + "epoch": 0.6252739225712198, + "grad_norm": 1.578125, + "learning_rate": 0.0006503918751916241, + "loss": 0.8226, + "step": 8988 + }, + { + "epoch": 0.6253434902083551, + "grad_norm": 1.2109375, + "learning_rate": 0.0006501807704795686, + "loss": 0.8797, + "step": 8989 + }, + { + "epoch": 0.6254130578454903, + "grad_norm": 1.0703125, + "learning_rate": 0.0006499696835295698, + "loss": 0.8448, + "step": 8990 + }, + { + "epoch": 0.6254826254826255, + "grad_norm": 1.4453125, + "learning_rate": 0.0006497586143523464, + "loss": 0.8622, + "step": 8991 + }, + { + "epoch": 0.6255521931197607, + "grad_norm": 1.3203125, + "learning_rate": 0.0006495475629586153, + "loss": 0.8413, + "step": 8992 + }, + { + "epoch": 0.6256217607568959, + "grad_norm": 1.1640625, + "learning_rate": 0.0006493365293590927, + "loss": 0.9827, + "step": 8993 + }, + { + "epoch": 0.6256913283940311, + "grad_norm": 1.2109375, + "learning_rate": 0.0006491255135644931, + "loss": 0.6953, + "step": 8994 + }, + { + "epoch": 0.6257608960311662, + "grad_norm": 1.015625, + "learning_rate": 0.0006489145155855318, + "loss": 0.5838, + "step": 8995 + }, + { + "epoch": 0.6258304636683015, + "grad_norm": 0.96875, + "learning_rate": 0.000648703535432922, + "loss": 0.783, + "step": 8996 + }, + { + "epoch": 0.6259000313054367, + "grad_norm": 1.125, + "learning_rate": 0.0006484925731173755, + "loss": 0.8399, + "step": 8997 + }, + { + "epoch": 0.6259695989425719, + "grad_norm": 1.2734375, + "learning_rate": 0.0006482816286496046, + "loss": 0.8525, + "step": 8998 + }, + { + "epoch": 0.6260391665797072, + "grad_norm": 1.2265625, + "learning_rate": 0.0006480707020403198, + "loss": 0.8645, + "step": 8999 + }, + { + "epoch": 0.6261087342168423, + "grad_norm": 0.828125, + "learning_rate": 0.0006478597933002313, + "loss": 0.7281, + "step": 9000 + }, + { + "epoch": 0.6261783018539775, + "grad_norm": 0.859375, + "learning_rate": 0.000647648902440047, + "loss": 0.7644, + "step": 9001 + }, + { + "epoch": 0.6262478694911128, + "grad_norm": 1.34375, + "learning_rate": 0.0006474380294704756, + "loss": 0.7728, + "step": 9002 + }, + { + "epoch": 0.626317437128248, + "grad_norm": 0.94921875, + "learning_rate": 0.0006472271744022243, + "loss": 0.5117, + "step": 9003 + }, + { + "epoch": 0.6263870047653831, + "grad_norm": 0.890625, + "learning_rate": 0.0006470163372459984, + "loss": 0.6666, + "step": 9004 + }, + { + "epoch": 0.6264565724025184, + "grad_norm": 0.94921875, + "learning_rate": 0.0006468055180125043, + "loss": 0.8523, + "step": 9005 + }, + { + "epoch": 0.6265261400396536, + "grad_norm": 0.90234375, + "learning_rate": 0.0006465947167124455, + "loss": 0.7207, + "step": 9006 + }, + { + "epoch": 0.6265957076767887, + "grad_norm": 1.0625, + "learning_rate": 0.000646383933356526, + "loss": 0.5216, + "step": 9007 + }, + { + "epoch": 0.6266652753139239, + "grad_norm": 1.1875, + "learning_rate": 0.0006461731679554476, + "loss": 0.8078, + "step": 9008 + }, + { + "epoch": 0.6267348429510592, + "grad_norm": 1.0625, + "learning_rate": 0.0006459624205199124, + "loss": 0.7409, + "step": 9009 + }, + { + "epoch": 0.6268044105881944, + "grad_norm": 1.21875, + "learning_rate": 0.0006457516910606213, + "loss": 1.0532, + "step": 9010 + }, + { + "epoch": 0.6268739782253295, + "grad_norm": 1.03125, + "learning_rate": 0.0006455409795882737, + "loss": 0.6999, + "step": 9011 + }, + { + "epoch": 0.6269435458624648, + "grad_norm": 1.5, + "learning_rate": 0.0006453302861135681, + "loss": 1.0903, + "step": 9012 + }, + { + "epoch": 0.6270131134996, + "grad_norm": 0.9921875, + "learning_rate": 0.0006451196106472031, + "loss": 0.9208, + "step": 9013 + }, + { + "epoch": 0.6270826811367352, + "grad_norm": 1.0234375, + "learning_rate": 0.0006449089531998759, + "loss": 0.5971, + "step": 9014 + }, + { + "epoch": 0.6271522487738704, + "grad_norm": 1.3671875, + "learning_rate": 0.0006446983137822818, + "loss": 1.0601, + "step": 9015 + }, + { + "epoch": 0.6272218164110056, + "grad_norm": 1.3671875, + "learning_rate": 0.0006444876924051168, + "loss": 0.8429, + "step": 9016 + }, + { + "epoch": 0.6272913840481408, + "grad_norm": 1.3515625, + "learning_rate": 0.0006442770890790749, + "loss": 0.7286, + "step": 9017 + }, + { + "epoch": 0.6273609516852761, + "grad_norm": 1.03125, + "learning_rate": 0.0006440665038148493, + "loss": 0.9192, + "step": 9018 + }, + { + "epoch": 0.6274305193224112, + "grad_norm": 1.203125, + "learning_rate": 0.0006438559366231325, + "loss": 1.0337, + "step": 9019 + }, + { + "epoch": 0.6275000869595464, + "grad_norm": 0.99609375, + "learning_rate": 0.0006436453875146161, + "loss": 0.8791, + "step": 9020 + }, + { + "epoch": 0.6275696545966816, + "grad_norm": 1.109375, + "learning_rate": 0.0006434348564999911, + "loss": 0.5871, + "step": 9021 + }, + { + "epoch": 0.6276392222338169, + "grad_norm": 1.6796875, + "learning_rate": 0.0006432243435899465, + "loss": 0.9247, + "step": 9022 + }, + { + "epoch": 0.627708789870952, + "grad_norm": 1.3671875, + "learning_rate": 0.0006430138487951715, + "loss": 0.7651, + "step": 9023 + }, + { + "epoch": 0.6277783575080872, + "grad_norm": 1.0546875, + "learning_rate": 0.0006428033721263541, + "loss": 0.9575, + "step": 9024 + }, + { + "epoch": 0.6278479251452225, + "grad_norm": 1.28125, + "learning_rate": 0.0006425929135941813, + "loss": 0.7282, + "step": 9025 + }, + { + "epoch": 0.6279174927823576, + "grad_norm": 1.390625, + "learning_rate": 0.0006423824732093383, + "loss": 1.0565, + "step": 9026 + }, + { + "epoch": 0.6279870604194928, + "grad_norm": 1.0859375, + "learning_rate": 0.0006421720509825111, + "loss": 0.8532, + "step": 9027 + }, + { + "epoch": 0.6280566280566281, + "grad_norm": 1.140625, + "learning_rate": 0.0006419616469243837, + "loss": 0.81, + "step": 9028 + }, + { + "epoch": 0.6281261956937633, + "grad_norm": 1.234375, + "learning_rate": 0.0006417512610456389, + "loss": 0.9244, + "step": 9029 + }, + { + "epoch": 0.6281957633308984, + "grad_norm": 1.0625, + "learning_rate": 0.0006415408933569593, + "loss": 0.8321, + "step": 9030 + }, + { + "epoch": 0.6282653309680337, + "grad_norm": 1.125, + "learning_rate": 0.0006413305438690267, + "loss": 0.8649, + "step": 9031 + }, + { + "epoch": 0.6283348986051689, + "grad_norm": 1.2265625, + "learning_rate": 0.0006411202125925213, + "loss": 1.0341, + "step": 9032 + }, + { + "epoch": 0.6284044662423041, + "grad_norm": 1.390625, + "learning_rate": 0.0006409098995381222, + "loss": 0.9063, + "step": 9033 + }, + { + "epoch": 0.6284740338794392, + "grad_norm": 1.4375, + "learning_rate": 0.0006406996047165086, + "loss": 0.7885, + "step": 9034 + }, + { + "epoch": 0.6285436015165745, + "grad_norm": 1.5625, + "learning_rate": 0.0006404893281383583, + "loss": 1.0488, + "step": 9035 + }, + { + "epoch": 0.6286131691537097, + "grad_norm": 1.0546875, + "learning_rate": 0.0006402790698143477, + "loss": 0.8029, + "step": 9036 + }, + { + "epoch": 0.6286827367908449, + "grad_norm": 1.0546875, + "learning_rate": 0.0006400688297551526, + "loss": 0.7888, + "step": 9037 + }, + { + "epoch": 0.6287523044279801, + "grad_norm": 1.3828125, + "learning_rate": 0.0006398586079714485, + "loss": 0.903, + "step": 9038 + }, + { + "epoch": 0.6288218720651153, + "grad_norm": 1.171875, + "learning_rate": 0.000639648404473909, + "loss": 0.8528, + "step": 9039 + }, + { + "epoch": 0.6288914397022505, + "grad_norm": 1.3203125, + "learning_rate": 0.0006394382192732069, + "loss": 0.8498, + "step": 9040 + }, + { + "epoch": 0.6289610073393858, + "grad_norm": 1.1171875, + "learning_rate": 0.0006392280523800149, + "loss": 0.7965, + "step": 9041 + }, + { + "epoch": 0.6290305749765209, + "grad_norm": 1.265625, + "learning_rate": 0.0006390179038050041, + "loss": 0.688, + "step": 9042 + }, + { + "epoch": 0.6291001426136561, + "grad_norm": 0.8046875, + "learning_rate": 0.0006388077735588441, + "loss": 0.6369, + "step": 9043 + }, + { + "epoch": 0.6291697102507914, + "grad_norm": 1.09375, + "learning_rate": 0.0006385976616522054, + "loss": 0.9437, + "step": 9044 + }, + { + "epoch": 0.6292392778879266, + "grad_norm": 1.171875, + "learning_rate": 0.0006383875680957557, + "loss": 0.8621, + "step": 9045 + }, + { + "epoch": 0.6293088455250617, + "grad_norm": 1.109375, + "learning_rate": 0.0006381774929001628, + "loss": 0.683, + "step": 9046 + }, + { + "epoch": 0.6293784131621969, + "grad_norm": 0.984375, + "learning_rate": 0.0006379674360760927, + "loss": 0.7988, + "step": 9047 + }, + { + "epoch": 0.6294479807993322, + "grad_norm": 1.046875, + "learning_rate": 0.0006377573976342114, + "loss": 0.7488, + "step": 9048 + }, + { + "epoch": 0.6295175484364673, + "grad_norm": 1.1328125, + "learning_rate": 0.0006375473775851841, + "loss": 0.7922, + "step": 9049 + }, + { + "epoch": 0.6295871160736025, + "grad_norm": 1.1328125, + "learning_rate": 0.0006373373759396735, + "loss": 0.8314, + "step": 9050 + }, + { + "epoch": 0.6296566837107378, + "grad_norm": 1.0625, + "learning_rate": 0.0006371273927083434, + "loss": 0.765, + "step": 9051 + }, + { + "epoch": 0.629726251347873, + "grad_norm": 1.3515625, + "learning_rate": 0.000636917427901855, + "loss": 0.9509, + "step": 9052 + }, + { + "epoch": 0.6297958189850081, + "grad_norm": 1.15625, + "learning_rate": 0.00063670748153087, + "loss": 0.6709, + "step": 9053 + }, + { + "epoch": 0.6298653866221434, + "grad_norm": 0.96484375, + "learning_rate": 0.0006364975536060475, + "loss": 0.8492, + "step": 9054 + }, + { + "epoch": 0.6299349542592786, + "grad_norm": 1.15625, + "learning_rate": 0.0006362876441380471, + "loss": 0.787, + "step": 9055 + }, + { + "epoch": 0.6300045218964138, + "grad_norm": 0.86328125, + "learning_rate": 0.000636077753137527, + "loss": 0.5293, + "step": 9056 + }, + { + "epoch": 0.630074089533549, + "grad_norm": 0.98828125, + "learning_rate": 0.000635867880615144, + "loss": 0.4938, + "step": 9057 + }, + { + "epoch": 0.6301436571706842, + "grad_norm": 1.359375, + "learning_rate": 0.0006356580265815551, + "loss": 0.9754, + "step": 9058 + }, + { + "epoch": 0.6302132248078194, + "grad_norm": 1.125, + "learning_rate": 0.000635448191047415, + "loss": 0.6962, + "step": 9059 + }, + { + "epoch": 0.6302827924449546, + "grad_norm": 1.171875, + "learning_rate": 0.0006352383740233784, + "loss": 0.5832, + "step": 9060 + }, + { + "epoch": 0.6303523600820898, + "grad_norm": 1.515625, + "learning_rate": 0.0006350285755200984, + "loss": 0.6581, + "step": 9061 + }, + { + "epoch": 0.630421927719225, + "grad_norm": 0.9140625, + "learning_rate": 0.0006348187955482279, + "loss": 0.7414, + "step": 9062 + }, + { + "epoch": 0.6304914953563602, + "grad_norm": 0.9765625, + "learning_rate": 0.0006346090341184183, + "loss": 0.7638, + "step": 9063 + }, + { + "epoch": 0.6305610629934955, + "grad_norm": 1.421875, + "learning_rate": 0.00063439929124132, + "loss": 0.9622, + "step": 9064 + }, + { + "epoch": 0.6306306306306306, + "grad_norm": 1.265625, + "learning_rate": 0.0006341895669275834, + "loss": 0.7825, + "step": 9065 + }, + { + "epoch": 0.6307001982677658, + "grad_norm": 1.0390625, + "learning_rate": 0.0006339798611878565, + "loss": 0.6645, + "step": 9066 + }, + { + "epoch": 0.6307697659049011, + "grad_norm": 0.85546875, + "learning_rate": 0.0006337701740327876, + "loss": 0.5933, + "step": 9067 + }, + { + "epoch": 0.6308393335420363, + "grad_norm": 1.1640625, + "learning_rate": 0.000633560505473023, + "loss": 0.6615, + "step": 9068 + }, + { + "epoch": 0.6309089011791714, + "grad_norm": 0.86328125, + "learning_rate": 0.0006333508555192089, + "loss": 0.6311, + "step": 9069 + }, + { + "epoch": 0.6309784688163067, + "grad_norm": 1.1015625, + "learning_rate": 0.0006331412241819905, + "loss": 0.7053, + "step": 9070 + }, + { + "epoch": 0.6310480364534419, + "grad_norm": 1.3828125, + "learning_rate": 0.0006329316114720114, + "loss": 0.9294, + "step": 9071 + }, + { + "epoch": 0.631117604090577, + "grad_norm": 1.3125, + "learning_rate": 0.0006327220173999153, + "loss": 1.0703, + "step": 9072 + }, + { + "epoch": 0.6311871717277122, + "grad_norm": 0.953125, + "learning_rate": 0.0006325124419763438, + "loss": 0.8098, + "step": 9073 + }, + { + "epoch": 0.6312567393648475, + "grad_norm": 1.1640625, + "learning_rate": 0.0006323028852119383, + "loss": 0.8282, + "step": 9074 + }, + { + "epoch": 0.6313263070019827, + "grad_norm": 1.25, + "learning_rate": 0.0006320933471173385, + "loss": 0.8094, + "step": 9075 + }, + { + "epoch": 0.6313958746391178, + "grad_norm": 1.1484375, + "learning_rate": 0.0006318838277031845, + "loss": 0.8589, + "step": 9076 + }, + { + "epoch": 0.6314654422762531, + "grad_norm": 1.203125, + "learning_rate": 0.0006316743269801142, + "loss": 0.907, + "step": 9077 + }, + { + "epoch": 0.6315350099133883, + "grad_norm": 0.8984375, + "learning_rate": 0.0006314648449587649, + "loss": 0.7442, + "step": 9078 + }, + { + "epoch": 0.6316045775505235, + "grad_norm": 1.1015625, + "learning_rate": 0.0006312553816497737, + "loss": 0.8458, + "step": 9079 + }, + { + "epoch": 0.6316741451876587, + "grad_norm": 1.2109375, + "learning_rate": 0.0006310459370637754, + "loss": 0.9374, + "step": 9080 + }, + { + "epoch": 0.6317437128247939, + "grad_norm": 1.0859375, + "learning_rate": 0.000630836511211405, + "loss": 0.828, + "step": 9081 + }, + { + "epoch": 0.6318132804619291, + "grad_norm": 1.140625, + "learning_rate": 0.000630627104103295, + "loss": 0.879, + "step": 9082 + }, + { + "epoch": 0.6318828480990644, + "grad_norm": 0.89453125, + "learning_rate": 0.0006304177157500796, + "loss": 0.6049, + "step": 9083 + }, + { + "epoch": 0.6319524157361995, + "grad_norm": 1.28125, + "learning_rate": 0.0006302083461623896, + "loss": 0.6668, + "step": 9084 + }, + { + "epoch": 0.6320219833733347, + "grad_norm": 1.3125, + "learning_rate": 0.0006299989953508558, + "loss": 0.8314, + "step": 9085 + }, + { + "epoch": 0.6320915510104699, + "grad_norm": 0.92578125, + "learning_rate": 0.0006297896633261083, + "loss": 0.6113, + "step": 9086 + }, + { + "epoch": 0.6321611186476052, + "grad_norm": 1.1484375, + "learning_rate": 0.0006295803500987755, + "loss": 0.7645, + "step": 9087 + }, + { + "epoch": 0.6322306862847403, + "grad_norm": 1.296875, + "learning_rate": 0.0006293710556794859, + "loss": 0.9322, + "step": 9088 + }, + { + "epoch": 0.6323002539218755, + "grad_norm": 1.109375, + "learning_rate": 0.000629161780078865, + "loss": 0.8756, + "step": 9089 + }, + { + "epoch": 0.6323698215590108, + "grad_norm": 0.98046875, + "learning_rate": 0.0006289525233075406, + "loss": 0.8356, + "step": 9090 + }, + { + "epoch": 0.632439389196146, + "grad_norm": 1.328125, + "learning_rate": 0.0006287432853761365, + "loss": 0.6584, + "step": 9091 + }, + { + "epoch": 0.6325089568332811, + "grad_norm": 1.25, + "learning_rate": 0.0006285340662952775, + "loss": 0.9389, + "step": 9092 + }, + { + "epoch": 0.6325785244704164, + "grad_norm": 1.140625, + "learning_rate": 0.0006283248660755858, + "loss": 1.0067, + "step": 9093 + }, + { + "epoch": 0.6326480921075516, + "grad_norm": 1.0078125, + "learning_rate": 0.0006281156847276841, + "loss": 0.8184, + "step": 9094 + }, + { + "epoch": 0.6327176597446867, + "grad_norm": 1.2890625, + "learning_rate": 0.0006279065222621936, + "loss": 0.982, + "step": 9095 + }, + { + "epoch": 0.632787227381822, + "grad_norm": 1.25, + "learning_rate": 0.0006276973786897342, + "loss": 1.0089, + "step": 9096 + }, + { + "epoch": 0.6328567950189572, + "grad_norm": 0.87109375, + "learning_rate": 0.0006274882540209258, + "loss": 0.598, + "step": 9097 + }, + { + "epoch": 0.6329263626560924, + "grad_norm": 1.140625, + "learning_rate": 0.0006272791482663859, + "loss": 0.6582, + "step": 9098 + }, + { + "epoch": 0.6329959302932275, + "grad_norm": 1.34375, + "learning_rate": 0.0006270700614367326, + "loss": 1.0057, + "step": 9099 + }, + { + "epoch": 0.6330654979303628, + "grad_norm": 1.34375, + "learning_rate": 0.0006268609935425815, + "loss": 1.0524, + "step": 9100 + }, + { + "epoch": 0.633135065567498, + "grad_norm": 1.171875, + "learning_rate": 0.0006266519445945484, + "loss": 0.8258, + "step": 9101 + }, + { + "epoch": 0.6332046332046332, + "grad_norm": 1.046875, + "learning_rate": 0.0006264429146032478, + "loss": 0.8512, + "step": 9102 + }, + { + "epoch": 0.6332742008417684, + "grad_norm": 1.15625, + "learning_rate": 0.000626233903579293, + "loss": 0.8977, + "step": 9103 + }, + { + "epoch": 0.6333437684789036, + "grad_norm": 1.0546875, + "learning_rate": 0.000626024911533297, + "loss": 0.678, + "step": 9104 + }, + { + "epoch": 0.6334133361160388, + "grad_norm": 1.0234375, + "learning_rate": 0.0006258159384758709, + "loss": 0.6439, + "step": 9105 + }, + { + "epoch": 0.6334829037531741, + "grad_norm": 1.1484375, + "learning_rate": 0.0006256069844176256, + "loss": 0.702, + "step": 9106 + }, + { + "epoch": 0.6335524713903092, + "grad_norm": 1.0703125, + "learning_rate": 0.0006253980493691698, + "loss": 0.9091, + "step": 9107 + }, + { + "epoch": 0.6336220390274444, + "grad_norm": 0.9453125, + "learning_rate": 0.0006251891333411136, + "loss": 0.9381, + "step": 9108 + }, + { + "epoch": 0.6336916066645797, + "grad_norm": 1.125, + "learning_rate": 0.0006249802363440638, + "loss": 0.816, + "step": 9109 + }, + { + "epoch": 0.6337611743017149, + "grad_norm": 1.109375, + "learning_rate": 0.0006247713583886272, + "loss": 0.9566, + "step": 9110 + }, + { + "epoch": 0.63383074193885, + "grad_norm": 1.1953125, + "learning_rate": 0.0006245624994854102, + "loss": 0.8885, + "step": 9111 + }, + { + "epoch": 0.6339003095759852, + "grad_norm": 0.94140625, + "learning_rate": 0.0006243536596450168, + "loss": 0.7782, + "step": 9112 + }, + { + "epoch": 0.6339698772131205, + "grad_norm": 1.4296875, + "learning_rate": 0.0006241448388780514, + "loss": 0.9671, + "step": 9113 + }, + { + "epoch": 0.6340394448502557, + "grad_norm": 1.0546875, + "learning_rate": 0.0006239360371951161, + "loss": 0.7471, + "step": 9114 + }, + { + "epoch": 0.6341090124873908, + "grad_norm": 1.453125, + "learning_rate": 0.0006237272546068137, + "loss": 0.8383, + "step": 9115 + }, + { + "epoch": 0.6341785801245261, + "grad_norm": 1.0703125, + "learning_rate": 0.0006235184911237449, + "loss": 0.7559, + "step": 9116 + }, + { + "epoch": 0.6342481477616613, + "grad_norm": 1.4140625, + "learning_rate": 0.0006233097467565092, + "loss": 0.8646, + "step": 9117 + }, + { + "epoch": 0.6343177153987964, + "grad_norm": 1.171875, + "learning_rate": 0.0006231010215157062, + "loss": 0.8285, + "step": 9118 + }, + { + "epoch": 0.6343872830359317, + "grad_norm": 0.78125, + "learning_rate": 0.0006228923154119334, + "loss": 0.6399, + "step": 9119 + }, + { + "epoch": 0.6344568506730669, + "grad_norm": 1.078125, + "learning_rate": 0.0006226836284557885, + "loss": 0.8933, + "step": 9120 + }, + { + "epoch": 0.6345264183102021, + "grad_norm": 0.90625, + "learning_rate": 0.0006224749606578662, + "loss": 0.64, + "step": 9121 + }, + { + "epoch": 0.6345959859473373, + "grad_norm": 1.5234375, + "learning_rate": 0.0006222663120287633, + "loss": 0.8596, + "step": 9122 + }, + { + "epoch": 0.6346655535844725, + "grad_norm": 1.390625, + "learning_rate": 0.0006220576825790729, + "loss": 0.9377, + "step": 9123 + }, + { + "epoch": 0.6347351212216077, + "grad_norm": 1.3984375, + "learning_rate": 0.0006218490723193884, + "loss": 1.0686, + "step": 9124 + }, + { + "epoch": 0.6348046888587429, + "grad_norm": 0.87109375, + "learning_rate": 0.0006216404812603021, + "loss": 0.7806, + "step": 9125 + }, + { + "epoch": 0.6348742564958781, + "grad_norm": 0.94140625, + "learning_rate": 0.0006214319094124051, + "loss": 0.7259, + "step": 9126 + }, + { + "epoch": 0.6349438241330133, + "grad_norm": 0.9921875, + "learning_rate": 0.0006212233567862875, + "loss": 1.2013, + "step": 9127 + }, + { + "epoch": 0.6350133917701485, + "grad_norm": 0.953125, + "learning_rate": 0.0006210148233925385, + "loss": 0.6091, + "step": 9128 + }, + { + "epoch": 0.6350829594072838, + "grad_norm": 1.125, + "learning_rate": 0.000620806309241747, + "loss": 0.8251, + "step": 9129 + }, + { + "epoch": 0.6351525270444189, + "grad_norm": 1.03125, + "learning_rate": 0.0006205978143444996, + "loss": 0.7176, + "step": 9130 + }, + { + "epoch": 0.6352220946815541, + "grad_norm": 1.09375, + "learning_rate": 0.0006203893387113826, + "loss": 0.8539, + "step": 9131 + }, + { + "epoch": 0.6352916623186894, + "grad_norm": 1.2734375, + "learning_rate": 0.0006201808823529819, + "loss": 0.7645, + "step": 9132 + }, + { + "epoch": 0.6353612299558246, + "grad_norm": 1.234375, + "learning_rate": 0.0006199724452798816, + "loss": 0.8681, + "step": 9133 + }, + { + "epoch": 0.6354307975929597, + "grad_norm": 1.3515625, + "learning_rate": 0.000619764027502665, + "loss": 0.9698, + "step": 9134 + }, + { + "epoch": 0.635500365230095, + "grad_norm": 1.046875, + "learning_rate": 0.0006195556290319143, + "loss": 0.9378, + "step": 9135 + }, + { + "epoch": 0.6355699328672302, + "grad_norm": 1.046875, + "learning_rate": 0.0006193472498782116, + "loss": 0.7968, + "step": 9136 + }, + { + "epoch": 0.6356395005043654, + "grad_norm": 1.40625, + "learning_rate": 0.0006191388900521368, + "loss": 0.8968, + "step": 9137 + }, + { + "epoch": 0.6357090681415005, + "grad_norm": 1.2109375, + "learning_rate": 0.000618930549564269, + "loss": 0.8122, + "step": 9138 + }, + { + "epoch": 0.6357786357786358, + "grad_norm": 0.8671875, + "learning_rate": 0.0006187222284251879, + "loss": 0.7596, + "step": 9139 + }, + { + "epoch": 0.635848203415771, + "grad_norm": 1.0546875, + "learning_rate": 0.0006185139266454698, + "loss": 0.5965, + "step": 9140 + }, + { + "epoch": 0.6359177710529061, + "grad_norm": 1.3203125, + "learning_rate": 0.0006183056442356918, + "loss": 1.0042, + "step": 9141 + }, + { + "epoch": 0.6359873386900414, + "grad_norm": 1.078125, + "learning_rate": 0.0006180973812064291, + "loss": 0.8028, + "step": 9142 + }, + { + "epoch": 0.6360569063271766, + "grad_norm": 1.2265625, + "learning_rate": 0.000617889137568257, + "loss": 0.9763, + "step": 9143 + }, + { + "epoch": 0.6361264739643118, + "grad_norm": 1.015625, + "learning_rate": 0.000617680913331748, + "loss": 0.5821, + "step": 9144 + }, + { + "epoch": 0.636196041601447, + "grad_norm": 1.09375, + "learning_rate": 0.0006174727085074751, + "loss": 0.637, + "step": 9145 + }, + { + "epoch": 0.6362656092385822, + "grad_norm": 1.2265625, + "learning_rate": 0.0006172645231060103, + "loss": 0.9573, + "step": 9146 + }, + { + "epoch": 0.6363351768757174, + "grad_norm": 1.5234375, + "learning_rate": 0.000617056357137924, + "loss": 0.8373, + "step": 9147 + }, + { + "epoch": 0.6364047445128527, + "grad_norm": 1.203125, + "learning_rate": 0.0006168482106137854, + "loss": 0.7554, + "step": 9148 + }, + { + "epoch": 0.6364743121499878, + "grad_norm": 1.0078125, + "learning_rate": 0.0006166400835441635, + "loss": 0.6937, + "step": 9149 + }, + { + "epoch": 0.636543879787123, + "grad_norm": 1.078125, + "learning_rate": 0.0006164319759396261, + "loss": 0.6661, + "step": 9150 + }, + { + "epoch": 0.6366134474242582, + "grad_norm": 1.265625, + "learning_rate": 0.0006162238878107394, + "loss": 0.7754, + "step": 9151 + }, + { + "epoch": 0.6366830150613935, + "grad_norm": 1.34375, + "learning_rate": 0.0006160158191680691, + "loss": 0.79, + "step": 9152 + }, + { + "epoch": 0.6367525826985286, + "grad_norm": 0.85546875, + "learning_rate": 0.0006158077700221805, + "loss": 0.6313, + "step": 9153 + }, + { + "epoch": 0.6368221503356638, + "grad_norm": 1.1171875, + "learning_rate": 0.0006155997403836369, + "loss": 0.8359, + "step": 9154 + }, + { + "epoch": 0.6368917179727991, + "grad_norm": 0.9765625, + "learning_rate": 0.0006153917302630007, + "loss": 0.6287, + "step": 9155 + }, + { + "epoch": 0.6369612856099343, + "grad_norm": 1.1328125, + "learning_rate": 0.0006151837396708337, + "loss": 1.0975, + "step": 9156 + }, + { + "epoch": 0.6370308532470694, + "grad_norm": 1.078125, + "learning_rate": 0.0006149757686176973, + "loss": 0.6502, + "step": 9157 + }, + { + "epoch": 0.6371004208842047, + "grad_norm": 1.203125, + "learning_rate": 0.0006147678171141504, + "loss": 0.8921, + "step": 9158 + }, + { + "epoch": 0.6371699885213399, + "grad_norm": 1.1484375, + "learning_rate": 0.0006145598851707519, + "loss": 0.9472, + "step": 9159 + }, + { + "epoch": 0.637239556158475, + "grad_norm": 1.140625, + "learning_rate": 0.0006143519727980597, + "loss": 0.8791, + "step": 9160 + }, + { + "epoch": 0.6373091237956103, + "grad_norm": 1.3359375, + "learning_rate": 0.0006141440800066309, + "loss": 0.9028, + "step": 9161 + }, + { + "epoch": 0.6373786914327455, + "grad_norm": 1.2109375, + "learning_rate": 0.0006139362068070207, + "loss": 0.7902, + "step": 9162 + }, + { + "epoch": 0.6374482590698807, + "grad_norm": 1.078125, + "learning_rate": 0.0006137283532097837, + "loss": 0.7436, + "step": 9163 + }, + { + "epoch": 0.6375178267070158, + "grad_norm": 1.375, + "learning_rate": 0.0006135205192254742, + "loss": 0.8525, + "step": 9164 + }, + { + "epoch": 0.6375873943441511, + "grad_norm": 1.2578125, + "learning_rate": 0.0006133127048646448, + "loss": 0.8239, + "step": 9165 + }, + { + "epoch": 0.6376569619812863, + "grad_norm": 1.1796875, + "learning_rate": 0.0006131049101378472, + "loss": 0.7516, + "step": 9166 + }, + { + "epoch": 0.6377265296184215, + "grad_norm": 1.171875, + "learning_rate": 0.0006128971350556319, + "loss": 0.7965, + "step": 9167 + }, + { + "epoch": 0.6377960972555567, + "grad_norm": 1.1640625, + "learning_rate": 0.0006126893796285493, + "loss": 0.7398, + "step": 9168 + }, + { + "epoch": 0.6378656648926919, + "grad_norm": 1.0703125, + "learning_rate": 0.0006124816438671476, + "loss": 0.7929, + "step": 9169 + }, + { + "epoch": 0.6379352325298271, + "grad_norm": 1.03125, + "learning_rate": 0.0006122739277819747, + "loss": 0.7309, + "step": 9170 + }, + { + "epoch": 0.6380048001669624, + "grad_norm": 1.3203125, + "learning_rate": 0.0006120662313835776, + "loss": 1.0432, + "step": 9171 + }, + { + "epoch": 0.6380743678040975, + "grad_norm": 1.234375, + "learning_rate": 0.0006118585546825019, + "loss": 0.7969, + "step": 9172 + }, + { + "epoch": 0.6381439354412327, + "grad_norm": 1.1484375, + "learning_rate": 0.0006116508976892925, + "loss": 0.6666, + "step": 9173 + }, + { + "epoch": 0.638213503078368, + "grad_norm": 0.88671875, + "learning_rate": 0.0006114432604144928, + "loss": 0.6517, + "step": 9174 + }, + { + "epoch": 0.6382830707155032, + "grad_norm": 1.0390625, + "learning_rate": 0.0006112356428686463, + "loss": 0.7091, + "step": 9175 + }, + { + "epoch": 0.6383526383526383, + "grad_norm": 1.2734375, + "learning_rate": 0.0006110280450622943, + "loss": 0.9462, + "step": 9176 + }, + { + "epoch": 0.6384222059897735, + "grad_norm": 0.90625, + "learning_rate": 0.0006108204670059772, + "loss": 0.9103, + "step": 9177 + }, + { + "epoch": 0.6384917736269088, + "grad_norm": 1.03125, + "learning_rate": 0.0006106129087102354, + "loss": 0.827, + "step": 9178 + }, + { + "epoch": 0.638561341264044, + "grad_norm": 1.2734375, + "learning_rate": 0.0006104053701856076, + "loss": 0.9415, + "step": 9179 + }, + { + "epoch": 0.6386309089011791, + "grad_norm": 1.1328125, + "learning_rate": 0.0006101978514426312, + "loss": 0.7695, + "step": 9180 + }, + { + "epoch": 0.6387004765383144, + "grad_norm": 1.375, + "learning_rate": 0.000609990352491843, + "loss": 0.6326, + "step": 9181 + }, + { + "epoch": 0.6387700441754496, + "grad_norm": 1.0234375, + "learning_rate": 0.0006097828733437794, + "loss": 0.6975, + "step": 9182 + }, + { + "epoch": 0.6388396118125848, + "grad_norm": 1.1796875, + "learning_rate": 0.0006095754140089744, + "loss": 0.9786, + "step": 9183 + }, + { + "epoch": 0.63890917944972, + "grad_norm": 1.125, + "learning_rate": 0.0006093679744979617, + "loss": 0.9109, + "step": 9184 + }, + { + "epoch": 0.6389787470868552, + "grad_norm": 0.79296875, + "learning_rate": 0.0006091605548212746, + "loss": 0.7764, + "step": 9185 + }, + { + "epoch": 0.6390483147239904, + "grad_norm": 1.140625, + "learning_rate": 0.0006089531549894447, + "loss": 0.6927, + "step": 9186 + }, + { + "epoch": 0.6391178823611257, + "grad_norm": 1.28125, + "learning_rate": 0.0006087457750130023, + "loss": 0.9123, + "step": 9187 + }, + { + "epoch": 0.6391874499982608, + "grad_norm": 1.0546875, + "learning_rate": 0.0006085384149024773, + "loss": 0.7084, + "step": 9188 + }, + { + "epoch": 0.639257017635396, + "grad_norm": 1.078125, + "learning_rate": 0.000608331074668399, + "loss": 1.138, + "step": 9189 + }, + { + "epoch": 0.6393265852725312, + "grad_norm": 1.4296875, + "learning_rate": 0.000608123754321294, + "loss": 0.9994, + "step": 9190 + }, + { + "epoch": 0.6393961529096664, + "grad_norm": 1.3203125, + "learning_rate": 0.0006079164538716897, + "loss": 0.98, + "step": 9191 + }, + { + "epoch": 0.6394657205468016, + "grad_norm": 1.015625, + "learning_rate": 0.0006077091733301117, + "loss": 0.6393, + "step": 9192 + }, + { + "epoch": 0.6395352881839368, + "grad_norm": 1.4921875, + "learning_rate": 0.0006075019127070849, + "loss": 1.0215, + "step": 9193 + }, + { + "epoch": 0.6396048558210721, + "grad_norm": 1.03125, + "learning_rate": 0.0006072946720131323, + "loss": 0.9159, + "step": 9194 + }, + { + "epoch": 0.6396744234582072, + "grad_norm": 0.96484375, + "learning_rate": 0.0006070874512587766, + "loss": 0.5661, + "step": 9195 + }, + { + "epoch": 0.6397439910953424, + "grad_norm": 1.3359375, + "learning_rate": 0.0006068802504545402, + "loss": 0.8845, + "step": 9196 + }, + { + "epoch": 0.6398135587324777, + "grad_norm": 1.1484375, + "learning_rate": 0.000606673069610943, + "loss": 0.6966, + "step": 9197 + }, + { + "epoch": 0.6398831263696129, + "grad_norm": 1.25, + "learning_rate": 0.0006064659087385047, + "loss": 0.691, + "step": 9198 + }, + { + "epoch": 0.639952694006748, + "grad_norm": 1.2109375, + "learning_rate": 0.0006062587678477441, + "loss": 0.8337, + "step": 9199 + }, + { + "epoch": 0.6400222616438833, + "grad_norm": 1.0859375, + "learning_rate": 0.0006060516469491788, + "loss": 0.7723, + "step": 9200 + }, + { + "epoch": 0.6400918292810185, + "grad_norm": 1.171875, + "learning_rate": 0.0006058445460533251, + "loss": 0.7276, + "step": 9201 + }, + { + "epoch": 0.6401613969181537, + "grad_norm": 1.515625, + "learning_rate": 0.0006056374651706985, + "loss": 0.9775, + "step": 9202 + }, + { + "epoch": 0.6402309645552888, + "grad_norm": 1.2265625, + "learning_rate": 0.0006054304043118141, + "loss": 0.8342, + "step": 9203 + }, + { + "epoch": 0.6403005321924241, + "grad_norm": 0.9921875, + "learning_rate": 0.0006052233634871847, + "loss": 0.741, + "step": 9204 + }, + { + "epoch": 0.6403700998295593, + "grad_norm": 1.3828125, + "learning_rate": 0.000605016342707323, + "loss": 0.9725, + "step": 9205 + }, + { + "epoch": 0.6404396674666945, + "grad_norm": 1.15625, + "learning_rate": 0.0006048093419827405, + "loss": 0.6572, + "step": 9206 + }, + { + "epoch": 0.6405092351038297, + "grad_norm": 1.125, + "learning_rate": 0.0006046023613239482, + "loss": 0.7487, + "step": 9207 + }, + { + "epoch": 0.6405788027409649, + "grad_norm": 1.4140625, + "learning_rate": 0.0006043954007414548, + "loss": 1.0782, + "step": 9208 + }, + { + "epoch": 0.6406483703781001, + "grad_norm": 1.078125, + "learning_rate": 0.0006041884602457685, + "loss": 0.9721, + "step": 9209 + }, + { + "epoch": 0.6407179380152354, + "grad_norm": 1.2578125, + "learning_rate": 0.0006039815398473978, + "loss": 1.022, + "step": 9210 + }, + { + "epoch": 0.6407875056523705, + "grad_norm": 1.1015625, + "learning_rate": 0.0006037746395568481, + "loss": 0.9364, + "step": 9211 + }, + { + "epoch": 0.6408570732895057, + "grad_norm": 1.0078125, + "learning_rate": 0.0006035677593846249, + "loss": 0.8639, + "step": 9212 + }, + { + "epoch": 0.640926640926641, + "grad_norm": 1.28125, + "learning_rate": 0.0006033608993412329, + "loss": 0.8383, + "step": 9213 + }, + { + "epoch": 0.6409962085637761, + "grad_norm": 1.125, + "learning_rate": 0.0006031540594371755, + "loss": 0.7661, + "step": 9214 + }, + { + "epoch": 0.6410657762009113, + "grad_norm": 0.90234375, + "learning_rate": 0.0006029472396829545, + "loss": 0.514, + "step": 9215 + }, + { + "epoch": 0.6411353438380465, + "grad_norm": 1.09375, + "learning_rate": 0.0006027404400890711, + "loss": 0.9713, + "step": 9216 + }, + { + "epoch": 0.6412049114751818, + "grad_norm": 1.0859375, + "learning_rate": 0.0006025336606660262, + "loss": 0.6745, + "step": 9217 + }, + { + "epoch": 0.6412744791123169, + "grad_norm": 1.0, + "learning_rate": 0.0006023269014243186, + "loss": 0.6754, + "step": 9218 + }, + { + "epoch": 0.6413440467494521, + "grad_norm": 1.1171875, + "learning_rate": 0.0006021201623744462, + "loss": 0.7241, + "step": 9219 + }, + { + "epoch": 0.6414136143865874, + "grad_norm": 1.0078125, + "learning_rate": 0.0006019134435269066, + "loss": 0.8975, + "step": 9220 + }, + { + "epoch": 0.6414831820237226, + "grad_norm": 1.015625, + "learning_rate": 0.0006017067448921962, + "loss": 0.732, + "step": 9221 + }, + { + "epoch": 0.6415527496608577, + "grad_norm": 0.71484375, + "learning_rate": 0.0006015000664808096, + "loss": 0.4636, + "step": 9222 + }, + { + "epoch": 0.641622317297993, + "grad_norm": 1.140625, + "learning_rate": 0.0006012934083032406, + "loss": 0.7586, + "step": 9223 + }, + { + "epoch": 0.6416918849351282, + "grad_norm": 1.15625, + "learning_rate": 0.0006010867703699831, + "loss": 0.8372, + "step": 9224 + }, + { + "epoch": 0.6417614525722634, + "grad_norm": 0.9453125, + "learning_rate": 0.0006008801526915288, + "loss": 0.7143, + "step": 9225 + }, + { + "epoch": 0.6418310202093986, + "grad_norm": 1.140625, + "learning_rate": 0.0006006735552783683, + "loss": 0.8462, + "step": 9226 + }, + { + "epoch": 0.6419005878465338, + "grad_norm": 1.140625, + "learning_rate": 0.0006004669781409922, + "loss": 0.6911, + "step": 9227 + }, + { + "epoch": 0.641970155483669, + "grad_norm": 1.2578125, + "learning_rate": 0.0006002604212898892, + "loss": 0.8371, + "step": 9228 + }, + { + "epoch": 0.6420397231208042, + "grad_norm": 1.1953125, + "learning_rate": 0.000600053884735547, + "loss": 0.7316, + "step": 9229 + }, + { + "epoch": 0.6421092907579394, + "grad_norm": 1.921875, + "learning_rate": 0.0005998473684884525, + "loss": 0.8465, + "step": 9230 + }, + { + "epoch": 0.6421788583950746, + "grad_norm": 0.96875, + "learning_rate": 0.0005996408725590918, + "loss": 0.7709, + "step": 9231 + }, + { + "epoch": 0.6422484260322098, + "grad_norm": 1.1484375, + "learning_rate": 0.0005994343969579498, + "loss": 0.5967, + "step": 9232 + }, + { + "epoch": 0.642317993669345, + "grad_norm": 0.96875, + "learning_rate": 0.00059922794169551, + "loss": 0.9269, + "step": 9233 + }, + { + "epoch": 0.6423875613064802, + "grad_norm": 1.0390625, + "learning_rate": 0.0005990215067822553, + "loss": 0.979, + "step": 9234 + }, + { + "epoch": 0.6424571289436154, + "grad_norm": 1.2109375, + "learning_rate": 0.0005988150922286676, + "loss": 0.8963, + "step": 9235 + }, + { + "epoch": 0.6425266965807507, + "grad_norm": 0.9296875, + "learning_rate": 0.0005986086980452272, + "loss": 0.7806, + "step": 9236 + }, + { + "epoch": 0.6425962642178858, + "grad_norm": 1.25, + "learning_rate": 0.0005984023242424138, + "loss": 0.8006, + "step": 9237 + }, + { + "epoch": 0.642665831855021, + "grad_norm": 1.1171875, + "learning_rate": 0.0005981959708307063, + "loss": 0.7691, + "step": 9238 + }, + { + "epoch": 0.6427353994921563, + "grad_norm": 1.1171875, + "learning_rate": 0.0005979896378205824, + "loss": 0.477, + "step": 9239 + }, + { + "epoch": 0.6428049671292915, + "grad_norm": 1.125, + "learning_rate": 0.000597783325222518, + "loss": 0.8926, + "step": 9240 + }, + { + "epoch": 0.6428745347664266, + "grad_norm": 1.1015625, + "learning_rate": 0.0005975770330469892, + "loss": 0.6737, + "step": 9241 + }, + { + "epoch": 0.6429441024035618, + "grad_norm": 1.53125, + "learning_rate": 0.0005973707613044706, + "loss": 0.9849, + "step": 9242 + }, + { + "epoch": 0.6430136700406971, + "grad_norm": 1.359375, + "learning_rate": 0.000597164510005435, + "loss": 0.8287, + "step": 9243 + }, + { + "epoch": 0.6430832376778323, + "grad_norm": 0.98046875, + "learning_rate": 0.0005969582791603551, + "loss": 0.6986, + "step": 9244 + }, + { + "epoch": 0.6431528053149674, + "grad_norm": 1.3359375, + "learning_rate": 0.0005967520687797023, + "loss": 0.9721, + "step": 9245 + }, + { + "epoch": 0.6432223729521027, + "grad_norm": 1.46875, + "learning_rate": 0.0005965458788739473, + "loss": 0.8994, + "step": 9246 + }, + { + "epoch": 0.6432919405892379, + "grad_norm": 0.88671875, + "learning_rate": 0.0005963397094535587, + "loss": 0.6767, + "step": 9247 + }, + { + "epoch": 0.6433615082263731, + "grad_norm": 1.1875, + "learning_rate": 0.000596133560529005, + "loss": 0.9858, + "step": 9248 + }, + { + "epoch": 0.6434310758635083, + "grad_norm": 1.2734375, + "learning_rate": 0.0005959274321107535, + "loss": 0.8106, + "step": 9249 + }, + { + "epoch": 0.6435006435006435, + "grad_norm": 1.3125, + "learning_rate": 0.0005957213242092707, + "loss": 0.7467, + "step": 9250 + }, + { + "epoch": 0.6435702111377787, + "grad_norm": 1.203125, + "learning_rate": 0.0005955152368350207, + "loss": 0.7088, + "step": 9251 + }, + { + "epoch": 0.643639778774914, + "grad_norm": 0.98046875, + "learning_rate": 0.0005953091699984687, + "loss": 0.7812, + "step": 9252 + }, + { + "epoch": 0.6437093464120491, + "grad_norm": 0.890625, + "learning_rate": 0.0005951031237100773, + "loss": 0.4536, + "step": 9253 + }, + { + "epoch": 0.6437789140491843, + "grad_norm": 1.359375, + "learning_rate": 0.0005948970979803082, + "loss": 1.0513, + "step": 9254 + }, + { + "epoch": 0.6438484816863195, + "grad_norm": 1.0703125, + "learning_rate": 0.0005946910928196224, + "loss": 0.7437, + "step": 9255 + }, + { + "epoch": 0.6439180493234548, + "grad_norm": 0.8671875, + "learning_rate": 0.0005944851082384802, + "loss": 0.7235, + "step": 9256 + }, + { + "epoch": 0.6439876169605899, + "grad_norm": 1.0234375, + "learning_rate": 0.0005942791442473405, + "loss": 0.9745, + "step": 9257 + }, + { + "epoch": 0.6440571845977251, + "grad_norm": 1.2734375, + "learning_rate": 0.0005940732008566605, + "loss": 0.753, + "step": 9258 + }, + { + "epoch": 0.6441267522348604, + "grad_norm": 1.25, + "learning_rate": 0.0005938672780768974, + "loss": 1.0134, + "step": 9259 + }, + { + "epoch": 0.6441963198719955, + "grad_norm": 1.3125, + "learning_rate": 0.0005936613759185073, + "loss": 0.9824, + "step": 9260 + }, + { + "epoch": 0.6442658875091307, + "grad_norm": 1.234375, + "learning_rate": 0.0005934554943919442, + "loss": 0.8076, + "step": 9261 + }, + { + "epoch": 0.644335455146266, + "grad_norm": 1.1640625, + "learning_rate": 0.0005932496335076616, + "loss": 0.7829, + "step": 9262 + }, + { + "epoch": 0.6444050227834012, + "grad_norm": 1.2890625, + "learning_rate": 0.0005930437932761126, + "loss": 0.7276, + "step": 9263 + }, + { + "epoch": 0.6444745904205363, + "grad_norm": 1.0859375, + "learning_rate": 0.0005928379737077489, + "loss": 0.7165, + "step": 9264 + }, + { + "epoch": 0.6445441580576716, + "grad_norm": 1.0546875, + "learning_rate": 0.0005926321748130201, + "loss": 0.7109, + "step": 9265 + }, + { + "epoch": 0.6446137256948068, + "grad_norm": 1.1953125, + "learning_rate": 0.0005924263966023767, + "loss": 0.9391, + "step": 9266 + }, + { + "epoch": 0.644683293331942, + "grad_norm": 1.0078125, + "learning_rate": 0.0005922206390862663, + "loss": 0.8388, + "step": 9267 + }, + { + "epoch": 0.6447528609690771, + "grad_norm": 1.15625, + "learning_rate": 0.0005920149022751366, + "loss": 0.9156, + "step": 9268 + }, + { + "epoch": 0.6448224286062124, + "grad_norm": 1.125, + "learning_rate": 0.0005918091861794334, + "loss": 0.7867, + "step": 9269 + }, + { + "epoch": 0.6448919962433476, + "grad_norm": 1.1015625, + "learning_rate": 0.0005916034908096026, + "loss": 0.8344, + "step": 9270 + }, + { + "epoch": 0.6449615638804828, + "grad_norm": 1.1640625, + "learning_rate": 0.0005913978161760883, + "loss": 0.697, + "step": 9271 + }, + { + "epoch": 0.645031131517618, + "grad_norm": 1.265625, + "learning_rate": 0.0005911921622893331, + "loss": 0.7987, + "step": 9272 + }, + { + "epoch": 0.6451006991547532, + "grad_norm": 1.1171875, + "learning_rate": 0.0005909865291597792, + "loss": 0.7588, + "step": 9273 + }, + { + "epoch": 0.6451702667918884, + "grad_norm": 0.8828125, + "learning_rate": 0.0005907809167978682, + "loss": 0.7214, + "step": 9274 + }, + { + "epoch": 0.6452398344290237, + "grad_norm": 0.94140625, + "learning_rate": 0.0005905753252140394, + "loss": 0.8408, + "step": 9275 + }, + { + "epoch": 0.6453094020661588, + "grad_norm": 1.078125, + "learning_rate": 0.0005903697544187318, + "loss": 1.061, + "step": 9276 + }, + { + "epoch": 0.645378969703294, + "grad_norm": 0.9921875, + "learning_rate": 0.0005901642044223834, + "loss": 0.6878, + "step": 9277 + }, + { + "epoch": 0.6454485373404293, + "grad_norm": 1.0546875, + "learning_rate": 0.0005899586752354314, + "loss": 0.7811, + "step": 9278 + }, + { + "epoch": 0.6455181049775645, + "grad_norm": 1.1171875, + "learning_rate": 0.0005897531668683104, + "loss": 0.7057, + "step": 9279 + }, + { + "epoch": 0.6455876726146996, + "grad_norm": 1.234375, + "learning_rate": 0.0005895476793314563, + "loss": 0.8484, + "step": 9280 + }, + { + "epoch": 0.6456572402518348, + "grad_norm": 1.265625, + "learning_rate": 0.0005893422126353021, + "loss": 0.8589, + "step": 9281 + }, + { + "epoch": 0.6457268078889701, + "grad_norm": 1.0546875, + "learning_rate": 0.0005891367667902807, + "loss": 0.9069, + "step": 9282 + }, + { + "epoch": 0.6457963755261052, + "grad_norm": 1.65625, + "learning_rate": 0.0005889313418068229, + "loss": 0.9748, + "step": 9283 + }, + { + "epoch": 0.6458659431632404, + "grad_norm": 1.125, + "learning_rate": 0.0005887259376953597, + "loss": 0.804, + "step": 9284 + }, + { + "epoch": 0.6459355108003757, + "grad_norm": 1.015625, + "learning_rate": 0.0005885205544663208, + "loss": 0.7784, + "step": 9285 + }, + { + "epoch": 0.6460050784375109, + "grad_norm": 1.0234375, + "learning_rate": 0.0005883151921301337, + "loss": 0.6842, + "step": 9286 + }, + { + "epoch": 0.646074646074646, + "grad_norm": 1.1015625, + "learning_rate": 0.0005881098506972265, + "loss": 0.709, + "step": 9287 + }, + { + "epoch": 0.6461442137117813, + "grad_norm": 0.9921875, + "learning_rate": 0.0005879045301780247, + "loss": 0.8058, + "step": 9288 + }, + { + "epoch": 0.6462137813489165, + "grad_norm": 1.015625, + "learning_rate": 0.000587699230582954, + "loss": 0.6286, + "step": 9289 + }, + { + "epoch": 0.6462833489860517, + "grad_norm": 1.1640625, + "learning_rate": 0.0005874939519224378, + "loss": 1.0014, + "step": 9290 + }, + { + "epoch": 0.646352916623187, + "grad_norm": 1.2421875, + "learning_rate": 0.0005872886942068999, + "loss": 0.8455, + "step": 9291 + }, + { + "epoch": 0.6464224842603221, + "grad_norm": 1.0859375, + "learning_rate": 0.0005870834574467621, + "loss": 0.7493, + "step": 9292 + }, + { + "epoch": 0.6464920518974573, + "grad_norm": 2.328125, + "learning_rate": 0.0005868782416524446, + "loss": 0.8493, + "step": 9293 + }, + { + "epoch": 0.6465616195345925, + "grad_norm": 1.0546875, + "learning_rate": 0.0005866730468343678, + "loss": 0.5316, + "step": 9294 + }, + { + "epoch": 0.6466311871717277, + "grad_norm": 1.0078125, + "learning_rate": 0.0005864678730029503, + "loss": 0.6703, + "step": 9295 + }, + { + "epoch": 0.6467007548088629, + "grad_norm": 1.171875, + "learning_rate": 0.0005862627201686102, + "loss": 1.0093, + "step": 9296 + }, + { + "epoch": 0.6467703224459981, + "grad_norm": 0.9609375, + "learning_rate": 0.0005860575883417634, + "loss": 0.7401, + "step": 9297 + }, + { + "epoch": 0.6468398900831334, + "grad_norm": 1.0625, + "learning_rate": 0.000585852477532826, + "loss": 0.6057, + "step": 9298 + }, + { + "epoch": 0.6469094577202685, + "grad_norm": 0.9921875, + "learning_rate": 0.0005856473877522126, + "loss": 0.7363, + "step": 9299 + }, + { + "epoch": 0.6469790253574037, + "grad_norm": 1.09375, + "learning_rate": 0.0005854423190103357, + "loss": 0.7201, + "step": 9300 + }, + { + "epoch": 0.647048592994539, + "grad_norm": 1.1015625, + "learning_rate": 0.0005852372713176088, + "loss": 0.6854, + "step": 9301 + }, + { + "epoch": 0.6471181606316742, + "grad_norm": 1.015625, + "learning_rate": 0.0005850322446844427, + "loss": 0.8433, + "step": 9302 + }, + { + "epoch": 0.6471877282688093, + "grad_norm": 0.96484375, + "learning_rate": 0.0005848272391212477, + "loss": 0.6544, + "step": 9303 + }, + { + "epoch": 0.6472572959059446, + "grad_norm": 1.2421875, + "learning_rate": 0.0005846222546384325, + "loss": 1.06, + "step": 9304 + }, + { + "epoch": 0.6473268635430798, + "grad_norm": 0.921875, + "learning_rate": 0.0005844172912464057, + "loss": 0.6242, + "step": 9305 + }, + { + "epoch": 0.647396431180215, + "grad_norm": 1.1484375, + "learning_rate": 0.0005842123489555744, + "loss": 0.7195, + "step": 9306 + }, + { + "epoch": 0.6474659988173501, + "grad_norm": 1.046875, + "learning_rate": 0.0005840074277763437, + "loss": 0.7005, + "step": 9307 + }, + { + "epoch": 0.6475355664544854, + "grad_norm": 1.1640625, + "learning_rate": 0.0005838025277191197, + "loss": 0.7961, + "step": 9308 + }, + { + "epoch": 0.6476051340916206, + "grad_norm": 0.8828125, + "learning_rate": 0.0005835976487943055, + "loss": 0.9478, + "step": 9309 + }, + { + "epoch": 0.6476747017287557, + "grad_norm": 0.96875, + "learning_rate": 0.0005833927910123036, + "loss": 0.6468, + "step": 9310 + }, + { + "epoch": 0.647744269365891, + "grad_norm": 1.109375, + "learning_rate": 0.0005831879543835157, + "loss": 0.595, + "step": 9311 + }, + { + "epoch": 0.6478138370030262, + "grad_norm": 1.5078125, + "learning_rate": 0.0005829831389183431, + "loss": 1.113, + "step": 9312 + }, + { + "epoch": 0.6478834046401614, + "grad_norm": 1.125, + "learning_rate": 0.0005827783446271848, + "loss": 0.9451, + "step": 9313 + }, + { + "epoch": 0.6479529722772966, + "grad_norm": 1.21875, + "learning_rate": 0.0005825735715204388, + "loss": 1.0168, + "step": 9314 + }, + { + "epoch": 0.6480225399144318, + "grad_norm": 0.98046875, + "learning_rate": 0.0005823688196085028, + "loss": 0.735, + "step": 9315 + }, + { + "epoch": 0.648092107551567, + "grad_norm": 0.9375, + "learning_rate": 0.0005821640889017737, + "loss": 0.6886, + "step": 9316 + }, + { + "epoch": 0.6481616751887023, + "grad_norm": 1.4140625, + "learning_rate": 0.000581959379410646, + "loss": 0.7337, + "step": 9317 + }, + { + "epoch": 0.6482312428258374, + "grad_norm": 0.8984375, + "learning_rate": 0.0005817546911455134, + "loss": 0.7089, + "step": 9318 + }, + { + "epoch": 0.6483008104629726, + "grad_norm": 1.015625, + "learning_rate": 0.0005815500241167699, + "loss": 0.7163, + "step": 9319 + }, + { + "epoch": 0.6483703781001078, + "grad_norm": 1.328125, + "learning_rate": 0.0005813453783348069, + "loss": 1.0089, + "step": 9320 + }, + { + "epoch": 0.6484399457372431, + "grad_norm": 0.9609375, + "learning_rate": 0.0005811407538100151, + "loss": 0.6632, + "step": 9321 + }, + { + "epoch": 0.6485095133743782, + "grad_norm": 1.0625, + "learning_rate": 0.0005809361505527852, + "loss": 0.6799, + "step": 9322 + }, + { + "epoch": 0.6485790810115134, + "grad_norm": 1.015625, + "learning_rate": 0.0005807315685735052, + "loss": 0.7795, + "step": 9323 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 1.03125, + "learning_rate": 0.0005805270078825626, + "loss": 0.808, + "step": 9324 + }, + { + "epoch": 0.6487182162857839, + "grad_norm": 1.046875, + "learning_rate": 0.0005803224684903442, + "loss": 0.793, + "step": 9325 + }, + { + "epoch": 0.648787783922919, + "grad_norm": 1.109375, + "learning_rate": 0.0005801179504072359, + "loss": 0.8611, + "step": 9326 + }, + { + "epoch": 0.6488573515600543, + "grad_norm": 1.3203125, + "learning_rate": 0.0005799134536436217, + "loss": 0.9363, + "step": 9327 + }, + { + "epoch": 0.6489269191971895, + "grad_norm": 1.046875, + "learning_rate": 0.0005797089782098846, + "loss": 0.7961, + "step": 9328 + }, + { + "epoch": 0.6489964868343246, + "grad_norm": 0.97265625, + "learning_rate": 0.0005795045241164072, + "loss": 0.648, + "step": 9329 + }, + { + "epoch": 0.6490660544714599, + "grad_norm": 1.3828125, + "learning_rate": 0.0005793000913735709, + "loss": 0.9337, + "step": 9330 + }, + { + "epoch": 0.6491356221085951, + "grad_norm": 1.1640625, + "learning_rate": 0.0005790956799917555, + "loss": 0.8339, + "step": 9331 + }, + { + "epoch": 0.6492051897457303, + "grad_norm": 1.21875, + "learning_rate": 0.0005788912899813395, + "loss": 0.809, + "step": 9332 + }, + { + "epoch": 0.6492747573828654, + "grad_norm": 1.1875, + "learning_rate": 0.0005786869213527013, + "loss": 0.8671, + "step": 9333 + }, + { + "epoch": 0.6493443250200007, + "grad_norm": 1.15625, + "learning_rate": 0.0005784825741162181, + "loss": 0.8602, + "step": 9334 + }, + { + "epoch": 0.6494138926571359, + "grad_norm": 1.390625, + "learning_rate": 0.0005782782482822653, + "loss": 0.6793, + "step": 9335 + }, + { + "epoch": 0.6494834602942711, + "grad_norm": 1.5546875, + "learning_rate": 0.0005780739438612169, + "loss": 0.775, + "step": 9336 + }, + { + "epoch": 0.6495530279314063, + "grad_norm": 1.453125, + "learning_rate": 0.0005778696608634473, + "loss": 0.7829, + "step": 9337 + }, + { + "epoch": 0.6496225955685415, + "grad_norm": 1.1484375, + "learning_rate": 0.0005776653992993282, + "loss": 0.6999, + "step": 9338 + }, + { + "epoch": 0.6496921632056767, + "grad_norm": 1.0078125, + "learning_rate": 0.0005774611591792314, + "loss": 0.7163, + "step": 9339 + }, + { + "epoch": 0.649761730842812, + "grad_norm": 1.0234375, + "learning_rate": 0.0005772569405135277, + "loss": 0.6897, + "step": 9340 + }, + { + "epoch": 0.6498312984799471, + "grad_norm": 1.2265625, + "learning_rate": 0.0005770527433125857, + "loss": 0.8467, + "step": 9341 + }, + { + "epoch": 0.6499008661170823, + "grad_norm": 1.1171875, + "learning_rate": 0.0005768485675867732, + "loss": 0.8483, + "step": 9342 + }, + { + "epoch": 0.6499704337542176, + "grad_norm": 1.0546875, + "learning_rate": 0.0005766444133464577, + "loss": 0.8846, + "step": 9343 + }, + { + "epoch": 0.6500400013913528, + "grad_norm": 1.171875, + "learning_rate": 0.0005764402806020053, + "loss": 0.7636, + "step": 9344 + }, + { + "epoch": 0.6501095690284879, + "grad_norm": 1.0546875, + "learning_rate": 0.0005762361693637805, + "loss": 0.8441, + "step": 9345 + }, + { + "epoch": 0.6501791366656231, + "grad_norm": 1.0, + "learning_rate": 0.0005760320796421468, + "loss": 0.7062, + "step": 9346 + }, + { + "epoch": 0.6502487043027584, + "grad_norm": 1.015625, + "learning_rate": 0.0005758280114474671, + "loss": 0.7896, + "step": 9347 + }, + { + "epoch": 0.6503182719398936, + "grad_norm": 1.0703125, + "learning_rate": 0.0005756239647901033, + "loss": 0.8114, + "step": 9348 + }, + { + "epoch": 0.6503878395770287, + "grad_norm": 1.0546875, + "learning_rate": 0.0005754199396804157, + "loss": 0.7884, + "step": 9349 + }, + { + "epoch": 0.650457407214164, + "grad_norm": 1.2734375, + "learning_rate": 0.0005752159361287631, + "loss": 1.0018, + "step": 9350 + }, + { + "epoch": 0.6505269748512992, + "grad_norm": 1.234375, + "learning_rate": 0.0005750119541455045, + "loss": 0.8045, + "step": 9351 + }, + { + "epoch": 0.6505965424884343, + "grad_norm": 1.015625, + "learning_rate": 0.0005748079937409965, + "loss": 0.8519, + "step": 9352 + }, + { + "epoch": 0.6506661101255696, + "grad_norm": 1.234375, + "learning_rate": 0.0005746040549255955, + "loss": 0.9646, + "step": 9353 + }, + { + "epoch": 0.6507356777627048, + "grad_norm": 1.234375, + "learning_rate": 0.0005744001377096566, + "loss": 0.8221, + "step": 9354 + }, + { + "epoch": 0.65080524539984, + "grad_norm": 1.1953125, + "learning_rate": 0.0005741962421035337, + "loss": 1.0693, + "step": 9355 + }, + { + "epoch": 0.6508748130369753, + "grad_norm": 1.4296875, + "learning_rate": 0.0005739923681175789, + "loss": 0.7727, + "step": 9356 + }, + { + "epoch": 0.6509443806741104, + "grad_norm": 1.0546875, + "learning_rate": 0.0005737885157621446, + "loss": 0.6614, + "step": 9357 + }, + { + "epoch": 0.6510139483112456, + "grad_norm": 1.25, + "learning_rate": 0.0005735846850475814, + "loss": 0.8921, + "step": 9358 + }, + { + "epoch": 0.6510835159483808, + "grad_norm": 1.5234375, + "learning_rate": 0.0005733808759842387, + "loss": 0.9109, + "step": 9359 + }, + { + "epoch": 0.651153083585516, + "grad_norm": 1.34375, + "learning_rate": 0.0005731770885824643, + "loss": 0.916, + "step": 9360 + }, + { + "epoch": 0.6512226512226512, + "grad_norm": 1.1328125, + "learning_rate": 0.0005729733228526061, + "loss": 1.0362, + "step": 9361 + }, + { + "epoch": 0.6512922188597864, + "grad_norm": 1.21875, + "learning_rate": 0.0005727695788050106, + "loss": 0.8676, + "step": 9362 + }, + { + "epoch": 0.6513617864969217, + "grad_norm": 1.09375, + "learning_rate": 0.0005725658564500225, + "loss": 0.7026, + "step": 9363 + }, + { + "epoch": 0.6514313541340568, + "grad_norm": 1.1484375, + "learning_rate": 0.0005723621557979854, + "loss": 0.8282, + "step": 9364 + }, + { + "epoch": 0.651500921771192, + "grad_norm": 1.2890625, + "learning_rate": 0.0005721584768592425, + "loss": 0.9341, + "step": 9365 + }, + { + "epoch": 0.6515704894083273, + "grad_norm": 1.5234375, + "learning_rate": 0.0005719548196441359, + "loss": 0.9879, + "step": 9366 + }, + { + "epoch": 0.6516400570454625, + "grad_norm": 1.1328125, + "learning_rate": 0.0005717511841630058, + "loss": 0.6812, + "step": 9367 + }, + { + "epoch": 0.6517096246825976, + "grad_norm": 1.0546875, + "learning_rate": 0.0005715475704261925, + "loss": 0.6552, + "step": 9368 + }, + { + "epoch": 0.6517791923197329, + "grad_norm": 1.0234375, + "learning_rate": 0.0005713439784440341, + "loss": 0.6276, + "step": 9369 + }, + { + "epoch": 0.6518487599568681, + "grad_norm": 1.3125, + "learning_rate": 0.0005711404082268673, + "loss": 0.856, + "step": 9370 + }, + { + "epoch": 0.6519183275940033, + "grad_norm": 0.94921875, + "learning_rate": 0.0005709368597850291, + "loss": 0.9023, + "step": 9371 + }, + { + "epoch": 0.6519878952311384, + "grad_norm": 1.0234375, + "learning_rate": 0.0005707333331288548, + "loss": 1.0215, + "step": 9372 + }, + { + "epoch": 0.6520574628682737, + "grad_norm": 1.3203125, + "learning_rate": 0.0005705298282686782, + "loss": 0.8265, + "step": 9373 + }, + { + "epoch": 0.6521270305054089, + "grad_norm": 1.0703125, + "learning_rate": 0.0005703263452148319, + "loss": 0.772, + "step": 9374 + }, + { + "epoch": 0.652196598142544, + "grad_norm": 1.1484375, + "learning_rate": 0.000570122883977648, + "loss": 0.7103, + "step": 9375 + }, + { + "epoch": 0.6522661657796793, + "grad_norm": 1.1640625, + "learning_rate": 0.0005699194445674577, + "loss": 0.9021, + "step": 9376 + }, + { + "epoch": 0.6523357334168145, + "grad_norm": 1.1328125, + "learning_rate": 0.0005697160269945902, + "loss": 0.971, + "step": 9377 + }, + { + "epoch": 0.6524053010539497, + "grad_norm": 1.03125, + "learning_rate": 0.0005695126312693738, + "loss": 0.8955, + "step": 9378 + }, + { + "epoch": 0.652474868691085, + "grad_norm": 0.9375, + "learning_rate": 0.0005693092574021361, + "loss": 0.8563, + "step": 9379 + }, + { + "epoch": 0.6525444363282201, + "grad_norm": 1.1640625, + "learning_rate": 0.0005691059054032039, + "loss": 0.721, + "step": 9380 + }, + { + "epoch": 0.6526140039653553, + "grad_norm": 1.34375, + "learning_rate": 0.0005689025752829014, + "loss": 1.0648, + "step": 9381 + }, + { + "epoch": 0.6526835716024906, + "grad_norm": 0.984375, + "learning_rate": 0.0005686992670515538, + "loss": 0.7004, + "step": 9382 + }, + { + "epoch": 0.6527531392396257, + "grad_norm": 0.96875, + "learning_rate": 0.0005684959807194835, + "loss": 0.8682, + "step": 9383 + }, + { + "epoch": 0.6528227068767609, + "grad_norm": 1.2734375, + "learning_rate": 0.0005682927162970119, + "loss": 1.0182, + "step": 9384 + }, + { + "epoch": 0.6528922745138961, + "grad_norm": 1.1953125, + "learning_rate": 0.0005680894737944602, + "loss": 0.9278, + "step": 9385 + }, + { + "epoch": 0.6529618421510314, + "grad_norm": 1.0546875, + "learning_rate": 0.0005678862532221485, + "loss": 0.6509, + "step": 9386 + }, + { + "epoch": 0.6530314097881665, + "grad_norm": 1.2421875, + "learning_rate": 0.0005676830545903948, + "loss": 0.7036, + "step": 9387 + }, + { + "epoch": 0.6531009774253017, + "grad_norm": 0.88671875, + "learning_rate": 0.0005674798779095161, + "loss": 0.6861, + "step": 9388 + }, + { + "epoch": 0.653170545062437, + "grad_norm": 1.046875, + "learning_rate": 0.0005672767231898292, + "loss": 0.7598, + "step": 9389 + }, + { + "epoch": 0.6532401126995722, + "grad_norm": 1.203125, + "learning_rate": 0.0005670735904416495, + "loss": 0.7604, + "step": 9390 + }, + { + "epoch": 0.6533096803367073, + "grad_norm": 1.0234375, + "learning_rate": 0.0005668704796752909, + "loss": 0.7848, + "step": 9391 + }, + { + "epoch": 0.6533792479738426, + "grad_norm": 1.71875, + "learning_rate": 0.0005666673909010658, + "loss": 0.9454, + "step": 9392 + }, + { + "epoch": 0.6534488156109778, + "grad_norm": 1.3203125, + "learning_rate": 0.0005664643241292864, + "loss": 0.9623, + "step": 9393 + }, + { + "epoch": 0.653518383248113, + "grad_norm": 1.2421875, + "learning_rate": 0.0005662612793702639, + "loss": 0.8303, + "step": 9394 + }, + { + "epoch": 0.6535879508852482, + "grad_norm": 1.140625, + "learning_rate": 0.0005660582566343068, + "loss": 0.7962, + "step": 9395 + }, + { + "epoch": 0.6536575185223834, + "grad_norm": 1.359375, + "learning_rate": 0.0005658552559317248, + "loss": 0.9712, + "step": 9396 + }, + { + "epoch": 0.6537270861595186, + "grad_norm": 1.1328125, + "learning_rate": 0.0005656522772728243, + "loss": 0.9124, + "step": 9397 + }, + { + "epoch": 0.6537966537966537, + "grad_norm": 1.078125, + "learning_rate": 0.0005654493206679121, + "loss": 0.968, + "step": 9398 + }, + { + "epoch": 0.653866221433789, + "grad_norm": 1.0234375, + "learning_rate": 0.0005652463861272928, + "loss": 0.6466, + "step": 9399 + }, + { + "epoch": 0.6539357890709242, + "grad_norm": 1.078125, + "learning_rate": 0.0005650434736612711, + "loss": 0.8994, + "step": 9400 + }, + { + "epoch": 0.6540053567080594, + "grad_norm": 0.91015625, + "learning_rate": 0.0005648405832801495, + "loss": 0.6486, + "step": 9401 + }, + { + "epoch": 0.6540749243451947, + "grad_norm": 1.421875, + "learning_rate": 0.0005646377149942292, + "loss": 1.1225, + "step": 9402 + }, + { + "epoch": 0.6541444919823298, + "grad_norm": 1.046875, + "learning_rate": 0.0005644348688138114, + "loss": 0.7517, + "step": 9403 + }, + { + "epoch": 0.654214059619465, + "grad_norm": 0.9765625, + "learning_rate": 0.000564232044749196, + "loss": 0.7078, + "step": 9404 + }, + { + "epoch": 0.6542836272566003, + "grad_norm": 1.6484375, + "learning_rate": 0.000564029242810681, + "loss": 0.8726, + "step": 9405 + }, + { + "epoch": 0.6543531948937354, + "grad_norm": 0.92578125, + "learning_rate": 0.000563826463008563, + "loss": 0.7171, + "step": 9406 + }, + { + "epoch": 0.6544227625308706, + "grad_norm": 0.87890625, + "learning_rate": 0.0005636237053531388, + "loss": 0.5968, + "step": 9407 + }, + { + "epoch": 0.6544923301680059, + "grad_norm": 0.96484375, + "learning_rate": 0.0005634209698547038, + "loss": 0.7417, + "step": 9408 + }, + { + "epoch": 0.6545618978051411, + "grad_norm": 1.234375, + "learning_rate": 0.0005632182565235514, + "loss": 0.8136, + "step": 9409 + }, + { + "epoch": 0.6546314654422762, + "grad_norm": 0.95703125, + "learning_rate": 0.000563015565369974, + "loss": 0.7378, + "step": 9410 + }, + { + "epoch": 0.6547010330794114, + "grad_norm": 1.59375, + "learning_rate": 0.0005628128964042636, + "loss": 0.919, + "step": 9411 + }, + { + "epoch": 0.6547706007165467, + "grad_norm": 1.2578125, + "learning_rate": 0.0005626102496367111, + "loss": 0.9547, + "step": 9412 + }, + { + "epoch": 0.6548401683536819, + "grad_norm": 0.9765625, + "learning_rate": 0.0005624076250776052, + "loss": 0.7509, + "step": 9413 + }, + { + "epoch": 0.654909735990817, + "grad_norm": 1.109375, + "learning_rate": 0.0005622050227372348, + "loss": 0.7757, + "step": 9414 + }, + { + "epoch": 0.6549793036279523, + "grad_norm": 1.046875, + "learning_rate": 0.0005620024426258867, + "loss": 0.6383, + "step": 9415 + }, + { + "epoch": 0.6550488712650875, + "grad_norm": 1.0625, + "learning_rate": 0.0005617998847538466, + "loss": 0.648, + "step": 9416 + }, + { + "epoch": 0.6551184389022227, + "grad_norm": 1.25, + "learning_rate": 0.0005615973491313996, + "loss": 0.723, + "step": 9417 + }, + { + "epoch": 0.6551880065393579, + "grad_norm": 1.1796875, + "learning_rate": 0.0005613948357688299, + "loss": 0.7752, + "step": 9418 + }, + { + "epoch": 0.6552575741764931, + "grad_norm": 0.96875, + "learning_rate": 0.0005611923446764196, + "loss": 0.8143, + "step": 9419 + }, + { + "epoch": 0.6553271418136283, + "grad_norm": 1.1953125, + "learning_rate": 0.00056098987586445, + "loss": 0.8424, + "step": 9420 + }, + { + "epoch": 0.6553967094507636, + "grad_norm": 1.3203125, + "learning_rate": 0.0005607874293432017, + "loss": 0.987, + "step": 9421 + }, + { + "epoch": 0.6554662770878987, + "grad_norm": 0.87890625, + "learning_rate": 0.0005605850051229544, + "loss": 0.5593, + "step": 9422 + }, + { + "epoch": 0.6555358447250339, + "grad_norm": 1.140625, + "learning_rate": 0.0005603826032139856, + "loss": 0.7599, + "step": 9423 + }, + { + "epoch": 0.6556054123621691, + "grad_norm": 1.21875, + "learning_rate": 0.0005601802236265721, + "loss": 0.8143, + "step": 9424 + }, + { + "epoch": 0.6556749799993044, + "grad_norm": 1.171875, + "learning_rate": 0.0005599778663709898, + "loss": 0.5966, + "step": 9425 + }, + { + "epoch": 0.6557445476364395, + "grad_norm": 1.2265625, + "learning_rate": 0.0005597755314575142, + "loss": 0.8998, + "step": 9426 + }, + { + "epoch": 0.6558141152735747, + "grad_norm": 1.21875, + "learning_rate": 0.0005595732188964177, + "loss": 1.1193, + "step": 9427 + }, + { + "epoch": 0.65588368291071, + "grad_norm": 1.5625, + "learning_rate": 0.0005593709286979736, + "loss": 0.8141, + "step": 9428 + }, + { + "epoch": 0.6559532505478451, + "grad_norm": 1.1015625, + "learning_rate": 0.0005591686608724524, + "loss": 0.7236, + "step": 9429 + }, + { + "epoch": 0.6560228181849803, + "grad_norm": 0.98046875, + "learning_rate": 0.000558966415430125, + "loss": 0.8101, + "step": 9430 + }, + { + "epoch": 0.6560923858221156, + "grad_norm": 1.046875, + "learning_rate": 0.0005587641923812599, + "loss": 0.9927, + "step": 9431 + }, + { + "epoch": 0.6561619534592508, + "grad_norm": 1.03125, + "learning_rate": 0.0005585619917361254, + "loss": 0.7156, + "step": 9432 + }, + { + "epoch": 0.6562315210963859, + "grad_norm": 1.328125, + "learning_rate": 0.0005583598135049879, + "loss": 0.9143, + "step": 9433 + }, + { + "epoch": 0.6563010887335212, + "grad_norm": 1.171875, + "learning_rate": 0.0005581576576981125, + "loss": 0.8323, + "step": 9434 + }, + { + "epoch": 0.6563706563706564, + "grad_norm": 1.2421875, + "learning_rate": 0.0005579555243257644, + "loss": 0.8342, + "step": 9435 + }, + { + "epoch": 0.6564402240077916, + "grad_norm": 1.125, + "learning_rate": 0.0005577534133982071, + "loss": 0.8752, + "step": 9436 + }, + { + "epoch": 0.6565097916449267, + "grad_norm": 1.2109375, + "learning_rate": 0.0005575513249257022, + "loss": 1.2009, + "step": 9437 + }, + { + "epoch": 0.656579359282062, + "grad_norm": 1.1015625, + "learning_rate": 0.0005573492589185107, + "loss": 0.8155, + "step": 9438 + }, + { + "epoch": 0.6566489269191972, + "grad_norm": 1.1875, + "learning_rate": 0.0005571472153868926, + "loss": 1.0064, + "step": 9439 + }, + { + "epoch": 0.6567184945563324, + "grad_norm": 1.0, + "learning_rate": 0.0005569451943411072, + "loss": 0.7553, + "step": 9440 + }, + { + "epoch": 0.6567880621934676, + "grad_norm": 0.8671875, + "learning_rate": 0.0005567431957914114, + "loss": 0.7241, + "step": 9441 + }, + { + "epoch": 0.6568576298306028, + "grad_norm": 1.015625, + "learning_rate": 0.0005565412197480621, + "loss": 0.8892, + "step": 9442 + }, + { + "epoch": 0.656927197467738, + "grad_norm": 1.53125, + "learning_rate": 0.0005563392662213143, + "loss": 0.9475, + "step": 9443 + }, + { + "epoch": 0.6569967651048733, + "grad_norm": 1.09375, + "learning_rate": 0.0005561373352214225, + "loss": 0.8963, + "step": 9444 + }, + { + "epoch": 0.6570663327420084, + "grad_norm": 1.1875, + "learning_rate": 0.0005559354267586394, + "loss": 0.9154, + "step": 9445 + }, + { + "epoch": 0.6571359003791436, + "grad_norm": 1.046875, + "learning_rate": 0.0005557335408432174, + "loss": 0.7852, + "step": 9446 + }, + { + "epoch": 0.6572054680162789, + "grad_norm": 1.140625, + "learning_rate": 0.0005555316774854068, + "loss": 0.8167, + "step": 9447 + }, + { + "epoch": 0.657275035653414, + "grad_norm": 1.0546875, + "learning_rate": 0.0005553298366954566, + "loss": 0.7987, + "step": 9448 + }, + { + "epoch": 0.6573446032905492, + "grad_norm": 0.984375, + "learning_rate": 0.000555128018483617, + "loss": 0.8527, + "step": 9449 + }, + { + "epoch": 0.6574141709276844, + "grad_norm": 1.1953125, + "learning_rate": 0.000554926222860134, + "loss": 0.7769, + "step": 9450 + }, + { + "epoch": 0.6574837385648197, + "grad_norm": 1.0234375, + "learning_rate": 0.0005547244498352542, + "loss": 0.7784, + "step": 9451 + }, + { + "epoch": 0.6575533062019548, + "grad_norm": 1.265625, + "learning_rate": 0.0005545226994192221, + "loss": 0.8866, + "step": 9452 + }, + { + "epoch": 0.65762287383909, + "grad_norm": 1.09375, + "learning_rate": 0.0005543209716222819, + "loss": 0.7285, + "step": 9453 + }, + { + "epoch": 0.6576924414762253, + "grad_norm": 1.203125, + "learning_rate": 0.0005541192664546768, + "loss": 0.8682, + "step": 9454 + }, + { + "epoch": 0.6577620091133605, + "grad_norm": 0.9609375, + "learning_rate": 0.0005539175839266475, + "loss": 0.9855, + "step": 9455 + }, + { + "epoch": 0.6578315767504956, + "grad_norm": 1.3203125, + "learning_rate": 0.0005537159240484353, + "loss": 1.0906, + "step": 9456 + }, + { + "epoch": 0.6579011443876309, + "grad_norm": 1.109375, + "learning_rate": 0.0005535142868302787, + "loss": 0.7366, + "step": 9457 + }, + { + "epoch": 0.6579707120247661, + "grad_norm": 0.921875, + "learning_rate": 0.0005533126722824164, + "loss": 0.7251, + "step": 9458 + }, + { + "epoch": 0.6580402796619013, + "grad_norm": 1.0859375, + "learning_rate": 0.000553111080415085, + "loss": 0.6313, + "step": 9459 + }, + { + "epoch": 0.6581098472990365, + "grad_norm": 1.1953125, + "learning_rate": 0.0005529095112385207, + "loss": 0.8287, + "step": 9460 + }, + { + "epoch": 0.6581794149361717, + "grad_norm": 1.1015625, + "learning_rate": 0.0005527079647629578, + "loss": 0.9603, + "step": 9461 + }, + { + "epoch": 0.6582489825733069, + "grad_norm": 1.078125, + "learning_rate": 0.0005525064409986292, + "loss": 0.7685, + "step": 9462 + }, + { + "epoch": 0.658318550210442, + "grad_norm": 0.99609375, + "learning_rate": 0.0005523049399557689, + "loss": 0.7197, + "step": 9463 + }, + { + "epoch": 0.6583881178475773, + "grad_norm": 1.203125, + "learning_rate": 0.0005521034616446071, + "loss": 0.7905, + "step": 9464 + }, + { + "epoch": 0.6584576854847125, + "grad_norm": 0.984375, + "learning_rate": 0.0005519020060753739, + "loss": 0.9, + "step": 9465 + }, + { + "epoch": 0.6585272531218477, + "grad_norm": 1.0234375, + "learning_rate": 0.0005517005732582981, + "loss": 0.7885, + "step": 9466 + }, + { + "epoch": 0.658596820758983, + "grad_norm": 1.3046875, + "learning_rate": 0.0005514991632036073, + "loss": 1.0088, + "step": 9467 + }, + { + "epoch": 0.6586663883961181, + "grad_norm": 1.2578125, + "learning_rate": 0.0005512977759215289, + "loss": 0.738, + "step": 9468 + }, + { + "epoch": 0.6587359560332533, + "grad_norm": 1.0625, + "learning_rate": 0.0005510964114222873, + "loss": 0.8165, + "step": 9469 + }, + { + "epoch": 0.6588055236703886, + "grad_norm": 0.94921875, + "learning_rate": 0.0005508950697161079, + "loss": 0.7834, + "step": 9470 + }, + { + "epoch": 0.6588750913075238, + "grad_norm": 1.0234375, + "learning_rate": 0.0005506937508132127, + "loss": 0.7316, + "step": 9471 + }, + { + "epoch": 0.6589446589446589, + "grad_norm": 1.1171875, + "learning_rate": 0.0005504924547238245, + "loss": 0.6777, + "step": 9472 + }, + { + "epoch": 0.6590142265817942, + "grad_norm": 1.2265625, + "learning_rate": 0.0005502911814581634, + "loss": 0.8826, + "step": 9473 + }, + { + "epoch": 0.6590837942189294, + "grad_norm": 0.8671875, + "learning_rate": 0.00055008993102645, + "loss": 0.6996, + "step": 9474 + }, + { + "epoch": 0.6591533618560645, + "grad_norm": 0.91796875, + "learning_rate": 0.0005498887034389015, + "loss": 0.7108, + "step": 9475 + }, + { + "epoch": 0.6592229294931997, + "grad_norm": 0.9296875, + "learning_rate": 0.0005496874987057361, + "loss": 0.6236, + "step": 9476 + }, + { + "epoch": 0.659292497130335, + "grad_norm": 1.0, + "learning_rate": 0.0005494863168371701, + "loss": 0.7545, + "step": 9477 + }, + { + "epoch": 0.6593620647674702, + "grad_norm": 0.87890625, + "learning_rate": 0.0005492851578434182, + "loss": 0.6097, + "step": 9478 + }, + { + "epoch": 0.6594316324046053, + "grad_norm": 1.140625, + "learning_rate": 0.0005490840217346942, + "loss": 0.665, + "step": 9479 + }, + { + "epoch": 0.6595012000417406, + "grad_norm": 1.1328125, + "learning_rate": 0.00054888290852121, + "loss": 0.8723, + "step": 9480 + }, + { + "epoch": 0.6595707676788758, + "grad_norm": 1.0859375, + "learning_rate": 0.0005486818182131785, + "loss": 0.8145, + "step": 9481 + }, + { + "epoch": 0.659640335316011, + "grad_norm": 1.0546875, + "learning_rate": 0.0005484807508208098, + "loss": 0.7581, + "step": 9482 + }, + { + "epoch": 0.6597099029531462, + "grad_norm": 1.2734375, + "learning_rate": 0.0005482797063543125, + "loss": 0.6727, + "step": 9483 + }, + { + "epoch": 0.6597794705902814, + "grad_norm": 0.89453125, + "learning_rate": 0.0005480786848238946, + "loss": 0.6584, + "step": 9484 + }, + { + "epoch": 0.6598490382274166, + "grad_norm": 0.8359375, + "learning_rate": 0.0005478776862397631, + "loss": 0.7583, + "step": 9485 + }, + { + "epoch": 0.6599186058645519, + "grad_norm": 0.94140625, + "learning_rate": 0.0005476767106121245, + "loss": 0.6691, + "step": 9486 + }, + { + "epoch": 0.659988173501687, + "grad_norm": 1.234375, + "learning_rate": 0.000547475757951182, + "loss": 0.8247, + "step": 9487 + }, + { + "epoch": 0.6600577411388222, + "grad_norm": 1.4609375, + "learning_rate": 0.0005472748282671401, + "loss": 0.9761, + "step": 9488 + }, + { + "epoch": 0.6601273087759574, + "grad_norm": 1.0234375, + "learning_rate": 0.0005470739215702001, + "loss": 0.8019, + "step": 9489 + }, + { + "epoch": 0.6601968764130927, + "grad_norm": 1.2265625, + "learning_rate": 0.000546873037870564, + "loss": 0.7796, + "step": 9490 + }, + { + "epoch": 0.6602664440502278, + "grad_norm": 1.4453125, + "learning_rate": 0.0005466721771784305, + "loss": 0.8948, + "step": 9491 + }, + { + "epoch": 0.660336011687363, + "grad_norm": 1.2578125, + "learning_rate": 0.0005464713395039993, + "loss": 1.0112, + "step": 9492 + }, + { + "epoch": 0.6604055793244983, + "grad_norm": 1.140625, + "learning_rate": 0.0005462705248574677, + "loss": 0.852, + "step": 9493 + }, + { + "epoch": 0.6604751469616335, + "grad_norm": 1.0859375, + "learning_rate": 0.000546069733249031, + "loss": 0.9642, + "step": 9494 + }, + { + "epoch": 0.6605447145987686, + "grad_norm": 1.0390625, + "learning_rate": 0.0005458689646888859, + "loss": 0.8078, + "step": 9495 + }, + { + "epoch": 0.6606142822359039, + "grad_norm": 1.3671875, + "learning_rate": 0.000545668219187226, + "loss": 0.9276, + "step": 9496 + }, + { + "epoch": 0.6606838498730391, + "grad_norm": 1.25, + "learning_rate": 0.0005454674967542439, + "loss": 0.8876, + "step": 9497 + }, + { + "epoch": 0.6607534175101742, + "grad_norm": 0.96484375, + "learning_rate": 0.0005452667974001308, + "loss": 0.7041, + "step": 9498 + }, + { + "epoch": 0.6608229851473094, + "grad_norm": 1.2578125, + "learning_rate": 0.0005450661211350779, + "loss": 0.6879, + "step": 9499 + }, + { + "epoch": 0.6608925527844447, + "grad_norm": 0.94921875, + "learning_rate": 0.0005448654679692745, + "loss": 0.8165, + "step": 9500 + }, + { + "epoch": 0.6609621204215799, + "grad_norm": 1.0078125, + "learning_rate": 0.0005446648379129083, + "loss": 0.7812, + "step": 9501 + }, + { + "epoch": 0.661031688058715, + "grad_norm": 1.171875, + "learning_rate": 0.0005444642309761669, + "loss": 0.8959, + "step": 9502 + }, + { + "epoch": 0.6611012556958503, + "grad_norm": 0.93359375, + "learning_rate": 0.0005442636471692355, + "loss": 0.9082, + "step": 9503 + }, + { + "epoch": 0.6611708233329855, + "grad_norm": 0.80859375, + "learning_rate": 0.0005440630865022993, + "loss": 0.5763, + "step": 9504 + }, + { + "epoch": 0.6612403909701207, + "grad_norm": 0.890625, + "learning_rate": 0.0005438625489855412, + "loss": 0.844, + "step": 9505 + }, + { + "epoch": 0.6613099586072559, + "grad_norm": 1.3125, + "learning_rate": 0.000543662034629144, + "loss": 1.0799, + "step": 9506 + }, + { + "epoch": 0.6613795262443911, + "grad_norm": 1.0078125, + "learning_rate": 0.0005434615434432884, + "loss": 0.7713, + "step": 9507 + }, + { + "epoch": 0.6614490938815263, + "grad_norm": 1.03125, + "learning_rate": 0.0005432610754381543, + "loss": 0.7808, + "step": 9508 + }, + { + "epoch": 0.6615186615186616, + "grad_norm": 1.3515625, + "learning_rate": 0.0005430606306239211, + "loss": 0.8285, + "step": 9509 + }, + { + "epoch": 0.6615882291557967, + "grad_norm": 1.3046875, + "learning_rate": 0.000542860209010766, + "loss": 0.6139, + "step": 9510 + }, + { + "epoch": 0.6616577967929319, + "grad_norm": 1.1171875, + "learning_rate": 0.0005426598106088651, + "loss": 0.7686, + "step": 9511 + }, + { + "epoch": 0.6617273644300671, + "grad_norm": 0.91796875, + "learning_rate": 0.0005424594354283937, + "loss": 0.8478, + "step": 9512 + }, + { + "epoch": 0.6617969320672024, + "grad_norm": 1.1171875, + "learning_rate": 0.0005422590834795259, + "loss": 0.7465, + "step": 9513 + }, + { + "epoch": 0.6618664997043375, + "grad_norm": 1.3046875, + "learning_rate": 0.0005420587547724352, + "loss": 1.0016, + "step": 9514 + }, + { + "epoch": 0.6619360673414727, + "grad_norm": 1.0625, + "learning_rate": 0.0005418584493172921, + "loss": 0.6856, + "step": 9515 + }, + { + "epoch": 0.662005634978608, + "grad_norm": 1.5390625, + "learning_rate": 0.0005416581671242682, + "loss": 1.1134, + "step": 9516 + }, + { + "epoch": 0.6620752026157432, + "grad_norm": 0.9453125, + "learning_rate": 0.000541457908203532, + "loss": 0.5706, + "step": 9517 + }, + { + "epoch": 0.6621447702528783, + "grad_norm": 0.97265625, + "learning_rate": 0.0005412576725652525, + "loss": 0.8443, + "step": 9518 + }, + { + "epoch": 0.6622143378900136, + "grad_norm": 1.484375, + "learning_rate": 0.0005410574602195957, + "loss": 0.9196, + "step": 9519 + }, + { + "epoch": 0.6622839055271488, + "grad_norm": 1.2109375, + "learning_rate": 0.0005408572711767282, + "loss": 0.9009, + "step": 9520 + }, + { + "epoch": 0.662353473164284, + "grad_norm": 1.2109375, + "learning_rate": 0.0005406571054468137, + "loss": 0.9215, + "step": 9521 + }, + { + "epoch": 0.6624230408014192, + "grad_norm": 1.203125, + "learning_rate": 0.0005404569630400163, + "loss": 0.9282, + "step": 9522 + }, + { + "epoch": 0.6624926084385544, + "grad_norm": 0.94140625, + "learning_rate": 0.0005402568439664983, + "loss": 0.7831, + "step": 9523 + }, + { + "epoch": 0.6625621760756896, + "grad_norm": 0.84765625, + "learning_rate": 0.0005400567482364207, + "loss": 0.5068, + "step": 9524 + }, + { + "epoch": 0.6626317437128247, + "grad_norm": 0.96484375, + "learning_rate": 0.0005398566758599429, + "loss": 0.6971, + "step": 9525 + }, + { + "epoch": 0.66270131134996, + "grad_norm": 1.1796875, + "learning_rate": 0.0005396566268472231, + "loss": 0.9397, + "step": 9526 + }, + { + "epoch": 0.6627708789870952, + "grad_norm": 0.8984375, + "learning_rate": 0.0005394566012084203, + "loss": 0.5619, + "step": 9527 + }, + { + "epoch": 0.6628404466242304, + "grad_norm": 1.046875, + "learning_rate": 0.00053925659895369, + "loss": 0.9043, + "step": 9528 + }, + { + "epoch": 0.6629100142613656, + "grad_norm": 1.234375, + "learning_rate": 0.0005390566200931869, + "loss": 0.9458, + "step": 9529 + }, + { + "epoch": 0.6629795818985008, + "grad_norm": 1.4453125, + "learning_rate": 0.0005388566646370656, + "loss": 0.7127, + "step": 9530 + }, + { + "epoch": 0.663049149535636, + "grad_norm": 0.96484375, + "learning_rate": 0.0005386567325954783, + "loss": 0.7, + "step": 9531 + }, + { + "epoch": 0.6631187171727713, + "grad_norm": 0.90234375, + "learning_rate": 0.0005384568239785771, + "loss": 0.7347, + "step": 9532 + }, + { + "epoch": 0.6631882848099064, + "grad_norm": 1.265625, + "learning_rate": 0.0005382569387965115, + "loss": 0.8503, + "step": 9533 + }, + { + "epoch": 0.6632578524470416, + "grad_norm": 1.2265625, + "learning_rate": 0.0005380570770594317, + "loss": 0.8793, + "step": 9534 + }, + { + "epoch": 0.6633274200841769, + "grad_norm": 1.1171875, + "learning_rate": 0.0005378572387774849, + "loss": 0.8123, + "step": 9535 + }, + { + "epoch": 0.6633969877213121, + "grad_norm": 1.0390625, + "learning_rate": 0.0005376574239608179, + "loss": 0.9453, + "step": 9536 + }, + { + "epoch": 0.6634665553584472, + "grad_norm": 1.484375, + "learning_rate": 0.000537457632619577, + "loss": 0.8145, + "step": 9537 + }, + { + "epoch": 0.6635361229955824, + "grad_norm": 1.15625, + "learning_rate": 0.0005372578647639063, + "loss": 0.9752, + "step": 9538 + }, + { + "epoch": 0.6636056906327177, + "grad_norm": 1.1640625, + "learning_rate": 0.0005370581204039482, + "loss": 0.7784, + "step": 9539 + }, + { + "epoch": 0.6636752582698529, + "grad_norm": 0.94140625, + "learning_rate": 0.0005368583995498455, + "loss": 0.7265, + "step": 9540 + }, + { + "epoch": 0.663744825906988, + "grad_norm": 0.90234375, + "learning_rate": 0.0005366587022117392, + "loss": 0.5821, + "step": 9541 + }, + { + "epoch": 0.6638143935441233, + "grad_norm": 1.3671875, + "learning_rate": 0.0005364590283997685, + "loss": 1.0459, + "step": 9542 + }, + { + "epoch": 0.6638839611812585, + "grad_norm": 1.1015625, + "learning_rate": 0.0005362593781240716, + "loss": 0.8329, + "step": 9543 + }, + { + "epoch": 0.6639535288183936, + "grad_norm": 0.9921875, + "learning_rate": 0.0005360597513947866, + "loss": 0.5252, + "step": 9544 + }, + { + "epoch": 0.6640230964555289, + "grad_norm": 1.1953125, + "learning_rate": 0.0005358601482220484, + "loss": 0.6583, + "step": 9545 + }, + { + "epoch": 0.6640926640926641, + "grad_norm": 1.1953125, + "learning_rate": 0.000535660568615993, + "loss": 0.8504, + "step": 9546 + }, + { + "epoch": 0.6641622317297993, + "grad_norm": 1.046875, + "learning_rate": 0.0005354610125867529, + "loss": 0.8956, + "step": 9547 + }, + { + "epoch": 0.6642317993669345, + "grad_norm": 1.15625, + "learning_rate": 0.0005352614801444617, + "loss": 1.0768, + "step": 9548 + }, + { + "epoch": 0.6643013670040697, + "grad_norm": 1.0390625, + "learning_rate": 0.0005350619712992495, + "loss": 0.8248, + "step": 9549 + }, + { + "epoch": 0.6643709346412049, + "grad_norm": 1.234375, + "learning_rate": 0.0005348624860612471, + "loss": 0.7256, + "step": 9550 + }, + { + "epoch": 0.6644405022783401, + "grad_norm": 0.8828125, + "learning_rate": 0.0005346630244405835, + "loss": 0.7343, + "step": 9551 + }, + { + "epoch": 0.6645100699154753, + "grad_norm": 0.875, + "learning_rate": 0.0005344635864473861, + "loss": 0.6724, + "step": 9552 + }, + { + "epoch": 0.6645796375526105, + "grad_norm": 0.7890625, + "learning_rate": 0.0005342641720917809, + "loss": 0.6969, + "step": 9553 + }, + { + "epoch": 0.6646492051897457, + "grad_norm": 1.40625, + "learning_rate": 0.0005340647813838935, + "loss": 1.1734, + "step": 9554 + }, + { + "epoch": 0.664718772826881, + "grad_norm": 1.0625, + "learning_rate": 0.0005338654143338484, + "loss": 0.9715, + "step": 9555 + }, + { + "epoch": 0.6647883404640161, + "grad_norm": 1.21875, + "learning_rate": 0.0005336660709517681, + "loss": 0.6627, + "step": 9556 + }, + { + "epoch": 0.6648579081011513, + "grad_norm": 1.1875, + "learning_rate": 0.0005334667512477742, + "loss": 0.8215, + "step": 9557 + }, + { + "epoch": 0.6649274757382866, + "grad_norm": 1.09375, + "learning_rate": 0.0005332674552319865, + "loss": 0.7588, + "step": 9558 + }, + { + "epoch": 0.6649970433754218, + "grad_norm": 1.109375, + "learning_rate": 0.0005330681829145257, + "loss": 0.8211, + "step": 9559 + }, + { + "epoch": 0.6650666110125569, + "grad_norm": 1.140625, + "learning_rate": 0.0005328689343055089, + "loss": 0.7802, + "step": 9560 + }, + { + "epoch": 0.6651361786496922, + "grad_norm": 0.953125, + "learning_rate": 0.0005326697094150528, + "loss": 0.699, + "step": 9561 + }, + { + "epoch": 0.6652057462868274, + "grad_norm": 0.9453125, + "learning_rate": 0.0005324705082532737, + "loss": 0.6794, + "step": 9562 + }, + { + "epoch": 0.6652753139239626, + "grad_norm": 1.09375, + "learning_rate": 0.0005322713308302852, + "loss": 0.6506, + "step": 9563 + }, + { + "epoch": 0.6653448815610977, + "grad_norm": 0.96875, + "learning_rate": 0.0005320721771562015, + "loss": 0.8187, + "step": 9564 + }, + { + "epoch": 0.665414449198233, + "grad_norm": 1.1796875, + "learning_rate": 0.0005318730472411337, + "loss": 0.9967, + "step": 9565 + }, + { + "epoch": 0.6654840168353682, + "grad_norm": 1.2265625, + "learning_rate": 0.0005316739410951934, + "loss": 0.7501, + "step": 9566 + }, + { + "epoch": 0.6655535844725033, + "grad_norm": 1.0234375, + "learning_rate": 0.0005314748587284895, + "loss": 0.7076, + "step": 9567 + }, + { + "epoch": 0.6656231521096386, + "grad_norm": 0.99609375, + "learning_rate": 0.0005312758001511307, + "loss": 1.0002, + "step": 9568 + }, + { + "epoch": 0.6656927197467738, + "grad_norm": 1.109375, + "learning_rate": 0.0005310767653732246, + "loss": 0.8335, + "step": 9569 + }, + { + "epoch": 0.665762287383909, + "grad_norm": 1.0859375, + "learning_rate": 0.0005308777544048767, + "loss": 0.8125, + "step": 9570 + }, + { + "epoch": 0.6658318550210442, + "grad_norm": 1.0234375, + "learning_rate": 0.0005306787672561917, + "loss": 0.7869, + "step": 9571 + }, + { + "epoch": 0.6659014226581794, + "grad_norm": 1.046875, + "learning_rate": 0.0005304798039372731, + "loss": 0.7647, + "step": 9572 + }, + { + "epoch": 0.6659709902953146, + "grad_norm": 1.1796875, + "learning_rate": 0.0005302808644582241, + "loss": 0.8244, + "step": 9573 + }, + { + "epoch": 0.6660405579324499, + "grad_norm": 0.9375, + "learning_rate": 0.0005300819488291452, + "loss": 0.6193, + "step": 9574 + }, + { + "epoch": 0.666110125569585, + "grad_norm": 1.1015625, + "learning_rate": 0.000529883057060136, + "loss": 0.8622, + "step": 9575 + }, + { + "epoch": 0.6661796932067202, + "grad_norm": 0.8203125, + "learning_rate": 0.0005296841891612959, + "loss": 0.8218, + "step": 9576 + }, + { + "epoch": 0.6662492608438554, + "grad_norm": 1.2265625, + "learning_rate": 0.0005294853451427217, + "loss": 0.8445, + "step": 9577 + }, + { + "epoch": 0.6663188284809907, + "grad_norm": 1.0078125, + "learning_rate": 0.0005292865250145107, + "loss": 0.6962, + "step": 9578 + }, + { + "epoch": 0.6663883961181258, + "grad_norm": 1.15625, + "learning_rate": 0.0005290877287867568, + "loss": 0.9622, + "step": 9579 + }, + { + "epoch": 0.666457963755261, + "grad_norm": 1.1484375, + "learning_rate": 0.000528888956469555, + "loss": 0.8845, + "step": 9580 + }, + { + "epoch": 0.6665275313923963, + "grad_norm": 1.1953125, + "learning_rate": 0.0005286902080729967, + "loss": 0.9223, + "step": 9581 + }, + { + "epoch": 0.6665970990295315, + "grad_norm": 1.2109375, + "learning_rate": 0.0005284914836071743, + "loss": 0.8549, + "step": 9582 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.328125, + "learning_rate": 0.0005282927830821782, + "loss": 0.9951, + "step": 9583 + }, + { + "epoch": 0.6667362343038019, + "grad_norm": 0.8359375, + "learning_rate": 0.000528094106508097, + "loss": 0.6767, + "step": 9584 + }, + { + "epoch": 0.6668058019409371, + "grad_norm": 1.1171875, + "learning_rate": 0.000527895453895018, + "loss": 0.9949, + "step": 9585 + }, + { + "epoch": 0.6668753695780723, + "grad_norm": 0.96484375, + "learning_rate": 0.0005276968252530283, + "loss": 0.7045, + "step": 9586 + }, + { + "epoch": 0.6669449372152075, + "grad_norm": 1.1796875, + "learning_rate": 0.0005274982205922136, + "loss": 0.8576, + "step": 9587 + }, + { + "epoch": 0.6670145048523427, + "grad_norm": 1.1640625, + "learning_rate": 0.0005272996399226578, + "loss": 0.8649, + "step": 9588 + }, + { + "epoch": 0.6670840724894779, + "grad_norm": 1.3515625, + "learning_rate": 0.0005271010832544431, + "loss": 0.7696, + "step": 9589 + }, + { + "epoch": 0.667153640126613, + "grad_norm": 1.25, + "learning_rate": 0.0005269025505976521, + "loss": 0.8932, + "step": 9590 + }, + { + "epoch": 0.6672232077637483, + "grad_norm": 1.0546875, + "learning_rate": 0.0005267040419623652, + "loss": 0.9085, + "step": 9591 + }, + { + "epoch": 0.6672927754008835, + "grad_norm": 1.2890625, + "learning_rate": 0.0005265055573586614, + "loss": 0.7903, + "step": 9592 + }, + { + "epoch": 0.6673623430380187, + "grad_norm": 1.0546875, + "learning_rate": 0.0005263070967966186, + "loss": 0.7596, + "step": 9593 + }, + { + "epoch": 0.667431910675154, + "grad_norm": 1.15625, + "learning_rate": 0.0005261086602863141, + "loss": 0.8148, + "step": 9594 + }, + { + "epoch": 0.6675014783122891, + "grad_norm": 1.0234375, + "learning_rate": 0.0005259102478378228, + "loss": 0.6692, + "step": 9595 + }, + { + "epoch": 0.6675710459494243, + "grad_norm": 1.1953125, + "learning_rate": 0.0005257118594612195, + "loss": 0.916, + "step": 9596 + }, + { + "epoch": 0.6676406135865596, + "grad_norm": 0.87109375, + "learning_rate": 0.000525513495166578, + "loss": 0.5399, + "step": 9597 + }, + { + "epoch": 0.6677101812236947, + "grad_norm": 0.99609375, + "learning_rate": 0.0005253151549639694, + "loss": 0.7878, + "step": 9598 + }, + { + "epoch": 0.6677797488608299, + "grad_norm": 1.203125, + "learning_rate": 0.0005251168388634644, + "loss": 0.8096, + "step": 9599 + }, + { + "epoch": 0.6678493164979652, + "grad_norm": 0.9921875, + "learning_rate": 0.0005249185468751327, + "loss": 0.762, + "step": 9600 + }, + { + "epoch": 0.6679188841351004, + "grad_norm": 1.3828125, + "learning_rate": 0.000524720279009043, + "loss": 0.7958, + "step": 9601 + }, + { + "epoch": 0.6679884517722355, + "grad_norm": 1.2578125, + "learning_rate": 0.0005245220352752619, + "loss": 0.9365, + "step": 9602 + }, + { + "epoch": 0.6680580194093707, + "grad_norm": 1.046875, + "learning_rate": 0.0005243238156838548, + "loss": 0.8501, + "step": 9603 + }, + { + "epoch": 0.668127587046506, + "grad_norm": 1.4375, + "learning_rate": 0.000524125620244887, + "loss": 0.9144, + "step": 9604 + }, + { + "epoch": 0.6681971546836412, + "grad_norm": 1.1953125, + "learning_rate": 0.0005239274489684218, + "loss": 0.8119, + "step": 9605 + }, + { + "epoch": 0.6682667223207763, + "grad_norm": 1.3984375, + "learning_rate": 0.0005237293018645211, + "loss": 0.9269, + "step": 9606 + }, + { + "epoch": 0.6683362899579116, + "grad_norm": 1.34375, + "learning_rate": 0.0005235311789432457, + "loss": 0.9449, + "step": 9607 + }, + { + "epoch": 0.6684058575950468, + "grad_norm": 1.1171875, + "learning_rate": 0.0005233330802146556, + "loss": 0.9373, + "step": 9608 + }, + { + "epoch": 0.668475425232182, + "grad_norm": 0.92578125, + "learning_rate": 0.0005231350056888089, + "loss": 0.823, + "step": 9609 + }, + { + "epoch": 0.6685449928693172, + "grad_norm": 1.484375, + "learning_rate": 0.000522936955375763, + "loss": 0.9763, + "step": 9610 + }, + { + "epoch": 0.6686145605064524, + "grad_norm": 1.25, + "learning_rate": 0.0005227389292855743, + "loss": 0.9294, + "step": 9611 + }, + { + "epoch": 0.6686841281435876, + "grad_norm": 1.0, + "learning_rate": 0.0005225409274282973, + "loss": 0.8357, + "step": 9612 + }, + { + "epoch": 0.6687536957807229, + "grad_norm": 1.5625, + "learning_rate": 0.0005223429498139849, + "loss": 0.8657, + "step": 9613 + }, + { + "epoch": 0.668823263417858, + "grad_norm": 1.4921875, + "learning_rate": 0.0005221449964526899, + "loss": 1.0415, + "step": 9614 + }, + { + "epoch": 0.6688928310549932, + "grad_norm": 1.2109375, + "learning_rate": 0.000521947067354464, + "loss": 0.8141, + "step": 9615 + }, + { + "epoch": 0.6689623986921284, + "grad_norm": 1.1171875, + "learning_rate": 0.0005217491625293562, + "loss": 0.8964, + "step": 9616 + }, + { + "epoch": 0.6690319663292636, + "grad_norm": 1.21875, + "learning_rate": 0.0005215512819874152, + "loss": 0.8618, + "step": 9617 + }, + { + "epoch": 0.6691015339663988, + "grad_norm": 1.15625, + "learning_rate": 0.0005213534257386885, + "loss": 0.9095, + "step": 9618 + }, + { + "epoch": 0.669171101603534, + "grad_norm": 1.03125, + "learning_rate": 0.0005211555937932225, + "loss": 0.58, + "step": 9619 + }, + { + "epoch": 0.6692406692406693, + "grad_norm": 1.109375, + "learning_rate": 0.0005209577861610621, + "loss": 0.7973, + "step": 9620 + }, + { + "epoch": 0.6693102368778044, + "grad_norm": 0.859375, + "learning_rate": 0.0005207600028522503, + "loss": 0.6226, + "step": 9621 + }, + { + "epoch": 0.6693798045149396, + "grad_norm": 1.1953125, + "learning_rate": 0.00052056224387683, + "loss": 0.8321, + "step": 9622 + }, + { + "epoch": 0.6694493721520749, + "grad_norm": 1.0390625, + "learning_rate": 0.0005203645092448428, + "loss": 0.7101, + "step": 9623 + }, + { + "epoch": 0.6695189397892101, + "grad_norm": 0.9296875, + "learning_rate": 0.0005201667989663279, + "loss": 0.7034, + "step": 9624 + }, + { + "epoch": 0.6695885074263452, + "grad_norm": 1.515625, + "learning_rate": 0.0005199691130513248, + "loss": 0.9162, + "step": 9625 + }, + { + "epoch": 0.6696580750634805, + "grad_norm": 0.89453125, + "learning_rate": 0.0005197714515098705, + "loss": 0.7831, + "step": 9626 + }, + { + "epoch": 0.6697276427006157, + "grad_norm": 0.953125, + "learning_rate": 0.0005195738143520012, + "loss": 0.8744, + "step": 9627 + }, + { + "epoch": 0.6697972103377509, + "grad_norm": 0.984375, + "learning_rate": 0.0005193762015877519, + "loss": 0.7083, + "step": 9628 + }, + { + "epoch": 0.669866777974886, + "grad_norm": 1.0390625, + "learning_rate": 0.000519178613227157, + "loss": 0.6718, + "step": 9629 + }, + { + "epoch": 0.6699363456120213, + "grad_norm": 1.0390625, + "learning_rate": 0.0005189810492802485, + "loss": 0.915, + "step": 9630 + }, + { + "epoch": 0.6700059132491565, + "grad_norm": 1.1953125, + "learning_rate": 0.0005187835097570576, + "loss": 0.7503, + "step": 9631 + }, + { + "epoch": 0.6700754808862917, + "grad_norm": 1.1015625, + "learning_rate": 0.0005185859946676143, + "loss": 0.8699, + "step": 9632 + }, + { + "epoch": 0.6701450485234269, + "grad_norm": 1.234375, + "learning_rate": 0.0005183885040219484, + "loss": 0.9738, + "step": 9633 + }, + { + "epoch": 0.6702146161605621, + "grad_norm": 1.03125, + "learning_rate": 0.0005181910378300866, + "loss": 0.6297, + "step": 9634 + }, + { + "epoch": 0.6702841837976973, + "grad_norm": 1.140625, + "learning_rate": 0.000517993596102055, + "loss": 0.779, + "step": 9635 + }, + { + "epoch": 0.6703537514348326, + "grad_norm": 1.359375, + "learning_rate": 0.000517796178847879, + "loss": 0.8163, + "step": 9636 + }, + { + "epoch": 0.6704233190719677, + "grad_norm": 0.9140625, + "learning_rate": 0.0005175987860775832, + "loss": 0.7354, + "step": 9637 + }, + { + "epoch": 0.6704928867091029, + "grad_norm": 1.0, + "learning_rate": 0.0005174014178011894, + "loss": 0.8463, + "step": 9638 + }, + { + "epoch": 0.6705624543462382, + "grad_norm": 1.28125, + "learning_rate": 0.0005172040740287188, + "loss": 0.8223, + "step": 9639 + }, + { + "epoch": 0.6706320219833733, + "grad_norm": 0.96875, + "learning_rate": 0.0005170067547701922, + "loss": 0.7575, + "step": 9640 + }, + { + "epoch": 0.6707015896205085, + "grad_norm": 1.1953125, + "learning_rate": 0.0005168094600356277, + "loss": 0.5398, + "step": 9641 + }, + { + "epoch": 0.6707711572576437, + "grad_norm": 0.90234375, + "learning_rate": 0.0005166121898350434, + "loss": 0.681, + "step": 9642 + }, + { + "epoch": 0.670840724894779, + "grad_norm": 1.3125, + "learning_rate": 0.000516414944178456, + "loss": 0.9722, + "step": 9643 + }, + { + "epoch": 0.6709102925319141, + "grad_norm": 1.203125, + "learning_rate": 0.0005162177230758803, + "loss": 0.8177, + "step": 9644 + }, + { + "epoch": 0.6709798601690493, + "grad_norm": 1.0703125, + "learning_rate": 0.0005160205265373299, + "loss": 0.8753, + "step": 9645 + }, + { + "epoch": 0.6710494278061846, + "grad_norm": 1.3203125, + "learning_rate": 0.0005158233545728175, + "loss": 1.0807, + "step": 9646 + }, + { + "epoch": 0.6711189954433198, + "grad_norm": 1.3984375, + "learning_rate": 0.0005156262071923553, + "loss": 0.8875, + "step": 9647 + }, + { + "epoch": 0.6711885630804549, + "grad_norm": 1.375, + "learning_rate": 0.0005154290844059528, + "loss": 1.0051, + "step": 9648 + }, + { + "epoch": 0.6712581307175902, + "grad_norm": 1.4921875, + "learning_rate": 0.0005152319862236185, + "loss": 1.0798, + "step": 9649 + }, + { + "epoch": 0.6713276983547254, + "grad_norm": 1.171875, + "learning_rate": 0.0005150349126553607, + "loss": 0.8117, + "step": 9650 + }, + { + "epoch": 0.6713972659918606, + "grad_norm": 1.03125, + "learning_rate": 0.000514837863711186, + "loss": 0.7735, + "step": 9651 + }, + { + "epoch": 0.6714668336289958, + "grad_norm": 1.0625, + "learning_rate": 0.0005146408394010991, + "loss": 0.8665, + "step": 9652 + }, + { + "epoch": 0.671536401266131, + "grad_norm": 1.5625, + "learning_rate": 0.0005144438397351037, + "loss": 1.0766, + "step": 9653 + }, + { + "epoch": 0.6716059689032662, + "grad_norm": 1.03125, + "learning_rate": 0.0005142468647232025, + "loss": 0.843, + "step": 9654 + }, + { + "epoch": 0.6716755365404014, + "grad_norm": 1.0390625, + "learning_rate": 0.0005140499143753978, + "loss": 0.746, + "step": 9655 + }, + { + "epoch": 0.6717451041775366, + "grad_norm": 1.1796875, + "learning_rate": 0.0005138529887016885, + "loss": 0.844, + "step": 9656 + }, + { + "epoch": 0.6718146718146718, + "grad_norm": 1.1875, + "learning_rate": 0.0005136560877120746, + "loss": 0.7019, + "step": 9657 + }, + { + "epoch": 0.671884239451807, + "grad_norm": 1.171875, + "learning_rate": 0.0005134592114165531, + "loss": 0.7212, + "step": 9658 + }, + { + "epoch": 0.6719538070889423, + "grad_norm": 1.28125, + "learning_rate": 0.0005132623598251201, + "loss": 0.9283, + "step": 9659 + }, + { + "epoch": 0.6720233747260774, + "grad_norm": 1.1171875, + "learning_rate": 0.0005130655329477712, + "loss": 0.7341, + "step": 9660 + }, + { + "epoch": 0.6720929423632126, + "grad_norm": 1.0546875, + "learning_rate": 0.0005128687307945006, + "loss": 0.5998, + "step": 9661 + }, + { + "epoch": 0.6721625100003479, + "grad_norm": 1.3046875, + "learning_rate": 0.0005126719533753006, + "loss": 0.8028, + "step": 9662 + }, + { + "epoch": 0.672232077637483, + "grad_norm": 1.3515625, + "learning_rate": 0.0005124752007001619, + "loss": 0.9841, + "step": 9663 + }, + { + "epoch": 0.6723016452746182, + "grad_norm": 1.53125, + "learning_rate": 0.0005122784727790752, + "loss": 1.1907, + "step": 9664 + }, + { + "epoch": 0.6723712129117535, + "grad_norm": 1.1796875, + "learning_rate": 0.0005120817696220299, + "loss": 1.0263, + "step": 9665 + }, + { + "epoch": 0.6724407805488887, + "grad_norm": 0.96875, + "learning_rate": 0.0005118850912390131, + "loss": 0.827, + "step": 9666 + }, + { + "epoch": 0.6725103481860238, + "grad_norm": 1.0859375, + "learning_rate": 0.0005116884376400107, + "loss": 0.6834, + "step": 9667 + }, + { + "epoch": 0.672579915823159, + "grad_norm": 1.015625, + "learning_rate": 0.0005114918088350079, + "loss": 0.9123, + "step": 9668 + }, + { + "epoch": 0.6726494834602943, + "grad_norm": 1.453125, + "learning_rate": 0.0005112952048339894, + "loss": 0.8906, + "step": 9669 + }, + { + "epoch": 0.6727190510974295, + "grad_norm": 1.234375, + "learning_rate": 0.0005110986256469366, + "loss": 0.7647, + "step": 9670 + }, + { + "epoch": 0.6727886187345646, + "grad_norm": 0.86328125, + "learning_rate": 0.0005109020712838318, + "loss": 0.5316, + "step": 9671 + }, + { + "epoch": 0.6728581863716999, + "grad_norm": 0.92578125, + "learning_rate": 0.0005107055417546547, + "loss": 0.6323, + "step": 9672 + }, + { + "epoch": 0.6729277540088351, + "grad_norm": 1.2109375, + "learning_rate": 0.0005105090370693835, + "loss": 0.7915, + "step": 9673 + }, + { + "epoch": 0.6729973216459703, + "grad_norm": 1.40625, + "learning_rate": 0.000510312557237996, + "loss": 1.0018, + "step": 9674 + }, + { + "epoch": 0.6730668892831055, + "grad_norm": 0.91796875, + "learning_rate": 0.0005101161022704692, + "loss": 0.6877, + "step": 9675 + }, + { + "epoch": 0.6731364569202407, + "grad_norm": 0.91796875, + "learning_rate": 0.0005099196721767776, + "loss": 0.7115, + "step": 9676 + }, + { + "epoch": 0.6732060245573759, + "grad_norm": 1.140625, + "learning_rate": 0.0005097232669668943, + "loss": 0.8713, + "step": 9677 + }, + { + "epoch": 0.6732755921945112, + "grad_norm": 0.875, + "learning_rate": 0.0005095268866507924, + "loss": 0.4574, + "step": 9678 + }, + { + "epoch": 0.6733451598316463, + "grad_norm": 1.015625, + "learning_rate": 0.0005093305312384434, + "loss": 0.7687, + "step": 9679 + }, + { + "epoch": 0.6734147274687815, + "grad_norm": 1.0078125, + "learning_rate": 0.000509134200739817, + "loss": 0.7457, + "step": 9680 + }, + { + "epoch": 0.6734842951059167, + "grad_norm": 1.0, + "learning_rate": 0.0005089378951648811, + "loss": 0.5863, + "step": 9681 + }, + { + "epoch": 0.673553862743052, + "grad_norm": 1.0, + "learning_rate": 0.0005087416145236039, + "loss": 0.7629, + "step": 9682 + }, + { + "epoch": 0.6736234303801871, + "grad_norm": 1.0390625, + "learning_rate": 0.0005085453588259519, + "loss": 0.6333, + "step": 9683 + }, + { + "epoch": 0.6736929980173223, + "grad_norm": 1.0234375, + "learning_rate": 0.0005083491280818888, + "loss": 0.9301, + "step": 9684 + }, + { + "epoch": 0.6737625656544576, + "grad_norm": 1.046875, + "learning_rate": 0.0005081529223013795, + "loss": 0.7265, + "step": 9685 + }, + { + "epoch": 0.6738321332915927, + "grad_norm": 0.8671875, + "learning_rate": 0.0005079567414943856, + "loss": 0.5622, + "step": 9686 + }, + { + "epoch": 0.6739017009287279, + "grad_norm": 1.0390625, + "learning_rate": 0.0005077605856708678, + "loss": 0.9549, + "step": 9687 + }, + { + "epoch": 0.6739712685658632, + "grad_norm": 1.0390625, + "learning_rate": 0.0005075644548407865, + "loss": 0.6197, + "step": 9688 + }, + { + "epoch": 0.6740408362029984, + "grad_norm": 1.28125, + "learning_rate": 0.0005073683490141005, + "loss": 0.8045, + "step": 9689 + }, + { + "epoch": 0.6741104038401335, + "grad_norm": 0.80859375, + "learning_rate": 0.0005071722682007667, + "loss": 0.7151, + "step": 9690 + }, + { + "epoch": 0.6741799714772688, + "grad_norm": 1.1328125, + "learning_rate": 0.0005069762124107408, + "loss": 0.726, + "step": 9691 + }, + { + "epoch": 0.674249539114404, + "grad_norm": 1.3046875, + "learning_rate": 0.0005067801816539776, + "loss": 1.2405, + "step": 9692 + }, + { + "epoch": 0.6743191067515392, + "grad_norm": 1.09375, + "learning_rate": 0.0005065841759404313, + "loss": 0.7895, + "step": 9693 + }, + { + "epoch": 0.6743886743886743, + "grad_norm": 1.078125, + "learning_rate": 0.0005063881952800535, + "loss": 1.0126, + "step": 9694 + }, + { + "epoch": 0.6744582420258096, + "grad_norm": 1.015625, + "learning_rate": 0.0005061922396827947, + "loss": 0.7312, + "step": 9695 + }, + { + "epoch": 0.6745278096629448, + "grad_norm": 1.015625, + "learning_rate": 0.0005059963091586051, + "loss": 0.8364, + "step": 9696 + }, + { + "epoch": 0.67459737730008, + "grad_norm": 1.265625, + "learning_rate": 0.0005058004037174333, + "loss": 0.7215, + "step": 9697 + }, + { + "epoch": 0.6746669449372152, + "grad_norm": 1.1875, + "learning_rate": 0.0005056045233692257, + "loss": 0.9513, + "step": 9698 + }, + { + "epoch": 0.6747365125743504, + "grad_norm": 1.1328125, + "learning_rate": 0.0005054086681239288, + "loss": 1.0549, + "step": 9699 + }, + { + "epoch": 0.6748060802114856, + "grad_norm": 1.3671875, + "learning_rate": 0.0005052128379914864, + "loss": 0.5822, + "step": 9700 + }, + { + "epoch": 0.6748756478486209, + "grad_norm": 1.140625, + "learning_rate": 0.0005050170329818427, + "loss": 0.7373, + "step": 9701 + }, + { + "epoch": 0.674945215485756, + "grad_norm": 1.1875, + "learning_rate": 0.0005048212531049386, + "loss": 0.8542, + "step": 9702 + }, + { + "epoch": 0.6750147831228912, + "grad_norm": 1.015625, + "learning_rate": 0.0005046254983707159, + "loss": 0.8557, + "step": 9703 + }, + { + "epoch": 0.6750843507600265, + "grad_norm": 1.2109375, + "learning_rate": 0.0005044297687891135, + "loss": 0.9148, + "step": 9704 + }, + { + "epoch": 0.6751539183971617, + "grad_norm": 0.90234375, + "learning_rate": 0.0005042340643700687, + "loss": 0.7164, + "step": 9705 + }, + { + "epoch": 0.6752234860342968, + "grad_norm": 1.125, + "learning_rate": 0.0005040383851235202, + "loss": 0.764, + "step": 9706 + }, + { + "epoch": 0.675293053671432, + "grad_norm": 0.94921875, + "learning_rate": 0.0005038427310594026, + "loss": 0.6977, + "step": 9707 + }, + { + "epoch": 0.6753626213085673, + "grad_norm": 1.0390625, + "learning_rate": 0.0005036471021876503, + "loss": 1.0362, + "step": 9708 + }, + { + "epoch": 0.6754321889457024, + "grad_norm": 0.890625, + "learning_rate": 0.0005034514985181959, + "loss": 0.6269, + "step": 9709 + }, + { + "epoch": 0.6755017565828376, + "grad_norm": 1.1484375, + "learning_rate": 0.0005032559200609716, + "loss": 0.8695, + "step": 9710 + }, + { + "epoch": 0.6755713242199729, + "grad_norm": 1.015625, + "learning_rate": 0.0005030603668259084, + "loss": 0.8633, + "step": 9711 + }, + { + "epoch": 0.6756408918571081, + "grad_norm": 0.90625, + "learning_rate": 0.0005028648388229346, + "loss": 0.5972, + "step": 9712 + }, + { + "epoch": 0.6757104594942432, + "grad_norm": 0.97265625, + "learning_rate": 0.000502669336061979, + "loss": 0.7126, + "step": 9713 + }, + { + "epoch": 0.6757800271313785, + "grad_norm": 1.1171875, + "learning_rate": 0.0005024738585529672, + "loss": 0.9349, + "step": 9714 + }, + { + "epoch": 0.6758495947685137, + "grad_norm": 1.203125, + "learning_rate": 0.0005022784063058257, + "loss": 0.8628, + "step": 9715 + }, + { + "epoch": 0.6759191624056489, + "grad_norm": 0.80859375, + "learning_rate": 0.0005020829793304775, + "loss": 0.6641, + "step": 9716 + }, + { + "epoch": 0.6759887300427841, + "grad_norm": 0.8046875, + "learning_rate": 0.0005018875776368464, + "loss": 0.5537, + "step": 9717 + }, + { + "epoch": 0.6760582976799193, + "grad_norm": 0.98828125, + "learning_rate": 0.0005016922012348535, + "loss": 0.6479, + "step": 9718 + }, + { + "epoch": 0.6761278653170545, + "grad_norm": 1.0859375, + "learning_rate": 0.0005014968501344184, + "loss": 0.949, + "step": 9719 + }, + { + "epoch": 0.6761974329541897, + "grad_norm": 1.2734375, + "learning_rate": 0.0005013015243454607, + "loss": 0.9195, + "step": 9720 + }, + { + "epoch": 0.6762670005913249, + "grad_norm": 1.4921875, + "learning_rate": 0.0005011062238778983, + "loss": 0.8893, + "step": 9721 + }, + { + "epoch": 0.6763365682284601, + "grad_norm": 0.91796875, + "learning_rate": 0.0005009109487416473, + "loss": 0.6517, + "step": 9722 + }, + { + "epoch": 0.6764061358655953, + "grad_norm": 1.125, + "learning_rate": 0.0005007156989466224, + "loss": 0.8944, + "step": 9723 + }, + { + "epoch": 0.6764757035027306, + "grad_norm": 0.953125, + "learning_rate": 0.0005005204745027376, + "loss": 0.7622, + "step": 9724 + }, + { + "epoch": 0.6765452711398657, + "grad_norm": 1.0859375, + "learning_rate": 0.000500325275419906, + "loss": 1.125, + "step": 9725 + }, + { + "epoch": 0.6766148387770009, + "grad_norm": 1.1171875, + "learning_rate": 0.0005001301017080384, + "loss": 0.7286, + "step": 9726 + }, + { + "epoch": 0.6766844064141362, + "grad_norm": 1.578125, + "learning_rate": 0.0004999349533770444, + "loss": 0.8135, + "step": 9727 + }, + { + "epoch": 0.6767539740512714, + "grad_norm": 1.234375, + "learning_rate": 0.0004997398304368327, + "loss": 0.7637, + "step": 9728 + }, + { + "epoch": 0.6768235416884065, + "grad_norm": 1.015625, + "learning_rate": 0.0004995447328973114, + "loss": 0.6227, + "step": 9729 + }, + { + "epoch": 0.6768931093255418, + "grad_norm": 1.34375, + "learning_rate": 0.0004993496607683857, + "loss": 0.9152, + "step": 9730 + }, + { + "epoch": 0.676962676962677, + "grad_norm": 1.0078125, + "learning_rate": 0.0004991546140599612, + "loss": 0.767, + "step": 9731 + }, + { + "epoch": 0.6770322445998121, + "grad_norm": 1.4765625, + "learning_rate": 0.0004989595927819406, + "loss": 0.9917, + "step": 9732 + }, + { + "epoch": 0.6771018122369473, + "grad_norm": 1.1015625, + "learning_rate": 0.0004987645969442268, + "loss": 0.9896, + "step": 9733 + }, + { + "epoch": 0.6771713798740826, + "grad_norm": 0.96875, + "learning_rate": 0.0004985696265567198, + "loss": 0.6176, + "step": 9734 + }, + { + "epoch": 0.6772409475112178, + "grad_norm": 1.3359375, + "learning_rate": 0.0004983746816293204, + "loss": 0.8158, + "step": 9735 + }, + { + "epoch": 0.6773105151483529, + "grad_norm": 1.046875, + "learning_rate": 0.0004981797621719262, + "loss": 0.7196, + "step": 9736 + }, + { + "epoch": 0.6773800827854882, + "grad_norm": 1.171875, + "learning_rate": 0.0004979848681944338, + "loss": 0.9503, + "step": 9737 + }, + { + "epoch": 0.6774496504226234, + "grad_norm": 1.6875, + "learning_rate": 0.0004977899997067396, + "loss": 0.9683, + "step": 9738 + }, + { + "epoch": 0.6775192180597586, + "grad_norm": 1.6328125, + "learning_rate": 0.0004975951567187382, + "loss": 0.7924, + "step": 9739 + }, + { + "epoch": 0.6775887856968938, + "grad_norm": 1.1953125, + "learning_rate": 0.0004974003392403224, + "loss": 0.8315, + "step": 9740 + }, + { + "epoch": 0.677658353334029, + "grad_norm": 1.453125, + "learning_rate": 0.0004972055472813839, + "loss": 1.0786, + "step": 9741 + }, + { + "epoch": 0.6777279209711642, + "grad_norm": 1.0546875, + "learning_rate": 0.0004970107808518133, + "loss": 0.8551, + "step": 9742 + }, + { + "epoch": 0.6777974886082995, + "grad_norm": 0.9140625, + "learning_rate": 0.0004968160399615003, + "loss": 0.6797, + "step": 9743 + }, + { + "epoch": 0.6778670562454346, + "grad_norm": 0.8984375, + "learning_rate": 0.0004966213246203323, + "loss": 0.7715, + "step": 9744 + }, + { + "epoch": 0.6779366238825698, + "grad_norm": 0.921875, + "learning_rate": 0.0004964266348381965, + "loss": 0.5859, + "step": 9745 + }, + { + "epoch": 0.678006191519705, + "grad_norm": 1.171875, + "learning_rate": 0.0004962319706249777, + "loss": 0.7948, + "step": 9746 + }, + { + "epoch": 0.6780757591568403, + "grad_norm": 1.0546875, + "learning_rate": 0.0004960373319905605, + "loss": 0.6442, + "step": 9747 + }, + { + "epoch": 0.6781453267939754, + "grad_norm": 1.046875, + "learning_rate": 0.0004958427189448272, + "loss": 0.5826, + "step": 9748 + }, + { + "epoch": 0.6782148944311106, + "grad_norm": 1.0546875, + "learning_rate": 0.0004956481314976599, + "loss": 0.8098, + "step": 9749 + }, + { + "epoch": 0.6782844620682459, + "grad_norm": 1.078125, + "learning_rate": 0.0004954535696589382, + "loss": 0.9073, + "step": 9750 + }, + { + "epoch": 0.6783540297053811, + "grad_norm": 1.046875, + "learning_rate": 0.0004952590334385404, + "loss": 0.8885, + "step": 9751 + }, + { + "epoch": 0.6784235973425162, + "grad_norm": 1.34375, + "learning_rate": 0.0004950645228463457, + "loss": 0.9079, + "step": 9752 + }, + { + "epoch": 0.6784931649796515, + "grad_norm": 1.546875, + "learning_rate": 0.0004948700378922293, + "loss": 0.8953, + "step": 9753 + }, + { + "epoch": 0.6785627326167867, + "grad_norm": 1.15625, + "learning_rate": 0.0004946755785860664, + "loss": 0.839, + "step": 9754 + }, + { + "epoch": 0.6786323002539218, + "grad_norm": 1.1328125, + "learning_rate": 0.0004944811449377301, + "loss": 0.8367, + "step": 9755 + }, + { + "epoch": 0.6787018678910571, + "grad_norm": 1.046875, + "learning_rate": 0.0004942867369570934, + "loss": 0.6971, + "step": 9756 + }, + { + "epoch": 0.6787714355281923, + "grad_norm": 1.1640625, + "learning_rate": 0.0004940923546540276, + "loss": 0.9824, + "step": 9757 + }, + { + "epoch": 0.6788410031653275, + "grad_norm": 1.28125, + "learning_rate": 0.0004938979980384017, + "loss": 0.8169, + "step": 9758 + }, + { + "epoch": 0.6789105708024626, + "grad_norm": 1.125, + "learning_rate": 0.0004937036671200847, + "loss": 0.8172, + "step": 9759 + }, + { + "epoch": 0.6789801384395979, + "grad_norm": 1.2109375, + "learning_rate": 0.0004935093619089434, + "loss": 0.8487, + "step": 9760 + }, + { + "epoch": 0.6790497060767331, + "grad_norm": 1.234375, + "learning_rate": 0.0004933150824148441, + "loss": 0.7046, + "step": 9761 + }, + { + "epoch": 0.6791192737138683, + "grad_norm": 0.84765625, + "learning_rate": 0.0004931208286476506, + "loss": 0.5163, + "step": 9762 + }, + { + "epoch": 0.6791888413510035, + "grad_norm": 1.0390625, + "learning_rate": 0.000492926600617227, + "loss": 0.7764, + "step": 9763 + }, + { + "epoch": 0.6792584089881387, + "grad_norm": 1.1640625, + "learning_rate": 0.0004927323983334344, + "loss": 0.8477, + "step": 9764 + }, + { + "epoch": 0.6793279766252739, + "grad_norm": 1.140625, + "learning_rate": 0.0004925382218061338, + "loss": 0.8992, + "step": 9765 + }, + { + "epoch": 0.6793975442624092, + "grad_norm": 1.0234375, + "learning_rate": 0.0004923440710451848, + "loss": 0.958, + "step": 9766 + }, + { + "epoch": 0.6794671118995443, + "grad_norm": 1.15625, + "learning_rate": 0.0004921499460604453, + "loss": 0.8821, + "step": 9767 + }, + { + "epoch": 0.6795366795366795, + "grad_norm": 1.6171875, + "learning_rate": 0.0004919558468617717, + "loss": 1.3672, + "step": 9768 + }, + { + "epoch": 0.6796062471738148, + "grad_norm": 1.171875, + "learning_rate": 0.000491761773459019, + "loss": 0.6571, + "step": 9769 + }, + { + "epoch": 0.67967581481095, + "grad_norm": 0.921875, + "learning_rate": 0.0004915677258620416, + "loss": 0.8382, + "step": 9770 + }, + { + "epoch": 0.6797453824480851, + "grad_norm": 1.7265625, + "learning_rate": 0.0004913737040806931, + "loss": 1.2732, + "step": 9771 + }, + { + "epoch": 0.6798149500852203, + "grad_norm": 1.1640625, + "learning_rate": 0.0004911797081248238, + "loss": 1.0633, + "step": 9772 + }, + { + "epoch": 0.6798845177223556, + "grad_norm": 1.1953125, + "learning_rate": 0.0004909857380042845, + "loss": 0.8228, + "step": 9773 + }, + { + "epoch": 0.6799540853594908, + "grad_norm": 1.296875, + "learning_rate": 0.0004907917937289235, + "loss": 0.8447, + "step": 9774 + }, + { + "epoch": 0.6800236529966259, + "grad_norm": 0.96875, + "learning_rate": 0.0004905978753085889, + "loss": 0.8315, + "step": 9775 + }, + { + "epoch": 0.6800932206337612, + "grad_norm": 1.1796875, + "learning_rate": 0.0004904039827531262, + "loss": 1.024, + "step": 9776 + }, + { + "epoch": 0.6801627882708964, + "grad_norm": 1.53125, + "learning_rate": 0.0004902101160723813, + "loss": 1.0698, + "step": 9777 + }, + { + "epoch": 0.6802323559080315, + "grad_norm": 0.9921875, + "learning_rate": 0.0004900162752761966, + "loss": 0.9307, + "step": 9778 + }, + { + "epoch": 0.6803019235451668, + "grad_norm": 1.25, + "learning_rate": 0.0004898224603744151, + "loss": 1.0074, + "step": 9779 + }, + { + "epoch": 0.680371491182302, + "grad_norm": 1.1328125, + "learning_rate": 0.0004896286713768778, + "loss": 0.7614, + "step": 9780 + }, + { + "epoch": 0.6804410588194372, + "grad_norm": 1.078125, + "learning_rate": 0.0004894349082934243, + "loss": 0.6902, + "step": 9781 + }, + { + "epoch": 0.6805106264565725, + "grad_norm": 1.34375, + "learning_rate": 0.0004892411711338925, + "loss": 0.786, + "step": 9782 + }, + { + "epoch": 0.6805801940937076, + "grad_norm": 1.140625, + "learning_rate": 0.000489047459908119, + "loss": 0.7574, + "step": 9783 + }, + { + "epoch": 0.6806497617308428, + "grad_norm": 0.90625, + "learning_rate": 0.0004888537746259408, + "loss": 0.9947, + "step": 9784 + }, + { + "epoch": 0.680719329367978, + "grad_norm": 1.03125, + "learning_rate": 0.0004886601152971915, + "loss": 0.5515, + "step": 9785 + }, + { + "epoch": 0.6807888970051132, + "grad_norm": 1.2734375, + "learning_rate": 0.000488466481931704, + "loss": 0.7297, + "step": 9786 + }, + { + "epoch": 0.6808584646422484, + "grad_norm": 0.875, + "learning_rate": 0.0004882728745393105, + "loss": 0.6101, + "step": 9787 + }, + { + "epoch": 0.6809280322793836, + "grad_norm": 1.0390625, + "learning_rate": 0.0004880792931298408, + "loss": 1.0144, + "step": 9788 + }, + { + "epoch": 0.6809975999165189, + "grad_norm": 1.0703125, + "learning_rate": 0.0004878857377131246, + "loss": 0.7814, + "step": 9789 + }, + { + "epoch": 0.681067167553654, + "grad_norm": 1.1640625, + "learning_rate": 0.0004876922082989891, + "loss": 0.7869, + "step": 9790 + }, + { + "epoch": 0.6811367351907892, + "grad_norm": 1.375, + "learning_rate": 0.00048749870489726133, + "loss": 0.883, + "step": 9791 + }, + { + "epoch": 0.6812063028279245, + "grad_norm": 1.828125, + "learning_rate": 0.00048730522751776586, + "loss": 0.9427, + "step": 9792 + }, + { + "epoch": 0.6812758704650597, + "grad_norm": 1.0390625, + "learning_rate": 0.0004871117761703271, + "loss": 0.6713, + "step": 9793 + }, + { + "epoch": 0.6813454381021948, + "grad_norm": 1.328125, + "learning_rate": 0.0004869183508647668, + "loss": 0.8737, + "step": 9794 + }, + { + "epoch": 0.6814150057393301, + "grad_norm": 1.1171875, + "learning_rate": 0.0004867249516109069, + "loss": 0.76, + "step": 9795 + }, + { + "epoch": 0.6814845733764653, + "grad_norm": 1.140625, + "learning_rate": 0.0004865315784185664, + "loss": 0.7439, + "step": 9796 + }, + { + "epoch": 0.6815541410136005, + "grad_norm": 1.1015625, + "learning_rate": 0.0004863382312975644, + "loss": 0.8266, + "step": 9797 + }, + { + "epoch": 0.6816237086507356, + "grad_norm": 1.375, + "learning_rate": 0.00048614491025771836, + "loss": 0.9495, + "step": 9798 + }, + { + "epoch": 0.6816932762878709, + "grad_norm": 1.0, + "learning_rate": 0.0004859516153088437, + "loss": 1.083, + "step": 9799 + }, + { + "epoch": 0.6817628439250061, + "grad_norm": 0.88671875, + "learning_rate": 0.00048575834646075503, + "loss": 0.7001, + "step": 9800 + }, + { + "epoch": 0.6818324115621412, + "grad_norm": 1.109375, + "learning_rate": 0.00048556510372326514, + "loss": 0.6385, + "step": 9801 + }, + { + "epoch": 0.6819019791992765, + "grad_norm": 1.265625, + "learning_rate": 0.0004853718871061863, + "loss": 1.0354, + "step": 9802 + }, + { + "epoch": 0.6819715468364117, + "grad_norm": 1.2421875, + "learning_rate": 0.00048517869661932956, + "loss": 0.7948, + "step": 9803 + }, + { + "epoch": 0.6820411144735469, + "grad_norm": 1.3671875, + "learning_rate": 0.0004849855322725034, + "loss": 0.7419, + "step": 9804 + }, + { + "epoch": 0.6821106821106822, + "grad_norm": 0.90234375, + "learning_rate": 0.00048479239407551636, + "loss": 0.7721, + "step": 9805 + }, + { + "epoch": 0.6821802497478173, + "grad_norm": 1.1640625, + "learning_rate": 0.0004845992820381743, + "loss": 0.9139, + "step": 9806 + }, + { + "epoch": 0.6822498173849525, + "grad_norm": 1.21875, + "learning_rate": 0.00048440619617028325, + "loss": 0.8945, + "step": 9807 + }, + { + "epoch": 0.6823193850220878, + "grad_norm": 1.0546875, + "learning_rate": 0.00048421313648164645, + "loss": 0.8678, + "step": 9808 + }, + { + "epoch": 0.682388952659223, + "grad_norm": 0.99609375, + "learning_rate": 0.0004840201029820672, + "loss": 0.6539, + "step": 9809 + }, + { + "epoch": 0.6824585202963581, + "grad_norm": 0.8671875, + "learning_rate": 0.0004838270956813461, + "loss": 0.9426, + "step": 9810 + }, + { + "epoch": 0.6825280879334933, + "grad_norm": 1.015625, + "learning_rate": 0.0004836341145892832, + "loss": 0.6187, + "step": 9811 + }, + { + "epoch": 0.6825976555706286, + "grad_norm": 1.3046875, + "learning_rate": 0.0004834411597156777, + "loss": 0.8741, + "step": 9812 + }, + { + "epoch": 0.6826672232077637, + "grad_norm": 1.0703125, + "learning_rate": 0.00048324823107032653, + "loss": 0.7333, + "step": 9813 + }, + { + "epoch": 0.6827367908448989, + "grad_norm": 0.98046875, + "learning_rate": 0.0004830553286630256, + "loss": 0.8243, + "step": 9814 + }, + { + "epoch": 0.6828063584820342, + "grad_norm": 1.0234375, + "learning_rate": 0.00048286245250356866, + "loss": 0.8961, + "step": 9815 + }, + { + "epoch": 0.6828759261191694, + "grad_norm": 1.328125, + "learning_rate": 0.00048266960260175053, + "loss": 0.8247, + "step": 9816 + }, + { + "epoch": 0.6829454937563045, + "grad_norm": 1.1484375, + "learning_rate": 0.00048247677896736253, + "loss": 0.9039, + "step": 9817 + }, + { + "epoch": 0.6830150613934398, + "grad_norm": 1.078125, + "learning_rate": 0.00048228398161019473, + "loss": 0.7044, + "step": 9818 + }, + { + "epoch": 0.683084629030575, + "grad_norm": 0.8828125, + "learning_rate": 0.00048209121054003726, + "loss": 0.7803, + "step": 9819 + }, + { + "epoch": 0.6831541966677102, + "grad_norm": 1.0390625, + "learning_rate": 0.00048189846576667726, + "loss": 0.6082, + "step": 9820 + }, + { + "epoch": 0.6832237643048454, + "grad_norm": 0.96875, + "learning_rate": 0.00048170574729990227, + "loss": 0.8898, + "step": 9821 + }, + { + "epoch": 0.6832933319419806, + "grad_norm": 0.98828125, + "learning_rate": 0.0004815130551494965, + "loss": 1.072, + "step": 9822 + }, + { + "epoch": 0.6833628995791158, + "grad_norm": 1.046875, + "learning_rate": 0.00048132038932524493, + "loss": 0.7886, + "step": 9823 + }, + { + "epoch": 0.683432467216251, + "grad_norm": 0.84375, + "learning_rate": 0.00048112774983692907, + "loss": 0.6386, + "step": 9824 + }, + { + "epoch": 0.6835020348533862, + "grad_norm": 1.0390625, + "learning_rate": 0.000480935136694331, + "loss": 0.7901, + "step": 9825 + }, + { + "epoch": 0.6835716024905214, + "grad_norm": 1.0234375, + "learning_rate": 0.00048074254990723063, + "loss": 0.821, + "step": 9826 + }, + { + "epoch": 0.6836411701276566, + "grad_norm": 1.078125, + "learning_rate": 0.0004805499894854063, + "loss": 0.8903, + "step": 9827 + }, + { + "epoch": 0.6837107377647919, + "grad_norm": 1.4375, + "learning_rate": 0.0004803574554386351, + "loss": 0.8313, + "step": 9828 + }, + { + "epoch": 0.683780305401927, + "grad_norm": 1.09375, + "learning_rate": 0.00048016494777669295, + "loss": 0.9615, + "step": 9829 + }, + { + "epoch": 0.6838498730390622, + "grad_norm": 1.296875, + "learning_rate": 0.000479972466509355, + "loss": 0.7357, + "step": 9830 + }, + { + "epoch": 0.6839194406761975, + "grad_norm": 0.921875, + "learning_rate": 0.00047978001164639404, + "loss": 0.6229, + "step": 9831 + }, + { + "epoch": 0.6839890083133326, + "grad_norm": 0.99609375, + "learning_rate": 0.00047958758319758166, + "loss": 0.688, + "step": 9832 + }, + { + "epoch": 0.6840585759504678, + "grad_norm": 1.359375, + "learning_rate": 0.0004793951811726891, + "loss": 0.8784, + "step": 9833 + }, + { + "epoch": 0.6841281435876031, + "grad_norm": 1.03125, + "learning_rate": 0.0004792028055814848, + "loss": 0.7363, + "step": 9834 + }, + { + "epoch": 0.6841977112247383, + "grad_norm": 1.125, + "learning_rate": 0.0004790104564337374, + "loss": 0.8188, + "step": 9835 + }, + { + "epoch": 0.6842672788618734, + "grad_norm": 1.0234375, + "learning_rate": 0.0004788181337392127, + "loss": 0.6419, + "step": 9836 + }, + { + "epoch": 0.6843368464990086, + "grad_norm": 1.484375, + "learning_rate": 0.00047862583750767654, + "loss": 0.7032, + "step": 9837 + }, + { + "epoch": 0.6844064141361439, + "grad_norm": 1.109375, + "learning_rate": 0.0004784335677488921, + "loss": 0.6312, + "step": 9838 + }, + { + "epoch": 0.6844759817732791, + "grad_norm": 1.109375, + "learning_rate": 0.00047824132447262213, + "loss": 0.6002, + "step": 9839 + }, + { + "epoch": 0.6845455494104142, + "grad_norm": 1.1171875, + "learning_rate": 0.0004780491076886283, + "loss": 0.8293, + "step": 9840 + }, + { + "epoch": 0.6846151170475495, + "grad_norm": 1.5703125, + "learning_rate": 0.0004778569174066699, + "loss": 1.0433, + "step": 9841 + }, + { + "epoch": 0.6846846846846847, + "grad_norm": 1.390625, + "learning_rate": 0.0004776647536365051, + "loss": 1.086, + "step": 9842 + }, + { + "epoch": 0.6847542523218199, + "grad_norm": 1.0234375, + "learning_rate": 0.0004774726163878914, + "loss": 0.5927, + "step": 9843 + }, + { + "epoch": 0.6848238199589551, + "grad_norm": 1.03125, + "learning_rate": 0.0004772805056705848, + "loss": 0.7805, + "step": 9844 + }, + { + "epoch": 0.6848933875960903, + "grad_norm": 0.95703125, + "learning_rate": 0.0004770884214943394, + "loss": 0.7865, + "step": 9845 + }, + { + "epoch": 0.6849629552332255, + "grad_norm": 1.0390625, + "learning_rate": 0.000476896363868908, + "loss": 0.7678, + "step": 9846 + }, + { + "epoch": 0.6850325228703608, + "grad_norm": 1.109375, + "learning_rate": 0.00047670433280404257, + "loss": 0.952, + "step": 9847 + }, + { + "epoch": 0.6851020905074959, + "grad_norm": 1.2265625, + "learning_rate": 0.00047651232830949386, + "loss": 0.8057, + "step": 9848 + }, + { + "epoch": 0.6851716581446311, + "grad_norm": 1.109375, + "learning_rate": 0.00047632035039501055, + "loss": 0.7233, + "step": 9849 + }, + { + "epoch": 0.6852412257817663, + "grad_norm": 1.1875, + "learning_rate": 0.0004761283990703399, + "loss": 0.5269, + "step": 9850 + }, + { + "epoch": 0.6853107934189016, + "grad_norm": 0.9375, + "learning_rate": 0.000475936474345229, + "loss": 0.5662, + "step": 9851 + }, + { + "epoch": 0.6853803610560367, + "grad_norm": 1.2890625, + "learning_rate": 0.00047574457622942225, + "loss": 0.7139, + "step": 9852 + }, + { + "epoch": 0.6854499286931719, + "grad_norm": 1.0859375, + "learning_rate": 0.0004755527047326633, + "loss": 0.7737, + "step": 9853 + }, + { + "epoch": 0.6855194963303072, + "grad_norm": 1.0546875, + "learning_rate": 0.000475360859864695, + "loss": 0.7234, + "step": 9854 + }, + { + "epoch": 0.6855890639674423, + "grad_norm": 1.0859375, + "learning_rate": 0.00047516904163525796, + "loss": 0.6907, + "step": 9855 + }, + { + "epoch": 0.6856586316045775, + "grad_norm": 1.4921875, + "learning_rate": 0.0004749772500540912, + "loss": 0.8297, + "step": 9856 + }, + { + "epoch": 0.6857281992417128, + "grad_norm": 1.03125, + "learning_rate": 0.00047478548513093334, + "loss": 0.6738, + "step": 9857 + }, + { + "epoch": 0.685797766878848, + "grad_norm": 1.296875, + "learning_rate": 0.0004745937468755217, + "loss": 1.108, + "step": 9858 + }, + { + "epoch": 0.6858673345159831, + "grad_norm": 0.80078125, + "learning_rate": 0.0004744020352975913, + "loss": 0.7054, + "step": 9859 + }, + { + "epoch": 0.6859369021531184, + "grad_norm": 1.0390625, + "learning_rate": 0.000474210350406876, + "loss": 0.7712, + "step": 9860 + }, + { + "epoch": 0.6860064697902536, + "grad_norm": 1.0703125, + "learning_rate": 0.00047401869221310887, + "loss": 0.867, + "step": 9861 + }, + { + "epoch": 0.6860760374273888, + "grad_norm": 1.0390625, + "learning_rate": 0.0004738270607260218, + "loss": 0.8149, + "step": 9862 + }, + { + "epoch": 0.6861456050645239, + "grad_norm": 1.0859375, + "learning_rate": 0.0004736354559553445, + "loss": 0.74, + "step": 9863 + }, + { + "epoch": 0.6862151727016592, + "grad_norm": 0.7421875, + "learning_rate": 0.00047344387791080535, + "loss": 0.4866, + "step": 9864 + }, + { + "epoch": 0.6862847403387944, + "grad_norm": 1.5078125, + "learning_rate": 0.00047325232660213234, + "loss": 1.0872, + "step": 9865 + }, + { + "epoch": 0.6863543079759296, + "grad_norm": 0.9140625, + "learning_rate": 0.00047306080203905076, + "loss": 0.7222, + "step": 9866 + }, + { + "epoch": 0.6864238756130648, + "grad_norm": 0.9921875, + "learning_rate": 0.00047286930423128584, + "loss": 0.6516, + "step": 9867 + }, + { + "epoch": 0.6864934432502, + "grad_norm": 0.91015625, + "learning_rate": 0.00047267783318856097, + "loss": 0.9393, + "step": 9868 + }, + { + "epoch": 0.6865630108873352, + "grad_norm": 1.09375, + "learning_rate": 0.0004724863889205978, + "loss": 0.673, + "step": 9869 + }, + { + "epoch": 0.6866325785244705, + "grad_norm": 1.0625, + "learning_rate": 0.0004722949714371166, + "loss": 0.7548, + "step": 9870 + }, + { + "epoch": 0.6867021461616056, + "grad_norm": 1.203125, + "learning_rate": 0.000472103580747837, + "loss": 0.899, + "step": 9871 + }, + { + "epoch": 0.6867717137987408, + "grad_norm": 1.140625, + "learning_rate": 0.0004719122168624771, + "loss": 0.892, + "step": 9872 + }, + { + "epoch": 0.6868412814358761, + "grad_norm": 1.203125, + "learning_rate": 0.00047172087979075307, + "loss": 1.0007, + "step": 9873 + }, + { + "epoch": 0.6869108490730113, + "grad_norm": 1.5546875, + "learning_rate": 0.00047152956954237967, + "loss": 0.9274, + "step": 9874 + }, + { + "epoch": 0.6869804167101464, + "grad_norm": 1.0234375, + "learning_rate": 0.00047133828612707095, + "loss": 0.8581, + "step": 9875 + }, + { + "epoch": 0.6870499843472816, + "grad_norm": 1.0234375, + "learning_rate": 0.0004711470295545399, + "loss": 0.8484, + "step": 9876 + }, + { + "epoch": 0.6871195519844169, + "grad_norm": 1.078125, + "learning_rate": 0.0004709557998344971, + "loss": 0.7848, + "step": 9877 + }, + { + "epoch": 0.687189119621552, + "grad_norm": 1.2578125, + "learning_rate": 0.00047076459697665174, + "loss": 0.9097, + "step": 9878 + }, + { + "epoch": 0.6872586872586872, + "grad_norm": 1.140625, + "learning_rate": 0.00047057342099071257, + "loss": 0.9504, + "step": 9879 + }, + { + "epoch": 0.6873282548958225, + "grad_norm": 1.375, + "learning_rate": 0.00047038227188638703, + "loss": 0.9776, + "step": 9880 + }, + { + "epoch": 0.6873978225329577, + "grad_norm": 1.25, + "learning_rate": 0.00047019114967338015, + "loss": 0.7916, + "step": 9881 + }, + { + "epoch": 0.6874673901700928, + "grad_norm": 0.9375, + "learning_rate": 0.000470000054361396, + "loss": 0.5856, + "step": 9882 + }, + { + "epoch": 0.6875369578072281, + "grad_norm": 1.0546875, + "learning_rate": 0.00046980898596013797, + "loss": 0.6789, + "step": 9883 + }, + { + "epoch": 0.6876065254443633, + "grad_norm": 1.0234375, + "learning_rate": 0.0004696179444793071, + "loss": 0.83, + "step": 9884 + }, + { + "epoch": 0.6876760930814985, + "grad_norm": 1.234375, + "learning_rate": 0.00046942692992860347, + "loss": 1.0817, + "step": 9885 + }, + { + "epoch": 0.6877456607186337, + "grad_norm": 0.86328125, + "learning_rate": 0.0004692359423177265, + "loss": 0.8211, + "step": 9886 + }, + { + "epoch": 0.6878152283557689, + "grad_norm": 0.87109375, + "learning_rate": 0.0004690449816563731, + "loss": 0.6253, + "step": 9887 + }, + { + "epoch": 0.6878847959929041, + "grad_norm": 1.1875, + "learning_rate": 0.00046885404795423894, + "loss": 0.749, + "step": 9888 + }, + { + "epoch": 0.6879543636300393, + "grad_norm": 1.09375, + "learning_rate": 0.00046866314122101906, + "loss": 1.0462, + "step": 9889 + }, + { + "epoch": 0.6880239312671745, + "grad_norm": 1.03125, + "learning_rate": 0.0004684722614664072, + "loss": 0.7023, + "step": 9890 + }, + { + "epoch": 0.6880934989043097, + "grad_norm": 0.94140625, + "learning_rate": 0.00046828140870009473, + "loss": 0.7339, + "step": 9891 + }, + { + "epoch": 0.6881630665414449, + "grad_norm": 1.09375, + "learning_rate": 0.00046809058293177186, + "loss": 0.7907, + "step": 9892 + }, + { + "epoch": 0.6882326341785802, + "grad_norm": 1.03125, + "learning_rate": 0.00046789978417112823, + "loss": 0.8627, + "step": 9893 + }, + { + "epoch": 0.6883022018157153, + "grad_norm": 1.171875, + "learning_rate": 0.0004677090124278519, + "loss": 0.7997, + "step": 9894 + }, + { + "epoch": 0.6883717694528505, + "grad_norm": 1.0703125, + "learning_rate": 0.00046751826771162895, + "loss": 0.8273, + "step": 9895 + }, + { + "epoch": 0.6884413370899858, + "grad_norm": 1.203125, + "learning_rate": 0.0004673275500321441, + "loss": 0.875, + "step": 9896 + }, + { + "epoch": 0.688510904727121, + "grad_norm": 1.53125, + "learning_rate": 0.0004671368593990818, + "loss": 0.7543, + "step": 9897 + }, + { + "epoch": 0.6885804723642561, + "grad_norm": 1.0625, + "learning_rate": 0.0004669461958221236, + "loss": 0.983, + "step": 9898 + }, + { + "epoch": 0.6886500400013914, + "grad_norm": 1.203125, + "learning_rate": 0.0004667555593109507, + "loss": 0.843, + "step": 9899 + }, + { + "epoch": 0.6887196076385266, + "grad_norm": 1.2109375, + "learning_rate": 0.0004665649498752432, + "loss": 0.6497, + "step": 9900 + }, + { + "epoch": 0.6887891752756617, + "grad_norm": 1.0234375, + "learning_rate": 0.00046637436752467874, + "loss": 0.8861, + "step": 9901 + }, + { + "epoch": 0.6888587429127969, + "grad_norm": 1.3515625, + "learning_rate": 0.00046618381226893403, + "loss": 0.8833, + "step": 9902 + }, + { + "epoch": 0.6889283105499322, + "grad_norm": 0.91015625, + "learning_rate": 0.0004659932841176845, + "loss": 0.7966, + "step": 9903 + }, + { + "epoch": 0.6889978781870674, + "grad_norm": 1.09375, + "learning_rate": 0.0004658027830806049, + "loss": 0.6317, + "step": 9904 + }, + { + "epoch": 0.6890674458242025, + "grad_norm": 1.234375, + "learning_rate": 0.0004656123091673674, + "loss": 1.1123, + "step": 9905 + }, + { + "epoch": 0.6891370134613378, + "grad_norm": 1.3359375, + "learning_rate": 0.00046542186238764295, + "loss": 0.8116, + "step": 9906 + }, + { + "epoch": 0.689206581098473, + "grad_norm": 1.1953125, + "learning_rate": 0.00046523144275110187, + "loss": 0.9125, + "step": 9907 + }, + { + "epoch": 0.6892761487356082, + "grad_norm": 1.1640625, + "learning_rate": 0.0004650410502674131, + "loss": 0.9166, + "step": 9908 + }, + { + "epoch": 0.6893457163727434, + "grad_norm": 1.0390625, + "learning_rate": 0.0004648506849462433, + "loss": 0.8827, + "step": 9909 + }, + { + "epoch": 0.6894152840098786, + "grad_norm": 1.5078125, + "learning_rate": 0.000464660346797258, + "loss": 0.645, + "step": 9910 + }, + { + "epoch": 0.6894848516470138, + "grad_norm": 1.15625, + "learning_rate": 0.0004644700358301224, + "loss": 0.9024, + "step": 9911 + }, + { + "epoch": 0.6895544192841491, + "grad_norm": 1.1171875, + "learning_rate": 0.0004642797520544987, + "loss": 0.8633, + "step": 9912 + }, + { + "epoch": 0.6896239869212842, + "grad_norm": 0.9921875, + "learning_rate": 0.00046408949548004897, + "loss": 0.6726, + "step": 9913 + }, + { + "epoch": 0.6896935545584194, + "grad_norm": 2.15625, + "learning_rate": 0.00046389926611643394, + "loss": 1.0634, + "step": 9914 + }, + { + "epoch": 0.6897631221955546, + "grad_norm": 1.125, + "learning_rate": 0.0004637090639733119, + "loss": 0.9568, + "step": 9915 + }, + { + "epoch": 0.6898326898326899, + "grad_norm": 1.3125, + "learning_rate": 0.0004635188890603402, + "loss": 0.7871, + "step": 9916 + }, + { + "epoch": 0.689902257469825, + "grad_norm": 1.015625, + "learning_rate": 0.00046332874138717517, + "loss": 0.731, + "step": 9917 + }, + { + "epoch": 0.6899718251069602, + "grad_norm": 1.2734375, + "learning_rate": 0.00046313862096347203, + "loss": 0.9366, + "step": 9918 + }, + { + "epoch": 0.6900413927440955, + "grad_norm": 1.6796875, + "learning_rate": 0.00046294852779888384, + "loss": 1.1027, + "step": 9919 + }, + { + "epoch": 0.6901109603812307, + "grad_norm": 0.9921875, + "learning_rate": 0.00046275846190306193, + "loss": 0.6191, + "step": 9920 + }, + { + "epoch": 0.6901805280183658, + "grad_norm": 0.92578125, + "learning_rate": 0.0004625684232856575, + "loss": 0.7207, + "step": 9921 + }, + { + "epoch": 0.6902500956555011, + "grad_norm": 1.1328125, + "learning_rate": 0.00046237841195632013, + "loss": 0.797, + "step": 9922 + }, + { + "epoch": 0.6903196632926363, + "grad_norm": 0.9609375, + "learning_rate": 0.0004621884279246971, + "loss": 0.6195, + "step": 9923 + }, + { + "epoch": 0.6903892309297714, + "grad_norm": 0.953125, + "learning_rate": 0.0004619984712004346, + "loss": 0.5835, + "step": 9924 + }, + { + "epoch": 0.6904587985669067, + "grad_norm": 0.9140625, + "learning_rate": 0.0004618085417931779, + "loss": 0.696, + "step": 9925 + }, + { + "epoch": 0.6905283662040419, + "grad_norm": 1.0078125, + "learning_rate": 0.00046161863971257123, + "loss": 0.8957, + "step": 9926 + }, + { + "epoch": 0.6905979338411771, + "grad_norm": 0.875, + "learning_rate": 0.00046142876496825606, + "loss": 0.7213, + "step": 9927 + }, + { + "epoch": 0.6906675014783122, + "grad_norm": 1.0, + "learning_rate": 0.0004612389175698739, + "loss": 0.855, + "step": 9928 + }, + { + "epoch": 0.6907370691154475, + "grad_norm": 1.2734375, + "learning_rate": 0.0004610490975270639, + "loss": 0.7702, + "step": 9929 + }, + { + "epoch": 0.6908066367525827, + "grad_norm": 1.453125, + "learning_rate": 0.0004608593048494639, + "loss": 1.1194, + "step": 9930 + }, + { + "epoch": 0.6908762043897179, + "grad_norm": 1.03125, + "learning_rate": 0.000460669539546711, + "loss": 0.4911, + "step": 9931 + }, + { + "epoch": 0.6909457720268531, + "grad_norm": 1.0625, + "learning_rate": 0.00046047980162844073, + "loss": 0.8049, + "step": 9932 + }, + { + "epoch": 0.6910153396639883, + "grad_norm": 1.0859375, + "learning_rate": 0.0004602900911042868, + "loss": 0.8622, + "step": 9933 + }, + { + "epoch": 0.6910849073011235, + "grad_norm": 1.125, + "learning_rate": 0.0004601004079838813, + "loss": 0.7903, + "step": 9934 + }, + { + "epoch": 0.6911544749382588, + "grad_norm": 1.3046875, + "learning_rate": 0.0004599107522768557, + "loss": 0.9072, + "step": 9935 + }, + { + "epoch": 0.6912240425753939, + "grad_norm": 1.21875, + "learning_rate": 0.00045972112399284037, + "loss": 0.7728, + "step": 9936 + }, + { + "epoch": 0.6912936102125291, + "grad_norm": 1.0625, + "learning_rate": 0.0004595315231414632, + "loss": 1.007, + "step": 9937 + }, + { + "epoch": 0.6913631778496644, + "grad_norm": 1.0, + "learning_rate": 0.00045934194973235054, + "loss": 0.7445, + "step": 9938 + }, + { + "epoch": 0.6914327454867996, + "grad_norm": 1.15625, + "learning_rate": 0.00045915240377512867, + "loss": 0.963, + "step": 9939 + }, + { + "epoch": 0.6915023131239347, + "grad_norm": 0.95703125, + "learning_rate": 0.0004589628852794221, + "loss": 0.7394, + "step": 9940 + }, + { + "epoch": 0.6915718807610699, + "grad_norm": 1.0078125, + "learning_rate": 0.00045877339425485277, + "loss": 0.778, + "step": 9941 + }, + { + "epoch": 0.6916414483982052, + "grad_norm": 1.234375, + "learning_rate": 0.0004585839307110428, + "loss": 0.7646, + "step": 9942 + }, + { + "epoch": 0.6917110160353404, + "grad_norm": 1.234375, + "learning_rate": 0.00045839449465761195, + "loss": 0.9427, + "step": 9943 + }, + { + "epoch": 0.6917805836724755, + "grad_norm": 1.0859375, + "learning_rate": 0.0004582050861041783, + "loss": 0.5645, + "step": 9944 + }, + { + "epoch": 0.6918501513096108, + "grad_norm": 0.93359375, + "learning_rate": 0.00045801570506035974, + "loss": 0.7495, + "step": 9945 + }, + { + "epoch": 0.691919718946746, + "grad_norm": 1.5078125, + "learning_rate": 0.000457826351535772, + "loss": 0.8523, + "step": 9946 + }, + { + "epoch": 0.6919892865838811, + "grad_norm": 1.0, + "learning_rate": 0.0004576370255400295, + "loss": 1.0741, + "step": 9947 + }, + { + "epoch": 0.6920588542210164, + "grad_norm": 1.296875, + "learning_rate": 0.00045744772708274485, + "loss": 0.8472, + "step": 9948 + }, + { + "epoch": 0.6921284218581516, + "grad_norm": 1.3515625, + "learning_rate": 0.00045725845617352977, + "loss": 0.7895, + "step": 9949 + }, + { + "epoch": 0.6921979894952868, + "grad_norm": 1.078125, + "learning_rate": 0.0004570692128219951, + "loss": 0.7883, + "step": 9950 + }, + { + "epoch": 0.692267557132422, + "grad_norm": 1.0546875, + "learning_rate": 0.0004568799970377493, + "loss": 0.8317, + "step": 9951 + }, + { + "epoch": 0.6923371247695572, + "grad_norm": 1.125, + "learning_rate": 0.00045669080883039924, + "loss": 0.9023, + "step": 9952 + }, + { + "epoch": 0.6924066924066924, + "grad_norm": 1.0703125, + "learning_rate": 0.0004565016482095515, + "loss": 0.7524, + "step": 9953 + }, + { + "epoch": 0.6924762600438276, + "grad_norm": 1.2421875, + "learning_rate": 0.000456312515184811, + "loss": 0.7842, + "step": 9954 + }, + { + "epoch": 0.6925458276809628, + "grad_norm": 1.0234375, + "learning_rate": 0.0004561234097657806, + "loss": 0.8211, + "step": 9955 + }, + { + "epoch": 0.692615395318098, + "grad_norm": 1.1796875, + "learning_rate": 0.0004559343319620617, + "loss": 0.6618, + "step": 9956 + }, + { + "epoch": 0.6926849629552332, + "grad_norm": 1.28125, + "learning_rate": 0.0004557452817832551, + "loss": 0.8867, + "step": 9957 + }, + { + "epoch": 0.6927545305923685, + "grad_norm": 1.03125, + "learning_rate": 0.0004555562592389603, + "loss": 0.7492, + "step": 9958 + }, + { + "epoch": 0.6928240982295036, + "grad_norm": 1.359375, + "learning_rate": 0.00045536726433877405, + "loss": 0.9273, + "step": 9959 + }, + { + "epoch": 0.6928936658666388, + "grad_norm": 1.328125, + "learning_rate": 0.0004551782970922933, + "loss": 1.032, + "step": 9960 + }, + { + "epoch": 0.6929632335037741, + "grad_norm": 1.046875, + "learning_rate": 0.00045498935750911253, + "loss": 0.7288, + "step": 9961 + }, + { + "epoch": 0.6930328011409093, + "grad_norm": 1.171875, + "learning_rate": 0.0004548004455988248, + "loss": 0.9221, + "step": 9962 + }, + { + "epoch": 0.6931023687780444, + "grad_norm": 0.9921875, + "learning_rate": 0.0004546115613710224, + "loss": 0.6233, + "step": 9963 + }, + { + "epoch": 0.6931719364151797, + "grad_norm": 1.078125, + "learning_rate": 0.00045442270483529636, + "loss": 0.7356, + "step": 9964 + }, + { + "epoch": 0.6932415040523149, + "grad_norm": 1.2734375, + "learning_rate": 0.00045423387600123543, + "loss": 1.0036, + "step": 9965 + }, + { + "epoch": 0.69331107168945, + "grad_norm": 1.171875, + "learning_rate": 0.00045404507487842706, + "loss": 1.0622, + "step": 9966 + }, + { + "epoch": 0.6933806393265852, + "grad_norm": 1.0234375, + "learning_rate": 0.00045385630147645793, + "loss": 0.5868, + "step": 9967 + }, + { + "epoch": 0.6934502069637205, + "grad_norm": 1.1484375, + "learning_rate": 0.00045366755580491337, + "loss": 0.9475, + "step": 9968 + }, + { + "epoch": 0.6935197746008557, + "grad_norm": 1.078125, + "learning_rate": 0.00045347883787337674, + "loss": 1.0282, + "step": 9969 + }, + { + "epoch": 0.6935893422379908, + "grad_norm": 0.94921875, + "learning_rate": 0.00045329014769142963, + "loss": 0.9051, + "step": 9970 + }, + { + "epoch": 0.6936589098751261, + "grad_norm": 1.0390625, + "learning_rate": 0.00045310148526865314, + "loss": 0.6954, + "step": 9971 + }, + { + "epoch": 0.6937284775122613, + "grad_norm": 1.0078125, + "learning_rate": 0.00045291285061462705, + "loss": 0.8558, + "step": 9972 + }, + { + "epoch": 0.6937980451493965, + "grad_norm": 0.8671875, + "learning_rate": 0.0004527242437389285, + "loss": 0.6456, + "step": 9973 + }, + { + "epoch": 0.6938676127865318, + "grad_norm": 1.015625, + "learning_rate": 0.0004525356646511348, + "loss": 0.754, + "step": 9974 + }, + { + "epoch": 0.6939371804236669, + "grad_norm": 1.203125, + "learning_rate": 0.0004523471133608206, + "loss": 0.8408, + "step": 9975 + }, + { + "epoch": 0.6940067480608021, + "grad_norm": 0.953125, + "learning_rate": 0.0004521585898775592, + "loss": 0.629, + "step": 9976 + }, + { + "epoch": 0.6940763156979374, + "grad_norm": 0.95703125, + "learning_rate": 0.0004519700942109234, + "loss": 0.861, + "step": 9977 + }, + { + "epoch": 0.6941458833350725, + "grad_norm": 1.1640625, + "learning_rate": 0.00045178162637048413, + "loss": 0.7202, + "step": 9978 + }, + { + "epoch": 0.6942154509722077, + "grad_norm": 1.1015625, + "learning_rate": 0.00045159318636581083, + "loss": 0.8389, + "step": 9979 + }, + { + "epoch": 0.6942850186093429, + "grad_norm": 1.25, + "learning_rate": 0.0004514047742064709, + "loss": 0.689, + "step": 9980 + }, + { + "epoch": 0.6943545862464782, + "grad_norm": 1.03125, + "learning_rate": 0.0004512163899020314, + "loss": 0.6303, + "step": 9981 + }, + { + "epoch": 0.6944241538836133, + "grad_norm": 1.25, + "learning_rate": 0.0004510280334620579, + "loss": 0.7624, + "step": 9982 + }, + { + "epoch": 0.6944937215207485, + "grad_norm": 1.078125, + "learning_rate": 0.00045083970489611383, + "loss": 0.6146, + "step": 9983 + }, + { + "epoch": 0.6945632891578838, + "grad_norm": 1.15625, + "learning_rate": 0.00045065140421376125, + "loss": 0.9113, + "step": 9984 + }, + { + "epoch": 0.694632856795019, + "grad_norm": 1.265625, + "learning_rate": 0.0004504631314245614, + "loss": 0.7849, + "step": 9985 + }, + { + "epoch": 0.6947024244321541, + "grad_norm": 1.390625, + "learning_rate": 0.00045027488653807425, + "loss": 0.7079, + "step": 9986 + }, + { + "epoch": 0.6947719920692894, + "grad_norm": 1.484375, + "learning_rate": 0.00045008666956385725, + "loss": 0.8342, + "step": 9987 + }, + { + "epoch": 0.6948415597064246, + "grad_norm": 1.2421875, + "learning_rate": 0.00044989848051146765, + "loss": 0.7096, + "step": 9988 + }, + { + "epoch": 0.6949111273435598, + "grad_norm": 1.1640625, + "learning_rate": 0.0004497103193904601, + "loss": 0.8621, + "step": 9989 + }, + { + "epoch": 0.694980694980695, + "grad_norm": 0.97265625, + "learning_rate": 0.00044952218621038944, + "loss": 0.7878, + "step": 9990 + }, + { + "epoch": 0.6950502626178302, + "grad_norm": 1.109375, + "learning_rate": 0.000449334080980807, + "loss": 0.925, + "step": 9991 + }, + { + "epoch": 0.6951198302549654, + "grad_norm": 1.234375, + "learning_rate": 0.0004491460037112648, + "loss": 0.8893, + "step": 9992 + }, + { + "epoch": 0.6951893978921005, + "grad_norm": 1.09375, + "learning_rate": 0.000448957954411312, + "loss": 0.9219, + "step": 9993 + }, + { + "epoch": 0.6952589655292358, + "grad_norm": 1.3203125, + "learning_rate": 0.00044876993309049654, + "loss": 1.1209, + "step": 9994 + }, + { + "epoch": 0.695328533166371, + "grad_norm": 0.87109375, + "learning_rate": 0.00044858193975836534, + "loss": 0.7067, + "step": 9995 + }, + { + "epoch": 0.6953981008035062, + "grad_norm": 0.921875, + "learning_rate": 0.0004483939744244643, + "loss": 0.5887, + "step": 9996 + }, + { + "epoch": 0.6954676684406415, + "grad_norm": 1.2265625, + "learning_rate": 0.000448206037098337, + "loss": 0.9967, + "step": 9997 + }, + { + "epoch": 0.6955372360777766, + "grad_norm": 1.0, + "learning_rate": 0.00044801812778952544, + "loss": 0.8292, + "step": 9998 + }, + { + "epoch": 0.6956068037149118, + "grad_norm": 1.1171875, + "learning_rate": 0.0004478302465075711, + "loss": 0.8827, + "step": 9999 + }, + { + "epoch": 0.6956763713520471, + "grad_norm": 1.1015625, + "learning_rate": 0.00044764239326201415, + "loss": 0.809, + "step": 10000 + }, + { + "epoch": 0.6957459389891822, + "grad_norm": 0.90625, + "learning_rate": 0.00044745456806239206, + "loss": 0.6268, + "step": 10001 + }, + { + "epoch": 0.6958155066263174, + "grad_norm": 1.0859375, + "learning_rate": 0.0004472667709182423, + "loss": 0.8195, + "step": 10002 + }, + { + "epoch": 0.6958850742634527, + "grad_norm": 1.09375, + "learning_rate": 0.00044707900183909953, + "loss": 0.6104, + "step": 10003 + }, + { + "epoch": 0.6959546419005879, + "grad_norm": 1.0, + "learning_rate": 0.0004468912608344985, + "loss": 0.6953, + "step": 10004 + }, + { + "epoch": 0.696024209537723, + "grad_norm": 0.9375, + "learning_rate": 0.00044670354791397104, + "loss": 0.547, + "step": 10005 + }, + { + "epoch": 0.6960937771748582, + "grad_norm": 1.03125, + "learning_rate": 0.00044651586308704896, + "loss": 0.7972, + "step": 10006 + }, + { + "epoch": 0.6961633448119935, + "grad_norm": 1.1875, + "learning_rate": 0.00044632820636326156, + "loss": 0.7603, + "step": 10007 + }, + { + "epoch": 0.6962329124491287, + "grad_norm": 1.4140625, + "learning_rate": 0.00044614057775213637, + "loss": 0.9619, + "step": 10008 + }, + { + "epoch": 0.6963024800862638, + "grad_norm": 2.859375, + "learning_rate": 0.00044595297726320173, + "loss": 1.0059, + "step": 10009 + }, + { + "epoch": 0.6963720477233991, + "grad_norm": 1.1640625, + "learning_rate": 0.00044576540490598226, + "loss": 0.8679, + "step": 10010 + }, + { + "epoch": 0.6964416153605343, + "grad_norm": 1.0, + "learning_rate": 0.0004455778606900021, + "loss": 0.5389, + "step": 10011 + }, + { + "epoch": 0.6965111829976695, + "grad_norm": 1.234375, + "learning_rate": 0.00044539034462478324, + "loss": 0.7035, + "step": 10012 + }, + { + "epoch": 0.6965807506348047, + "grad_norm": 0.9609375, + "learning_rate": 0.00044520285671984715, + "loss": 0.6121, + "step": 10013 + }, + { + "epoch": 0.6966503182719399, + "grad_norm": 1.078125, + "learning_rate": 0.00044501539698471414, + "loss": 0.9757, + "step": 10014 + }, + { + "epoch": 0.6967198859090751, + "grad_norm": 1.1953125, + "learning_rate": 0.0004448279654289015, + "loss": 0.7322, + "step": 10015 + }, + { + "epoch": 0.6967894535462104, + "grad_norm": 0.9921875, + "learning_rate": 0.00044464056206192684, + "loss": 0.6639, + "step": 10016 + }, + { + "epoch": 0.6968590211833455, + "grad_norm": 0.86328125, + "learning_rate": 0.00044445318689330496, + "loss": 0.6743, + "step": 10017 + }, + { + "epoch": 0.6969285888204807, + "grad_norm": 1.125, + "learning_rate": 0.0004442658399325503, + "loss": 1.0035, + "step": 10018 + }, + { + "epoch": 0.6969981564576159, + "grad_norm": 1.484375, + "learning_rate": 0.0004440785211891749, + "loss": 0.9152, + "step": 10019 + }, + { + "epoch": 0.6970677240947512, + "grad_norm": 0.96484375, + "learning_rate": 0.00044389123067269055, + "loss": 0.7669, + "step": 10020 + }, + { + "epoch": 0.6971372917318863, + "grad_norm": 1.0390625, + "learning_rate": 0.00044370396839260606, + "loss": 0.9853, + "step": 10021 + }, + { + "epoch": 0.6972068593690215, + "grad_norm": 1.0703125, + "learning_rate": 0.0004435167343584302, + "loss": 0.827, + "step": 10022 + }, + { + "epoch": 0.6972764270061568, + "grad_norm": 1.3125, + "learning_rate": 0.0004433295285796699, + "loss": 0.8992, + "step": 10023 + }, + { + "epoch": 0.6973459946432919, + "grad_norm": 0.8515625, + "learning_rate": 0.0004431423510658304, + "loss": 0.6252, + "step": 10024 + }, + { + "epoch": 0.6974155622804271, + "grad_norm": 1.1015625, + "learning_rate": 0.0004429552018264157, + "loss": 0.955, + "step": 10025 + }, + { + "epoch": 0.6974851299175624, + "grad_norm": 1.28125, + "learning_rate": 0.0004427680808709276, + "loss": 1.0229, + "step": 10026 + }, + { + "epoch": 0.6975546975546976, + "grad_norm": 1.15625, + "learning_rate": 0.00044258098820886774, + "loss": 0.8838, + "step": 10027 + }, + { + "epoch": 0.6976242651918327, + "grad_norm": 1.15625, + "learning_rate": 0.000442393923849736, + "loss": 0.9667, + "step": 10028 + }, + { + "epoch": 0.697693832828968, + "grad_norm": 1.1015625, + "learning_rate": 0.0004422068878030303, + "loss": 0.8528, + "step": 10029 + }, + { + "epoch": 0.6977634004661032, + "grad_norm": 1.2578125, + "learning_rate": 0.000442019880078247, + "loss": 0.851, + "step": 10030 + }, + { + "epoch": 0.6978329681032384, + "grad_norm": 1.359375, + "learning_rate": 0.0004418329006848818, + "loss": 0.7651, + "step": 10031 + }, + { + "epoch": 0.6979025357403735, + "grad_norm": 1.140625, + "learning_rate": 0.0004416459496324289, + "loss": 0.8588, + "step": 10032 + }, + { + "epoch": 0.6979721033775088, + "grad_norm": 1.5078125, + "learning_rate": 0.00044145902693037986, + "loss": 1.0708, + "step": 10033 + }, + { + "epoch": 0.698041671014644, + "grad_norm": 0.83203125, + "learning_rate": 0.0004412721325882266, + "loss": 0.701, + "step": 10034 + }, + { + "epoch": 0.6981112386517792, + "grad_norm": 0.953125, + "learning_rate": 0.000441085266615458, + "loss": 0.7535, + "step": 10035 + }, + { + "epoch": 0.6981808062889144, + "grad_norm": 1.25, + "learning_rate": 0.00044089842902156275, + "loss": 1.0042, + "step": 10036 + }, + { + "epoch": 0.6982503739260496, + "grad_norm": 1.109375, + "learning_rate": 0.00044071161981602667, + "loss": 0.9096, + "step": 10037 + }, + { + "epoch": 0.6983199415631848, + "grad_norm": 1.28125, + "learning_rate": 0.0004405248390083361, + "loss": 1.0399, + "step": 10038 + }, + { + "epoch": 0.6983895092003201, + "grad_norm": 1.4453125, + "learning_rate": 0.0004403380866079741, + "loss": 1.0811, + "step": 10039 + }, + { + "epoch": 0.6984590768374552, + "grad_norm": 0.96484375, + "learning_rate": 0.00044015136262442247, + "loss": 0.4406, + "step": 10040 + }, + { + "epoch": 0.6985286444745904, + "grad_norm": 1.109375, + "learning_rate": 0.00043996466706716354, + "loss": 0.8179, + "step": 10041 + }, + { + "epoch": 0.6985982121117257, + "grad_norm": 1.1953125, + "learning_rate": 0.00043977799994567604, + "loss": 0.7871, + "step": 10042 + }, + { + "epoch": 0.6986677797488609, + "grad_norm": 1.4296875, + "learning_rate": 0.0004395913612694379, + "loss": 1.0078, + "step": 10043 + }, + { + "epoch": 0.698737347385996, + "grad_norm": 1.203125, + "learning_rate": 0.0004394047510479254, + "loss": 0.721, + "step": 10044 + }, + { + "epoch": 0.6988069150231312, + "grad_norm": 1.4140625, + "learning_rate": 0.0004392181692906142, + "loss": 0.8101, + "step": 10045 + }, + { + "epoch": 0.6988764826602665, + "grad_norm": 1.0390625, + "learning_rate": 0.00043903161600697806, + "loss": 0.7656, + "step": 10046 + }, + { + "epoch": 0.6989460502974016, + "grad_norm": 0.98046875, + "learning_rate": 0.00043884509120648864, + "loss": 0.5622, + "step": 10047 + }, + { + "epoch": 0.6990156179345368, + "grad_norm": 0.9921875, + "learning_rate": 0.0004386585948986174, + "loss": 0.7001, + "step": 10048 + }, + { + "epoch": 0.6990851855716721, + "grad_norm": 0.99609375, + "learning_rate": 0.0004384721270928329, + "loss": 0.8348, + "step": 10049 + }, + { + "epoch": 0.6991547532088073, + "grad_norm": 1.015625, + "learning_rate": 0.0004382856877986039, + "loss": 0.7234, + "step": 10050 + }, + { + "epoch": 0.6992243208459424, + "grad_norm": 0.9296875, + "learning_rate": 0.00043809927702539607, + "loss": 0.5557, + "step": 10051 + }, + { + "epoch": 0.6992938884830777, + "grad_norm": 1.609375, + "learning_rate": 0.00043791289478267514, + "loss": 1.1759, + "step": 10052 + }, + { + "epoch": 0.6993634561202129, + "grad_norm": 0.9296875, + "learning_rate": 0.00043772654107990385, + "loss": 0.8746, + "step": 10053 + }, + { + "epoch": 0.6994330237573481, + "grad_norm": 0.8125, + "learning_rate": 0.0004375402159265448, + "loss": 0.8024, + "step": 10054 + }, + { + "epoch": 0.6995025913944833, + "grad_norm": 1.0390625, + "learning_rate": 0.0004373539193320589, + "loss": 0.7398, + "step": 10055 + }, + { + "epoch": 0.6995721590316185, + "grad_norm": 1.125, + "learning_rate": 0.00043716765130590507, + "loss": 0.6992, + "step": 10056 + }, + { + "epoch": 0.6996417266687537, + "grad_norm": 1.4296875, + "learning_rate": 0.00043698141185754104, + "loss": 0.8592, + "step": 10057 + }, + { + "epoch": 0.6997112943058889, + "grad_norm": 1.0390625, + "learning_rate": 0.00043679520099642276, + "loss": 0.7887, + "step": 10058 + }, + { + "epoch": 0.6997808619430241, + "grad_norm": 1.03125, + "learning_rate": 0.00043660901873200533, + "loss": 0.7324, + "step": 10059 + }, + { + "epoch": 0.6998504295801593, + "grad_norm": 1.2890625, + "learning_rate": 0.0004364228650737426, + "loss": 0.7056, + "step": 10060 + }, + { + "epoch": 0.6999199972172945, + "grad_norm": 0.8671875, + "learning_rate": 0.00043623674003108584, + "loss": 0.8171, + "step": 10061 + }, + { + "epoch": 0.6999895648544298, + "grad_norm": 1.078125, + "learning_rate": 0.00043605064361348613, + "loss": 0.7432, + "step": 10062 + }, + { + "epoch": 0.7000591324915649, + "grad_norm": 1.125, + "learning_rate": 0.00043586457583039183, + "loss": 0.6791, + "step": 10063 + }, + { + "epoch": 0.7001287001287001, + "grad_norm": 1.3671875, + "learning_rate": 0.00043567853669125133, + "loss": 0.8353, + "step": 10064 + }, + { + "epoch": 0.7001982677658354, + "grad_norm": 1.015625, + "learning_rate": 0.00043549252620551004, + "loss": 0.7989, + "step": 10065 + }, + { + "epoch": 0.7002678354029706, + "grad_norm": 1.1875, + "learning_rate": 0.0004353065443826133, + "loss": 0.6995, + "step": 10066 + }, + { + "epoch": 0.7003374030401057, + "grad_norm": 0.99609375, + "learning_rate": 0.00043512059123200356, + "loss": 0.6129, + "step": 10067 + }, + { + "epoch": 0.7004069706772409, + "grad_norm": 1.1015625, + "learning_rate": 0.000434934666763123, + "loss": 0.8444, + "step": 10068 + }, + { + "epoch": 0.7004765383143762, + "grad_norm": 1.0078125, + "learning_rate": 0.0004347487709854122, + "loss": 0.8488, + "step": 10069 + }, + { + "epoch": 0.7005461059515113, + "grad_norm": 1.234375, + "learning_rate": 0.0004345629039083099, + "loss": 0.8437, + "step": 10070 + }, + { + "epoch": 0.7006156735886465, + "grad_norm": 1.0078125, + "learning_rate": 0.0004343770655412532, + "loss": 1.0469, + "step": 10071 + }, + { + "epoch": 0.7006852412257818, + "grad_norm": 0.875, + "learning_rate": 0.00043419125589367745, + "loss": 0.7606, + "step": 10072 + }, + { + "epoch": 0.700754808862917, + "grad_norm": 0.890625, + "learning_rate": 0.00043400547497501863, + "loss": 0.6727, + "step": 10073 + }, + { + "epoch": 0.7008243765000521, + "grad_norm": 1.078125, + "learning_rate": 0.00043381972279470896, + "loss": 0.7686, + "step": 10074 + }, + { + "epoch": 0.7008939441371874, + "grad_norm": 0.7890625, + "learning_rate": 0.0004336339993621795, + "loss": 0.6855, + "step": 10075 + }, + { + "epoch": 0.7009635117743226, + "grad_norm": 0.90625, + "learning_rate": 0.00043344830468686137, + "loss": 0.7121, + "step": 10076 + }, + { + "epoch": 0.7010330794114578, + "grad_norm": 1.3125, + "learning_rate": 0.00043326263877818227, + "loss": 0.8287, + "step": 10077 + }, + { + "epoch": 0.701102647048593, + "grad_norm": 1.1328125, + "learning_rate": 0.00043307700164557016, + "loss": 0.7769, + "step": 10078 + }, + { + "epoch": 0.7011722146857282, + "grad_norm": 1.0859375, + "learning_rate": 0.00043289139329845004, + "loss": 0.8616, + "step": 10079 + }, + { + "epoch": 0.7012417823228634, + "grad_norm": 0.828125, + "learning_rate": 0.00043270581374624695, + "loss": 0.8422, + "step": 10080 + }, + { + "epoch": 0.7013113499599986, + "grad_norm": 1.296875, + "learning_rate": 0.0004325202629983829, + "loss": 0.9342, + "step": 10081 + }, + { + "epoch": 0.7013809175971338, + "grad_norm": 1.078125, + "learning_rate": 0.0004323347410642795, + "loss": 0.7916, + "step": 10082 + }, + { + "epoch": 0.701450485234269, + "grad_norm": 1.203125, + "learning_rate": 0.00043214924795335717, + "loss": 0.7212, + "step": 10083 + }, + { + "epoch": 0.7015200528714042, + "grad_norm": 1.1796875, + "learning_rate": 0.00043196378367503377, + "loss": 0.7458, + "step": 10084 + }, + { + "epoch": 0.7015896205085395, + "grad_norm": 1.2265625, + "learning_rate": 0.00043177834823872644, + "loss": 0.8478, + "step": 10085 + }, + { + "epoch": 0.7016591881456746, + "grad_norm": 1.0078125, + "learning_rate": 0.00043159294165384963, + "loss": 0.8785, + "step": 10086 + }, + { + "epoch": 0.7017287557828098, + "grad_norm": 0.8828125, + "learning_rate": 0.00043140756392981905, + "loss": 0.6397, + "step": 10087 + }, + { + "epoch": 0.7017983234199451, + "grad_norm": 1.0234375, + "learning_rate": 0.00043122221507604653, + "loss": 0.8018, + "step": 10088 + }, + { + "epoch": 0.7018678910570803, + "grad_norm": 1.0, + "learning_rate": 0.00043103689510194264, + "loss": 0.6511, + "step": 10089 + }, + { + "epoch": 0.7019374586942154, + "grad_norm": 1.4140625, + "learning_rate": 0.0004308516040169178, + "loss": 0.7782, + "step": 10090 + }, + { + "epoch": 0.7020070263313507, + "grad_norm": 1.0078125, + "learning_rate": 0.00043066634183037945, + "loss": 0.7878, + "step": 10091 + }, + { + "epoch": 0.7020765939684859, + "grad_norm": 0.921875, + "learning_rate": 0.00043048110855173507, + "loss": 0.688, + "step": 10092 + }, + { + "epoch": 0.702146161605621, + "grad_norm": 0.89453125, + "learning_rate": 0.0004302959041903889, + "loss": 0.619, + "step": 10093 + }, + { + "epoch": 0.7022157292427562, + "grad_norm": 0.984375, + "learning_rate": 0.0004301107287557455, + "loss": 0.8383, + "step": 10094 + }, + { + "epoch": 0.7022852968798915, + "grad_norm": 1.0625, + "learning_rate": 0.0004299255822572067, + "loss": 0.7158, + "step": 10095 + }, + { + "epoch": 0.7023548645170267, + "grad_norm": 1.265625, + "learning_rate": 0.00042974046470417327, + "loss": 0.8948, + "step": 10096 + }, + { + "epoch": 0.7024244321541618, + "grad_norm": 1.0859375, + "learning_rate": 0.0004295553761060451, + "loss": 0.9329, + "step": 10097 + }, + { + "epoch": 0.7024939997912971, + "grad_norm": 1.0078125, + "learning_rate": 0.0004293703164722197, + "loss": 0.7611, + "step": 10098 + }, + { + "epoch": 0.7025635674284323, + "grad_norm": 1.0546875, + "learning_rate": 0.00042918528581209313, + "loss": 0.6111, + "step": 10099 + }, + { + "epoch": 0.7026331350655675, + "grad_norm": 1.0546875, + "learning_rate": 0.00042900028413506055, + "loss": 0.6119, + "step": 10100 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 1.171875, + "learning_rate": 0.0004288153114505159, + "loss": 0.7833, + "step": 10101 + }, + { + "epoch": 0.7027722703398379, + "grad_norm": 1.0, + "learning_rate": 0.0004286303677678508, + "loss": 0.6972, + "step": 10102 + }, + { + "epoch": 0.7028418379769731, + "grad_norm": 1.0546875, + "learning_rate": 0.0004284454530964552, + "loss": 0.7726, + "step": 10103 + }, + { + "epoch": 0.7029114056141084, + "grad_norm": 1.140625, + "learning_rate": 0.00042826056744571875, + "loss": 0.7007, + "step": 10104 + }, + { + "epoch": 0.7029809732512435, + "grad_norm": 1.28125, + "learning_rate": 0.0004280757108250293, + "loss": 0.838, + "step": 10105 + }, + { + "epoch": 0.7030505408883787, + "grad_norm": 1.296875, + "learning_rate": 0.00042789088324377244, + "loss": 0.9073, + "step": 10106 + }, + { + "epoch": 0.7031201085255139, + "grad_norm": 0.9921875, + "learning_rate": 0.00042770608471133254, + "loss": 0.642, + "step": 10107 + }, + { + "epoch": 0.7031896761626492, + "grad_norm": 1.5, + "learning_rate": 0.00042752131523709347, + "loss": 1.1096, + "step": 10108 + }, + { + "epoch": 0.7032592437997843, + "grad_norm": 1.0390625, + "learning_rate": 0.0004273365748304362, + "loss": 0.7165, + "step": 10109 + }, + { + "epoch": 0.7033288114369195, + "grad_norm": 1.03125, + "learning_rate": 0.0004271518635007415, + "loss": 0.7117, + "step": 10110 + }, + { + "epoch": 0.7033983790740548, + "grad_norm": 1.1953125, + "learning_rate": 0.00042696718125738756, + "loss": 0.8286, + "step": 10111 + }, + { + "epoch": 0.70346794671119, + "grad_norm": 1.21875, + "learning_rate": 0.00042678252810975206, + "loss": 0.8924, + "step": 10112 + }, + { + "epoch": 0.7035375143483251, + "grad_norm": 1.578125, + "learning_rate": 0.00042659790406721033, + "loss": 0.6813, + "step": 10113 + }, + { + "epoch": 0.7036070819854604, + "grad_norm": 1.28125, + "learning_rate": 0.00042641330913913676, + "loss": 1.0315, + "step": 10114 + }, + { + "epoch": 0.7036766496225956, + "grad_norm": 1.078125, + "learning_rate": 0.0004262287433349047, + "loss": 0.8003, + "step": 10115 + }, + { + "epoch": 0.7037462172597307, + "grad_norm": 1.296875, + "learning_rate": 0.000426044206663885, + "loss": 0.9367, + "step": 10116 + }, + { + "epoch": 0.703815784896866, + "grad_norm": 1.0078125, + "learning_rate": 0.0004258596991354475, + "loss": 0.7856, + "step": 10117 + }, + { + "epoch": 0.7038853525340012, + "grad_norm": 0.82421875, + "learning_rate": 0.0004256752207589599, + "loss": 0.776, + "step": 10118 + }, + { + "epoch": 0.7039549201711364, + "grad_norm": 1.953125, + "learning_rate": 0.00042549077154379055, + "loss": 0.6142, + "step": 10119 + }, + { + "epoch": 0.7040244878082715, + "grad_norm": 1.03125, + "learning_rate": 0.00042530635149930397, + "loss": 0.5824, + "step": 10120 + }, + { + "epoch": 0.7040940554454068, + "grad_norm": 1.40625, + "learning_rate": 0.00042512196063486396, + "loss": 0.8986, + "step": 10121 + }, + { + "epoch": 0.704163623082542, + "grad_norm": 1.2109375, + "learning_rate": 0.0004249375989598335, + "loss": 0.9061, + "step": 10122 + }, + { + "epoch": 0.7042331907196772, + "grad_norm": 0.99609375, + "learning_rate": 0.00042475326648357283, + "loss": 0.6117, + "step": 10123 + }, + { + "epoch": 0.7043027583568124, + "grad_norm": 1.0078125, + "learning_rate": 0.00042456896321544225, + "loss": 0.7902, + "step": 10124 + }, + { + "epoch": 0.7043723259939476, + "grad_norm": 1.25, + "learning_rate": 0.0004243846891647989, + "loss": 0.8703, + "step": 10125 + }, + { + "epoch": 0.7044418936310828, + "grad_norm": 0.98828125, + "learning_rate": 0.00042420044434100015, + "loss": 0.9079, + "step": 10126 + }, + { + "epoch": 0.7045114612682181, + "grad_norm": 1.0390625, + "learning_rate": 0.00042401622875340016, + "loss": 0.7512, + "step": 10127 + }, + { + "epoch": 0.7045810289053532, + "grad_norm": 1.0546875, + "learning_rate": 0.000423832042411353, + "loss": 0.7617, + "step": 10128 + }, + { + "epoch": 0.7046505965424884, + "grad_norm": 0.796875, + "learning_rate": 0.00042364788532421095, + "loss": 0.6617, + "step": 10129 + }, + { + "epoch": 0.7047201641796237, + "grad_norm": 0.859375, + "learning_rate": 0.00042346375750132415, + "loss": 0.5288, + "step": 10130 + }, + { + "epoch": 0.7047897318167589, + "grad_norm": 1.1953125, + "learning_rate": 0.00042327965895204157, + "loss": 0.9053, + "step": 10131 + }, + { + "epoch": 0.704859299453894, + "grad_norm": 0.9296875, + "learning_rate": 0.0004230955896857109, + "loss": 0.7674, + "step": 10132 + }, + { + "epoch": 0.7049288670910292, + "grad_norm": 0.89453125, + "learning_rate": 0.0004229115497116788, + "loss": 0.5979, + "step": 10133 + }, + { + "epoch": 0.7049984347281645, + "grad_norm": 1.125, + "learning_rate": 0.0004227275390392895, + "loss": 0.9274, + "step": 10134 + }, + { + "epoch": 0.7050680023652997, + "grad_norm": 1.0703125, + "learning_rate": 0.00042254355767788564, + "loss": 0.8571, + "step": 10135 + }, + { + "epoch": 0.7051375700024348, + "grad_norm": 0.91796875, + "learning_rate": 0.0004223596056368094, + "loss": 0.9398, + "step": 10136 + }, + { + "epoch": 0.7052071376395701, + "grad_norm": 1.359375, + "learning_rate": 0.0004221756829254012, + "loss": 0.9064, + "step": 10137 + }, + { + "epoch": 0.7052767052767053, + "grad_norm": 0.94140625, + "learning_rate": 0.00042199178955299946, + "loss": 0.6491, + "step": 10138 + }, + { + "epoch": 0.7053462729138404, + "grad_norm": 1.046875, + "learning_rate": 0.00042180792552894077, + "loss": 0.7337, + "step": 10139 + }, + { + "epoch": 0.7054158405509757, + "grad_norm": 1.1953125, + "learning_rate": 0.0004216240908625617, + "loss": 0.7206, + "step": 10140 + }, + { + "epoch": 0.7054854081881109, + "grad_norm": 0.99609375, + "learning_rate": 0.0004214402855631958, + "loss": 0.6149, + "step": 10141 + }, + { + "epoch": 0.7055549758252461, + "grad_norm": 1.140625, + "learning_rate": 0.00042125650964017593, + "loss": 0.8573, + "step": 10142 + }, + { + "epoch": 0.7056245434623813, + "grad_norm": 0.90625, + "learning_rate": 0.00042107276310283384, + "loss": 0.7419, + "step": 10143 + }, + { + "epoch": 0.7056941110995165, + "grad_norm": 1.046875, + "learning_rate": 0.00042088904596049884, + "loss": 0.8879, + "step": 10144 + }, + { + "epoch": 0.7057636787366517, + "grad_norm": 1.1171875, + "learning_rate": 0.00042070535822249865, + "loss": 0.8415, + "step": 10145 + }, + { + "epoch": 0.7058332463737869, + "grad_norm": 1.046875, + "learning_rate": 0.0004205216998981607, + "loss": 0.7848, + "step": 10146 + }, + { + "epoch": 0.7059028140109221, + "grad_norm": 1.09375, + "learning_rate": 0.0004203380709968103, + "loss": 0.7432, + "step": 10147 + }, + { + "epoch": 0.7059723816480573, + "grad_norm": 1.28125, + "learning_rate": 0.000420154471527771, + "loss": 0.7737, + "step": 10148 + }, + { + "epoch": 0.7060419492851925, + "grad_norm": 0.8046875, + "learning_rate": 0.0004199709015003645, + "loss": 0.7522, + "step": 10149 + }, + { + "epoch": 0.7061115169223278, + "grad_norm": 0.89453125, + "learning_rate": 0.00041978736092391226, + "loss": 0.5649, + "step": 10150 + }, + { + "epoch": 0.7061810845594629, + "grad_norm": 1.046875, + "learning_rate": 0.00041960384980773357, + "loss": 0.7011, + "step": 10151 + }, + { + "epoch": 0.7062506521965981, + "grad_norm": 1.1484375, + "learning_rate": 0.00041942036816114604, + "loss": 0.6468, + "step": 10152 + }, + { + "epoch": 0.7063202198337334, + "grad_norm": 1.21875, + "learning_rate": 0.00041923691599346545, + "loss": 0.7579, + "step": 10153 + }, + { + "epoch": 0.7063897874708686, + "grad_norm": 1.1484375, + "learning_rate": 0.00041905349331400744, + "loss": 0.7975, + "step": 10154 + }, + { + "epoch": 0.7064593551080037, + "grad_norm": 1.3828125, + "learning_rate": 0.0004188701001320845, + "loss": 1.0643, + "step": 10155 + }, + { + "epoch": 0.706528922745139, + "grad_norm": 1.59375, + "learning_rate": 0.0004186867364570087, + "loss": 0.741, + "step": 10156 + }, + { + "epoch": 0.7065984903822742, + "grad_norm": 1.5703125, + "learning_rate": 0.0004185034022980907, + "loss": 1.1413, + "step": 10157 + }, + { + "epoch": 0.7066680580194094, + "grad_norm": 0.890625, + "learning_rate": 0.0004183200976646391, + "loss": 0.5771, + "step": 10158 + }, + { + "epoch": 0.7067376256565445, + "grad_norm": 1.125, + "learning_rate": 0.00041813682256596065, + "loss": 0.9539, + "step": 10159 + }, + { + "epoch": 0.7068071932936798, + "grad_norm": 1.0546875, + "learning_rate": 0.0004179535770113615, + "loss": 0.6908, + "step": 10160 + }, + { + "epoch": 0.706876760930815, + "grad_norm": 0.94140625, + "learning_rate": 0.0004177703610101463, + "loss": 0.6224, + "step": 10161 + }, + { + "epoch": 0.7069463285679501, + "grad_norm": 0.9921875, + "learning_rate": 0.00041758717457161766, + "loss": 0.8689, + "step": 10162 + }, + { + "epoch": 0.7070158962050854, + "grad_norm": 1.0390625, + "learning_rate": 0.0004174040177050762, + "loss": 0.7937, + "step": 10163 + }, + { + "epoch": 0.7070854638422206, + "grad_norm": 0.921875, + "learning_rate": 0.00041722089041982234, + "loss": 0.8885, + "step": 10164 + }, + { + "epoch": 0.7071550314793558, + "grad_norm": 1.1953125, + "learning_rate": 0.0004170377927251545, + "loss": 0.9376, + "step": 10165 + }, + { + "epoch": 0.707224599116491, + "grad_norm": 1.140625, + "learning_rate": 0.00041685472463036936, + "loss": 0.6855, + "step": 10166 + }, + { + "epoch": 0.7072941667536262, + "grad_norm": 1.0625, + "learning_rate": 0.0004166716861447615, + "loss": 0.7672, + "step": 10167 + }, + { + "epoch": 0.7073637343907614, + "grad_norm": 1.046875, + "learning_rate": 0.00041648867727762565, + "loss": 0.8199, + "step": 10168 + }, + { + "epoch": 0.7074333020278967, + "grad_norm": 1.1796875, + "learning_rate": 0.00041630569803825324, + "loss": 0.8799, + "step": 10169 + }, + { + "epoch": 0.7075028696650318, + "grad_norm": 1.2421875, + "learning_rate": 0.00041612274843593557, + "loss": 0.6829, + "step": 10170 + }, + { + "epoch": 0.707572437302167, + "grad_norm": 0.99609375, + "learning_rate": 0.00041593982847996203, + "loss": 0.6841, + "step": 10171 + }, + { + "epoch": 0.7076420049393022, + "grad_norm": 1.2109375, + "learning_rate": 0.00041575693817962013, + "loss": 0.8985, + "step": 10172 + }, + { + "epoch": 0.7077115725764375, + "grad_norm": 1.0546875, + "learning_rate": 0.0004155740775441957, + "loss": 0.8377, + "step": 10173 + }, + { + "epoch": 0.7077811402135726, + "grad_norm": 1.1875, + "learning_rate": 0.000415391246582974, + "loss": 0.8208, + "step": 10174 + }, + { + "epoch": 0.7078507078507078, + "grad_norm": 1.1640625, + "learning_rate": 0.0004152084453052385, + "loss": 0.8361, + "step": 10175 + }, + { + "epoch": 0.7079202754878431, + "grad_norm": 1.3671875, + "learning_rate": 0.00041502567372027056, + "loss": 1.1543, + "step": 10176 + }, + { + "epoch": 0.7079898431249783, + "grad_norm": 1.40625, + "learning_rate": 0.00041484293183735, + "loss": 1.0145, + "step": 10177 + }, + { + "epoch": 0.7080594107621134, + "grad_norm": 1.171875, + "learning_rate": 0.0004146602196657561, + "loss": 0.8072, + "step": 10178 + }, + { + "epoch": 0.7081289783992487, + "grad_norm": 1.203125, + "learning_rate": 0.0004144775372147661, + "loss": 0.8695, + "step": 10179 + }, + { + "epoch": 0.7081985460363839, + "grad_norm": 1.09375, + "learning_rate": 0.0004142948844936556, + "loss": 0.8563, + "step": 10180 + }, + { + "epoch": 0.708268113673519, + "grad_norm": 1.2109375, + "learning_rate": 0.0004141122615116982, + "loss": 0.7231, + "step": 10181 + }, + { + "epoch": 0.7083376813106543, + "grad_norm": 1.140625, + "learning_rate": 0.00041392966827816723, + "loss": 0.677, + "step": 10182 + }, + { + "epoch": 0.7084072489477895, + "grad_norm": 1.203125, + "learning_rate": 0.00041374710480233403, + "loss": 1.0409, + "step": 10183 + }, + { + "epoch": 0.7084768165849247, + "grad_norm": 1.0859375, + "learning_rate": 0.0004135645710934679, + "loss": 1.1684, + "step": 10184 + }, + { + "epoch": 0.7085463842220598, + "grad_norm": 1.0390625, + "learning_rate": 0.0004133820671608366, + "loss": 0.6784, + "step": 10185 + }, + { + "epoch": 0.7086159518591951, + "grad_norm": 1.0859375, + "learning_rate": 0.0004131995930137076, + "loss": 0.8895, + "step": 10186 + }, + { + "epoch": 0.7086855194963303, + "grad_norm": 1.3671875, + "learning_rate": 0.0004130171486613451, + "loss": 1.0721, + "step": 10187 + }, + { + "epoch": 0.7087550871334655, + "grad_norm": 0.90625, + "learning_rate": 0.0004128347341130132, + "loss": 0.7085, + "step": 10188 + }, + { + "epoch": 0.7088246547706007, + "grad_norm": 1.140625, + "learning_rate": 0.00041265234937797437, + "loss": 0.6015, + "step": 10189 + }, + { + "epoch": 0.7088942224077359, + "grad_norm": 1.0546875, + "learning_rate": 0.00041246999446548885, + "loss": 0.859, + "step": 10190 + }, + { + "epoch": 0.7089637900448711, + "grad_norm": 0.94921875, + "learning_rate": 0.0004122876693848151, + "loss": 0.8156, + "step": 10191 + }, + { + "epoch": 0.7090333576820064, + "grad_norm": 1.5078125, + "learning_rate": 0.0004121053741452113, + "loss": 0.5851, + "step": 10192 + }, + { + "epoch": 0.7091029253191415, + "grad_norm": 0.9609375, + "learning_rate": 0.00041192310875593386, + "loss": 0.7188, + "step": 10193 + }, + { + "epoch": 0.7091724929562767, + "grad_norm": 1.09375, + "learning_rate": 0.00041174087322623667, + "loss": 0.5221, + "step": 10194 + }, + { + "epoch": 0.709242060593412, + "grad_norm": 1.234375, + "learning_rate": 0.00041155866756537263, + "loss": 0.8736, + "step": 10195 + }, + { + "epoch": 0.7093116282305472, + "grad_norm": 1.0078125, + "learning_rate": 0.0004113764917825935, + "loss": 0.6578, + "step": 10196 + }, + { + "epoch": 0.7093811958676823, + "grad_norm": 1.0703125, + "learning_rate": 0.0004111943458871495, + "loss": 0.7692, + "step": 10197 + }, + { + "epoch": 0.7094507635048175, + "grad_norm": 1.21875, + "learning_rate": 0.0004110122298882889, + "loss": 1.0225, + "step": 10198 + }, + { + "epoch": 0.7095203311419528, + "grad_norm": 1.21875, + "learning_rate": 0.0004108301437952582, + "loss": 0.9021, + "step": 10199 + }, + { + "epoch": 0.709589898779088, + "grad_norm": 1.09375, + "learning_rate": 0.00041064808761730344, + "loss": 0.7685, + "step": 10200 + }, + { + "epoch": 0.7096594664162231, + "grad_norm": 1.0078125, + "learning_rate": 0.00041046606136366795, + "loss": 0.843, + "step": 10201 + }, + { + "epoch": 0.7097290340533584, + "grad_norm": 1.078125, + "learning_rate": 0.0004102840650435943, + "loss": 0.6716, + "step": 10202 + }, + { + "epoch": 0.7097986016904936, + "grad_norm": 1.25, + "learning_rate": 0.0004101020986663239, + "loss": 0.9474, + "step": 10203 + }, + { + "epoch": 0.7098681693276288, + "grad_norm": 1.2734375, + "learning_rate": 0.00040992016224109554, + "loss": 0.6837, + "step": 10204 + }, + { + "epoch": 0.709937736964764, + "grad_norm": 0.85546875, + "learning_rate": 0.00040973825577714674, + "loss": 0.5875, + "step": 10205 + }, + { + "epoch": 0.7100073046018992, + "grad_norm": 1.0703125, + "learning_rate": 0.00040955637928371424, + "loss": 0.7265, + "step": 10206 + }, + { + "epoch": 0.7100768722390344, + "grad_norm": 1.2109375, + "learning_rate": 0.0004093745327700331, + "loss": 0.9111, + "step": 10207 + }, + { + "epoch": 0.7101464398761697, + "grad_norm": 1.3359375, + "learning_rate": 0.00040919271624533627, + "loss": 0.8341, + "step": 10208 + }, + { + "epoch": 0.7102160075133048, + "grad_norm": 1.0859375, + "learning_rate": 0.00040901092971885503, + "loss": 0.6712, + "step": 10209 + }, + { + "epoch": 0.71028557515044, + "grad_norm": 1.0546875, + "learning_rate": 0.0004088291731998198, + "loss": 0.7024, + "step": 10210 + }, + { + "epoch": 0.7103551427875752, + "grad_norm": 1.1953125, + "learning_rate": 0.00040864744669746, + "loss": 0.6244, + "step": 10211 + }, + { + "epoch": 0.7104247104247104, + "grad_norm": 0.921875, + "learning_rate": 0.0004084657502210022, + "loss": 0.6959, + "step": 10212 + }, + { + "epoch": 0.7104942780618456, + "grad_norm": 1.1875, + "learning_rate": 0.00040828408377967165, + "loss": 0.5662, + "step": 10213 + }, + { + "epoch": 0.7105638456989808, + "grad_norm": 1.25, + "learning_rate": 0.00040810244738269277, + "loss": 0.7056, + "step": 10214 + }, + { + "epoch": 0.7106334133361161, + "grad_norm": 1.3203125, + "learning_rate": 0.0004079208410392887, + "loss": 0.9715, + "step": 10215 + }, + { + "epoch": 0.7107029809732512, + "grad_norm": 0.9921875, + "learning_rate": 0.0004077392647586796, + "loss": 0.7252, + "step": 10216 + }, + { + "epoch": 0.7107725486103864, + "grad_norm": 1.1015625, + "learning_rate": 0.0004075577185500858, + "loss": 0.768, + "step": 10217 + }, + { + "epoch": 0.7108421162475217, + "grad_norm": 0.98828125, + "learning_rate": 0.000407376202422725, + "loss": 0.789, + "step": 10218 + }, + { + "epoch": 0.7109116838846569, + "grad_norm": 1.328125, + "learning_rate": 0.0004071947163858131, + "loss": 0.8223, + "step": 10219 + }, + { + "epoch": 0.710981251521792, + "grad_norm": 0.88671875, + "learning_rate": 0.00040701326044856556, + "loss": 0.6058, + "step": 10220 + }, + { + "epoch": 0.7110508191589273, + "grad_norm": 1.3671875, + "learning_rate": 0.0004068318346201962, + "loss": 0.8007, + "step": 10221 + }, + { + "epoch": 0.7111203867960625, + "grad_norm": 1.0703125, + "learning_rate": 0.0004066504389099165, + "loss": 0.7994, + "step": 10222 + }, + { + "epoch": 0.7111899544331977, + "grad_norm": 1.40625, + "learning_rate": 0.0004064690733269365, + "loss": 0.7742, + "step": 10223 + }, + { + "epoch": 0.7112595220703328, + "grad_norm": 1.0390625, + "learning_rate": 0.00040628773788046525, + "loss": 0.7676, + "step": 10224 + }, + { + "epoch": 0.7113290897074681, + "grad_norm": 1.3046875, + "learning_rate": 0.0004061064325797105, + "loss": 0.7144, + "step": 10225 + }, + { + "epoch": 0.7113986573446033, + "grad_norm": 1.2578125, + "learning_rate": 0.0004059251574338776, + "loss": 0.8438, + "step": 10226 + }, + { + "epoch": 0.7114682249817385, + "grad_norm": 1.515625, + "learning_rate": 0.0004057439124521706, + "loss": 1.0629, + "step": 10227 + }, + { + "epoch": 0.7115377926188737, + "grad_norm": 1.25, + "learning_rate": 0.0004055626976437924, + "loss": 0.816, + "step": 10228 + }, + { + "epoch": 0.7116073602560089, + "grad_norm": 1.03125, + "learning_rate": 0.00040538151301794455, + "loss": 0.7275, + "step": 10229 + }, + { + "epoch": 0.7116769278931441, + "grad_norm": 1.21875, + "learning_rate": 0.0004052003585838261, + "loss": 0.7616, + "step": 10230 + }, + { + "epoch": 0.7117464955302794, + "grad_norm": 1.1640625, + "learning_rate": 0.0004050192343506358, + "loss": 0.9566, + "step": 10231 + }, + { + "epoch": 0.7118160631674145, + "grad_norm": 1.390625, + "learning_rate": 0.0004048381403275697, + "loss": 0.7487, + "step": 10232 + }, + { + "epoch": 0.7118856308045497, + "grad_norm": 0.99609375, + "learning_rate": 0.00040465707652382276, + "loss": 0.855, + "step": 10233 + }, + { + "epoch": 0.711955198441685, + "grad_norm": 0.7734375, + "learning_rate": 0.00040447604294858877, + "loss": 0.6897, + "step": 10234 + }, + { + "epoch": 0.7120247660788201, + "grad_norm": 1.0859375, + "learning_rate": 0.00040429503961106, + "loss": 0.9849, + "step": 10235 + }, + { + "epoch": 0.7120943337159553, + "grad_norm": 1.1484375, + "learning_rate": 0.0004041140665204264, + "loss": 0.6458, + "step": 10236 + }, + { + "epoch": 0.7121639013530905, + "grad_norm": 0.9765625, + "learning_rate": 0.00040393312368587674, + "loss": 0.8279, + "step": 10237 + }, + { + "epoch": 0.7122334689902258, + "grad_norm": 1.140625, + "learning_rate": 0.00040375221111659866, + "loss": 0.8865, + "step": 10238 + }, + { + "epoch": 0.7123030366273609, + "grad_norm": 1.046875, + "learning_rate": 0.0004035713288217784, + "loss": 0.9888, + "step": 10239 + }, + { + "epoch": 0.7123726042644961, + "grad_norm": 1.0859375, + "learning_rate": 0.0004033904768105997, + "loss": 0.8738, + "step": 10240 + }, + { + "epoch": 0.7124421719016314, + "grad_norm": 0.91796875, + "learning_rate": 0.0004032096550922453, + "loss": 0.8892, + "step": 10241 + }, + { + "epoch": 0.7125117395387666, + "grad_norm": 1.234375, + "learning_rate": 0.0004030288636758964, + "loss": 0.7223, + "step": 10242 + }, + { + "epoch": 0.7125813071759017, + "grad_norm": 1.140625, + "learning_rate": 0.00040284810257073324, + "loss": 0.6951, + "step": 10243 + }, + { + "epoch": 0.712650874813037, + "grad_norm": 1.015625, + "learning_rate": 0.00040266737178593326, + "loss": 0.7866, + "step": 10244 + }, + { + "epoch": 0.7127204424501722, + "grad_norm": 0.90234375, + "learning_rate": 0.00040248667133067364, + "loss": 0.693, + "step": 10245 + }, + { + "epoch": 0.7127900100873074, + "grad_norm": 1.171875, + "learning_rate": 0.00040230600121412885, + "loss": 0.9007, + "step": 10246 + }, + { + "epoch": 0.7128595777244426, + "grad_norm": 1.046875, + "learning_rate": 0.0004021253614454731, + "loss": 0.6981, + "step": 10247 + }, + { + "epoch": 0.7129291453615778, + "grad_norm": 1.1015625, + "learning_rate": 0.0004019447520338776, + "loss": 0.7544, + "step": 10248 + }, + { + "epoch": 0.712998712998713, + "grad_norm": 1.1796875, + "learning_rate": 0.0004017641729885134, + "loss": 0.8259, + "step": 10249 + }, + { + "epoch": 0.7130682806358482, + "grad_norm": 1.140625, + "learning_rate": 0.00040158362431854934, + "loss": 0.9236, + "step": 10250 + }, + { + "epoch": 0.7131378482729834, + "grad_norm": 1.140625, + "learning_rate": 0.0004014031060331522, + "loss": 0.7259, + "step": 10251 + }, + { + "epoch": 0.7132074159101186, + "grad_norm": 1.328125, + "learning_rate": 0.0004012226181414882, + "loss": 0.8314, + "step": 10252 + }, + { + "epoch": 0.7132769835472538, + "grad_norm": 1.1796875, + "learning_rate": 0.0004010421606527218, + "loss": 0.7988, + "step": 10253 + }, + { + "epoch": 0.713346551184389, + "grad_norm": 1.3203125, + "learning_rate": 0.00040086173357601566, + "loss": 0.8601, + "step": 10254 + }, + { + "epoch": 0.7134161188215242, + "grad_norm": 1.046875, + "learning_rate": 0.00040068133692053044, + "loss": 0.9581, + "step": 10255 + }, + { + "epoch": 0.7134856864586594, + "grad_norm": 1.3046875, + "learning_rate": 0.00040050097069542614, + "loss": 0.9045, + "step": 10256 + }, + { + "epoch": 0.7135552540957947, + "grad_norm": 1.2109375, + "learning_rate": 0.00040032063490986114, + "loss": 0.9228, + "step": 10257 + }, + { + "epoch": 0.7136248217329298, + "grad_norm": 0.96484375, + "learning_rate": 0.0004001403295729914, + "loss": 0.7246, + "step": 10258 + }, + { + "epoch": 0.713694389370065, + "grad_norm": 0.94140625, + "learning_rate": 0.0003999600546939726, + "loss": 0.9472, + "step": 10259 + }, + { + "epoch": 0.7137639570072003, + "grad_norm": 1.5625, + "learning_rate": 0.0003997798102819573, + "loss": 0.6811, + "step": 10260 + }, + { + "epoch": 0.7138335246443355, + "grad_norm": 1.3125, + "learning_rate": 0.0003995995963460983, + "loss": 0.8605, + "step": 10261 + }, + { + "epoch": 0.7139030922814706, + "grad_norm": 0.91015625, + "learning_rate": 0.00039941941289554526, + "loss": 0.5264, + "step": 10262 + }, + { + "epoch": 0.7139726599186058, + "grad_norm": 1.203125, + "learning_rate": 0.00039923925993944764, + "loss": 0.945, + "step": 10263 + }, + { + "epoch": 0.7140422275557411, + "grad_norm": 1.109375, + "learning_rate": 0.0003990591374869523, + "loss": 0.958, + "step": 10264 + }, + { + "epoch": 0.7141117951928763, + "grad_norm": 1.125, + "learning_rate": 0.00039887904554720467, + "loss": 1.0233, + "step": 10265 + }, + { + "epoch": 0.7141813628300114, + "grad_norm": 1.390625, + "learning_rate": 0.00039869898412934926, + "loss": 0.8459, + "step": 10266 + }, + { + "epoch": 0.7142509304671467, + "grad_norm": 1.1484375, + "learning_rate": 0.00039851895324252896, + "loss": 0.757, + "step": 10267 + }, + { + "epoch": 0.7143204981042819, + "grad_norm": 1.2734375, + "learning_rate": 0.0003983389528958845, + "loss": 0.9271, + "step": 10268 + }, + { + "epoch": 0.7143900657414171, + "grad_norm": 1.5078125, + "learning_rate": 0.000398158983098555, + "loss": 0.8022, + "step": 10269 + }, + { + "epoch": 0.7144596333785523, + "grad_norm": 1.125, + "learning_rate": 0.000397979043859679, + "loss": 0.7929, + "step": 10270 + }, + { + "epoch": 0.7145292010156875, + "grad_norm": 1.015625, + "learning_rate": 0.00039779913518839304, + "loss": 0.6102, + "step": 10271 + }, + { + "epoch": 0.7145987686528227, + "grad_norm": 0.984375, + "learning_rate": 0.0003976192570938316, + "loss": 0.7635, + "step": 10272 + }, + { + "epoch": 0.714668336289958, + "grad_norm": 1.0390625, + "learning_rate": 0.00039743940958512783, + "loss": 0.8293, + "step": 10273 + }, + { + "epoch": 0.7147379039270931, + "grad_norm": 0.90625, + "learning_rate": 0.00039725959267141364, + "loss": 0.8435, + "step": 10274 + }, + { + "epoch": 0.7148074715642283, + "grad_norm": 1.3671875, + "learning_rate": 0.0003970798063618196, + "loss": 0.832, + "step": 10275 + }, + { + "epoch": 0.7148770392013635, + "grad_norm": 1.4140625, + "learning_rate": 0.00039690005066547377, + "loss": 0.8953, + "step": 10276 + }, + { + "epoch": 0.7149466068384988, + "grad_norm": 1.3046875, + "learning_rate": 0.00039672032559150383, + "loss": 0.8819, + "step": 10277 + }, + { + "epoch": 0.7150161744756339, + "grad_norm": 1.0859375, + "learning_rate": 0.0003965406311490347, + "loss": 0.7694, + "step": 10278 + }, + { + "epoch": 0.7150857421127691, + "grad_norm": 1.1328125, + "learning_rate": 0.00039636096734719096, + "loss": 0.7337, + "step": 10279 + }, + { + "epoch": 0.7151553097499044, + "grad_norm": 1.328125, + "learning_rate": 0.0003961813341950945, + "loss": 0.9547, + "step": 10280 + }, + { + "epoch": 0.7152248773870395, + "grad_norm": 1.1875, + "learning_rate": 0.0003960017317018666, + "loss": 0.815, + "step": 10281 + }, + { + "epoch": 0.7152944450241747, + "grad_norm": 1.3984375, + "learning_rate": 0.0003958221598766265, + "loss": 0.9948, + "step": 10282 + }, + { + "epoch": 0.71536401266131, + "grad_norm": 0.99609375, + "learning_rate": 0.0003956426187284915, + "loss": 0.6415, + "step": 10283 + }, + { + "epoch": 0.7154335802984452, + "grad_norm": 1.4765625, + "learning_rate": 0.000395463108266578, + "loss": 0.8901, + "step": 10284 + }, + { + "epoch": 0.7155031479355803, + "grad_norm": 1.2578125, + "learning_rate": 0.0003952836285000012, + "loss": 0.8202, + "step": 10285 + }, + { + "epoch": 0.7155727155727156, + "grad_norm": 1.3125, + "learning_rate": 0.0003951041794378738, + "loss": 0.6531, + "step": 10286 + }, + { + "epoch": 0.7156422832098508, + "grad_norm": 1.0625, + "learning_rate": 0.00039492476108930687, + "loss": 0.8524, + "step": 10287 + }, + { + "epoch": 0.715711850846986, + "grad_norm": 1.125, + "learning_rate": 0.00039474537346341075, + "loss": 0.7424, + "step": 10288 + }, + { + "epoch": 0.7157814184841211, + "grad_norm": 1.078125, + "learning_rate": 0.0003945660165692942, + "loss": 0.8893, + "step": 10289 + }, + { + "epoch": 0.7158509861212564, + "grad_norm": 1.203125, + "learning_rate": 0.00039438669041606345, + "loss": 0.738, + "step": 10290 + }, + { + "epoch": 0.7159205537583916, + "grad_norm": 1.0234375, + "learning_rate": 0.0003942073950128243, + "loss": 0.7486, + "step": 10291 + }, + { + "epoch": 0.7159901213955268, + "grad_norm": 1.2890625, + "learning_rate": 0.0003940281303686799, + "loss": 0.8488, + "step": 10292 + }, + { + "epoch": 0.716059689032662, + "grad_norm": 1.0, + "learning_rate": 0.00039384889649273305, + "loss": 0.9268, + "step": 10293 + }, + { + "epoch": 0.7161292566697972, + "grad_norm": 1.0234375, + "learning_rate": 0.00039366969339408366, + "loss": 0.6976, + "step": 10294 + }, + { + "epoch": 0.7161988243069324, + "grad_norm": 0.98046875, + "learning_rate": 0.00039349052108183147, + "loss": 0.8058, + "step": 10295 + }, + { + "epoch": 0.7162683919440677, + "grad_norm": 1.0078125, + "learning_rate": 0.0003933113795650737, + "loss": 0.7992, + "step": 10296 + }, + { + "epoch": 0.7163379595812028, + "grad_norm": 1.1484375, + "learning_rate": 0.0003931322688529052, + "loss": 0.7471, + "step": 10297 + }, + { + "epoch": 0.716407527218338, + "grad_norm": 1.2421875, + "learning_rate": 0.0003929531889544221, + "loss": 0.8424, + "step": 10298 + }, + { + "epoch": 0.7164770948554733, + "grad_norm": 1.03125, + "learning_rate": 0.00039277413987871633, + "loss": 0.805, + "step": 10299 + }, + { + "epoch": 0.7165466624926085, + "grad_norm": 1.0859375, + "learning_rate": 0.00039259512163487896, + "loss": 0.6088, + "step": 10300 + }, + { + "epoch": 0.7166162301297436, + "grad_norm": 1.15625, + "learning_rate": 0.0003924161342319996, + "loss": 0.8697, + "step": 10301 + }, + { + "epoch": 0.7166857977668788, + "grad_norm": 1.5390625, + "learning_rate": 0.00039223717767916633, + "loss": 0.8839, + "step": 10302 + }, + { + "epoch": 0.7167553654040141, + "grad_norm": 1.3359375, + "learning_rate": 0.00039205825198546627, + "loss": 0.8275, + "step": 10303 + }, + { + "epoch": 0.7168249330411492, + "grad_norm": 0.9140625, + "learning_rate": 0.0003918793571599836, + "loss": 0.5849, + "step": 10304 + }, + { + "epoch": 0.7168945006782844, + "grad_norm": 1.2109375, + "learning_rate": 0.0003917004932118023, + "loss": 1.1259, + "step": 10305 + }, + { + "epoch": 0.7169640683154197, + "grad_norm": 1.0, + "learning_rate": 0.00039152166015000354, + "loss": 0.7061, + "step": 10306 + }, + { + "epoch": 0.7170336359525549, + "grad_norm": 1.3984375, + "learning_rate": 0.0003913428579836683, + "loss": 0.9957, + "step": 10307 + }, + { + "epoch": 0.71710320358969, + "grad_norm": 1.09375, + "learning_rate": 0.0003911640867218745, + "loss": 0.837, + "step": 10308 + }, + { + "epoch": 0.7171727712268253, + "grad_norm": 1.2890625, + "learning_rate": 0.00039098534637369996, + "loss": 0.8902, + "step": 10309 + }, + { + "epoch": 0.7172423388639605, + "grad_norm": 1.0546875, + "learning_rate": 0.0003908066369482196, + "loss": 0.7183, + "step": 10310 + }, + { + "epoch": 0.7173119065010957, + "grad_norm": 1.0234375, + "learning_rate": 0.0003906279584545076, + "loss": 0.6909, + "step": 10311 + }, + { + "epoch": 0.717381474138231, + "grad_norm": 0.90625, + "learning_rate": 0.0003904493109016367, + "loss": 0.6833, + "step": 10312 + }, + { + "epoch": 0.7174510417753661, + "grad_norm": 1.15625, + "learning_rate": 0.00039027069429867754, + "loss": 0.8421, + "step": 10313 + }, + { + "epoch": 0.7175206094125013, + "grad_norm": 1.171875, + "learning_rate": 0.00039009210865469926, + "loss": 0.759, + "step": 10314 + }, + { + "epoch": 0.7175901770496365, + "grad_norm": 1.09375, + "learning_rate": 0.00038991355397876903, + "loss": 0.9539, + "step": 10315 + }, + { + "epoch": 0.7176597446867717, + "grad_norm": 1.015625, + "learning_rate": 0.0003897350302799536, + "loss": 0.8893, + "step": 10316 + }, + { + "epoch": 0.7177293123239069, + "grad_norm": 1.1953125, + "learning_rate": 0.0003895565375673177, + "loss": 1.0996, + "step": 10317 + }, + { + "epoch": 0.7177988799610421, + "grad_norm": 1.1953125, + "learning_rate": 0.0003893780758499236, + "loss": 0.701, + "step": 10318 + }, + { + "epoch": 0.7178684475981774, + "grad_norm": 0.9921875, + "learning_rate": 0.00038919964513683334, + "loss": 0.6515, + "step": 10319 + }, + { + "epoch": 0.7179380152353125, + "grad_norm": 1.015625, + "learning_rate": 0.00038902124543710616, + "loss": 0.7534, + "step": 10320 + }, + { + "epoch": 0.7180075828724477, + "grad_norm": 1.2109375, + "learning_rate": 0.0003888428767598009, + "loss": 0.8148, + "step": 10321 + }, + { + "epoch": 0.718077150509583, + "grad_norm": 1.1640625, + "learning_rate": 0.0003886645391139736, + "loss": 0.938, + "step": 10322 + }, + { + "epoch": 0.7181467181467182, + "grad_norm": 1.265625, + "learning_rate": 0.00038848623250867985, + "loss": 1.1074, + "step": 10323 + }, + { + "epoch": 0.7182162857838533, + "grad_norm": 1.1875, + "learning_rate": 0.0003883079569529727, + "loss": 0.8166, + "step": 10324 + }, + { + "epoch": 0.7182858534209886, + "grad_norm": 1.046875, + "learning_rate": 0.00038812971245590446, + "loss": 0.7357, + "step": 10325 + }, + { + "epoch": 0.7183554210581238, + "grad_norm": 1.109375, + "learning_rate": 0.0003879514990265255, + "loss": 0.8994, + "step": 10326 + }, + { + "epoch": 0.718424988695259, + "grad_norm": 0.95703125, + "learning_rate": 0.0003877733166738846, + "loss": 0.8689, + "step": 10327 + }, + { + "epoch": 0.7184945563323941, + "grad_norm": 1.1875, + "learning_rate": 0.00038759516540702875, + "loss": 0.8483, + "step": 10328 + }, + { + "epoch": 0.7185641239695294, + "grad_norm": 1.203125, + "learning_rate": 0.0003874170452350031, + "loss": 0.8876, + "step": 10329 + }, + { + "epoch": 0.7186336916066646, + "grad_norm": 1.3203125, + "learning_rate": 0.00038723895616685276, + "loss": 0.7773, + "step": 10330 + }, + { + "epoch": 0.7187032592437997, + "grad_norm": 1.328125, + "learning_rate": 0.0003870608982116198, + "loss": 0.7464, + "step": 10331 + }, + { + "epoch": 0.718772826880935, + "grad_norm": 1.3125, + "learning_rate": 0.00038688287137834455, + "loss": 0.8565, + "step": 10332 + }, + { + "epoch": 0.7188423945180702, + "grad_norm": 1.2109375, + "learning_rate": 0.00038670487567606717, + "loss": 0.7739, + "step": 10333 + }, + { + "epoch": 0.7189119621552054, + "grad_norm": 1.4140625, + "learning_rate": 0.0003865269111138247, + "loss": 1.0526, + "step": 10334 + }, + { + "epoch": 0.7189815297923406, + "grad_norm": 1.0859375, + "learning_rate": 0.0003863489777006537, + "loss": 0.8619, + "step": 10335 + }, + { + "epoch": 0.7190510974294758, + "grad_norm": 1.1171875, + "learning_rate": 0.0003861710754455884, + "loss": 0.6509, + "step": 10336 + }, + { + "epoch": 0.719120665066611, + "grad_norm": 1.21875, + "learning_rate": 0.00038599320435766214, + "loss": 0.7926, + "step": 10337 + }, + { + "epoch": 0.7191902327037463, + "grad_norm": 1.25, + "learning_rate": 0.0003858153644459059, + "loss": 0.804, + "step": 10338 + }, + { + "epoch": 0.7192598003408814, + "grad_norm": 0.9296875, + "learning_rate": 0.00038563755571934975, + "loss": 0.8925, + "step": 10339 + }, + { + "epoch": 0.7193293679780166, + "grad_norm": 1.09375, + "learning_rate": 0.00038545977818702225, + "loss": 0.5914, + "step": 10340 + }, + { + "epoch": 0.7193989356151518, + "grad_norm": 1.09375, + "learning_rate": 0.00038528203185794963, + "loss": 0.8468, + "step": 10341 + }, + { + "epoch": 0.7194685032522871, + "grad_norm": 1.0078125, + "learning_rate": 0.0003851043167411571, + "loss": 0.8216, + "step": 10342 + }, + { + "epoch": 0.7195380708894222, + "grad_norm": 1.078125, + "learning_rate": 0.0003849266328456673, + "loss": 0.8859, + "step": 10343 + }, + { + "epoch": 0.7196076385265574, + "grad_norm": 0.921875, + "learning_rate": 0.0003847489801805036, + "loss": 0.7897, + "step": 10344 + }, + { + "epoch": 0.7196772061636927, + "grad_norm": 1.203125, + "learning_rate": 0.00038457135875468574, + "loss": 0.7999, + "step": 10345 + }, + { + "epoch": 0.7197467738008279, + "grad_norm": 0.8828125, + "learning_rate": 0.0003843937685772321, + "loss": 0.8396, + "step": 10346 + }, + { + "epoch": 0.719816341437963, + "grad_norm": 1.03125, + "learning_rate": 0.00038421620965715974, + "loss": 0.6124, + "step": 10347 + }, + { + "epoch": 0.7198859090750983, + "grad_norm": 0.8515625, + "learning_rate": 0.00038403868200348446, + "loss": 0.6664, + "step": 10348 + }, + { + "epoch": 0.7199554767122335, + "grad_norm": 1.2265625, + "learning_rate": 0.00038386118562522053, + "loss": 0.9508, + "step": 10349 + }, + { + "epoch": 0.7200250443493686, + "grad_norm": 1.3515625, + "learning_rate": 0.0003836837205313798, + "loss": 0.9475, + "step": 10350 + }, + { + "epoch": 0.7200946119865039, + "grad_norm": 1.03125, + "learning_rate": 0.00038350628673097353, + "loss": 0.6598, + "step": 10351 + }, + { + "epoch": 0.7201641796236391, + "grad_norm": 0.81640625, + "learning_rate": 0.00038332888423301027, + "loss": 0.7691, + "step": 10352 + }, + { + "epoch": 0.7202337472607743, + "grad_norm": 1.0078125, + "learning_rate": 0.00038315151304649844, + "loss": 0.6976, + "step": 10353 + }, + { + "epoch": 0.7203033148979094, + "grad_norm": 1.0, + "learning_rate": 0.00038297417318044325, + "loss": 0.8059, + "step": 10354 + }, + { + "epoch": 0.7203728825350447, + "grad_norm": 1.0625, + "learning_rate": 0.00038279686464384987, + "loss": 0.9312, + "step": 10355 + }, + { + "epoch": 0.7204424501721799, + "grad_norm": 1.09375, + "learning_rate": 0.00038261958744572044, + "loss": 0.7603, + "step": 10356 + }, + { + "epoch": 0.7205120178093151, + "grad_norm": 0.9921875, + "learning_rate": 0.00038244234159505653, + "loss": 0.803, + "step": 10357 + }, + { + "epoch": 0.7205815854464503, + "grad_norm": 1.0234375, + "learning_rate": 0.00038226512710085817, + "loss": 0.745, + "step": 10358 + }, + { + "epoch": 0.7206511530835855, + "grad_norm": 1.0234375, + "learning_rate": 0.000382087943972123, + "loss": 0.7745, + "step": 10359 + }, + { + "epoch": 0.7207207207207207, + "grad_norm": 0.9765625, + "learning_rate": 0.00038191079221784754, + "loss": 0.8289, + "step": 10360 + }, + { + "epoch": 0.720790288357856, + "grad_norm": 1.1796875, + "learning_rate": 0.0003817336718470259, + "loss": 1.0168, + "step": 10361 + }, + { + "epoch": 0.7208598559949911, + "grad_norm": 1.046875, + "learning_rate": 0.0003815565828686528, + "loss": 0.7349, + "step": 10362 + }, + { + "epoch": 0.7209294236321263, + "grad_norm": 1.1796875, + "learning_rate": 0.00038137952529171924, + "loss": 0.8569, + "step": 10363 + }, + { + "epoch": 0.7209989912692616, + "grad_norm": 1.28125, + "learning_rate": 0.0003812024991252151, + "loss": 0.9164, + "step": 10364 + }, + { + "epoch": 0.7210685589063968, + "grad_norm": 1.2421875, + "learning_rate": 0.00038102550437812933, + "loss": 0.8319, + "step": 10365 + }, + { + "epoch": 0.7211381265435319, + "grad_norm": 1.171875, + "learning_rate": 0.0003808485410594482, + "loss": 0.676, + "step": 10366 + }, + { + "epoch": 0.7212076941806671, + "grad_norm": 0.91796875, + "learning_rate": 0.0003806716091781578, + "loss": 0.7555, + "step": 10367 + }, + { + "epoch": 0.7212772618178024, + "grad_norm": 1.0078125, + "learning_rate": 0.0003804947087432411, + "loss": 0.5666, + "step": 10368 + }, + { + "epoch": 0.7213468294549376, + "grad_norm": 1.1953125, + "learning_rate": 0.0003803178397636808, + "loss": 0.9253, + "step": 10369 + }, + { + "epoch": 0.7214163970920727, + "grad_norm": 1.3515625, + "learning_rate": 0.0003801410022484569, + "loss": 0.8512, + "step": 10370 + }, + { + "epoch": 0.721485964729208, + "grad_norm": 1.15625, + "learning_rate": 0.00037996419620654867, + "loss": 0.7916, + "step": 10371 + }, + { + "epoch": 0.7215555323663432, + "grad_norm": 1.0703125, + "learning_rate": 0.0003797874216469336, + "loss": 0.8352, + "step": 10372 + }, + { + "epoch": 0.7216251000034783, + "grad_norm": 1.1796875, + "learning_rate": 0.0003796106785785871, + "loss": 0.7665, + "step": 10373 + }, + { + "epoch": 0.7216946676406136, + "grad_norm": 1.53125, + "learning_rate": 0.0003794339670104835, + "loss": 0.8361, + "step": 10374 + }, + { + "epoch": 0.7217642352777488, + "grad_norm": 0.859375, + "learning_rate": 0.00037925728695159435, + "loss": 0.6771, + "step": 10375 + }, + { + "epoch": 0.721833802914884, + "grad_norm": 0.9609375, + "learning_rate": 0.00037908063841089214, + "loss": 0.6722, + "step": 10376 + }, + { + "epoch": 0.7219033705520193, + "grad_norm": 1.4453125, + "learning_rate": 0.0003789040213973454, + "loss": 0.87, + "step": 10377 + }, + { + "epoch": 0.7219729381891544, + "grad_norm": 1.1328125, + "learning_rate": 0.00037872743591992156, + "loss": 0.804, + "step": 10378 + }, + { + "epoch": 0.7220425058262896, + "grad_norm": 1.09375, + "learning_rate": 0.00037855088198758747, + "loss": 1.0457, + "step": 10379 + }, + { + "epoch": 0.7221120734634248, + "grad_norm": 1.203125, + "learning_rate": 0.00037837435960930686, + "loss": 0.8591, + "step": 10380 + }, + { + "epoch": 0.72218164110056, + "grad_norm": 1.046875, + "learning_rate": 0.00037819786879404336, + "loss": 0.8595, + "step": 10381 + }, + { + "epoch": 0.7222512087376952, + "grad_norm": 1.0234375, + "learning_rate": 0.0003780214095507577, + "loss": 0.8191, + "step": 10382 + }, + { + "epoch": 0.7223207763748304, + "grad_norm": 1.171875, + "learning_rate": 0.0003778449818884102, + "loss": 0.8937, + "step": 10383 + }, + { + "epoch": 0.7223903440119657, + "grad_norm": 1.4765625, + "learning_rate": 0.0003776685858159583, + "loss": 0.6803, + "step": 10384 + }, + { + "epoch": 0.7224599116491008, + "grad_norm": 1.0390625, + "learning_rate": 0.0003774922213423588, + "loss": 0.9583, + "step": 10385 + }, + { + "epoch": 0.722529479286236, + "grad_norm": 1.1953125, + "learning_rate": 0.0003773158884765669, + "loss": 0.9292, + "step": 10386 + }, + { + "epoch": 0.7225990469233713, + "grad_norm": 1.046875, + "learning_rate": 0.0003771395872275357, + "loss": 0.5646, + "step": 10387 + }, + { + "epoch": 0.7226686145605065, + "grad_norm": 1.1171875, + "learning_rate": 0.00037696331760421654, + "loss": 0.8099, + "step": 10388 + }, + { + "epoch": 0.7227381821976416, + "grad_norm": 1.0859375, + "learning_rate": 0.0003767870796155597, + "loss": 0.7322, + "step": 10389 + }, + { + "epoch": 0.7228077498347769, + "grad_norm": 1.3515625, + "learning_rate": 0.000376610873270514, + "loss": 0.8178, + "step": 10390 + }, + { + "epoch": 0.7228773174719121, + "grad_norm": 0.94921875, + "learning_rate": 0.00037643469857802614, + "loss": 0.7163, + "step": 10391 + }, + { + "epoch": 0.7229468851090473, + "grad_norm": 0.99609375, + "learning_rate": 0.0003762585555470409, + "loss": 0.7489, + "step": 10392 + }, + { + "epoch": 0.7230164527461824, + "grad_norm": 1.0234375, + "learning_rate": 0.0003760824441865026, + "loss": 0.4969, + "step": 10393 + }, + { + "epoch": 0.7230860203833177, + "grad_norm": 1.0625, + "learning_rate": 0.0003759063645053528, + "loss": 0.983, + "step": 10394 + }, + { + "epoch": 0.7231555880204529, + "grad_norm": 1.0859375, + "learning_rate": 0.00037573031651253245, + "loss": 0.98, + "step": 10395 + }, + { + "epoch": 0.723225155657588, + "grad_norm": 1.53125, + "learning_rate": 0.0003755543002169797, + "loss": 1.1609, + "step": 10396 + }, + { + "epoch": 0.7232947232947233, + "grad_norm": 1.1328125, + "learning_rate": 0.0003753783156276325, + "loss": 0.8617, + "step": 10397 + }, + { + "epoch": 0.7233642909318585, + "grad_norm": 1.046875, + "learning_rate": 0.00037520236275342565, + "loss": 0.8373, + "step": 10398 + }, + { + "epoch": 0.7234338585689937, + "grad_norm": 0.953125, + "learning_rate": 0.0003750264416032938, + "loss": 0.8973, + "step": 10399 + }, + { + "epoch": 0.723503426206129, + "grad_norm": 1.234375, + "learning_rate": 0.0003748505521861694, + "loss": 0.6331, + "step": 10400 + }, + { + "epoch": 0.7235729938432641, + "grad_norm": 0.98828125, + "learning_rate": 0.00037467469451098293, + "loss": 0.8331, + "step": 10401 + }, + { + "epoch": 0.7236425614803993, + "grad_norm": 1.4453125, + "learning_rate": 0.0003744988685866633, + "loss": 0.9041, + "step": 10402 + }, + { + "epoch": 0.7237121291175346, + "grad_norm": 1.265625, + "learning_rate": 0.0003743230744221383, + "loss": 1.0202, + "step": 10403 + }, + { + "epoch": 0.7237816967546697, + "grad_norm": 1.09375, + "learning_rate": 0.0003741473120263345, + "loss": 0.8418, + "step": 10404 + }, + { + "epoch": 0.7238512643918049, + "grad_norm": 1.0546875, + "learning_rate": 0.0003739715814081754, + "loss": 0.7421, + "step": 10405 + }, + { + "epoch": 0.7239208320289401, + "grad_norm": 1.2578125, + "learning_rate": 0.0003737958825765837, + "loss": 0.9736, + "step": 10406 + }, + { + "epoch": 0.7239903996660754, + "grad_norm": 1.0078125, + "learning_rate": 0.0003736202155404809, + "loss": 0.6861, + "step": 10407 + }, + { + "epoch": 0.7240599673032105, + "grad_norm": 1.0859375, + "learning_rate": 0.0003734445803087866, + "loss": 1.0322, + "step": 10408 + }, + { + "epoch": 0.7241295349403457, + "grad_norm": 1.0625, + "learning_rate": 0.0003732689768904185, + "loss": 0.7579, + "step": 10409 + }, + { + "epoch": 0.724199102577481, + "grad_norm": 1.1796875, + "learning_rate": 0.0003730934052942924, + "loss": 0.7262, + "step": 10410 + }, + { + "epoch": 0.7242686702146162, + "grad_norm": 1.015625, + "learning_rate": 0.00037291786552932373, + "loss": 0.6312, + "step": 10411 + }, + { + "epoch": 0.7243382378517513, + "grad_norm": 0.96484375, + "learning_rate": 0.00037274235760442466, + "loss": 0.7394, + "step": 10412 + }, + { + "epoch": 0.7244078054888866, + "grad_norm": 1.1171875, + "learning_rate": 0.00037256688152850716, + "loss": 1.1719, + "step": 10413 + }, + { + "epoch": 0.7244773731260218, + "grad_norm": 0.8671875, + "learning_rate": 0.0003723914373104813, + "loss": 0.5643, + "step": 10414 + }, + { + "epoch": 0.724546940763157, + "grad_norm": 0.9453125, + "learning_rate": 0.0003722160249592548, + "loss": 0.6835, + "step": 10415 + }, + { + "epoch": 0.7246165084002922, + "grad_norm": 1.0078125, + "learning_rate": 0.000372040644483734, + "loss": 0.7814, + "step": 10416 + }, + { + "epoch": 0.7246860760374274, + "grad_norm": 1.171875, + "learning_rate": 0.00037186529589282405, + "loss": 0.7209, + "step": 10417 + }, + { + "epoch": 0.7247556436745626, + "grad_norm": 1.1171875, + "learning_rate": 0.0003716899791954287, + "loss": 0.8136, + "step": 10418 + }, + { + "epoch": 0.7248252113116977, + "grad_norm": 1.125, + "learning_rate": 0.0003715146944004494, + "loss": 0.941, + "step": 10419 + }, + { + "epoch": 0.724894778948833, + "grad_norm": 0.9921875, + "learning_rate": 0.0003713394415167856, + "loss": 0.6571, + "step": 10420 + }, + { + "epoch": 0.7249643465859682, + "grad_norm": 1.3515625, + "learning_rate": 0.00037116422055333634, + "loss": 1.0111, + "step": 10421 + }, + { + "epoch": 0.7250339142231034, + "grad_norm": 1.1015625, + "learning_rate": 0.0003709890315189988, + "loss": 0.8752, + "step": 10422 + }, + { + "epoch": 0.7251034818602387, + "grad_norm": 1.4453125, + "learning_rate": 0.0003708138744226678, + "loss": 1.1282, + "step": 10423 + }, + { + "epoch": 0.7251730494973738, + "grad_norm": 1.265625, + "learning_rate": 0.0003706387492732365, + "loss": 0.925, + "step": 10424 + }, + { + "epoch": 0.725242617134509, + "grad_norm": 1.0703125, + "learning_rate": 0.0003704636560795976, + "loss": 0.9034, + "step": 10425 + }, + { + "epoch": 0.7253121847716443, + "grad_norm": 1.0703125, + "learning_rate": 0.00037028859485064094, + "loss": 0.7064, + "step": 10426 + }, + { + "epoch": 0.7253817524087794, + "grad_norm": 1.03125, + "learning_rate": 0.0003701135655952557, + "loss": 0.7784, + "step": 10427 + }, + { + "epoch": 0.7254513200459146, + "grad_norm": 1.4375, + "learning_rate": 0.00036993856832232843, + "loss": 0.9509, + "step": 10428 + }, + { + "epoch": 0.7255208876830499, + "grad_norm": 0.95703125, + "learning_rate": 0.00036976360304074516, + "loss": 0.8873, + "step": 10429 + }, + { + "epoch": 0.7255904553201851, + "grad_norm": 1.484375, + "learning_rate": 0.0003695886697593893, + "loss": 0.8801, + "step": 10430 + }, + { + "epoch": 0.7256600229573202, + "grad_norm": 0.85546875, + "learning_rate": 0.00036941376848714325, + "loss": 0.5566, + "step": 10431 + }, + { + "epoch": 0.7257295905944554, + "grad_norm": 1.21875, + "learning_rate": 0.0003692388992328879, + "loss": 0.6438, + "step": 10432 + }, + { + "epoch": 0.7257991582315907, + "grad_norm": 0.99609375, + "learning_rate": 0.00036906406200550213, + "loss": 0.9409, + "step": 10433 + }, + { + "epoch": 0.7258687258687259, + "grad_norm": 1.4453125, + "learning_rate": 0.00036888925681386267, + "loss": 0.8923, + "step": 10434 + }, + { + "epoch": 0.725938293505861, + "grad_norm": 1.0546875, + "learning_rate": 0.0003687144836668457, + "loss": 0.9672, + "step": 10435 + }, + { + "epoch": 0.7260078611429963, + "grad_norm": 1.21875, + "learning_rate": 0.0003685397425733258, + "loss": 0.9184, + "step": 10436 + }, + { + "epoch": 0.7260774287801315, + "grad_norm": 1.21875, + "learning_rate": 0.0003683650335421749, + "loss": 0.8752, + "step": 10437 + }, + { + "epoch": 0.7261469964172667, + "grad_norm": 1.1953125, + "learning_rate": 0.0003681903565822635, + "loss": 0.9065, + "step": 10438 + }, + { + "epoch": 0.7262165640544019, + "grad_norm": 1.0, + "learning_rate": 0.0003680157117024614, + "loss": 0.8314, + "step": 10439 + }, + { + "epoch": 0.7262861316915371, + "grad_norm": 1.0390625, + "learning_rate": 0.0003678410989116362, + "loss": 0.7061, + "step": 10440 + }, + { + "epoch": 0.7263556993286723, + "grad_norm": 0.95703125, + "learning_rate": 0.0003676665182186538, + "loss": 0.702, + "step": 10441 + }, + { + "epoch": 0.7264252669658076, + "grad_norm": 0.8359375, + "learning_rate": 0.0003674919696323781, + "loss": 0.7032, + "step": 10442 + }, + { + "epoch": 0.7264948346029427, + "grad_norm": 1.3359375, + "learning_rate": 0.0003673174531616723, + "loss": 1.001, + "step": 10443 + }, + { + "epoch": 0.7265644022400779, + "grad_norm": 1.21875, + "learning_rate": 0.000367142968815397, + "loss": 0.6741, + "step": 10444 + }, + { + "epoch": 0.7266339698772131, + "grad_norm": 1.28125, + "learning_rate": 0.0003669685166024119, + "loss": 0.7309, + "step": 10445 + }, + { + "epoch": 0.7267035375143484, + "grad_norm": 1.359375, + "learning_rate": 0.00036679409653157525, + "loss": 0.7796, + "step": 10446 + }, + { + "epoch": 0.7267731051514835, + "grad_norm": 1.0390625, + "learning_rate": 0.00036661970861174263, + "loss": 0.832, + "step": 10447 + }, + { + "epoch": 0.7268426727886187, + "grad_norm": 1.328125, + "learning_rate": 0.0003664453528517685, + "loss": 0.9177, + "step": 10448 + }, + { + "epoch": 0.726912240425754, + "grad_norm": 1.2734375, + "learning_rate": 0.00036627102926050596, + "loss": 1.0279, + "step": 10449 + }, + { + "epoch": 0.7269818080628891, + "grad_norm": 1.1640625, + "learning_rate": 0.00036609673784680666, + "loss": 0.7577, + "step": 10450 + }, + { + "epoch": 0.7270513757000243, + "grad_norm": 1.015625, + "learning_rate": 0.0003659224786195199, + "loss": 0.6044, + "step": 10451 + }, + { + "epoch": 0.7271209433371596, + "grad_norm": 0.9453125, + "learning_rate": 0.00036574825158749335, + "loss": 0.5833, + "step": 10452 + }, + { + "epoch": 0.7271905109742948, + "grad_norm": 1.0546875, + "learning_rate": 0.0003655740567595738, + "loss": 0.8016, + "step": 10453 + }, + { + "epoch": 0.7272600786114299, + "grad_norm": 1.109375, + "learning_rate": 0.00036539989414460615, + "loss": 0.7016, + "step": 10454 + }, + { + "epoch": 0.7273296462485652, + "grad_norm": 1.0234375, + "learning_rate": 0.00036522576375143325, + "loss": 0.7156, + "step": 10455 + }, + { + "epoch": 0.7273992138857004, + "grad_norm": 1.2890625, + "learning_rate": 0.00036505166558889625, + "loss": 0.8444, + "step": 10456 + }, + { + "epoch": 0.7274687815228356, + "grad_norm": 1.1796875, + "learning_rate": 0.00036487759966583565, + "loss": 0.9313, + "step": 10457 + }, + { + "epoch": 0.7275383491599707, + "grad_norm": 1.0859375, + "learning_rate": 0.00036470356599108887, + "loss": 0.8356, + "step": 10458 + }, + { + "epoch": 0.727607916797106, + "grad_norm": 1.71875, + "learning_rate": 0.0003645295645734931, + "loss": 1.0616, + "step": 10459 + }, + { + "epoch": 0.7276774844342412, + "grad_norm": 1.2109375, + "learning_rate": 0.00036435559542188315, + "loss": 0.7141, + "step": 10460 + }, + { + "epoch": 0.7277470520713764, + "grad_norm": 1.0234375, + "learning_rate": 0.0003641816585450922, + "loss": 0.7129, + "step": 10461 + }, + { + "epoch": 0.7278166197085116, + "grad_norm": 1.125, + "learning_rate": 0.0003640077539519516, + "loss": 0.9076, + "step": 10462 + }, + { + "epoch": 0.7278861873456468, + "grad_norm": 1.09375, + "learning_rate": 0.0003638338816512916, + "loss": 0.7787, + "step": 10463 + }, + { + "epoch": 0.727955754982782, + "grad_norm": 1.1953125, + "learning_rate": 0.0003636600416519409, + "loss": 0.9157, + "step": 10464 + }, + { + "epoch": 0.7280253226199173, + "grad_norm": 1.078125, + "learning_rate": 0.0003634862339627258, + "loss": 0.7722, + "step": 10465 + }, + { + "epoch": 0.7280948902570524, + "grad_norm": 1.3984375, + "learning_rate": 0.0003633124585924713, + "loss": 0.914, + "step": 10466 + }, + { + "epoch": 0.7281644578941876, + "grad_norm": 1.21875, + "learning_rate": 0.00036313871555000086, + "loss": 0.6732, + "step": 10467 + }, + { + "epoch": 0.7282340255313229, + "grad_norm": 1.046875, + "learning_rate": 0.00036296500484413695, + "loss": 0.7468, + "step": 10468 + }, + { + "epoch": 0.728303593168458, + "grad_norm": 1.5, + "learning_rate": 0.0003627913264836991, + "loss": 0.7366, + "step": 10469 + }, + { + "epoch": 0.7283731608055932, + "grad_norm": 1.203125, + "learning_rate": 0.00036261768047750554, + "loss": 1.0082, + "step": 10470 + }, + { + "epoch": 0.7284427284427284, + "grad_norm": 1.1484375, + "learning_rate": 0.0003624440668343736, + "loss": 0.7463, + "step": 10471 + }, + { + "epoch": 0.7285122960798637, + "grad_norm": 1.3359375, + "learning_rate": 0.0003622704855631187, + "loss": 0.6717, + "step": 10472 + }, + { + "epoch": 0.7285818637169988, + "grad_norm": 1.265625, + "learning_rate": 0.0003620969366725538, + "loss": 0.9143, + "step": 10473 + }, + { + "epoch": 0.728651431354134, + "grad_norm": 0.9296875, + "learning_rate": 0.0003619234201714916, + "loss": 0.9855, + "step": 10474 + }, + { + "epoch": 0.7287209989912693, + "grad_norm": 1.3359375, + "learning_rate": 0.00036174993606874186, + "loss": 1.0841, + "step": 10475 + }, + { + "epoch": 0.7287905666284045, + "grad_norm": 1.046875, + "learning_rate": 0.0003615764843731131, + "loss": 1.0146, + "step": 10476 + }, + { + "epoch": 0.7288601342655396, + "grad_norm": 1.0078125, + "learning_rate": 0.0003614030650934126, + "loss": 0.6484, + "step": 10477 + }, + { + "epoch": 0.7289297019026749, + "grad_norm": 1.0546875, + "learning_rate": 0.000361229678238446, + "loss": 0.7777, + "step": 10478 + }, + { + "epoch": 0.7289992695398101, + "grad_norm": 1.265625, + "learning_rate": 0.0003610563238170166, + "loss": 0.7467, + "step": 10479 + }, + { + "epoch": 0.7290688371769453, + "grad_norm": 0.8984375, + "learning_rate": 0.00036088300183792634, + "loss": 0.6488, + "step": 10480 + }, + { + "epoch": 0.7291384048140805, + "grad_norm": 1.234375, + "learning_rate": 0.00036070971230997583, + "loss": 0.9898, + "step": 10481 + }, + { + "epoch": 0.7292079724512157, + "grad_norm": 1.078125, + "learning_rate": 0.0003605364552419642, + "loss": 0.7607, + "step": 10482 + }, + { + "epoch": 0.7292775400883509, + "grad_norm": 1.3203125, + "learning_rate": 0.00036036323064268815, + "loss": 0.862, + "step": 10483 + }, + { + "epoch": 0.729347107725486, + "grad_norm": 1.0703125, + "learning_rate": 0.00036019003852094303, + "loss": 0.9068, + "step": 10484 + }, + { + "epoch": 0.7294166753626213, + "grad_norm": 1.03125, + "learning_rate": 0.0003600168788855228, + "loss": 0.6621, + "step": 10485 + }, + { + "epoch": 0.7294862429997565, + "grad_norm": 1.1484375, + "learning_rate": 0.0003598437517452201, + "loss": 0.8121, + "step": 10486 + }, + { + "epoch": 0.7295558106368917, + "grad_norm": 1.1328125, + "learning_rate": 0.00035967065710882474, + "loss": 0.9737, + "step": 10487 + }, + { + "epoch": 0.729625378274027, + "grad_norm": 1.3359375, + "learning_rate": 0.0003594975949851261, + "loss": 0.7987, + "step": 10488 + }, + { + "epoch": 0.7296949459111621, + "grad_norm": 1.1015625, + "learning_rate": 0.00035932456538291134, + "loss": 0.7368, + "step": 10489 + }, + { + "epoch": 0.7297645135482973, + "grad_norm": 1.2421875, + "learning_rate": 0.0003591515683109656, + "loss": 0.8261, + "step": 10490 + }, + { + "epoch": 0.7298340811854326, + "grad_norm": 1.40625, + "learning_rate": 0.00035897860377807303, + "loss": 1.0825, + "step": 10491 + }, + { + "epoch": 0.7299036488225678, + "grad_norm": 1.40625, + "learning_rate": 0.00035880567179301636, + "loss": 1.1469, + "step": 10492 + }, + { + "epoch": 0.7299732164597029, + "grad_norm": 1.015625, + "learning_rate": 0.0003586327723645758, + "loss": 0.6994, + "step": 10493 + }, + { + "epoch": 0.7300427840968382, + "grad_norm": 1.0859375, + "learning_rate": 0.00035845990550153, + "loss": 0.9453, + "step": 10494 + }, + { + "epoch": 0.7301123517339734, + "grad_norm": 0.953125, + "learning_rate": 0.0003582870712126566, + "loss": 0.8885, + "step": 10495 + }, + { + "epoch": 0.7301819193711085, + "grad_norm": 1.03125, + "learning_rate": 0.00035811426950673153, + "loss": 0.6837, + "step": 10496 + }, + { + "epoch": 0.7302514870082437, + "grad_norm": 1.390625, + "learning_rate": 0.0003579415003925285, + "loss": 0.8565, + "step": 10497 + }, + { + "epoch": 0.730321054645379, + "grad_norm": 1.2109375, + "learning_rate": 0.00035776876387881964, + "loss": 0.8585, + "step": 10498 + }, + { + "epoch": 0.7303906222825142, + "grad_norm": 1.171875, + "learning_rate": 0.00035759605997437574, + "loss": 0.7695, + "step": 10499 + }, + { + "epoch": 0.7304601899196493, + "grad_norm": 1.078125, + "learning_rate": 0.0003574233886879664, + "loss": 1.0199, + "step": 10500 + }, + { + "epoch": 0.7305297575567846, + "grad_norm": 1.1875, + "learning_rate": 0.0003572507500283585, + "loss": 0.9845, + "step": 10501 + }, + { + "epoch": 0.7305993251939198, + "grad_norm": 1.203125, + "learning_rate": 0.00035707814400431746, + "loss": 0.7833, + "step": 10502 + }, + { + "epoch": 0.730668892831055, + "grad_norm": 1.4453125, + "learning_rate": 0.0003569055706246077, + "loss": 1.0962, + "step": 10503 + }, + { + "epoch": 0.7307384604681902, + "grad_norm": 1.375, + "learning_rate": 0.00035673302989799204, + "loss": 0.8713, + "step": 10504 + }, + { + "epoch": 0.7308080281053254, + "grad_norm": 1.1484375, + "learning_rate": 0.0003565605218332304, + "loss": 0.7375, + "step": 10505 + }, + { + "epoch": 0.7308775957424606, + "grad_norm": 1.0859375, + "learning_rate": 0.00035638804643908274, + "loss": 0.7821, + "step": 10506 + }, + { + "epoch": 0.7309471633795959, + "grad_norm": 1.1171875, + "learning_rate": 0.00035621560372430596, + "loss": 0.6925, + "step": 10507 + }, + { + "epoch": 0.731016731016731, + "grad_norm": 1.03125, + "learning_rate": 0.0003560431936976556, + "loss": 0.9956, + "step": 10508 + }, + { + "epoch": 0.7310862986538662, + "grad_norm": 1.6015625, + "learning_rate": 0.000355870816367886, + "loss": 0.8902, + "step": 10509 + }, + { + "epoch": 0.7311558662910014, + "grad_norm": 1.4375, + "learning_rate": 0.00035569847174375, + "loss": 0.8974, + "step": 10510 + }, + { + "epoch": 0.7312254339281367, + "grad_norm": 1.09375, + "learning_rate": 0.0003555261598339983, + "loss": 0.7936, + "step": 10511 + }, + { + "epoch": 0.7312950015652718, + "grad_norm": 1.109375, + "learning_rate": 0.0003553538806473793, + "loss": 0.8037, + "step": 10512 + }, + { + "epoch": 0.731364569202407, + "grad_norm": 1.3046875, + "learning_rate": 0.00035518163419264104, + "loss": 0.6703, + "step": 10513 + }, + { + "epoch": 0.7314341368395423, + "grad_norm": 1.1640625, + "learning_rate": 0.0003550094204785296, + "loss": 0.7503, + "step": 10514 + }, + { + "epoch": 0.7315037044766775, + "grad_norm": 1.0546875, + "learning_rate": 0.0003548372395137888, + "loss": 0.7974, + "step": 10515 + }, + { + "epoch": 0.7315732721138126, + "grad_norm": 1.1953125, + "learning_rate": 0.0003546650913071607, + "loss": 0.8388, + "step": 10516 + }, + { + "epoch": 0.7316428397509479, + "grad_norm": 1.09375, + "learning_rate": 0.0003544929758673866, + "loss": 0.7568, + "step": 10517 + }, + { + "epoch": 0.7317124073880831, + "grad_norm": 1.0859375, + "learning_rate": 0.00035432089320320593, + "loss": 1.0404, + "step": 10518 + }, + { + "epoch": 0.7317819750252182, + "grad_norm": 0.8984375, + "learning_rate": 0.0003541488433233555, + "loss": 0.5822, + "step": 10519 + }, + { + "epoch": 0.7318515426623535, + "grad_norm": 1.078125, + "learning_rate": 0.0003539768262365719, + "loss": 0.9355, + "step": 10520 + }, + { + "epoch": 0.7319211102994887, + "grad_norm": 1.796875, + "learning_rate": 0.0003538048419515887, + "loss": 1.4892, + "step": 10521 + }, + { + "epoch": 0.7319906779366239, + "grad_norm": 1.2578125, + "learning_rate": 0.0003536328904771383, + "loss": 0.7739, + "step": 10522 + }, + { + "epoch": 0.732060245573759, + "grad_norm": 1.140625, + "learning_rate": 0.0003534609718219518, + "loss": 0.9225, + "step": 10523 + }, + { + "epoch": 0.7321298132108943, + "grad_norm": 1.15625, + "learning_rate": 0.00035328908599475874, + "loss": 0.712, + "step": 10524 + }, + { + "epoch": 0.7321993808480295, + "grad_norm": 1.03125, + "learning_rate": 0.0003531172330042861, + "loss": 0.865, + "step": 10525 + }, + { + "epoch": 0.7322689484851647, + "grad_norm": 1.0390625, + "learning_rate": 0.00035294541285925965, + "loss": 0.5902, + "step": 10526 + }, + { + "epoch": 0.7323385161222999, + "grad_norm": 1.0, + "learning_rate": 0.00035277362556840363, + "loss": 0.6576, + "step": 10527 + }, + { + "epoch": 0.7324080837594351, + "grad_norm": 1.5859375, + "learning_rate": 0.00035260187114044095, + "loss": 0.8863, + "step": 10528 + }, + { + "epoch": 0.7324776513965703, + "grad_norm": 0.91796875, + "learning_rate": 0.0003524301495840923, + "loss": 0.5281, + "step": 10529 + }, + { + "epoch": 0.7325472190337056, + "grad_norm": 1.09375, + "learning_rate": 0.0003522584609080761, + "loss": 0.7786, + "step": 10530 + }, + { + "epoch": 0.7326167866708407, + "grad_norm": 1.0, + "learning_rate": 0.00035208680512111056, + "loss": 0.6755, + "step": 10531 + }, + { + "epoch": 0.7326863543079759, + "grad_norm": 1.1328125, + "learning_rate": 0.00035191518223191153, + "loss": 0.6931, + "step": 10532 + }, + { + "epoch": 0.7327559219451112, + "grad_norm": 1.1953125, + "learning_rate": 0.00035174359224919273, + "loss": 0.9638, + "step": 10533 + }, + { + "epoch": 0.7328254895822464, + "grad_norm": 1.0625, + "learning_rate": 0.00035157203518166723, + "loss": 0.6878, + "step": 10534 + }, + { + "epoch": 0.7328950572193815, + "grad_norm": 1.0859375, + "learning_rate": 0.00035140051103804503, + "loss": 0.9544, + "step": 10535 + }, + { + "epoch": 0.7329646248565167, + "grad_norm": 1.0078125, + "learning_rate": 0.00035122901982703606, + "loss": 0.6548, + "step": 10536 + }, + { + "epoch": 0.733034192493652, + "grad_norm": 0.8671875, + "learning_rate": 0.0003510575615573471, + "loss": 0.6057, + "step": 10537 + }, + { + "epoch": 0.7331037601307872, + "grad_norm": 1.1171875, + "learning_rate": 0.0003508861362376846, + "loss": 0.8301, + "step": 10538 + }, + { + "epoch": 0.7331733277679223, + "grad_norm": 1.203125, + "learning_rate": 0.00035071474387675226, + "loss": 0.8204, + "step": 10539 + }, + { + "epoch": 0.7332428954050576, + "grad_norm": 1.0546875, + "learning_rate": 0.0003505433844832523, + "loss": 0.7581, + "step": 10540 + }, + { + "epoch": 0.7333124630421928, + "grad_norm": 1.2265625, + "learning_rate": 0.0003503720580658858, + "loss": 0.9967, + "step": 10541 + }, + { + "epoch": 0.733382030679328, + "grad_norm": 1.03125, + "learning_rate": 0.00035020076463335213, + "loss": 0.8009, + "step": 10542 + }, + { + "epoch": 0.7334515983164632, + "grad_norm": 0.99609375, + "learning_rate": 0.0003500295041943484, + "loss": 0.7618, + "step": 10543 + }, + { + "epoch": 0.7335211659535984, + "grad_norm": 1.109375, + "learning_rate": 0.00034985827675756997, + "loss": 0.772, + "step": 10544 + }, + { + "epoch": 0.7335907335907336, + "grad_norm": 1.2734375, + "learning_rate": 0.00034968708233171133, + "loss": 0.9681, + "step": 10545 + }, + { + "epoch": 0.7336603012278688, + "grad_norm": 1.1640625, + "learning_rate": 0.0003495159209254651, + "loss": 0.6452, + "step": 10546 + }, + { + "epoch": 0.733729868865004, + "grad_norm": 0.84375, + "learning_rate": 0.0003493447925475215, + "loss": 0.7094, + "step": 10547 + }, + { + "epoch": 0.7337994365021392, + "grad_norm": 1.1171875, + "learning_rate": 0.00034917369720657013, + "loss": 0.6876, + "step": 10548 + }, + { + "epoch": 0.7338690041392744, + "grad_norm": 1.2421875, + "learning_rate": 0.0003490026349112976, + "loss": 0.7527, + "step": 10549 + }, + { + "epoch": 0.7339385717764096, + "grad_norm": 1.3359375, + "learning_rate": 0.0003488316056703904, + "loss": 0.8257, + "step": 10550 + }, + { + "epoch": 0.7340081394135448, + "grad_norm": 1.0078125, + "learning_rate": 0.00034866060949253173, + "loss": 0.6408, + "step": 10551 + }, + { + "epoch": 0.73407770705068, + "grad_norm": 0.91015625, + "learning_rate": 0.0003484896463864047, + "loss": 0.6556, + "step": 10552 + }, + { + "epoch": 0.7341472746878153, + "grad_norm": 1.203125, + "learning_rate": 0.0003483187163606895, + "loss": 0.8944, + "step": 10553 + }, + { + "epoch": 0.7342168423249504, + "grad_norm": 1.265625, + "learning_rate": 0.0003481478194240645, + "loss": 0.8914, + "step": 10554 + }, + { + "epoch": 0.7342864099620856, + "grad_norm": 1.1171875, + "learning_rate": 0.00034797695558520835, + "loss": 0.756, + "step": 10555 + }, + { + "epoch": 0.7343559775992209, + "grad_norm": 1.3671875, + "learning_rate": 0.00034780612485279605, + "loss": 1.0581, + "step": 10556 + }, + { + "epoch": 0.7344255452363561, + "grad_norm": 1.234375, + "learning_rate": 0.0003476353272355013, + "loss": 1.0234, + "step": 10557 + }, + { + "epoch": 0.7344951128734912, + "grad_norm": 0.91796875, + "learning_rate": 0.00034746456274199625, + "loss": 0.6046, + "step": 10558 + }, + { + "epoch": 0.7345646805106265, + "grad_norm": 1.015625, + "learning_rate": 0.0003472938313809515, + "loss": 0.7405, + "step": 10559 + }, + { + "epoch": 0.7346342481477617, + "grad_norm": 1.1796875, + "learning_rate": 0.00034712313316103663, + "loss": 0.9239, + "step": 10560 + }, + { + "epoch": 0.7347038157848969, + "grad_norm": 1.0625, + "learning_rate": 0.00034695246809091784, + "loss": 0.6487, + "step": 10561 + }, + { + "epoch": 0.734773383422032, + "grad_norm": 1.015625, + "learning_rate": 0.0003467818361792615, + "loss": 0.791, + "step": 10562 + }, + { + "epoch": 0.7348429510591673, + "grad_norm": 0.80078125, + "learning_rate": 0.00034661123743473076, + "loss": 0.4346, + "step": 10563 + }, + { + "epoch": 0.7349125186963025, + "grad_norm": 1.015625, + "learning_rate": 0.00034644067186598835, + "loss": 0.7441, + "step": 10564 + }, + { + "epoch": 0.7349820863334376, + "grad_norm": 1.25, + "learning_rate": 0.0003462701394816942, + "loss": 0.607, + "step": 10565 + }, + { + "epoch": 0.7350516539705729, + "grad_norm": 1.1484375, + "learning_rate": 0.00034609964029050757, + "loss": 0.8549, + "step": 10566 + }, + { + "epoch": 0.7351212216077081, + "grad_norm": 0.828125, + "learning_rate": 0.00034592917430108537, + "loss": 0.5786, + "step": 10567 + }, + { + "epoch": 0.7351907892448433, + "grad_norm": 1.1796875, + "learning_rate": 0.0003457587415220822, + "loss": 0.7486, + "step": 10568 + }, + { + "epoch": 0.7352603568819785, + "grad_norm": 1.1875, + "learning_rate": 0.0003455883419621532, + "loss": 0.8182, + "step": 10569 + }, + { + "epoch": 0.7353299245191137, + "grad_norm": 1.0859375, + "learning_rate": 0.0003454179756299497, + "loss": 0.8274, + "step": 10570 + }, + { + "epoch": 0.7353994921562489, + "grad_norm": 0.765625, + "learning_rate": 0.0003452476425341221, + "loss": 0.732, + "step": 10571 + }, + { + "epoch": 0.7354690597933842, + "grad_norm": 0.97265625, + "learning_rate": 0.0003450773426833187, + "loss": 0.7192, + "step": 10572 + }, + { + "epoch": 0.7355386274305193, + "grad_norm": 1.25, + "learning_rate": 0.00034490707608618676, + "loss": 1.0045, + "step": 10573 + }, + { + "epoch": 0.7356081950676545, + "grad_norm": 1.328125, + "learning_rate": 0.00034473684275137184, + "loss": 1.0613, + "step": 10574 + }, + { + "epoch": 0.7356777627047897, + "grad_norm": 1.2421875, + "learning_rate": 0.000344566642687517, + "loss": 1.0541, + "step": 10575 + }, + { + "epoch": 0.735747330341925, + "grad_norm": 1.0234375, + "learning_rate": 0.0003443964759032647, + "loss": 0.9216, + "step": 10576 + }, + { + "epoch": 0.7358168979790601, + "grad_norm": 1.4453125, + "learning_rate": 0.0003442263424072547, + "loss": 0.7047, + "step": 10577 + }, + { + "epoch": 0.7358864656161953, + "grad_norm": 1.109375, + "learning_rate": 0.0003440562422081259, + "loss": 0.8835, + "step": 10578 + }, + { + "epoch": 0.7359560332533306, + "grad_norm": 1.2734375, + "learning_rate": 0.0003438861753145146, + "loss": 0.8249, + "step": 10579 + }, + { + "epoch": 0.7360256008904658, + "grad_norm": 1.0, + "learning_rate": 0.0003437161417350565, + "loss": 0.9425, + "step": 10580 + }, + { + "epoch": 0.7360951685276009, + "grad_norm": 1.546875, + "learning_rate": 0.0003435461414783846, + "loss": 0.9638, + "step": 10581 + }, + { + "epoch": 0.7361647361647362, + "grad_norm": 1.078125, + "learning_rate": 0.00034337617455313117, + "loss": 0.7829, + "step": 10582 + }, + { + "epoch": 0.7362343038018714, + "grad_norm": 0.96484375, + "learning_rate": 0.0003432062409679256, + "loss": 0.5947, + "step": 10583 + }, + { + "epoch": 0.7363038714390066, + "grad_norm": 0.9609375, + "learning_rate": 0.000343036340731397, + "loss": 0.7484, + "step": 10584 + }, + { + "epoch": 0.7363734390761418, + "grad_norm": 1.6015625, + "learning_rate": 0.00034286647385217163, + "loss": 1.0335, + "step": 10585 + }, + { + "epoch": 0.736443006713277, + "grad_norm": 1.3046875, + "learning_rate": 0.0003426966403388739, + "loss": 0.9315, + "step": 10586 + }, + { + "epoch": 0.7365125743504122, + "grad_norm": 1.0546875, + "learning_rate": 0.0003425268402001284, + "loss": 0.5161, + "step": 10587 + }, + { + "epoch": 0.7365821419875473, + "grad_norm": 1.3046875, + "learning_rate": 0.00034235707344455605, + "loss": 0.8376, + "step": 10588 + }, + { + "epoch": 0.7366517096246826, + "grad_norm": 1.375, + "learning_rate": 0.00034218734008077667, + "loss": 0.9266, + "step": 10589 + }, + { + "epoch": 0.7367212772618178, + "grad_norm": 1.0390625, + "learning_rate": 0.0003420176401174082, + "loss": 0.6244, + "step": 10590 + }, + { + "epoch": 0.736790844898953, + "grad_norm": 1.1328125, + "learning_rate": 0.0003418479735630675, + "loss": 0.8644, + "step": 10591 + }, + { + "epoch": 0.7368604125360882, + "grad_norm": 1.09375, + "learning_rate": 0.0003416783404263698, + "loss": 1.0935, + "step": 10592 + }, + { + "epoch": 0.7369299801732234, + "grad_norm": 1.359375, + "learning_rate": 0.0003415087407159273, + "loss": 0.9523, + "step": 10593 + }, + { + "epoch": 0.7369995478103586, + "grad_norm": 1.328125, + "learning_rate": 0.0003413391744403523, + "loss": 0.8776, + "step": 10594 + }, + { + "epoch": 0.7370691154474939, + "grad_norm": 1.03125, + "learning_rate": 0.00034116964160825394, + "loss": 0.7411, + "step": 10595 + }, + { + "epoch": 0.737138683084629, + "grad_norm": 1.140625, + "learning_rate": 0.0003410001422282406, + "loss": 0.9877, + "step": 10596 + }, + { + "epoch": 0.7372082507217642, + "grad_norm": 1.2265625, + "learning_rate": 0.0003408306763089182, + "loss": 0.9689, + "step": 10597 + }, + { + "epoch": 0.7372778183588995, + "grad_norm": 1.2578125, + "learning_rate": 0.00034066124385889176, + "loss": 0.7874, + "step": 10598 + }, + { + "epoch": 0.7373473859960347, + "grad_norm": 1.3515625, + "learning_rate": 0.00034049184488676423, + "loss": 0.8638, + "step": 10599 + }, + { + "epoch": 0.7374169536331698, + "grad_norm": 1.03125, + "learning_rate": 0.0003403224794011358, + "loss": 0.7705, + "step": 10600 + }, + { + "epoch": 0.737486521270305, + "grad_norm": 0.890625, + "learning_rate": 0.00034015314741060764, + "loss": 0.5618, + "step": 10601 + }, + { + "epoch": 0.7375560889074403, + "grad_norm": 1.234375, + "learning_rate": 0.00033998384892377673, + "loss": 1.0214, + "step": 10602 + }, + { + "epoch": 0.7376256565445755, + "grad_norm": 1.1171875, + "learning_rate": 0.00033981458394923936, + "loss": 0.7092, + "step": 10603 + }, + { + "epoch": 0.7376952241817106, + "grad_norm": 1.5390625, + "learning_rate": 0.0003396453524955894, + "loss": 0.8239, + "step": 10604 + }, + { + "epoch": 0.7377647918188459, + "grad_norm": 1.1171875, + "learning_rate": 0.00033947615457142, + "loss": 0.8572, + "step": 10605 + }, + { + "epoch": 0.7378343594559811, + "grad_norm": 1.09375, + "learning_rate": 0.0003393069901853225, + "loss": 0.905, + "step": 10606 + }, + { + "epoch": 0.7379039270931163, + "grad_norm": 0.84375, + "learning_rate": 0.00033913785934588556, + "loss": 0.6206, + "step": 10607 + }, + { + "epoch": 0.7379734947302515, + "grad_norm": 1.0703125, + "learning_rate": 0.0003389687620616976, + "loss": 0.7289, + "step": 10608 + }, + { + "epoch": 0.7380430623673867, + "grad_norm": 1.1796875, + "learning_rate": 0.0003387996983413436, + "loss": 0.8671, + "step": 10609 + }, + { + "epoch": 0.7381126300045219, + "grad_norm": 1.0, + "learning_rate": 0.0003386306681934086, + "loss": 0.6568, + "step": 10610 + }, + { + "epoch": 0.7381821976416572, + "grad_norm": 0.82421875, + "learning_rate": 0.00033846167162647435, + "loss": 0.6951, + "step": 10611 + }, + { + "epoch": 0.7382517652787923, + "grad_norm": 0.98046875, + "learning_rate": 0.0003382927086491223, + "loss": 0.5366, + "step": 10612 + }, + { + "epoch": 0.7383213329159275, + "grad_norm": 1.2890625, + "learning_rate": 0.000338123779269931, + "loss": 0.9697, + "step": 10613 + }, + { + "epoch": 0.7383909005530627, + "grad_norm": 1.28125, + "learning_rate": 0.00033795488349747815, + "loss": 0.9811, + "step": 10614 + }, + { + "epoch": 0.738460468190198, + "grad_norm": 0.9921875, + "learning_rate": 0.0003377860213403395, + "loss": 0.965, + "step": 10615 + }, + { + "epoch": 0.7385300358273331, + "grad_norm": 1.1015625, + "learning_rate": 0.00033761719280708905, + "loss": 0.7906, + "step": 10616 + }, + { + "epoch": 0.7385996034644683, + "grad_norm": 1.296875, + "learning_rate": 0.0003374483979062989, + "loss": 0.8549, + "step": 10617 + }, + { + "epoch": 0.7386691711016036, + "grad_norm": 1.203125, + "learning_rate": 0.00033727963664653915, + "loss": 1.0295, + "step": 10618 + }, + { + "epoch": 0.7387387387387387, + "grad_norm": 1.0625, + "learning_rate": 0.0003371109090363792, + "loss": 0.6678, + "step": 10619 + }, + { + "epoch": 0.7388083063758739, + "grad_norm": 1.2890625, + "learning_rate": 0.0003369422150843863, + "loss": 0.8005, + "step": 10620 + }, + { + "epoch": 0.7388778740130092, + "grad_norm": 1.2265625, + "learning_rate": 0.00033677355479912543, + "loss": 0.6075, + "step": 10621 + }, + { + "epoch": 0.7389474416501444, + "grad_norm": 1.6796875, + "learning_rate": 0.0003366049281891608, + "loss": 1.0783, + "step": 10622 + }, + { + "epoch": 0.7390170092872795, + "grad_norm": 1.2890625, + "learning_rate": 0.0003364363352630538, + "loss": 0.8589, + "step": 10623 + }, + { + "epoch": 0.7390865769244148, + "grad_norm": 1.1484375, + "learning_rate": 0.00033626777602936556, + "loss": 0.8846, + "step": 10624 + }, + { + "epoch": 0.73915614456155, + "grad_norm": 1.109375, + "learning_rate": 0.00033609925049665377, + "loss": 0.7845, + "step": 10625 + }, + { + "epoch": 0.7392257121986852, + "grad_norm": 1.1953125, + "learning_rate": 0.000335930758673476, + "loss": 0.6639, + "step": 10626 + }, + { + "epoch": 0.7392952798358203, + "grad_norm": 1.2265625, + "learning_rate": 0.000335762300568387, + "loss": 0.8502, + "step": 10627 + }, + { + "epoch": 0.7393648474729556, + "grad_norm": 1.1171875, + "learning_rate": 0.0003355938761899402, + "loss": 0.7402, + "step": 10628 + }, + { + "epoch": 0.7394344151100908, + "grad_norm": 1.0234375, + "learning_rate": 0.00033542548554668785, + "loss": 0.852, + "step": 10629 + }, + { + "epoch": 0.739503982747226, + "grad_norm": 1.3359375, + "learning_rate": 0.0003352571286471797, + "loss": 0.8183, + "step": 10630 + }, + { + "epoch": 0.7395735503843612, + "grad_norm": 1.0703125, + "learning_rate": 0.000335088805499964, + "loss": 0.8635, + "step": 10631 + }, + { + "epoch": 0.7396431180214964, + "grad_norm": 1.140625, + "learning_rate": 0.00033492051611358665, + "loss": 0.9351, + "step": 10632 + }, + { + "epoch": 0.7397126856586316, + "grad_norm": 1.1796875, + "learning_rate": 0.00033475226049659403, + "loss": 0.9912, + "step": 10633 + }, + { + "epoch": 0.7397822532957669, + "grad_norm": 0.95703125, + "learning_rate": 0.0003345840386575284, + "loss": 0.6397, + "step": 10634 + }, + { + "epoch": 0.739851820932902, + "grad_norm": 1.015625, + "learning_rate": 0.00033441585060493107, + "loss": 0.9123, + "step": 10635 + }, + { + "epoch": 0.7399213885700372, + "grad_norm": 1.15625, + "learning_rate": 0.00033424769634734234, + "loss": 0.6297, + "step": 10636 + }, + { + "epoch": 0.7399909562071725, + "grad_norm": 1.0390625, + "learning_rate": 0.0003340795758932996, + "loss": 0.8262, + "step": 10637 + }, + { + "epoch": 0.7400605238443076, + "grad_norm": 1.28125, + "learning_rate": 0.00033391148925133996, + "loss": 0.6155, + "step": 10638 + }, + { + "epoch": 0.7401300914814428, + "grad_norm": 1.09375, + "learning_rate": 0.0003337434364299972, + "loss": 0.7819, + "step": 10639 + }, + { + "epoch": 0.740199659118578, + "grad_norm": 1.15625, + "learning_rate": 0.0003335754174378047, + "loss": 0.8932, + "step": 10640 + }, + { + "epoch": 0.7402692267557133, + "grad_norm": 0.99609375, + "learning_rate": 0.0003334074322832933, + "loss": 0.6714, + "step": 10641 + }, + { + "epoch": 0.7403387943928484, + "grad_norm": 1.25, + "learning_rate": 0.0003332394809749927, + "loss": 0.8553, + "step": 10642 + }, + { + "epoch": 0.7404083620299836, + "grad_norm": 1.4375, + "learning_rate": 0.00033307156352143063, + "loss": 1.0901, + "step": 10643 + }, + { + "epoch": 0.7404779296671189, + "grad_norm": 0.94921875, + "learning_rate": 0.0003329036799311331, + "loss": 0.6526, + "step": 10644 + }, + { + "epoch": 0.7405474973042541, + "grad_norm": 1.1015625, + "learning_rate": 0.0003327358302126241, + "loss": 0.9371, + "step": 10645 + }, + { + "epoch": 0.7406170649413892, + "grad_norm": 1.046875, + "learning_rate": 0.0003325680143744262, + "loss": 0.6818, + "step": 10646 + }, + { + "epoch": 0.7406866325785245, + "grad_norm": 1.171875, + "learning_rate": 0.0003324002324250609, + "loss": 0.7781, + "step": 10647 + }, + { + "epoch": 0.7407562002156597, + "grad_norm": 0.84765625, + "learning_rate": 0.0003322324843730468, + "loss": 0.6444, + "step": 10648 + }, + { + "epoch": 0.7408257678527949, + "grad_norm": 1.1015625, + "learning_rate": 0.00033206477022690084, + "loss": 0.6224, + "step": 10649 + }, + { + "epoch": 0.74089533548993, + "grad_norm": 1.0625, + "learning_rate": 0.0003318970899951397, + "loss": 0.6883, + "step": 10650 + }, + { + "epoch": 0.7409649031270653, + "grad_norm": 1.2734375, + "learning_rate": 0.00033172944368627653, + "loss": 0.8715, + "step": 10651 + }, + { + "epoch": 0.7410344707642005, + "grad_norm": 0.93359375, + "learning_rate": 0.0003315618313088241, + "loss": 0.7715, + "step": 10652 + }, + { + "epoch": 0.7411040384013357, + "grad_norm": 1.0390625, + "learning_rate": 0.0003313942528712924, + "loss": 0.659, + "step": 10653 + }, + { + "epoch": 0.7411736060384709, + "grad_norm": 0.84765625, + "learning_rate": 0.0003312267083821909, + "loss": 0.602, + "step": 10654 + }, + { + "epoch": 0.7412431736756061, + "grad_norm": 1.3046875, + "learning_rate": 0.00033105919785002594, + "loss": 1.0, + "step": 10655 + }, + { + "epoch": 0.7413127413127413, + "grad_norm": 1.2421875, + "learning_rate": 0.0003308917212833036, + "loss": 0.7299, + "step": 10656 + }, + { + "epoch": 0.7413823089498766, + "grad_norm": 1.0234375, + "learning_rate": 0.00033072427869052667, + "loss": 0.6468, + "step": 10657 + }, + { + "epoch": 0.7414518765870117, + "grad_norm": 1.265625, + "learning_rate": 0.00033055687008019775, + "loss": 0.7774, + "step": 10658 + }, + { + "epoch": 0.7415214442241469, + "grad_norm": 1.171875, + "learning_rate": 0.0003303894954608165, + "loss": 0.7469, + "step": 10659 + }, + { + "epoch": 0.7415910118612822, + "grad_norm": 1.3515625, + "learning_rate": 0.00033022215484088157, + "loss": 0.9953, + "step": 10660 + }, + { + "epoch": 0.7416605794984173, + "grad_norm": 1.0703125, + "learning_rate": 0.00033005484822889, + "loss": 0.6639, + "step": 10661 + }, + { + "epoch": 0.7417301471355525, + "grad_norm": 0.8984375, + "learning_rate": 0.00032988757563333636, + "loss": 0.689, + "step": 10662 + }, + { + "epoch": 0.7417997147726877, + "grad_norm": 1.234375, + "learning_rate": 0.0003297203370627142, + "loss": 0.7924, + "step": 10663 + }, + { + "epoch": 0.741869282409823, + "grad_norm": 1.125, + "learning_rate": 0.0003295531325255141, + "loss": 0.845, + "step": 10664 + }, + { + "epoch": 0.7419388500469581, + "grad_norm": 0.88671875, + "learning_rate": 0.0003293859620302273, + "loss": 0.8241, + "step": 10665 + }, + { + "epoch": 0.7420084176840933, + "grad_norm": 1.2734375, + "learning_rate": 0.00032921882558534113, + "loss": 1.2517, + "step": 10666 + }, + { + "epoch": 0.7420779853212286, + "grad_norm": 1.203125, + "learning_rate": 0.00032905172319934174, + "loss": 0.7807, + "step": 10667 + }, + { + "epoch": 0.7421475529583638, + "grad_norm": 0.96875, + "learning_rate": 0.00032888465488071437, + "loss": 0.6846, + "step": 10668 + }, + { + "epoch": 0.7422171205954989, + "grad_norm": 0.8359375, + "learning_rate": 0.0003287176206379412, + "loss": 0.5882, + "step": 10669 + }, + { + "epoch": 0.7422866882326342, + "grad_norm": 1.2109375, + "learning_rate": 0.00032855062047950414, + "loss": 0.9032, + "step": 10670 + }, + { + "epoch": 0.7423562558697694, + "grad_norm": 1.28125, + "learning_rate": 0.0003283836544138818, + "loss": 0.8563, + "step": 10671 + }, + { + "epoch": 0.7424258235069046, + "grad_norm": 1.6484375, + "learning_rate": 0.0003282167224495527, + "loss": 0.6975, + "step": 10672 + }, + { + "epoch": 0.7424953911440398, + "grad_norm": 1.1328125, + "learning_rate": 0.000328049824594992, + "loss": 0.8196, + "step": 10673 + }, + { + "epoch": 0.742564958781175, + "grad_norm": 1.0078125, + "learning_rate": 0.0003278829608586743, + "loss": 0.8175, + "step": 10674 + }, + { + "epoch": 0.7426345264183102, + "grad_norm": 1.0625, + "learning_rate": 0.0003277161312490725, + "loss": 0.72, + "step": 10675 + }, + { + "epoch": 0.7427040940554454, + "grad_norm": 0.921875, + "learning_rate": 0.00032754933577465694, + "loss": 0.8003, + "step": 10676 + }, + { + "epoch": 0.7427736616925806, + "grad_norm": 0.94921875, + "learning_rate": 0.0003273825744438965, + "loss": 0.6127, + "step": 10677 + }, + { + "epoch": 0.7428432293297158, + "grad_norm": 1.0625, + "learning_rate": 0.00032721584726525855, + "loss": 0.8807, + "step": 10678 + }, + { + "epoch": 0.742912796966851, + "grad_norm": 0.9375, + "learning_rate": 0.0003270491542472092, + "loss": 0.9, + "step": 10679 + }, + { + "epoch": 0.7429823646039863, + "grad_norm": 0.99609375, + "learning_rate": 0.0003268824953982119, + "loss": 0.75, + "step": 10680 + }, + { + "epoch": 0.7430519322411214, + "grad_norm": 1.109375, + "learning_rate": 0.0003267158707267284, + "loss": 0.835, + "step": 10681 + }, + { + "epoch": 0.7431214998782566, + "grad_norm": 1.1953125, + "learning_rate": 0.00032654928024121953, + "loss": 0.9836, + "step": 10682 + }, + { + "epoch": 0.7431910675153919, + "grad_norm": 1.109375, + "learning_rate": 0.00032638272395014355, + "loss": 0.6932, + "step": 10683 + }, + { + "epoch": 0.743260635152527, + "grad_norm": 0.953125, + "learning_rate": 0.00032621620186195797, + "loss": 0.6945, + "step": 10684 + }, + { + "epoch": 0.7433302027896622, + "grad_norm": 1.078125, + "learning_rate": 0.0003260497139851172, + "loss": 0.7294, + "step": 10685 + }, + { + "epoch": 0.7433997704267975, + "grad_norm": 1.3515625, + "learning_rate": 0.00032588326032807524, + "loss": 0.8689, + "step": 10686 + }, + { + "epoch": 0.7434693380639327, + "grad_norm": 1.3359375, + "learning_rate": 0.00032571684089928324, + "loss": 0.7692, + "step": 10687 + }, + { + "epoch": 0.7435389057010678, + "grad_norm": 1.0859375, + "learning_rate": 0.00032555045570719135, + "loss": 0.8178, + "step": 10688 + }, + { + "epoch": 0.743608473338203, + "grad_norm": 1.3046875, + "learning_rate": 0.0003253841047602483, + "loss": 0.9112, + "step": 10689 + }, + { + "epoch": 0.7436780409753383, + "grad_norm": 1.046875, + "learning_rate": 0.0003252177880668999, + "loss": 0.7473, + "step": 10690 + }, + { + "epoch": 0.7437476086124735, + "grad_norm": 1.203125, + "learning_rate": 0.00032505150563559094, + "loss": 0.8301, + "step": 10691 + }, + { + "epoch": 0.7438171762496086, + "grad_norm": 1.0859375, + "learning_rate": 0.0003248852574747644, + "loss": 0.6411, + "step": 10692 + }, + { + "epoch": 0.7438867438867439, + "grad_norm": 1.03125, + "learning_rate": 0.0003247190435928621, + "loss": 0.7741, + "step": 10693 + }, + { + "epoch": 0.7439563115238791, + "grad_norm": 1.28125, + "learning_rate": 0.00032455286399832295, + "loss": 0.8128, + "step": 10694 + }, + { + "epoch": 0.7440258791610143, + "grad_norm": 1.15625, + "learning_rate": 0.0003243867186995847, + "loss": 0.757, + "step": 10695 + }, + { + "epoch": 0.7440954467981495, + "grad_norm": 0.98828125, + "learning_rate": 0.0003242206077050834, + "loss": 0.7755, + "step": 10696 + }, + { + "epoch": 0.7441650144352847, + "grad_norm": 1.0078125, + "learning_rate": 0.0003240545310232538, + "loss": 0.7966, + "step": 10697 + }, + { + "epoch": 0.7442345820724199, + "grad_norm": 1.1015625, + "learning_rate": 0.0003238884886625282, + "loss": 0.5772, + "step": 10698 + }, + { + "epoch": 0.7443041497095552, + "grad_norm": 0.8984375, + "learning_rate": 0.0003237224806313368, + "loss": 0.832, + "step": 10699 + }, + { + "epoch": 0.7443737173466903, + "grad_norm": 1.28125, + "learning_rate": 0.00032355650693810956, + "loss": 1.0497, + "step": 10700 + }, + { + "epoch": 0.7444432849838255, + "grad_norm": 1.0703125, + "learning_rate": 0.00032339056759127303, + "loss": 0.7867, + "step": 10701 + }, + { + "epoch": 0.7445128526209607, + "grad_norm": 0.9921875, + "learning_rate": 0.0003232246625992532, + "loss": 0.6914, + "step": 10702 + }, + { + "epoch": 0.744582420258096, + "grad_norm": 1.3046875, + "learning_rate": 0.00032305879197047405, + "loss": 0.7868, + "step": 10703 + }, + { + "epoch": 0.7446519878952311, + "grad_norm": 1.3046875, + "learning_rate": 0.00032289295571335744, + "loss": 0.9864, + "step": 10704 + }, + { + "epoch": 0.7447215555323663, + "grad_norm": 1.0625, + "learning_rate": 0.0003227271538363232, + "loss": 0.74, + "step": 10705 + }, + { + "epoch": 0.7447911231695016, + "grad_norm": 1.3046875, + "learning_rate": 0.00032256138634779053, + "loss": 1.1954, + "step": 10706 + }, + { + "epoch": 0.7448606908066367, + "grad_norm": 1.125, + "learning_rate": 0.0003223956532561765, + "loss": 0.7271, + "step": 10707 + }, + { + "epoch": 0.7449302584437719, + "grad_norm": 0.9375, + "learning_rate": 0.00032222995456989567, + "loss": 0.7618, + "step": 10708 + }, + { + "epoch": 0.7449998260809072, + "grad_norm": 0.99609375, + "learning_rate": 0.0003220642902973613, + "loss": 0.9101, + "step": 10709 + }, + { + "epoch": 0.7450693937180424, + "grad_norm": 0.984375, + "learning_rate": 0.0003218986604469851, + "loss": 0.808, + "step": 10710 + }, + { + "epoch": 0.7451389613551775, + "grad_norm": 1.2734375, + "learning_rate": 0.0003217330650271775, + "loss": 0.6827, + "step": 10711 + }, + { + "epoch": 0.7452085289923128, + "grad_norm": 1.296875, + "learning_rate": 0.00032156750404634604, + "loss": 0.7297, + "step": 10712 + }, + { + "epoch": 0.745278096629448, + "grad_norm": 1.0703125, + "learning_rate": 0.00032140197751289693, + "loss": 0.8698, + "step": 10713 + }, + { + "epoch": 0.7453476642665832, + "grad_norm": 1.6015625, + "learning_rate": 0.00032123648543523533, + "loss": 0.9402, + "step": 10714 + }, + { + "epoch": 0.7454172319037183, + "grad_norm": 1.3671875, + "learning_rate": 0.0003210710278217634, + "loss": 0.7883, + "step": 10715 + }, + { + "epoch": 0.7454867995408536, + "grad_norm": 1.15625, + "learning_rate": 0.0003209056046808827, + "loss": 0.6464, + "step": 10716 + }, + { + "epoch": 0.7455563671779888, + "grad_norm": 1.359375, + "learning_rate": 0.0003207402160209927, + "loss": 0.7639, + "step": 10717 + }, + { + "epoch": 0.745625934815124, + "grad_norm": 1.203125, + "learning_rate": 0.0003205748618504909, + "loss": 0.8756, + "step": 10718 + }, + { + "epoch": 0.7456955024522592, + "grad_norm": 0.98046875, + "learning_rate": 0.00032040954217777274, + "loss": 0.7125, + "step": 10719 + }, + { + "epoch": 0.7457650700893944, + "grad_norm": 1.3046875, + "learning_rate": 0.00032024425701123263, + "loss": 0.9542, + "step": 10720 + }, + { + "epoch": 0.7458346377265296, + "grad_norm": 0.8828125, + "learning_rate": 0.00032007900635926324, + "loss": 0.6046, + "step": 10721 + }, + { + "epoch": 0.7459042053636649, + "grad_norm": 0.9765625, + "learning_rate": 0.0003199137902302548, + "loss": 0.751, + "step": 10722 + }, + { + "epoch": 0.7459737730008, + "grad_norm": 1.40625, + "learning_rate": 0.0003197486086325959, + "loss": 0.806, + "step": 10723 + }, + { + "epoch": 0.7460433406379352, + "grad_norm": 1.0859375, + "learning_rate": 0.000319583461574674, + "loss": 0.7304, + "step": 10724 + }, + { + "epoch": 0.7461129082750705, + "grad_norm": 1.5078125, + "learning_rate": 0.00031941834906487463, + "loss": 0.7119, + "step": 10725 + }, + { + "epoch": 0.7461824759122057, + "grad_norm": 0.94140625, + "learning_rate": 0.0003192532711115812, + "loss": 0.6333, + "step": 10726 + }, + { + "epoch": 0.7462520435493408, + "grad_norm": 1.0625, + "learning_rate": 0.00031908822772317504, + "loss": 0.7809, + "step": 10727 + }, + { + "epoch": 0.746321611186476, + "grad_norm": 0.94921875, + "learning_rate": 0.00031892321890803654, + "loss": 0.8851, + "step": 10728 + }, + { + "epoch": 0.7463911788236113, + "grad_norm": 0.98828125, + "learning_rate": 0.0003187582446745446, + "loss": 0.9008, + "step": 10729 + }, + { + "epoch": 0.7464607464607464, + "grad_norm": 1.1171875, + "learning_rate": 0.0003185933050310749, + "loss": 0.6785, + "step": 10730 + }, + { + "epoch": 0.7465303140978816, + "grad_norm": 1.015625, + "learning_rate": 0.0003184283999860029, + "loss": 0.8088, + "step": 10731 + }, + { + "epoch": 0.7465998817350169, + "grad_norm": 1.3125, + "learning_rate": 0.0003182635295477014, + "loss": 0.7669, + "step": 10732 + }, + { + "epoch": 0.7466694493721521, + "grad_norm": 0.9921875, + "learning_rate": 0.00031809869372454136, + "loss": 0.6425, + "step": 10733 + }, + { + "epoch": 0.7467390170092872, + "grad_norm": 1.1171875, + "learning_rate": 0.0003179338925248926, + "loss": 0.7079, + "step": 10734 + }, + { + "epoch": 0.7468085846464225, + "grad_norm": 0.8828125, + "learning_rate": 0.0003177691259571233, + "loss": 0.782, + "step": 10735 + }, + { + "epoch": 0.7468781522835577, + "grad_norm": 1.0546875, + "learning_rate": 0.00031760439402959896, + "loss": 0.6856, + "step": 10736 + }, + { + "epoch": 0.7469477199206929, + "grad_norm": 1.046875, + "learning_rate": 0.0003174396967506837, + "loss": 0.6101, + "step": 10737 + }, + { + "epoch": 0.7470172875578281, + "grad_norm": 1.109375, + "learning_rate": 0.00031727503412874025, + "loss": 0.9982, + "step": 10738 + }, + { + "epoch": 0.7470868551949633, + "grad_norm": 1.0546875, + "learning_rate": 0.00031711040617212973, + "loss": 0.8929, + "step": 10739 + }, + { + "epoch": 0.7471564228320985, + "grad_norm": 1.40625, + "learning_rate": 0.00031694581288921076, + "loss": 0.9524, + "step": 10740 + }, + { + "epoch": 0.7472259904692337, + "grad_norm": 1.2578125, + "learning_rate": 0.00031678125428834025, + "loss": 0.8278, + "step": 10741 + }, + { + "epoch": 0.7472955581063689, + "grad_norm": 1.09375, + "learning_rate": 0.000316616730377874, + "loss": 0.8842, + "step": 10742 + }, + { + "epoch": 0.7473651257435041, + "grad_norm": 1.4140625, + "learning_rate": 0.000316452241166166, + "loss": 0.8872, + "step": 10743 + }, + { + "epoch": 0.7474346933806393, + "grad_norm": 1.609375, + "learning_rate": 0.00031628778666156776, + "loss": 0.7571, + "step": 10744 + }, + { + "epoch": 0.7475042610177746, + "grad_norm": 1.578125, + "learning_rate": 0.00031612336687242927, + "loss": 0.7759, + "step": 10745 + }, + { + "epoch": 0.7475738286549097, + "grad_norm": 0.99609375, + "learning_rate": 0.00031595898180709957, + "loss": 0.7668, + "step": 10746 + }, + { + "epoch": 0.7476433962920449, + "grad_norm": 1.15625, + "learning_rate": 0.00031579463147392463, + "loss": 0.771, + "step": 10747 + }, + { + "epoch": 0.7477129639291802, + "grad_norm": 1.296875, + "learning_rate": 0.00031563031588124966, + "loss": 0.8609, + "step": 10748 + }, + { + "epoch": 0.7477825315663154, + "grad_norm": 1.1796875, + "learning_rate": 0.0003154660350374181, + "loss": 0.7597, + "step": 10749 + }, + { + "epoch": 0.7478520992034505, + "grad_norm": 1.296875, + "learning_rate": 0.0003153017889507709, + "loss": 0.9094, + "step": 10750 + }, + { + "epoch": 0.7479216668405858, + "grad_norm": 1.2578125, + "learning_rate": 0.00031513757762964746, + "loss": 0.8527, + "step": 10751 + }, + { + "epoch": 0.747991234477721, + "grad_norm": 0.92578125, + "learning_rate": 0.0003149734010823858, + "loss": 0.6744, + "step": 10752 + }, + { + "epoch": 0.7480608021148561, + "grad_norm": 1.1796875, + "learning_rate": 0.00031480925931732254, + "loss": 0.8425, + "step": 10753 + }, + { + "epoch": 0.7481303697519913, + "grad_norm": 1.0859375, + "learning_rate": 0.0003146451523427912, + "loss": 0.8637, + "step": 10754 + }, + { + "epoch": 0.7481999373891266, + "grad_norm": 1.109375, + "learning_rate": 0.0003144810801671245, + "loss": 0.7141, + "step": 10755 + }, + { + "epoch": 0.7482695050262618, + "grad_norm": 1.25, + "learning_rate": 0.0003143170427986531, + "loss": 0.932, + "step": 10756 + }, + { + "epoch": 0.7483390726633969, + "grad_norm": 0.72265625, + "learning_rate": 0.0003141530402457067, + "loss": 0.571, + "step": 10757 + }, + { + "epoch": 0.7484086403005322, + "grad_norm": 1.34375, + "learning_rate": 0.0003139890725166118, + "loss": 1.0473, + "step": 10758 + }, + { + "epoch": 0.7484782079376674, + "grad_norm": 0.828125, + "learning_rate": 0.00031382513961969384, + "loss": 0.5942, + "step": 10759 + }, + { + "epoch": 0.7485477755748026, + "grad_norm": 1.0625, + "learning_rate": 0.00031366124156327667, + "loss": 0.7604, + "step": 10760 + }, + { + "epoch": 0.7486173432119378, + "grad_norm": 1.3671875, + "learning_rate": 0.0003134973783556825, + "loss": 0.8232, + "step": 10761 + }, + { + "epoch": 0.748686910849073, + "grad_norm": 1.15625, + "learning_rate": 0.000313333550005231, + "loss": 0.9719, + "step": 10762 + }, + { + "epoch": 0.7487564784862082, + "grad_norm": 1.03125, + "learning_rate": 0.00031316975652024106, + "loss": 0.8764, + "step": 10763 + }, + { + "epoch": 0.7488260461233435, + "grad_norm": 1.1953125, + "learning_rate": 0.00031300599790902905, + "loss": 0.6826, + "step": 10764 + }, + { + "epoch": 0.7488956137604786, + "grad_norm": 0.97265625, + "learning_rate": 0.0003128422741799094, + "loss": 0.7145, + "step": 10765 + }, + { + "epoch": 0.7489651813976138, + "grad_norm": 1.390625, + "learning_rate": 0.00031267858534119553, + "loss": 0.9347, + "step": 10766 + }, + { + "epoch": 0.749034749034749, + "grad_norm": 0.9375, + "learning_rate": 0.000312514931401199, + "loss": 0.6208, + "step": 10767 + }, + { + "epoch": 0.7491043166718843, + "grad_norm": 1.21875, + "learning_rate": 0.0003123513123682292, + "loss": 0.7783, + "step": 10768 + }, + { + "epoch": 0.7491738843090194, + "grad_norm": 0.9140625, + "learning_rate": 0.00031218772825059336, + "loss": 0.8459, + "step": 10769 + }, + { + "epoch": 0.7492434519461546, + "grad_norm": 1.15625, + "learning_rate": 0.0003120241790565979, + "loss": 0.7067, + "step": 10770 + }, + { + "epoch": 0.7493130195832899, + "grad_norm": 0.87890625, + "learning_rate": 0.0003118606647945472, + "loss": 0.7017, + "step": 10771 + }, + { + "epoch": 0.7493825872204251, + "grad_norm": 1.1484375, + "learning_rate": 0.0003116971854727435, + "loss": 0.7706, + "step": 10772 + }, + { + "epoch": 0.7494521548575602, + "grad_norm": 1.015625, + "learning_rate": 0.0003115337410994872, + "loss": 0.7049, + "step": 10773 + }, + { + "epoch": 0.7495217224946955, + "grad_norm": 1.1796875, + "learning_rate": 0.00031137033168307727, + "loss": 0.8327, + "step": 10774 + }, + { + "epoch": 0.7495912901318307, + "grad_norm": 1.140625, + "learning_rate": 0.00031120695723181125, + "loss": 0.5652, + "step": 10775 + }, + { + "epoch": 0.7496608577689658, + "grad_norm": 1.0703125, + "learning_rate": 0.0003110436177539839, + "loss": 0.9325, + "step": 10776 + }, + { + "epoch": 0.7497304254061011, + "grad_norm": 0.98046875, + "learning_rate": 0.00031088031325788944, + "loss": 0.7637, + "step": 10777 + }, + { + "epoch": 0.7497999930432363, + "grad_norm": 0.9921875, + "learning_rate": 0.0003107170437518192, + "loss": 0.5993, + "step": 10778 + }, + { + "epoch": 0.7498695606803715, + "grad_norm": 1.1796875, + "learning_rate": 0.00031055380924406285, + "loss": 1.0458, + "step": 10779 + }, + { + "epoch": 0.7499391283175066, + "grad_norm": 0.87109375, + "learning_rate": 0.0003103906097429091, + "loss": 0.6709, + "step": 10780 + }, + { + "epoch": 0.7500086959546419, + "grad_norm": 0.94921875, + "learning_rate": 0.0003102274452566445, + "loss": 0.5583, + "step": 10781 + }, + { + "epoch": 0.7500782635917771, + "grad_norm": 1.3125, + "learning_rate": 0.00031006431579355367, + "loss": 0.8395, + "step": 10782 + }, + { + "epoch": 0.7501478312289123, + "grad_norm": 1.1015625, + "learning_rate": 0.0003099012213619189, + "loss": 0.8713, + "step": 10783 + }, + { + "epoch": 0.7502173988660475, + "grad_norm": 1.28125, + "learning_rate": 0.0003097381619700218, + "loss": 0.7511, + "step": 10784 + }, + { + "epoch": 0.7502869665031827, + "grad_norm": 1.265625, + "learning_rate": 0.00030957513762614196, + "loss": 0.6935, + "step": 10785 + }, + { + "epoch": 0.7503565341403179, + "grad_norm": 0.93359375, + "learning_rate": 0.0003094121483385567, + "loss": 0.4937, + "step": 10786 + }, + { + "epoch": 0.7504261017774532, + "grad_norm": 1.375, + "learning_rate": 0.0003092491941155413, + "loss": 0.9161, + "step": 10787 + }, + { + "epoch": 0.7504956694145883, + "grad_norm": 1.1953125, + "learning_rate": 0.0003090862749653702, + "loss": 1.0645, + "step": 10788 + }, + { + "epoch": 0.7505652370517235, + "grad_norm": 1.140625, + "learning_rate": 0.00030892339089631603, + "loss": 0.5419, + "step": 10789 + }, + { + "epoch": 0.7506348046888588, + "grad_norm": 0.83984375, + "learning_rate": 0.0003087605419166484, + "loss": 0.7095, + "step": 10790 + }, + { + "epoch": 0.750704372325994, + "grad_norm": 1.3984375, + "learning_rate": 0.0003085977280346366, + "loss": 0.9747, + "step": 10791 + }, + { + "epoch": 0.7507739399631291, + "grad_norm": 1.109375, + "learning_rate": 0.0003084349492585473, + "loss": 1.0787, + "step": 10792 + }, + { + "epoch": 0.7508435076002643, + "grad_norm": 1.125, + "learning_rate": 0.00030827220559664524, + "loss": 0.7317, + "step": 10793 + }, + { + "epoch": 0.7509130752373996, + "grad_norm": 1.1171875, + "learning_rate": 0.00030810949705719395, + "loss": 0.7491, + "step": 10794 + }, + { + "epoch": 0.7509826428745348, + "grad_norm": 1.03125, + "learning_rate": 0.0003079468236484554, + "loss": 0.8051, + "step": 10795 + }, + { + "epoch": 0.7510522105116699, + "grad_norm": 1.140625, + "learning_rate": 0.00030778418537868893, + "loss": 0.6746, + "step": 10796 + }, + { + "epoch": 0.7511217781488052, + "grad_norm": 1.03125, + "learning_rate": 0.0003076215822561521, + "loss": 0.8839, + "step": 10797 + }, + { + "epoch": 0.7511913457859404, + "grad_norm": 0.90234375, + "learning_rate": 0.0003074590142891015, + "loss": 0.8582, + "step": 10798 + }, + { + "epoch": 0.7512609134230755, + "grad_norm": 1.3203125, + "learning_rate": 0.0003072964814857918, + "loss": 1.0314, + "step": 10799 + }, + { + "epoch": 0.7513304810602108, + "grad_norm": 0.86328125, + "learning_rate": 0.00030713398385447534, + "loss": 0.5999, + "step": 10800 + }, + { + "epoch": 0.751400048697346, + "grad_norm": 1.1328125, + "learning_rate": 0.00030697152140340256, + "loss": 0.761, + "step": 10801 + }, + { + "epoch": 0.7514696163344812, + "grad_norm": 1.15625, + "learning_rate": 0.0003068090941408228, + "loss": 0.906, + "step": 10802 + }, + { + "epoch": 0.7515391839716165, + "grad_norm": 1.078125, + "learning_rate": 0.0003066467020749836, + "loss": 0.7689, + "step": 10803 + }, + { + "epoch": 0.7516087516087516, + "grad_norm": 1.015625, + "learning_rate": 0.00030648434521412984, + "loss": 0.8183, + "step": 10804 + }, + { + "epoch": 0.7516783192458868, + "grad_norm": 1.609375, + "learning_rate": 0.0003063220235665056, + "loss": 0.7437, + "step": 10805 + }, + { + "epoch": 0.751747886883022, + "grad_norm": 0.890625, + "learning_rate": 0.0003061597371403525, + "loss": 0.4423, + "step": 10806 + }, + { + "epoch": 0.7518174545201572, + "grad_norm": 1.0390625, + "learning_rate": 0.00030599748594391094, + "loss": 0.5525, + "step": 10807 + }, + { + "epoch": 0.7518870221572924, + "grad_norm": 1.1875, + "learning_rate": 0.00030583526998541875, + "loss": 1.2907, + "step": 10808 + }, + { + "epoch": 0.7519565897944276, + "grad_norm": 1.0625, + "learning_rate": 0.000305673089273113, + "loss": 0.757, + "step": 10809 + }, + { + "epoch": 0.7520261574315629, + "grad_norm": 1.2421875, + "learning_rate": 0.00030551094381522806, + "loss": 0.7258, + "step": 10810 + }, + { + "epoch": 0.752095725068698, + "grad_norm": 1.0546875, + "learning_rate": 0.00030534883361999664, + "loss": 0.8237, + "step": 10811 + }, + { + "epoch": 0.7521652927058332, + "grad_norm": 1.0625, + "learning_rate": 0.0003051867586956502, + "loss": 0.5544, + "step": 10812 + }, + { + "epoch": 0.7522348603429685, + "grad_norm": 1.0703125, + "learning_rate": 0.00030502471905041815, + "loss": 0.7884, + "step": 10813 + }, + { + "epoch": 0.7523044279801037, + "grad_norm": 0.8984375, + "learning_rate": 0.0003048627146925281, + "loss": 0.4507, + "step": 10814 + }, + { + "epoch": 0.7523739956172388, + "grad_norm": 1.078125, + "learning_rate": 0.00030470074563020534, + "loss": 0.5509, + "step": 10815 + }, + { + "epoch": 0.7524435632543741, + "grad_norm": 1.2734375, + "learning_rate": 0.0003045388118716741, + "loss": 0.9134, + "step": 10816 + }, + { + "epoch": 0.7525131308915093, + "grad_norm": 1.2109375, + "learning_rate": 0.00030437691342515694, + "loss": 0.5909, + "step": 10817 + }, + { + "epoch": 0.7525826985286445, + "grad_norm": 1.28125, + "learning_rate": 0.0003042150502988739, + "loss": 0.9357, + "step": 10818 + }, + { + "epoch": 0.7526522661657796, + "grad_norm": 1.265625, + "learning_rate": 0.0003040532225010433, + "loss": 0.7893, + "step": 10819 + }, + { + "epoch": 0.7527218338029149, + "grad_norm": 1.3046875, + "learning_rate": 0.00030389143003988216, + "loss": 0.9734, + "step": 10820 + }, + { + "epoch": 0.7527914014400501, + "grad_norm": 1.0625, + "learning_rate": 0.00030372967292360587, + "loss": 0.8374, + "step": 10821 + }, + { + "epoch": 0.7528609690771852, + "grad_norm": 1.046875, + "learning_rate": 0.00030356795116042714, + "loss": 0.8235, + "step": 10822 + }, + { + "epoch": 0.7529305367143205, + "grad_norm": 1.1484375, + "learning_rate": 0.00030340626475855784, + "loss": 1.0226, + "step": 10823 + }, + { + "epoch": 0.7530001043514557, + "grad_norm": 1.6328125, + "learning_rate": 0.00030324461372620726, + "loss": 0.727, + "step": 10824 + }, + { + "epoch": 0.7530696719885909, + "grad_norm": 0.98046875, + "learning_rate": 0.000303082998071583, + "loss": 0.7792, + "step": 10825 + }, + { + "epoch": 0.7531392396257262, + "grad_norm": 1.0078125, + "learning_rate": 0.0003029214178028914, + "loss": 0.5346, + "step": 10826 + }, + { + "epoch": 0.7532088072628613, + "grad_norm": 1.15625, + "learning_rate": 0.000302759872928337, + "loss": 0.797, + "step": 10827 + }, + { + "epoch": 0.7532783748999965, + "grad_norm": 1.609375, + "learning_rate": 0.0003025983634561218, + "loss": 1.0819, + "step": 10828 + }, + { + "epoch": 0.7533479425371318, + "grad_norm": 1.03125, + "learning_rate": 0.0003024368893944462, + "loss": 0.6368, + "step": 10829 + }, + { + "epoch": 0.753417510174267, + "grad_norm": 1.3203125, + "learning_rate": 0.00030227545075150954, + "loss": 1.2654, + "step": 10830 + }, + { + "epoch": 0.7534870778114021, + "grad_norm": 1.1328125, + "learning_rate": 0.000302114047535509, + "loss": 0.7587, + "step": 10831 + }, + { + "epoch": 0.7535566454485373, + "grad_norm": 1.0390625, + "learning_rate": 0.0003019526797546395, + "loss": 0.5841, + "step": 10832 + }, + { + "epoch": 0.7536262130856726, + "grad_norm": 1.1640625, + "learning_rate": 0.00030179134741709405, + "loss": 0.8582, + "step": 10833 + }, + { + "epoch": 0.7536957807228077, + "grad_norm": 1.109375, + "learning_rate": 0.00030163005053106484, + "loss": 0.6291, + "step": 10834 + }, + { + "epoch": 0.7537653483599429, + "grad_norm": 0.98046875, + "learning_rate": 0.00030146878910474194, + "loss": 0.6024, + "step": 10835 + }, + { + "epoch": 0.7538349159970782, + "grad_norm": 0.9140625, + "learning_rate": 0.0003013075631463128, + "loss": 0.5758, + "step": 10836 + }, + { + "epoch": 0.7539044836342134, + "grad_norm": 1.140625, + "learning_rate": 0.00030114637266396416, + "loss": 0.7231, + "step": 10837 + }, + { + "epoch": 0.7539740512713485, + "grad_norm": 1.3046875, + "learning_rate": 0.00030098521766587993, + "loss": 0.8559, + "step": 10838 + }, + { + "epoch": 0.7540436189084838, + "grad_norm": 1.0859375, + "learning_rate": 0.0003008240981602435, + "loss": 0.8338, + "step": 10839 + }, + { + "epoch": 0.754113186545619, + "grad_norm": 1.1875, + "learning_rate": 0.00030066301415523477, + "loss": 0.732, + "step": 10840 + }, + { + "epoch": 0.7541827541827542, + "grad_norm": 1.1953125, + "learning_rate": 0.00030050196565903364, + "loss": 0.814, + "step": 10841 + }, + { + "epoch": 0.7542523218198894, + "grad_norm": 1.28125, + "learning_rate": 0.000300340952679817, + "loss": 0.8623, + "step": 10842 + }, + { + "epoch": 0.7543218894570246, + "grad_norm": 1.3984375, + "learning_rate": 0.00030017997522575993, + "loss": 0.7784, + "step": 10843 + }, + { + "epoch": 0.7543914570941598, + "grad_norm": 1.1484375, + "learning_rate": 0.0003000190333050363, + "loss": 0.8666, + "step": 10844 + }, + { + "epoch": 0.754461024731295, + "grad_norm": 1.1484375, + "learning_rate": 0.0002998581269258183, + "loss": 0.8736, + "step": 10845 + }, + { + "epoch": 0.7545305923684302, + "grad_norm": 1.0234375, + "learning_rate": 0.0002996972560962757, + "loss": 0.7556, + "step": 10846 + }, + { + "epoch": 0.7546001600055654, + "grad_norm": 1.2734375, + "learning_rate": 0.00029953642082457634, + "loss": 0.9385, + "step": 10847 + }, + { + "epoch": 0.7546697276427006, + "grad_norm": 1.2109375, + "learning_rate": 0.00029937562111888685, + "loss": 0.6932, + "step": 10848 + }, + { + "epoch": 0.7547392952798359, + "grad_norm": 1.171875, + "learning_rate": 0.0002992148569873723, + "loss": 0.9168, + "step": 10849 + }, + { + "epoch": 0.754808862916971, + "grad_norm": 0.96484375, + "learning_rate": 0.0002990541284381947, + "loss": 0.799, + "step": 10850 + }, + { + "epoch": 0.7548784305541062, + "grad_norm": 1.2265625, + "learning_rate": 0.00029889343547951584, + "loss": 0.957, + "step": 10851 + }, + { + "epoch": 0.7549479981912415, + "grad_norm": 1.0234375, + "learning_rate": 0.0002987327781194942, + "loss": 0.8664, + "step": 10852 + }, + { + "epoch": 0.7550175658283766, + "grad_norm": 1.0859375, + "learning_rate": 0.00029857215636628763, + "loss": 1.0017, + "step": 10853 + }, + { + "epoch": 0.7550871334655118, + "grad_norm": 0.80078125, + "learning_rate": 0.0002984115702280512, + "loss": 0.7072, + "step": 10854 + }, + { + "epoch": 0.7551567011026471, + "grad_norm": 1.09375, + "learning_rate": 0.0002982510197129393, + "loss": 0.7619, + "step": 10855 + }, + { + "epoch": 0.7552262687397823, + "grad_norm": 0.9765625, + "learning_rate": 0.0002980905048291036, + "loss": 0.7234, + "step": 10856 + }, + { + "epoch": 0.7552958363769174, + "grad_norm": 1.03125, + "learning_rate": 0.0002979300255846935, + "loss": 0.6323, + "step": 10857 + }, + { + "epoch": 0.7553654040140526, + "grad_norm": 1.1640625, + "learning_rate": 0.00029776958198785865, + "loss": 0.7847, + "step": 10858 + }, + { + "epoch": 0.7554349716511879, + "grad_norm": 1.328125, + "learning_rate": 0.0002976091740467449, + "loss": 0.8902, + "step": 10859 + }, + { + "epoch": 0.7555045392883231, + "grad_norm": 1.1015625, + "learning_rate": 0.00029744880176949706, + "loss": 0.934, + "step": 10860 + }, + { + "epoch": 0.7555741069254582, + "grad_norm": 1.046875, + "learning_rate": 0.0002972884651642576, + "loss": 0.6618, + "step": 10861 + }, + { + "epoch": 0.7556436745625935, + "grad_norm": 1.0078125, + "learning_rate": 0.0002971281642391679, + "loss": 0.9202, + "step": 10862 + }, + { + "epoch": 0.7557132421997287, + "grad_norm": 1.171875, + "learning_rate": 0.00029696789900236754, + "loss": 0.8987, + "step": 10863 + }, + { + "epoch": 0.7557828098368639, + "grad_norm": 0.9921875, + "learning_rate": 0.00029680766946199355, + "loss": 0.7069, + "step": 10864 + }, + { + "epoch": 0.7558523774739991, + "grad_norm": 1.03125, + "learning_rate": 0.000296647475626182, + "loss": 0.8637, + "step": 10865 + }, + { + "epoch": 0.7559219451111343, + "grad_norm": 1.1328125, + "learning_rate": 0.0002964873175030661, + "loss": 0.8416, + "step": 10866 + }, + { + "epoch": 0.7559915127482695, + "grad_norm": 0.8828125, + "learning_rate": 0.00029632719510077867, + "loss": 0.6538, + "step": 10867 + }, + { + "epoch": 0.7560610803854048, + "grad_norm": 1.03125, + "learning_rate": 0.0002961671084274492, + "loss": 0.639, + "step": 10868 + }, + { + "epoch": 0.7561306480225399, + "grad_norm": 1.0546875, + "learning_rate": 0.0002960070574912066, + "loss": 0.6999, + "step": 10869 + }, + { + "epoch": 0.7562002156596751, + "grad_norm": 1.0703125, + "learning_rate": 0.000295847042300177, + "loss": 0.8679, + "step": 10870 + }, + { + "epoch": 0.7562697832968103, + "grad_norm": 1.0390625, + "learning_rate": 0.0002956870628624854, + "loss": 0.6761, + "step": 10871 + }, + { + "epoch": 0.7563393509339456, + "grad_norm": 0.91015625, + "learning_rate": 0.00029552711918625496, + "loss": 0.7618, + "step": 10872 + }, + { + "epoch": 0.7564089185710807, + "grad_norm": 1.265625, + "learning_rate": 0.00029536721127960676, + "loss": 0.9856, + "step": 10873 + }, + { + "epoch": 0.7564784862082159, + "grad_norm": 0.97265625, + "learning_rate": 0.0002952073391506598, + "loss": 0.6998, + "step": 10874 + }, + { + "epoch": 0.7565480538453512, + "grad_norm": 1.0078125, + "learning_rate": 0.00029504750280753145, + "loss": 0.8895, + "step": 10875 + }, + { + "epoch": 0.7566176214824863, + "grad_norm": 1.140625, + "learning_rate": 0.0002948877022583378, + "loss": 0.6585, + "step": 10876 + }, + { + "epoch": 0.7566871891196215, + "grad_norm": 1.046875, + "learning_rate": 0.00029472793751119286, + "loss": 0.5516, + "step": 10877 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 1.265625, + "learning_rate": 0.0002945682085742081, + "loss": 0.8468, + "step": 10878 + }, + { + "epoch": 0.756826324393892, + "grad_norm": 1.1953125, + "learning_rate": 0.0002944085154554943, + "loss": 0.9927, + "step": 10879 + }, + { + "epoch": 0.7568958920310271, + "grad_norm": 1.2421875, + "learning_rate": 0.0002942488581631594, + "loss": 0.776, + "step": 10880 + }, + { + "epoch": 0.7569654596681624, + "grad_norm": 1.171875, + "learning_rate": 0.0002940892367053105, + "loss": 0.7683, + "step": 10881 + }, + { + "epoch": 0.7570350273052976, + "grad_norm": 1.1640625, + "learning_rate": 0.0002939296510900519, + "loss": 0.6813, + "step": 10882 + }, + { + "epoch": 0.7571045949424328, + "grad_norm": 1.046875, + "learning_rate": 0.00029377010132548696, + "loss": 0.6979, + "step": 10883 + }, + { + "epoch": 0.7571741625795679, + "grad_norm": 1.078125, + "learning_rate": 0.00029361058741971636, + "loss": 0.9223, + "step": 10884 + }, + { + "epoch": 0.7572437302167032, + "grad_norm": 0.97265625, + "learning_rate": 0.00029345110938083964, + "loss": 0.5794, + "step": 10885 + }, + { + "epoch": 0.7573132978538384, + "grad_norm": 1.203125, + "learning_rate": 0.00029329166721695464, + "loss": 0.7523, + "step": 10886 + }, + { + "epoch": 0.7573828654909736, + "grad_norm": 1.1015625, + "learning_rate": 0.0002931322609361567, + "loss": 0.7918, + "step": 10887 + }, + { + "epoch": 0.7574524331281088, + "grad_norm": 1.203125, + "learning_rate": 0.00029297289054653974, + "loss": 0.9891, + "step": 10888 + }, + { + "epoch": 0.757522000765244, + "grad_norm": 1.5546875, + "learning_rate": 0.00029281355605619496, + "loss": 0.9407, + "step": 10889 + }, + { + "epoch": 0.7575915684023792, + "grad_norm": 1.2578125, + "learning_rate": 0.0002926542574732141, + "loss": 0.7065, + "step": 10890 + }, + { + "epoch": 0.7576611360395145, + "grad_norm": 1.125, + "learning_rate": 0.00029249499480568463, + "loss": 0.7619, + "step": 10891 + }, + { + "epoch": 0.7577307036766496, + "grad_norm": 0.96875, + "learning_rate": 0.00029233576806169325, + "loss": 0.7175, + "step": 10892 + }, + { + "epoch": 0.7578002713137848, + "grad_norm": 1.03125, + "learning_rate": 0.00029217657724932446, + "loss": 0.7108, + "step": 10893 + }, + { + "epoch": 0.7578698389509201, + "grad_norm": 0.96875, + "learning_rate": 0.0002920174223766613, + "loss": 0.7984, + "step": 10894 + }, + { + "epoch": 0.7579394065880553, + "grad_norm": 1.046875, + "learning_rate": 0.0002918583034517852, + "loss": 0.7702, + "step": 10895 + }, + { + "epoch": 0.7580089742251904, + "grad_norm": 1.3046875, + "learning_rate": 0.00029169922048277486, + "loss": 0.8288, + "step": 10896 + }, + { + "epoch": 0.7580785418623256, + "grad_norm": 0.9765625, + "learning_rate": 0.00029154017347770845, + "loss": 0.7305, + "step": 10897 + }, + { + "epoch": 0.7581481094994609, + "grad_norm": 1.1328125, + "learning_rate": 0.0002913811624446606, + "loss": 0.7453, + "step": 10898 + }, + { + "epoch": 0.758217677136596, + "grad_norm": 0.984375, + "learning_rate": 0.00029122218739170615, + "loss": 0.5697, + "step": 10899 + }, + { + "epoch": 0.7582872447737312, + "grad_norm": 1.2578125, + "learning_rate": 0.0002910632483269161, + "loss": 0.5748, + "step": 10900 + }, + { + "epoch": 0.7583568124108665, + "grad_norm": 0.94140625, + "learning_rate": 0.00029090434525836127, + "loss": 0.6577, + "step": 10901 + }, + { + "epoch": 0.7584263800480017, + "grad_norm": 0.9921875, + "learning_rate": 0.00029074547819410944, + "loss": 0.596, + "step": 10902 + }, + { + "epoch": 0.7584959476851368, + "grad_norm": 0.9921875, + "learning_rate": 0.00029058664714222724, + "loss": 0.7455, + "step": 10903 + }, + { + "epoch": 0.7585655153222721, + "grad_norm": 1.3125, + "learning_rate": 0.00029042785211077983, + "loss": 0.8898, + "step": 10904 + }, + { + "epoch": 0.7586350829594073, + "grad_norm": 1.0234375, + "learning_rate": 0.00029026909310782945, + "loss": 0.8803, + "step": 10905 + }, + { + "epoch": 0.7587046505965425, + "grad_norm": 0.87109375, + "learning_rate": 0.00029011037014143725, + "loss": 0.8165, + "step": 10906 + }, + { + "epoch": 0.7587742182336777, + "grad_norm": 0.90625, + "learning_rate": 0.00028995168321966215, + "loss": 0.5155, + "step": 10907 + }, + { + "epoch": 0.7588437858708129, + "grad_norm": 1.28125, + "learning_rate": 0.0002897930323505615, + "loss": 1.0407, + "step": 10908 + }, + { + "epoch": 0.7589133535079481, + "grad_norm": 1.09375, + "learning_rate": 0.00028963441754219135, + "loss": 0.9402, + "step": 10909 + }, + { + "epoch": 0.7589829211450833, + "grad_norm": 1.1328125, + "learning_rate": 0.00028947583880260466, + "loss": 0.8225, + "step": 10910 + }, + { + "epoch": 0.7590524887822185, + "grad_norm": 1.09375, + "learning_rate": 0.00028931729613985394, + "loss": 0.7145, + "step": 10911 + }, + { + "epoch": 0.7591220564193537, + "grad_norm": 1.1640625, + "learning_rate": 0.00028915878956198835, + "loss": 0.733, + "step": 10912 + }, + { + "epoch": 0.7591916240564889, + "grad_norm": 1.1953125, + "learning_rate": 0.0002890003190770569, + "loss": 0.8173, + "step": 10913 + }, + { + "epoch": 0.7592611916936242, + "grad_norm": 1.484375, + "learning_rate": 0.00028884188469310525, + "loss": 0.8284, + "step": 10914 + }, + { + "epoch": 0.7593307593307593, + "grad_norm": 1.4375, + "learning_rate": 0.00028868348641817855, + "loss": 0.7452, + "step": 10915 + }, + { + "epoch": 0.7594003269678945, + "grad_norm": 0.82421875, + "learning_rate": 0.00028852512426031876, + "loss": 0.4362, + "step": 10916 + }, + { + "epoch": 0.7594698946050298, + "grad_norm": 1.203125, + "learning_rate": 0.0002883667982275671, + "loss": 0.8837, + "step": 10917 + }, + { + "epoch": 0.759539462242165, + "grad_norm": 0.984375, + "learning_rate": 0.00028820850832796276, + "loss": 0.9928, + "step": 10918 + }, + { + "epoch": 0.7596090298793001, + "grad_norm": 0.7421875, + "learning_rate": 0.00028805025456954256, + "loss": 0.6872, + "step": 10919 + }, + { + "epoch": 0.7596785975164354, + "grad_norm": 1.4140625, + "learning_rate": 0.00028789203696034216, + "loss": 1.023, + "step": 10920 + }, + { + "epoch": 0.7597481651535706, + "grad_norm": 1.59375, + "learning_rate": 0.00028773385550839414, + "loss": 0.7345, + "step": 10921 + }, + { + "epoch": 0.7598177327907057, + "grad_norm": 1.0859375, + "learning_rate": 0.00028757571022173145, + "loss": 0.8374, + "step": 10922 + }, + { + "epoch": 0.7598873004278409, + "grad_norm": 1.234375, + "learning_rate": 0.00028741760110838333, + "loss": 0.7209, + "step": 10923 + }, + { + "epoch": 0.7599568680649762, + "grad_norm": 1.140625, + "learning_rate": 0.00028725952817637747, + "loss": 0.69, + "step": 10924 + }, + { + "epoch": 0.7600264357021114, + "grad_norm": 1.4140625, + "learning_rate": 0.00028710149143374055, + "loss": 0.9986, + "step": 10925 + }, + { + "epoch": 0.7600960033392465, + "grad_norm": 0.79296875, + "learning_rate": 0.00028694349088849625, + "loss": 0.4994, + "step": 10926 + }, + { + "epoch": 0.7601655709763818, + "grad_norm": 1.0546875, + "learning_rate": 0.00028678552654866785, + "loss": 0.8706, + "step": 10927 + }, + { + "epoch": 0.760235138613517, + "grad_norm": 0.89453125, + "learning_rate": 0.00028662759842227513, + "loss": 0.6004, + "step": 10928 + }, + { + "epoch": 0.7603047062506522, + "grad_norm": 1.1015625, + "learning_rate": 0.0002864697065173377, + "loss": 0.878, + "step": 10929 + }, + { + "epoch": 0.7603742738877874, + "grad_norm": 0.90234375, + "learning_rate": 0.0002863118508418717, + "loss": 0.7029, + "step": 10930 + }, + { + "epoch": 0.7604438415249226, + "grad_norm": 1.0, + "learning_rate": 0.0002861540314038927, + "loss": 0.9266, + "step": 10931 + }, + { + "epoch": 0.7605134091620578, + "grad_norm": 1.2421875, + "learning_rate": 0.00028599624821141437, + "loss": 0.8503, + "step": 10932 + }, + { + "epoch": 0.7605829767991931, + "grad_norm": 1.2421875, + "learning_rate": 0.0002858385012724476, + "loss": 0.6257, + "step": 10933 + }, + { + "epoch": 0.7606525444363282, + "grad_norm": 1.09375, + "learning_rate": 0.00028568079059500175, + "loss": 0.7435, + "step": 10934 + }, + { + "epoch": 0.7607221120734634, + "grad_norm": 1.2734375, + "learning_rate": 0.00028552311618708495, + "loss": 0.8444, + "step": 10935 + }, + { + "epoch": 0.7607916797105986, + "grad_norm": 1.078125, + "learning_rate": 0.0002853654780567034, + "loss": 0.7446, + "step": 10936 + }, + { + "epoch": 0.7608612473477339, + "grad_norm": 1.234375, + "learning_rate": 0.0002852078762118608, + "loss": 0.8777, + "step": 10937 + }, + { + "epoch": 0.760930814984869, + "grad_norm": 1.046875, + "learning_rate": 0.0002850503106605592, + "loss": 0.7718, + "step": 10938 + }, + { + "epoch": 0.7610003826220042, + "grad_norm": 1.0625, + "learning_rate": 0.0002848927814107994, + "loss": 0.6905, + "step": 10939 + }, + { + "epoch": 0.7610699502591395, + "grad_norm": 1.34375, + "learning_rate": 0.0002847352884705796, + "loss": 0.9064, + "step": 10940 + }, + { + "epoch": 0.7611395178962747, + "grad_norm": 1.1015625, + "learning_rate": 0.0002845778318478969, + "loss": 0.6481, + "step": 10941 + }, + { + "epoch": 0.7612090855334098, + "grad_norm": 1.09375, + "learning_rate": 0.0002844204115507456, + "loss": 0.7602, + "step": 10942 + }, + { + "epoch": 0.7612786531705451, + "grad_norm": 1.3828125, + "learning_rate": 0.0002842630275871193, + "loss": 0.7536, + "step": 10943 + }, + { + "epoch": 0.7613482208076803, + "grad_norm": 1.015625, + "learning_rate": 0.00028410567996500855, + "loss": 0.9728, + "step": 10944 + }, + { + "epoch": 0.7614177884448154, + "grad_norm": 0.9375, + "learning_rate": 0.000283948368692403, + "loss": 0.5581, + "step": 10945 + }, + { + "epoch": 0.7614873560819507, + "grad_norm": 1.140625, + "learning_rate": 0.0002837910937772905, + "loss": 0.9369, + "step": 10946 + }, + { + "epoch": 0.7615569237190859, + "grad_norm": 1.0859375, + "learning_rate": 0.00028363385522765615, + "loss": 0.7163, + "step": 10947 + }, + { + "epoch": 0.7616264913562211, + "grad_norm": 1.4453125, + "learning_rate": 0.0002834766530514837, + "loss": 0.9173, + "step": 10948 + }, + { + "epoch": 0.7616960589933562, + "grad_norm": 1.0390625, + "learning_rate": 0.00028331948725675526, + "loss": 0.9139, + "step": 10949 + }, + { + "epoch": 0.7617656266304915, + "grad_norm": 1.359375, + "learning_rate": 0.00028316235785145116, + "loss": 0.8441, + "step": 10950 + }, + { + "epoch": 0.7618351942676267, + "grad_norm": 1.21875, + "learning_rate": 0.0002830052648435495, + "loss": 0.9639, + "step": 10951 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 1.046875, + "learning_rate": 0.0002828482082410262, + "loss": 0.6816, + "step": 10952 + }, + { + "epoch": 0.7619743295418971, + "grad_norm": 1.078125, + "learning_rate": 0.0002826911880518561, + "loss": 0.7788, + "step": 10953 + }, + { + "epoch": 0.7620438971790323, + "grad_norm": 1.046875, + "learning_rate": 0.0002825342042840123, + "loss": 0.6936, + "step": 10954 + }, + { + "epoch": 0.7621134648161675, + "grad_norm": 1.15625, + "learning_rate": 0.00028237725694546544, + "loss": 0.7557, + "step": 10955 + }, + { + "epoch": 0.7621830324533028, + "grad_norm": 1.046875, + "learning_rate": 0.000282220346044184, + "loss": 0.7433, + "step": 10956 + }, + { + "epoch": 0.7622526000904379, + "grad_norm": 0.78515625, + "learning_rate": 0.0002820634715881358, + "loss": 0.6299, + "step": 10957 + }, + { + "epoch": 0.7623221677275731, + "grad_norm": 1.0, + "learning_rate": 0.0002819066335852856, + "loss": 0.8281, + "step": 10958 + }, + { + "epoch": 0.7623917353647084, + "grad_norm": 0.86328125, + "learning_rate": 0.0002817498320435969, + "loss": 0.4377, + "step": 10959 + }, + { + "epoch": 0.7624613030018436, + "grad_norm": 1.1640625, + "learning_rate": 0.0002815930669710319, + "loss": 0.9633, + "step": 10960 + }, + { + "epoch": 0.7625308706389787, + "grad_norm": 1.2421875, + "learning_rate": 0.00028143633837555005, + "loss": 0.9027, + "step": 10961 + }, + { + "epoch": 0.7626004382761139, + "grad_norm": 1.015625, + "learning_rate": 0.0002812796462651087, + "loss": 0.7701, + "step": 10962 + }, + { + "epoch": 0.7626700059132492, + "grad_norm": 1.0078125, + "learning_rate": 0.00028112299064766424, + "loss": 0.7056, + "step": 10963 + }, + { + "epoch": 0.7627395735503844, + "grad_norm": 1.1484375, + "learning_rate": 0.00028096637153117123, + "loss": 0.8179, + "step": 10964 + }, + { + "epoch": 0.7628091411875195, + "grad_norm": 1.0703125, + "learning_rate": 0.00028080978892358176, + "loss": 0.9911, + "step": 10965 + }, + { + "epoch": 0.7628787088246548, + "grad_norm": 1.2265625, + "learning_rate": 0.00028065324283284586, + "loss": 0.755, + "step": 10966 + }, + { + "epoch": 0.76294827646179, + "grad_norm": 1.0234375, + "learning_rate": 0.0002804967332669125, + "loss": 0.6879, + "step": 10967 + }, + { + "epoch": 0.7630178440989251, + "grad_norm": 1.265625, + "learning_rate": 0.00028034026023372873, + "loss": 0.8447, + "step": 10968 + }, + { + "epoch": 0.7630874117360604, + "grad_norm": 0.9609375, + "learning_rate": 0.0002801838237412393, + "loss": 0.6667, + "step": 10969 + }, + { + "epoch": 0.7631569793731956, + "grad_norm": 0.921875, + "learning_rate": 0.00028002742379738674, + "loss": 0.6053, + "step": 10970 + }, + { + "epoch": 0.7632265470103308, + "grad_norm": 1.03125, + "learning_rate": 0.000279871060410113, + "loss": 0.9168, + "step": 10971 + }, + { + "epoch": 0.763296114647466, + "grad_norm": 1.0703125, + "learning_rate": 0.0002797147335873569, + "loss": 0.7791, + "step": 10972 + }, + { + "epoch": 0.7633656822846012, + "grad_norm": 1.5625, + "learning_rate": 0.00027955844333705626, + "loss": 1.2102, + "step": 10973 + }, + { + "epoch": 0.7634352499217364, + "grad_norm": 1.171875, + "learning_rate": 0.00027940218966714635, + "loss": 0.7311, + "step": 10974 + }, + { + "epoch": 0.7635048175588716, + "grad_norm": 0.859375, + "learning_rate": 0.0002792459725855615, + "loss": 0.6264, + "step": 10975 + }, + { + "epoch": 0.7635743851960068, + "grad_norm": 1.171875, + "learning_rate": 0.000279089792100233, + "loss": 0.868, + "step": 10976 + }, + { + "epoch": 0.763643952833142, + "grad_norm": 1.2109375, + "learning_rate": 0.0002789336482190912, + "loss": 0.7842, + "step": 10977 + }, + { + "epoch": 0.7637135204702772, + "grad_norm": 1.15625, + "learning_rate": 0.0002787775409500645, + "loss": 0.8368, + "step": 10978 + }, + { + "epoch": 0.7637830881074125, + "grad_norm": 1.4375, + "learning_rate": 0.0002786214703010791, + "loss": 0.9723, + "step": 10979 + }, + { + "epoch": 0.7638526557445476, + "grad_norm": 1.0546875, + "learning_rate": 0.00027846543628005916, + "loss": 0.7424, + "step": 10980 + }, + { + "epoch": 0.7639222233816828, + "grad_norm": 1.015625, + "learning_rate": 0.0002783094388949274, + "loss": 0.6343, + "step": 10981 + }, + { + "epoch": 0.7639917910188181, + "grad_norm": 1.2421875, + "learning_rate": 0.00027815347815360526, + "loss": 0.7459, + "step": 10982 + }, + { + "epoch": 0.7640613586559533, + "grad_norm": 1.234375, + "learning_rate": 0.0002779975540640111, + "loss": 0.7888, + "step": 10983 + }, + { + "epoch": 0.7641309262930884, + "grad_norm": 1.2421875, + "learning_rate": 0.0002778416666340615, + "loss": 0.8225, + "step": 10984 + }, + { + "epoch": 0.7642004939302237, + "grad_norm": 1.1484375, + "learning_rate": 0.0002776858158716723, + "loss": 0.8207, + "step": 10985 + }, + { + "epoch": 0.7642700615673589, + "grad_norm": 1.1953125, + "learning_rate": 0.00027753000178475687, + "loss": 0.9062, + "step": 10986 + }, + { + "epoch": 0.764339629204494, + "grad_norm": 1.25, + "learning_rate": 0.00027737422438122637, + "loss": 0.902, + "step": 10987 + }, + { + "epoch": 0.7644091968416292, + "grad_norm": 1.09375, + "learning_rate": 0.00027721848366899025, + "loss": 0.9645, + "step": 10988 + }, + { + "epoch": 0.7644787644787645, + "grad_norm": 1.1328125, + "learning_rate": 0.0002770627796559567, + "loss": 1.067, + "step": 10989 + }, + { + "epoch": 0.7645483321158997, + "grad_norm": 1.140625, + "learning_rate": 0.000276907112350031, + "loss": 0.7567, + "step": 10990 + }, + { + "epoch": 0.7646178997530348, + "grad_norm": 0.9375, + "learning_rate": 0.00027675148175911746, + "loss": 0.6529, + "step": 10991 + }, + { + "epoch": 0.7646874673901701, + "grad_norm": 1.140625, + "learning_rate": 0.0002765958878911187, + "loss": 0.9196, + "step": 10992 + }, + { + "epoch": 0.7647570350273053, + "grad_norm": 1.0, + "learning_rate": 0.00027644033075393436, + "loss": 0.6483, + "step": 10993 + }, + { + "epoch": 0.7648266026644405, + "grad_norm": 1.3515625, + "learning_rate": 0.0002762848103554627, + "loss": 0.7832, + "step": 10994 + }, + { + "epoch": 0.7648961703015758, + "grad_norm": 1.1484375, + "learning_rate": 0.0002761293267036007, + "loss": 0.7285, + "step": 10995 + }, + { + "epoch": 0.7649657379387109, + "grad_norm": 1.1953125, + "learning_rate": 0.0002759738798062431, + "loss": 1.087, + "step": 10996 + }, + { + "epoch": 0.7650353055758461, + "grad_norm": 1.2421875, + "learning_rate": 0.00027581846967128255, + "loss": 0.7953, + "step": 10997 + }, + { + "epoch": 0.7651048732129814, + "grad_norm": 1.484375, + "learning_rate": 0.0002756630963066097, + "loss": 0.9286, + "step": 10998 + }, + { + "epoch": 0.7651744408501165, + "grad_norm": 1.0859375, + "learning_rate": 0.0002755077597201139, + "loss": 0.7367, + "step": 10999 + }, + { + "epoch": 0.7652440084872517, + "grad_norm": 0.82421875, + "learning_rate": 0.0002753524599196826, + "loss": 0.8037, + "step": 11000 + }, + { + "epoch": 0.7653135761243869, + "grad_norm": 1.25, + "learning_rate": 0.0002751971969132009, + "loss": 0.7853, + "step": 11001 + }, + { + "epoch": 0.7653831437615222, + "grad_norm": 1.2265625, + "learning_rate": 0.00027504197070855196, + "loss": 0.8712, + "step": 11002 + }, + { + "epoch": 0.7654527113986573, + "grad_norm": 1.0703125, + "learning_rate": 0.000274886781313618, + "loss": 0.7074, + "step": 11003 + }, + { + "epoch": 0.7655222790357925, + "grad_norm": 0.91015625, + "learning_rate": 0.0002747316287362782, + "loss": 0.8645, + "step": 11004 + }, + { + "epoch": 0.7655918466729278, + "grad_norm": 1.1328125, + "learning_rate": 0.00027457651298441055, + "loss": 0.7833, + "step": 11005 + }, + { + "epoch": 0.765661414310063, + "grad_norm": 0.98046875, + "learning_rate": 0.0002744214340658916, + "loss": 0.788, + "step": 11006 + }, + { + "epoch": 0.7657309819471981, + "grad_norm": 1.2734375, + "learning_rate": 0.0002742663919885949, + "loss": 1.0066, + "step": 11007 + }, + { + "epoch": 0.7658005495843334, + "grad_norm": 1.0234375, + "learning_rate": 0.0002741113867603927, + "loss": 0.7631, + "step": 11008 + }, + { + "epoch": 0.7658701172214686, + "grad_norm": 1.109375, + "learning_rate": 0.0002739564183891554, + "loss": 0.7219, + "step": 11009 + }, + { + "epoch": 0.7659396848586038, + "grad_norm": 0.9765625, + "learning_rate": 0.0002738014868827521, + "loss": 0.7114, + "step": 11010 + }, + { + "epoch": 0.766009252495739, + "grad_norm": 1.1015625, + "learning_rate": 0.00027364659224904885, + "loss": 0.7598, + "step": 11011 + }, + { + "epoch": 0.7660788201328742, + "grad_norm": 0.9765625, + "learning_rate": 0.0002734917344959103, + "loss": 0.8081, + "step": 11012 + }, + { + "epoch": 0.7661483877700094, + "grad_norm": 1.2421875, + "learning_rate": 0.0002733369136311995, + "loss": 0.6663, + "step": 11013 + }, + { + "epoch": 0.7662179554071445, + "grad_norm": 1.171875, + "learning_rate": 0.000273182129662778, + "loss": 0.8881, + "step": 11014 + }, + { + "epoch": 0.7662875230442798, + "grad_norm": 1.109375, + "learning_rate": 0.00027302738259850443, + "loss": 0.8484, + "step": 11015 + }, + { + "epoch": 0.766357090681415, + "grad_norm": 1.5234375, + "learning_rate": 0.0002728726724462359, + "loss": 0.8909, + "step": 11016 + }, + { + "epoch": 0.7664266583185502, + "grad_norm": 1.109375, + "learning_rate": 0.00027271799921382844, + "loss": 0.5954, + "step": 11017 + }, + { + "epoch": 0.7664962259556855, + "grad_norm": 1.0390625, + "learning_rate": 0.00027256336290913484, + "loss": 0.7347, + "step": 11018 + }, + { + "epoch": 0.7665657935928206, + "grad_norm": 1.25, + "learning_rate": 0.0002724087635400071, + "loss": 0.8684, + "step": 11019 + }, + { + "epoch": 0.7666353612299558, + "grad_norm": 1.3125, + "learning_rate": 0.00027225420111429534, + "loss": 0.8686, + "step": 11020 + }, + { + "epoch": 0.7667049288670911, + "grad_norm": 1.3671875, + "learning_rate": 0.00027209967563984717, + "loss": 0.963, + "step": 11021 + }, + { + "epoch": 0.7667744965042262, + "grad_norm": 0.82421875, + "learning_rate": 0.0002719451871245082, + "loss": 0.7989, + "step": 11022 + }, + { + "epoch": 0.7668440641413614, + "grad_norm": 1.234375, + "learning_rate": 0.000271790735576123, + "loss": 0.6866, + "step": 11023 + }, + { + "epoch": 0.7669136317784967, + "grad_norm": 0.94140625, + "learning_rate": 0.0002716363210025341, + "loss": 0.8577, + "step": 11024 + }, + { + "epoch": 0.7669831994156319, + "grad_norm": 0.83203125, + "learning_rate": 0.0002714819434115816, + "loss": 0.5434, + "step": 11025 + }, + { + "epoch": 0.767052767052767, + "grad_norm": 1.1328125, + "learning_rate": 0.0002713276028111037, + "loss": 0.8487, + "step": 11026 + }, + { + "epoch": 0.7671223346899022, + "grad_norm": 1.125, + "learning_rate": 0.0002711732992089374, + "loss": 0.8545, + "step": 11027 + }, + { + "epoch": 0.7671919023270375, + "grad_norm": 1.1015625, + "learning_rate": 0.00027101903261291763, + "loss": 0.9335, + "step": 11028 + }, + { + "epoch": 0.7672614699641727, + "grad_norm": 1.015625, + "learning_rate": 0.00027086480303087715, + "loss": 0.6288, + "step": 11029 + }, + { + "epoch": 0.7673310376013078, + "grad_norm": 1.1328125, + "learning_rate": 0.0002707106104706464, + "loss": 0.8595, + "step": 11030 + }, + { + "epoch": 0.7674006052384431, + "grad_norm": 3.515625, + "learning_rate": 0.0002705564549400551, + "loss": 0.6332, + "step": 11031 + }, + { + "epoch": 0.7674701728755783, + "grad_norm": 1.109375, + "learning_rate": 0.0002704023364469306, + "loss": 0.8574, + "step": 11032 + }, + { + "epoch": 0.7675397405127135, + "grad_norm": 0.90234375, + "learning_rate": 0.0002702482549990977, + "loss": 0.6316, + "step": 11033 + }, + { + "epoch": 0.7676093081498487, + "grad_norm": 0.90234375, + "learning_rate": 0.0002700942106043804, + "loss": 0.6021, + "step": 11034 + }, + { + "epoch": 0.7676788757869839, + "grad_norm": 1.015625, + "learning_rate": 0.0002699402032706003, + "loss": 0.5737, + "step": 11035 + }, + { + "epoch": 0.7677484434241191, + "grad_norm": 1.4140625, + "learning_rate": 0.00026978623300557647, + "loss": 0.9823, + "step": 11036 + }, + { + "epoch": 0.7678180110612544, + "grad_norm": 1.078125, + "learning_rate": 0.00026963229981712724, + "loss": 0.9057, + "step": 11037 + }, + { + "epoch": 0.7678875786983895, + "grad_norm": 1.0625, + "learning_rate": 0.00026947840371306875, + "loss": 0.6561, + "step": 11038 + }, + { + "epoch": 0.7679571463355247, + "grad_norm": 1.3046875, + "learning_rate": 0.00026932454470121484, + "loss": 0.8715, + "step": 11039 + }, + { + "epoch": 0.7680267139726599, + "grad_norm": 0.95703125, + "learning_rate": 0.0002691707227893774, + "loss": 0.7328, + "step": 11040 + }, + { + "epoch": 0.7680962816097952, + "grad_norm": 1.03125, + "learning_rate": 0.00026901693798536686, + "loss": 0.7496, + "step": 11041 + }, + { + "epoch": 0.7681658492469303, + "grad_norm": 1.2421875, + "learning_rate": 0.00026886319029699224, + "loss": 0.6002, + "step": 11042 + }, + { + "epoch": 0.7682354168840655, + "grad_norm": 1.0546875, + "learning_rate": 0.00026870947973205953, + "loss": 0.7732, + "step": 11043 + }, + { + "epoch": 0.7683049845212008, + "grad_norm": 1.0, + "learning_rate": 0.0002685558062983732, + "loss": 0.8224, + "step": 11044 + }, + { + "epoch": 0.7683745521583359, + "grad_norm": 0.96875, + "learning_rate": 0.00026840217000373624, + "loss": 0.8444, + "step": 11045 + }, + { + "epoch": 0.7684441197954711, + "grad_norm": 1.078125, + "learning_rate": 0.00026824857085594987, + "loss": 0.7547, + "step": 11046 + }, + { + "epoch": 0.7685136874326064, + "grad_norm": 1.1328125, + "learning_rate": 0.0002680950088628128, + "loss": 0.6605, + "step": 11047 + }, + { + "epoch": 0.7685832550697416, + "grad_norm": 1.265625, + "learning_rate": 0.00026794148403212184, + "loss": 0.6999, + "step": 11048 + }, + { + "epoch": 0.7686528227068767, + "grad_norm": 1.2890625, + "learning_rate": 0.00026778799637167274, + "loss": 0.7683, + "step": 11049 + }, + { + "epoch": 0.768722390344012, + "grad_norm": 1.2421875, + "learning_rate": 0.00026763454588925816, + "loss": 0.9527, + "step": 11050 + }, + { + "epoch": 0.7687919579811472, + "grad_norm": 1.34375, + "learning_rate": 0.00026748113259267005, + "loss": 0.7597, + "step": 11051 + }, + { + "epoch": 0.7688615256182824, + "grad_norm": 0.99609375, + "learning_rate": 0.0002673277564896982, + "loss": 0.6541, + "step": 11052 + }, + { + "epoch": 0.7689310932554175, + "grad_norm": 0.96484375, + "learning_rate": 0.0002671744175881299, + "loss": 0.8399, + "step": 11053 + }, + { + "epoch": 0.7690006608925528, + "grad_norm": 1.125, + "learning_rate": 0.0002670211158957506, + "loss": 0.832, + "step": 11054 + }, + { + "epoch": 0.769070228529688, + "grad_norm": 1.59375, + "learning_rate": 0.00026686785142034455, + "loss": 0.9521, + "step": 11055 + }, + { + "epoch": 0.7691397961668232, + "grad_norm": 1.1796875, + "learning_rate": 0.00026671462416969416, + "loss": 0.8878, + "step": 11056 + }, + { + "epoch": 0.7692093638039584, + "grad_norm": 1.203125, + "learning_rate": 0.00026656143415157896, + "loss": 0.7894, + "step": 11057 + }, + { + "epoch": 0.7692789314410936, + "grad_norm": 1.1796875, + "learning_rate": 0.00026640828137377713, + "loss": 0.8664, + "step": 11058 + }, + { + "epoch": 0.7693484990782288, + "grad_norm": 1.2265625, + "learning_rate": 0.00026625516584406517, + "loss": 0.9427, + "step": 11059 + }, + { + "epoch": 0.7694180667153641, + "grad_norm": 1.078125, + "learning_rate": 0.00026610208757021784, + "loss": 0.7319, + "step": 11060 + }, + { + "epoch": 0.7694876343524992, + "grad_norm": 1.0234375, + "learning_rate": 0.0002659490465600074, + "loss": 0.7949, + "step": 11061 + }, + { + "epoch": 0.7695572019896344, + "grad_norm": 1.3359375, + "learning_rate": 0.00026579604282120416, + "loss": 0.9307, + "step": 11062 + }, + { + "epoch": 0.7696267696267697, + "grad_norm": 1.1015625, + "learning_rate": 0.00026564307636157725, + "loss": 0.8738, + "step": 11063 + }, + { + "epoch": 0.7696963372639049, + "grad_norm": 1.03125, + "learning_rate": 0.00026549014718889373, + "loss": 0.7137, + "step": 11064 + }, + { + "epoch": 0.76976590490104, + "grad_norm": 1.234375, + "learning_rate": 0.0002653372553109181, + "loss": 0.8228, + "step": 11065 + }, + { + "epoch": 0.7698354725381752, + "grad_norm": 1.5078125, + "learning_rate": 0.00026518440073541394, + "loss": 0.87, + "step": 11066 + }, + { + "epoch": 0.7699050401753105, + "grad_norm": 1.1640625, + "learning_rate": 0.0002650315834701421, + "loss": 0.8271, + "step": 11067 + }, + { + "epoch": 0.7699746078124456, + "grad_norm": 0.875, + "learning_rate": 0.00026487880352286177, + "loss": 0.5988, + "step": 11068 + }, + { + "epoch": 0.7700441754495808, + "grad_norm": 1.015625, + "learning_rate": 0.0002647260609013303, + "loss": 0.8066, + "step": 11069 + }, + { + "epoch": 0.7701137430867161, + "grad_norm": 1.15625, + "learning_rate": 0.0002645733556133039, + "loss": 0.7901, + "step": 11070 + }, + { + "epoch": 0.7701833107238513, + "grad_norm": 1.3125, + "learning_rate": 0.0002644206876665356, + "loss": 0.7836, + "step": 11071 + }, + { + "epoch": 0.7702528783609864, + "grad_norm": 1.1171875, + "learning_rate": 0.00026426805706877685, + "loss": 0.9634, + "step": 11072 + }, + { + "epoch": 0.7703224459981217, + "grad_norm": 1.3828125, + "learning_rate": 0.00026411546382777793, + "loss": 1.0034, + "step": 11073 + }, + { + "epoch": 0.7703920136352569, + "grad_norm": 1.1015625, + "learning_rate": 0.00026396290795128687, + "loss": 0.6641, + "step": 11074 + }, + { + "epoch": 0.7704615812723921, + "grad_norm": 0.765625, + "learning_rate": 0.0002638103894470494, + "loss": 0.7064, + "step": 11075 + }, + { + "epoch": 0.7705311489095273, + "grad_norm": 1.03125, + "learning_rate": 0.0002636579083228093, + "loss": 0.6517, + "step": 11076 + }, + { + "epoch": 0.7706007165466625, + "grad_norm": 0.96484375, + "learning_rate": 0.0002635054645863093, + "loss": 0.7051, + "step": 11077 + }, + { + "epoch": 0.7706702841837977, + "grad_norm": 1.2578125, + "learning_rate": 0.00026335305824528985, + "loss": 1.0038, + "step": 11078 + }, + { + "epoch": 0.7707398518209329, + "grad_norm": 1.5625, + "learning_rate": 0.00026320068930748896, + "loss": 0.7383, + "step": 11079 + }, + { + "epoch": 0.7708094194580681, + "grad_norm": 1.1875, + "learning_rate": 0.0002630483577806435, + "loss": 0.8704, + "step": 11080 + }, + { + "epoch": 0.7708789870952033, + "grad_norm": 0.90234375, + "learning_rate": 0.00026289606367248784, + "loss": 0.6069, + "step": 11081 + }, + { + "epoch": 0.7709485547323385, + "grad_norm": 0.9140625, + "learning_rate": 0.0002627438069907546, + "loss": 0.6408, + "step": 11082 + }, + { + "epoch": 0.7710181223694738, + "grad_norm": 0.8125, + "learning_rate": 0.00026259158774317483, + "loss": 0.6818, + "step": 11083 + }, + { + "epoch": 0.7710876900066089, + "grad_norm": 1.15625, + "learning_rate": 0.00026243940593747764, + "loss": 0.9507, + "step": 11084 + }, + { + "epoch": 0.7711572576437441, + "grad_norm": 1.09375, + "learning_rate": 0.00026228726158138984, + "loss": 0.6779, + "step": 11085 + }, + { + "epoch": 0.7712268252808794, + "grad_norm": 1.015625, + "learning_rate": 0.00026213515468263626, + "loss": 0.6792, + "step": 11086 + }, + { + "epoch": 0.7712963929180146, + "grad_norm": 1.03125, + "learning_rate": 0.0002619830852489404, + "loss": 0.8555, + "step": 11087 + }, + { + "epoch": 0.7713659605551497, + "grad_norm": 0.921875, + "learning_rate": 0.000261831053288024, + "loss": 0.8009, + "step": 11088 + }, + { + "epoch": 0.771435528192285, + "grad_norm": 1.015625, + "learning_rate": 0.000261679058807606, + "loss": 0.861, + "step": 11089 + }, + { + "epoch": 0.7715050958294202, + "grad_norm": 1.125, + "learning_rate": 0.0002615271018154036, + "loss": 0.8647, + "step": 11090 + }, + { + "epoch": 0.7715746634665553, + "grad_norm": 1.3046875, + "learning_rate": 0.0002613751823191328, + "loss": 0.8819, + "step": 11091 + }, + { + "epoch": 0.7716442311036905, + "grad_norm": 1.09375, + "learning_rate": 0.00026122330032650774, + "loss": 0.6314, + "step": 11092 + }, + { + "epoch": 0.7717137987408258, + "grad_norm": 0.92578125, + "learning_rate": 0.0002610714558452394, + "loss": 0.7506, + "step": 11093 + }, + { + "epoch": 0.771783366377961, + "grad_norm": 0.9765625, + "learning_rate": 0.0002609196488830383, + "loss": 0.6546, + "step": 11094 + }, + { + "epoch": 0.7718529340150961, + "grad_norm": 1.3671875, + "learning_rate": 0.0002607678794476119, + "loss": 1.1312, + "step": 11095 + }, + { + "epoch": 0.7719225016522314, + "grad_norm": 1.0546875, + "learning_rate": 0.00026061614754666697, + "loss": 0.766, + "step": 11096 + }, + { + "epoch": 0.7719920692893666, + "grad_norm": 1.40625, + "learning_rate": 0.0002604644531879069, + "loss": 0.8864, + "step": 11097 + }, + { + "epoch": 0.7720616369265018, + "grad_norm": 1.21875, + "learning_rate": 0.0002603127963790347, + "loss": 0.7961, + "step": 11098 + }, + { + "epoch": 0.772131204563637, + "grad_norm": 1.125, + "learning_rate": 0.0002601611771277505, + "loss": 0.6875, + "step": 11099 + }, + { + "epoch": 0.7722007722007722, + "grad_norm": 0.85546875, + "learning_rate": 0.0002600095954417522, + "loss": 0.8267, + "step": 11100 + }, + { + "epoch": 0.7722703398379074, + "grad_norm": 1.125, + "learning_rate": 0.00025985805132873685, + "loss": 0.9563, + "step": 11101 + }, + { + "epoch": 0.7723399074750427, + "grad_norm": 1.328125, + "learning_rate": 0.0002597065447963993, + "loss": 0.7816, + "step": 11102 + }, + { + "epoch": 0.7724094751121778, + "grad_norm": 0.9453125, + "learning_rate": 0.0002595550758524322, + "loss": 0.72, + "step": 11103 + }, + { + "epoch": 0.772479042749313, + "grad_norm": 1.2734375, + "learning_rate": 0.0002594036445045258, + "loss": 0.9023, + "step": 11104 + }, + { + "epoch": 0.7725486103864482, + "grad_norm": 1.3828125, + "learning_rate": 0.0002592522507603695, + "loss": 0.8009, + "step": 11105 + }, + { + "epoch": 0.7726181780235835, + "grad_norm": 3.078125, + "learning_rate": 0.0002591008946276506, + "loss": 0.6459, + "step": 11106 + }, + { + "epoch": 0.7726877456607186, + "grad_norm": 1.0078125, + "learning_rate": 0.00025894957611405356, + "loss": 0.6435, + "step": 11107 + }, + { + "epoch": 0.7727573132978538, + "grad_norm": 0.94921875, + "learning_rate": 0.00025879829522726215, + "loss": 0.8786, + "step": 11108 + }, + { + "epoch": 0.7728268809349891, + "grad_norm": 0.890625, + "learning_rate": 0.0002586470519749571, + "loss": 0.8265, + "step": 11109 + }, + { + "epoch": 0.7728964485721243, + "grad_norm": 1.25, + "learning_rate": 0.00025849584636481826, + "loss": 1.0323, + "step": 11110 + }, + { + "epoch": 0.7729660162092594, + "grad_norm": 1.015625, + "learning_rate": 0.0002583446784045227, + "loss": 0.7944, + "step": 11111 + }, + { + "epoch": 0.7730355838463947, + "grad_norm": 0.83984375, + "learning_rate": 0.00025819354810174643, + "loss": 0.4472, + "step": 11112 + }, + { + "epoch": 0.7731051514835299, + "grad_norm": 1.03125, + "learning_rate": 0.00025804245546416274, + "loss": 0.9385, + "step": 11113 + }, + { + "epoch": 0.773174719120665, + "grad_norm": 1.1171875, + "learning_rate": 0.0002578914004994429, + "loss": 1.0097, + "step": 11114 + }, + { + "epoch": 0.7732442867578003, + "grad_norm": 1.109375, + "learning_rate": 0.0002577403832152578, + "loss": 0.9413, + "step": 11115 + }, + { + "epoch": 0.7733138543949355, + "grad_norm": 1.140625, + "learning_rate": 0.00025758940361927474, + "loss": 0.7039, + "step": 11116 + }, + { + "epoch": 0.7733834220320707, + "grad_norm": 1.34375, + "learning_rate": 0.00025743846171915973, + "loss": 0.913, + "step": 11117 + }, + { + "epoch": 0.7734529896692058, + "grad_norm": 0.97265625, + "learning_rate": 0.0002572875575225766, + "loss": 0.6947, + "step": 11118 + }, + { + "epoch": 0.7735225573063411, + "grad_norm": 1.125, + "learning_rate": 0.00025713669103718774, + "loss": 0.8442, + "step": 11119 + }, + { + "epoch": 0.7735921249434763, + "grad_norm": 1.0859375, + "learning_rate": 0.0002569858622706537, + "loss": 0.8794, + "step": 11120 + }, + { + "epoch": 0.7736616925806115, + "grad_norm": 1.234375, + "learning_rate": 0.0002568350712306322, + "loss": 0.9511, + "step": 11121 + }, + { + "epoch": 0.7737312602177467, + "grad_norm": 1.109375, + "learning_rate": 0.00025668431792478033, + "loss": 0.8524, + "step": 11122 + }, + { + "epoch": 0.7738008278548819, + "grad_norm": 0.9296875, + "learning_rate": 0.00025653360236075186, + "loss": 0.7128, + "step": 11123 + }, + { + "epoch": 0.7738703954920171, + "grad_norm": 1.3203125, + "learning_rate": 0.00025638292454619995, + "loss": 1.1436, + "step": 11124 + }, + { + "epoch": 0.7739399631291524, + "grad_norm": 1.3828125, + "learning_rate": 0.0002562322844887748, + "loss": 0.7928, + "step": 11125 + }, + { + "epoch": 0.7740095307662875, + "grad_norm": 0.82421875, + "learning_rate": 0.0002560816821961256, + "loss": 0.5738, + "step": 11126 + }, + { + "epoch": 0.7740790984034227, + "grad_norm": 1.3203125, + "learning_rate": 0.0002559311176758986, + "loss": 0.8763, + "step": 11127 + }, + { + "epoch": 0.774148666040558, + "grad_norm": 1.1953125, + "learning_rate": 0.00025578059093573946, + "loss": 0.836, + "step": 11128 + }, + { + "epoch": 0.7742182336776932, + "grad_norm": 1.2578125, + "learning_rate": 0.0002556301019832905, + "loss": 0.962, + "step": 11129 + }, + { + "epoch": 0.7742878013148283, + "grad_norm": 1.1328125, + "learning_rate": 0.0002554796508261933, + "loss": 0.8005, + "step": 11130 + }, + { + "epoch": 0.7743573689519635, + "grad_norm": 1.1953125, + "learning_rate": 0.0002553292374720868, + "loss": 0.7667, + "step": 11131 + }, + { + "epoch": 0.7744269365890988, + "grad_norm": 1.1953125, + "learning_rate": 0.00025517886192860786, + "loss": 0.7471, + "step": 11132 + }, + { + "epoch": 0.774496504226234, + "grad_norm": 1.1953125, + "learning_rate": 0.0002550285242033922, + "loss": 0.9119, + "step": 11133 + }, + { + "epoch": 0.7745660718633691, + "grad_norm": 1.1484375, + "learning_rate": 0.00025487822430407336, + "loss": 0.709, + "step": 11134 + }, + { + "epoch": 0.7746356395005044, + "grad_norm": 1.3359375, + "learning_rate": 0.00025472796223828265, + "loss": 0.9223, + "step": 11135 + }, + { + "epoch": 0.7747052071376396, + "grad_norm": 1.140625, + "learning_rate": 0.00025457773801364935, + "loss": 0.8418, + "step": 11136 + }, + { + "epoch": 0.7747747747747747, + "grad_norm": 1.3359375, + "learning_rate": 0.0002544275516378012, + "loss": 0.933, + "step": 11137 + }, + { + "epoch": 0.77484434241191, + "grad_norm": 1.0859375, + "learning_rate": 0.00025427740311836434, + "loss": 0.7165, + "step": 11138 + }, + { + "epoch": 0.7749139100490452, + "grad_norm": 1.1484375, + "learning_rate": 0.00025412729246296193, + "loss": 0.6515, + "step": 11139 + }, + { + "epoch": 0.7749834776861804, + "grad_norm": 1.34375, + "learning_rate": 0.0002539772196792164, + "loss": 0.8536, + "step": 11140 + }, + { + "epoch": 0.7750530453233156, + "grad_norm": 1.375, + "learning_rate": 0.0002538271847747472, + "loss": 0.9593, + "step": 11141 + }, + { + "epoch": 0.7751226129604508, + "grad_norm": 1.2421875, + "learning_rate": 0.00025367718775717277, + "loss": 0.6178, + "step": 11142 + }, + { + "epoch": 0.775192180597586, + "grad_norm": 1.4375, + "learning_rate": 0.0002535272286341087, + "loss": 1.0278, + "step": 11143 + }, + { + "epoch": 0.7752617482347212, + "grad_norm": 1.0859375, + "learning_rate": 0.0002533773074131699, + "loss": 0.6262, + "step": 11144 + }, + { + "epoch": 0.7753313158718564, + "grad_norm": 1.296875, + "learning_rate": 0.0002532274241019681, + "loss": 0.9228, + "step": 11145 + }, + { + "epoch": 0.7754008835089916, + "grad_norm": 0.96484375, + "learning_rate": 0.000253077578708113, + "loss": 0.7004, + "step": 11146 + }, + { + "epoch": 0.7754704511461268, + "grad_norm": 1.390625, + "learning_rate": 0.0002529277712392144, + "loss": 1.1014, + "step": 11147 + }, + { + "epoch": 0.7755400187832621, + "grad_norm": 0.89453125, + "learning_rate": 0.000252778001702878, + "loss": 0.6228, + "step": 11148 + }, + { + "epoch": 0.7756095864203972, + "grad_norm": 1.34375, + "learning_rate": 0.0002526282701067084, + "loss": 0.7733, + "step": 11149 + }, + { + "epoch": 0.7756791540575324, + "grad_norm": 1.2109375, + "learning_rate": 0.00025247857645830784, + "loss": 0.9438, + "step": 11150 + }, + { + "epoch": 0.7757487216946677, + "grad_norm": 0.94140625, + "learning_rate": 0.00025232892076527746, + "loss": 0.8475, + "step": 11151 + }, + { + "epoch": 0.7758182893318029, + "grad_norm": 0.984375, + "learning_rate": 0.0002521793030352163, + "loss": 0.9927, + "step": 11152 + }, + { + "epoch": 0.775887856968938, + "grad_norm": 1.046875, + "learning_rate": 0.0002520297232757205, + "loss": 0.6633, + "step": 11153 + }, + { + "epoch": 0.7759574246060733, + "grad_norm": 1.2265625, + "learning_rate": 0.0002518801814943855, + "loss": 0.8391, + "step": 11154 + }, + { + "epoch": 0.7760269922432085, + "grad_norm": 0.97265625, + "learning_rate": 0.00025173067769880384, + "loss": 0.7216, + "step": 11155 + }, + { + "epoch": 0.7760965598803437, + "grad_norm": 1.3203125, + "learning_rate": 0.00025158121189656715, + "loss": 0.9152, + "step": 11156 + }, + { + "epoch": 0.7761661275174788, + "grad_norm": 1.3203125, + "learning_rate": 0.0002514317840952639, + "loss": 0.7242, + "step": 11157 + }, + { + "epoch": 0.7762356951546141, + "grad_norm": 1.2421875, + "learning_rate": 0.0002512823943024819, + "loss": 0.9426, + "step": 11158 + }, + { + "epoch": 0.7763052627917493, + "grad_norm": 1.15625, + "learning_rate": 0.0002511330425258057, + "loss": 0.9526, + "step": 11159 + }, + { + "epoch": 0.7763748304288844, + "grad_norm": 0.9375, + "learning_rate": 0.00025098372877281914, + "loss": 0.7022, + "step": 11160 + }, + { + "epoch": 0.7764443980660197, + "grad_norm": 2.15625, + "learning_rate": 0.00025083445305110387, + "loss": 0.53, + "step": 11161 + }, + { + "epoch": 0.7765139657031549, + "grad_norm": 1.25, + "learning_rate": 0.00025068521536823887, + "loss": 0.7777, + "step": 11162 + }, + { + "epoch": 0.7765835333402901, + "grad_norm": 1.2421875, + "learning_rate": 0.00025053601573180186, + "loss": 0.9637, + "step": 11163 + }, + { + "epoch": 0.7766531009774253, + "grad_norm": 0.90625, + "learning_rate": 0.000250386854149368, + "loss": 0.8286, + "step": 11164 + }, + { + "epoch": 0.7767226686145605, + "grad_norm": 1.0859375, + "learning_rate": 0.0002502377306285115, + "loss": 0.919, + "step": 11165 + }, + { + "epoch": 0.7767922362516957, + "grad_norm": 1.0078125, + "learning_rate": 0.00025008864517680416, + "loss": 0.8048, + "step": 11166 + }, + { + "epoch": 0.776861803888831, + "grad_norm": 1.0625, + "learning_rate": 0.0002499395978018153, + "loss": 0.8079, + "step": 11167 + }, + { + "epoch": 0.7769313715259661, + "grad_norm": 1.0703125, + "learning_rate": 0.0002497905885111135, + "loss": 0.8159, + "step": 11168 + }, + { + "epoch": 0.7770009391631013, + "grad_norm": 1.046875, + "learning_rate": 0.00024964161731226374, + "loss": 0.6467, + "step": 11169 + }, + { + "epoch": 0.7770705068002365, + "grad_norm": 1.1328125, + "learning_rate": 0.0002494926842128311, + "loss": 0.8966, + "step": 11170 + }, + { + "epoch": 0.7771400744373718, + "grad_norm": 1.1796875, + "learning_rate": 0.00024934378922037673, + "loss": 0.8229, + "step": 11171 + }, + { + "epoch": 0.7772096420745069, + "grad_norm": 0.98828125, + "learning_rate": 0.00024919493234246137, + "loss": 0.7604, + "step": 11172 + }, + { + "epoch": 0.7772792097116421, + "grad_norm": 1.046875, + "learning_rate": 0.00024904611358664286, + "loss": 0.7434, + "step": 11173 + }, + { + "epoch": 0.7773487773487774, + "grad_norm": 1.453125, + "learning_rate": 0.0002488973329604774, + "loss": 0.9076, + "step": 11174 + }, + { + "epoch": 0.7774183449859126, + "grad_norm": 0.8984375, + "learning_rate": 0.0002487485904715201, + "loss": 0.6852, + "step": 11175 + }, + { + "epoch": 0.7774879126230477, + "grad_norm": 1.0390625, + "learning_rate": 0.0002485998861273226, + "loss": 0.9038, + "step": 11176 + }, + { + "epoch": 0.777557480260183, + "grad_norm": 1.625, + "learning_rate": 0.00024845121993543565, + "loss": 0.9135, + "step": 11177 + }, + { + "epoch": 0.7776270478973182, + "grad_norm": 1.015625, + "learning_rate": 0.000248302591903407, + "loss": 0.7006, + "step": 11178 + }, + { + "epoch": 0.7776966155344534, + "grad_norm": 0.91796875, + "learning_rate": 0.00024815400203878445, + "loss": 0.6716, + "step": 11179 + }, + { + "epoch": 0.7777661831715886, + "grad_norm": 1.1875, + "learning_rate": 0.00024800545034911226, + "loss": 0.8515, + "step": 11180 + }, + { + "epoch": 0.7778357508087238, + "grad_norm": 1.09375, + "learning_rate": 0.00024785693684193256, + "loss": 0.8551, + "step": 11181 + }, + { + "epoch": 0.777905318445859, + "grad_norm": 1.09375, + "learning_rate": 0.0002477084615247868, + "loss": 1.0675, + "step": 11182 + }, + { + "epoch": 0.7779748860829941, + "grad_norm": 1.3359375, + "learning_rate": 0.0002475600244052133, + "loss": 0.8185, + "step": 11183 + }, + { + "epoch": 0.7780444537201294, + "grad_norm": 1.0, + "learning_rate": 0.0002474116254907495, + "loss": 0.7336, + "step": 11184 + }, + { + "epoch": 0.7781140213572646, + "grad_norm": 1.1875, + "learning_rate": 0.00024726326478892956, + "loss": 0.8614, + "step": 11185 + }, + { + "epoch": 0.7781835889943998, + "grad_norm": 1.296875, + "learning_rate": 0.00024711494230728737, + "loss": 0.8187, + "step": 11186 + }, + { + "epoch": 0.778253156631535, + "grad_norm": 0.91015625, + "learning_rate": 0.0002469666580533534, + "loss": 0.6623, + "step": 11187 + }, + { + "epoch": 0.7783227242686702, + "grad_norm": 0.94921875, + "learning_rate": 0.0002468184120346568, + "loss": 0.7252, + "step": 11188 + }, + { + "epoch": 0.7783922919058054, + "grad_norm": 1.0703125, + "learning_rate": 0.0002466702042587253, + "loss": 0.6136, + "step": 11189 + }, + { + "epoch": 0.7784618595429407, + "grad_norm": 1.2734375, + "learning_rate": 0.00024652203473308375, + "loss": 1.0125, + "step": 11190 + }, + { + "epoch": 0.7785314271800758, + "grad_norm": 1.171875, + "learning_rate": 0.00024637390346525544, + "loss": 0.8981, + "step": 11191 + }, + { + "epoch": 0.778600994817211, + "grad_norm": 1.5078125, + "learning_rate": 0.0002462258104627612, + "loss": 1.1634, + "step": 11192 + }, + { + "epoch": 0.7786705624543463, + "grad_norm": 1.1484375, + "learning_rate": 0.0002460777557331215, + "loss": 0.8263, + "step": 11193 + }, + { + "epoch": 0.7787401300914815, + "grad_norm": 1.0078125, + "learning_rate": 0.0002459297392838534, + "loss": 0.7874, + "step": 11194 + }, + { + "epoch": 0.7788096977286166, + "grad_norm": 1.359375, + "learning_rate": 0.0002457817611224721, + "loss": 0.9767, + "step": 11195 + }, + { + "epoch": 0.7788792653657518, + "grad_norm": 1.1328125, + "learning_rate": 0.00024563382125649167, + "loss": 0.7474, + "step": 11196 + }, + { + "epoch": 0.7789488330028871, + "grad_norm": 1.0859375, + "learning_rate": 0.00024548591969342313, + "loss": 0.8021, + "step": 11197 + }, + { + "epoch": 0.7790184006400223, + "grad_norm": 1.125, + "learning_rate": 0.000245338056440777, + "loss": 0.8787, + "step": 11198 + }, + { + "epoch": 0.7790879682771574, + "grad_norm": 1.7265625, + "learning_rate": 0.00024519023150606026, + "loss": 0.7636, + "step": 11199 + }, + { + "epoch": 0.7791575359142927, + "grad_norm": 1.0390625, + "learning_rate": 0.0002450424448967793, + "loss": 0.6921, + "step": 11200 + }, + { + "epoch": 0.7792271035514279, + "grad_norm": 0.9296875, + "learning_rate": 0.0002448946966204374, + "loss": 0.548, + "step": 11201 + }, + { + "epoch": 0.779296671188563, + "grad_norm": 1.2890625, + "learning_rate": 0.0002447469866845371, + "loss": 1.0118, + "step": 11202 + }, + { + "epoch": 0.7793662388256983, + "grad_norm": 1.109375, + "learning_rate": 0.00024459931509657776, + "loss": 0.8319, + "step": 11203 + }, + { + "epoch": 0.7794358064628335, + "grad_norm": 1.1171875, + "learning_rate": 0.00024445168186405797, + "loss": 0.794, + "step": 11204 + }, + { + "epoch": 0.7795053740999687, + "grad_norm": 0.95703125, + "learning_rate": 0.00024430408699447324, + "loss": 0.6468, + "step": 11205 + }, + { + "epoch": 0.779574941737104, + "grad_norm": 1.0625, + "learning_rate": 0.00024415653049531807, + "loss": 0.6303, + "step": 11206 + }, + { + "epoch": 0.7796445093742391, + "grad_norm": 1.2421875, + "learning_rate": 0.0002440090123740848, + "loss": 0.7152, + "step": 11207 + }, + { + "epoch": 0.7797140770113743, + "grad_norm": 1.1484375, + "learning_rate": 0.00024386153263826339, + "loss": 0.8969, + "step": 11208 + }, + { + "epoch": 0.7797836446485095, + "grad_norm": 1.234375, + "learning_rate": 0.00024371409129534205, + "loss": 0.9184, + "step": 11209 + }, + { + "epoch": 0.7798532122856447, + "grad_norm": 1.0546875, + "learning_rate": 0.0002435666883528067, + "loss": 0.9784, + "step": 11210 + }, + { + "epoch": 0.7799227799227799, + "grad_norm": 0.9453125, + "learning_rate": 0.0002434193238181428, + "loss": 0.7509, + "step": 11211 + }, + { + "epoch": 0.7799923475599151, + "grad_norm": 1.265625, + "learning_rate": 0.00024327199769883222, + "loss": 0.6518, + "step": 11212 + }, + { + "epoch": 0.7800619151970504, + "grad_norm": 1.25, + "learning_rate": 0.00024312471000235503, + "loss": 0.8756, + "step": 11213 + }, + { + "epoch": 0.7801314828341855, + "grad_norm": 0.98046875, + "learning_rate": 0.00024297746073619043, + "loss": 1.0232, + "step": 11214 + }, + { + "epoch": 0.7802010504713207, + "grad_norm": 1.09375, + "learning_rate": 0.00024283024990781444, + "loss": 0.7334, + "step": 11215 + }, + { + "epoch": 0.780270618108456, + "grad_norm": 1.2109375, + "learning_rate": 0.0002426830775247022, + "loss": 0.8859, + "step": 11216 + }, + { + "epoch": 0.7803401857455912, + "grad_norm": 1.203125, + "learning_rate": 0.00024253594359432585, + "loss": 0.7879, + "step": 11217 + }, + { + "epoch": 0.7804097533827263, + "grad_norm": 1.4609375, + "learning_rate": 0.00024238884812415674, + "loss": 0.9066, + "step": 11218 + }, + { + "epoch": 0.7804793210198615, + "grad_norm": 1.1015625, + "learning_rate": 0.0002422417911216629, + "loss": 0.7788, + "step": 11219 + }, + { + "epoch": 0.7805488886569968, + "grad_norm": 0.9453125, + "learning_rate": 0.00024209477259431157, + "loss": 0.5782, + "step": 11220 + }, + { + "epoch": 0.780618456294132, + "grad_norm": 0.97265625, + "learning_rate": 0.00024194779254956778, + "loss": 0.6061, + "step": 11221 + }, + { + "epoch": 0.7806880239312671, + "grad_norm": 1.5625, + "learning_rate": 0.00024180085099489423, + "loss": 1.0929, + "step": 11222 + }, + { + "epoch": 0.7807575915684024, + "grad_norm": 0.85546875, + "learning_rate": 0.00024165394793775196, + "loss": 0.5921, + "step": 11223 + }, + { + "epoch": 0.7808271592055376, + "grad_norm": 1.1953125, + "learning_rate": 0.00024150708338559922, + "loss": 0.6992, + "step": 11224 + }, + { + "epoch": 0.7808967268426728, + "grad_norm": 1.0, + "learning_rate": 0.00024136025734589428, + "loss": 0.7722, + "step": 11225 + }, + { + "epoch": 0.780966294479808, + "grad_norm": 1.1015625, + "learning_rate": 0.0002412134698260916, + "loss": 0.8283, + "step": 11226 + }, + { + "epoch": 0.7810358621169432, + "grad_norm": 1.109375, + "learning_rate": 0.00024106672083364412, + "loss": 0.8134, + "step": 11227 + }, + { + "epoch": 0.7811054297540784, + "grad_norm": 1.5546875, + "learning_rate": 0.00024092001037600354, + "loss": 0.8106, + "step": 11228 + }, + { + "epoch": 0.7811749973912137, + "grad_norm": 1.1875, + "learning_rate": 0.00024077333846061856, + "loss": 0.5998, + "step": 11229 + }, + { + "epoch": 0.7812445650283488, + "grad_norm": 1.15625, + "learning_rate": 0.0002406267050949369, + "loss": 0.7085, + "step": 11230 + }, + { + "epoch": 0.781314132665484, + "grad_norm": 1.203125, + "learning_rate": 0.00024048011028640328, + "loss": 0.9104, + "step": 11231 + }, + { + "epoch": 0.7813837003026192, + "grad_norm": 1.15625, + "learning_rate": 0.00024033355404246172, + "loss": 0.8226, + "step": 11232 + }, + { + "epoch": 0.7814532679397544, + "grad_norm": 1.2265625, + "learning_rate": 0.00024018703637055305, + "loss": 0.809, + "step": 11233 + }, + { + "epoch": 0.7815228355768896, + "grad_norm": 1.0078125, + "learning_rate": 0.0002400405572781168, + "loss": 0.7225, + "step": 11234 + }, + { + "epoch": 0.7815924032140248, + "grad_norm": 0.9921875, + "learning_rate": 0.000239894116772591, + "loss": 0.7899, + "step": 11235 + }, + { + "epoch": 0.7816619708511601, + "grad_norm": 1.1328125, + "learning_rate": 0.00023974771486141066, + "loss": 0.8799, + "step": 11236 + }, + { + "epoch": 0.7817315384882952, + "grad_norm": 1.1015625, + "learning_rate": 0.00023960135155200914, + "loss": 0.8214, + "step": 11237 + }, + { + "epoch": 0.7818011061254304, + "grad_norm": 1.09375, + "learning_rate": 0.0002394550268518183, + "loss": 0.7401, + "step": 11238 + }, + { + "epoch": 0.7818706737625657, + "grad_norm": 1.234375, + "learning_rate": 0.00023930874076826802, + "loss": 0.823, + "step": 11239 + }, + { + "epoch": 0.7819402413997009, + "grad_norm": 0.984375, + "learning_rate": 0.00023916249330878581, + "loss": 0.5478, + "step": 11240 + }, + { + "epoch": 0.782009809036836, + "grad_norm": 1.3671875, + "learning_rate": 0.00023901628448079693, + "loss": 0.804, + "step": 11241 + }, + { + "epoch": 0.7820793766739713, + "grad_norm": 0.9921875, + "learning_rate": 0.00023887011429172568, + "loss": 0.8168, + "step": 11242 + }, + { + "epoch": 0.7821489443111065, + "grad_norm": 1.15625, + "learning_rate": 0.00023872398274899344, + "loss": 0.8006, + "step": 11243 + }, + { + "epoch": 0.7822185119482417, + "grad_norm": 0.83984375, + "learning_rate": 0.0002385778898600206, + "loss": 0.5458, + "step": 11244 + }, + { + "epoch": 0.7822880795853768, + "grad_norm": 0.96484375, + "learning_rate": 0.00023843183563222425, + "loss": 0.9043, + "step": 11245 + }, + { + "epoch": 0.7823576472225121, + "grad_norm": 1.015625, + "learning_rate": 0.00023828582007302102, + "loss": 0.9319, + "step": 11246 + }, + { + "epoch": 0.7824272148596473, + "grad_norm": 0.9296875, + "learning_rate": 0.00023813984318982428, + "loss": 0.6755, + "step": 11247 + }, + { + "epoch": 0.7824967824967825, + "grad_norm": 1.09375, + "learning_rate": 0.00023799390499004626, + "loss": 0.7779, + "step": 11248 + }, + { + "epoch": 0.7825663501339177, + "grad_norm": 1.1171875, + "learning_rate": 0.0002378480054810972, + "loss": 0.7601, + "step": 11249 + }, + { + "epoch": 0.7826359177710529, + "grad_norm": 1.0, + "learning_rate": 0.00023770214467038487, + "loss": 0.6702, + "step": 11250 + }, + { + "epoch": 0.7827054854081881, + "grad_norm": 0.94921875, + "learning_rate": 0.00023755632256531513, + "loss": 0.6623, + "step": 11251 + }, + { + "epoch": 0.7827750530453234, + "grad_norm": 1.203125, + "learning_rate": 0.00023741053917329224, + "loss": 0.868, + "step": 11252 + }, + { + "epoch": 0.7828446206824585, + "grad_norm": 1.0234375, + "learning_rate": 0.00023726479450171878, + "loss": 0.6187, + "step": 11253 + }, + { + "epoch": 0.7829141883195937, + "grad_norm": 1.109375, + "learning_rate": 0.0002371190885579946, + "loss": 1.0516, + "step": 11254 + }, + { + "epoch": 0.782983755956729, + "grad_norm": 1.4296875, + "learning_rate": 0.0002369734213495176, + "loss": 1.0133, + "step": 11255 + }, + { + "epoch": 0.7830533235938641, + "grad_norm": 1.109375, + "learning_rate": 0.00023682779288368438, + "loss": 0.8297, + "step": 11256 + }, + { + "epoch": 0.7831228912309993, + "grad_norm": 0.80859375, + "learning_rate": 0.00023668220316788935, + "loss": 0.5686, + "step": 11257 + }, + { + "epoch": 0.7831924588681345, + "grad_norm": 1.0859375, + "learning_rate": 0.0002365366522095247, + "loss": 0.82, + "step": 11258 + }, + { + "epoch": 0.7832620265052698, + "grad_norm": 1.03125, + "learning_rate": 0.00023639114001598038, + "loss": 0.6162, + "step": 11259 + }, + { + "epoch": 0.7833315941424049, + "grad_norm": 1.234375, + "learning_rate": 0.00023624566659464542, + "loss": 0.7459, + "step": 11260 + }, + { + "epoch": 0.7834011617795401, + "grad_norm": 1.109375, + "learning_rate": 0.00023610023195290563, + "loss": 0.6846, + "step": 11261 + }, + { + "epoch": 0.7834707294166754, + "grad_norm": 1.515625, + "learning_rate": 0.0002359548360981457, + "loss": 0.9254, + "step": 11262 + }, + { + "epoch": 0.7835402970538106, + "grad_norm": 1.1171875, + "learning_rate": 0.0002358094790377484, + "loss": 0.8209, + "step": 11263 + }, + { + "epoch": 0.7836098646909457, + "grad_norm": 1.46875, + "learning_rate": 0.0002356641607790939, + "loss": 1.0074, + "step": 11264 + }, + { + "epoch": 0.783679432328081, + "grad_norm": 1.1640625, + "learning_rate": 0.00023551888132956056, + "loss": 0.7177, + "step": 11265 + }, + { + "epoch": 0.7837489999652162, + "grad_norm": 0.76953125, + "learning_rate": 0.00023537364069652511, + "loss": 0.4447, + "step": 11266 + }, + { + "epoch": 0.7838185676023514, + "grad_norm": 0.953125, + "learning_rate": 0.00023522843888736257, + "loss": 0.7159, + "step": 11267 + }, + { + "epoch": 0.7838881352394866, + "grad_norm": 1.046875, + "learning_rate": 0.000235083275909445, + "loss": 0.7927, + "step": 11268 + }, + { + "epoch": 0.7839577028766218, + "grad_norm": 1.0859375, + "learning_rate": 0.0002349381517701431, + "loss": 0.7813, + "step": 11269 + }, + { + "epoch": 0.784027270513757, + "grad_norm": 1.3046875, + "learning_rate": 0.00023479306647682552, + "loss": 0.8981, + "step": 11270 + }, + { + "epoch": 0.7840968381508922, + "grad_norm": 1.34375, + "learning_rate": 0.00023464802003685947, + "loss": 0.9437, + "step": 11271 + }, + { + "epoch": 0.7841664057880274, + "grad_norm": 0.9765625, + "learning_rate": 0.0002345030124576093, + "loss": 0.7387, + "step": 11272 + }, + { + "epoch": 0.7842359734251626, + "grad_norm": 1.1640625, + "learning_rate": 0.00023435804374643743, + "loss": 0.9306, + "step": 11273 + }, + { + "epoch": 0.7843055410622978, + "grad_norm": 0.9296875, + "learning_rate": 0.00023421311391070532, + "loss": 0.6672, + "step": 11274 + }, + { + "epoch": 0.784375108699433, + "grad_norm": 1.109375, + "learning_rate": 0.00023406822295777107, + "loss": 1.0116, + "step": 11275 + }, + { + "epoch": 0.7844446763365682, + "grad_norm": 1.3203125, + "learning_rate": 0.00023392337089499194, + "loss": 0.7879, + "step": 11276 + }, + { + "epoch": 0.7845142439737034, + "grad_norm": 1.171875, + "learning_rate": 0.000233778557729723, + "loss": 0.7684, + "step": 11277 + }, + { + "epoch": 0.7845838116108387, + "grad_norm": 1.21875, + "learning_rate": 0.00023363378346931684, + "loss": 0.7108, + "step": 11278 + }, + { + "epoch": 0.7846533792479738, + "grad_norm": 1.1796875, + "learning_rate": 0.00023348904812112403, + "loss": 0.7429, + "step": 11279 + }, + { + "epoch": 0.784722946885109, + "grad_norm": 1.265625, + "learning_rate": 0.00023334435169249402, + "loss": 0.7404, + "step": 11280 + }, + { + "epoch": 0.7847925145222443, + "grad_norm": 1.34375, + "learning_rate": 0.0002331996941907738, + "loss": 0.842, + "step": 11281 + }, + { + "epoch": 0.7848620821593795, + "grad_norm": 1.0859375, + "learning_rate": 0.00023305507562330807, + "loss": 0.8211, + "step": 11282 + }, + { + "epoch": 0.7849316497965146, + "grad_norm": 1.375, + "learning_rate": 0.00023291049599743975, + "loss": 0.8249, + "step": 11283 + }, + { + "epoch": 0.7850012174336498, + "grad_norm": 0.98828125, + "learning_rate": 0.0002327659553205099, + "loss": 0.6402, + "step": 11284 + }, + { + "epoch": 0.7850707850707851, + "grad_norm": 1.5234375, + "learning_rate": 0.00023262145359985808, + "loss": 0.6879, + "step": 11285 + }, + { + "epoch": 0.7851403527079203, + "grad_norm": 1.125, + "learning_rate": 0.00023247699084282092, + "loss": 0.69, + "step": 11286 + }, + { + "epoch": 0.7852099203450554, + "grad_norm": 0.984375, + "learning_rate": 0.00023233256705673333, + "loss": 0.6935, + "step": 11287 + }, + { + "epoch": 0.7852794879821907, + "grad_norm": 1.0078125, + "learning_rate": 0.00023218818224892868, + "loss": 0.6691, + "step": 11288 + }, + { + "epoch": 0.7853490556193259, + "grad_norm": 0.95703125, + "learning_rate": 0.0002320438364267383, + "loss": 0.9906, + "step": 11289 + }, + { + "epoch": 0.7854186232564611, + "grad_norm": 1.078125, + "learning_rate": 0.00023189952959749106, + "loss": 0.7326, + "step": 11290 + }, + { + "epoch": 0.7854881908935963, + "grad_norm": 1.15625, + "learning_rate": 0.00023175526176851403, + "loss": 0.7043, + "step": 11291 + }, + { + "epoch": 0.7855577585307315, + "grad_norm": 1.0546875, + "learning_rate": 0.00023161103294713282, + "loss": 0.7164, + "step": 11292 + }, + { + "epoch": 0.7856273261678667, + "grad_norm": 1.0234375, + "learning_rate": 0.00023146684314067002, + "loss": 1.0191, + "step": 11293 + }, + { + "epoch": 0.785696893805002, + "grad_norm": 1.234375, + "learning_rate": 0.00023132269235644733, + "loss": 0.9472, + "step": 11294 + }, + { + "epoch": 0.7857664614421371, + "grad_norm": 0.99609375, + "learning_rate": 0.0002311785806017842, + "loss": 0.6708, + "step": 11295 + }, + { + "epoch": 0.7858360290792723, + "grad_norm": 1.1328125, + "learning_rate": 0.0002310345078839975, + "loss": 0.707, + "step": 11296 + }, + { + "epoch": 0.7859055967164075, + "grad_norm": 0.8984375, + "learning_rate": 0.00023089047421040243, + "loss": 0.5383, + "step": 11297 + }, + { + "epoch": 0.7859751643535428, + "grad_norm": 1.296875, + "learning_rate": 0.0002307464795883124, + "loss": 0.7086, + "step": 11298 + }, + { + "epoch": 0.7860447319906779, + "grad_norm": 1.03125, + "learning_rate": 0.00023060252402503913, + "loss": 0.6749, + "step": 11299 + }, + { + "epoch": 0.7861142996278131, + "grad_norm": 1.1640625, + "learning_rate": 0.0002304586075278916, + "loss": 0.8607, + "step": 11300 + }, + { + "epoch": 0.7861838672649484, + "grad_norm": 1.125, + "learning_rate": 0.00023031473010417703, + "loss": 0.8593, + "step": 11301 + }, + { + "epoch": 0.7862534349020835, + "grad_norm": 0.8828125, + "learning_rate": 0.00023017089176120088, + "loss": 0.6794, + "step": 11302 + }, + { + "epoch": 0.7863230025392187, + "grad_norm": 1.3828125, + "learning_rate": 0.00023002709250626686, + "loss": 0.9, + "step": 11303 + }, + { + "epoch": 0.786392570176354, + "grad_norm": 1.3828125, + "learning_rate": 0.00022988333234667626, + "loss": 0.8108, + "step": 11304 + }, + { + "epoch": 0.7864621378134892, + "grad_norm": 1.34375, + "learning_rate": 0.00022973961128972797, + "loss": 0.6458, + "step": 11305 + }, + { + "epoch": 0.7865317054506243, + "grad_norm": 1.046875, + "learning_rate": 0.0002295959293427201, + "loss": 0.7002, + "step": 11306 + }, + { + "epoch": 0.7866012730877596, + "grad_norm": 1.046875, + "learning_rate": 0.0002294522865129476, + "loss": 0.8288, + "step": 11307 + }, + { + "epoch": 0.7866708407248948, + "grad_norm": 1.140625, + "learning_rate": 0.00022930868280770413, + "loss": 0.841, + "step": 11308 + }, + { + "epoch": 0.78674040836203, + "grad_norm": 0.99609375, + "learning_rate": 0.00022916511823428142, + "loss": 0.6988, + "step": 11309 + }, + { + "epoch": 0.7868099759991651, + "grad_norm": 0.93359375, + "learning_rate": 0.00022902159279996871, + "loss": 0.6887, + "step": 11310 + }, + { + "epoch": 0.7868795436363004, + "grad_norm": 1.09375, + "learning_rate": 0.00022887810651205331, + "loss": 0.9695, + "step": 11311 + }, + { + "epoch": 0.7869491112734356, + "grad_norm": 1.2109375, + "learning_rate": 0.00022873465937782079, + "loss": 0.7344, + "step": 11312 + }, + { + "epoch": 0.7870186789105708, + "grad_norm": 1.09375, + "learning_rate": 0.00022859125140455515, + "loss": 0.9219, + "step": 11313 + }, + { + "epoch": 0.787088246547706, + "grad_norm": 1.25, + "learning_rate": 0.00022844788259953765, + "loss": 0.923, + "step": 11314 + }, + { + "epoch": 0.7871578141848412, + "grad_norm": 1.2578125, + "learning_rate": 0.00022830455297004738, + "loss": 0.7885, + "step": 11315 + }, + { + "epoch": 0.7872273818219764, + "grad_norm": 1.1953125, + "learning_rate": 0.00022816126252336223, + "loss": 0.9752, + "step": 11316 + }, + { + "epoch": 0.7872969494591117, + "grad_norm": 0.8828125, + "learning_rate": 0.00022801801126675814, + "loss": 0.7138, + "step": 11317 + }, + { + "epoch": 0.7873665170962468, + "grad_norm": 1.1796875, + "learning_rate": 0.00022787479920750842, + "loss": 0.9382, + "step": 11318 + }, + { + "epoch": 0.787436084733382, + "grad_norm": 1.0390625, + "learning_rate": 0.00022773162635288425, + "loss": 0.7596, + "step": 11319 + }, + { + "epoch": 0.7875056523705173, + "grad_norm": 1.1015625, + "learning_rate": 0.0002275884927101557, + "loss": 0.6886, + "step": 11320 + }, + { + "epoch": 0.7875752200076525, + "grad_norm": 1.234375, + "learning_rate": 0.0002274453982865904, + "loss": 0.6172, + "step": 11321 + }, + { + "epoch": 0.7876447876447876, + "grad_norm": 1.09375, + "learning_rate": 0.00022730234308945352, + "loss": 0.5757, + "step": 11322 + }, + { + "epoch": 0.7877143552819228, + "grad_norm": 1.015625, + "learning_rate": 0.00022715932712600928, + "loss": 0.8766, + "step": 11323 + }, + { + "epoch": 0.7877839229190581, + "grad_norm": 1.296875, + "learning_rate": 0.00022701635040351897, + "loss": 0.6226, + "step": 11324 + }, + { + "epoch": 0.7878534905561932, + "grad_norm": 0.8984375, + "learning_rate": 0.00022687341292924212, + "loss": 0.5474, + "step": 11325 + }, + { + "epoch": 0.7879230581933284, + "grad_norm": 1.3515625, + "learning_rate": 0.00022673051471043637, + "loss": 0.6295, + "step": 11326 + }, + { + "epoch": 0.7879926258304637, + "grad_norm": 1.1640625, + "learning_rate": 0.00022658765575435792, + "loss": 0.7875, + "step": 11327 + }, + { + "epoch": 0.7880621934675989, + "grad_norm": 1.1484375, + "learning_rate": 0.00022644483606825994, + "loss": 0.7761, + "step": 11328 + }, + { + "epoch": 0.788131761104734, + "grad_norm": 1.203125, + "learning_rate": 0.00022630205565939387, + "loss": 0.8934, + "step": 11329 + }, + { + "epoch": 0.7882013287418693, + "grad_norm": 1.078125, + "learning_rate": 0.00022615931453500972, + "loss": 0.9277, + "step": 11330 + }, + { + "epoch": 0.7882708963790045, + "grad_norm": 1.1171875, + "learning_rate": 0.0002260166127023554, + "loss": 0.5723, + "step": 11331 + }, + { + "epoch": 0.7883404640161397, + "grad_norm": 1.0703125, + "learning_rate": 0.0002258739501686763, + "loss": 0.7372, + "step": 11332 + }, + { + "epoch": 0.788410031653275, + "grad_norm": 1.1328125, + "learning_rate": 0.0002257313269412159, + "loss": 0.7885, + "step": 11333 + }, + { + "epoch": 0.7884795992904101, + "grad_norm": 1.2109375, + "learning_rate": 0.0002255887430272161, + "loss": 0.7335, + "step": 11334 + }, + { + "epoch": 0.7885491669275453, + "grad_norm": 1.390625, + "learning_rate": 0.0002254461984339169, + "loss": 1.1112, + "step": 11335 + }, + { + "epoch": 0.7886187345646805, + "grad_norm": 0.98828125, + "learning_rate": 0.00022530369316855537, + "loss": 0.7953, + "step": 11336 + }, + { + "epoch": 0.7886883022018157, + "grad_norm": 1.140625, + "learning_rate": 0.00022516122723836786, + "loss": 0.5744, + "step": 11337 + }, + { + "epoch": 0.7887578698389509, + "grad_norm": 1.046875, + "learning_rate": 0.00022501880065058777, + "loss": 0.859, + "step": 11338 + }, + { + "epoch": 0.7888274374760861, + "grad_norm": 0.8984375, + "learning_rate": 0.00022487641341244647, + "loss": 0.7815, + "step": 11339 + }, + { + "epoch": 0.7888970051132214, + "grad_norm": 1.0859375, + "learning_rate": 0.00022473406553117403, + "loss": 0.7874, + "step": 11340 + }, + { + "epoch": 0.7889665727503565, + "grad_norm": 0.95703125, + "learning_rate": 0.00022459175701399837, + "loss": 0.8343, + "step": 11341 + }, + { + "epoch": 0.7890361403874917, + "grad_norm": 1.0, + "learning_rate": 0.00022444948786814502, + "loss": 0.7663, + "step": 11342 + }, + { + "epoch": 0.789105708024627, + "grad_norm": 1.203125, + "learning_rate": 0.00022430725810083718, + "loss": 0.7282, + "step": 11343 + }, + { + "epoch": 0.7891752756617622, + "grad_norm": 1.1640625, + "learning_rate": 0.00022416506771929712, + "loss": 0.7486, + "step": 11344 + }, + { + "epoch": 0.7892448432988973, + "grad_norm": 1.203125, + "learning_rate": 0.0002240229167307446, + "loss": 0.8693, + "step": 11345 + }, + { + "epoch": 0.7893144109360326, + "grad_norm": 0.8984375, + "learning_rate": 0.00022388080514239718, + "loss": 0.8566, + "step": 11346 + }, + { + "epoch": 0.7893839785731678, + "grad_norm": 1.140625, + "learning_rate": 0.0002237387329614703, + "loss": 0.5645, + "step": 11347 + }, + { + "epoch": 0.789453546210303, + "grad_norm": 1.0703125, + "learning_rate": 0.00022359670019517797, + "loss": 0.8409, + "step": 11348 + }, + { + "epoch": 0.7895231138474381, + "grad_norm": 1.0390625, + "learning_rate": 0.000223454706850732, + "loss": 0.8261, + "step": 11349 + }, + { + "epoch": 0.7895926814845734, + "grad_norm": 0.75390625, + "learning_rate": 0.0002233127529353417, + "loss": 0.5983, + "step": 11350 + }, + { + "epoch": 0.7896622491217086, + "grad_norm": 0.9453125, + "learning_rate": 0.00022317083845621534, + "loss": 0.8145, + "step": 11351 + }, + { + "epoch": 0.7897318167588437, + "grad_norm": 1.5546875, + "learning_rate": 0.00022302896342055802, + "loss": 0.9216, + "step": 11352 + }, + { + "epoch": 0.789801384395979, + "grad_norm": 1.046875, + "learning_rate": 0.00022288712783557387, + "loss": 0.7054, + "step": 11353 + }, + { + "epoch": 0.7898709520331142, + "grad_norm": 0.89453125, + "learning_rate": 0.00022274533170846424, + "loss": 0.6841, + "step": 11354 + }, + { + "epoch": 0.7899405196702494, + "grad_norm": 0.9453125, + "learning_rate": 0.00022260357504642924, + "loss": 0.811, + "step": 11355 + }, + { + "epoch": 0.7900100873073846, + "grad_norm": 1.015625, + "learning_rate": 0.0002224618578566664, + "loss": 0.7281, + "step": 11356 + }, + { + "epoch": 0.7900796549445198, + "grad_norm": 1.171875, + "learning_rate": 0.00022232018014637102, + "loss": 0.8313, + "step": 11357 + }, + { + "epoch": 0.790149222581655, + "grad_norm": 1.0703125, + "learning_rate": 0.0002221785419227371, + "loss": 0.7241, + "step": 11358 + }, + { + "epoch": 0.7902187902187903, + "grad_norm": 0.984375, + "learning_rate": 0.00022203694319295665, + "loss": 0.9019, + "step": 11359 + }, + { + "epoch": 0.7902883578559254, + "grad_norm": 1.0625, + "learning_rate": 0.00022189538396421893, + "loss": 0.867, + "step": 11360 + }, + { + "epoch": 0.7903579254930606, + "grad_norm": 1.1015625, + "learning_rate": 0.00022175386424371136, + "loss": 0.8799, + "step": 11361 + }, + { + "epoch": 0.7904274931301958, + "grad_norm": 1.15625, + "learning_rate": 0.00022161238403861993, + "loss": 0.991, + "step": 11362 + }, + { + "epoch": 0.7904970607673311, + "grad_norm": 1.21875, + "learning_rate": 0.0002214709433561286, + "loss": 1.1083, + "step": 11363 + }, + { + "epoch": 0.7905666284044662, + "grad_norm": 1.1875, + "learning_rate": 0.00022132954220341873, + "loss": 0.6552, + "step": 11364 + }, + { + "epoch": 0.7906361960416014, + "grad_norm": 1.1484375, + "learning_rate": 0.00022118818058766953, + "loss": 0.6947, + "step": 11365 + }, + { + "epoch": 0.7907057636787367, + "grad_norm": 1.2578125, + "learning_rate": 0.0002210468585160591, + "loss": 0.8956, + "step": 11366 + }, + { + "epoch": 0.7907753313158719, + "grad_norm": 1.140625, + "learning_rate": 0.0002209055759957632, + "loss": 0.762, + "step": 11367 + }, + { + "epoch": 0.790844898953007, + "grad_norm": 1.21875, + "learning_rate": 0.00022076433303395504, + "loss": 0.9421, + "step": 11368 + }, + { + "epoch": 0.7909144665901423, + "grad_norm": 0.9609375, + "learning_rate": 0.00022062312963780663, + "loss": 0.5986, + "step": 11369 + }, + { + "epoch": 0.7909840342272775, + "grad_norm": 0.97265625, + "learning_rate": 0.00022048196581448732, + "loss": 0.7251, + "step": 11370 + }, + { + "epoch": 0.7910536018644126, + "grad_norm": 1.0234375, + "learning_rate": 0.0002203408415711644, + "loss": 0.6971, + "step": 11371 + }, + { + "epoch": 0.7911231695015479, + "grad_norm": 1.453125, + "learning_rate": 0.00022019975691500382, + "loss": 0.9502, + "step": 11372 + }, + { + "epoch": 0.7911927371386831, + "grad_norm": 0.78125, + "learning_rate": 0.0002200587118531694, + "loss": 0.6688, + "step": 11373 + }, + { + "epoch": 0.7912623047758183, + "grad_norm": 1.0078125, + "learning_rate": 0.00021991770639282238, + "loss": 0.7167, + "step": 11374 + }, + { + "epoch": 0.7913318724129534, + "grad_norm": 1.09375, + "learning_rate": 0.00021977674054112205, + "loss": 0.8944, + "step": 11375 + }, + { + "epoch": 0.7914014400500887, + "grad_norm": 1.2109375, + "learning_rate": 0.00021963581430522628, + "loss": 0.5945, + "step": 11376 + }, + { + "epoch": 0.7914710076872239, + "grad_norm": 0.91015625, + "learning_rate": 0.00021949492769229073, + "loss": 0.5457, + "step": 11377 + }, + { + "epoch": 0.7915405753243591, + "grad_norm": 1.3359375, + "learning_rate": 0.0002193540807094687, + "loss": 0.9095, + "step": 11378 + }, + { + "epoch": 0.7916101429614943, + "grad_norm": 1.0234375, + "learning_rate": 0.0002192132733639115, + "loss": 0.8057, + "step": 11379 + }, + { + "epoch": 0.7916797105986295, + "grad_norm": 1.03125, + "learning_rate": 0.00021907250566276882, + "loss": 0.6894, + "step": 11380 + }, + { + "epoch": 0.7917492782357647, + "grad_norm": 0.921875, + "learning_rate": 0.0002189317776131884, + "loss": 0.586, + "step": 11381 + }, + { + "epoch": 0.7918188458729, + "grad_norm": 1.1484375, + "learning_rate": 0.00021879108922231516, + "loss": 0.8967, + "step": 11382 + }, + { + "epoch": 0.7918884135100351, + "grad_norm": 1.0390625, + "learning_rate": 0.000218650440497293, + "loss": 0.7873, + "step": 11383 + }, + { + "epoch": 0.7919579811471703, + "grad_norm": 1.15625, + "learning_rate": 0.00021850983144526304, + "loss": 0.8728, + "step": 11384 + }, + { + "epoch": 0.7920275487843056, + "grad_norm": 1.28125, + "learning_rate": 0.00021836926207336504, + "loss": 0.9168, + "step": 11385 + }, + { + "epoch": 0.7920971164214408, + "grad_norm": 1.3046875, + "learning_rate": 0.00021822873238873597, + "loss": 0.7407, + "step": 11386 + }, + { + "epoch": 0.7921666840585759, + "grad_norm": 1.1171875, + "learning_rate": 0.00021808824239851165, + "loss": 0.9536, + "step": 11387 + }, + { + "epoch": 0.7922362516957111, + "grad_norm": 1.2265625, + "learning_rate": 0.0002179477921098253, + "loss": 0.9262, + "step": 11388 + }, + { + "epoch": 0.7923058193328464, + "grad_norm": 1.09375, + "learning_rate": 0.00021780738152980795, + "loss": 0.7183, + "step": 11389 + }, + { + "epoch": 0.7923753869699816, + "grad_norm": 1.140625, + "learning_rate": 0.00021766701066558924, + "loss": 0.7785, + "step": 11390 + }, + { + "epoch": 0.7924449546071167, + "grad_norm": 1.109375, + "learning_rate": 0.00021752667952429673, + "loss": 0.7651, + "step": 11391 + }, + { + "epoch": 0.792514522244252, + "grad_norm": 1.1171875, + "learning_rate": 0.00021738638811305555, + "loss": 0.6941, + "step": 11392 + }, + { + "epoch": 0.7925840898813872, + "grad_norm": 1.3984375, + "learning_rate": 0.00021724613643898848, + "loss": 0.7646, + "step": 11393 + }, + { + "epoch": 0.7926536575185223, + "grad_norm": 1.2109375, + "learning_rate": 0.0002171059245092174, + "loss": 0.9519, + "step": 11394 + }, + { + "epoch": 0.7927232251556576, + "grad_norm": 1.046875, + "learning_rate": 0.00021696575233086157, + "loss": 0.8139, + "step": 11395 + }, + { + "epoch": 0.7927927927927928, + "grad_norm": 1.0859375, + "learning_rate": 0.0002168256199110379, + "loss": 1.0322, + "step": 11396 + }, + { + "epoch": 0.792862360429928, + "grad_norm": 1.0390625, + "learning_rate": 0.00021668552725686186, + "loss": 0.8298, + "step": 11397 + }, + { + "epoch": 0.7929319280670633, + "grad_norm": 1.078125, + "learning_rate": 0.00021654547437544635, + "loss": 0.6793, + "step": 11398 + }, + { + "epoch": 0.7930014957041984, + "grad_norm": 1.5859375, + "learning_rate": 0.00021640546127390302, + "loss": 0.9698, + "step": 11399 + }, + { + "epoch": 0.7930710633413336, + "grad_norm": 1.2578125, + "learning_rate": 0.00021626548795934054, + "loss": 0.8393, + "step": 11400 + }, + { + "epoch": 0.7931406309784688, + "grad_norm": 0.828125, + "learning_rate": 0.0002161255544388665, + "loss": 0.6756, + "step": 11401 + }, + { + "epoch": 0.793210198615604, + "grad_norm": 0.91796875, + "learning_rate": 0.0002159856607195857, + "loss": 0.4974, + "step": 11402 + }, + { + "epoch": 0.7932797662527392, + "grad_norm": 1.4296875, + "learning_rate": 0.00021584580680860088, + "loss": 1.0398, + "step": 11403 + }, + { + "epoch": 0.7933493338898744, + "grad_norm": 1.5078125, + "learning_rate": 0.00021570599271301404, + "loss": 0.9352, + "step": 11404 + }, + { + "epoch": 0.7934189015270097, + "grad_norm": 1.234375, + "learning_rate": 0.00021556621843992385, + "loss": 0.6301, + "step": 11405 + }, + { + "epoch": 0.7934884691641448, + "grad_norm": 0.96484375, + "learning_rate": 0.00021542648399642717, + "loss": 0.6083, + "step": 11406 + }, + { + "epoch": 0.79355803680128, + "grad_norm": 1.2578125, + "learning_rate": 0.00021528678938961888, + "loss": 0.6556, + "step": 11407 + }, + { + "epoch": 0.7936276044384153, + "grad_norm": 1.4140625, + "learning_rate": 0.00021514713462659208, + "loss": 0.9146, + "step": 11408 + }, + { + "epoch": 0.7936971720755505, + "grad_norm": 1.2578125, + "learning_rate": 0.0002150075197144382, + "loss": 0.8798, + "step": 11409 + }, + { + "epoch": 0.7937667397126856, + "grad_norm": 1.015625, + "learning_rate": 0.0002148679446602455, + "loss": 0.7407, + "step": 11410 + }, + { + "epoch": 0.7938363073498209, + "grad_norm": 1.21875, + "learning_rate": 0.0002147284094711015, + "loss": 0.8417, + "step": 11411 + }, + { + "epoch": 0.7939058749869561, + "grad_norm": 1.0703125, + "learning_rate": 0.00021458891415409055, + "loss": 0.8481, + "step": 11412 + }, + { + "epoch": 0.7939754426240913, + "grad_norm": 1.234375, + "learning_rate": 0.00021444945871629595, + "loss": 0.724, + "step": 11413 + }, + { + "epoch": 0.7940450102612264, + "grad_norm": 1.3515625, + "learning_rate": 0.00021431004316479818, + "loss": 1.0432, + "step": 11414 + }, + { + "epoch": 0.7941145778983617, + "grad_norm": 1.0546875, + "learning_rate": 0.00021417066750667658, + "loss": 0.9333, + "step": 11415 + }, + { + "epoch": 0.7941841455354969, + "grad_norm": 0.96875, + "learning_rate": 0.00021403133174900747, + "loss": 0.4688, + "step": 11416 + }, + { + "epoch": 0.794253713172632, + "grad_norm": 1.1640625, + "learning_rate": 0.0002138920358988653, + "loss": 0.8709, + "step": 11417 + }, + { + "epoch": 0.7943232808097673, + "grad_norm": 1.078125, + "learning_rate": 0.00021375277996332377, + "loss": 0.6321, + "step": 11418 + }, + { + "epoch": 0.7943928484469025, + "grad_norm": 1.4296875, + "learning_rate": 0.00021361356394945308, + "loss": 1.1456, + "step": 11419 + }, + { + "epoch": 0.7944624160840377, + "grad_norm": 1.1328125, + "learning_rate": 0.00021347438786432205, + "loss": 0.6778, + "step": 11420 + }, + { + "epoch": 0.794531983721173, + "grad_norm": 1.53125, + "learning_rate": 0.0002133352517149968, + "loss": 0.7393, + "step": 11421 + }, + { + "epoch": 0.7946015513583081, + "grad_norm": 1.1015625, + "learning_rate": 0.00021319615550854243, + "loss": 0.5918, + "step": 11422 + }, + { + "epoch": 0.7946711189954433, + "grad_norm": 0.8984375, + "learning_rate": 0.0002130570992520219, + "loss": 0.5849, + "step": 11423 + }, + { + "epoch": 0.7947406866325786, + "grad_norm": 1.4609375, + "learning_rate": 0.00021291808295249493, + "loss": 0.7043, + "step": 11424 + }, + { + "epoch": 0.7948102542697137, + "grad_norm": 0.98046875, + "learning_rate": 0.0002127791066170208, + "loss": 0.5975, + "step": 11425 + }, + { + "epoch": 0.7948798219068489, + "grad_norm": 0.93359375, + "learning_rate": 0.00021264017025265558, + "loss": 0.5729, + "step": 11426 + }, + { + "epoch": 0.7949493895439841, + "grad_norm": 1.0625, + "learning_rate": 0.00021250127386645412, + "loss": 0.7633, + "step": 11427 + }, + { + "epoch": 0.7950189571811194, + "grad_norm": 1.046875, + "learning_rate": 0.00021236241746546848, + "loss": 0.8501, + "step": 11428 + }, + { + "epoch": 0.7950885248182545, + "grad_norm": 1.09375, + "learning_rate": 0.00021222360105674953, + "loss": 0.7237, + "step": 11429 + }, + { + "epoch": 0.7951580924553897, + "grad_norm": 0.94140625, + "learning_rate": 0.00021208482464734525, + "loss": 0.7258, + "step": 11430 + }, + { + "epoch": 0.795227660092525, + "grad_norm": 1.4453125, + "learning_rate": 0.00021194608824430205, + "loss": 0.781, + "step": 11431 + }, + { + "epoch": 0.7952972277296602, + "grad_norm": 1.2734375, + "learning_rate": 0.00021180739185466468, + "loss": 1.0382, + "step": 11432 + }, + { + "epoch": 0.7953667953667953, + "grad_norm": 1.0859375, + "learning_rate": 0.00021166873548547526, + "loss": 0.7659, + "step": 11433 + }, + { + "epoch": 0.7954363630039306, + "grad_norm": 1.140625, + "learning_rate": 0.00021153011914377395, + "loss": 0.8325, + "step": 11434 + }, + { + "epoch": 0.7955059306410658, + "grad_norm": 1.1796875, + "learning_rate": 0.00021139154283659846, + "loss": 0.8347, + "step": 11435 + }, + { + "epoch": 0.795575498278201, + "grad_norm": 1.0703125, + "learning_rate": 0.000211253006570986, + "loss": 0.7483, + "step": 11436 + }, + { + "epoch": 0.7956450659153362, + "grad_norm": 1.2890625, + "learning_rate": 0.00021111451035397033, + "loss": 0.7951, + "step": 11437 + }, + { + "epoch": 0.7957146335524714, + "grad_norm": 0.98046875, + "learning_rate": 0.0002109760541925836, + "loss": 0.8256, + "step": 11438 + }, + { + "epoch": 0.7957842011896066, + "grad_norm": 3.0, + "learning_rate": 0.0002108376380938556, + "loss": 0.8697, + "step": 11439 + }, + { + "epoch": 0.7958537688267417, + "grad_norm": 1.4609375, + "learning_rate": 0.0002106992620648146, + "loss": 1.0132, + "step": 11440 + }, + { + "epoch": 0.795923336463877, + "grad_norm": 1.3203125, + "learning_rate": 0.000210560926112487, + "loss": 0.8459, + "step": 11441 + }, + { + "epoch": 0.7959929041010122, + "grad_norm": 1.15625, + "learning_rate": 0.00021042263024389617, + "loss": 0.6435, + "step": 11442 + }, + { + "epoch": 0.7960624717381474, + "grad_norm": 1.2890625, + "learning_rate": 0.00021028437446606475, + "loss": 0.9039, + "step": 11443 + }, + { + "epoch": 0.7961320393752827, + "grad_norm": 0.94140625, + "learning_rate": 0.00021014615878601207, + "loss": 0.7823, + "step": 11444 + }, + { + "epoch": 0.7962016070124178, + "grad_norm": 1.15625, + "learning_rate": 0.00021000798321075653, + "loss": 0.7463, + "step": 11445 + }, + { + "epoch": 0.796271174649553, + "grad_norm": 0.97265625, + "learning_rate": 0.00020986984774731354, + "loss": 0.6783, + "step": 11446 + }, + { + "epoch": 0.7963407422866883, + "grad_norm": 0.93359375, + "learning_rate": 0.00020973175240269739, + "loss": 0.7749, + "step": 11447 + }, + { + "epoch": 0.7964103099238234, + "grad_norm": 1.0546875, + "learning_rate": 0.0002095936971839195, + "loss": 0.691, + "step": 11448 + }, + { + "epoch": 0.7964798775609586, + "grad_norm": 1.453125, + "learning_rate": 0.00020945568209798928, + "loss": 0.886, + "step": 11449 + }, + { + "epoch": 0.7965494451980939, + "grad_norm": 1.203125, + "learning_rate": 0.00020931770715191533, + "loss": 1.0378, + "step": 11450 + }, + { + "epoch": 0.7966190128352291, + "grad_norm": 0.9765625, + "learning_rate": 0.00020917977235270302, + "loss": 0.8532, + "step": 11451 + }, + { + "epoch": 0.7966885804723642, + "grad_norm": 1.203125, + "learning_rate": 0.00020904187770735572, + "loss": 0.838, + "step": 11452 + }, + { + "epoch": 0.7967581481094994, + "grad_norm": 1.3125, + "learning_rate": 0.00020890402322287495, + "loss": 0.6772, + "step": 11453 + }, + { + "epoch": 0.7968277157466347, + "grad_norm": 1.1171875, + "learning_rate": 0.00020876620890626041, + "loss": 0.8102, + "step": 11454 + }, + { + "epoch": 0.7968972833837699, + "grad_norm": 1.0703125, + "learning_rate": 0.00020862843476451, + "loss": 0.8577, + "step": 11455 + }, + { + "epoch": 0.796966851020905, + "grad_norm": 1.125, + "learning_rate": 0.00020849070080461852, + "loss": 0.9275, + "step": 11456 + }, + { + "epoch": 0.7970364186580403, + "grad_norm": 1.28125, + "learning_rate": 0.00020835300703358006, + "loss": 0.7921, + "step": 11457 + }, + { + "epoch": 0.7971059862951755, + "grad_norm": 1.0703125, + "learning_rate": 0.00020821535345838537, + "loss": 1.0517, + "step": 11458 + }, + { + "epoch": 0.7971755539323107, + "grad_norm": 1.0625, + "learning_rate": 0.00020807774008602454, + "loss": 0.6087, + "step": 11459 + }, + { + "epoch": 0.7972451215694459, + "grad_norm": 1.0859375, + "learning_rate": 0.00020794016692348417, + "loss": 0.8276, + "step": 11460 + }, + { + "epoch": 0.7973146892065811, + "grad_norm": 0.86328125, + "learning_rate": 0.00020780263397775, + "loss": 0.7144, + "step": 11461 + }, + { + "epoch": 0.7973842568437163, + "grad_norm": 0.953125, + "learning_rate": 0.00020766514125580493, + "loss": 0.6881, + "step": 11462 + }, + { + "epoch": 0.7974538244808516, + "grad_norm": 1.0703125, + "learning_rate": 0.00020752768876463034, + "loss": 0.8766, + "step": 11463 + }, + { + "epoch": 0.7975233921179867, + "grad_norm": 0.9453125, + "learning_rate": 0.00020739027651120567, + "loss": 0.6938, + "step": 11464 + }, + { + "epoch": 0.7975929597551219, + "grad_norm": 0.98046875, + "learning_rate": 0.00020725290450250767, + "loss": 0.7493, + "step": 11465 + }, + { + "epoch": 0.7976625273922571, + "grad_norm": 1.2578125, + "learning_rate": 0.0002071155727455114, + "loss": 0.8856, + "step": 11466 + }, + { + "epoch": 0.7977320950293924, + "grad_norm": 1.2734375, + "learning_rate": 0.00020697828124718965, + "loss": 0.8574, + "step": 11467 + }, + { + "epoch": 0.7978016626665275, + "grad_norm": 1.25, + "learning_rate": 0.0002068410300145136, + "loss": 0.9403, + "step": 11468 + }, + { + "epoch": 0.7978712303036627, + "grad_norm": 0.9921875, + "learning_rate": 0.00020670381905445257, + "loss": 0.9696, + "step": 11469 + }, + { + "epoch": 0.797940797940798, + "grad_norm": 1.2890625, + "learning_rate": 0.00020656664837397288, + "loss": 0.8581, + "step": 11470 + }, + { + "epoch": 0.7980103655779331, + "grad_norm": 1.53125, + "learning_rate": 0.00020642951798003972, + "loss": 0.88, + "step": 11471 + }, + { + "epoch": 0.7980799332150683, + "grad_norm": 1.1171875, + "learning_rate": 0.00020629242787961556, + "loss": 0.8249, + "step": 11472 + }, + { + "epoch": 0.7981495008522036, + "grad_norm": 1.1796875, + "learning_rate": 0.00020615537807966167, + "loss": 0.8592, + "step": 11473 + }, + { + "epoch": 0.7982190684893388, + "grad_norm": 1.015625, + "learning_rate": 0.00020601836858713597, + "loss": 0.7575, + "step": 11474 + }, + { + "epoch": 0.7982886361264739, + "grad_norm": 1.125, + "learning_rate": 0.00020588139940899597, + "loss": 0.5698, + "step": 11475 + }, + { + "epoch": 0.7983582037636092, + "grad_norm": 1.125, + "learning_rate": 0.00020574447055219546, + "loss": 0.9127, + "step": 11476 + }, + { + "epoch": 0.7984277714007444, + "grad_norm": 1.0859375, + "learning_rate": 0.00020560758202368745, + "loss": 0.7286, + "step": 11477 + }, + { + "epoch": 0.7984973390378796, + "grad_norm": 1.1328125, + "learning_rate": 0.0002054707338304227, + "loss": 0.9412, + "step": 11478 + }, + { + "epoch": 0.7985669066750147, + "grad_norm": 1.3125, + "learning_rate": 0.0002053339259793493, + "loss": 0.7786, + "step": 11479 + }, + { + "epoch": 0.79863647431215, + "grad_norm": 1.0703125, + "learning_rate": 0.0002051971584774137, + "loss": 0.6275, + "step": 11480 + }, + { + "epoch": 0.7987060419492852, + "grad_norm": 0.79296875, + "learning_rate": 0.00020506043133155982, + "loss": 0.801, + "step": 11481 + }, + { + "epoch": 0.7987756095864204, + "grad_norm": 1.09375, + "learning_rate": 0.00020492374454873097, + "loss": 0.8623, + "step": 11482 + }, + { + "epoch": 0.7988451772235556, + "grad_norm": 1.140625, + "learning_rate": 0.00020478709813586692, + "loss": 0.7663, + "step": 11483 + }, + { + "epoch": 0.7989147448606908, + "grad_norm": 1.796875, + "learning_rate": 0.0002046504920999056, + "loss": 1.0197, + "step": 11484 + }, + { + "epoch": 0.798984312497826, + "grad_norm": 0.953125, + "learning_rate": 0.00020451392644778356, + "loss": 0.7304, + "step": 11485 + }, + { + "epoch": 0.7990538801349613, + "grad_norm": 0.97265625, + "learning_rate": 0.00020437740118643466, + "loss": 0.6665, + "step": 11486 + }, + { + "epoch": 0.7991234477720964, + "grad_norm": 1.0859375, + "learning_rate": 0.00020424091632279128, + "loss": 0.8603, + "step": 11487 + }, + { + "epoch": 0.7991930154092316, + "grad_norm": 1.3671875, + "learning_rate": 0.000204104471863783, + "loss": 1.0289, + "step": 11488 + }, + { + "epoch": 0.7992625830463669, + "grad_norm": 1.2421875, + "learning_rate": 0.00020396806781633836, + "loss": 0.8675, + "step": 11489 + }, + { + "epoch": 0.799332150683502, + "grad_norm": 1.125, + "learning_rate": 0.0002038317041873826, + "loss": 0.7787, + "step": 11490 + }, + { + "epoch": 0.7994017183206372, + "grad_norm": 1.6875, + "learning_rate": 0.00020369538098383987, + "loss": 0.9907, + "step": 11491 + }, + { + "epoch": 0.7994712859577724, + "grad_norm": 1.0234375, + "learning_rate": 0.0002035590982126324, + "loss": 0.8315, + "step": 11492 + }, + { + "epoch": 0.7995408535949077, + "grad_norm": 1.21875, + "learning_rate": 0.00020342285588067954, + "loss": 0.6062, + "step": 11493 + }, + { + "epoch": 0.7996104212320428, + "grad_norm": 1.1875, + "learning_rate": 0.00020328665399489866, + "loss": 0.7618, + "step": 11494 + }, + { + "epoch": 0.799679988869178, + "grad_norm": 1.1796875, + "learning_rate": 0.00020315049256220584, + "loss": 0.9091, + "step": 11495 + }, + { + "epoch": 0.7997495565063133, + "grad_norm": 1.125, + "learning_rate": 0.00020301437158951486, + "loss": 0.9664, + "step": 11496 + }, + { + "epoch": 0.7998191241434485, + "grad_norm": 1.109375, + "learning_rate": 0.0002028782910837369, + "loss": 0.9051, + "step": 11497 + }, + { + "epoch": 0.7998886917805836, + "grad_norm": 1.2421875, + "learning_rate": 0.00020274225105178134, + "loss": 0.9678, + "step": 11498 + }, + { + "epoch": 0.7999582594177189, + "grad_norm": 1.25, + "learning_rate": 0.00020260625150055612, + "loss": 0.7862, + "step": 11499 + }, + { + "epoch": 0.8000278270548541, + "grad_norm": 1.1484375, + "learning_rate": 0.0002024702924369659, + "loss": 0.8813, + "step": 11500 + }, + { + "epoch": 0.8000973946919893, + "grad_norm": 1.2109375, + "learning_rate": 0.00020233437386791463, + "loss": 0.6225, + "step": 11501 + }, + { + "epoch": 0.8001669623291245, + "grad_norm": 1.2109375, + "learning_rate": 0.00020219849580030313, + "loss": 0.7075, + "step": 11502 + }, + { + "epoch": 0.8002365299662597, + "grad_norm": 0.859375, + "learning_rate": 0.0002020626582410311, + "loss": 0.6389, + "step": 11503 + }, + { + "epoch": 0.8003060976033949, + "grad_norm": 1.03125, + "learning_rate": 0.000201926861196995, + "loss": 0.8935, + "step": 11504 + }, + { + "epoch": 0.80037566524053, + "grad_norm": 1.1171875, + "learning_rate": 0.00020179110467509042, + "loss": 0.8508, + "step": 11505 + }, + { + "epoch": 0.8004452328776653, + "grad_norm": 1.0625, + "learning_rate": 0.00020165538868221046, + "loss": 0.8624, + "step": 11506 + }, + { + "epoch": 0.8005148005148005, + "grad_norm": 1.09375, + "learning_rate": 0.00020151971322524597, + "loss": 0.7857, + "step": 11507 + }, + { + "epoch": 0.8005843681519357, + "grad_norm": 1.3046875, + "learning_rate": 0.0002013840783110854, + "loss": 0.8685, + "step": 11508 + }, + { + "epoch": 0.800653935789071, + "grad_norm": 1.0078125, + "learning_rate": 0.00020124848394661622, + "loss": 0.8577, + "step": 11509 + }, + { + "epoch": 0.8007235034262061, + "grad_norm": 1.15625, + "learning_rate": 0.0002011129301387231, + "loss": 0.6494, + "step": 11510 + }, + { + "epoch": 0.8007930710633413, + "grad_norm": 1.0703125, + "learning_rate": 0.00020097741689428884, + "loss": 0.7739, + "step": 11511 + }, + { + "epoch": 0.8008626387004766, + "grad_norm": 1.296875, + "learning_rate": 0.00020084194422019365, + "loss": 0.7364, + "step": 11512 + }, + { + "epoch": 0.8009322063376118, + "grad_norm": 1.171875, + "learning_rate": 0.00020070651212331648, + "loss": 0.7561, + "step": 11513 + }, + { + "epoch": 0.8010017739747469, + "grad_norm": 1.25, + "learning_rate": 0.00020057112061053407, + "loss": 0.8435, + "step": 11514 + }, + { + "epoch": 0.8010713416118822, + "grad_norm": 1.1796875, + "learning_rate": 0.0002004357696887208, + "loss": 0.7197, + "step": 11515 + }, + { + "epoch": 0.8011409092490174, + "grad_norm": 1.296875, + "learning_rate": 0.00020030045936474884, + "loss": 0.9916, + "step": 11516 + }, + { + "epoch": 0.8012104768861525, + "grad_norm": 1.0703125, + "learning_rate": 0.0002001651896454889, + "loss": 0.8623, + "step": 11517 + }, + { + "epoch": 0.8012800445232877, + "grad_norm": 1.1015625, + "learning_rate": 0.00020002996053780907, + "loss": 0.875, + "step": 11518 + }, + { + "epoch": 0.801349612160423, + "grad_norm": 1.2421875, + "learning_rate": 0.00019989477204857586, + "loss": 0.8547, + "step": 11519 + }, + { + "epoch": 0.8014191797975582, + "grad_norm": 1.21875, + "learning_rate": 0.00019975962418465298, + "loss": 0.6164, + "step": 11520 + }, + { + "epoch": 0.8014887474346933, + "grad_norm": 1.1640625, + "learning_rate": 0.00019962451695290328, + "loss": 0.8538, + "step": 11521 + }, + { + "epoch": 0.8015583150718286, + "grad_norm": 1.46875, + "learning_rate": 0.00019948945036018606, + "loss": 0.6978, + "step": 11522 + }, + { + "epoch": 0.8016278827089638, + "grad_norm": 1.0546875, + "learning_rate": 0.0001993544244133597, + "loss": 0.5762, + "step": 11523 + }, + { + "epoch": 0.801697450346099, + "grad_norm": 1.046875, + "learning_rate": 0.00019921943911928032, + "loss": 0.742, + "step": 11524 + }, + { + "epoch": 0.8017670179832342, + "grad_norm": 1.125, + "learning_rate": 0.0001990844944848017, + "loss": 0.6807, + "step": 11525 + }, + { + "epoch": 0.8018365856203694, + "grad_norm": 1.296875, + "learning_rate": 0.0001989495905167752, + "loss": 0.8195, + "step": 11526 + }, + { + "epoch": 0.8019061532575046, + "grad_norm": 1.2890625, + "learning_rate": 0.00019881472722205085, + "loss": 0.9336, + "step": 11527 + }, + { + "epoch": 0.8019757208946399, + "grad_norm": 0.87109375, + "learning_rate": 0.00019867990460747676, + "loss": 0.6865, + "step": 11528 + }, + { + "epoch": 0.802045288531775, + "grad_norm": 1.421875, + "learning_rate": 0.00019854512267989812, + "loss": 0.8677, + "step": 11529 + }, + { + "epoch": 0.8021148561689102, + "grad_norm": 0.91796875, + "learning_rate": 0.0001984103814461582, + "loss": 0.721, + "step": 11530 + }, + { + "epoch": 0.8021844238060454, + "grad_norm": 0.8515625, + "learning_rate": 0.0001982756809130991, + "loss": 0.7316, + "step": 11531 + }, + { + "epoch": 0.8022539914431807, + "grad_norm": 1.0859375, + "learning_rate": 0.00019814102108755972, + "loss": 0.7935, + "step": 11532 + }, + { + "epoch": 0.8023235590803158, + "grad_norm": 1.234375, + "learning_rate": 0.00019800640197637786, + "loss": 1.0023, + "step": 11533 + }, + { + "epoch": 0.802393126717451, + "grad_norm": 0.8359375, + "learning_rate": 0.00019787182358638823, + "loss": 0.5747, + "step": 11534 + }, + { + "epoch": 0.8024626943545863, + "grad_norm": 1.5703125, + "learning_rate": 0.00019773728592442465, + "loss": 0.7026, + "step": 11535 + }, + { + "epoch": 0.8025322619917215, + "grad_norm": 1.234375, + "learning_rate": 0.00019760278899731777, + "loss": 0.8311, + "step": 11536 + }, + { + "epoch": 0.8026018296288566, + "grad_norm": 1.09375, + "learning_rate": 0.0001974683328118969, + "loss": 0.8812, + "step": 11537 + }, + { + "epoch": 0.8026713972659919, + "grad_norm": 0.90234375, + "learning_rate": 0.0001973339173749893, + "loss": 0.5614, + "step": 11538 + }, + { + "epoch": 0.8027409649031271, + "grad_norm": 1.1171875, + "learning_rate": 0.00019719954269341956, + "loss": 0.8661, + "step": 11539 + }, + { + "epoch": 0.8028105325402622, + "grad_norm": 1.2109375, + "learning_rate": 0.00019706520877401035, + "loss": 0.9264, + "step": 11540 + }, + { + "epoch": 0.8028801001773975, + "grad_norm": 0.86328125, + "learning_rate": 0.0001969309156235829, + "loss": 0.7194, + "step": 11541 + }, + { + "epoch": 0.8029496678145327, + "grad_norm": 0.89453125, + "learning_rate": 0.00019679666324895595, + "loss": 0.5175, + "step": 11542 + }, + { + "epoch": 0.8030192354516679, + "grad_norm": 1.1484375, + "learning_rate": 0.00019666245165694596, + "loss": 0.6996, + "step": 11543 + }, + { + "epoch": 0.803088803088803, + "grad_norm": 0.9375, + "learning_rate": 0.00019652828085436736, + "loss": 0.6839, + "step": 11544 + }, + { + "epoch": 0.8031583707259383, + "grad_norm": 1.2890625, + "learning_rate": 0.0001963941508480328, + "loss": 0.914, + "step": 11545 + }, + { + "epoch": 0.8032279383630735, + "grad_norm": 1.1875, + "learning_rate": 0.00019626006164475307, + "loss": 0.8159, + "step": 11546 + }, + { + "epoch": 0.8032975060002087, + "grad_norm": 1.0234375, + "learning_rate": 0.00019612601325133628, + "loss": 0.6157, + "step": 11547 + }, + { + "epoch": 0.8033670736373439, + "grad_norm": 1.0078125, + "learning_rate": 0.0001959920056745884, + "loss": 0.7854, + "step": 11548 + }, + { + "epoch": 0.8034366412744791, + "grad_norm": 1.203125, + "learning_rate": 0.00019585803892131426, + "loss": 0.8643, + "step": 11549 + }, + { + "epoch": 0.8035062089116143, + "grad_norm": 1.0859375, + "learning_rate": 0.0001957241129983155, + "loss": 1.024, + "step": 11550 + }, + { + "epoch": 0.8035757765487496, + "grad_norm": 0.90625, + "learning_rate": 0.00019559022791239245, + "loss": 0.6395, + "step": 11551 + }, + { + "epoch": 0.8036453441858847, + "grad_norm": 1.0703125, + "learning_rate": 0.00019545638367034335, + "loss": 0.6166, + "step": 11552 + }, + { + "epoch": 0.8037149118230199, + "grad_norm": 1.0390625, + "learning_rate": 0.00019532258027896377, + "loss": 0.5566, + "step": 11553 + }, + { + "epoch": 0.8037844794601552, + "grad_norm": 0.9609375, + "learning_rate": 0.0001951888177450476, + "loss": 0.947, + "step": 11554 + }, + { + "epoch": 0.8038540470972904, + "grad_norm": 1.0390625, + "learning_rate": 0.00019505509607538663, + "loss": 0.8042, + "step": 11555 + }, + { + "epoch": 0.8039236147344255, + "grad_norm": 1.1015625, + "learning_rate": 0.00019492141527677087, + "loss": 0.8172, + "step": 11556 + }, + { + "epoch": 0.8039931823715607, + "grad_norm": 1.375, + "learning_rate": 0.0001947877753559878, + "loss": 0.7079, + "step": 11557 + }, + { + "epoch": 0.804062750008696, + "grad_norm": 1.1796875, + "learning_rate": 0.00019465417631982262, + "loss": 0.8163, + "step": 11558 + }, + { + "epoch": 0.8041323176458312, + "grad_norm": 1.109375, + "learning_rate": 0.00019452061817505918, + "loss": 0.9206, + "step": 11559 + }, + { + "epoch": 0.8042018852829663, + "grad_norm": 1.0, + "learning_rate": 0.0001943871009284791, + "loss": 0.7406, + "step": 11560 + }, + { + "epoch": 0.8042714529201016, + "grad_norm": 1.296875, + "learning_rate": 0.00019425362458686148, + "loss": 0.8649, + "step": 11561 + }, + { + "epoch": 0.8043410205572368, + "grad_norm": 1.0546875, + "learning_rate": 0.00019412018915698315, + "loss": 0.8075, + "step": 11562 + }, + { + "epoch": 0.804410588194372, + "grad_norm": 2.25, + "learning_rate": 0.00019398679464562008, + "loss": 1.022, + "step": 11563 + }, + { + "epoch": 0.8044801558315072, + "grad_norm": 1.5703125, + "learning_rate": 0.00019385344105954462, + "loss": 1.0943, + "step": 11564 + }, + { + "epoch": 0.8045497234686424, + "grad_norm": 1.2421875, + "learning_rate": 0.00019372012840552822, + "loss": 0.769, + "step": 11565 + }, + { + "epoch": 0.8046192911057776, + "grad_norm": 1.53125, + "learning_rate": 0.00019358685669033994, + "loss": 1.0729, + "step": 11566 + }, + { + "epoch": 0.8046888587429128, + "grad_norm": 1.296875, + "learning_rate": 0.00019345362592074645, + "loss": 0.7369, + "step": 11567 + }, + { + "epoch": 0.804758426380048, + "grad_norm": 1.4453125, + "learning_rate": 0.00019332043610351224, + "loss": 0.7023, + "step": 11568 + }, + { + "epoch": 0.8048279940171832, + "grad_norm": 1.6015625, + "learning_rate": 0.00019318728724540047, + "loss": 0.8439, + "step": 11569 + }, + { + "epoch": 0.8048975616543184, + "grad_norm": 1.1640625, + "learning_rate": 0.0001930541793531717, + "loss": 0.9113, + "step": 11570 + }, + { + "epoch": 0.8049671292914536, + "grad_norm": 1.1640625, + "learning_rate": 0.00019292111243358445, + "loss": 0.9856, + "step": 11571 + }, + { + "epoch": 0.8050366969285888, + "grad_norm": 0.9140625, + "learning_rate": 0.00019278808649339496, + "loss": 0.688, + "step": 11572 + }, + { + "epoch": 0.805106264565724, + "grad_norm": 1.078125, + "learning_rate": 0.00019265510153935772, + "loss": 0.7357, + "step": 11573 + }, + { + "epoch": 0.8051758322028593, + "grad_norm": 0.9921875, + "learning_rate": 0.00019252215757822533, + "loss": 0.6238, + "step": 11574 + }, + { + "epoch": 0.8052453998399944, + "grad_norm": 1.140625, + "learning_rate": 0.00019238925461674783, + "loss": 0.6776, + "step": 11575 + }, + { + "epoch": 0.8053149674771296, + "grad_norm": 1.140625, + "learning_rate": 0.00019225639266167317, + "loss": 0.8295, + "step": 11576 + }, + { + "epoch": 0.8053845351142649, + "grad_norm": 1.2890625, + "learning_rate": 0.00019212357171974738, + "loss": 0.8657, + "step": 11577 + }, + { + "epoch": 0.8054541027514001, + "grad_norm": 0.96875, + "learning_rate": 0.00019199079179771494, + "loss": 0.6886, + "step": 11578 + }, + { + "epoch": 0.8055236703885352, + "grad_norm": 0.96875, + "learning_rate": 0.00019185805290231718, + "loss": 0.8363, + "step": 11579 + }, + { + "epoch": 0.8055932380256705, + "grad_norm": 1.3515625, + "learning_rate": 0.00019172535504029443, + "loss": 0.8789, + "step": 11580 + }, + { + "epoch": 0.8056628056628057, + "grad_norm": 1.6015625, + "learning_rate": 0.0001915926982183841, + "loss": 1.0713, + "step": 11581 + }, + { + "epoch": 0.8057323732999409, + "grad_norm": 1.1015625, + "learning_rate": 0.0001914600824433217, + "loss": 0.7367, + "step": 11582 + }, + { + "epoch": 0.805801940937076, + "grad_norm": 1.2734375, + "learning_rate": 0.00019132750772184092, + "loss": 0.7935, + "step": 11583 + }, + { + "epoch": 0.8058715085742113, + "grad_norm": 1.125, + "learning_rate": 0.00019119497406067354, + "loss": 0.7253, + "step": 11584 + }, + { + "epoch": 0.8059410762113465, + "grad_norm": 1.0390625, + "learning_rate": 0.00019106248146654869, + "loss": 0.6743, + "step": 11585 + }, + { + "epoch": 0.8060106438484816, + "grad_norm": 1.21875, + "learning_rate": 0.00019093002994619346, + "loss": 0.9133, + "step": 11586 + }, + { + "epoch": 0.8060802114856169, + "grad_norm": 1.0546875, + "learning_rate": 0.00019079761950633323, + "loss": 0.763, + "step": 11587 + }, + { + "epoch": 0.8061497791227521, + "grad_norm": 0.8125, + "learning_rate": 0.0001906652501536915, + "loss": 0.5673, + "step": 11588 + }, + { + "epoch": 0.8062193467598873, + "grad_norm": 1.015625, + "learning_rate": 0.00019053292189498904, + "loss": 0.6943, + "step": 11589 + }, + { + "epoch": 0.8062889143970225, + "grad_norm": 1.28125, + "learning_rate": 0.00019040063473694448, + "loss": 0.8947, + "step": 11590 + }, + { + "epoch": 0.8063584820341577, + "grad_norm": 1.0703125, + "learning_rate": 0.00019026838868627506, + "loss": 0.7626, + "step": 11591 + }, + { + "epoch": 0.8064280496712929, + "grad_norm": 0.8671875, + "learning_rate": 0.00019013618374969578, + "loss": 0.8147, + "step": 11592 + }, + { + "epoch": 0.8064976173084282, + "grad_norm": 0.890625, + "learning_rate": 0.00019000401993391868, + "loss": 0.4795, + "step": 11593 + }, + { + "epoch": 0.8065671849455633, + "grad_norm": 1.359375, + "learning_rate": 0.00018987189724565512, + "loss": 0.7489, + "step": 11594 + }, + { + "epoch": 0.8066367525826985, + "grad_norm": 0.83203125, + "learning_rate": 0.00018973981569161337, + "loss": 0.6915, + "step": 11595 + }, + { + "epoch": 0.8067063202198337, + "grad_norm": 1.3125, + "learning_rate": 0.00018960777527849936, + "loss": 0.8171, + "step": 11596 + }, + { + "epoch": 0.806775887856969, + "grad_norm": 1.15625, + "learning_rate": 0.0001894757760130179, + "loss": 1.0254, + "step": 11597 + }, + { + "epoch": 0.8068454554941041, + "grad_norm": 1.0234375, + "learning_rate": 0.00018934381790187139, + "loss": 0.8368, + "step": 11598 + }, + { + "epoch": 0.8069150231312393, + "grad_norm": 1.34375, + "learning_rate": 0.0001892119009517599, + "loss": 0.91, + "step": 11599 + }, + { + "epoch": 0.8069845907683746, + "grad_norm": 1.15625, + "learning_rate": 0.00018908002516938106, + "loss": 0.8141, + "step": 11600 + }, + { + "epoch": 0.8070541584055098, + "grad_norm": 1.015625, + "learning_rate": 0.0001889481905614313, + "loss": 0.947, + "step": 11601 + }, + { + "epoch": 0.8071237260426449, + "grad_norm": 1.5625, + "learning_rate": 0.00018881639713460452, + "loss": 0.9287, + "step": 11602 + }, + { + "epoch": 0.8071932936797802, + "grad_norm": 0.8359375, + "learning_rate": 0.00018868464489559257, + "loss": 0.5424, + "step": 11603 + }, + { + "epoch": 0.8072628613169154, + "grad_norm": 1.1640625, + "learning_rate": 0.00018855293385108474, + "loss": 0.6286, + "step": 11604 + }, + { + "epoch": 0.8073324289540506, + "grad_norm": 1.1328125, + "learning_rate": 0.00018842126400776883, + "loss": 0.6904, + "step": 11605 + }, + { + "epoch": 0.8074019965911858, + "grad_norm": 1.2421875, + "learning_rate": 0.0001882896353723308, + "loss": 0.667, + "step": 11606 + }, + { + "epoch": 0.807471564228321, + "grad_norm": 1.0625, + "learning_rate": 0.00018815804795145385, + "loss": 0.9498, + "step": 11607 + }, + { + "epoch": 0.8075411318654562, + "grad_norm": 1.15625, + "learning_rate": 0.0001880265017518189, + "loss": 0.6469, + "step": 11608 + }, + { + "epoch": 0.8076106995025913, + "grad_norm": 1.234375, + "learning_rate": 0.00018789499678010548, + "loss": 1.0743, + "step": 11609 + }, + { + "epoch": 0.8076802671397266, + "grad_norm": 0.95703125, + "learning_rate": 0.0001877635330429911, + "loss": 0.6298, + "step": 11610 + }, + { + "epoch": 0.8077498347768618, + "grad_norm": 1.1953125, + "learning_rate": 0.00018763211054715034, + "loss": 0.7808, + "step": 11611 + }, + { + "epoch": 0.807819402413997, + "grad_norm": 1.0234375, + "learning_rate": 0.00018750072929925654, + "loss": 0.7542, + "step": 11612 + }, + { + "epoch": 0.8078889700511322, + "grad_norm": 1.3671875, + "learning_rate": 0.00018736938930598047, + "loss": 0.8713, + "step": 11613 + }, + { + "epoch": 0.8079585376882674, + "grad_norm": 1.0390625, + "learning_rate": 0.00018723809057399066, + "loss": 0.8394, + "step": 11614 + }, + { + "epoch": 0.8080281053254026, + "grad_norm": 0.8046875, + "learning_rate": 0.00018710683310995392, + "loss": 0.5079, + "step": 11615 + }, + { + "epoch": 0.8080976729625379, + "grad_norm": 1.125, + "learning_rate": 0.00018697561692053512, + "loss": 0.762, + "step": 11616 + }, + { + "epoch": 0.808167240599673, + "grad_norm": 1.109375, + "learning_rate": 0.00018684444201239658, + "loss": 0.9277, + "step": 11617 + }, + { + "epoch": 0.8082368082368082, + "grad_norm": 1.0, + "learning_rate": 0.00018671330839219836, + "loss": 0.8453, + "step": 11618 + }, + { + "epoch": 0.8083063758739435, + "grad_norm": 0.8984375, + "learning_rate": 0.0001865822160665992, + "loss": 0.8398, + "step": 11619 + }, + { + "epoch": 0.8083759435110787, + "grad_norm": 1.1953125, + "learning_rate": 0.00018645116504225536, + "loss": 0.8521, + "step": 11620 + }, + { + "epoch": 0.8084455111482138, + "grad_norm": 1.171875, + "learning_rate": 0.0001863201553258207, + "loss": 0.8618, + "step": 11621 + }, + { + "epoch": 0.808515078785349, + "grad_norm": 1.03125, + "learning_rate": 0.00018618918692394715, + "loss": 0.7516, + "step": 11622 + }, + { + "epoch": 0.8085846464224843, + "grad_norm": 1.2109375, + "learning_rate": 0.00018605825984328473, + "loss": 0.8396, + "step": 11623 + }, + { + "epoch": 0.8086542140596195, + "grad_norm": 1.0703125, + "learning_rate": 0.00018592737409048156, + "loss": 0.7484, + "step": 11624 + }, + { + "epoch": 0.8087237816967546, + "grad_norm": 1.1953125, + "learning_rate": 0.00018579652967218286, + "loss": 0.7865, + "step": 11625 + }, + { + "epoch": 0.8087933493338899, + "grad_norm": 0.9375, + "learning_rate": 0.0001856657265950328, + "loss": 0.7438, + "step": 11626 + }, + { + "epoch": 0.8088629169710251, + "grad_norm": 1.109375, + "learning_rate": 0.00018553496486567244, + "loss": 0.7413, + "step": 11627 + }, + { + "epoch": 0.8089324846081603, + "grad_norm": 1.28125, + "learning_rate": 0.00018540424449074123, + "loss": 0.7804, + "step": 11628 + }, + { + "epoch": 0.8090020522452955, + "grad_norm": 1.25, + "learning_rate": 0.00018527356547687657, + "loss": 0.8049, + "step": 11629 + }, + { + "epoch": 0.8090716198824307, + "grad_norm": 1.109375, + "learning_rate": 0.00018514292783071407, + "loss": 0.5443, + "step": 11630 + }, + { + "epoch": 0.8091411875195659, + "grad_norm": 1.296875, + "learning_rate": 0.0001850123315588864, + "loss": 1.0762, + "step": 11631 + }, + { + "epoch": 0.8092107551567012, + "grad_norm": 1.125, + "learning_rate": 0.00018488177666802454, + "loss": 1.1306, + "step": 11632 + }, + { + "epoch": 0.8092803227938363, + "grad_norm": 1.421875, + "learning_rate": 0.00018475126316475744, + "loss": 0.8331, + "step": 11633 + }, + { + "epoch": 0.8093498904309715, + "grad_norm": 1.109375, + "learning_rate": 0.0001846207910557124, + "loss": 0.815, + "step": 11634 + }, + { + "epoch": 0.8094194580681067, + "grad_norm": 1.09375, + "learning_rate": 0.00018449036034751375, + "loss": 0.7388, + "step": 11635 + }, + { + "epoch": 0.809489025705242, + "grad_norm": 0.984375, + "learning_rate": 0.00018435997104678382, + "loss": 0.784, + "step": 11636 + }, + { + "epoch": 0.8095585933423771, + "grad_norm": 1.390625, + "learning_rate": 0.00018422962316014347, + "loss": 0.7113, + "step": 11637 + }, + { + "epoch": 0.8096281609795123, + "grad_norm": 1.0546875, + "learning_rate": 0.00018409931669421132, + "loss": 0.7346, + "step": 11638 + }, + { + "epoch": 0.8096977286166476, + "grad_norm": 0.91015625, + "learning_rate": 0.0001839690516556032, + "loss": 0.6103, + "step": 11639 + }, + { + "epoch": 0.8097672962537827, + "grad_norm": 1.2421875, + "learning_rate": 0.00018383882805093367, + "loss": 0.841, + "step": 11640 + }, + { + "epoch": 0.8098368638909179, + "grad_norm": 1.09375, + "learning_rate": 0.0001837086458868148, + "loss": 0.8778, + "step": 11641 + }, + { + "epoch": 0.8099064315280532, + "grad_norm": 0.80859375, + "learning_rate": 0.0001835785051698562, + "loss": 0.6279, + "step": 11642 + }, + { + "epoch": 0.8099759991651884, + "grad_norm": 1.3671875, + "learning_rate": 0.00018344840590666612, + "loss": 1.0347, + "step": 11643 + }, + { + "epoch": 0.8100455668023235, + "grad_norm": 1.375, + "learning_rate": 0.0001833183481038504, + "loss": 0.8542, + "step": 11644 + }, + { + "epoch": 0.8101151344394588, + "grad_norm": 1.1328125, + "learning_rate": 0.00018318833176801265, + "loss": 0.6418, + "step": 11645 + }, + { + "epoch": 0.810184702076594, + "grad_norm": 0.984375, + "learning_rate": 0.00018305835690575413, + "loss": 0.7551, + "step": 11646 + }, + { + "epoch": 0.8102542697137292, + "grad_norm": 1.1953125, + "learning_rate": 0.00018292842352367444, + "loss": 0.5889, + "step": 11647 + }, + { + "epoch": 0.8103238373508643, + "grad_norm": 1.3203125, + "learning_rate": 0.00018279853162837145, + "loss": 1.0018, + "step": 11648 + }, + { + "epoch": 0.8103934049879996, + "grad_norm": 1.5234375, + "learning_rate": 0.00018266868122643998, + "loss": 0.9089, + "step": 11649 + }, + { + "epoch": 0.8104629726251348, + "grad_norm": 1.046875, + "learning_rate": 0.00018253887232447285, + "loss": 0.8482, + "step": 11650 + }, + { + "epoch": 0.81053254026227, + "grad_norm": 1.109375, + "learning_rate": 0.0001824091049290616, + "loss": 0.8073, + "step": 11651 + }, + { + "epoch": 0.8106021078994052, + "grad_norm": 1.21875, + "learning_rate": 0.00018227937904679526, + "loss": 0.7902, + "step": 11652 + }, + { + "epoch": 0.8106716755365404, + "grad_norm": 1.125, + "learning_rate": 0.00018214969468426022, + "loss": 0.8469, + "step": 11653 + }, + { + "epoch": 0.8107412431736756, + "grad_norm": 1.2109375, + "learning_rate": 0.00018202005184804172, + "loss": 0.9903, + "step": 11654 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 1.1640625, + "learning_rate": 0.00018189045054472163, + "loss": 0.8441, + "step": 11655 + }, + { + "epoch": 0.810880378447946, + "grad_norm": 1.1796875, + "learning_rate": 0.00018176089078088132, + "loss": 0.7733, + "step": 11656 + }, + { + "epoch": 0.8109499460850812, + "grad_norm": 0.9765625, + "learning_rate": 0.00018163137256309837, + "loss": 0.5121, + "step": 11657 + }, + { + "epoch": 0.8110195137222165, + "grad_norm": 1.3515625, + "learning_rate": 0.00018150189589794975, + "loss": 0.7866, + "step": 11658 + }, + { + "epoch": 0.8110890813593516, + "grad_norm": 0.9765625, + "learning_rate": 0.0001813724607920093, + "loss": 0.7702, + "step": 11659 + }, + { + "epoch": 0.8111586489964868, + "grad_norm": 1.0625, + "learning_rate": 0.00018124306725184858, + "loss": 0.703, + "step": 11660 + }, + { + "epoch": 0.811228216633622, + "grad_norm": 1.1640625, + "learning_rate": 0.00018111371528403851, + "loss": 0.7809, + "step": 11661 + }, + { + "epoch": 0.8112977842707573, + "grad_norm": 0.91015625, + "learning_rate": 0.00018098440489514668, + "loss": 0.6439, + "step": 11662 + }, + { + "epoch": 0.8113673519078924, + "grad_norm": 1.046875, + "learning_rate": 0.0001808551360917384, + "loss": 0.7882, + "step": 11663 + }, + { + "epoch": 0.8114369195450276, + "grad_norm": 1.1875, + "learning_rate": 0.00018072590888037744, + "loss": 0.9247, + "step": 11664 + }, + { + "epoch": 0.8115064871821629, + "grad_norm": 1.1796875, + "learning_rate": 0.00018059672326762533, + "loss": 0.7383, + "step": 11665 + }, + { + "epoch": 0.8115760548192981, + "grad_norm": 1.4140625, + "learning_rate": 0.00018046757926004164, + "loss": 0.8242, + "step": 11666 + }, + { + "epoch": 0.8116456224564332, + "grad_norm": 0.9609375, + "learning_rate": 0.00018033847686418347, + "loss": 0.91, + "step": 11667 + }, + { + "epoch": 0.8117151900935685, + "grad_norm": 1.046875, + "learning_rate": 0.00018020941608660614, + "loss": 0.7519, + "step": 11668 + }, + { + "epoch": 0.8117847577307037, + "grad_norm": 1.28125, + "learning_rate": 0.00018008039693386246, + "loss": 0.7026, + "step": 11669 + }, + { + "epoch": 0.8118543253678389, + "grad_norm": 1.3828125, + "learning_rate": 0.0001799514194125037, + "loss": 0.749, + "step": 11670 + }, + { + "epoch": 0.8119238930049741, + "grad_norm": 1.0390625, + "learning_rate": 0.00017982248352907827, + "loss": 0.8047, + "step": 11671 + }, + { + "epoch": 0.8119934606421093, + "grad_norm": 1.2109375, + "learning_rate": 0.00017969358929013346, + "loss": 0.8779, + "step": 11672 + }, + { + "epoch": 0.8120630282792445, + "grad_norm": 1.046875, + "learning_rate": 0.0001795647367022135, + "loss": 0.8413, + "step": 11673 + }, + { + "epoch": 0.8121325959163797, + "grad_norm": 1.125, + "learning_rate": 0.00017943592577186063, + "loss": 0.8145, + "step": 11674 + }, + { + "epoch": 0.8122021635535149, + "grad_norm": 1.0859375, + "learning_rate": 0.00017930715650561546, + "loss": 0.9092, + "step": 11675 + }, + { + "epoch": 0.8122717311906501, + "grad_norm": 0.94140625, + "learning_rate": 0.00017917842891001658, + "loss": 0.871, + "step": 11676 + }, + { + "epoch": 0.8123412988277853, + "grad_norm": 1.046875, + "learning_rate": 0.00017904974299159983, + "loss": 0.7144, + "step": 11677 + }, + { + "epoch": 0.8124108664649206, + "grad_norm": 1.2109375, + "learning_rate": 0.000178921098756899, + "loss": 0.7592, + "step": 11678 + }, + { + "epoch": 0.8124804341020557, + "grad_norm": 0.9296875, + "learning_rate": 0.0001787924962124462, + "loss": 0.7098, + "step": 11679 + }, + { + "epoch": 0.8125500017391909, + "grad_norm": 1.0703125, + "learning_rate": 0.00017866393536477155, + "loss": 0.8082, + "step": 11680 + }, + { + "epoch": 0.8126195693763262, + "grad_norm": 1.1796875, + "learning_rate": 0.00017853541622040237, + "loss": 0.5663, + "step": 11681 + }, + { + "epoch": 0.8126891370134613, + "grad_norm": 1.0078125, + "learning_rate": 0.000178406938785864, + "loss": 0.9504, + "step": 11682 + }, + { + "epoch": 0.8127587046505965, + "grad_norm": 1.34375, + "learning_rate": 0.00017827850306768024, + "loss": 0.9961, + "step": 11683 + }, + { + "epoch": 0.8128282722877318, + "grad_norm": 1.0625, + "learning_rate": 0.0001781501090723725, + "loss": 0.8448, + "step": 11684 + }, + { + "epoch": 0.812897839924867, + "grad_norm": 1.125, + "learning_rate": 0.00017802175680645948, + "loss": 0.8075, + "step": 11685 + }, + { + "epoch": 0.8129674075620021, + "grad_norm": 1.1640625, + "learning_rate": 0.00017789344627645897, + "loss": 0.9241, + "step": 11686 + }, + { + "epoch": 0.8130369751991373, + "grad_norm": 0.8984375, + "learning_rate": 0.0001777651774888851, + "loss": 0.7554, + "step": 11687 + }, + { + "epoch": 0.8131065428362726, + "grad_norm": 1.1875, + "learning_rate": 0.00017763695045025152, + "loss": 0.8555, + "step": 11688 + }, + { + "epoch": 0.8131761104734078, + "grad_norm": 1.0234375, + "learning_rate": 0.00017750876516706837, + "loss": 0.7459, + "step": 11689 + }, + { + "epoch": 0.8132456781105429, + "grad_norm": 1.375, + "learning_rate": 0.00017738062164584457, + "loss": 0.5829, + "step": 11690 + }, + { + "epoch": 0.8133152457476782, + "grad_norm": 1.3125, + "learning_rate": 0.00017725251989308654, + "loss": 0.7101, + "step": 11691 + }, + { + "epoch": 0.8133848133848134, + "grad_norm": 1.1640625, + "learning_rate": 0.00017712445991529814, + "loss": 0.7135, + "step": 11692 + }, + { + "epoch": 0.8134543810219486, + "grad_norm": 1.0390625, + "learning_rate": 0.00017699644171898256, + "loss": 0.6507, + "step": 11693 + }, + { + "epoch": 0.8135239486590838, + "grad_norm": 0.98828125, + "learning_rate": 0.0001768684653106395, + "loss": 0.8359, + "step": 11694 + }, + { + "epoch": 0.813593516296219, + "grad_norm": 1.3828125, + "learning_rate": 0.00017674053069676677, + "loss": 0.9942, + "step": 11695 + }, + { + "epoch": 0.8136630839333542, + "grad_norm": 1.1328125, + "learning_rate": 0.00017661263788386005, + "loss": 0.7005, + "step": 11696 + }, + { + "epoch": 0.8137326515704895, + "grad_norm": 1.015625, + "learning_rate": 0.00017648478687841353, + "loss": 0.706, + "step": 11697 + }, + { + "epoch": 0.8138022192076246, + "grad_norm": 1.265625, + "learning_rate": 0.00017635697768691894, + "loss": 0.918, + "step": 11698 + }, + { + "epoch": 0.8138717868447598, + "grad_norm": 1.0078125, + "learning_rate": 0.00017622921031586525, + "loss": 0.7621, + "step": 11699 + }, + { + "epoch": 0.813941354481895, + "grad_norm": 1.0234375, + "learning_rate": 0.00017610148477174037, + "loss": 0.7783, + "step": 11700 + }, + { + "epoch": 0.8140109221190303, + "grad_norm": 0.89453125, + "learning_rate": 0.00017597380106102923, + "loss": 0.7429, + "step": 11701 + }, + { + "epoch": 0.8140804897561654, + "grad_norm": 1.0546875, + "learning_rate": 0.0001758461591902152, + "loss": 0.9648, + "step": 11702 + }, + { + "epoch": 0.8141500573933006, + "grad_norm": 1.1484375, + "learning_rate": 0.00017571855916577895, + "loss": 0.9206, + "step": 11703 + }, + { + "epoch": 0.8142196250304359, + "grad_norm": 1.0703125, + "learning_rate": 0.0001755910009941998, + "loss": 0.9077, + "step": 11704 + }, + { + "epoch": 0.814289192667571, + "grad_norm": 1.2578125, + "learning_rate": 0.0001754634846819543, + "loss": 0.772, + "step": 11705 + }, + { + "epoch": 0.8143587603047062, + "grad_norm": 1.15625, + "learning_rate": 0.0001753360102355166, + "loss": 1.0154, + "step": 11706 + }, + { + "epoch": 0.8144283279418415, + "grad_norm": 1.015625, + "learning_rate": 0.00017520857766136012, + "loss": 0.7274, + "step": 11707 + }, + { + "epoch": 0.8144978955789767, + "grad_norm": 1.296875, + "learning_rate": 0.00017508118696595487, + "loss": 0.9585, + "step": 11708 + }, + { + "epoch": 0.8145674632161118, + "grad_norm": 1.1796875, + "learning_rate": 0.00017495383815576904, + "loss": 0.7493, + "step": 11709 + }, + { + "epoch": 0.8146370308532471, + "grad_norm": 1.03125, + "learning_rate": 0.00017482653123726855, + "loss": 0.7809, + "step": 11710 + }, + { + "epoch": 0.8147065984903823, + "grad_norm": 1.046875, + "learning_rate": 0.00017469926621691757, + "loss": 0.694, + "step": 11711 + }, + { + "epoch": 0.8147761661275175, + "grad_norm": 0.984375, + "learning_rate": 0.00017457204310117837, + "loss": 0.7324, + "step": 11712 + }, + { + "epoch": 0.8148457337646526, + "grad_norm": 1.0234375, + "learning_rate": 0.00017444486189651, + "loss": 0.9471, + "step": 11713 + }, + { + "epoch": 0.8149153014017879, + "grad_norm": 1.2109375, + "learning_rate": 0.00017431772260937073, + "loss": 0.8573, + "step": 11714 + }, + { + "epoch": 0.8149848690389231, + "grad_norm": 1.484375, + "learning_rate": 0.00017419062524621544, + "loss": 0.904, + "step": 11715 + }, + { + "epoch": 0.8150544366760583, + "grad_norm": 1.4453125, + "learning_rate": 0.00017406356981349813, + "loss": 0.8472, + "step": 11716 + }, + { + "epoch": 0.8151240043131935, + "grad_norm": 0.87109375, + "learning_rate": 0.00017393655631766947, + "loss": 0.5351, + "step": 11717 + }, + { + "epoch": 0.8151935719503287, + "grad_norm": 1.1015625, + "learning_rate": 0.00017380958476517904, + "loss": 0.8369, + "step": 11718 + }, + { + "epoch": 0.8152631395874639, + "grad_norm": 1.2265625, + "learning_rate": 0.00017368265516247338, + "loss": 0.7326, + "step": 11719 + }, + { + "epoch": 0.8153327072245992, + "grad_norm": 1.3359375, + "learning_rate": 0.00017355576751599744, + "loss": 0.8288, + "step": 11720 + }, + { + "epoch": 0.8154022748617343, + "grad_norm": 1.03125, + "learning_rate": 0.0001734289218321944, + "loss": 0.7971, + "step": 11721 + }, + { + "epoch": 0.8154718424988695, + "grad_norm": 1.3984375, + "learning_rate": 0.0001733021181175044, + "loss": 1.0176, + "step": 11722 + }, + { + "epoch": 0.8155414101360048, + "grad_norm": 1.2421875, + "learning_rate": 0.00017317535637836602, + "loss": 0.7279, + "step": 11723 + }, + { + "epoch": 0.81561097777314, + "grad_norm": 1.3828125, + "learning_rate": 0.00017304863662121527, + "loss": 1.0087, + "step": 11724 + }, + { + "epoch": 0.8156805454102751, + "grad_norm": 1.0234375, + "learning_rate": 0.00017292195885248662, + "loss": 0.5907, + "step": 11725 + }, + { + "epoch": 0.8157501130474103, + "grad_norm": 1.3203125, + "learning_rate": 0.00017279532307861245, + "loss": 0.9269, + "step": 11726 + }, + { + "epoch": 0.8158196806845456, + "grad_norm": 0.98046875, + "learning_rate": 0.00017266872930602197, + "loss": 0.7344, + "step": 11727 + }, + { + "epoch": 0.8158892483216807, + "grad_norm": 1.0390625, + "learning_rate": 0.00017254217754114365, + "loss": 0.7148, + "step": 11728 + }, + { + "epoch": 0.8159588159588159, + "grad_norm": 1.125, + "learning_rate": 0.00017241566779040263, + "loss": 0.7071, + "step": 11729 + }, + { + "epoch": 0.8160283835959512, + "grad_norm": 1.1015625, + "learning_rate": 0.00017228920006022287, + "loss": 1.045, + "step": 11730 + }, + { + "epoch": 0.8160979512330864, + "grad_norm": 1.234375, + "learning_rate": 0.00017216277435702542, + "loss": 0.9198, + "step": 11731 + }, + { + "epoch": 0.8161675188702215, + "grad_norm": 1.0390625, + "learning_rate": 0.00017203639068722975, + "loss": 0.7394, + "step": 11732 + }, + { + "epoch": 0.8162370865073568, + "grad_norm": 1.125, + "learning_rate": 0.00017191004905725283, + "loss": 0.8574, + "step": 11733 + }, + { + "epoch": 0.816306654144492, + "grad_norm": 0.8203125, + "learning_rate": 0.0001717837494735097, + "loss": 0.6116, + "step": 11734 + }, + { + "epoch": 0.8163762217816272, + "grad_norm": 1.0703125, + "learning_rate": 0.00017165749194241343, + "loss": 0.6934, + "step": 11735 + }, + { + "epoch": 0.8164457894187624, + "grad_norm": 1.1171875, + "learning_rate": 0.00017153127647037458, + "loss": 0.7483, + "step": 11736 + }, + { + "epoch": 0.8165153570558976, + "grad_norm": 1.125, + "learning_rate": 0.00017140510306380176, + "loss": 0.8049, + "step": 11737 + }, + { + "epoch": 0.8165849246930328, + "grad_norm": 1.1328125, + "learning_rate": 0.0001712789717291009, + "loss": 0.9112, + "step": 11738 + }, + { + "epoch": 0.816654492330168, + "grad_norm": 1.0234375, + "learning_rate": 0.00017115288247267725, + "loss": 0.868, + "step": 11739 + }, + { + "epoch": 0.8167240599673032, + "grad_norm": 1.3984375, + "learning_rate": 0.00017102683530093255, + "loss": 0.7093, + "step": 11740 + }, + { + "epoch": 0.8167936276044384, + "grad_norm": 0.96484375, + "learning_rate": 0.0001709008302202666, + "loss": 0.8052, + "step": 11741 + }, + { + "epoch": 0.8168631952415736, + "grad_norm": 1.109375, + "learning_rate": 0.0001707748672370777, + "loss": 0.9135, + "step": 11742 + }, + { + "epoch": 0.8169327628787089, + "grad_norm": 1.3671875, + "learning_rate": 0.00017064894635776117, + "loss": 0.7799, + "step": 11743 + }, + { + "epoch": 0.817002330515844, + "grad_norm": 1.21875, + "learning_rate": 0.00017052306758871127, + "loss": 0.9484, + "step": 11744 + }, + { + "epoch": 0.8170718981529792, + "grad_norm": 1.125, + "learning_rate": 0.00017039723093631876, + "loss": 0.8796, + "step": 11745 + }, + { + "epoch": 0.8171414657901145, + "grad_norm": 1.0703125, + "learning_rate": 0.00017027143640697362, + "loss": 0.6473, + "step": 11746 + }, + { + "epoch": 0.8172110334272497, + "grad_norm": 1.25, + "learning_rate": 0.00017014568400706265, + "loss": 0.7669, + "step": 11747 + }, + { + "epoch": 0.8172806010643848, + "grad_norm": 1.4296875, + "learning_rate": 0.00017001997374297095, + "loss": 0.9723, + "step": 11748 + }, + { + "epoch": 0.8173501687015201, + "grad_norm": 0.953125, + "learning_rate": 0.00016989430562108188, + "loss": 0.7161, + "step": 11749 + }, + { + "epoch": 0.8174197363386553, + "grad_norm": 1.078125, + "learning_rate": 0.00016976867964777598, + "loss": 0.894, + "step": 11750 + }, + { + "epoch": 0.8174893039757904, + "grad_norm": 1.2109375, + "learning_rate": 0.0001696430958294315, + "loss": 0.7189, + "step": 11751 + }, + { + "epoch": 0.8175588716129256, + "grad_norm": 1.46875, + "learning_rate": 0.0001695175541724253, + "loss": 0.8627, + "step": 11752 + }, + { + "epoch": 0.8176284392500609, + "grad_norm": 1.265625, + "learning_rate": 0.00016939205468313213, + "loss": 0.9201, + "step": 11753 + }, + { + "epoch": 0.8176980068871961, + "grad_norm": 0.76953125, + "learning_rate": 0.0001692665973679237, + "loss": 0.4185, + "step": 11754 + }, + { + "epoch": 0.8177675745243312, + "grad_norm": 1.0546875, + "learning_rate": 0.00016914118223317033, + "loss": 0.7814, + "step": 11755 + }, + { + "epoch": 0.8178371421614665, + "grad_norm": 1.328125, + "learning_rate": 0.00016901580928523963, + "loss": 0.5873, + "step": 11756 + }, + { + "epoch": 0.8179067097986017, + "grad_norm": 1.5546875, + "learning_rate": 0.00016889047853049766, + "loss": 0.9774, + "step": 11757 + }, + { + "epoch": 0.8179762774357369, + "grad_norm": 0.81640625, + "learning_rate": 0.00016876518997530843, + "loss": 0.6748, + "step": 11758 + }, + { + "epoch": 0.8180458450728721, + "grad_norm": 1.1328125, + "learning_rate": 0.00016863994362603275, + "loss": 0.7434, + "step": 11759 + }, + { + "epoch": 0.8181154127100073, + "grad_norm": 0.984375, + "learning_rate": 0.00016851473948903062, + "loss": 0.9599, + "step": 11760 + }, + { + "epoch": 0.8181849803471425, + "grad_norm": 0.94140625, + "learning_rate": 0.00016838957757065877, + "loss": 0.6192, + "step": 11761 + }, + { + "epoch": 0.8182545479842778, + "grad_norm": 1.1796875, + "learning_rate": 0.00016826445787727285, + "loss": 0.7509, + "step": 11762 + }, + { + "epoch": 0.8183241156214129, + "grad_norm": 1.125, + "learning_rate": 0.00016813938041522526, + "loss": 0.7313, + "step": 11763 + }, + { + "epoch": 0.8183936832585481, + "grad_norm": 1.1171875, + "learning_rate": 0.00016801434519086723, + "loss": 0.8128, + "step": 11764 + }, + { + "epoch": 0.8184632508956833, + "grad_norm": 1.203125, + "learning_rate": 0.00016788935221054703, + "loss": 0.9444, + "step": 11765 + }, + { + "epoch": 0.8185328185328186, + "grad_norm": 1.140625, + "learning_rate": 0.00016776440148061133, + "loss": 0.9276, + "step": 11766 + }, + { + "epoch": 0.8186023861699537, + "grad_norm": 1.4453125, + "learning_rate": 0.0001676394930074049, + "loss": 0.6666, + "step": 11767 + }, + { + "epoch": 0.8186719538070889, + "grad_norm": 1.2265625, + "learning_rate": 0.00016751462679726948, + "loss": 0.7016, + "step": 11768 + }, + { + "epoch": 0.8187415214442242, + "grad_norm": 1.328125, + "learning_rate": 0.00016738980285654537, + "loss": 0.8631, + "step": 11769 + }, + { + "epoch": 0.8188110890813594, + "grad_norm": 1.078125, + "learning_rate": 0.00016726502119156984, + "loss": 0.7953, + "step": 11770 + }, + { + "epoch": 0.8188806567184945, + "grad_norm": 1.046875, + "learning_rate": 0.0001671402818086797, + "loss": 0.8171, + "step": 11771 + }, + { + "epoch": 0.8189502243556298, + "grad_norm": 1.0703125, + "learning_rate": 0.0001670155847142082, + "loss": 0.5757, + "step": 11772 + }, + { + "epoch": 0.819019791992765, + "grad_norm": 1.1640625, + "learning_rate": 0.0001668909299144865, + "loss": 0.9231, + "step": 11773 + }, + { + "epoch": 0.8190893596299001, + "grad_norm": 1.2734375, + "learning_rate": 0.00016676631741584447, + "loss": 0.7563, + "step": 11774 + }, + { + "epoch": 0.8191589272670354, + "grad_norm": 0.94921875, + "learning_rate": 0.00016664174722460866, + "loss": 0.6916, + "step": 11775 + }, + { + "epoch": 0.8192284949041706, + "grad_norm": 1.1796875, + "learning_rate": 0.00016651721934710483, + "loss": 0.7551, + "step": 11776 + }, + { + "epoch": 0.8192980625413058, + "grad_norm": 1.21875, + "learning_rate": 0.00016639273378965536, + "loss": 0.697, + "step": 11777 + }, + { + "epoch": 0.8193676301784409, + "grad_norm": 1.3203125, + "learning_rate": 0.00016626829055858128, + "loss": 0.9492, + "step": 11778 + }, + { + "epoch": 0.8194371978155762, + "grad_norm": 1.046875, + "learning_rate": 0.000166143889660201, + "loss": 0.6445, + "step": 11779 + }, + { + "epoch": 0.8195067654527114, + "grad_norm": 1.34375, + "learning_rate": 0.0001660195311008309, + "loss": 0.8859, + "step": 11780 + }, + { + "epoch": 0.8195763330898466, + "grad_norm": 1.1953125, + "learning_rate": 0.00016589521488678582, + "loss": 0.8174, + "step": 11781 + }, + { + "epoch": 0.8196459007269818, + "grad_norm": 0.8828125, + "learning_rate": 0.0001657709410243774, + "loss": 0.6393, + "step": 11782 + }, + { + "epoch": 0.819715468364117, + "grad_norm": 1.28125, + "learning_rate": 0.00016564670951991556, + "loss": 0.9107, + "step": 11783 + }, + { + "epoch": 0.8197850360012522, + "grad_norm": 1.2890625, + "learning_rate": 0.00016552252037970838, + "loss": 0.965, + "step": 11784 + }, + { + "epoch": 0.8198546036383875, + "grad_norm": 1.2734375, + "learning_rate": 0.00016539837361006184, + "loss": 0.6285, + "step": 11785 + }, + { + "epoch": 0.8199241712755226, + "grad_norm": 1.125, + "learning_rate": 0.00016527426921727917, + "loss": 0.5821, + "step": 11786 + }, + { + "epoch": 0.8199937389126578, + "grad_norm": 1.3125, + "learning_rate": 0.00016515020720766149, + "loss": 0.7352, + "step": 11787 + }, + { + "epoch": 0.820063306549793, + "grad_norm": 1.1796875, + "learning_rate": 0.00016502618758750854, + "loss": 0.9191, + "step": 11788 + }, + { + "epoch": 0.8201328741869283, + "grad_norm": 1.0703125, + "learning_rate": 0.00016490221036311704, + "loss": 0.8733, + "step": 11789 + }, + { + "epoch": 0.8202024418240634, + "grad_norm": 1.234375, + "learning_rate": 0.00016477827554078228, + "loss": 0.9191, + "step": 11790 + }, + { + "epoch": 0.8202720094611986, + "grad_norm": 1.1875, + "learning_rate": 0.0001646543831267966, + "loss": 0.7753, + "step": 11791 + }, + { + "epoch": 0.8203415770983339, + "grad_norm": 1.3046875, + "learning_rate": 0.00016453053312745115, + "loss": 0.689, + "step": 11792 + }, + { + "epoch": 0.8204111447354691, + "grad_norm": 1.3046875, + "learning_rate": 0.0001644067255490339, + "loss": 0.8691, + "step": 11793 + }, + { + "epoch": 0.8204807123726042, + "grad_norm": 1.234375, + "learning_rate": 0.00016428296039783152, + "loss": 0.6144, + "step": 11794 + }, + { + "epoch": 0.8205502800097395, + "grad_norm": 1.1484375, + "learning_rate": 0.0001641592376801282, + "loss": 1.0364, + "step": 11795 + }, + { + "epoch": 0.8206198476468747, + "grad_norm": 0.8671875, + "learning_rate": 0.0001640355574022059, + "loss": 0.694, + "step": 11796 + }, + { + "epoch": 0.8206894152840098, + "grad_norm": 1.5703125, + "learning_rate": 0.00016391191957034422, + "loss": 0.8732, + "step": 11797 + }, + { + "epoch": 0.8207589829211451, + "grad_norm": 1.515625, + "learning_rate": 0.00016378832419082102, + "loss": 0.9643, + "step": 11798 + }, + { + "epoch": 0.8208285505582803, + "grad_norm": 1.234375, + "learning_rate": 0.00016366477126991208, + "loss": 0.8182, + "step": 11799 + }, + { + "epoch": 0.8208981181954155, + "grad_norm": 0.98828125, + "learning_rate": 0.00016354126081389076, + "loss": 0.705, + "step": 11800 + }, + { + "epoch": 0.8209676858325506, + "grad_norm": 1.1796875, + "learning_rate": 0.0001634177928290278, + "loss": 0.7874, + "step": 11801 + }, + { + "epoch": 0.8210372534696859, + "grad_norm": 0.99609375, + "learning_rate": 0.00016329436732159263, + "loss": 0.5235, + "step": 11802 + }, + { + "epoch": 0.8211068211068211, + "grad_norm": 0.87109375, + "learning_rate": 0.00016317098429785248, + "loss": 0.6098, + "step": 11803 + }, + { + "epoch": 0.8211763887439563, + "grad_norm": 1.296875, + "learning_rate": 0.00016304764376407177, + "loss": 0.7253, + "step": 11804 + }, + { + "epoch": 0.8212459563810915, + "grad_norm": 1.015625, + "learning_rate": 0.00016292434572651293, + "loss": 0.6622, + "step": 11805 + }, + { + "epoch": 0.8213155240182267, + "grad_norm": 1.0546875, + "learning_rate": 0.00016280109019143685, + "loss": 0.5838, + "step": 11806 + }, + { + "epoch": 0.8213850916553619, + "grad_norm": 1.0859375, + "learning_rate": 0.00016267787716510142, + "loss": 0.7913, + "step": 11807 + }, + { + "epoch": 0.8214546592924972, + "grad_norm": 0.91796875, + "learning_rate": 0.00016255470665376304, + "loss": 0.6731, + "step": 11808 + }, + { + "epoch": 0.8215242269296323, + "grad_norm": 1.546875, + "learning_rate": 0.00016243157866367575, + "loss": 1.1183, + "step": 11809 + }, + { + "epoch": 0.8215937945667675, + "grad_norm": 1.0859375, + "learning_rate": 0.0001623084932010912, + "loss": 0.8298, + "step": 11810 + }, + { + "epoch": 0.8216633622039028, + "grad_norm": 1.1875, + "learning_rate": 0.00016218545027225895, + "loss": 0.7191, + "step": 11811 + }, + { + "epoch": 0.821732929841038, + "grad_norm": 1.1484375, + "learning_rate": 0.00016206244988342666, + "loss": 0.7012, + "step": 11812 + }, + { + "epoch": 0.8218024974781731, + "grad_norm": 1.03125, + "learning_rate": 0.0001619394920408398, + "loss": 0.8181, + "step": 11813 + }, + { + "epoch": 0.8218720651153083, + "grad_norm": 1.1875, + "learning_rate": 0.00016181657675074147, + "loss": 0.6448, + "step": 11814 + }, + { + "epoch": 0.8219416327524436, + "grad_norm": 1.2734375, + "learning_rate": 0.00016169370401937223, + "loss": 0.7898, + "step": 11815 + }, + { + "epoch": 0.8220112003895788, + "grad_norm": 1.1484375, + "learning_rate": 0.00016157087385297142, + "loss": 0.797, + "step": 11816 + }, + { + "epoch": 0.8220807680267139, + "grad_norm": 0.9296875, + "learning_rate": 0.00016144808625777595, + "loss": 0.7012, + "step": 11817 + }, + { + "epoch": 0.8221503356638492, + "grad_norm": 1.2578125, + "learning_rate": 0.00016132534124001997, + "loss": 0.7456, + "step": 11818 + }, + { + "epoch": 0.8222199033009844, + "grad_norm": 1.2265625, + "learning_rate": 0.00016120263880593566, + "loss": 0.7268, + "step": 11819 + }, + { + "epoch": 0.8222894709381195, + "grad_norm": 1.0703125, + "learning_rate": 0.00016107997896175374, + "loss": 0.7633, + "step": 11820 + }, + { + "epoch": 0.8223590385752548, + "grad_norm": 1.21875, + "learning_rate": 0.0001609573617137019, + "loss": 0.8158, + "step": 11821 + }, + { + "epoch": 0.82242860621239, + "grad_norm": 0.94140625, + "learning_rate": 0.00016083478706800604, + "loss": 0.6253, + "step": 11822 + }, + { + "epoch": 0.8224981738495252, + "grad_norm": 1.1171875, + "learning_rate": 0.00016071225503089026, + "loss": 0.5911, + "step": 11823 + }, + { + "epoch": 0.8225677414866605, + "grad_norm": 0.921875, + "learning_rate": 0.00016058976560857574, + "loss": 0.5473, + "step": 11824 + }, + { + "epoch": 0.8226373091237956, + "grad_norm": 0.9375, + "learning_rate": 0.00016046731880728184, + "loss": 0.6328, + "step": 11825 + }, + { + "epoch": 0.8227068767609308, + "grad_norm": 1.21875, + "learning_rate": 0.000160344914633226, + "loss": 0.8415, + "step": 11826 + }, + { + "epoch": 0.822776444398066, + "grad_norm": 0.96484375, + "learning_rate": 0.00016022255309262334, + "loss": 0.8935, + "step": 11827 + }, + { + "epoch": 0.8228460120352012, + "grad_norm": 1.0390625, + "learning_rate": 0.00016010023419168673, + "loss": 0.7528, + "step": 11828 + }, + { + "epoch": 0.8229155796723364, + "grad_norm": 0.96875, + "learning_rate": 0.0001599779579366265, + "loss": 0.7167, + "step": 11829 + }, + { + "epoch": 0.8229851473094716, + "grad_norm": 1.53125, + "learning_rate": 0.00015985572433365158, + "loss": 0.9432, + "step": 11830 + }, + { + "epoch": 0.8230547149466069, + "grad_norm": 1.4765625, + "learning_rate": 0.00015973353338896856, + "loss": 0.9314, + "step": 11831 + }, + { + "epoch": 0.823124282583742, + "grad_norm": 1.015625, + "learning_rate": 0.0001596113851087815, + "loss": 0.818, + "step": 11832 + }, + { + "epoch": 0.8231938502208772, + "grad_norm": 1.28125, + "learning_rate": 0.00015948927949929216, + "loss": 0.7425, + "step": 11833 + }, + { + "epoch": 0.8232634178580125, + "grad_norm": 1.0546875, + "learning_rate": 0.0001593672165667007, + "loss": 0.6268, + "step": 11834 + }, + { + "epoch": 0.8233329854951477, + "grad_norm": 1.0234375, + "learning_rate": 0.00015924519631720514, + "loss": 0.8678, + "step": 11835 + }, + { + "epoch": 0.8234025531322828, + "grad_norm": 1.5078125, + "learning_rate": 0.00015912321875700074, + "loss": 1.029, + "step": 11836 + }, + { + "epoch": 0.8234721207694181, + "grad_norm": 0.953125, + "learning_rate": 0.00015900128389228086, + "loss": 0.8239, + "step": 11837 + }, + { + "epoch": 0.8235416884065533, + "grad_norm": 1.28125, + "learning_rate": 0.00015887939172923692, + "loss": 0.6994, + "step": 11838 + }, + { + "epoch": 0.8236112560436885, + "grad_norm": 0.98828125, + "learning_rate": 0.0001587575422740578, + "loss": 0.5914, + "step": 11839 + }, + { + "epoch": 0.8236808236808236, + "grad_norm": 0.8671875, + "learning_rate": 0.00015863573553293042, + "loss": 0.7486, + "step": 11840 + }, + { + "epoch": 0.8237503913179589, + "grad_norm": 0.99609375, + "learning_rate": 0.00015851397151203983, + "loss": 0.7262, + "step": 11841 + }, + { + "epoch": 0.8238199589550941, + "grad_norm": 0.83984375, + "learning_rate": 0.0001583922502175684, + "loss": 0.7706, + "step": 11842 + }, + { + "epoch": 0.8238895265922292, + "grad_norm": 1.0703125, + "learning_rate": 0.00015827057165569624, + "loss": 0.7025, + "step": 11843 + }, + { + "epoch": 0.8239590942293645, + "grad_norm": 1.03125, + "learning_rate": 0.0001581489358326018, + "loss": 0.7226, + "step": 11844 + }, + { + "epoch": 0.8240286618664997, + "grad_norm": 0.82421875, + "learning_rate": 0.0001580273427544614, + "loss": 0.5513, + "step": 11845 + }, + { + "epoch": 0.8240982295036349, + "grad_norm": 1.03125, + "learning_rate": 0.00015790579242744873, + "loss": 0.8411, + "step": 11846 + }, + { + "epoch": 0.8241677971407702, + "grad_norm": 1.046875, + "learning_rate": 0.00015778428485773522, + "loss": 0.8452, + "step": 11847 + }, + { + "epoch": 0.8242373647779053, + "grad_norm": 1.09375, + "learning_rate": 0.00015766282005149056, + "loss": 0.6987, + "step": 11848 + }, + { + "epoch": 0.8243069324150405, + "grad_norm": 1.3203125, + "learning_rate": 0.00015754139801488256, + "loss": 0.9424, + "step": 11849 + }, + { + "epoch": 0.8243765000521758, + "grad_norm": 1.03125, + "learning_rate": 0.00015742001875407598, + "loss": 0.6831, + "step": 11850 + }, + { + "epoch": 0.824446067689311, + "grad_norm": 0.7109375, + "learning_rate": 0.0001572986822752336, + "loss": 0.6033, + "step": 11851 + }, + { + "epoch": 0.8245156353264461, + "grad_norm": 1.1953125, + "learning_rate": 0.000157177388584517, + "loss": 0.9047, + "step": 11852 + }, + { + "epoch": 0.8245852029635813, + "grad_norm": 1.125, + "learning_rate": 0.00015705613768808414, + "loss": 0.8724, + "step": 11853 + }, + { + "epoch": 0.8246547706007166, + "grad_norm": 0.99609375, + "learning_rate": 0.00015693492959209187, + "loss": 0.8429, + "step": 11854 + }, + { + "epoch": 0.8247243382378517, + "grad_norm": 0.8359375, + "learning_rate": 0.0001568137643026948, + "loss": 0.6627, + "step": 11855 + }, + { + "epoch": 0.8247939058749869, + "grad_norm": 1.1171875, + "learning_rate": 0.0001566926418260447, + "loss": 0.8518, + "step": 11856 + }, + { + "epoch": 0.8248634735121222, + "grad_norm": 1.1015625, + "learning_rate": 0.00015657156216829148, + "loss": 0.938, + "step": 11857 + }, + { + "epoch": 0.8249330411492574, + "grad_norm": 1.2578125, + "learning_rate": 0.00015645052533558323, + "loss": 0.7418, + "step": 11858 + }, + { + "epoch": 0.8250026087863925, + "grad_norm": 1.015625, + "learning_rate": 0.0001563295313340657, + "loss": 0.7214, + "step": 11859 + }, + { + "epoch": 0.8250721764235278, + "grad_norm": 1.0625, + "learning_rate": 0.00015620858016988205, + "loss": 0.857, + "step": 11860 + }, + { + "epoch": 0.825141744060663, + "grad_norm": 0.89453125, + "learning_rate": 0.0001560876718491735, + "loss": 0.5527, + "step": 11861 + }, + { + "epoch": 0.8252113116977982, + "grad_norm": 1.3125, + "learning_rate": 0.00015596680637807936, + "loss": 0.8925, + "step": 11862 + }, + { + "epoch": 0.8252808793349334, + "grad_norm": 1.2421875, + "learning_rate": 0.00015584598376273674, + "loss": 0.6308, + "step": 11863 + }, + { + "epoch": 0.8253504469720686, + "grad_norm": 0.98046875, + "learning_rate": 0.00015572520400928026, + "loss": 0.7314, + "step": 11864 + }, + { + "epoch": 0.8254200146092038, + "grad_norm": 1.15625, + "learning_rate": 0.00015560446712384223, + "loss": 0.7527, + "step": 11865 + }, + { + "epoch": 0.825489582246339, + "grad_norm": 1.03125, + "learning_rate": 0.00015548377311255324, + "loss": 0.6309, + "step": 11866 + }, + { + "epoch": 0.8255591498834742, + "grad_norm": 0.80859375, + "learning_rate": 0.0001553631219815419, + "loss": 0.4498, + "step": 11867 + }, + { + "epoch": 0.8256287175206094, + "grad_norm": 1.1328125, + "learning_rate": 0.00015524251373693354, + "loss": 0.7638, + "step": 11868 + }, + { + "epoch": 0.8256982851577446, + "grad_norm": 1.296875, + "learning_rate": 0.00015512194838485284, + "loss": 0.846, + "step": 11869 + }, + { + "epoch": 0.8257678527948799, + "grad_norm": 0.9765625, + "learning_rate": 0.0001550014259314211, + "loss": 0.983, + "step": 11870 + }, + { + "epoch": 0.825837420432015, + "grad_norm": 1.2109375, + "learning_rate": 0.00015488094638275751, + "loss": 0.7911, + "step": 11871 + }, + { + "epoch": 0.8259069880691502, + "grad_norm": 1.3671875, + "learning_rate": 0.0001547605097449798, + "loss": 0.9909, + "step": 11872 + }, + { + "epoch": 0.8259765557062855, + "grad_norm": 1.0703125, + "learning_rate": 0.00015464011602420324, + "loss": 0.8125, + "step": 11873 + }, + { + "epoch": 0.8260461233434206, + "grad_norm": 1.078125, + "learning_rate": 0.00015451976522654076, + "loss": 0.9632, + "step": 11874 + }, + { + "epoch": 0.8261156909805558, + "grad_norm": 0.98046875, + "learning_rate": 0.0001543994573581028, + "loss": 0.7242, + "step": 11875 + }, + { + "epoch": 0.8261852586176911, + "grad_norm": 1.140625, + "learning_rate": 0.00015427919242499822, + "loss": 0.5407, + "step": 11876 + }, + { + "epoch": 0.8262548262548263, + "grad_norm": 1.2421875, + "learning_rate": 0.0001541589704333337, + "loss": 0.798, + "step": 11877 + }, + { + "epoch": 0.8263243938919614, + "grad_norm": 1.203125, + "learning_rate": 0.0001540387913892134, + "loss": 0.8822, + "step": 11878 + }, + { + "epoch": 0.8263939615290966, + "grad_norm": 1.15625, + "learning_rate": 0.00015391865529873906, + "loss": 0.7547, + "step": 11879 + }, + { + "epoch": 0.8264635291662319, + "grad_norm": 1.21875, + "learning_rate": 0.0001537985621680108, + "loss": 0.7576, + "step": 11880 + }, + { + "epoch": 0.8265330968033671, + "grad_norm": 1.1484375, + "learning_rate": 0.00015367851200312666, + "loss": 0.6437, + "step": 11881 + }, + { + "epoch": 0.8266026644405022, + "grad_norm": 1.484375, + "learning_rate": 0.00015355850481018162, + "loss": 0.859, + "step": 11882 + }, + { + "epoch": 0.8266722320776375, + "grad_norm": 1.0625, + "learning_rate": 0.00015343854059526952, + "loss": 0.703, + "step": 11883 + }, + { + "epoch": 0.8267417997147727, + "grad_norm": 1.09375, + "learning_rate": 0.00015331861936448144, + "loss": 0.7281, + "step": 11884 + }, + { + "epoch": 0.8268113673519079, + "grad_norm": 0.94921875, + "learning_rate": 0.00015319874112390598, + "loss": 0.8862, + "step": 11885 + }, + { + "epoch": 0.8268809349890431, + "grad_norm": 1.2890625, + "learning_rate": 0.00015307890587963036, + "loss": 0.8087, + "step": 11886 + }, + { + "epoch": 0.8269505026261783, + "grad_norm": 1.1796875, + "learning_rate": 0.00015295911363773918, + "loss": 0.7998, + "step": 11887 + }, + { + "epoch": 0.8270200702633135, + "grad_norm": 1.21875, + "learning_rate": 0.0001528393644043149, + "loss": 0.9382, + "step": 11888 + }, + { + "epoch": 0.8270896379004488, + "grad_norm": 1.203125, + "learning_rate": 0.00015271965818543744, + "loss": 0.8733, + "step": 11889 + }, + { + "epoch": 0.8271592055375839, + "grad_norm": 1.28125, + "learning_rate": 0.00015259999498718513, + "loss": 0.8205, + "step": 11890 + }, + { + "epoch": 0.8272287731747191, + "grad_norm": 1.1328125, + "learning_rate": 0.00015248037481563415, + "loss": 0.7617, + "step": 11891 + }, + { + "epoch": 0.8272983408118543, + "grad_norm": 1.1484375, + "learning_rate": 0.00015236079767685785, + "loss": 0.6772, + "step": 11892 + }, + { + "epoch": 0.8273679084489896, + "grad_norm": 0.77734375, + "learning_rate": 0.00015224126357692757, + "loss": 0.4945, + "step": 11893 + }, + { + "epoch": 0.8274374760861247, + "grad_norm": 1.0, + "learning_rate": 0.00015212177252191294, + "loss": 0.8874, + "step": 11894 + }, + { + "epoch": 0.8275070437232599, + "grad_norm": 1.2890625, + "learning_rate": 0.00015200232451788133, + "loss": 0.8229, + "step": 11895 + }, + { + "epoch": 0.8275766113603952, + "grad_norm": 0.9453125, + "learning_rate": 0.00015188291957089718, + "loss": 0.6975, + "step": 11896 + }, + { + "epoch": 0.8276461789975303, + "grad_norm": 1.0390625, + "learning_rate": 0.00015176355768702388, + "loss": 0.6228, + "step": 11897 + }, + { + "epoch": 0.8277157466346655, + "grad_norm": 1.140625, + "learning_rate": 0.0001516442388723216, + "loss": 0.8128, + "step": 11898 + }, + { + "epoch": 0.8277853142718008, + "grad_norm": 1.3515625, + "learning_rate": 0.0001515249631328486, + "loss": 1.1627, + "step": 11899 + }, + { + "epoch": 0.827854881908936, + "grad_norm": 1.2265625, + "learning_rate": 0.00015140573047466133, + "loss": 0.6686, + "step": 11900 + }, + { + "epoch": 0.8279244495460711, + "grad_norm": 1.25, + "learning_rate": 0.0001512865409038141, + "loss": 1.0271, + "step": 11901 + }, + { + "epoch": 0.8279940171832064, + "grad_norm": 0.96875, + "learning_rate": 0.00015116739442635853, + "loss": 0.8546, + "step": 11902 + }, + { + "epoch": 0.8280635848203416, + "grad_norm": 1.3046875, + "learning_rate": 0.00015104829104834394, + "loss": 0.7779, + "step": 11903 + }, + { + "epoch": 0.8281331524574768, + "grad_norm": 0.9921875, + "learning_rate": 0.0001509292307758181, + "loss": 0.7168, + "step": 11904 + }, + { + "epoch": 0.8282027200946119, + "grad_norm": 1.328125, + "learning_rate": 0.00015081021361482662, + "loss": 0.8175, + "step": 11905 + }, + { + "epoch": 0.8282722877317472, + "grad_norm": 1.09375, + "learning_rate": 0.00015069123957141219, + "loss": 0.8798, + "step": 11906 + }, + { + "epoch": 0.8283418553688824, + "grad_norm": 1.171875, + "learning_rate": 0.00015057230865161552, + "loss": 1.0727, + "step": 11907 + }, + { + "epoch": 0.8284114230060176, + "grad_norm": 1.203125, + "learning_rate": 0.00015045342086147562, + "loss": 1.0258, + "step": 11908 + }, + { + "epoch": 0.8284809906431528, + "grad_norm": 1.0, + "learning_rate": 0.00015033457620702918, + "loss": 0.9374, + "step": 11909 + }, + { + "epoch": 0.828550558280288, + "grad_norm": 1.2890625, + "learning_rate": 0.00015021577469431037, + "loss": 0.7597, + "step": 11910 + }, + { + "epoch": 0.8286201259174232, + "grad_norm": 1.46875, + "learning_rate": 0.00015009701632935103, + "loss": 0.9634, + "step": 11911 + }, + { + "epoch": 0.8286896935545585, + "grad_norm": 1.0390625, + "learning_rate": 0.00014997830111818133, + "loss": 0.9563, + "step": 11912 + }, + { + "epoch": 0.8287592611916936, + "grad_norm": 1.5859375, + "learning_rate": 0.00014985962906682938, + "loss": 0.5603, + "step": 11913 + }, + { + "epoch": 0.8288288288288288, + "grad_norm": 1.140625, + "learning_rate": 0.00014974100018132018, + "loss": 0.818, + "step": 11914 + }, + { + "epoch": 0.8288983964659641, + "grad_norm": 1.203125, + "learning_rate": 0.00014962241446767765, + "loss": 0.9945, + "step": 11915 + }, + { + "epoch": 0.8289679641030993, + "grad_norm": 1.1796875, + "learning_rate": 0.0001495038719319226, + "loss": 0.8931, + "step": 11916 + }, + { + "epoch": 0.8290375317402344, + "grad_norm": 1.2109375, + "learning_rate": 0.0001493853725800739, + "loss": 0.7958, + "step": 11917 + }, + { + "epoch": 0.8291070993773696, + "grad_norm": 1.0234375, + "learning_rate": 0.0001492669164181486, + "loss": 0.9674, + "step": 11918 + }, + { + "epoch": 0.8291766670145049, + "grad_norm": 1.0859375, + "learning_rate": 0.00014914850345216146, + "loss": 0.8706, + "step": 11919 + }, + { + "epoch": 0.82924623465164, + "grad_norm": 1.2109375, + "learning_rate": 0.00014903013368812478, + "loss": 0.7766, + "step": 11920 + }, + { + "epoch": 0.8293158022887752, + "grad_norm": 1.171875, + "learning_rate": 0.00014891180713204845, + "loss": 1.0476, + "step": 11921 + }, + { + "epoch": 0.8293853699259105, + "grad_norm": 1.34375, + "learning_rate": 0.0001487935237899407, + "loss": 0.9776, + "step": 11922 + }, + { + "epoch": 0.8294549375630457, + "grad_norm": 1.375, + "learning_rate": 0.0001486752836678077, + "loss": 0.7633, + "step": 11923 + }, + { + "epoch": 0.8295245052001808, + "grad_norm": 1.015625, + "learning_rate": 0.0001485570867716528, + "loss": 0.9992, + "step": 11924 + }, + { + "epoch": 0.8295940728373161, + "grad_norm": 0.921875, + "learning_rate": 0.00014843893310747714, + "loss": 0.8413, + "step": 11925 + }, + { + "epoch": 0.8296636404744513, + "grad_norm": 1.015625, + "learning_rate": 0.00014832082268128032, + "loss": 0.8898, + "step": 11926 + }, + { + "epoch": 0.8297332081115865, + "grad_norm": 1.2578125, + "learning_rate": 0.00014820275549905958, + "loss": 0.7373, + "step": 11927 + }, + { + "epoch": 0.8298027757487217, + "grad_norm": 1.1484375, + "learning_rate": 0.00014808473156680934, + "loss": 0.8684, + "step": 11928 + }, + { + "epoch": 0.8298723433858569, + "grad_norm": 1.546875, + "learning_rate": 0.0001479667508905227, + "loss": 1.0323, + "step": 11929 + }, + { + "epoch": 0.8299419110229921, + "grad_norm": 1.1015625, + "learning_rate": 0.00014784881347618985, + "loss": 1.0172, + "step": 11930 + }, + { + "epoch": 0.8300114786601273, + "grad_norm": 0.94140625, + "learning_rate": 0.00014773091932979886, + "loss": 0.8575, + "step": 11931 + }, + { + "epoch": 0.8300810462972625, + "grad_norm": 1.1171875, + "learning_rate": 0.00014761306845733602, + "loss": 0.7712, + "step": 11932 + }, + { + "epoch": 0.8301506139343977, + "grad_norm": 1.6328125, + "learning_rate": 0.00014749526086478538, + "loss": 1.0773, + "step": 11933 + }, + { + "epoch": 0.8302201815715329, + "grad_norm": 1.3046875, + "learning_rate": 0.0001473774965581286, + "loss": 0.836, + "step": 11934 + }, + { + "epoch": 0.8302897492086682, + "grad_norm": 1.125, + "learning_rate": 0.0001472597755433447, + "loss": 0.9085, + "step": 11935 + }, + { + "epoch": 0.8303593168458033, + "grad_norm": 1.171875, + "learning_rate": 0.0001471420978264112, + "loss": 1.0149, + "step": 11936 + }, + { + "epoch": 0.8304288844829385, + "grad_norm": 1.0390625, + "learning_rate": 0.00014702446341330355, + "loss": 0.6521, + "step": 11937 + }, + { + "epoch": 0.8304984521200738, + "grad_norm": 0.875, + "learning_rate": 0.00014690687230999434, + "loss": 0.6956, + "step": 11938 + }, + { + "epoch": 0.830568019757209, + "grad_norm": 1.0859375, + "learning_rate": 0.00014678932452245397, + "loss": 0.8258, + "step": 11939 + }, + { + "epoch": 0.8306375873943441, + "grad_norm": 1.109375, + "learning_rate": 0.00014667182005665124, + "loss": 0.6902, + "step": 11940 + }, + { + "epoch": 0.8307071550314794, + "grad_norm": 1.3046875, + "learning_rate": 0.00014655435891855261, + "loss": 0.9419, + "step": 11941 + }, + { + "epoch": 0.8307767226686146, + "grad_norm": 1.0546875, + "learning_rate": 0.00014643694111412175, + "loss": 0.5936, + "step": 11942 + }, + { + "epoch": 0.8308462903057497, + "grad_norm": 1.2890625, + "learning_rate": 0.000146319566649321, + "loss": 1.076, + "step": 11943 + }, + { + "epoch": 0.8309158579428849, + "grad_norm": 1.1484375, + "learning_rate": 0.00014620223553010947, + "loss": 0.9286, + "step": 11944 + }, + { + "epoch": 0.8309854255800202, + "grad_norm": 1.140625, + "learning_rate": 0.00014608494776244529, + "loss": 0.7935, + "step": 11945 + }, + { + "epoch": 0.8310549932171554, + "grad_norm": 1.0859375, + "learning_rate": 0.00014596770335228315, + "loss": 0.9173, + "step": 11946 + }, + { + "epoch": 0.8311245608542905, + "grad_norm": 1.1328125, + "learning_rate": 0.0001458505023055765, + "loss": 0.8054, + "step": 11947 + }, + { + "epoch": 0.8311941284914258, + "grad_norm": 1.1796875, + "learning_rate": 0.00014573334462827624, + "loss": 0.8516, + "step": 11948 + }, + { + "epoch": 0.831263696128561, + "grad_norm": 1.1796875, + "learning_rate": 0.00014561623032633065, + "loss": 1.006, + "step": 11949 + }, + { + "epoch": 0.8313332637656962, + "grad_norm": 0.8828125, + "learning_rate": 0.00014549915940568648, + "loss": 0.5378, + "step": 11950 + }, + { + "epoch": 0.8314028314028314, + "grad_norm": 1.1640625, + "learning_rate": 0.0001453821318722882, + "loss": 0.8296, + "step": 11951 + }, + { + "epoch": 0.8314723990399666, + "grad_norm": 1.1640625, + "learning_rate": 0.00014526514773207776, + "loss": 0.8133, + "step": 11952 + }, + { + "epoch": 0.8315419666771018, + "grad_norm": 0.9296875, + "learning_rate": 0.00014514820699099463, + "loss": 0.7953, + "step": 11953 + }, + { + "epoch": 0.8316115343142371, + "grad_norm": 1.171875, + "learning_rate": 0.0001450313096549768, + "loss": 0.8305, + "step": 11954 + }, + { + "epoch": 0.8316811019513722, + "grad_norm": 1.25, + "learning_rate": 0.00014491445572995988, + "loss": 0.968, + "step": 11955 + }, + { + "epoch": 0.8317506695885074, + "grad_norm": 1.0, + "learning_rate": 0.00014479764522187677, + "loss": 0.6758, + "step": 11956 + }, + { + "epoch": 0.8318202372256426, + "grad_norm": 1.3125, + "learning_rate": 0.00014468087813665888, + "loss": 0.7468, + "step": 11957 + }, + { + "epoch": 0.8318898048627779, + "grad_norm": 1.1953125, + "learning_rate": 0.00014456415448023464, + "loss": 0.9035, + "step": 11958 + }, + { + "epoch": 0.831959372499913, + "grad_norm": 1.09375, + "learning_rate": 0.00014444747425853123, + "loss": 0.6326, + "step": 11959 + }, + { + "epoch": 0.8320289401370482, + "grad_norm": 0.69921875, + "learning_rate": 0.00014433083747747243, + "loss": 0.4847, + "step": 11960 + }, + { + "epoch": 0.8320985077741835, + "grad_norm": 1.15625, + "learning_rate": 0.00014421424414298113, + "loss": 0.8069, + "step": 11961 + }, + { + "epoch": 0.8321680754113187, + "grad_norm": 1.078125, + "learning_rate": 0.00014409769426097695, + "loss": 0.7904, + "step": 11962 + }, + { + "epoch": 0.8322376430484538, + "grad_norm": 1.125, + "learning_rate": 0.00014398118783737746, + "loss": 0.8436, + "step": 11963 + }, + { + "epoch": 0.8323072106855891, + "grad_norm": 1.1953125, + "learning_rate": 0.00014386472487809898, + "loss": 0.6022, + "step": 11964 + }, + { + "epoch": 0.8323767783227243, + "grad_norm": 1.2109375, + "learning_rate": 0.00014374830538905448, + "loss": 0.9282, + "step": 11965 + }, + { + "epoch": 0.8324463459598594, + "grad_norm": 1.453125, + "learning_rate": 0.0001436319293761552, + "loss": 0.7943, + "step": 11966 + }, + { + "epoch": 0.8325159135969947, + "grad_norm": 0.8515625, + "learning_rate": 0.00014351559684531, + "loss": 0.5992, + "step": 11967 + }, + { + "epoch": 0.8325854812341299, + "grad_norm": 1.1484375, + "learning_rate": 0.00014339930780242572, + "loss": 0.7814, + "step": 11968 + }, + { + "epoch": 0.8326550488712651, + "grad_norm": 1.03125, + "learning_rate": 0.00014328306225340725, + "loss": 0.823, + "step": 11969 + }, + { + "epoch": 0.8327246165084002, + "grad_norm": 1.1796875, + "learning_rate": 0.00014316686020415649, + "loss": 0.8234, + "step": 11970 + }, + { + "epoch": 0.8327941841455355, + "grad_norm": 1.0, + "learning_rate": 0.0001430507016605741, + "loss": 0.554, + "step": 11971 + }, + { + "epoch": 0.8328637517826707, + "grad_norm": 0.88671875, + "learning_rate": 0.00014293458662855741, + "loss": 0.6621, + "step": 11972 + }, + { + "epoch": 0.8329333194198059, + "grad_norm": 0.97265625, + "learning_rate": 0.0001428185151140028, + "loss": 0.6714, + "step": 11973 + }, + { + "epoch": 0.8330028870569411, + "grad_norm": 0.84765625, + "learning_rate": 0.0001427024871228031, + "loss": 0.5449, + "step": 11974 + }, + { + "epoch": 0.8330724546940763, + "grad_norm": 1.1953125, + "learning_rate": 0.00014258650266085038, + "loss": 1.0034, + "step": 11975 + }, + { + "epoch": 0.8331420223312115, + "grad_norm": 1.203125, + "learning_rate": 0.00014247056173403305, + "loss": 0.8965, + "step": 11976 + }, + { + "epoch": 0.8332115899683468, + "grad_norm": 1.171875, + "learning_rate": 0.0001423546643482384, + "loss": 0.8702, + "step": 11977 + }, + { + "epoch": 0.8332811576054819, + "grad_norm": 1.1015625, + "learning_rate": 0.00014223881050935117, + "loss": 0.7934, + "step": 11978 + }, + { + "epoch": 0.8333507252426171, + "grad_norm": 1.0859375, + "learning_rate": 0.00014212300022325376, + "loss": 0.9499, + "step": 11979 + }, + { + "epoch": 0.8334202928797524, + "grad_norm": 1.1484375, + "learning_rate": 0.00014200723349582644, + "loss": 1.0192, + "step": 11980 + }, + { + "epoch": 0.8334898605168876, + "grad_norm": 1.0390625, + "learning_rate": 0.00014189151033294688, + "loss": 0.7489, + "step": 11981 + }, + { + "epoch": 0.8335594281540227, + "grad_norm": 0.84765625, + "learning_rate": 0.00014177583074049128, + "loss": 0.6674, + "step": 11982 + }, + { + "epoch": 0.8336289957911579, + "grad_norm": 1.0546875, + "learning_rate": 0.00014166019472433344, + "loss": 0.7599, + "step": 11983 + }, + { + "epoch": 0.8336985634282932, + "grad_norm": 1.1640625, + "learning_rate": 0.00014154460229034427, + "loss": 0.7389, + "step": 11984 + }, + { + "epoch": 0.8337681310654284, + "grad_norm": 1.4296875, + "learning_rate": 0.0001414290534443936, + "loss": 0.8186, + "step": 11985 + }, + { + "epoch": 0.8338376987025635, + "grad_norm": 0.88671875, + "learning_rate": 0.00014131354819234775, + "loss": 0.7043, + "step": 11986 + }, + { + "epoch": 0.8339072663396988, + "grad_norm": 1.1015625, + "learning_rate": 0.00014119808654007216, + "loss": 0.965, + "step": 11987 + }, + { + "epoch": 0.833976833976834, + "grad_norm": 1.0703125, + "learning_rate": 0.00014108266849342877, + "loss": 0.7556, + "step": 11988 + }, + { + "epoch": 0.8340464016139691, + "grad_norm": 1.2734375, + "learning_rate": 0.00014096729405827847, + "loss": 0.8747, + "step": 11989 + }, + { + "epoch": 0.8341159692511044, + "grad_norm": 1.234375, + "learning_rate": 0.00014085196324047878, + "loss": 0.7994, + "step": 11990 + }, + { + "epoch": 0.8341855368882396, + "grad_norm": 1.0625, + "learning_rate": 0.00014073667604588635, + "loss": 0.645, + "step": 11991 + }, + { + "epoch": 0.8342551045253748, + "grad_norm": 1.1171875, + "learning_rate": 0.0001406214324803542, + "loss": 0.8727, + "step": 11992 + }, + { + "epoch": 0.83432467216251, + "grad_norm": 0.96484375, + "learning_rate": 0.0001405062325497344, + "loss": 0.6552, + "step": 11993 + }, + { + "epoch": 0.8343942397996452, + "grad_norm": 0.8671875, + "learning_rate": 0.0001403910762598758, + "loss": 0.5523, + "step": 11994 + }, + { + "epoch": 0.8344638074367804, + "grad_norm": 0.99609375, + "learning_rate": 0.0001402759636166253, + "loss": 0.7422, + "step": 11995 + }, + { + "epoch": 0.8345333750739156, + "grad_norm": 1.0546875, + "learning_rate": 0.00014016089462582837, + "loss": 0.9296, + "step": 11996 + }, + { + "epoch": 0.8346029427110508, + "grad_norm": 1.2890625, + "learning_rate": 0.00014004586929332742, + "loss": 0.887, + "step": 11997 + }, + { + "epoch": 0.834672510348186, + "grad_norm": 1.0703125, + "learning_rate": 0.00013993088762496265, + "loss": 0.8131, + "step": 11998 + }, + { + "epoch": 0.8347420779853212, + "grad_norm": 0.78515625, + "learning_rate": 0.00013981594962657218, + "loss": 0.4077, + "step": 11999 + }, + { + "epoch": 0.8348116456224565, + "grad_norm": 1.4453125, + "learning_rate": 0.00013970105530399212, + "loss": 1.0036, + "step": 12000 + }, + { + "epoch": 0.8348812132595916, + "grad_norm": 1.140625, + "learning_rate": 0.0001395862046630564, + "loss": 0.7612, + "step": 12001 + }, + { + "epoch": 0.8349507808967268, + "grad_norm": 1.203125, + "learning_rate": 0.00013947139770959627, + "loss": 0.5829, + "step": 12002 + }, + { + "epoch": 0.8350203485338621, + "grad_norm": 1.1640625, + "learning_rate": 0.00013935663444944135, + "loss": 0.8279, + "step": 12003 + }, + { + "epoch": 0.8350899161709973, + "grad_norm": 1.203125, + "learning_rate": 0.0001392419148884183, + "loss": 0.9097, + "step": 12004 + }, + { + "epoch": 0.8351594838081324, + "grad_norm": 1.15625, + "learning_rate": 0.00013912723903235257, + "loss": 0.65, + "step": 12005 + }, + { + "epoch": 0.8352290514452677, + "grad_norm": 1.0859375, + "learning_rate": 0.0001390126068870663, + "loss": 0.7578, + "step": 12006 + }, + { + "epoch": 0.8352986190824029, + "grad_norm": 0.97265625, + "learning_rate": 0.00013889801845838034, + "loss": 0.7598, + "step": 12007 + }, + { + "epoch": 0.835368186719538, + "grad_norm": 1.25, + "learning_rate": 0.00013878347375211253, + "loss": 0.863, + "step": 12008 + }, + { + "epoch": 0.8354377543566732, + "grad_norm": 0.7890625, + "learning_rate": 0.00013866897277407908, + "loss": 0.5281, + "step": 12009 + }, + { + "epoch": 0.8355073219938085, + "grad_norm": 1.4296875, + "learning_rate": 0.00013855451553009392, + "loss": 1.1497, + "step": 12010 + }, + { + "epoch": 0.8355768896309437, + "grad_norm": 0.98828125, + "learning_rate": 0.00013844010202596847, + "loss": 0.7283, + "step": 12011 + }, + { + "epoch": 0.8356464572680788, + "grad_norm": 0.9921875, + "learning_rate": 0.0001383257322675121, + "loss": 0.8684, + "step": 12012 + }, + { + "epoch": 0.8357160249052141, + "grad_norm": 1.46875, + "learning_rate": 0.00013821140626053163, + "loss": 0.9184, + "step": 12013 + }, + { + "epoch": 0.8357855925423493, + "grad_norm": 1.03125, + "learning_rate": 0.00013809712401083229, + "loss": 0.9563, + "step": 12014 + }, + { + "epoch": 0.8358551601794845, + "grad_norm": 1.2734375, + "learning_rate": 0.0001379828855242168, + "loss": 1.0051, + "step": 12015 + }, + { + "epoch": 0.8359247278166198, + "grad_norm": 0.80859375, + "learning_rate": 0.00013786869080648534, + "loss": 0.5833, + "step": 12016 + }, + { + "epoch": 0.8359942954537549, + "grad_norm": 1.09375, + "learning_rate": 0.00013775453986343645, + "loss": 0.7164, + "step": 12017 + }, + { + "epoch": 0.8360638630908901, + "grad_norm": 1.0625, + "learning_rate": 0.0001376404327008659, + "loss": 0.8782, + "step": 12018 + }, + { + "epoch": 0.8361334307280254, + "grad_norm": 0.8671875, + "learning_rate": 0.00013752636932456763, + "loss": 0.6167, + "step": 12019 + }, + { + "epoch": 0.8362029983651605, + "grad_norm": 1.1484375, + "learning_rate": 0.000137412349740333, + "loss": 0.9512, + "step": 12020 + }, + { + "epoch": 0.8362725660022957, + "grad_norm": 0.94140625, + "learning_rate": 0.00013729837395395173, + "loss": 0.8319, + "step": 12021 + }, + { + "epoch": 0.8363421336394309, + "grad_norm": 1.0390625, + "learning_rate": 0.00013718444197121038, + "loss": 0.8411, + "step": 12022 + }, + { + "epoch": 0.8364117012765662, + "grad_norm": 1.234375, + "learning_rate": 0.00013707055379789425, + "loss": 1.0388, + "step": 12023 + }, + { + "epoch": 0.8364812689137013, + "grad_norm": 1.140625, + "learning_rate": 0.000136956709439786, + "loss": 0.7488, + "step": 12024 + }, + { + "epoch": 0.8365508365508365, + "grad_norm": 1.0234375, + "learning_rate": 0.00013684290890266605, + "loss": 0.9733, + "step": 12025 + }, + { + "epoch": 0.8366204041879718, + "grad_norm": 1.140625, + "learning_rate": 0.00013672915219231264, + "loss": 0.8369, + "step": 12026 + }, + { + "epoch": 0.836689971825107, + "grad_norm": 1.0859375, + "learning_rate": 0.00013661543931450115, + "loss": 0.8344, + "step": 12027 + }, + { + "epoch": 0.8367595394622421, + "grad_norm": 0.97265625, + "learning_rate": 0.00013650177027500632, + "loss": 0.9055, + "step": 12028 + }, + { + "epoch": 0.8368291070993774, + "grad_norm": 1.0625, + "learning_rate": 0.0001363881450795993, + "loss": 0.6351, + "step": 12029 + }, + { + "epoch": 0.8368986747365126, + "grad_norm": 1.1484375, + "learning_rate": 0.000136274563734049, + "loss": 0.8663, + "step": 12030 + }, + { + "epoch": 0.8369682423736478, + "grad_norm": 0.9375, + "learning_rate": 0.00013616102624412318, + "loss": 0.6302, + "step": 12031 + }, + { + "epoch": 0.837037810010783, + "grad_norm": 1.5, + "learning_rate": 0.0001360475326155861, + "loss": 0.8401, + "step": 12032 + }, + { + "epoch": 0.8371073776479182, + "grad_norm": 0.9140625, + "learning_rate": 0.00013593408285420095, + "loss": 0.7076, + "step": 12033 + }, + { + "epoch": 0.8371769452850534, + "grad_norm": 1.328125, + "learning_rate": 0.00013582067696572752, + "loss": 1.0298, + "step": 12034 + }, + { + "epoch": 0.8372465129221885, + "grad_norm": 0.96875, + "learning_rate": 0.0001357073149559247, + "loss": 0.6113, + "step": 12035 + }, + { + "epoch": 0.8373160805593238, + "grad_norm": 1.03125, + "learning_rate": 0.00013559399683054773, + "loss": 0.8717, + "step": 12036 + }, + { + "epoch": 0.837385648196459, + "grad_norm": 1.125, + "learning_rate": 0.0001354807225953507, + "loss": 0.7921, + "step": 12037 + }, + { + "epoch": 0.8374552158335942, + "grad_norm": 1.1875, + "learning_rate": 0.00013536749225608535, + "loss": 0.7053, + "step": 12038 + }, + { + "epoch": 0.8375247834707295, + "grad_norm": 0.98828125, + "learning_rate": 0.0001352543058185006, + "loss": 0.8521, + "step": 12039 + }, + { + "epoch": 0.8375943511078646, + "grad_norm": 1.359375, + "learning_rate": 0.00013514116328834348, + "loss": 0.7449, + "step": 12040 + }, + { + "epoch": 0.8376639187449998, + "grad_norm": 1.1953125, + "learning_rate": 0.00013502806467135874, + "loss": 0.9089, + "step": 12041 + }, + { + "epoch": 0.8377334863821351, + "grad_norm": 1.109375, + "learning_rate": 0.0001349150099732893, + "loss": 0.6579, + "step": 12042 + }, + { + "epoch": 0.8378030540192702, + "grad_norm": 1.4921875, + "learning_rate": 0.00013480199919987536, + "loss": 0.9044, + "step": 12043 + }, + { + "epoch": 0.8378726216564054, + "grad_norm": 1.2109375, + "learning_rate": 0.0001346890323568548, + "loss": 0.7812, + "step": 12044 + }, + { + "epoch": 0.8379421892935407, + "grad_norm": 1.515625, + "learning_rate": 0.0001345761094499638, + "loss": 0.8827, + "step": 12045 + }, + { + "epoch": 0.8380117569306759, + "grad_norm": 1.0234375, + "learning_rate": 0.0001344632304849358, + "loss": 1.0183, + "step": 12046 + }, + { + "epoch": 0.838081324567811, + "grad_norm": 1.1484375, + "learning_rate": 0.0001343503954675025, + "loss": 0.9414, + "step": 12047 + }, + { + "epoch": 0.8381508922049462, + "grad_norm": 1.1640625, + "learning_rate": 0.00013423760440339262, + "loss": 0.6596, + "step": 12048 + }, + { + "epoch": 0.8382204598420815, + "grad_norm": 1.03125, + "learning_rate": 0.00013412485729833367, + "loss": 0.6656, + "step": 12049 + }, + { + "epoch": 0.8382900274792167, + "grad_norm": 1.0703125, + "learning_rate": 0.00013401215415805002, + "loss": 0.8514, + "step": 12050 + }, + { + "epoch": 0.8383595951163518, + "grad_norm": 1.2265625, + "learning_rate": 0.00013389949498826415, + "loss": 0.8535, + "step": 12051 + }, + { + "epoch": 0.8384291627534871, + "grad_norm": 1.1171875, + "learning_rate": 0.00013378687979469684, + "loss": 0.5699, + "step": 12052 + }, + { + "epoch": 0.8384987303906223, + "grad_norm": 1.4765625, + "learning_rate": 0.00013367430858306562, + "loss": 0.9435, + "step": 12053 + }, + { + "epoch": 0.8385682980277575, + "grad_norm": 0.9140625, + "learning_rate": 0.00013356178135908613, + "loss": 0.5726, + "step": 12054 + }, + { + "epoch": 0.8386378656648927, + "grad_norm": 1.078125, + "learning_rate": 0.0001334492981284723, + "loss": 0.7919, + "step": 12055 + }, + { + "epoch": 0.8387074333020279, + "grad_norm": 1.015625, + "learning_rate": 0.00013333685889693557, + "loss": 0.5164, + "step": 12056 + }, + { + "epoch": 0.8387770009391631, + "grad_norm": 1.078125, + "learning_rate": 0.0001332244636701848, + "loss": 0.5733, + "step": 12057 + }, + { + "epoch": 0.8388465685762984, + "grad_norm": 1.015625, + "learning_rate": 0.00013311211245392674, + "loss": 0.6505, + "step": 12058 + }, + { + "epoch": 0.8389161362134335, + "grad_norm": 1.5, + "learning_rate": 0.00013299980525386613, + "loss": 0.8734, + "step": 12059 + }, + { + "epoch": 0.8389857038505687, + "grad_norm": 1.2734375, + "learning_rate": 0.00013288754207570563, + "loss": 0.9035, + "step": 12060 + }, + { + "epoch": 0.8390552714877039, + "grad_norm": 0.93359375, + "learning_rate": 0.00013277532292514527, + "loss": 0.685, + "step": 12061 + }, + { + "epoch": 0.8391248391248392, + "grad_norm": 1.1484375, + "learning_rate": 0.00013266314780788246, + "loss": 0.7693, + "step": 12062 + }, + { + "epoch": 0.8391944067619743, + "grad_norm": 1.28125, + "learning_rate": 0.00013255101672961366, + "loss": 0.8275, + "step": 12063 + }, + { + "epoch": 0.8392639743991095, + "grad_norm": 1.5, + "learning_rate": 0.00013243892969603177, + "loss": 0.8141, + "step": 12064 + }, + { + "epoch": 0.8393335420362448, + "grad_norm": 1.171875, + "learning_rate": 0.00013232688671282832, + "loss": 0.8422, + "step": 12065 + }, + { + "epoch": 0.8394031096733799, + "grad_norm": 1.171875, + "learning_rate": 0.000132214887785692, + "loss": 0.7412, + "step": 12066 + }, + { + "epoch": 0.8394726773105151, + "grad_norm": 1.0859375, + "learning_rate": 0.00013210293292030995, + "loss": 0.9879, + "step": 12067 + }, + { + "epoch": 0.8395422449476504, + "grad_norm": 1.1953125, + "learning_rate": 0.00013199102212236614, + "loss": 0.6968, + "step": 12068 + }, + { + "epoch": 0.8396118125847856, + "grad_norm": 1.0234375, + "learning_rate": 0.00013187915539754325, + "loss": 0.6176, + "step": 12069 + }, + { + "epoch": 0.8396813802219207, + "grad_norm": 1.171875, + "learning_rate": 0.0001317673327515213, + "loss": 0.7005, + "step": 12070 + }, + { + "epoch": 0.839750947859056, + "grad_norm": 1.28125, + "learning_rate": 0.0001316555541899781, + "loss": 0.9586, + "step": 12071 + }, + { + "epoch": 0.8398205154961912, + "grad_norm": 1.234375, + "learning_rate": 0.00013154381971858898, + "loss": 0.8076, + "step": 12072 + }, + { + "epoch": 0.8398900831333264, + "grad_norm": 1.3359375, + "learning_rate": 0.00013143212934302694, + "loss": 0.8819, + "step": 12073 + }, + { + "epoch": 0.8399596507704615, + "grad_norm": 1.1015625, + "learning_rate": 0.00013132048306896394, + "loss": 0.6674, + "step": 12074 + }, + { + "epoch": 0.8400292184075968, + "grad_norm": 1.125, + "learning_rate": 0.00013120888090206828, + "loss": 0.8355, + "step": 12075 + }, + { + "epoch": 0.840098786044732, + "grad_norm": 1.1328125, + "learning_rate": 0.00013109732284800646, + "loss": 0.712, + "step": 12076 + }, + { + "epoch": 0.8401683536818672, + "grad_norm": 1.2734375, + "learning_rate": 0.00013098580891244315, + "loss": 0.6425, + "step": 12077 + }, + { + "epoch": 0.8402379213190024, + "grad_norm": 0.89453125, + "learning_rate": 0.00013087433910104006, + "loss": 0.6536, + "step": 12078 + }, + { + "epoch": 0.8403074889561376, + "grad_norm": 1.0390625, + "learning_rate": 0.00013076291341945756, + "loss": 0.7956, + "step": 12079 + }, + { + "epoch": 0.8403770565932728, + "grad_norm": 1.3671875, + "learning_rate": 0.0001306515318733529, + "loss": 0.9083, + "step": 12080 + }, + { + "epoch": 0.8404466242304081, + "grad_norm": 1.234375, + "learning_rate": 0.00013054019446838173, + "loss": 1.0065, + "step": 12081 + }, + { + "epoch": 0.8405161918675432, + "grad_norm": 1.25, + "learning_rate": 0.00013042890121019691, + "loss": 1.097, + "step": 12082 + }, + { + "epoch": 0.8405857595046784, + "grad_norm": 1.3828125, + "learning_rate": 0.00013031765210444956, + "loss": 0.8163, + "step": 12083 + }, + { + "epoch": 0.8406553271418137, + "grad_norm": 1.09375, + "learning_rate": 0.00013020644715678855, + "loss": 0.7708, + "step": 12084 + }, + { + "epoch": 0.8407248947789489, + "grad_norm": 1.171875, + "learning_rate": 0.00013009528637285994, + "loss": 0.6088, + "step": 12085 + }, + { + "epoch": 0.840794462416084, + "grad_norm": 1.03125, + "learning_rate": 0.00012998416975830795, + "loss": 0.8125, + "step": 12086 + }, + { + "epoch": 0.8408640300532192, + "grad_norm": 1.0078125, + "learning_rate": 0.0001298730973187745, + "loss": 0.7017, + "step": 12087 + }, + { + "epoch": 0.8409335976903545, + "grad_norm": 1.109375, + "learning_rate": 0.00012976206905989973, + "loss": 0.7749, + "step": 12088 + }, + { + "epoch": 0.8410031653274896, + "grad_norm": 1.5234375, + "learning_rate": 0.0001296510849873207, + "loss": 1.0097, + "step": 12089 + }, + { + "epoch": 0.8410727329646248, + "grad_norm": 1.09375, + "learning_rate": 0.00012954014510667246, + "loss": 0.7875, + "step": 12090 + }, + { + "epoch": 0.8411423006017601, + "grad_norm": 1.0703125, + "learning_rate": 0.00012942924942358825, + "loss": 0.811, + "step": 12091 + }, + { + "epoch": 0.8412118682388953, + "grad_norm": 1.15625, + "learning_rate": 0.00012931839794369892, + "loss": 0.7561, + "step": 12092 + }, + { + "epoch": 0.8412814358760304, + "grad_norm": 1.109375, + "learning_rate": 0.00012920759067263287, + "loss": 0.7486, + "step": 12093 + }, + { + "epoch": 0.8413510035131657, + "grad_norm": 1.4375, + "learning_rate": 0.00012909682761601604, + "loss": 0.9831, + "step": 12094 + }, + { + "epoch": 0.8414205711503009, + "grad_norm": 1.1328125, + "learning_rate": 0.0001289861087794727, + "loss": 0.9107, + "step": 12095 + }, + { + "epoch": 0.8414901387874361, + "grad_norm": 1.1875, + "learning_rate": 0.00012887543416862445, + "loss": 0.8535, + "step": 12096 + }, + { + "epoch": 0.8415597064245713, + "grad_norm": 1.03125, + "learning_rate": 0.00012876480378909083, + "loss": 0.8429, + "step": 12097 + }, + { + "epoch": 0.8416292740617065, + "grad_norm": 1.078125, + "learning_rate": 0.0001286542176464892, + "loss": 0.56, + "step": 12098 + }, + { + "epoch": 0.8416988416988417, + "grad_norm": 0.98046875, + "learning_rate": 0.00012854367574643467, + "loss": 0.8433, + "step": 12099 + }, + { + "epoch": 0.8417684093359769, + "grad_norm": 1.0, + "learning_rate": 0.00012843317809453959, + "loss": 0.958, + "step": 12100 + }, + { + "epoch": 0.8418379769731121, + "grad_norm": 0.9921875, + "learning_rate": 0.00012832272469641458, + "loss": 0.8084, + "step": 12101 + }, + { + "epoch": 0.8419075446102473, + "grad_norm": 0.94921875, + "learning_rate": 0.00012821231555766832, + "loss": 0.6842, + "step": 12102 + }, + { + "epoch": 0.8419771122473825, + "grad_norm": 1.546875, + "learning_rate": 0.0001281019506839065, + "loss": 1.1108, + "step": 12103 + }, + { + "epoch": 0.8420466798845178, + "grad_norm": 1.3515625, + "learning_rate": 0.00012799163008073278, + "loss": 0.794, + "step": 12104 + }, + { + "epoch": 0.8421162475216529, + "grad_norm": 1.0703125, + "learning_rate": 0.0001278813537537489, + "loss": 0.732, + "step": 12105 + }, + { + "epoch": 0.8421858151587881, + "grad_norm": 0.87890625, + "learning_rate": 0.0001277711217085541, + "loss": 0.544, + "step": 12106 + }, + { + "epoch": 0.8422553827959234, + "grad_norm": 1.015625, + "learning_rate": 0.00012766093395074552, + "loss": 0.7447, + "step": 12107 + }, + { + "epoch": 0.8423249504330586, + "grad_norm": 1.015625, + "learning_rate": 0.00012755079048591756, + "loss": 0.7633, + "step": 12108 + }, + { + "epoch": 0.8423945180701937, + "grad_norm": 1.15625, + "learning_rate": 0.00012744069131966318, + "loss": 0.934, + "step": 12109 + }, + { + "epoch": 0.842464085707329, + "grad_norm": 1.2734375, + "learning_rate": 0.00012733063645757226, + "loss": 0.8014, + "step": 12110 + }, + { + "epoch": 0.8425336533444642, + "grad_norm": 1.203125, + "learning_rate": 0.000127220625905233, + "loss": 0.7666, + "step": 12111 + }, + { + "epoch": 0.8426032209815993, + "grad_norm": 1.203125, + "learning_rate": 0.00012711065966823155, + "loss": 1.0765, + "step": 12112 + }, + { + "epoch": 0.8426727886187345, + "grad_norm": 1.28125, + "learning_rate": 0.00012700073775215093, + "loss": 0.6896, + "step": 12113 + }, + { + "epoch": 0.8427423562558698, + "grad_norm": 1.046875, + "learning_rate": 0.00012689086016257257, + "loss": 0.8598, + "step": 12114 + }, + { + "epoch": 0.842811923893005, + "grad_norm": 1.453125, + "learning_rate": 0.00012678102690507544, + "loss": 0.9347, + "step": 12115 + }, + { + "epoch": 0.8428814915301401, + "grad_norm": 0.97265625, + "learning_rate": 0.0001266712379852367, + "loss": 0.8, + "step": 12116 + }, + { + "epoch": 0.8429510591672754, + "grad_norm": 1.40625, + "learning_rate": 0.00012656149340863055, + "loss": 0.9972, + "step": 12117 + }, + { + "epoch": 0.8430206268044106, + "grad_norm": 0.98046875, + "learning_rate": 0.00012645179318082912, + "loss": 0.768, + "step": 12118 + }, + { + "epoch": 0.8430901944415458, + "grad_norm": 1.203125, + "learning_rate": 0.00012634213730740253, + "loss": 0.8108, + "step": 12119 + }, + { + "epoch": 0.843159762078681, + "grad_norm": 1.171875, + "learning_rate": 0.00012623252579391898, + "loss": 0.8261, + "step": 12120 + }, + { + "epoch": 0.8432293297158162, + "grad_norm": 1.1171875, + "learning_rate": 0.00012612295864594358, + "loss": 0.6399, + "step": 12121 + }, + { + "epoch": 0.8432988973529514, + "grad_norm": 1.109375, + "learning_rate": 0.00012601343586903947, + "loss": 0.9218, + "step": 12122 + }, + { + "epoch": 0.8433684649900867, + "grad_norm": 1.1328125, + "learning_rate": 0.00012590395746876802, + "loss": 0.6682, + "step": 12123 + }, + { + "epoch": 0.8434380326272218, + "grad_norm": 1.1953125, + "learning_rate": 0.00012579452345068775, + "loss": 0.7739, + "step": 12124 + }, + { + "epoch": 0.843507600264357, + "grad_norm": 1.40625, + "learning_rate": 0.0001256851338203552, + "loss": 0.966, + "step": 12125 + }, + { + "epoch": 0.8435771679014922, + "grad_norm": 1.140625, + "learning_rate": 0.00012557578858332486, + "loss": 0.6529, + "step": 12126 + }, + { + "epoch": 0.8436467355386275, + "grad_norm": 1.046875, + "learning_rate": 0.00012546648774514868, + "loss": 0.7381, + "step": 12127 + }, + { + "epoch": 0.8437163031757626, + "grad_norm": 1.2890625, + "learning_rate": 0.00012535723131137588, + "loss": 0.7895, + "step": 12128 + }, + { + "epoch": 0.8437858708128978, + "grad_norm": 1.3671875, + "learning_rate": 0.00012524801928755447, + "loss": 0.9043, + "step": 12129 + }, + { + "epoch": 0.8438554384500331, + "grad_norm": 1.140625, + "learning_rate": 0.00012513885167922978, + "loss": 0.8401, + "step": 12130 + }, + { + "epoch": 0.8439250060871683, + "grad_norm": 1.046875, + "learning_rate": 0.0001250297284919445, + "loss": 0.6916, + "step": 12131 + }, + { + "epoch": 0.8439945737243034, + "grad_norm": 0.9296875, + "learning_rate": 0.0001249206497312393, + "loss": 0.808, + "step": 12132 + }, + { + "epoch": 0.8440641413614387, + "grad_norm": 1.1796875, + "learning_rate": 0.00012481161540265273, + "loss": 0.8094, + "step": 12133 + }, + { + "epoch": 0.8441337089985739, + "grad_norm": 0.89453125, + "learning_rate": 0.0001247026255117213, + "loss": 0.8593, + "step": 12134 + }, + { + "epoch": 0.844203276635709, + "grad_norm": 0.97265625, + "learning_rate": 0.00012459368006397865, + "loss": 0.4995, + "step": 12135 + }, + { + "epoch": 0.8442728442728443, + "grad_norm": 1.28125, + "learning_rate": 0.0001244847790649565, + "loss": 0.8877, + "step": 12136 + }, + { + "epoch": 0.8443424119099795, + "grad_norm": 1.0546875, + "learning_rate": 0.00012437592252018416, + "loss": 0.8176, + "step": 12137 + }, + { + "epoch": 0.8444119795471147, + "grad_norm": 1.0859375, + "learning_rate": 0.00012426711043518924, + "loss": 0.6735, + "step": 12138 + }, + { + "epoch": 0.8444815471842498, + "grad_norm": 1.15625, + "learning_rate": 0.0001241583428154963, + "loss": 0.8597, + "step": 12139 + }, + { + "epoch": 0.8445511148213851, + "grad_norm": 0.87109375, + "learning_rate": 0.0001240496196666283, + "loss": 0.8347, + "step": 12140 + }, + { + "epoch": 0.8446206824585203, + "grad_norm": 1.0, + "learning_rate": 0.0001239409409941056, + "loss": 0.7573, + "step": 12141 + }, + { + "epoch": 0.8446902500956555, + "grad_norm": 1.3984375, + "learning_rate": 0.00012383230680344592, + "loss": 0.886, + "step": 12142 + }, + { + "epoch": 0.8447598177327907, + "grad_norm": 1.359375, + "learning_rate": 0.0001237237171001655, + "loss": 0.9381, + "step": 12143 + }, + { + "epoch": 0.8448293853699259, + "grad_norm": 1.4765625, + "learning_rate": 0.00012361517188977822, + "loss": 0.957, + "step": 12144 + }, + { + "epoch": 0.8448989530070611, + "grad_norm": 1.0078125, + "learning_rate": 0.00012350667117779512, + "loss": 0.8213, + "step": 12145 + }, + { + "epoch": 0.8449685206441964, + "grad_norm": 1.609375, + "learning_rate": 0.00012339821496972536, + "loss": 0.7806, + "step": 12146 + }, + { + "epoch": 0.8450380882813315, + "grad_norm": 1.203125, + "learning_rate": 0.00012328980327107575, + "loss": 0.972, + "step": 12147 + }, + { + "epoch": 0.8451076559184667, + "grad_norm": 0.8984375, + "learning_rate": 0.0001231814360873511, + "loss": 0.7905, + "step": 12148 + }, + { + "epoch": 0.845177223555602, + "grad_norm": 0.80078125, + "learning_rate": 0.0001230731134240538, + "loss": 0.7074, + "step": 12149 + }, + { + "epoch": 0.8452467911927372, + "grad_norm": 1.28125, + "learning_rate": 0.00012296483528668345, + "loss": 0.8852, + "step": 12150 + }, + { + "epoch": 0.8453163588298723, + "grad_norm": 1.1953125, + "learning_rate": 0.0001228566016807382, + "loss": 0.7658, + "step": 12151 + }, + { + "epoch": 0.8453859264670075, + "grad_norm": 0.91015625, + "learning_rate": 0.00012274841261171376, + "loss": 0.588, + "step": 12152 + }, + { + "epoch": 0.8454554941041428, + "grad_norm": 1.2890625, + "learning_rate": 0.0001226402680851033, + "loss": 0.9275, + "step": 12153 + }, + { + "epoch": 0.845525061741278, + "grad_norm": 1.171875, + "learning_rate": 0.00012253216810639755, + "loss": 0.8288, + "step": 12154 + }, + { + "epoch": 0.8455946293784131, + "grad_norm": 1.421875, + "learning_rate": 0.00012242411268108578, + "loss": 0.9221, + "step": 12155 + }, + { + "epoch": 0.8456641970155484, + "grad_norm": 1.03125, + "learning_rate": 0.00012231610181465415, + "loss": 0.8036, + "step": 12156 + }, + { + "epoch": 0.8457337646526836, + "grad_norm": 1.125, + "learning_rate": 0.0001222081355125868, + "loss": 0.5976, + "step": 12157 + }, + { + "epoch": 0.8458033322898187, + "grad_norm": 1.1484375, + "learning_rate": 0.00012210021378036628, + "loss": 0.7004, + "step": 12158 + }, + { + "epoch": 0.845872899926954, + "grad_norm": 1.140625, + "learning_rate": 0.00012199233662347198, + "loss": 0.7648, + "step": 12159 + }, + { + "epoch": 0.8459424675640892, + "grad_norm": 1.0859375, + "learning_rate": 0.00012188450404738105, + "loss": 0.7067, + "step": 12160 + }, + { + "epoch": 0.8460120352012244, + "grad_norm": 1.1328125, + "learning_rate": 0.00012177671605756901, + "loss": 0.7604, + "step": 12161 + }, + { + "epoch": 0.8460816028383596, + "grad_norm": 1.1171875, + "learning_rate": 0.00012166897265950894, + "loss": 0.886, + "step": 12162 + }, + { + "epoch": 0.8461511704754948, + "grad_norm": 0.86328125, + "learning_rate": 0.00012156127385867144, + "loss": 0.542, + "step": 12163 + }, + { + "epoch": 0.84622073811263, + "grad_norm": 1.0703125, + "learning_rate": 0.00012145361966052449, + "loss": 0.6837, + "step": 12164 + }, + { + "epoch": 0.8462903057497652, + "grad_norm": 1.15625, + "learning_rate": 0.00012134601007053447, + "loss": 0.7801, + "step": 12165 + }, + { + "epoch": 0.8463598733869004, + "grad_norm": 0.7890625, + "learning_rate": 0.00012123844509416559, + "loss": 0.4521, + "step": 12166 + }, + { + "epoch": 0.8464294410240356, + "grad_norm": 1.4921875, + "learning_rate": 0.00012113092473687914, + "loss": 1.0481, + "step": 12167 + }, + { + "epoch": 0.8464990086611708, + "grad_norm": 1.140625, + "learning_rate": 0.00012102344900413442, + "loss": 1.0554, + "step": 12168 + }, + { + "epoch": 0.8465685762983061, + "grad_norm": 0.9140625, + "learning_rate": 0.00012091601790138851, + "loss": 0.7972, + "step": 12169 + }, + { + "epoch": 0.8466381439354412, + "grad_norm": 1.125, + "learning_rate": 0.00012080863143409648, + "loss": 0.7867, + "step": 12170 + }, + { + "epoch": 0.8467077115725764, + "grad_norm": 1.03125, + "learning_rate": 0.00012070128960771043, + "loss": 0.7568, + "step": 12171 + }, + { + "epoch": 0.8467772792097117, + "grad_norm": 1.140625, + "learning_rate": 0.00012059399242768122, + "loss": 0.6988, + "step": 12172 + }, + { + "epoch": 0.8468468468468469, + "grad_norm": 0.9921875, + "learning_rate": 0.00012048673989945657, + "loss": 0.6091, + "step": 12173 + }, + { + "epoch": 0.846916414483982, + "grad_norm": 1.203125, + "learning_rate": 0.00012037953202848184, + "loss": 0.5821, + "step": 12174 + }, + { + "epoch": 0.8469859821211173, + "grad_norm": 0.9765625, + "learning_rate": 0.00012027236882020099, + "loss": 0.829, + "step": 12175 + }, + { + "epoch": 0.8470555497582525, + "grad_norm": 1.0, + "learning_rate": 0.00012016525028005521, + "loss": 0.5804, + "step": 12176 + }, + { + "epoch": 0.8471251173953877, + "grad_norm": 1.078125, + "learning_rate": 0.00012005817641348337, + "loss": 0.6997, + "step": 12177 + }, + { + "epoch": 0.8471946850325228, + "grad_norm": 1.2578125, + "learning_rate": 0.00011995114722592193, + "loss": 0.8979, + "step": 12178 + }, + { + "epoch": 0.8472642526696581, + "grad_norm": 1.4609375, + "learning_rate": 0.0001198441627228054, + "loss": 1.0183, + "step": 12179 + }, + { + "epoch": 0.8473338203067933, + "grad_norm": 1.2578125, + "learning_rate": 0.00011973722290956613, + "loss": 0.8141, + "step": 12180 + }, + { + "epoch": 0.8474033879439284, + "grad_norm": 1.046875, + "learning_rate": 0.00011963032779163397, + "loss": 0.7746, + "step": 12181 + }, + { + "epoch": 0.8474729555810637, + "grad_norm": 1.1015625, + "learning_rate": 0.00011952347737443603, + "loss": 0.8633, + "step": 12182 + }, + { + "epoch": 0.8475425232181989, + "grad_norm": 0.84375, + "learning_rate": 0.00011941667166339809, + "loss": 0.6313, + "step": 12183 + }, + { + "epoch": 0.8476120908553341, + "grad_norm": 1.3203125, + "learning_rate": 0.00011930991066394315, + "loss": 0.8317, + "step": 12184 + }, + { + "epoch": 0.8476816584924693, + "grad_norm": 0.97265625, + "learning_rate": 0.00011920319438149185, + "loss": 0.7958, + "step": 12185 + }, + { + "epoch": 0.8477512261296045, + "grad_norm": 1.140625, + "learning_rate": 0.00011909652282146299, + "loss": 0.8036, + "step": 12186 + }, + { + "epoch": 0.8478207937667397, + "grad_norm": 1.3125, + "learning_rate": 0.00011898989598927257, + "loss": 0.9245, + "step": 12187 + }, + { + "epoch": 0.847890361403875, + "grad_norm": 1.0, + "learning_rate": 0.00011888331389033447, + "loss": 0.9106, + "step": 12188 + }, + { + "epoch": 0.8479599290410101, + "grad_norm": 0.875, + "learning_rate": 0.00011877677653006058, + "loss": 0.5841, + "step": 12189 + }, + { + "epoch": 0.8480294966781453, + "grad_norm": 0.7890625, + "learning_rate": 0.00011867028391386037, + "loss": 0.4885, + "step": 12190 + }, + { + "epoch": 0.8480990643152805, + "grad_norm": 1.3046875, + "learning_rate": 0.00011856383604714094, + "loss": 1.0532, + "step": 12191 + }, + { + "epoch": 0.8481686319524158, + "grad_norm": 1.0703125, + "learning_rate": 0.00011845743293530697, + "loss": 0.8783, + "step": 12192 + }, + { + "epoch": 0.8482381995895509, + "grad_norm": 1.0625, + "learning_rate": 0.00011835107458376126, + "loss": 0.8023, + "step": 12193 + }, + { + "epoch": 0.8483077672266861, + "grad_norm": 1.4140625, + "learning_rate": 0.00011824476099790426, + "loss": 0.784, + "step": 12194 + }, + { + "epoch": 0.8483773348638214, + "grad_norm": 0.796875, + "learning_rate": 0.000118138492183134, + "loss": 0.7627, + "step": 12195 + }, + { + "epoch": 0.8484469025009566, + "grad_norm": 1.09375, + "learning_rate": 0.00011803226814484602, + "loss": 0.8796, + "step": 12196 + }, + { + "epoch": 0.8485164701380917, + "grad_norm": 1.1484375, + "learning_rate": 0.00011792608888843392, + "loss": 1.01, + "step": 12197 + }, + { + "epoch": 0.848586037775227, + "grad_norm": 0.97265625, + "learning_rate": 0.00011781995441928939, + "loss": 0.7642, + "step": 12198 + }, + { + "epoch": 0.8486556054123622, + "grad_norm": 1.1328125, + "learning_rate": 0.00011771386474280077, + "loss": 0.9497, + "step": 12199 + }, + { + "epoch": 0.8487251730494974, + "grad_norm": 1.3046875, + "learning_rate": 0.0001176078198643552, + "loss": 0.8968, + "step": 12200 + }, + { + "epoch": 0.8487947406866326, + "grad_norm": 1.1171875, + "learning_rate": 0.00011750181978933682, + "loss": 0.8285, + "step": 12201 + }, + { + "epoch": 0.8488643083237678, + "grad_norm": 1.265625, + "learning_rate": 0.00011739586452312812, + "loss": 1.0651, + "step": 12202 + }, + { + "epoch": 0.848933875960903, + "grad_norm": 1.1171875, + "learning_rate": 0.00011728995407110854, + "loss": 0.757, + "step": 12203 + }, + { + "epoch": 0.8490034435980381, + "grad_norm": 1.15625, + "learning_rate": 0.00011718408843865602, + "loss": 0.727, + "step": 12204 + }, + { + "epoch": 0.8490730112351734, + "grad_norm": 1.4140625, + "learning_rate": 0.00011707826763114593, + "loss": 0.773, + "step": 12205 + }, + { + "epoch": 0.8491425788723086, + "grad_norm": 1.03125, + "learning_rate": 0.00011697249165395085, + "loss": 0.7744, + "step": 12206 + }, + { + "epoch": 0.8492121465094438, + "grad_norm": 1.203125, + "learning_rate": 0.00011686676051244183, + "loss": 0.8447, + "step": 12207 + }, + { + "epoch": 0.849281714146579, + "grad_norm": 0.76953125, + "learning_rate": 0.00011676107421198767, + "loss": 0.4904, + "step": 12208 + }, + { + "epoch": 0.8493512817837142, + "grad_norm": 1.1171875, + "learning_rate": 0.00011665543275795432, + "loss": 0.9249, + "step": 12209 + }, + { + "epoch": 0.8494208494208494, + "grad_norm": 0.98046875, + "learning_rate": 0.00011654983615570546, + "loss": 0.5941, + "step": 12210 + }, + { + "epoch": 0.8494904170579847, + "grad_norm": 0.90625, + "learning_rate": 0.00011644428441060295, + "loss": 0.7498, + "step": 12211 + }, + { + "epoch": 0.8495599846951198, + "grad_norm": 1.171875, + "learning_rate": 0.00011633877752800648, + "loss": 0.8989, + "step": 12212 + }, + { + "epoch": 0.849629552332255, + "grad_norm": 1.078125, + "learning_rate": 0.00011623331551327276, + "loss": 0.8272, + "step": 12213 + }, + { + "epoch": 0.8496991199693903, + "grad_norm": 0.87109375, + "learning_rate": 0.00011612789837175686, + "loss": 0.5662, + "step": 12214 + }, + { + "epoch": 0.8497686876065255, + "grad_norm": 1.1640625, + "learning_rate": 0.00011602252610881115, + "loss": 0.7696, + "step": 12215 + }, + { + "epoch": 0.8498382552436606, + "grad_norm": 1.0234375, + "learning_rate": 0.00011591719872978601, + "loss": 0.7914, + "step": 12216 + }, + { + "epoch": 0.8499078228807958, + "grad_norm": 1.5234375, + "learning_rate": 0.0001158119162400294, + "loss": 1.0314, + "step": 12217 + }, + { + "epoch": 0.8499773905179311, + "grad_norm": 0.74609375, + "learning_rate": 0.00011570667864488716, + "loss": 0.6842, + "step": 12218 + }, + { + "epoch": 0.8500469581550663, + "grad_norm": 1.2109375, + "learning_rate": 0.00011560148594970266, + "loss": 0.8372, + "step": 12219 + }, + { + "epoch": 0.8501165257922014, + "grad_norm": 1.1171875, + "learning_rate": 0.00011549633815981652, + "loss": 0.7289, + "step": 12220 + }, + { + "epoch": 0.8501860934293367, + "grad_norm": 0.921875, + "learning_rate": 0.0001153912352805685, + "loss": 0.746, + "step": 12221 + }, + { + "epoch": 0.8502556610664719, + "grad_norm": 1.1171875, + "learning_rate": 0.00011528617731729485, + "loss": 0.7059, + "step": 12222 + }, + { + "epoch": 0.850325228703607, + "grad_norm": 1.1875, + "learning_rate": 0.00011518116427532988, + "loss": 0.6602, + "step": 12223 + }, + { + "epoch": 0.8503947963407423, + "grad_norm": 1.2265625, + "learning_rate": 0.0001150761961600052, + "loss": 0.7342, + "step": 12224 + }, + { + "epoch": 0.8504643639778775, + "grad_norm": 1.125, + "learning_rate": 0.00011497127297665111, + "loss": 0.836, + "step": 12225 + }, + { + "epoch": 0.8505339316150127, + "grad_norm": 1.2890625, + "learning_rate": 0.00011486639473059502, + "loss": 0.7602, + "step": 12226 + }, + { + "epoch": 0.850603499252148, + "grad_norm": 0.99609375, + "learning_rate": 0.00011476156142716198, + "loss": 0.6743, + "step": 12227 + }, + { + "epoch": 0.8506730668892831, + "grad_norm": 1.3515625, + "learning_rate": 0.00011465677307167477, + "loss": 1.1329, + "step": 12228 + }, + { + "epoch": 0.8507426345264183, + "grad_norm": 1.4921875, + "learning_rate": 0.0001145520296694541, + "loss": 0.999, + "step": 12229 + }, + { + "epoch": 0.8508122021635535, + "grad_norm": 0.9375, + "learning_rate": 0.00011444733122581863, + "loss": 0.7615, + "step": 12230 + }, + { + "epoch": 0.8508817698006887, + "grad_norm": 1.046875, + "learning_rate": 0.00011434267774608398, + "loss": 0.7004, + "step": 12231 + }, + { + "epoch": 0.8509513374378239, + "grad_norm": 1.109375, + "learning_rate": 0.00011423806923556424, + "loss": 0.9571, + "step": 12232 + }, + { + "epoch": 0.8510209050749591, + "grad_norm": 1.1484375, + "learning_rate": 0.0001141335056995706, + "loss": 0.8939, + "step": 12233 + }, + { + "epoch": 0.8510904727120944, + "grad_norm": 0.9453125, + "learning_rate": 0.00011402898714341269, + "loss": 0.7862, + "step": 12234 + }, + { + "epoch": 0.8511600403492295, + "grad_norm": 1.03125, + "learning_rate": 0.00011392451357239697, + "loss": 0.7262, + "step": 12235 + }, + { + "epoch": 0.8512296079863647, + "grad_norm": 1.1953125, + "learning_rate": 0.0001138200849918285, + "loss": 0.7603, + "step": 12236 + }, + { + "epoch": 0.8512991756235, + "grad_norm": 1.2578125, + "learning_rate": 0.00011371570140700937, + "loss": 0.7464, + "step": 12237 + }, + { + "epoch": 0.8513687432606352, + "grad_norm": 1.875, + "learning_rate": 0.00011361136282323959, + "loss": 0.8486, + "step": 12238 + }, + { + "epoch": 0.8514383108977703, + "grad_norm": 0.96484375, + "learning_rate": 0.00011350706924581711, + "loss": 0.6902, + "step": 12239 + }, + { + "epoch": 0.8515078785349056, + "grad_norm": 1.34375, + "learning_rate": 0.00011340282068003749, + "loss": 0.9536, + "step": 12240 + }, + { + "epoch": 0.8515774461720408, + "grad_norm": 1.0703125, + "learning_rate": 0.00011329861713119394, + "loss": 0.6185, + "step": 12241 + }, + { + "epoch": 0.851647013809176, + "grad_norm": 1.3125, + "learning_rate": 0.00011319445860457711, + "loss": 0.9331, + "step": 12242 + }, + { + "epoch": 0.8517165814463111, + "grad_norm": 1.171875, + "learning_rate": 0.00011309034510547578, + "loss": 0.9381, + "step": 12243 + }, + { + "epoch": 0.8517861490834464, + "grad_norm": 1.0859375, + "learning_rate": 0.0001129862766391766, + "loss": 0.8453, + "step": 12244 + }, + { + "epoch": 0.8518557167205816, + "grad_norm": 1.359375, + "learning_rate": 0.00011288225321096323, + "loss": 0.8247, + "step": 12245 + }, + { + "epoch": 0.8519252843577167, + "grad_norm": 0.84765625, + "learning_rate": 0.0001127782748261178, + "loss": 0.7709, + "step": 12246 + }, + { + "epoch": 0.851994851994852, + "grad_norm": 1.21875, + "learning_rate": 0.0001126743414899194, + "loss": 0.7957, + "step": 12247 + }, + { + "epoch": 0.8520644196319872, + "grad_norm": 1.28125, + "learning_rate": 0.00011257045320764581, + "loss": 0.975, + "step": 12248 + }, + { + "epoch": 0.8521339872691224, + "grad_norm": 1.2265625, + "learning_rate": 0.00011246660998457136, + "loss": 1.0589, + "step": 12249 + }, + { + "epoch": 0.8522035549062577, + "grad_norm": 1.140625, + "learning_rate": 0.0001123628118259692, + "loss": 0.8049, + "step": 12250 + }, + { + "epoch": 0.8522731225433928, + "grad_norm": 1.4765625, + "learning_rate": 0.00011225905873710929, + "loss": 1.0467, + "step": 12251 + }, + { + "epoch": 0.852342690180528, + "grad_norm": 1.1171875, + "learning_rate": 0.00011215535072325956, + "loss": 0.5724, + "step": 12252 + }, + { + "epoch": 0.8524122578176633, + "grad_norm": 1.1171875, + "learning_rate": 0.00011205168778968644, + "loss": 0.9426, + "step": 12253 + }, + { + "epoch": 0.8524818254547984, + "grad_norm": 1.0, + "learning_rate": 0.00011194806994165297, + "loss": 0.6056, + "step": 12254 + }, + { + "epoch": 0.8525513930919336, + "grad_norm": 0.859375, + "learning_rate": 0.00011184449718442047, + "loss": 0.711, + "step": 12255 + }, + { + "epoch": 0.8526209607290688, + "grad_norm": 1.015625, + "learning_rate": 0.00011174096952324753, + "loss": 0.9735, + "step": 12256 + }, + { + "epoch": 0.8526905283662041, + "grad_norm": 1.15625, + "learning_rate": 0.00011163748696339104, + "loss": 0.7256, + "step": 12257 + }, + { + "epoch": 0.8527600960033392, + "grad_norm": 1.375, + "learning_rate": 0.00011153404951010537, + "loss": 0.6559, + "step": 12258 + }, + { + "epoch": 0.8528296636404744, + "grad_norm": 0.9609375, + "learning_rate": 0.00011143065716864243, + "loss": 0.8445, + "step": 12259 + }, + { + "epoch": 0.8528992312776097, + "grad_norm": 1.015625, + "learning_rate": 0.00011132730994425211, + "loss": 0.6826, + "step": 12260 + }, + { + "epoch": 0.8529687989147449, + "grad_norm": 1.1953125, + "learning_rate": 0.00011122400784218157, + "loss": 0.8554, + "step": 12261 + }, + { + "epoch": 0.85303836655188, + "grad_norm": 0.984375, + "learning_rate": 0.00011112075086767626, + "loss": 0.8118, + "step": 12262 + }, + { + "epoch": 0.8531079341890153, + "grad_norm": 1.109375, + "learning_rate": 0.00011101753902597877, + "loss": 0.693, + "step": 12263 + }, + { + "epoch": 0.8531775018261505, + "grad_norm": 1.2421875, + "learning_rate": 0.00011091437232233015, + "loss": 0.8587, + "step": 12264 + }, + { + "epoch": 0.8532470694632857, + "grad_norm": 1.09375, + "learning_rate": 0.00011081125076196807, + "loss": 0.7478, + "step": 12265 + }, + { + "epoch": 0.8533166371004209, + "grad_norm": 1.0390625, + "learning_rate": 0.00011070817435012892, + "loss": 0.8519, + "step": 12266 + }, + { + "epoch": 0.8533862047375561, + "grad_norm": 0.96875, + "learning_rate": 0.00011060514309204639, + "loss": 0.5889, + "step": 12267 + }, + { + "epoch": 0.8534557723746913, + "grad_norm": 1.484375, + "learning_rate": 0.00011050215699295196, + "loss": 1.0257, + "step": 12268 + }, + { + "epoch": 0.8535253400118264, + "grad_norm": 1.34375, + "learning_rate": 0.00011039921605807446, + "loss": 0.7956, + "step": 12269 + }, + { + "epoch": 0.8535949076489617, + "grad_norm": 2.1875, + "learning_rate": 0.00011029632029264069, + "loss": 1.1196, + "step": 12270 + }, + { + "epoch": 0.8536644752860969, + "grad_norm": 1.328125, + "learning_rate": 0.00011019346970187538, + "loss": 0.9829, + "step": 12271 + }, + { + "epoch": 0.8537340429232321, + "grad_norm": 1.1640625, + "learning_rate": 0.0001100906642910009, + "loss": 0.9512, + "step": 12272 + }, + { + "epoch": 0.8538036105603674, + "grad_norm": 1.046875, + "learning_rate": 0.00010998790406523685, + "loss": 0.6774, + "step": 12273 + }, + { + "epoch": 0.8538731781975025, + "grad_norm": 1.0625, + "learning_rate": 0.00010988518902980115, + "loss": 0.8759, + "step": 12274 + }, + { + "epoch": 0.8539427458346377, + "grad_norm": 1.28125, + "learning_rate": 0.00010978251918990889, + "loss": 0.6949, + "step": 12275 + }, + { + "epoch": 0.854012313471773, + "grad_norm": 1.1796875, + "learning_rate": 0.00010967989455077353, + "loss": 0.8158, + "step": 12276 + }, + { + "epoch": 0.8540818811089081, + "grad_norm": 1.109375, + "learning_rate": 0.00010957731511760527, + "loss": 0.7616, + "step": 12277 + }, + { + "epoch": 0.8541514487460433, + "grad_norm": 1.03125, + "learning_rate": 0.00010947478089561314, + "loss": 0.6717, + "step": 12278 + }, + { + "epoch": 0.8542210163831786, + "grad_norm": 1.1953125, + "learning_rate": 0.00010937229189000286, + "loss": 0.8055, + "step": 12279 + }, + { + "epoch": 0.8542905840203138, + "grad_norm": 1.171875, + "learning_rate": 0.00010926984810597851, + "loss": 0.8088, + "step": 12280 + }, + { + "epoch": 0.8543601516574489, + "grad_norm": 1.046875, + "learning_rate": 0.00010916744954874192, + "loss": 0.8147, + "step": 12281 + }, + { + "epoch": 0.8544297192945841, + "grad_norm": 0.92578125, + "learning_rate": 0.00010906509622349204, + "loss": 0.8183, + "step": 12282 + }, + { + "epoch": 0.8544992869317194, + "grad_norm": 1.296875, + "learning_rate": 0.00010896278813542593, + "loss": 0.7092, + "step": 12283 + }, + { + "epoch": 0.8545688545688546, + "grad_norm": 0.9140625, + "learning_rate": 0.00010886052528973789, + "loss": 0.9953, + "step": 12284 + }, + { + "epoch": 0.8546384222059897, + "grad_norm": 0.89453125, + "learning_rate": 0.00010875830769162109, + "loss": 0.9157, + "step": 12285 + }, + { + "epoch": 0.854707989843125, + "grad_norm": 0.9375, + "learning_rate": 0.00010865613534626517, + "loss": 0.6253, + "step": 12286 + }, + { + "epoch": 0.8547775574802602, + "grad_norm": 1.3125, + "learning_rate": 0.00010855400825885786, + "loss": 0.9757, + "step": 12287 + }, + { + "epoch": 0.8548471251173954, + "grad_norm": 1.046875, + "learning_rate": 0.00010845192643458501, + "loss": 0.7175, + "step": 12288 + }, + { + "epoch": 0.8549166927545306, + "grad_norm": 1.21875, + "learning_rate": 0.00010834988987862936, + "loss": 0.6697, + "step": 12289 + }, + { + "epoch": 0.8549862603916658, + "grad_norm": 1.078125, + "learning_rate": 0.00010824789859617224, + "loss": 0.9938, + "step": 12290 + }, + { + "epoch": 0.855055828028801, + "grad_norm": 1.1796875, + "learning_rate": 0.0001081459525923919, + "loss": 0.766, + "step": 12291 + }, + { + "epoch": 0.8551253956659363, + "grad_norm": 1.0625, + "learning_rate": 0.00010804405187246502, + "loss": 0.6445, + "step": 12292 + }, + { + "epoch": 0.8551949633030714, + "grad_norm": 1.1484375, + "learning_rate": 0.00010794219644156522, + "loss": 0.7632, + "step": 12293 + }, + { + "epoch": 0.8552645309402066, + "grad_norm": 1.0546875, + "learning_rate": 0.00010784038630486437, + "loss": 0.8305, + "step": 12294 + }, + { + "epoch": 0.8553340985773418, + "grad_norm": 1.4609375, + "learning_rate": 0.000107738621467532, + "loss": 0.765, + "step": 12295 + }, + { + "epoch": 0.855403666214477, + "grad_norm": 0.9296875, + "learning_rate": 0.00010763690193473519, + "loss": 0.7739, + "step": 12296 + }, + { + "epoch": 0.8554732338516122, + "grad_norm": 1.140625, + "learning_rate": 0.0001075352277116386, + "loss": 0.8307, + "step": 12297 + }, + { + "epoch": 0.8555428014887474, + "grad_norm": 1.0859375, + "learning_rate": 0.00010743359880340442, + "loss": 0.648, + "step": 12298 + }, + { + "epoch": 0.8556123691258827, + "grad_norm": 1.234375, + "learning_rate": 0.00010733201521519364, + "loss": 0.8163, + "step": 12299 + }, + { + "epoch": 0.8556819367630178, + "grad_norm": 0.84765625, + "learning_rate": 0.0001072304769521637, + "loss": 0.5939, + "step": 12300 + }, + { + "epoch": 0.855751504400153, + "grad_norm": 1.359375, + "learning_rate": 0.00010712898401947024, + "loss": 0.8583, + "step": 12301 + }, + { + "epoch": 0.8558210720372883, + "grad_norm": 0.8828125, + "learning_rate": 0.00010702753642226649, + "loss": 0.7241, + "step": 12302 + }, + { + "epoch": 0.8558906396744235, + "grad_norm": 1.0546875, + "learning_rate": 0.00010692613416570341, + "loss": 0.7889, + "step": 12303 + }, + { + "epoch": 0.8559602073115586, + "grad_norm": 0.9765625, + "learning_rate": 0.00010682477725493, + "loss": 0.7806, + "step": 12304 + }, + { + "epoch": 0.8560297749486939, + "grad_norm": 0.91015625, + "learning_rate": 0.00010672346569509229, + "loss": 0.7109, + "step": 12305 + }, + { + "epoch": 0.8560993425858291, + "grad_norm": 0.77734375, + "learning_rate": 0.00010662219949133478, + "loss": 0.5379, + "step": 12306 + }, + { + "epoch": 0.8561689102229643, + "grad_norm": 1.0078125, + "learning_rate": 0.00010652097864879884, + "loss": 0.8115, + "step": 12307 + }, + { + "epoch": 0.8562384778600994, + "grad_norm": 1.1171875, + "learning_rate": 0.00010641980317262423, + "loss": 0.7521, + "step": 12308 + }, + { + "epoch": 0.8563080454972347, + "grad_norm": 1.0625, + "learning_rate": 0.00010631867306794795, + "loss": 0.7504, + "step": 12309 + }, + { + "epoch": 0.8563776131343699, + "grad_norm": 0.83203125, + "learning_rate": 0.00010621758833990513, + "loss": 0.701, + "step": 12310 + }, + { + "epoch": 0.8564471807715051, + "grad_norm": 1.015625, + "learning_rate": 0.00010611654899362789, + "loss": 0.8729, + "step": 12311 + }, + { + "epoch": 0.8565167484086403, + "grad_norm": 0.9765625, + "learning_rate": 0.00010601555503424687, + "loss": 0.7054, + "step": 12312 + }, + { + "epoch": 0.8565863160457755, + "grad_norm": 1.0546875, + "learning_rate": 0.00010591460646689022, + "loss": 0.78, + "step": 12313 + }, + { + "epoch": 0.8566558836829107, + "grad_norm": 0.96875, + "learning_rate": 0.00010581370329668316, + "loss": 0.7408, + "step": 12314 + }, + { + "epoch": 0.856725451320046, + "grad_norm": 1.0625, + "learning_rate": 0.00010571284552874939, + "loss": 0.7002, + "step": 12315 + }, + { + "epoch": 0.8567950189571811, + "grad_norm": 1.09375, + "learning_rate": 0.00010561203316820922, + "loss": 0.7092, + "step": 12316 + }, + { + "epoch": 0.8568645865943163, + "grad_norm": 0.9921875, + "learning_rate": 0.00010551126622018248, + "loss": 0.858, + "step": 12317 + }, + { + "epoch": 0.8569341542314516, + "grad_norm": 1.28125, + "learning_rate": 0.00010541054468978507, + "loss": 1.0974, + "step": 12318 + }, + { + "epoch": 0.8570037218685868, + "grad_norm": 1.2578125, + "learning_rate": 0.00010530986858213088, + "loss": 0.9695, + "step": 12319 + }, + { + "epoch": 0.8570732895057219, + "grad_norm": 1.03125, + "learning_rate": 0.00010520923790233217, + "loss": 0.8819, + "step": 12320 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.359375, + "learning_rate": 0.00010510865265549818, + "loss": 0.8206, + "step": 12321 + }, + { + "epoch": 0.8572124247799924, + "grad_norm": 1.0078125, + "learning_rate": 0.00010500811284673628, + "loss": 0.7581, + "step": 12322 + }, + { + "epoch": 0.8572819924171275, + "grad_norm": 1.0625, + "learning_rate": 0.00010490761848115127, + "loss": 1.0358, + "step": 12323 + }, + { + "epoch": 0.8573515600542627, + "grad_norm": 1.1953125, + "learning_rate": 0.00010480716956384584, + "loss": 0.8605, + "step": 12324 + }, + { + "epoch": 0.857421127691398, + "grad_norm": 1.2265625, + "learning_rate": 0.00010470676609992014, + "loss": 0.928, + "step": 12325 + }, + { + "epoch": 0.8574906953285332, + "grad_norm": 1.4140625, + "learning_rate": 0.0001046064080944723, + "loss": 0.8935, + "step": 12326 + }, + { + "epoch": 0.8575602629656683, + "grad_norm": 1.2421875, + "learning_rate": 0.00010450609555259805, + "loss": 0.9788, + "step": 12327 + }, + { + "epoch": 0.8576298306028036, + "grad_norm": 1.0859375, + "learning_rate": 0.00010440582847939061, + "loss": 0.6736, + "step": 12328 + }, + { + "epoch": 0.8576993982399388, + "grad_norm": 0.953125, + "learning_rate": 0.00010430560687994117, + "loss": 0.6788, + "step": 12329 + }, + { + "epoch": 0.857768965877074, + "grad_norm": 1.4140625, + "learning_rate": 0.00010420543075933786, + "loss": 0.7944, + "step": 12330 + }, + { + "epoch": 0.8578385335142092, + "grad_norm": 1.3515625, + "learning_rate": 0.00010410530012266817, + "loss": 0.8764, + "step": 12331 + }, + { + "epoch": 0.8579081011513444, + "grad_norm": 0.99609375, + "learning_rate": 0.00010400521497501558, + "loss": 0.6809, + "step": 12332 + }, + { + "epoch": 0.8579776687884796, + "grad_norm": 1.3828125, + "learning_rate": 0.00010390517532146182, + "loss": 0.8418, + "step": 12333 + }, + { + "epoch": 0.8580472364256148, + "grad_norm": 0.86328125, + "learning_rate": 0.00010380518116708692, + "loss": 0.6544, + "step": 12334 + }, + { + "epoch": 0.85811680406275, + "grad_norm": 1.34375, + "learning_rate": 0.00010370523251696751, + "loss": 0.891, + "step": 12335 + }, + { + "epoch": 0.8581863716998852, + "grad_norm": 1.4140625, + "learning_rate": 0.00010360532937617894, + "loss": 1.1266, + "step": 12336 + }, + { + "epoch": 0.8582559393370204, + "grad_norm": 0.90625, + "learning_rate": 0.0001035054717497933, + "loss": 0.5519, + "step": 12337 + }, + { + "epoch": 0.8583255069741557, + "grad_norm": 1.1875, + "learning_rate": 0.0001034056596428814, + "loss": 0.9505, + "step": 12338 + }, + { + "epoch": 0.8583950746112908, + "grad_norm": 1.1875, + "learning_rate": 0.00010330589306051074, + "loss": 1.0161, + "step": 12339 + }, + { + "epoch": 0.858464642248426, + "grad_norm": 1.3828125, + "learning_rate": 0.00010320617200774718, + "loss": 0.7717, + "step": 12340 + }, + { + "epoch": 0.8585342098855613, + "grad_norm": 1.40625, + "learning_rate": 0.0001031064964896542, + "loss": 0.9509, + "step": 12341 + }, + { + "epoch": 0.8586037775226965, + "grad_norm": 1.375, + "learning_rate": 0.00010300686651129265, + "loss": 1.0319, + "step": 12342 + }, + { + "epoch": 0.8586733451598316, + "grad_norm": 0.96484375, + "learning_rate": 0.00010290728207772104, + "loss": 0.734, + "step": 12343 + }, + { + "epoch": 0.8587429127969669, + "grad_norm": 0.890625, + "learning_rate": 0.00010280774319399599, + "loss": 0.7712, + "step": 12344 + }, + { + "epoch": 0.8588124804341021, + "grad_norm": 0.953125, + "learning_rate": 0.00010270824986517169, + "loss": 0.87, + "step": 12345 + }, + { + "epoch": 0.8588820480712372, + "grad_norm": 1.4921875, + "learning_rate": 0.00010260880209629985, + "loss": 0.8685, + "step": 12346 + }, + { + "epoch": 0.8589516157083724, + "grad_norm": 0.99609375, + "learning_rate": 0.00010250939989242957, + "loss": 0.662, + "step": 12347 + }, + { + "epoch": 0.8590211833455077, + "grad_norm": 1.0859375, + "learning_rate": 0.00010241004325860859, + "loss": 0.6317, + "step": 12348 + }, + { + "epoch": 0.8590907509826429, + "grad_norm": 1.015625, + "learning_rate": 0.00010231073219988108, + "loss": 0.663, + "step": 12349 + }, + { + "epoch": 0.859160318619778, + "grad_norm": 1.265625, + "learning_rate": 0.00010221146672129022, + "loss": 1.0907, + "step": 12350 + }, + { + "epoch": 0.8592298862569133, + "grad_norm": 1.1171875, + "learning_rate": 0.00010211224682787567, + "loss": 0.7271, + "step": 12351 + }, + { + "epoch": 0.8592994538940485, + "grad_norm": 1.03125, + "learning_rate": 0.00010201307252467573, + "loss": 0.7229, + "step": 12352 + }, + { + "epoch": 0.8593690215311837, + "grad_norm": 1.0703125, + "learning_rate": 0.00010191394381672547, + "loss": 0.848, + "step": 12353 + }, + { + "epoch": 0.859438589168319, + "grad_norm": 1.0234375, + "learning_rate": 0.00010181486070905855, + "loss": 0.9485, + "step": 12354 + }, + { + "epoch": 0.8595081568054541, + "grad_norm": 0.92578125, + "learning_rate": 0.00010171582320670602, + "loss": 0.6691, + "step": 12355 + }, + { + "epoch": 0.8595777244425893, + "grad_norm": 0.9609375, + "learning_rate": 0.00010161683131469635, + "loss": 0.9354, + "step": 12356 + }, + { + "epoch": 0.8596472920797246, + "grad_norm": 1.203125, + "learning_rate": 0.00010151788503805548, + "loss": 0.8412, + "step": 12357 + }, + { + "epoch": 0.8597168597168597, + "grad_norm": 1.046875, + "learning_rate": 0.00010141898438180785, + "loss": 0.8609, + "step": 12358 + }, + { + "epoch": 0.8597864273539949, + "grad_norm": 1.0546875, + "learning_rate": 0.00010132012935097512, + "loss": 1.051, + "step": 12359 + }, + { + "epoch": 0.8598559949911301, + "grad_norm": 0.97265625, + "learning_rate": 0.0001012213199505766, + "loss": 0.9528, + "step": 12360 + }, + { + "epoch": 0.8599255626282654, + "grad_norm": 1.1953125, + "learning_rate": 0.00010112255618562894, + "loss": 0.9181, + "step": 12361 + }, + { + "epoch": 0.8599951302654005, + "grad_norm": 1.25, + "learning_rate": 0.00010102383806114735, + "loss": 0.9213, + "step": 12362 + }, + { + "epoch": 0.8600646979025357, + "grad_norm": 1.09375, + "learning_rate": 0.00010092516558214427, + "loss": 0.7589, + "step": 12363 + }, + { + "epoch": 0.860134265539671, + "grad_norm": 1.0703125, + "learning_rate": 0.00010082653875362946, + "loss": 0.8638, + "step": 12364 + }, + { + "epoch": 0.8602038331768062, + "grad_norm": 1.40625, + "learning_rate": 0.00010072795758061082, + "loss": 0.7913, + "step": 12365 + }, + { + "epoch": 0.8602734008139413, + "grad_norm": 1.1875, + "learning_rate": 0.0001006294220680939, + "loss": 0.7034, + "step": 12366 + }, + { + "epoch": 0.8603429684510766, + "grad_norm": 1.5234375, + "learning_rate": 0.00010053093222108168, + "loss": 0.9527, + "step": 12367 + }, + { + "epoch": 0.8604125360882118, + "grad_norm": 1.21875, + "learning_rate": 0.00010043248804457494, + "loss": 0.7554, + "step": 12368 + }, + { + "epoch": 0.860482103725347, + "grad_norm": 1.1875, + "learning_rate": 0.0001003340895435726, + "loss": 0.9477, + "step": 12369 + }, + { + "epoch": 0.8605516713624821, + "grad_norm": 1.125, + "learning_rate": 0.00010023573672307052, + "loss": 0.8501, + "step": 12370 + }, + { + "epoch": 0.8606212389996174, + "grad_norm": 1.2890625, + "learning_rate": 0.00010013742958806238, + "loss": 0.6945, + "step": 12371 + }, + { + "epoch": 0.8606908066367526, + "grad_norm": 1.1796875, + "learning_rate": 0.00010003916814353986, + "loss": 0.9602, + "step": 12372 + }, + { + "epoch": 0.8607603742738877, + "grad_norm": 1.140625, + "learning_rate": 9.994095239449253e-05, + "loss": 0.6929, + "step": 12373 + }, + { + "epoch": 0.860829941911023, + "grad_norm": 0.828125, + "learning_rate": 9.984278234590694e-05, + "loss": 0.6558, + "step": 12374 + }, + { + "epoch": 0.8608995095481582, + "grad_norm": 1.15625, + "learning_rate": 9.974465800276755e-05, + "loss": 0.8409, + "step": 12375 + }, + { + "epoch": 0.8609690771852934, + "grad_norm": 1.1875, + "learning_rate": 9.964657937005683e-05, + "loss": 0.8428, + "step": 12376 + }, + { + "epoch": 0.8610386448224286, + "grad_norm": 0.78125, + "learning_rate": 9.95485464527549e-05, + "loss": 0.9456, + "step": 12377 + }, + { + "epoch": 0.8611082124595638, + "grad_norm": 1.265625, + "learning_rate": 9.945055925583913e-05, + "loss": 0.7942, + "step": 12378 + }, + { + "epoch": 0.861177780096699, + "grad_norm": 0.88671875, + "learning_rate": 9.935261778428473e-05, + "loss": 0.6238, + "step": 12379 + }, + { + "epoch": 0.8612473477338343, + "grad_norm": 1.1015625, + "learning_rate": 9.925472204306485e-05, + "loss": 0.8494, + "step": 12380 + }, + { + "epoch": 0.8613169153709694, + "grad_norm": 1.140625, + "learning_rate": 9.915687203715007e-05, + "loss": 0.7037, + "step": 12381 + }, + { + "epoch": 0.8613864830081046, + "grad_norm": 1.5546875, + "learning_rate": 9.905906777150874e-05, + "loss": 0.9521, + "step": 12382 + }, + { + "epoch": 0.8614560506452398, + "grad_norm": 1.3359375, + "learning_rate": 9.89613092511068e-05, + "loss": 0.9395, + "step": 12383 + }, + { + "epoch": 0.8615256182823751, + "grad_norm": 1.046875, + "learning_rate": 9.886359648090826e-05, + "loss": 0.7269, + "step": 12384 + }, + { + "epoch": 0.8615951859195102, + "grad_norm": 1.0390625, + "learning_rate": 9.876592946587393e-05, + "loss": 0.7753, + "step": 12385 + }, + { + "epoch": 0.8616647535566454, + "grad_norm": 0.98828125, + "learning_rate": 9.866830821096318e-05, + "loss": 0.7973, + "step": 12386 + }, + { + "epoch": 0.8617343211937807, + "grad_norm": 1.546875, + "learning_rate": 9.857073272113282e-05, + "loss": 0.6929, + "step": 12387 + }, + { + "epoch": 0.8618038888309159, + "grad_norm": 0.94140625, + "learning_rate": 9.847320300133722e-05, + "loss": 0.8213, + "step": 12388 + }, + { + "epoch": 0.861873456468051, + "grad_norm": 1.1015625, + "learning_rate": 9.837571905652808e-05, + "loss": 0.8065, + "step": 12389 + }, + { + "epoch": 0.8619430241051863, + "grad_norm": 1.21875, + "learning_rate": 9.827828089165547e-05, + "loss": 0.7282, + "step": 12390 + }, + { + "epoch": 0.8620125917423215, + "grad_norm": 0.83984375, + "learning_rate": 9.818088851166684e-05, + "loss": 0.6526, + "step": 12391 + }, + { + "epoch": 0.8620821593794566, + "grad_norm": 1.03125, + "learning_rate": 9.808354192150725e-05, + "loss": 0.918, + "step": 12392 + }, + { + "epoch": 0.8621517270165919, + "grad_norm": 1.1171875, + "learning_rate": 9.79862411261192e-05, + "loss": 1.1219, + "step": 12393 + }, + { + "epoch": 0.8622212946537271, + "grad_norm": 1.296875, + "learning_rate": 9.788898613044328e-05, + "loss": 0.8244, + "step": 12394 + }, + { + "epoch": 0.8622908622908623, + "grad_norm": 1.0546875, + "learning_rate": 9.779177693941799e-05, + "loss": 0.7914, + "step": 12395 + }, + { + "epoch": 0.8623604299279974, + "grad_norm": 0.91015625, + "learning_rate": 9.76946135579787e-05, + "loss": 0.6921, + "step": 12396 + }, + { + "epoch": 0.8624299975651327, + "grad_norm": 1.15625, + "learning_rate": 9.759749599105883e-05, + "loss": 0.9035, + "step": 12397 + }, + { + "epoch": 0.8624995652022679, + "grad_norm": 0.7578125, + "learning_rate": 9.750042424358984e-05, + "loss": 0.6005, + "step": 12398 + }, + { + "epoch": 0.8625691328394031, + "grad_norm": 1.2109375, + "learning_rate": 9.740339832050016e-05, + "loss": 0.8299, + "step": 12399 + }, + { + "epoch": 0.8626387004765383, + "grad_norm": 0.94140625, + "learning_rate": 9.730641822671649e-05, + "loss": 0.8842, + "step": 12400 + }, + { + "epoch": 0.8627082681136735, + "grad_norm": 1.1171875, + "learning_rate": 9.720948396716323e-05, + "loss": 1.0405, + "step": 12401 + }, + { + "epoch": 0.8627778357508087, + "grad_norm": 1.4609375, + "learning_rate": 9.711259554676188e-05, + "loss": 0.8272, + "step": 12402 + }, + { + "epoch": 0.862847403387944, + "grad_norm": 1.109375, + "learning_rate": 9.701575297043197e-05, + "loss": 0.9613, + "step": 12403 + }, + { + "epoch": 0.8629169710250791, + "grad_norm": 0.96484375, + "learning_rate": 9.691895624309066e-05, + "loss": 0.6554, + "step": 12404 + }, + { + "epoch": 0.8629865386622143, + "grad_norm": 0.79296875, + "learning_rate": 9.682220536965314e-05, + "loss": 0.6862, + "step": 12405 + }, + { + "epoch": 0.8630561062993496, + "grad_norm": 1.2890625, + "learning_rate": 9.672550035503158e-05, + "loss": 0.6961, + "step": 12406 + }, + { + "epoch": 0.8631256739364848, + "grad_norm": 1.203125, + "learning_rate": 9.662884120413617e-05, + "loss": 0.9276, + "step": 12407 + }, + { + "epoch": 0.8631952415736199, + "grad_norm": 0.890625, + "learning_rate": 9.653222792187489e-05, + "loss": 0.584, + "step": 12408 + }, + { + "epoch": 0.8632648092107551, + "grad_norm": 1.015625, + "learning_rate": 9.643566051315334e-05, + "loss": 0.6404, + "step": 12409 + }, + { + "epoch": 0.8633343768478904, + "grad_norm": 1.1484375, + "learning_rate": 9.633913898287472e-05, + "loss": 0.7392, + "step": 12410 + }, + { + "epoch": 0.8634039444850256, + "grad_norm": 1.0078125, + "learning_rate": 9.624266333593968e-05, + "loss": 0.6214, + "step": 12411 + }, + { + "epoch": 0.8634735121221607, + "grad_norm": 1.21875, + "learning_rate": 9.614623357724706e-05, + "loss": 0.9785, + "step": 12412 + }, + { + "epoch": 0.863543079759296, + "grad_norm": 1.0078125, + "learning_rate": 9.604984971169273e-05, + "loss": 0.7385, + "step": 12413 + }, + { + "epoch": 0.8636126473964312, + "grad_norm": 0.796875, + "learning_rate": 9.595351174417089e-05, + "loss": 0.6108, + "step": 12414 + }, + { + "epoch": 0.8636822150335663, + "grad_norm": 1.1640625, + "learning_rate": 9.585721967957306e-05, + "loss": 0.8732, + "step": 12415 + }, + { + "epoch": 0.8637517826707016, + "grad_norm": 1.4609375, + "learning_rate": 9.576097352278846e-05, + "loss": 0.8376, + "step": 12416 + }, + { + "epoch": 0.8638213503078368, + "grad_norm": 0.9765625, + "learning_rate": 9.566477327870371e-05, + "loss": 0.883, + "step": 12417 + }, + { + "epoch": 0.863890917944972, + "grad_norm": 1.34375, + "learning_rate": 9.55686189522036e-05, + "loss": 0.9439, + "step": 12418 + }, + { + "epoch": 0.8639604855821073, + "grad_norm": 1.1875, + "learning_rate": 9.547251054817052e-05, + "loss": 0.9326, + "step": 12419 + }, + { + "epoch": 0.8640300532192424, + "grad_norm": 1.1484375, + "learning_rate": 9.537644807148416e-05, + "loss": 0.6504, + "step": 12420 + }, + { + "epoch": 0.8640996208563776, + "grad_norm": 1.2421875, + "learning_rate": 9.528043152702204e-05, + "loss": 0.9633, + "step": 12421 + }, + { + "epoch": 0.8641691884935128, + "grad_norm": 0.9453125, + "learning_rate": 9.518446091965938e-05, + "loss": 0.7757, + "step": 12422 + }, + { + "epoch": 0.864238756130648, + "grad_norm": 1.203125, + "learning_rate": 9.50885362542695e-05, + "loss": 0.8814, + "step": 12423 + }, + { + "epoch": 0.8643083237677832, + "grad_norm": 1.109375, + "learning_rate": 9.49926575357225e-05, + "loss": 0.6348, + "step": 12424 + }, + { + "epoch": 0.8643778914049184, + "grad_norm": 1.15625, + "learning_rate": 9.489682476888673e-05, + "loss": 0.8008, + "step": 12425 + }, + { + "epoch": 0.8644474590420537, + "grad_norm": 1.1796875, + "learning_rate": 9.480103795862805e-05, + "loss": 0.8026, + "step": 12426 + }, + { + "epoch": 0.8645170266791888, + "grad_norm": 1.15625, + "learning_rate": 9.470529710981036e-05, + "loss": 0.8007, + "step": 12427 + }, + { + "epoch": 0.864586594316324, + "grad_norm": 1.15625, + "learning_rate": 9.460960222729443e-05, + "loss": 0.9158, + "step": 12428 + }, + { + "epoch": 0.8646561619534593, + "grad_norm": 1.4375, + "learning_rate": 9.45139533159396e-05, + "loss": 0.6803, + "step": 12429 + }, + { + "epoch": 0.8647257295905945, + "grad_norm": 1.203125, + "learning_rate": 9.441835038060221e-05, + "loss": 1.0275, + "step": 12430 + }, + { + "epoch": 0.8647952972277296, + "grad_norm": 1.234375, + "learning_rate": 9.432279342613637e-05, + "loss": 0.9827, + "step": 12431 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.984375, + "learning_rate": 9.42272824573941e-05, + "loss": 0.469, + "step": 12432 + }, + { + "epoch": 0.8649344325020001, + "grad_norm": 1.28125, + "learning_rate": 9.413181747922517e-05, + "loss": 0.8856, + "step": 12433 + }, + { + "epoch": 0.8650040001391353, + "grad_norm": 1.265625, + "learning_rate": 9.403639849647672e-05, + "loss": 0.8125, + "step": 12434 + }, + { + "epoch": 0.8650735677762704, + "grad_norm": 1.3984375, + "learning_rate": 9.39410255139933e-05, + "loss": 0.6766, + "step": 12435 + }, + { + "epoch": 0.8651431354134057, + "grad_norm": 0.99609375, + "learning_rate": 9.384569853661773e-05, + "loss": 0.8128, + "step": 12436 + }, + { + "epoch": 0.8652127030505409, + "grad_norm": 1.2421875, + "learning_rate": 9.375041756919045e-05, + "loss": 0.9982, + "step": 12437 + }, + { + "epoch": 0.865282270687676, + "grad_norm": 0.8828125, + "learning_rate": 9.365518261654904e-05, + "loss": 0.5813, + "step": 12438 + }, + { + "epoch": 0.8653518383248113, + "grad_norm": 1.0546875, + "learning_rate": 9.355999368352907e-05, + "loss": 0.8297, + "step": 12439 + }, + { + "epoch": 0.8654214059619465, + "grad_norm": 1.1015625, + "learning_rate": 9.346485077496369e-05, + "loss": 0.8446, + "step": 12440 + }, + { + "epoch": 0.8654909735990817, + "grad_norm": 1.0390625, + "learning_rate": 9.336975389568425e-05, + "loss": 0.7497, + "step": 12441 + }, + { + "epoch": 0.865560541236217, + "grad_norm": 0.94140625, + "learning_rate": 9.327470305051866e-05, + "loss": 0.7889, + "step": 12442 + }, + { + "epoch": 0.8656301088733521, + "grad_norm": 1.0859375, + "learning_rate": 9.317969824429363e-05, + "loss": 0.6662, + "step": 12443 + }, + { + "epoch": 0.8656996765104873, + "grad_norm": 1.015625, + "learning_rate": 9.308473948183283e-05, + "loss": 0.7784, + "step": 12444 + }, + { + "epoch": 0.8657692441476226, + "grad_norm": 1.0859375, + "learning_rate": 9.298982676795764e-05, + "loss": 0.8745, + "step": 12445 + }, + { + "epoch": 0.8658388117847577, + "grad_norm": 1.1015625, + "learning_rate": 9.289496010748722e-05, + "loss": 0.7813, + "step": 12446 + }, + { + "epoch": 0.8659083794218929, + "grad_norm": 1.109375, + "learning_rate": 9.280013950523891e-05, + "loss": 0.9556, + "step": 12447 + }, + { + "epoch": 0.8659779470590281, + "grad_norm": 0.98046875, + "learning_rate": 9.270536496602678e-05, + "loss": 0.8412, + "step": 12448 + }, + { + "epoch": 0.8660475146961634, + "grad_norm": 1.0625, + "learning_rate": 9.261063649466306e-05, + "loss": 0.7001, + "step": 12449 + }, + { + "epoch": 0.8661170823332985, + "grad_norm": 1.171875, + "learning_rate": 9.251595409595748e-05, + "loss": 0.8509, + "step": 12450 + }, + { + "epoch": 0.8661866499704337, + "grad_norm": 1.234375, + "learning_rate": 9.242131777471796e-05, + "loss": 0.6475, + "step": 12451 + }, + { + "epoch": 0.866256217607569, + "grad_norm": 1.21875, + "learning_rate": 9.232672753574944e-05, + "loss": 0.7832, + "step": 12452 + }, + { + "epoch": 0.8663257852447042, + "grad_norm": 0.9140625, + "learning_rate": 9.223218338385441e-05, + "loss": 0.7964, + "step": 12453 + }, + { + "epoch": 0.8663953528818393, + "grad_norm": 1.0234375, + "learning_rate": 9.21376853238336e-05, + "loss": 0.6256, + "step": 12454 + }, + { + "epoch": 0.8664649205189746, + "grad_norm": 1.2421875, + "learning_rate": 9.204323336048548e-05, + "loss": 0.8907, + "step": 12455 + }, + { + "epoch": 0.8665344881561098, + "grad_norm": 1.3203125, + "learning_rate": 9.194882749860545e-05, + "loss": 0.839, + "step": 12456 + }, + { + "epoch": 0.866604055793245, + "grad_norm": 1.109375, + "learning_rate": 9.185446774298678e-05, + "loss": 0.9559, + "step": 12457 + }, + { + "epoch": 0.8666736234303802, + "grad_norm": 1.28125, + "learning_rate": 9.176015409842098e-05, + "loss": 0.9003, + "step": 12458 + }, + { + "epoch": 0.8667431910675154, + "grad_norm": 1.1328125, + "learning_rate": 9.166588656969676e-05, + "loss": 1.0415, + "step": 12459 + }, + { + "epoch": 0.8668127587046506, + "grad_norm": 1.2734375, + "learning_rate": 9.157166516160031e-05, + "loss": 0.8276, + "step": 12460 + }, + { + "epoch": 0.8668823263417857, + "grad_norm": 1.25, + "learning_rate": 9.147748987891614e-05, + "loss": 0.9967, + "step": 12461 + }, + { + "epoch": 0.866951893978921, + "grad_norm": 1.59375, + "learning_rate": 9.138336072642573e-05, + "loss": 0.4936, + "step": 12462 + }, + { + "epoch": 0.8670214616160562, + "grad_norm": 1.015625, + "learning_rate": 9.128927770890826e-05, + "loss": 0.682, + "step": 12463 + }, + { + "epoch": 0.8670910292531914, + "grad_norm": 0.98828125, + "learning_rate": 9.119524083114106e-05, + "loss": 0.5948, + "step": 12464 + }, + { + "epoch": 0.8671605968903267, + "grad_norm": 1.203125, + "learning_rate": 9.110125009789905e-05, + "loss": 0.8522, + "step": 12465 + }, + { + "epoch": 0.8672301645274618, + "grad_norm": 0.9765625, + "learning_rate": 9.100730551395431e-05, + "loss": 0.9638, + "step": 12466 + }, + { + "epoch": 0.867299732164597, + "grad_norm": 1.109375, + "learning_rate": 9.09134070840767e-05, + "loss": 0.8597, + "step": 12467 + }, + { + "epoch": 0.8673692998017323, + "grad_norm": 1.0234375, + "learning_rate": 9.081955481303416e-05, + "loss": 0.6316, + "step": 12468 + }, + { + "epoch": 0.8674388674388674, + "grad_norm": 1.328125, + "learning_rate": 9.072574870559224e-05, + "loss": 1.031, + "step": 12469 + }, + { + "epoch": 0.8675084350760026, + "grad_norm": 1.09375, + "learning_rate": 9.06319887665138e-05, + "loss": 0.7982, + "step": 12470 + }, + { + "epoch": 0.8675780027131379, + "grad_norm": 0.953125, + "learning_rate": 9.053827500055911e-05, + "loss": 0.7964, + "step": 12471 + }, + { + "epoch": 0.8676475703502731, + "grad_norm": 1.1171875, + "learning_rate": 9.044460741248683e-05, + "loss": 0.8397, + "step": 12472 + }, + { + "epoch": 0.8677171379874082, + "grad_norm": 0.890625, + "learning_rate": 9.035098600705305e-05, + "loss": 0.7533, + "step": 12473 + }, + { + "epoch": 0.8677867056245434, + "grad_norm": 1.046875, + "learning_rate": 9.025741078901106e-05, + "loss": 0.6866, + "step": 12474 + }, + { + "epoch": 0.8678562732616787, + "grad_norm": 0.953125, + "learning_rate": 9.016388176311251e-05, + "loss": 0.5942, + "step": 12475 + }, + { + "epoch": 0.8679258408988139, + "grad_norm": 1.03125, + "learning_rate": 9.007039893410607e-05, + "loss": 0.7647, + "step": 12476 + }, + { + "epoch": 0.867995408535949, + "grad_norm": 1.2890625, + "learning_rate": 8.997696230673824e-05, + "loss": 0.8657, + "step": 12477 + }, + { + "epoch": 0.8680649761730843, + "grad_norm": 1.109375, + "learning_rate": 8.988357188575347e-05, + "loss": 0.6619, + "step": 12478 + }, + { + "epoch": 0.8681345438102195, + "grad_norm": 1.0859375, + "learning_rate": 8.979022767589373e-05, + "loss": 0.6795, + "step": 12479 + }, + { + "epoch": 0.8682041114473547, + "grad_norm": 1.0703125, + "learning_rate": 8.969692968189835e-05, + "loss": 0.7326, + "step": 12480 + }, + { + "epoch": 0.8682736790844899, + "grad_norm": 0.9921875, + "learning_rate": 8.960367790850455e-05, + "loss": 0.7366, + "step": 12481 + }, + { + "epoch": 0.8683432467216251, + "grad_norm": 1.296875, + "learning_rate": 8.951047236044719e-05, + "loss": 0.8505, + "step": 12482 + }, + { + "epoch": 0.8684128143587603, + "grad_norm": 0.97265625, + "learning_rate": 8.941731304245903e-05, + "loss": 1.0211, + "step": 12483 + }, + { + "epoch": 0.8684823819958956, + "grad_norm": 1.1328125, + "learning_rate": 8.932419995927e-05, + "loss": 0.8623, + "step": 12484 + }, + { + "epoch": 0.8685519496330307, + "grad_norm": 1.15625, + "learning_rate": 8.923113311560782e-05, + "loss": 0.746, + "step": 12485 + }, + { + "epoch": 0.8686215172701659, + "grad_norm": 1.296875, + "learning_rate": 8.913811251619807e-05, + "loss": 0.9333, + "step": 12486 + }, + { + "epoch": 0.8686910849073011, + "grad_norm": 1.1328125, + "learning_rate": 8.90451381657641e-05, + "loss": 0.9065, + "step": 12487 + }, + { + "epoch": 0.8687606525444364, + "grad_norm": 1.1796875, + "learning_rate": 8.89522100690262e-05, + "loss": 1.2835, + "step": 12488 + }, + { + "epoch": 0.8688302201815715, + "grad_norm": 1.1484375, + "learning_rate": 8.88593282307033e-05, + "loss": 0.8051, + "step": 12489 + }, + { + "epoch": 0.8688997878187067, + "grad_norm": 1.0, + "learning_rate": 8.876649265551107e-05, + "loss": 0.7495, + "step": 12490 + }, + { + "epoch": 0.868969355455842, + "grad_norm": 1.1484375, + "learning_rate": 8.86737033481635e-05, + "loss": 0.869, + "step": 12491 + }, + { + "epoch": 0.8690389230929771, + "grad_norm": 1.34375, + "learning_rate": 8.85809603133716e-05, + "loss": 0.8236, + "step": 12492 + }, + { + "epoch": 0.8691084907301123, + "grad_norm": 0.98828125, + "learning_rate": 8.848826355584494e-05, + "loss": 0.8449, + "step": 12493 + }, + { + "epoch": 0.8691780583672476, + "grad_norm": 1.046875, + "learning_rate": 8.839561308028987e-05, + "loss": 0.6904, + "step": 12494 + }, + { + "epoch": 0.8692476260043828, + "grad_norm": 0.88671875, + "learning_rate": 8.830300889141051e-05, + "loss": 0.5591, + "step": 12495 + }, + { + "epoch": 0.8693171936415179, + "grad_norm": 1.140625, + "learning_rate": 8.821045099390911e-05, + "loss": 0.8662, + "step": 12496 + }, + { + "epoch": 0.8693867612786532, + "grad_norm": 1.4296875, + "learning_rate": 8.811793939248547e-05, + "loss": 0.7743, + "step": 12497 + }, + { + "epoch": 0.8694563289157884, + "grad_norm": 1.140625, + "learning_rate": 8.802547409183659e-05, + "loss": 0.7568, + "step": 12498 + }, + { + "epoch": 0.8695258965529236, + "grad_norm": 1.0078125, + "learning_rate": 8.793305509665727e-05, + "loss": 0.7029, + "step": 12499 + }, + { + "epoch": 0.8695954641900587, + "grad_norm": 1.015625, + "learning_rate": 8.784068241164023e-05, + "loss": 0.7975, + "step": 12500 + }, + { + "epoch": 0.869665031827194, + "grad_norm": 1.109375, + "learning_rate": 8.774835604147602e-05, + "loss": 0.8389, + "step": 12501 + }, + { + "epoch": 0.8697345994643292, + "grad_norm": 0.9921875, + "learning_rate": 8.76560759908519e-05, + "loss": 0.9547, + "step": 12502 + }, + { + "epoch": 0.8698041671014644, + "grad_norm": 0.96484375, + "learning_rate": 8.75638422644539e-05, + "loss": 0.6406, + "step": 12503 + }, + { + "epoch": 0.8698737347385996, + "grad_norm": 1.078125, + "learning_rate": 8.747165486696474e-05, + "loss": 0.75, + "step": 12504 + }, + { + "epoch": 0.8699433023757348, + "grad_norm": 1.046875, + "learning_rate": 8.737951380306564e-05, + "loss": 0.7778, + "step": 12505 + }, + { + "epoch": 0.87001287001287, + "grad_norm": 0.95703125, + "learning_rate": 8.728741907743476e-05, + "loss": 0.7914, + "step": 12506 + }, + { + "epoch": 0.8700824376500053, + "grad_norm": 1.171875, + "learning_rate": 8.719537069474848e-05, + "loss": 0.8327, + "step": 12507 + }, + { + "epoch": 0.8701520052871404, + "grad_norm": 1.3515625, + "learning_rate": 8.71033686596805e-05, + "loss": 1.1135, + "step": 12508 + }, + { + "epoch": 0.8702215729242756, + "grad_norm": 0.92578125, + "learning_rate": 8.701141297690163e-05, + "loss": 0.8336, + "step": 12509 + }, + { + "epoch": 0.8702911405614109, + "grad_norm": 1.0703125, + "learning_rate": 8.69195036510818e-05, + "loss": 0.6165, + "step": 12510 + }, + { + "epoch": 0.870360708198546, + "grad_norm": 1.140625, + "learning_rate": 8.68276406868873e-05, + "loss": 0.895, + "step": 12511 + }, + { + "epoch": 0.8704302758356812, + "grad_norm": 1.421875, + "learning_rate": 8.673582408898251e-05, + "loss": 0.8865, + "step": 12512 + }, + { + "epoch": 0.8704998434728164, + "grad_norm": 1.0546875, + "learning_rate": 8.664405386202911e-05, + "loss": 0.735, + "step": 12513 + }, + { + "epoch": 0.8705694111099517, + "grad_norm": 1.3984375, + "learning_rate": 8.655233001068708e-05, + "loss": 0.9591, + "step": 12514 + }, + { + "epoch": 0.8706389787470868, + "grad_norm": 0.97265625, + "learning_rate": 8.646065253961377e-05, + "loss": 0.8405, + "step": 12515 + }, + { + "epoch": 0.870708546384222, + "grad_norm": 0.94921875, + "learning_rate": 8.636902145346381e-05, + "loss": 0.7767, + "step": 12516 + }, + { + "epoch": 0.8707781140213573, + "grad_norm": 1.1875, + "learning_rate": 8.627743675689004e-05, + "loss": 0.8149, + "step": 12517 + }, + { + "epoch": 0.8708476816584925, + "grad_norm": 0.85546875, + "learning_rate": 8.618589845454239e-05, + "loss": 0.7053, + "step": 12518 + }, + { + "epoch": 0.8709172492956276, + "grad_norm": 1.0234375, + "learning_rate": 8.609440655106903e-05, + "loss": 0.5503, + "step": 12519 + }, + { + "epoch": 0.8709868169327629, + "grad_norm": 1.125, + "learning_rate": 8.600296105111505e-05, + "loss": 0.6353, + "step": 12520 + }, + { + "epoch": 0.8710563845698981, + "grad_norm": 1.0234375, + "learning_rate": 8.591156195932403e-05, + "loss": 0.6243, + "step": 12521 + }, + { + "epoch": 0.8711259522070333, + "grad_norm": 1.3046875, + "learning_rate": 8.582020928033651e-05, + "loss": 0.9454, + "step": 12522 + }, + { + "epoch": 0.8711955198441685, + "grad_norm": 0.9296875, + "learning_rate": 8.572890301879066e-05, + "loss": 0.7238, + "step": 12523 + }, + { + "epoch": 0.8712650874813037, + "grad_norm": 1.3359375, + "learning_rate": 8.56376431793231e-05, + "loss": 0.849, + "step": 12524 + }, + { + "epoch": 0.8713346551184389, + "grad_norm": 1.234375, + "learning_rate": 8.554642976656734e-05, + "loss": 0.5616, + "step": 12525 + }, + { + "epoch": 0.871404222755574, + "grad_norm": 1.21875, + "learning_rate": 8.54552627851548e-05, + "loss": 0.7973, + "step": 12526 + }, + { + "epoch": 0.8714737903927093, + "grad_norm": 1.0703125, + "learning_rate": 8.5364142239714e-05, + "loss": 0.7257, + "step": 12527 + }, + { + "epoch": 0.8715433580298445, + "grad_norm": 0.97265625, + "learning_rate": 8.527306813487213e-05, + "loss": 0.6967, + "step": 12528 + }, + { + "epoch": 0.8716129256669797, + "grad_norm": 1.1796875, + "learning_rate": 8.518204047525336e-05, + "loss": 0.8197, + "step": 12529 + }, + { + "epoch": 0.871682493304115, + "grad_norm": 0.9140625, + "learning_rate": 8.509105926547945e-05, + "loss": 0.4892, + "step": 12530 + }, + { + "epoch": 0.8717520609412501, + "grad_norm": 1.40625, + "learning_rate": 8.500012451017014e-05, + "loss": 1.0708, + "step": 12531 + }, + { + "epoch": 0.8718216285783853, + "grad_norm": 0.9765625, + "learning_rate": 8.490923621394242e-05, + "loss": 0.7588, + "step": 12532 + }, + { + "epoch": 0.8718911962155206, + "grad_norm": 1.078125, + "learning_rate": 8.481839438141159e-05, + "loss": 0.7692, + "step": 12533 + }, + { + "epoch": 0.8719607638526558, + "grad_norm": 1.1015625, + "learning_rate": 8.472759901718952e-05, + "loss": 0.9075, + "step": 12534 + }, + { + "epoch": 0.8720303314897909, + "grad_norm": 0.6640625, + "learning_rate": 8.463685012588685e-05, + "loss": 0.5494, + "step": 12535 + }, + { + "epoch": 0.8720998991269262, + "grad_norm": 1.1015625, + "learning_rate": 8.4546147712111e-05, + "loss": 0.7523, + "step": 12536 + }, + { + "epoch": 0.8721694667640614, + "grad_norm": 1.0859375, + "learning_rate": 8.445549178046774e-05, + "loss": 0.7254, + "step": 12537 + }, + { + "epoch": 0.8722390344011965, + "grad_norm": 0.8515625, + "learning_rate": 8.436488233555973e-05, + "loss": 0.5229, + "step": 12538 + }, + { + "epoch": 0.8723086020383317, + "grad_norm": 0.984375, + "learning_rate": 8.427431938198805e-05, + "loss": 0.5742, + "step": 12539 + }, + { + "epoch": 0.872378169675467, + "grad_norm": 1.1015625, + "learning_rate": 8.418380292435079e-05, + "loss": 0.8908, + "step": 12540 + }, + { + "epoch": 0.8724477373126022, + "grad_norm": 1.1171875, + "learning_rate": 8.409333296724364e-05, + "loss": 0.7893, + "step": 12541 + }, + { + "epoch": 0.8725173049497373, + "grad_norm": 1.1015625, + "learning_rate": 8.40029095152609e-05, + "loss": 0.546, + "step": 12542 + }, + { + "epoch": 0.8725868725868726, + "grad_norm": 1.2890625, + "learning_rate": 8.391253257299336e-05, + "loss": 0.6896, + "step": 12543 + }, + { + "epoch": 0.8726564402240078, + "grad_norm": 1.0859375, + "learning_rate": 8.382220214503011e-05, + "loss": 0.6414, + "step": 12544 + }, + { + "epoch": 0.872726007861143, + "grad_norm": 1.1875, + "learning_rate": 8.373191823595727e-05, + "loss": 0.7615, + "step": 12545 + }, + { + "epoch": 0.8727955754982782, + "grad_norm": 1.2421875, + "learning_rate": 8.364168085035939e-05, + "loss": 0.8378, + "step": 12546 + }, + { + "epoch": 0.8728651431354134, + "grad_norm": 0.95703125, + "learning_rate": 8.355148999281825e-05, + "loss": 0.7706, + "step": 12547 + }, + { + "epoch": 0.8729347107725486, + "grad_norm": 1.15625, + "learning_rate": 8.346134566791308e-05, + "loss": 0.863, + "step": 12548 + }, + { + "epoch": 0.8730042784096839, + "grad_norm": 1.1171875, + "learning_rate": 8.337124788022122e-05, + "loss": 0.632, + "step": 12549 + }, + { + "epoch": 0.873073846046819, + "grad_norm": 1.0390625, + "learning_rate": 8.32811966343171e-05, + "loss": 0.8922, + "step": 12550 + }, + { + "epoch": 0.8731434136839542, + "grad_norm": 1.2109375, + "learning_rate": 8.319119193477342e-05, + "loss": 0.8238, + "step": 12551 + }, + { + "epoch": 0.8732129813210894, + "grad_norm": 1.3203125, + "learning_rate": 8.310123378615975e-05, + "loss": 1.0244, + "step": 12552 + }, + { + "epoch": 0.8732825489582247, + "grad_norm": 1.3046875, + "learning_rate": 8.301132219304408e-05, + "loss": 0.9827, + "step": 12553 + }, + { + "epoch": 0.8733521165953598, + "grad_norm": 1.21875, + "learning_rate": 8.292145715999144e-05, + "loss": 0.6882, + "step": 12554 + }, + { + "epoch": 0.873421684232495, + "grad_norm": 0.99609375, + "learning_rate": 8.283163869156451e-05, + "loss": 0.6744, + "step": 12555 + }, + { + "epoch": 0.8734912518696303, + "grad_norm": 1.09375, + "learning_rate": 8.274186679232443e-05, + "loss": 0.7423, + "step": 12556 + }, + { + "epoch": 0.8735608195067655, + "grad_norm": 1.484375, + "learning_rate": 8.265214146682909e-05, + "loss": 0.9127, + "step": 12557 + }, + { + "epoch": 0.8736303871439006, + "grad_norm": 1.34375, + "learning_rate": 8.256246271963419e-05, + "loss": 0.7658, + "step": 12558 + }, + { + "epoch": 0.8736999547810359, + "grad_norm": 1.1328125, + "learning_rate": 8.247283055529298e-05, + "loss": 0.9081, + "step": 12559 + }, + { + "epoch": 0.8737695224181711, + "grad_norm": 1.578125, + "learning_rate": 8.238324497835681e-05, + "loss": 0.7911, + "step": 12560 + }, + { + "epoch": 0.8738390900553062, + "grad_norm": 1.0625, + "learning_rate": 8.229370599337449e-05, + "loss": 0.7267, + "step": 12561 + }, + { + "epoch": 0.8739086576924415, + "grad_norm": 1.375, + "learning_rate": 8.220421360489205e-05, + "loss": 0.823, + "step": 12562 + }, + { + "epoch": 0.8739782253295767, + "grad_norm": 1.1015625, + "learning_rate": 8.211476781745375e-05, + "loss": 0.8053, + "step": 12563 + }, + { + "epoch": 0.8740477929667119, + "grad_norm": 1.0078125, + "learning_rate": 8.202536863560083e-05, + "loss": 0.6133, + "step": 12564 + }, + { + "epoch": 0.874117360603847, + "grad_norm": 1.140625, + "learning_rate": 8.193601606387302e-05, + "loss": 0.8156, + "step": 12565 + }, + { + "epoch": 0.8741869282409823, + "grad_norm": 1.359375, + "learning_rate": 8.184671010680677e-05, + "loss": 1.0027, + "step": 12566 + }, + { + "epoch": 0.8742564958781175, + "grad_norm": 1.0546875, + "learning_rate": 8.175745076893681e-05, + "loss": 0.8141, + "step": 12567 + }, + { + "epoch": 0.8743260635152527, + "grad_norm": 1.4140625, + "learning_rate": 8.166823805479507e-05, + "loss": 0.8223, + "step": 12568 + }, + { + "epoch": 0.8743956311523879, + "grad_norm": 1.0234375, + "learning_rate": 8.157907196891157e-05, + "loss": 0.6991, + "step": 12569 + }, + { + "epoch": 0.8744651987895231, + "grad_norm": 0.83984375, + "learning_rate": 8.14899525158137e-05, + "loss": 0.6608, + "step": 12570 + }, + { + "epoch": 0.8745347664266583, + "grad_norm": 0.94140625, + "learning_rate": 8.14008797000264e-05, + "loss": 0.9501, + "step": 12571 + }, + { + "epoch": 0.8746043340637936, + "grad_norm": 1.046875, + "learning_rate": 8.13118535260724e-05, + "loss": 0.6796, + "step": 12572 + }, + { + "epoch": 0.8746739017009287, + "grad_norm": 2.09375, + "learning_rate": 8.122287399847173e-05, + "loss": 1.0462, + "step": 12573 + }, + { + "epoch": 0.8747434693380639, + "grad_norm": 1.203125, + "learning_rate": 8.113394112174255e-05, + "loss": 0.7808, + "step": 12574 + }, + { + "epoch": 0.8748130369751992, + "grad_norm": 1.203125, + "learning_rate": 8.10450549004006e-05, + "loss": 0.829, + "step": 12575 + }, + { + "epoch": 0.8748826046123344, + "grad_norm": 0.8828125, + "learning_rate": 8.095621533895869e-05, + "loss": 0.7831, + "step": 12576 + }, + { + "epoch": 0.8749521722494695, + "grad_norm": 1.21875, + "learning_rate": 8.086742244192802e-05, + "loss": 0.7498, + "step": 12577 + }, + { + "epoch": 0.8750217398866047, + "grad_norm": 1.2578125, + "learning_rate": 8.077867621381662e-05, + "loss": 0.8737, + "step": 12578 + }, + { + "epoch": 0.87509130752374, + "grad_norm": 0.8984375, + "learning_rate": 8.068997665913113e-05, + "loss": 0.7675, + "step": 12579 + }, + { + "epoch": 0.8751608751608752, + "grad_norm": 1.1953125, + "learning_rate": 8.060132378237473e-05, + "loss": 1.011, + "step": 12580 + }, + { + "epoch": 0.8752304427980103, + "grad_norm": 1.140625, + "learning_rate": 8.051271758804913e-05, + "loss": 0.7039, + "step": 12581 + }, + { + "epoch": 0.8753000104351456, + "grad_norm": 1.1171875, + "learning_rate": 8.042415808065306e-05, + "loss": 0.7051, + "step": 12582 + }, + { + "epoch": 0.8753695780722808, + "grad_norm": 1.046875, + "learning_rate": 8.033564526468318e-05, + "loss": 0.7209, + "step": 12583 + }, + { + "epoch": 0.875439145709416, + "grad_norm": 1.4375, + "learning_rate": 8.024717914463397e-05, + "loss": 0.9931, + "step": 12584 + }, + { + "epoch": 0.8755087133465512, + "grad_norm": 1.21875, + "learning_rate": 8.01587597249972e-05, + "loss": 1.0059, + "step": 12585 + }, + { + "epoch": 0.8755782809836864, + "grad_norm": 1.0, + "learning_rate": 8.007038701026215e-05, + "loss": 0.8795, + "step": 12586 + }, + { + "epoch": 0.8756478486208216, + "grad_norm": 1.0390625, + "learning_rate": 7.998206100491578e-05, + "loss": 0.7554, + "step": 12587 + }, + { + "epoch": 0.8757174162579568, + "grad_norm": 1.0234375, + "learning_rate": 7.989378171344341e-05, + "loss": 0.7503, + "step": 12588 + }, + { + "epoch": 0.875786983895092, + "grad_norm": 1.171875, + "learning_rate": 7.980554914032712e-05, + "loss": 0.9079, + "step": 12589 + }, + { + "epoch": 0.8758565515322272, + "grad_norm": 0.6953125, + "learning_rate": 7.971736329004675e-05, + "loss": 0.5777, + "step": 12590 + }, + { + "epoch": 0.8759261191693624, + "grad_norm": 2.09375, + "learning_rate": 7.962922416708029e-05, + "loss": 0.9667, + "step": 12591 + }, + { + "epoch": 0.8759956868064976, + "grad_norm": 1.1875, + "learning_rate": 7.954113177590272e-05, + "loss": 0.9835, + "step": 12592 + }, + { + "epoch": 0.8760652544436328, + "grad_norm": 1.28125, + "learning_rate": 7.945308612098712e-05, + "loss": 0.9883, + "step": 12593 + }, + { + "epoch": 0.876134822080768, + "grad_norm": 1.21875, + "learning_rate": 7.93650872068038e-05, + "loss": 0.8747, + "step": 12594 + }, + { + "epoch": 0.8762043897179033, + "grad_norm": 1.046875, + "learning_rate": 7.927713503782107e-05, + "loss": 0.8781, + "step": 12595 + }, + { + "epoch": 0.8762739573550384, + "grad_norm": 1.1328125, + "learning_rate": 7.91892296185045e-05, + "loss": 0.7986, + "step": 12596 + }, + { + "epoch": 0.8763435249921736, + "grad_norm": 1.1015625, + "learning_rate": 7.91013709533177e-05, + "loss": 0.7469, + "step": 12597 + }, + { + "epoch": 0.8764130926293089, + "grad_norm": 1.0, + "learning_rate": 7.90135590467217e-05, + "loss": 0.9099, + "step": 12598 + }, + { + "epoch": 0.8764826602664441, + "grad_norm": 1.1484375, + "learning_rate": 7.892579390317511e-05, + "loss": 0.7349, + "step": 12599 + }, + { + "epoch": 0.8765522279035792, + "grad_norm": 0.9765625, + "learning_rate": 7.883807552713384e-05, + "loss": 0.6099, + "step": 12600 + }, + { + "epoch": 0.8766217955407145, + "grad_norm": 1.046875, + "learning_rate": 7.875040392305222e-05, + "loss": 0.9179, + "step": 12601 + }, + { + "epoch": 0.8766913631778497, + "grad_norm": 1.03125, + "learning_rate": 7.866277909538177e-05, + "loss": 0.8175, + "step": 12602 + }, + { + "epoch": 0.8767609308149849, + "grad_norm": 1.4609375, + "learning_rate": 7.857520104857163e-05, + "loss": 1.1682, + "step": 12603 + }, + { + "epoch": 0.87683049845212, + "grad_norm": 1.0703125, + "learning_rate": 7.848766978706812e-05, + "loss": 0.6443, + "step": 12604 + }, + { + "epoch": 0.8769000660892553, + "grad_norm": 1.296875, + "learning_rate": 7.840018531531623e-05, + "loss": 0.7135, + "step": 12605 + }, + { + "epoch": 0.8769696337263905, + "grad_norm": 1.2421875, + "learning_rate": 7.831274763775754e-05, + "loss": 0.9032, + "step": 12606 + }, + { + "epoch": 0.8770392013635256, + "grad_norm": 1.453125, + "learning_rate": 7.822535675883202e-05, + "loss": 0.734, + "step": 12607 + }, + { + "epoch": 0.8771087690006609, + "grad_norm": 0.875, + "learning_rate": 7.813801268297672e-05, + "loss": 0.726, + "step": 12608 + }, + { + "epoch": 0.8771783366377961, + "grad_norm": 1.109375, + "learning_rate": 7.805071541462672e-05, + "loss": 0.7206, + "step": 12609 + }, + { + "epoch": 0.8772479042749313, + "grad_norm": 1.2734375, + "learning_rate": 7.796346495821415e-05, + "loss": 0.7493, + "step": 12610 + }, + { + "epoch": 0.8773174719120665, + "grad_norm": 0.83984375, + "learning_rate": 7.78762613181696e-05, + "loss": 0.8072, + "step": 12611 + }, + { + "epoch": 0.8773870395492017, + "grad_norm": 0.84765625, + "learning_rate": 7.778910449892074e-05, + "loss": 0.6677, + "step": 12612 + }, + { + "epoch": 0.8774566071863369, + "grad_norm": 1.296875, + "learning_rate": 7.770199450489279e-05, + "loss": 0.7072, + "step": 12613 + }, + { + "epoch": 0.8775261748234722, + "grad_norm": 1.4453125, + "learning_rate": 7.761493134050879e-05, + "loss": 0.9677, + "step": 12614 + }, + { + "epoch": 0.8775957424606073, + "grad_norm": 1.078125, + "learning_rate": 7.75279150101893e-05, + "loss": 1.0538, + "step": 12615 + }, + { + "epoch": 0.8776653100977425, + "grad_norm": 1.15625, + "learning_rate": 7.744094551835291e-05, + "loss": 0.9044, + "step": 12616 + }, + { + "epoch": 0.8777348777348777, + "grad_norm": 1.40625, + "learning_rate": 7.735402286941528e-05, + "loss": 1.0882, + "step": 12617 + }, + { + "epoch": 0.877804445372013, + "grad_norm": 1.484375, + "learning_rate": 7.726714706778992e-05, + "loss": 0.9486, + "step": 12618 + }, + { + "epoch": 0.8778740130091481, + "grad_norm": 1.0703125, + "learning_rate": 7.71803181178875e-05, + "loss": 0.8556, + "step": 12619 + }, + { + "epoch": 0.8779435806462833, + "grad_norm": 1.09375, + "learning_rate": 7.709353602411751e-05, + "loss": 0.6466, + "step": 12620 + }, + { + "epoch": 0.8780131482834186, + "grad_norm": 1.1953125, + "learning_rate": 7.700680079088595e-05, + "loss": 0.9507, + "step": 12621 + }, + { + "epoch": 0.8780827159205538, + "grad_norm": 1.046875, + "learning_rate": 7.692011242259677e-05, + "loss": 0.7419, + "step": 12622 + }, + { + "epoch": 0.8781522835576889, + "grad_norm": 1.2578125, + "learning_rate": 7.683347092365166e-05, + "loss": 0.9697, + "step": 12623 + }, + { + "epoch": 0.8782218511948242, + "grad_norm": 0.98046875, + "learning_rate": 7.674687629844967e-05, + "loss": 0.7267, + "step": 12624 + }, + { + "epoch": 0.8782914188319594, + "grad_norm": 1.0078125, + "learning_rate": 7.666032855138793e-05, + "loss": 0.6941, + "step": 12625 + }, + { + "epoch": 0.8783609864690946, + "grad_norm": 0.890625, + "learning_rate": 7.65738276868605e-05, + "loss": 0.6775, + "step": 12626 + }, + { + "epoch": 0.8784305541062298, + "grad_norm": 1.1484375, + "learning_rate": 7.648737370925995e-05, + "loss": 0.8829, + "step": 12627 + }, + { + "epoch": 0.878500121743365, + "grad_norm": 1.140625, + "learning_rate": 7.640096662297547e-05, + "loss": 1.0224, + "step": 12628 + }, + { + "epoch": 0.8785696893805002, + "grad_norm": 1.078125, + "learning_rate": 7.631460643239463e-05, + "loss": 0.9203, + "step": 12629 + }, + { + "epoch": 0.8786392570176353, + "grad_norm": 1.140625, + "learning_rate": 7.62282931419026e-05, + "loss": 0.7338, + "step": 12630 + }, + { + "epoch": 0.8787088246547706, + "grad_norm": 1.1171875, + "learning_rate": 7.614202675588167e-05, + "loss": 0.7872, + "step": 12631 + }, + { + "epoch": 0.8787783922919058, + "grad_norm": 0.91796875, + "learning_rate": 7.605580727871175e-05, + "loss": 0.8034, + "step": 12632 + }, + { + "epoch": 0.878847959929041, + "grad_norm": 0.94921875, + "learning_rate": 7.596963471477103e-05, + "loss": 0.4924, + "step": 12633 + }, + { + "epoch": 0.8789175275661762, + "grad_norm": 1.0546875, + "learning_rate": 7.5883509068435e-05, + "loss": 0.9425, + "step": 12634 + }, + { + "epoch": 0.8789870952033114, + "grad_norm": 1.203125, + "learning_rate": 7.579743034407638e-05, + "loss": 1.0256, + "step": 12635 + }, + { + "epoch": 0.8790566628404466, + "grad_norm": 1.1640625, + "learning_rate": 7.571139854606579e-05, + "loss": 0.8888, + "step": 12636 + }, + { + "epoch": 0.8791262304775819, + "grad_norm": 0.9453125, + "learning_rate": 7.562541367877184e-05, + "loss": 0.8218, + "step": 12637 + }, + { + "epoch": 0.879195798114717, + "grad_norm": 0.9375, + "learning_rate": 7.553947574655995e-05, + "loss": 0.5736, + "step": 12638 + }, + { + "epoch": 0.8792653657518522, + "grad_norm": 0.90625, + "learning_rate": 7.545358475379405e-05, + "loss": 0.7649, + "step": 12639 + }, + { + "epoch": 0.8793349333889875, + "grad_norm": 0.984375, + "learning_rate": 7.536774070483488e-05, + "loss": 0.7031, + "step": 12640 + }, + { + "epoch": 0.8794045010261227, + "grad_norm": 1.0703125, + "learning_rate": 7.52819436040415e-05, + "loss": 0.8432, + "step": 12641 + }, + { + "epoch": 0.8794740686632578, + "grad_norm": 1.3203125, + "learning_rate": 7.519619345577e-05, + "loss": 1.072, + "step": 12642 + }, + { + "epoch": 0.879543636300393, + "grad_norm": 1.234375, + "learning_rate": 7.511049026437434e-05, + "loss": 0.7029, + "step": 12643 + }, + { + "epoch": 0.8796132039375283, + "grad_norm": 0.92578125, + "learning_rate": 7.502483403420646e-05, + "loss": 0.656, + "step": 12644 + }, + { + "epoch": 0.8796827715746635, + "grad_norm": 0.99609375, + "learning_rate": 7.493922476961523e-05, + "loss": 0.7228, + "step": 12645 + }, + { + "epoch": 0.8797523392117986, + "grad_norm": 1.203125, + "learning_rate": 7.48536624749474e-05, + "loss": 0.7988, + "step": 12646 + }, + { + "epoch": 0.8798219068489339, + "grad_norm": 1.15625, + "learning_rate": 7.476814715454738e-05, + "loss": 0.9107, + "step": 12647 + }, + { + "epoch": 0.8798914744860691, + "grad_norm": 0.8984375, + "learning_rate": 7.46826788127577e-05, + "loss": 0.5806, + "step": 12648 + }, + { + "epoch": 0.8799610421232043, + "grad_norm": 1.0234375, + "learning_rate": 7.459725745391743e-05, + "loss": 0.7292, + "step": 12649 + }, + { + "epoch": 0.8800306097603395, + "grad_norm": 0.94921875, + "learning_rate": 7.451188308236401e-05, + "loss": 0.7109, + "step": 12650 + }, + { + "epoch": 0.8801001773974747, + "grad_norm": 1.03125, + "learning_rate": 7.44265557024324e-05, + "loss": 0.7169, + "step": 12651 + }, + { + "epoch": 0.8801697450346099, + "grad_norm": 1.4140625, + "learning_rate": 7.434127531845514e-05, + "loss": 0.8128, + "step": 12652 + }, + { + "epoch": 0.8802393126717452, + "grad_norm": 1.5859375, + "learning_rate": 7.425604193476232e-05, + "loss": 1.0139, + "step": 12653 + }, + { + "epoch": 0.8803088803088803, + "grad_norm": 1.1484375, + "learning_rate": 7.417085555568137e-05, + "loss": 0.8128, + "step": 12654 + }, + { + "epoch": 0.8803784479460155, + "grad_norm": 0.83984375, + "learning_rate": 7.408571618553794e-05, + "loss": 0.8119, + "step": 12655 + }, + { + "epoch": 0.8804480155831507, + "grad_norm": 0.91015625, + "learning_rate": 7.400062382865491e-05, + "loss": 0.6313, + "step": 12656 + }, + { + "epoch": 0.880517583220286, + "grad_norm": 1.1953125, + "learning_rate": 7.39155784893527e-05, + "loss": 0.7273, + "step": 12657 + }, + { + "epoch": 0.8805871508574211, + "grad_norm": 1.3671875, + "learning_rate": 7.383058017194976e-05, + "loss": 0.977, + "step": 12658 + }, + { + "epoch": 0.8806567184945563, + "grad_norm": 0.84765625, + "learning_rate": 7.374562888076175e-05, + "loss": 0.4533, + "step": 12659 + }, + { + "epoch": 0.8807262861316916, + "grad_norm": 1.0546875, + "learning_rate": 7.366072462010187e-05, + "loss": 0.6656, + "step": 12660 + }, + { + "epoch": 0.8807958537688267, + "grad_norm": 0.9921875, + "learning_rate": 7.357586739428135e-05, + "loss": 0.7439, + "step": 12661 + }, + { + "epoch": 0.8808654214059619, + "grad_norm": 1.21875, + "learning_rate": 7.349105720760884e-05, + "loss": 0.7649, + "step": 12662 + }, + { + "epoch": 0.8809349890430972, + "grad_norm": 0.9296875, + "learning_rate": 7.340629406439048e-05, + "loss": 0.7316, + "step": 12663 + }, + { + "epoch": 0.8810045566802324, + "grad_norm": 1.0390625, + "learning_rate": 7.332157796893002e-05, + "loss": 0.6436, + "step": 12664 + }, + { + "epoch": 0.8810741243173675, + "grad_norm": 1.2421875, + "learning_rate": 7.323690892552903e-05, + "loss": 0.7449, + "step": 12665 + }, + { + "epoch": 0.8811436919545028, + "grad_norm": 1.1171875, + "learning_rate": 7.315228693848674e-05, + "loss": 0.7347, + "step": 12666 + }, + { + "epoch": 0.881213259591638, + "grad_norm": 0.9140625, + "learning_rate": 7.306771201209961e-05, + "loss": 0.604, + "step": 12667 + }, + { + "epoch": 0.8812828272287732, + "grad_norm": 1.3046875, + "learning_rate": 7.298318415066186e-05, + "loss": 1.0214, + "step": 12668 + }, + { + "epoch": 0.8813523948659083, + "grad_norm": 1.03125, + "learning_rate": 7.289870335846571e-05, + "loss": 0.683, + "step": 12669 + }, + { + "epoch": 0.8814219625030436, + "grad_norm": 0.87890625, + "learning_rate": 7.28142696398002e-05, + "loss": 0.4949, + "step": 12670 + }, + { + "epoch": 0.8814915301401788, + "grad_norm": 1.125, + "learning_rate": 7.272988299895278e-05, + "loss": 0.9083, + "step": 12671 + }, + { + "epoch": 0.881561097777314, + "grad_norm": 1.5078125, + "learning_rate": 7.264554344020835e-05, + "loss": 0.964, + "step": 12672 + }, + { + "epoch": 0.8816306654144492, + "grad_norm": 1.28125, + "learning_rate": 7.256125096784893e-05, + "loss": 0.7362, + "step": 12673 + }, + { + "epoch": 0.8817002330515844, + "grad_norm": 1.2109375, + "learning_rate": 7.247700558615433e-05, + "loss": 0.9041, + "step": 12674 + }, + { + "epoch": 0.8817698006887196, + "grad_norm": 0.96875, + "learning_rate": 7.239280729940234e-05, + "loss": 0.6304, + "step": 12675 + }, + { + "epoch": 0.8818393683258549, + "grad_norm": 1.328125, + "learning_rate": 7.230865611186833e-05, + "loss": 0.8177, + "step": 12676 + }, + { + "epoch": 0.88190893596299, + "grad_norm": 0.96875, + "learning_rate": 7.222455202782485e-05, + "loss": 0.6874, + "step": 12677 + }, + { + "epoch": 0.8819785036001252, + "grad_norm": 1.125, + "learning_rate": 7.214049505154207e-05, + "loss": 0.7348, + "step": 12678 + }, + { + "epoch": 0.8820480712372605, + "grad_norm": 0.91796875, + "learning_rate": 7.205648518728824e-05, + "loss": 0.6891, + "step": 12679 + }, + { + "epoch": 0.8821176388743956, + "grad_norm": 1.125, + "learning_rate": 7.197252243932906e-05, + "loss": 0.6281, + "step": 12680 + }, + { + "epoch": 0.8821872065115308, + "grad_norm": 1.2265625, + "learning_rate": 7.188860681192766e-05, + "loss": 0.8182, + "step": 12681 + }, + { + "epoch": 0.882256774148666, + "grad_norm": 1.34375, + "learning_rate": 7.180473830934453e-05, + "loss": 0.6742, + "step": 12682 + }, + { + "epoch": 0.8823263417858013, + "grad_norm": 1.1484375, + "learning_rate": 7.172091693583826e-05, + "loss": 0.7446, + "step": 12683 + }, + { + "epoch": 0.8823959094229364, + "grad_norm": 1.21875, + "learning_rate": 7.163714269566524e-05, + "loss": 0.6961, + "step": 12684 + }, + { + "epoch": 0.8824654770600716, + "grad_norm": 1.40625, + "learning_rate": 7.15534155930786e-05, + "loss": 0.7753, + "step": 12685 + }, + { + "epoch": 0.8825350446972069, + "grad_norm": 1.296875, + "learning_rate": 7.146973563233005e-05, + "loss": 0.8588, + "step": 12686 + }, + { + "epoch": 0.8826046123343421, + "grad_norm": 1.4375, + "learning_rate": 7.138610281766811e-05, + "loss": 0.6299, + "step": 12687 + }, + { + "epoch": 0.8826741799714772, + "grad_norm": 1.15625, + "learning_rate": 7.130251715333913e-05, + "loss": 0.6606, + "step": 12688 + }, + { + "epoch": 0.8827437476086125, + "grad_norm": 1.0859375, + "learning_rate": 7.12189786435874e-05, + "loss": 0.7995, + "step": 12689 + }, + { + "epoch": 0.8828133152457477, + "grad_norm": 1.078125, + "learning_rate": 7.113548729265462e-05, + "loss": 0.7172, + "step": 12690 + }, + { + "epoch": 0.8828828828828829, + "grad_norm": 0.9453125, + "learning_rate": 7.105204310478009e-05, + "loss": 0.7823, + "step": 12691 + }, + { + "epoch": 0.8829524505200181, + "grad_norm": 0.921875, + "learning_rate": 7.096864608420029e-05, + "loss": 0.6294, + "step": 12692 + }, + { + "epoch": 0.8830220181571533, + "grad_norm": 1.1015625, + "learning_rate": 7.088529623514995e-05, + "loss": 0.6797, + "step": 12693 + }, + { + "epoch": 0.8830915857942885, + "grad_norm": 1.3359375, + "learning_rate": 7.080199356186146e-05, + "loss": 0.814, + "step": 12694 + }, + { + "epoch": 0.8831611534314237, + "grad_norm": 1.2265625, + "learning_rate": 7.071873806856422e-05, + "loss": 0.8889, + "step": 12695 + }, + { + "epoch": 0.8832307210685589, + "grad_norm": 1.3515625, + "learning_rate": 7.063552975948528e-05, + "loss": 0.9012, + "step": 12696 + }, + { + "epoch": 0.8833002887056941, + "grad_norm": 1.4453125, + "learning_rate": 7.055236863884984e-05, + "loss": 1.0218, + "step": 12697 + }, + { + "epoch": 0.8833698563428293, + "grad_norm": 1.5546875, + "learning_rate": 7.04692547108805e-05, + "loss": 0.6138, + "step": 12698 + }, + { + "epoch": 0.8834394239799646, + "grad_norm": 1.3671875, + "learning_rate": 7.038618797979735e-05, + "loss": 0.8291, + "step": 12699 + }, + { + "epoch": 0.8835089916170997, + "grad_norm": 1.1484375, + "learning_rate": 7.030316844981766e-05, + "loss": 0.87, + "step": 12700 + }, + { + "epoch": 0.8835785592542349, + "grad_norm": 1.3359375, + "learning_rate": 7.022019612515728e-05, + "loss": 0.8387, + "step": 12701 + }, + { + "epoch": 0.8836481268913702, + "grad_norm": 1.046875, + "learning_rate": 7.013727101002876e-05, + "loss": 0.5919, + "step": 12702 + }, + { + "epoch": 0.8837176945285053, + "grad_norm": 1.203125, + "learning_rate": 7.00543931086427e-05, + "loss": 0.8006, + "step": 12703 + }, + { + "epoch": 0.8837872621656405, + "grad_norm": 1.3046875, + "learning_rate": 6.997156242520752e-05, + "loss": 1.0003, + "step": 12704 + }, + { + "epoch": 0.8838568298027758, + "grad_norm": 1.109375, + "learning_rate": 6.988877896392864e-05, + "loss": 0.7739, + "step": 12705 + }, + { + "epoch": 0.883926397439911, + "grad_norm": 1.1171875, + "learning_rate": 6.980604272900937e-05, + "loss": 0.9019, + "step": 12706 + }, + { + "epoch": 0.8839959650770461, + "grad_norm": 0.91015625, + "learning_rate": 6.972335372465067e-05, + "loss": 0.7214, + "step": 12707 + }, + { + "epoch": 0.8840655327141813, + "grad_norm": 1.15625, + "learning_rate": 6.964071195505129e-05, + "loss": 0.8192, + "step": 12708 + }, + { + "epoch": 0.8841351003513166, + "grad_norm": 0.9375, + "learning_rate": 6.955811742440721e-05, + "loss": 0.5364, + "step": 12709 + }, + { + "epoch": 0.8842046679884518, + "grad_norm": 0.91015625, + "learning_rate": 6.947557013691197e-05, + "loss": 0.6884, + "step": 12710 + }, + { + "epoch": 0.8842742356255869, + "grad_norm": 1.3515625, + "learning_rate": 6.939307009675711e-05, + "loss": 1.0167, + "step": 12711 + }, + { + "epoch": 0.8843438032627222, + "grad_norm": 1.0625, + "learning_rate": 6.931061730813171e-05, + "loss": 0.9485, + "step": 12712 + }, + { + "epoch": 0.8844133708998574, + "grad_norm": 1.0703125, + "learning_rate": 6.92282117752221e-05, + "loss": 0.7769, + "step": 12713 + }, + { + "epoch": 0.8844829385369926, + "grad_norm": 1.0, + "learning_rate": 6.914585350221236e-05, + "loss": 0.938, + "step": 12714 + }, + { + "epoch": 0.8845525061741278, + "grad_norm": 1.046875, + "learning_rate": 6.906354249328428e-05, + "loss": 0.7045, + "step": 12715 + }, + { + "epoch": 0.884622073811263, + "grad_norm": 1.171875, + "learning_rate": 6.89812787526175e-05, + "loss": 0.8892, + "step": 12716 + }, + { + "epoch": 0.8846916414483982, + "grad_norm": 1.0859375, + "learning_rate": 6.889906228438847e-05, + "loss": 1.0516, + "step": 12717 + }, + { + "epoch": 0.8847612090855335, + "grad_norm": 0.859375, + "learning_rate": 6.881689309277206e-05, + "loss": 0.5978, + "step": 12718 + }, + { + "epoch": 0.8848307767226686, + "grad_norm": 0.984375, + "learning_rate": 6.873477118194038e-05, + "loss": 0.8943, + "step": 12719 + }, + { + "epoch": 0.8849003443598038, + "grad_norm": 1.1328125, + "learning_rate": 6.865269655606288e-05, + "loss": 0.8368, + "step": 12720 + }, + { + "epoch": 0.884969911996939, + "grad_norm": 1.2265625, + "learning_rate": 6.857066921930721e-05, + "loss": 0.7237, + "step": 12721 + }, + { + "epoch": 0.8850394796340743, + "grad_norm": 1.046875, + "learning_rate": 6.848868917583828e-05, + "loss": 0.8268, + "step": 12722 + }, + { + "epoch": 0.8851090472712094, + "grad_norm": 1.1484375, + "learning_rate": 6.840675642981864e-05, + "loss": 0.9917, + "step": 12723 + }, + { + "epoch": 0.8851786149083446, + "grad_norm": 1.1484375, + "learning_rate": 6.832487098540807e-05, + "loss": 0.7869, + "step": 12724 + }, + { + "epoch": 0.8852481825454799, + "grad_norm": 1.09375, + "learning_rate": 6.824303284676459e-05, + "loss": 0.7963, + "step": 12725 + }, + { + "epoch": 0.885317750182615, + "grad_norm": 1.0625, + "learning_rate": 6.816124201804364e-05, + "loss": 0.5991, + "step": 12726 + }, + { + "epoch": 0.8853873178197502, + "grad_norm": 1.3671875, + "learning_rate": 6.807949850339801e-05, + "loss": 0.8378, + "step": 12727 + }, + { + "epoch": 0.8854568854568855, + "grad_norm": 1.1015625, + "learning_rate": 6.799780230697816e-05, + "loss": 0.6915, + "step": 12728 + }, + { + "epoch": 0.8855264530940207, + "grad_norm": 1.0078125, + "learning_rate": 6.791615343293211e-05, + "loss": 0.8808, + "step": 12729 + }, + { + "epoch": 0.8855960207311558, + "grad_norm": 1.0078125, + "learning_rate": 6.783455188540599e-05, + "loss": 0.7991, + "step": 12730 + }, + { + "epoch": 0.8856655883682911, + "grad_norm": 1.0, + "learning_rate": 6.775299766854271e-05, + "loss": 0.8685, + "step": 12731 + }, + { + "epoch": 0.8857351560054263, + "grad_norm": 1.1796875, + "learning_rate": 6.767149078648348e-05, + "loss": 1.0224, + "step": 12732 + }, + { + "epoch": 0.8858047236425615, + "grad_norm": 1.0859375, + "learning_rate": 6.759003124336671e-05, + "loss": 0.695, + "step": 12733 + }, + { + "epoch": 0.8858742912796966, + "grad_norm": 1.5, + "learning_rate": 6.750861904332817e-05, + "loss": 1.0809, + "step": 12734 + }, + { + "epoch": 0.8859438589168319, + "grad_norm": 1.1640625, + "learning_rate": 6.7427254190502e-05, + "loss": 0.6709, + "step": 12735 + }, + { + "epoch": 0.8860134265539671, + "grad_norm": 1.09375, + "learning_rate": 6.734593668901945e-05, + "loss": 0.7581, + "step": 12736 + }, + { + "epoch": 0.8860829941911023, + "grad_norm": 1.1875, + "learning_rate": 6.726466654300922e-05, + "loss": 0.7828, + "step": 12737 + }, + { + "epoch": 0.8861525618282375, + "grad_norm": 1.1171875, + "learning_rate": 6.718344375659779e-05, + "loss": 0.7232, + "step": 12738 + }, + { + "epoch": 0.8862221294653727, + "grad_norm": 1.0390625, + "learning_rate": 6.710226833390942e-05, + "loss": 0.7902, + "step": 12739 + }, + { + "epoch": 0.8862916971025079, + "grad_norm": 1.0703125, + "learning_rate": 6.702114027906581e-05, + "loss": 0.8745, + "step": 12740 + }, + { + "epoch": 0.8863612647396432, + "grad_norm": 1.1640625, + "learning_rate": 6.694005959618609e-05, + "loss": 0.8363, + "step": 12741 + }, + { + "epoch": 0.8864308323767783, + "grad_norm": 1.4765625, + "learning_rate": 6.685902628938711e-05, + "loss": 0.8588, + "step": 12742 + }, + { + "epoch": 0.8865004000139135, + "grad_norm": 1.2421875, + "learning_rate": 6.677804036278334e-05, + "loss": 1.0298, + "step": 12743 + }, + { + "epoch": 0.8865699676510488, + "grad_norm": 1.2265625, + "learning_rate": 6.669710182048705e-05, + "loss": 0.7713, + "step": 12744 + }, + { + "epoch": 0.886639535288184, + "grad_norm": 0.8046875, + "learning_rate": 6.66162106666075e-05, + "loss": 0.6444, + "step": 12745 + }, + { + "epoch": 0.8867091029253191, + "grad_norm": 1.2265625, + "learning_rate": 6.653536690525241e-05, + "loss": 0.8326, + "step": 12746 + }, + { + "epoch": 0.8867786705624543, + "grad_norm": 0.984375, + "learning_rate": 6.645457054052639e-05, + "loss": 0.7872, + "step": 12747 + }, + { + "epoch": 0.8868482381995896, + "grad_norm": 0.89453125, + "learning_rate": 6.637382157653171e-05, + "loss": 0.7572, + "step": 12748 + }, + { + "epoch": 0.8869178058367247, + "grad_norm": 0.94140625, + "learning_rate": 6.629312001736853e-05, + "loss": 0.5386, + "step": 12749 + }, + { + "epoch": 0.8869873734738599, + "grad_norm": 1.0546875, + "learning_rate": 6.62124658671347e-05, + "loss": 0.9461, + "step": 12750 + }, + { + "epoch": 0.8870569411109952, + "grad_norm": 1.03125, + "learning_rate": 6.613185912992514e-05, + "loss": 0.843, + "step": 12751 + }, + { + "epoch": 0.8871265087481304, + "grad_norm": 1.0390625, + "learning_rate": 6.605129980983249e-05, + "loss": 0.9026, + "step": 12752 + }, + { + "epoch": 0.8871960763852655, + "grad_norm": 1.109375, + "learning_rate": 6.597078791094757e-05, + "loss": 0.8575, + "step": 12753 + }, + { + "epoch": 0.8872656440224008, + "grad_norm": 1.515625, + "learning_rate": 6.589032343735823e-05, + "loss": 0.8761, + "step": 12754 + }, + { + "epoch": 0.887335211659536, + "grad_norm": 1.21875, + "learning_rate": 6.580990639314998e-05, + "loss": 1.0263, + "step": 12755 + }, + { + "epoch": 0.8874047792966712, + "grad_norm": 1.046875, + "learning_rate": 6.57295367824059e-05, + "loss": 0.653, + "step": 12756 + }, + { + "epoch": 0.8874743469338064, + "grad_norm": 1.1328125, + "learning_rate": 6.564921460920692e-05, + "loss": 0.8159, + "step": 12757 + }, + { + "epoch": 0.8875439145709416, + "grad_norm": 1.15625, + "learning_rate": 6.556893987763146e-05, + "loss": 0.8262, + "step": 12758 + }, + { + "epoch": 0.8876134822080768, + "grad_norm": 1.0078125, + "learning_rate": 6.548871259175516e-05, + "loss": 0.7602, + "step": 12759 + }, + { + "epoch": 0.887683049845212, + "grad_norm": 1.21875, + "learning_rate": 6.540853275565195e-05, + "loss": 0.876, + "step": 12760 + }, + { + "epoch": 0.8877526174823472, + "grad_norm": 1.140625, + "learning_rate": 6.532840037339261e-05, + "loss": 0.8405, + "step": 12761 + }, + { + "epoch": 0.8878221851194824, + "grad_norm": 1.1171875, + "learning_rate": 6.524831544904609e-05, + "loss": 0.9209, + "step": 12762 + }, + { + "epoch": 0.8878917527566176, + "grad_norm": 1.1171875, + "learning_rate": 6.516827798667857e-05, + "loss": 0.6923, + "step": 12763 + }, + { + "epoch": 0.8879613203937529, + "grad_norm": 1.4296875, + "learning_rate": 6.508828799035404e-05, + "loss": 0.8913, + "step": 12764 + }, + { + "epoch": 0.888030888030888, + "grad_norm": 1.328125, + "learning_rate": 6.500834546413404e-05, + "loss": 0.9831, + "step": 12765 + }, + { + "epoch": 0.8881004556680232, + "grad_norm": 1.1328125, + "learning_rate": 6.492845041207707e-05, + "loss": 0.8282, + "step": 12766 + }, + { + "epoch": 0.8881700233051585, + "grad_norm": 1.65625, + "learning_rate": 6.484860283824079e-05, + "loss": 0.9055, + "step": 12767 + }, + { + "epoch": 0.8882395909422937, + "grad_norm": 1.3515625, + "learning_rate": 6.476880274667885e-05, + "loss": 0.8114, + "step": 12768 + }, + { + "epoch": 0.8883091585794288, + "grad_norm": 0.796875, + "learning_rate": 6.468905014144322e-05, + "loss": 0.6349, + "step": 12769 + }, + { + "epoch": 0.8883787262165641, + "grad_norm": 1.328125, + "learning_rate": 6.460934502658311e-05, + "loss": 1.0425, + "step": 12770 + }, + { + "epoch": 0.8884482938536993, + "grad_norm": 0.828125, + "learning_rate": 6.452968740614574e-05, + "loss": 0.7217, + "step": 12771 + }, + { + "epoch": 0.8885178614908344, + "grad_norm": 1.0234375, + "learning_rate": 6.445007728417596e-05, + "loss": 0.9206, + "step": 12772 + }, + { + "epoch": 0.8885874291279696, + "grad_norm": 1.15625, + "learning_rate": 6.437051466471567e-05, + "loss": 0.8422, + "step": 12773 + }, + { + "epoch": 0.8886569967651049, + "grad_norm": 1.15625, + "learning_rate": 6.429099955180451e-05, + "loss": 0.9084, + "step": 12774 + }, + { + "epoch": 0.8887265644022401, + "grad_norm": 1.203125, + "learning_rate": 6.421153194948015e-05, + "loss": 0.86, + "step": 12775 + }, + { + "epoch": 0.8887961320393752, + "grad_norm": 0.89453125, + "learning_rate": 6.413211186177759e-05, + "loss": 0.8385, + "step": 12776 + }, + { + "epoch": 0.8888656996765105, + "grad_norm": 1.0078125, + "learning_rate": 6.405273929272914e-05, + "loss": 0.8658, + "step": 12777 + }, + { + "epoch": 0.8889352673136457, + "grad_norm": 1.25, + "learning_rate": 6.397341424636527e-05, + "loss": 0.789, + "step": 12778 + }, + { + "epoch": 0.8890048349507809, + "grad_norm": 1.453125, + "learning_rate": 6.38941367267134e-05, + "loss": 0.9334, + "step": 12779 + }, + { + "epoch": 0.8890744025879161, + "grad_norm": 1.2734375, + "learning_rate": 6.381490673779888e-05, + "loss": 1.0498, + "step": 12780 + }, + { + "epoch": 0.8891439702250513, + "grad_norm": 1.0703125, + "learning_rate": 6.37357242836446e-05, + "loss": 0.787, + "step": 12781 + }, + { + "epoch": 0.8892135378621865, + "grad_norm": 1.2734375, + "learning_rate": 6.365658936827135e-05, + "loss": 0.8056, + "step": 12782 + }, + { + "epoch": 0.8892831054993218, + "grad_norm": 1.0546875, + "learning_rate": 6.35775019956969e-05, + "loss": 0.7835, + "step": 12783 + }, + { + "epoch": 0.8893526731364569, + "grad_norm": 1.2890625, + "learning_rate": 6.349846216993682e-05, + "loss": 0.6149, + "step": 12784 + }, + { + "epoch": 0.8894222407735921, + "grad_norm": 1.09375, + "learning_rate": 6.341946989500458e-05, + "loss": 0.8973, + "step": 12785 + }, + { + "epoch": 0.8894918084107273, + "grad_norm": 1.1328125, + "learning_rate": 6.334052517491107e-05, + "loss": 0.7343, + "step": 12786 + }, + { + "epoch": 0.8895613760478626, + "grad_norm": 1.265625, + "learning_rate": 6.326162801366453e-05, + "loss": 0.9022, + "step": 12787 + }, + { + "epoch": 0.8896309436849977, + "grad_norm": 1.1484375, + "learning_rate": 6.318277841527087e-05, + "loss": 0.9184, + "step": 12788 + }, + { + "epoch": 0.8897005113221329, + "grad_norm": 1.140625, + "learning_rate": 6.310397638373388e-05, + "loss": 0.9703, + "step": 12789 + }, + { + "epoch": 0.8897700789592682, + "grad_norm": 1.1328125, + "learning_rate": 6.302522192305471e-05, + "loss": 0.8512, + "step": 12790 + }, + { + "epoch": 0.8898396465964034, + "grad_norm": 1.2578125, + "learning_rate": 6.294651503723204e-05, + "loss": 0.7479, + "step": 12791 + }, + { + "epoch": 0.8899092142335385, + "grad_norm": 1.0390625, + "learning_rate": 6.286785573026232e-05, + "loss": 0.6878, + "step": 12792 + }, + { + "epoch": 0.8899787818706738, + "grad_norm": 0.9140625, + "learning_rate": 6.278924400613928e-05, + "loss": 0.4392, + "step": 12793 + }, + { + "epoch": 0.890048349507809, + "grad_norm": 1.2890625, + "learning_rate": 6.271067986885459e-05, + "loss": 0.8874, + "step": 12794 + }, + { + "epoch": 0.8901179171449441, + "grad_norm": 1.328125, + "learning_rate": 6.263216332239718e-05, + "loss": 0.9909, + "step": 12795 + }, + { + "epoch": 0.8901874847820794, + "grad_norm": 1.046875, + "learning_rate": 6.255369437075409e-05, + "loss": 0.7922, + "step": 12796 + }, + { + "epoch": 0.8902570524192146, + "grad_norm": 1.0234375, + "learning_rate": 6.247527301790922e-05, + "loss": 0.8277, + "step": 12797 + }, + { + "epoch": 0.8903266200563498, + "grad_norm": 1.171875, + "learning_rate": 6.23968992678443e-05, + "loss": 0.8435, + "step": 12798 + }, + { + "epoch": 0.8903961876934849, + "grad_norm": 1.078125, + "learning_rate": 6.231857312453903e-05, + "loss": 0.6398, + "step": 12799 + }, + { + "epoch": 0.8904657553306202, + "grad_norm": 0.88671875, + "learning_rate": 6.224029459197056e-05, + "loss": 0.6042, + "step": 12800 + }, + { + "epoch": 0.8905353229677554, + "grad_norm": 1.046875, + "learning_rate": 6.216206367411326e-05, + "loss": 0.7371, + "step": 12801 + }, + { + "epoch": 0.8906048906048906, + "grad_norm": 1.0859375, + "learning_rate": 6.208388037493906e-05, + "loss": 0.7232, + "step": 12802 + }, + { + "epoch": 0.8906744582420258, + "grad_norm": 1.0078125, + "learning_rate": 6.200574469841813e-05, + "loss": 0.8836, + "step": 12803 + }, + { + "epoch": 0.890744025879161, + "grad_norm": 1.21875, + "learning_rate": 6.192765664851763e-05, + "loss": 1.2466, + "step": 12804 + }, + { + "epoch": 0.8908135935162962, + "grad_norm": 0.91015625, + "learning_rate": 6.184961622920237e-05, + "loss": 0.766, + "step": 12805 + }, + { + "epoch": 0.8908831611534315, + "grad_norm": 1.390625, + "learning_rate": 6.177162344443521e-05, + "loss": 0.7032, + "step": 12806 + }, + { + "epoch": 0.8909527287905666, + "grad_norm": 1.3125, + "learning_rate": 6.169367829817573e-05, + "loss": 0.6213, + "step": 12807 + }, + { + "epoch": 0.8910222964277018, + "grad_norm": 1.1640625, + "learning_rate": 6.161578079438212e-05, + "loss": 0.815, + "step": 12808 + }, + { + "epoch": 0.8910918640648371, + "grad_norm": 0.87109375, + "learning_rate": 6.15379309370091e-05, + "loss": 0.6714, + "step": 12809 + }, + { + "epoch": 0.8911614317019723, + "grad_norm": 1.09375, + "learning_rate": 6.146012873000994e-05, + "loss": 0.9554, + "step": 12810 + }, + { + "epoch": 0.8912309993391074, + "grad_norm": 1.421875, + "learning_rate": 6.138237417733494e-05, + "loss": 0.9702, + "step": 12811 + }, + { + "epoch": 0.8913005669762426, + "grad_norm": 1.3046875, + "learning_rate": 6.130466728293161e-05, + "loss": 1.0276, + "step": 12812 + }, + { + "epoch": 0.8913701346133779, + "grad_norm": 1.2265625, + "learning_rate": 6.122700805074622e-05, + "loss": 0.9332, + "step": 12813 + }, + { + "epoch": 0.891439702250513, + "grad_norm": 1.21875, + "learning_rate": 6.114939648472151e-05, + "loss": 0.8333, + "step": 12814 + }, + { + "epoch": 0.8915092698876482, + "grad_norm": 1.1640625, + "learning_rate": 6.107183258879833e-05, + "loss": 0.9553, + "step": 12815 + }, + { + "epoch": 0.8915788375247835, + "grad_norm": 1.4296875, + "learning_rate": 6.099431636691488e-05, + "loss": 0.9614, + "step": 12816 + }, + { + "epoch": 0.8916484051619187, + "grad_norm": 1.109375, + "learning_rate": 6.0916847823006994e-05, + "loss": 0.6986, + "step": 12817 + }, + { + "epoch": 0.8917179727990538, + "grad_norm": 1.25, + "learning_rate": 6.083942696100842e-05, + "loss": 0.8352, + "step": 12818 + }, + { + "epoch": 0.8917875404361891, + "grad_norm": 1.46875, + "learning_rate": 6.076205378484989e-05, + "loss": 1.0811, + "step": 12819 + }, + { + "epoch": 0.8918571080733243, + "grad_norm": 1.21875, + "learning_rate": 6.068472829846039e-05, + "loss": 0.7868, + "step": 12820 + }, + { + "epoch": 0.8919266757104595, + "grad_norm": 0.9609375, + "learning_rate": 6.060745050576566e-05, + "loss": 0.7291, + "step": 12821 + }, + { + "epoch": 0.8919962433475948, + "grad_norm": 1.1015625, + "learning_rate": 6.0530220410689786e-05, + "loss": 0.9385, + "step": 12822 + }, + { + "epoch": 0.8920658109847299, + "grad_norm": 0.89453125, + "learning_rate": 6.045303801715396e-05, + "loss": 0.6088, + "step": 12823 + }, + { + "epoch": 0.8921353786218651, + "grad_norm": 0.9609375, + "learning_rate": 6.037590332907739e-05, + "loss": 0.679, + "step": 12824 + }, + { + "epoch": 0.8922049462590003, + "grad_norm": 0.9296875, + "learning_rate": 6.029881635037615e-05, + "loss": 0.628, + "step": 12825 + }, + { + "epoch": 0.8922745138961355, + "grad_norm": 0.96484375, + "learning_rate": 6.022177708496468e-05, + "loss": 0.7714, + "step": 12826 + }, + { + "epoch": 0.8923440815332707, + "grad_norm": 1.125, + "learning_rate": 6.014478553675462e-05, + "loss": 0.8259, + "step": 12827 + }, + { + "epoch": 0.8924136491704059, + "grad_norm": 0.84765625, + "learning_rate": 6.006784170965518e-05, + "loss": 0.7613, + "step": 12828 + }, + { + "epoch": 0.8924832168075412, + "grad_norm": 1.0078125, + "learning_rate": 5.999094560757301e-05, + "loss": 0.5784, + "step": 12829 + }, + { + "epoch": 0.8925527844446763, + "grad_norm": 1.3203125, + "learning_rate": 5.991409723441255e-05, + "loss": 0.6955, + "step": 12830 + }, + { + "epoch": 0.8926223520818115, + "grad_norm": 0.98828125, + "learning_rate": 5.983729659407589e-05, + "loss": 0.9643, + "step": 12831 + }, + { + "epoch": 0.8926919197189468, + "grad_norm": 1.1640625, + "learning_rate": 5.976054369046269e-05, + "loss": 0.8414, + "step": 12832 + }, + { + "epoch": 0.892761487356082, + "grad_norm": 1.25, + "learning_rate": 5.968383852746973e-05, + "loss": 0.825, + "step": 12833 + }, + { + "epoch": 0.8928310549932171, + "grad_norm": 1.015625, + "learning_rate": 5.9607181108991994e-05, + "loss": 0.9173, + "step": 12834 + }, + { + "epoch": 0.8929006226303524, + "grad_norm": 1.0859375, + "learning_rate": 5.95305714389216e-05, + "loss": 0.6716, + "step": 12835 + }, + { + "epoch": 0.8929701902674876, + "grad_norm": 1.015625, + "learning_rate": 5.945400952114866e-05, + "loss": 0.7928, + "step": 12836 + }, + { + "epoch": 0.8930397579046228, + "grad_norm": 1.265625, + "learning_rate": 5.9377495359560165e-05, + "loss": 0.9662, + "step": 12837 + }, + { + "epoch": 0.8931093255417579, + "grad_norm": 1.375, + "learning_rate": 5.930102895804157e-05, + "loss": 0.9403, + "step": 12838 + }, + { + "epoch": 0.8931788931788932, + "grad_norm": 1.4375, + "learning_rate": 5.92246103204751e-05, + "loss": 0.7703, + "step": 12839 + }, + { + "epoch": 0.8932484608160284, + "grad_norm": 1.25, + "learning_rate": 5.914823945074099e-05, + "loss": 0.6388, + "step": 12840 + }, + { + "epoch": 0.8933180284531635, + "grad_norm": 1.015625, + "learning_rate": 5.907191635271725e-05, + "loss": 0.8618, + "step": 12841 + }, + { + "epoch": 0.8933875960902988, + "grad_norm": 0.91796875, + "learning_rate": 5.899564103027899e-05, + "loss": 0.6604, + "step": 12842 + }, + { + "epoch": 0.893457163727434, + "grad_norm": 1.046875, + "learning_rate": 5.891941348729901e-05, + "loss": 0.7512, + "step": 12843 + }, + { + "epoch": 0.8935267313645692, + "grad_norm": 1.09375, + "learning_rate": 5.884323372764755e-05, + "loss": 0.853, + "step": 12844 + }, + { + "epoch": 0.8935962990017045, + "grad_norm": 1.078125, + "learning_rate": 5.8767101755193174e-05, + "loss": 0.6933, + "step": 12845 + }, + { + "epoch": 0.8936658666388396, + "grad_norm": 0.86328125, + "learning_rate": 5.8691017573801244e-05, + "loss": 0.6461, + "step": 12846 + }, + { + "epoch": 0.8937354342759748, + "grad_norm": 1.078125, + "learning_rate": 5.8614981187334884e-05, + "loss": 0.8107, + "step": 12847 + }, + { + "epoch": 0.8938050019131101, + "grad_norm": 1.125, + "learning_rate": 5.853899259965467e-05, + "loss": 0.7591, + "step": 12848 + }, + { + "epoch": 0.8938745695502452, + "grad_norm": 0.87109375, + "learning_rate": 5.846305181461908e-05, + "loss": 0.64, + "step": 12849 + }, + { + "epoch": 0.8939441371873804, + "grad_norm": 1.0, + "learning_rate": 5.8387158836084254e-05, + "loss": 0.5869, + "step": 12850 + }, + { + "epoch": 0.8940137048245156, + "grad_norm": 0.94921875, + "learning_rate": 5.8311313667903206e-05, + "loss": 0.6576, + "step": 12851 + }, + { + "epoch": 0.8940832724616509, + "grad_norm": 1.015625, + "learning_rate": 5.8235516313927316e-05, + "loss": 0.7791, + "step": 12852 + }, + { + "epoch": 0.894152840098786, + "grad_norm": 1.2734375, + "learning_rate": 5.815976677800505e-05, + "loss": 0.9427, + "step": 12853 + }, + { + "epoch": 0.8942224077359212, + "grad_norm": 1.375, + "learning_rate": 5.808406506398256e-05, + "loss": 0.7885, + "step": 12854 + }, + { + "epoch": 0.8942919753730565, + "grad_norm": 1.1640625, + "learning_rate": 5.800841117570366e-05, + "loss": 0.7102, + "step": 12855 + }, + { + "epoch": 0.8943615430101917, + "grad_norm": 1.1328125, + "learning_rate": 5.793280511700971e-05, + "loss": 0.7416, + "step": 12856 + }, + { + "epoch": 0.8944311106473268, + "grad_norm": 1.1171875, + "learning_rate": 5.7857246891739324e-05, + "loss": 0.8422, + "step": 12857 + }, + { + "epoch": 0.8945006782844621, + "grad_norm": 1.21875, + "learning_rate": 5.778173650372931e-05, + "loss": 0.5853, + "step": 12858 + }, + { + "epoch": 0.8945702459215973, + "grad_norm": 1.9140625, + "learning_rate": 5.7706273956813716e-05, + "loss": 1.1291, + "step": 12859 + }, + { + "epoch": 0.8946398135587325, + "grad_norm": 1.03125, + "learning_rate": 5.763085925482403e-05, + "loss": 0.9255, + "step": 12860 + }, + { + "epoch": 0.8947093811958677, + "grad_norm": 0.890625, + "learning_rate": 5.7555492401589304e-05, + "loss": 0.7343, + "step": 12861 + }, + { + "epoch": 0.8947789488330029, + "grad_norm": 0.9765625, + "learning_rate": 5.748017340093636e-05, + "loss": 0.8176, + "step": 12862 + }, + { + "epoch": 0.8948485164701381, + "grad_norm": 1.1171875, + "learning_rate": 5.7404902256689596e-05, + "loss": 0.834, + "step": 12863 + }, + { + "epoch": 0.8949180841072732, + "grad_norm": 1.2265625, + "learning_rate": 5.732967897267094e-05, + "loss": 0.6891, + "step": 12864 + }, + { + "epoch": 0.8949876517444085, + "grad_norm": 1.09375, + "learning_rate": 5.725450355269957e-05, + "loss": 0.6077, + "step": 12865 + }, + { + "epoch": 0.8950572193815437, + "grad_norm": 1.2734375, + "learning_rate": 5.7179376000592975e-05, + "loss": 0.9541, + "step": 12866 + }, + { + "epoch": 0.8951267870186789, + "grad_norm": 1.0546875, + "learning_rate": 5.710429632016534e-05, + "loss": 0.7923, + "step": 12867 + }, + { + "epoch": 0.8951963546558142, + "grad_norm": 0.984375, + "learning_rate": 5.702926451522905e-05, + "loss": 0.4878, + "step": 12868 + }, + { + "epoch": 0.8952659222929493, + "grad_norm": 1.1796875, + "learning_rate": 5.695428058959373e-05, + "loss": 1.0191, + "step": 12869 + }, + { + "epoch": 0.8953354899300845, + "grad_norm": 1.0390625, + "learning_rate": 5.687934454706689e-05, + "loss": 0.7994, + "step": 12870 + }, + { + "epoch": 0.8954050575672198, + "grad_norm": 1.3984375, + "learning_rate": 5.680445639145304e-05, + "loss": 0.8081, + "step": 12871 + }, + { + "epoch": 0.895474625204355, + "grad_norm": 1.5859375, + "learning_rate": 5.67296161265548e-05, + "loss": 0.9523, + "step": 12872 + }, + { + "epoch": 0.8955441928414901, + "grad_norm": 1.3203125, + "learning_rate": 5.665482375617248e-05, + "loss": 0.9154, + "step": 12873 + }, + { + "epoch": 0.8956137604786254, + "grad_norm": 1.1796875, + "learning_rate": 5.658007928410336e-05, + "loss": 0.7976, + "step": 12874 + }, + { + "epoch": 0.8956833281157606, + "grad_norm": 1.390625, + "learning_rate": 5.6505382714142626e-05, + "loss": 0.9391, + "step": 12875 + }, + { + "epoch": 0.8957528957528957, + "grad_norm": 1.0234375, + "learning_rate": 5.64307340500827e-05, + "loss": 0.9901, + "step": 12876 + }, + { + "epoch": 0.8958224633900309, + "grad_norm": 1.0078125, + "learning_rate": 5.6356133295714426e-05, + "loss": 0.759, + "step": 12877 + }, + { + "epoch": 0.8958920310271662, + "grad_norm": 1.765625, + "learning_rate": 5.6281580454825344e-05, + "loss": 0.9905, + "step": 12878 + }, + { + "epoch": 0.8959615986643014, + "grad_norm": 1.0703125, + "learning_rate": 5.620707553120086e-05, + "loss": 0.7871, + "step": 12879 + }, + { + "epoch": 0.8960311663014365, + "grad_norm": 0.94921875, + "learning_rate": 5.6132618528624055e-05, + "loss": 0.6924, + "step": 12880 + }, + { + "epoch": 0.8961007339385718, + "grad_norm": 1.046875, + "learning_rate": 5.605820945087536e-05, + "loss": 0.8007, + "step": 12881 + }, + { + "epoch": 0.896170301575707, + "grad_norm": 0.95703125, + "learning_rate": 5.598384830173309e-05, + "loss": 0.7307, + "step": 12882 + }, + { + "epoch": 0.8962398692128422, + "grad_norm": 0.91796875, + "learning_rate": 5.590953508497276e-05, + "loss": 0.8733, + "step": 12883 + }, + { + "epoch": 0.8963094368499774, + "grad_norm": 1.09375, + "learning_rate": 5.583526980436771e-05, + "loss": 0.9888, + "step": 12884 + }, + { + "epoch": 0.8963790044871126, + "grad_norm": 1.015625, + "learning_rate": 5.576105246368857e-05, + "loss": 0.7823, + "step": 12885 + }, + { + "epoch": 0.8964485721242478, + "grad_norm": 1.2578125, + "learning_rate": 5.568688306670389e-05, + "loss": 0.8014, + "step": 12886 + }, + { + "epoch": 0.8965181397613831, + "grad_norm": 1.2265625, + "learning_rate": 5.5612761617179766e-05, + "loss": 0.7285, + "step": 12887 + }, + { + "epoch": 0.8965877073985182, + "grad_norm": 1.6171875, + "learning_rate": 5.553868811887952e-05, + "loss": 0.9137, + "step": 12888 + }, + { + "epoch": 0.8966572750356534, + "grad_norm": 0.93359375, + "learning_rate": 5.546466257556415e-05, + "loss": 0.7395, + "step": 12889 + }, + { + "epoch": 0.8967268426727886, + "grad_norm": 1.0546875, + "learning_rate": 5.539068499099231e-05, + "loss": 0.9236, + "step": 12890 + }, + { + "epoch": 0.8967964103099239, + "grad_norm": 1.0703125, + "learning_rate": 5.5316755368920554e-05, + "loss": 0.7547, + "step": 12891 + }, + { + "epoch": 0.896865977947059, + "grad_norm": 1.0078125, + "learning_rate": 5.5242873713102326e-05, + "loss": 0.8267, + "step": 12892 + }, + { + "epoch": 0.8969355455841942, + "grad_norm": 1.515625, + "learning_rate": 5.516904002728895e-05, + "loss": 0.8644, + "step": 12893 + }, + { + "epoch": 0.8970051132213295, + "grad_norm": 1.125, + "learning_rate": 5.509525431522955e-05, + "loss": 0.7402, + "step": 12894 + }, + { + "epoch": 0.8970746808584646, + "grad_norm": 1.0390625, + "learning_rate": 5.502151658067034e-05, + "loss": 0.8537, + "step": 12895 + }, + { + "epoch": 0.8971442484955998, + "grad_norm": 1.203125, + "learning_rate": 5.494782682735555e-05, + "loss": 0.6948, + "step": 12896 + }, + { + "epoch": 0.8972138161327351, + "grad_norm": 1.0703125, + "learning_rate": 5.487418505902664e-05, + "loss": 0.8975, + "step": 12897 + }, + { + "epoch": 0.8972833837698703, + "grad_norm": 1.2578125, + "learning_rate": 5.480059127942283e-05, + "loss": 0.8004, + "step": 12898 + }, + { + "epoch": 0.8973529514070054, + "grad_norm": 1.0390625, + "learning_rate": 5.47270454922808e-05, + "loss": 0.7677, + "step": 12899 + }, + { + "epoch": 0.8974225190441407, + "grad_norm": 1.1015625, + "learning_rate": 5.465354770133491e-05, + "loss": 1.0044, + "step": 12900 + }, + { + "epoch": 0.8974920866812759, + "grad_norm": 0.984375, + "learning_rate": 5.4580097910317036e-05, + "loss": 0.6903, + "step": 12901 + }, + { + "epoch": 0.8975616543184111, + "grad_norm": 1.0390625, + "learning_rate": 5.4506696122956556e-05, + "loss": 0.9146, + "step": 12902 + }, + { + "epoch": 0.8976312219555462, + "grad_norm": 1.609375, + "learning_rate": 5.443334234298025e-05, + "loss": 0.8403, + "step": 12903 + }, + { + "epoch": 0.8977007895926815, + "grad_norm": 1.28125, + "learning_rate": 5.436003657411281e-05, + "loss": 0.7782, + "step": 12904 + }, + { + "epoch": 0.8977703572298167, + "grad_norm": 1.09375, + "learning_rate": 5.4286778820076486e-05, + "loss": 0.9536, + "step": 12905 + }, + { + "epoch": 0.8978399248669519, + "grad_norm": 1.421875, + "learning_rate": 5.421356908459074e-05, + "loss": 0.9864, + "step": 12906 + }, + { + "epoch": 0.8979094925040871, + "grad_norm": 1.0546875, + "learning_rate": 5.414040737137271e-05, + "loss": 0.636, + "step": 12907 + }, + { + "epoch": 0.8979790601412223, + "grad_norm": 1.421875, + "learning_rate": 5.406729368413743e-05, + "loss": 0.9173, + "step": 12908 + }, + { + "epoch": 0.8980486277783575, + "grad_norm": 1.3203125, + "learning_rate": 5.399422802659715e-05, + "loss": 0.8698, + "step": 12909 + }, + { + "epoch": 0.8981181954154928, + "grad_norm": 0.9296875, + "learning_rate": 5.3921210402461785e-05, + "loss": 0.6691, + "step": 12910 + }, + { + "epoch": 0.8981877630526279, + "grad_norm": 1.015625, + "learning_rate": 5.38482408154386e-05, + "loss": 0.7015, + "step": 12911 + }, + { + "epoch": 0.8982573306897631, + "grad_norm": 1.6484375, + "learning_rate": 5.377531926923285e-05, + "loss": 0.8865, + "step": 12912 + }, + { + "epoch": 0.8983268983268984, + "grad_norm": 1.1171875, + "learning_rate": 5.3702445767547015e-05, + "loss": 0.9282, + "step": 12913 + }, + { + "epoch": 0.8983964659640336, + "grad_norm": 1.25, + "learning_rate": 5.362962031408136e-05, + "loss": 0.925, + "step": 12914 + }, + { + "epoch": 0.8984660336011687, + "grad_norm": 1.296875, + "learning_rate": 5.35568429125336e-05, + "loss": 0.8217, + "step": 12915 + }, + { + "epoch": 0.8985356012383039, + "grad_norm": 1.1171875, + "learning_rate": 5.348411356659888e-05, + "loss": 0.897, + "step": 12916 + }, + { + "epoch": 0.8986051688754392, + "grad_norm": 1.4453125, + "learning_rate": 5.341143227996992e-05, + "loss": 0.8641, + "step": 12917 + }, + { + "epoch": 0.8986747365125743, + "grad_norm": 1.0859375, + "learning_rate": 5.3338799056337316e-05, + "loss": 0.7648, + "step": 12918 + }, + { + "epoch": 0.8987443041497095, + "grad_norm": 1.03125, + "learning_rate": 5.326621389938913e-05, + "loss": 0.7663, + "step": 12919 + }, + { + "epoch": 0.8988138717868448, + "grad_norm": 1.0234375, + "learning_rate": 5.319367681281073e-05, + "loss": 1.0038, + "step": 12920 + }, + { + "epoch": 0.89888343942398, + "grad_norm": 1.046875, + "learning_rate": 5.312118780028496e-05, + "loss": 0.8561, + "step": 12921 + }, + { + "epoch": 0.8989530070611151, + "grad_norm": 1.078125, + "learning_rate": 5.304874686549277e-05, + "loss": 0.8276, + "step": 12922 + }, + { + "epoch": 0.8990225746982504, + "grad_norm": 0.9609375, + "learning_rate": 5.29763540121122e-05, + "loss": 0.7428, + "step": 12923 + }, + { + "epoch": 0.8990921423353856, + "grad_norm": 1.2421875, + "learning_rate": 5.290400924381911e-05, + "loss": 0.7342, + "step": 12924 + }, + { + "epoch": 0.8991617099725208, + "grad_norm": 1.1640625, + "learning_rate": 5.2831712564286536e-05, + "loss": 1.0198, + "step": 12925 + }, + { + "epoch": 0.899231277609656, + "grad_norm": 0.96484375, + "learning_rate": 5.275946397718578e-05, + "loss": 0.9774, + "step": 12926 + }, + { + "epoch": 0.8993008452467912, + "grad_norm": 1.390625, + "learning_rate": 5.2687263486184686e-05, + "loss": 0.9127, + "step": 12927 + }, + { + "epoch": 0.8993704128839264, + "grad_norm": 0.8046875, + "learning_rate": 5.2615111094949765e-05, + "loss": 0.6506, + "step": 12928 + }, + { + "epoch": 0.8994399805210616, + "grad_norm": 1.0859375, + "learning_rate": 5.254300680714419e-05, + "loss": 0.9133, + "step": 12929 + }, + { + "epoch": 0.8995095481581968, + "grad_norm": 0.9921875, + "learning_rate": 5.247095062642937e-05, + "loss": 0.8668, + "step": 12930 + }, + { + "epoch": 0.899579115795332, + "grad_norm": 1.171875, + "learning_rate": 5.23989425564636e-05, + "loss": 0.7779, + "step": 12931 + }, + { + "epoch": 0.8996486834324672, + "grad_norm": 1.1484375, + "learning_rate": 5.2326982600903184e-05, + "loss": 0.8159, + "step": 12932 + }, + { + "epoch": 0.8997182510696025, + "grad_norm": 0.86328125, + "learning_rate": 5.225507076340219e-05, + "loss": 0.6071, + "step": 12933 + }, + { + "epoch": 0.8997878187067376, + "grad_norm": 1.0390625, + "learning_rate": 5.21832070476117e-05, + "loss": 0.6333, + "step": 12934 + }, + { + "epoch": 0.8998573863438728, + "grad_norm": 1.4453125, + "learning_rate": 5.211139145718047e-05, + "loss": 0.9271, + "step": 12935 + }, + { + "epoch": 0.8999269539810081, + "grad_norm": 0.81640625, + "learning_rate": 5.2039623995755126e-05, + "loss": 0.6386, + "step": 12936 + }, + { + "epoch": 0.8999965216181433, + "grad_norm": 1.0390625, + "learning_rate": 5.196790466697965e-05, + "loss": 0.7896, + "step": 12937 + }, + { + "epoch": 0.9000660892552784, + "grad_norm": 1.484375, + "learning_rate": 5.189623347449557e-05, + "loss": 0.9823, + "step": 12938 + }, + { + "epoch": 0.9001356568924136, + "grad_norm": 1.015625, + "learning_rate": 5.182461042194175e-05, + "loss": 0.727, + "step": 12939 + }, + { + "epoch": 0.9002052245295489, + "grad_norm": 1.1640625, + "learning_rate": 5.1753035512955184e-05, + "loss": 0.9766, + "step": 12940 + }, + { + "epoch": 0.900274792166684, + "grad_norm": 1.3125, + "learning_rate": 5.168150875117006e-05, + "loss": 0.7615, + "step": 12941 + }, + { + "epoch": 0.9003443598038192, + "grad_norm": 0.8984375, + "learning_rate": 5.161003014021792e-05, + "loss": 0.7439, + "step": 12942 + }, + { + "epoch": 0.9004139274409545, + "grad_norm": 1.3828125, + "learning_rate": 5.1538599683728206e-05, + "loss": 0.8239, + "step": 12943 + }, + { + "epoch": 0.9004834950780897, + "grad_norm": 1.140625, + "learning_rate": 5.146721738532789e-05, + "loss": 0.6905, + "step": 12944 + }, + { + "epoch": 0.9005530627152248, + "grad_norm": 1.2734375, + "learning_rate": 5.1395883248641196e-05, + "loss": 0.7003, + "step": 12945 + }, + { + "epoch": 0.9006226303523601, + "grad_norm": 1.0625, + "learning_rate": 5.132459727729022e-05, + "loss": 1.021, + "step": 12946 + }, + { + "epoch": 0.9006921979894953, + "grad_norm": 1.15625, + "learning_rate": 5.125335947489462e-05, + "loss": 0.7083, + "step": 12947 + }, + { + "epoch": 0.9007617656266305, + "grad_norm": 1.015625, + "learning_rate": 5.118216984507151e-05, + "loss": 1.0288, + "step": 12948 + }, + { + "epoch": 0.9008313332637657, + "grad_norm": 1.4375, + "learning_rate": 5.111102839143511e-05, + "loss": 0.9, + "step": 12949 + }, + { + "epoch": 0.9009009009009009, + "grad_norm": 0.90625, + "learning_rate": 5.103993511759808e-05, + "loss": 0.6472, + "step": 12950 + }, + { + "epoch": 0.9009704685380361, + "grad_norm": 1.359375, + "learning_rate": 5.096889002717009e-05, + "loss": 0.9621, + "step": 12951 + }, + { + "epoch": 0.9010400361751713, + "grad_norm": 1.0546875, + "learning_rate": 5.0897893123758365e-05, + "loss": 0.7863, + "step": 12952 + }, + { + "epoch": 0.9011096038123065, + "grad_norm": 1.28125, + "learning_rate": 5.08269444109678e-05, + "loss": 0.9322, + "step": 12953 + }, + { + "epoch": 0.9011791714494417, + "grad_norm": 1.2890625, + "learning_rate": 5.0756043892400626e-05, + "loss": 0.8211, + "step": 12954 + }, + { + "epoch": 0.9012487390865769, + "grad_norm": 1.0625, + "learning_rate": 5.0685191571657294e-05, + "loss": 0.9078, + "step": 12955 + }, + { + "epoch": 0.9013183067237122, + "grad_norm": 0.98828125, + "learning_rate": 5.061438745233493e-05, + "loss": 0.6872, + "step": 12956 + }, + { + "epoch": 0.9013878743608473, + "grad_norm": 1.125, + "learning_rate": 5.054363153802865e-05, + "loss": 0.8901, + "step": 12957 + }, + { + "epoch": 0.9014574419979825, + "grad_norm": 1.046875, + "learning_rate": 5.0472923832331266e-05, + "loss": 0.8544, + "step": 12958 + }, + { + "epoch": 0.9015270096351178, + "grad_norm": 1.125, + "learning_rate": 5.040226433883266e-05, + "loss": 0.9961, + "step": 12959 + }, + { + "epoch": 0.901596577272253, + "grad_norm": 1.0234375, + "learning_rate": 5.0331653061120755e-05, + "loss": 0.7943, + "step": 12960 + }, + { + "epoch": 0.9016661449093881, + "grad_norm": 1.15625, + "learning_rate": 5.0261090002781004e-05, + "loss": 0.716, + "step": 12961 + }, + { + "epoch": 0.9017357125465234, + "grad_norm": 0.77734375, + "learning_rate": 5.0190575167396e-05, + "loss": 0.7278, + "step": 12962 + }, + { + "epoch": 0.9018052801836586, + "grad_norm": 1.1015625, + "learning_rate": 5.01201085585461e-05, + "loss": 0.8286, + "step": 12963 + }, + { + "epoch": 0.9018748478207937, + "grad_norm": 1.015625, + "learning_rate": 5.0049690179809315e-05, + "loss": 0.6693, + "step": 12964 + }, + { + "epoch": 0.9019444154579289, + "grad_norm": 0.984375, + "learning_rate": 4.997932003476124e-05, + "loss": 0.7188, + "step": 12965 + }, + { + "epoch": 0.9020139830950642, + "grad_norm": 1.0859375, + "learning_rate": 4.9908998126974915e-05, + "loss": 0.7914, + "step": 12966 + }, + { + "epoch": 0.9020835507321994, + "grad_norm": 0.87109375, + "learning_rate": 4.9838724460020693e-05, + "loss": 0.7081, + "step": 12967 + }, + { + "epoch": 0.9021531183693345, + "grad_norm": 0.96875, + "learning_rate": 4.9768499037466944e-05, + "loss": 0.7018, + "step": 12968 + }, + { + "epoch": 0.9022226860064698, + "grad_norm": 1.0546875, + "learning_rate": 4.969832186287937e-05, + "loss": 0.7203, + "step": 12969 + }, + { + "epoch": 0.902292253643605, + "grad_norm": 0.9765625, + "learning_rate": 4.962819293982113e-05, + "loss": 0.8163, + "step": 12970 + }, + { + "epoch": 0.9023618212807402, + "grad_norm": 1.015625, + "learning_rate": 4.9558112271852916e-05, + "loss": 0.7653, + "step": 12971 + }, + { + "epoch": 0.9024313889178754, + "grad_norm": 1.09375, + "learning_rate": 4.948807986253323e-05, + "loss": 0.8682, + "step": 12972 + }, + { + "epoch": 0.9025009565550106, + "grad_norm": 1.3515625, + "learning_rate": 4.9418095715417885e-05, + "loss": 0.6972, + "step": 12973 + }, + { + "epoch": 0.9025705241921458, + "grad_norm": 1.5390625, + "learning_rate": 4.934815983406027e-05, + "loss": 0.7371, + "step": 12974 + }, + { + "epoch": 0.9026400918292811, + "grad_norm": 1.53125, + "learning_rate": 4.927827222201165e-05, + "loss": 0.9693, + "step": 12975 + }, + { + "epoch": 0.9027096594664162, + "grad_norm": 1.296875, + "learning_rate": 4.9208432882820396e-05, + "loss": 0.691, + "step": 12976 + }, + { + "epoch": 0.9027792271035514, + "grad_norm": 1.03125, + "learning_rate": 4.913864182003236e-05, + "loss": 0.7755, + "step": 12977 + }, + { + "epoch": 0.9028487947406866, + "grad_norm": 1.3515625, + "learning_rate": 4.9068899037191364e-05, + "loss": 0.7264, + "step": 12978 + }, + { + "epoch": 0.9029183623778219, + "grad_norm": 1.1171875, + "learning_rate": 4.8999204537838906e-05, + "loss": 0.7654, + "step": 12979 + }, + { + "epoch": 0.902987930014957, + "grad_norm": 1.0078125, + "learning_rate": 4.892955832551338e-05, + "loss": 0.7029, + "step": 12980 + }, + { + "epoch": 0.9030574976520922, + "grad_norm": 1.140625, + "learning_rate": 4.885996040375096e-05, + "loss": 1.0747, + "step": 12981 + }, + { + "epoch": 0.9031270652892275, + "grad_norm": 1.2421875, + "learning_rate": 4.8790410776085705e-05, + "loss": 0.8239, + "step": 12982 + }, + { + "epoch": 0.9031966329263627, + "grad_norm": 1.0234375, + "learning_rate": 4.872090944604901e-05, + "loss": 0.8294, + "step": 12983 + }, + { + "epoch": 0.9032662005634978, + "grad_norm": 1.390625, + "learning_rate": 4.865145641716972e-05, + "loss": 1.057, + "step": 12984 + }, + { + "epoch": 0.9033357682006331, + "grad_norm": 1.109375, + "learning_rate": 4.858205169297425e-05, + "loss": 0.7924, + "step": 12985 + }, + { + "epoch": 0.9034053358377683, + "grad_norm": 1.2734375, + "learning_rate": 4.851269527698665e-05, + "loss": 0.6779, + "step": 12986 + }, + { + "epoch": 0.9034749034749034, + "grad_norm": 1.3515625, + "learning_rate": 4.8443387172728784e-05, + "loss": 0.7835, + "step": 12987 + }, + { + "epoch": 0.9035444711120387, + "grad_norm": 1.3671875, + "learning_rate": 4.837412738371927e-05, + "loss": 1.1405, + "step": 12988 + }, + { + "epoch": 0.9036140387491739, + "grad_norm": 1.03125, + "learning_rate": 4.830491591347519e-05, + "loss": 0.7007, + "step": 12989 + }, + { + "epoch": 0.9036836063863091, + "grad_norm": 1.1328125, + "learning_rate": 4.823575276551051e-05, + "loss": 0.8549, + "step": 12990 + }, + { + "epoch": 0.9037531740234442, + "grad_norm": 1.0078125, + "learning_rate": 4.816663794333698e-05, + "loss": 0.673, + "step": 12991 + }, + { + "epoch": 0.9038227416605795, + "grad_norm": 1.015625, + "learning_rate": 4.8097571450464006e-05, + "loss": 0.9113, + "step": 12992 + }, + { + "epoch": 0.9038923092977147, + "grad_norm": 1.21875, + "learning_rate": 4.802855329039846e-05, + "loss": 0.6998, + "step": 12993 + }, + { + "epoch": 0.9039618769348499, + "grad_norm": 1.0859375, + "learning_rate": 4.795958346664475e-05, + "loss": 0.7755, + "step": 12994 + }, + { + "epoch": 0.9040314445719851, + "grad_norm": 1.6484375, + "learning_rate": 4.789066198270464e-05, + "loss": 0.6506, + "step": 12995 + }, + { + "epoch": 0.9041010122091203, + "grad_norm": 1.1328125, + "learning_rate": 4.782178884207766e-05, + "loss": 0.8783, + "step": 12996 + }, + { + "epoch": 0.9041705798462555, + "grad_norm": 1.203125, + "learning_rate": 4.775296404826113e-05, + "loss": 0.8482, + "step": 12997 + }, + { + "epoch": 0.9042401474833908, + "grad_norm": 1.046875, + "learning_rate": 4.768418760474935e-05, + "loss": 0.721, + "step": 12998 + }, + { + "epoch": 0.9043097151205259, + "grad_norm": 0.96875, + "learning_rate": 4.761545951503432e-05, + "loss": 0.9024, + "step": 12999 + }, + { + "epoch": 0.9043792827576611, + "grad_norm": 0.90234375, + "learning_rate": 4.7546779782605906e-05, + "loss": 0.7128, + "step": 13000 + }, + { + "epoch": 0.9044488503947964, + "grad_norm": 0.8671875, + "learning_rate": 4.7478148410951546e-05, + "loss": 0.7462, + "step": 13001 + }, + { + "epoch": 0.9045184180319316, + "grad_norm": 1.265625, + "learning_rate": 4.7409565403555456e-05, + "loss": 0.6865, + "step": 13002 + }, + { + "epoch": 0.9045879856690667, + "grad_norm": 1.1484375, + "learning_rate": 4.734103076390039e-05, + "loss": 0.8292, + "step": 13003 + }, + { + "epoch": 0.9046575533062019, + "grad_norm": 1.6171875, + "learning_rate": 4.727254449546614e-05, + "loss": 0.6711, + "step": 13004 + }, + { + "epoch": 0.9047271209433372, + "grad_norm": 0.8984375, + "learning_rate": 4.720410660172969e-05, + "loss": 0.878, + "step": 13005 + }, + { + "epoch": 0.9047966885804724, + "grad_norm": 1.1796875, + "learning_rate": 4.7135717086166375e-05, + "loss": 0.882, + "step": 13006 + }, + { + "epoch": 0.9048662562176075, + "grad_norm": 1.21875, + "learning_rate": 4.7067375952248637e-05, + "loss": 0.848, + "step": 13007 + }, + { + "epoch": 0.9049358238547428, + "grad_norm": 1.4453125, + "learning_rate": 4.6999083203446366e-05, + "loss": 0.9239, + "step": 13008 + }, + { + "epoch": 0.905005391491878, + "grad_norm": 0.96875, + "learning_rate": 4.693083884322713e-05, + "loss": 0.6508, + "step": 13009 + }, + { + "epoch": 0.9050749591290131, + "grad_norm": 1.0859375, + "learning_rate": 4.686264287505604e-05, + "loss": 0.7265, + "step": 13010 + }, + { + "epoch": 0.9051445267661484, + "grad_norm": 1.234375, + "learning_rate": 4.679449530239588e-05, + "loss": 0.8146, + "step": 13011 + }, + { + "epoch": 0.9052140944032836, + "grad_norm": 1.171875, + "learning_rate": 4.6726396128706774e-05, + "loss": 0.8923, + "step": 13012 + }, + { + "epoch": 0.9052836620404188, + "grad_norm": 1.734375, + "learning_rate": 4.665834535744617e-05, + "loss": 0.8934, + "step": 13013 + }, + { + "epoch": 0.905353229677554, + "grad_norm": 0.8671875, + "learning_rate": 4.659034299206977e-05, + "loss": 0.6589, + "step": 13014 + }, + { + "epoch": 0.9054227973146892, + "grad_norm": 1.1796875, + "learning_rate": 4.652238903603023e-05, + "loss": 0.9328, + "step": 13015 + }, + { + "epoch": 0.9054923649518244, + "grad_norm": 1.171875, + "learning_rate": 4.6454483492777925e-05, + "loss": 0.9544, + "step": 13016 + }, + { + "epoch": 0.9055619325889596, + "grad_norm": 1.0703125, + "learning_rate": 4.638662636576052e-05, + "loss": 0.9136, + "step": 13017 + }, + { + "epoch": 0.9056315002260948, + "grad_norm": 1.125, + "learning_rate": 4.6318817658423715e-05, + "loss": 0.7097, + "step": 13018 + }, + { + "epoch": 0.90570106786323, + "grad_norm": 1.09375, + "learning_rate": 4.625105737421065e-05, + "loss": 0.8913, + "step": 13019 + }, + { + "epoch": 0.9057706355003652, + "grad_norm": 0.89453125, + "learning_rate": 4.618334551656145e-05, + "loss": 0.7505, + "step": 13020 + }, + { + "epoch": 0.9058402031375005, + "grad_norm": 0.859375, + "learning_rate": 4.611568208891448e-05, + "loss": 0.7989, + "step": 13021 + }, + { + "epoch": 0.9059097707746356, + "grad_norm": 0.9609375, + "learning_rate": 4.6048067094705216e-05, + "loss": 0.68, + "step": 13022 + }, + { + "epoch": 0.9059793384117708, + "grad_norm": 1.296875, + "learning_rate": 4.59805005373668e-05, + "loss": 0.8832, + "step": 13023 + }, + { + "epoch": 0.9060489060489061, + "grad_norm": 1.3671875, + "learning_rate": 4.591298242032982e-05, + "loss": 0.9539, + "step": 13024 + }, + { + "epoch": 0.9061184736860413, + "grad_norm": 0.9609375, + "learning_rate": 4.5845512747022865e-05, + "loss": 0.6243, + "step": 13025 + }, + { + "epoch": 0.9061880413231764, + "grad_norm": 0.94921875, + "learning_rate": 4.577809152087142e-05, + "loss": 0.7258, + "step": 13026 + }, + { + "epoch": 0.9062576089603117, + "grad_norm": 1.1328125, + "learning_rate": 4.571071874529886e-05, + "loss": 0.6775, + "step": 13027 + }, + { + "epoch": 0.9063271765974469, + "grad_norm": 1.1796875, + "learning_rate": 4.5643394423725895e-05, + "loss": 0.8348, + "step": 13028 + }, + { + "epoch": 0.906396744234582, + "grad_norm": 0.9296875, + "learning_rate": 4.5576118559571224e-05, + "loss": 0.7729, + "step": 13029 + }, + { + "epoch": 0.9064663118717172, + "grad_norm": 1.0859375, + "learning_rate": 4.5508891156250565e-05, + "loss": 0.7014, + "step": 13030 + }, + { + "epoch": 0.9065358795088525, + "grad_norm": 1.140625, + "learning_rate": 4.54417122171773e-05, + "loss": 0.7075, + "step": 13031 + }, + { + "epoch": 0.9066054471459877, + "grad_norm": 1.1015625, + "learning_rate": 4.537458174576259e-05, + "loss": 0.7201, + "step": 13032 + }, + { + "epoch": 0.9066750147831228, + "grad_norm": 1.25, + "learning_rate": 4.530749974541504e-05, + "loss": 0.7543, + "step": 13033 + }, + { + "epoch": 0.9067445824202581, + "grad_norm": 1.1796875, + "learning_rate": 4.524046621954048e-05, + "loss": 0.7822, + "step": 13034 + }, + { + "epoch": 0.9068141500573933, + "grad_norm": 1.3125, + "learning_rate": 4.517348117154296e-05, + "loss": 0.8342, + "step": 13035 + }, + { + "epoch": 0.9068837176945285, + "grad_norm": 0.9921875, + "learning_rate": 4.510654460482322e-05, + "loss": 0.8462, + "step": 13036 + }, + { + "epoch": 0.9069532853316637, + "grad_norm": 1.1796875, + "learning_rate": 4.503965652278008e-05, + "loss": 0.6069, + "step": 13037 + }, + { + "epoch": 0.9070228529687989, + "grad_norm": 1.4453125, + "learning_rate": 4.497281692880983e-05, + "loss": 0.9126, + "step": 13038 + }, + { + "epoch": 0.9070924206059341, + "grad_norm": 1.078125, + "learning_rate": 4.490602582630643e-05, + "loss": 0.8152, + "step": 13039 + }, + { + "epoch": 0.9071619882430694, + "grad_norm": 1.2109375, + "learning_rate": 4.483928321866093e-05, + "loss": 1.0337, + "step": 13040 + }, + { + "epoch": 0.9072315558802045, + "grad_norm": 1.265625, + "learning_rate": 4.4772589109262184e-05, + "loss": 0.6577, + "step": 13041 + }, + { + "epoch": 0.9073011235173397, + "grad_norm": 0.9296875, + "learning_rate": 4.4705943501496596e-05, + "loss": 1.0599, + "step": 13042 + }, + { + "epoch": 0.9073706911544749, + "grad_norm": 0.99609375, + "learning_rate": 4.463934639874834e-05, + "loss": 0.7178, + "step": 13043 + }, + { + "epoch": 0.9074402587916102, + "grad_norm": 1.1640625, + "learning_rate": 4.45727978043986e-05, + "loss": 0.7231, + "step": 13044 + }, + { + "epoch": 0.9075098264287453, + "grad_norm": 1.046875, + "learning_rate": 4.450629772182646e-05, + "loss": 0.8416, + "step": 13045 + }, + { + "epoch": 0.9075793940658805, + "grad_norm": 1.9296875, + "learning_rate": 4.4439846154408435e-05, + "loss": 1.1456, + "step": 13046 + }, + { + "epoch": 0.9076489617030158, + "grad_norm": 0.99609375, + "learning_rate": 4.4373443105518827e-05, + "loss": 0.7196, + "step": 13047 + }, + { + "epoch": 0.907718529340151, + "grad_norm": 1.046875, + "learning_rate": 4.430708857852883e-05, + "loss": 1.0032, + "step": 13048 + }, + { + "epoch": 0.9077880969772861, + "grad_norm": 1.25, + "learning_rate": 4.424078257680808e-05, + "loss": 0.9284, + "step": 13049 + }, + { + "epoch": 0.9078576646144214, + "grad_norm": 1.7578125, + "learning_rate": 4.417452510372277e-05, + "loss": 0.8973, + "step": 13050 + }, + { + "epoch": 0.9079272322515566, + "grad_norm": 1.0859375, + "learning_rate": 4.410831616263755e-05, + "loss": 0.9603, + "step": 13051 + }, + { + "epoch": 0.9079967998886918, + "grad_norm": 1.2890625, + "learning_rate": 4.404215575691384e-05, + "loss": 0.8146, + "step": 13052 + }, + { + "epoch": 0.908066367525827, + "grad_norm": 1.3515625, + "learning_rate": 4.397604388991116e-05, + "loss": 0.7503, + "step": 13053 + }, + { + "epoch": 0.9081359351629622, + "grad_norm": 0.953125, + "learning_rate": 4.3909980564986294e-05, + "loss": 0.6531, + "step": 13054 + }, + { + "epoch": 0.9082055028000974, + "grad_norm": 1.0703125, + "learning_rate": 4.3843965785493435e-05, + "loss": 0.6746, + "step": 13055 + }, + { + "epoch": 0.9082750704372325, + "grad_norm": 1.1328125, + "learning_rate": 4.377799955478456e-05, + "loss": 0.6423, + "step": 13056 + }, + { + "epoch": 0.9083446380743678, + "grad_norm": 0.98046875, + "learning_rate": 4.371208187620934e-05, + "loss": 0.6185, + "step": 13057 + }, + { + "epoch": 0.908414205711503, + "grad_norm": 1.25, + "learning_rate": 4.364621275311453e-05, + "loss": 0.846, + "step": 13058 + }, + { + "epoch": 0.9084837733486382, + "grad_norm": 1.0859375, + "learning_rate": 4.358039218884458e-05, + "loss": 0.8656, + "step": 13059 + }, + { + "epoch": 0.9085533409857734, + "grad_norm": 1.0, + "learning_rate": 4.351462018674157e-05, + "loss": 0.5444, + "step": 13060 + }, + { + "epoch": 0.9086229086229086, + "grad_norm": 1.0859375, + "learning_rate": 4.3448896750145184e-05, + "loss": 0.8221, + "step": 13061 + }, + { + "epoch": 0.9086924762600438, + "grad_norm": 0.95703125, + "learning_rate": 4.338322188239241e-05, + "loss": 0.6084, + "step": 13062 + }, + { + "epoch": 0.9087620438971791, + "grad_norm": 0.86328125, + "learning_rate": 4.331759558681803e-05, + "loss": 0.5762, + "step": 13063 + }, + { + "epoch": 0.9088316115343142, + "grad_norm": 1.59375, + "learning_rate": 4.3252017866753926e-05, + "loss": 1.0052, + "step": 13064 + }, + { + "epoch": 0.9089011791714494, + "grad_norm": 1.3125, + "learning_rate": 4.318648872553011e-05, + "loss": 0.9105, + "step": 13065 + }, + { + "epoch": 0.9089707468085847, + "grad_norm": 1.140625, + "learning_rate": 4.3121008166473576e-05, + "loss": 0.843, + "step": 13066 + }, + { + "epoch": 0.9090403144457199, + "grad_norm": 1.3671875, + "learning_rate": 4.305557619290934e-05, + "loss": 0.968, + "step": 13067 + }, + { + "epoch": 0.909109882082855, + "grad_norm": 1.4765625, + "learning_rate": 4.2990192808159636e-05, + "loss": 1.0298, + "step": 13068 + }, + { + "epoch": 0.9091794497199902, + "grad_norm": 0.88671875, + "learning_rate": 4.292485801554402e-05, + "loss": 0.5187, + "step": 13069 + }, + { + "epoch": 0.9092490173571255, + "grad_norm": 1.3984375, + "learning_rate": 4.2859571818380295e-05, + "loss": 1.0645, + "step": 13070 + }, + { + "epoch": 0.9093185849942607, + "grad_norm": 1.1953125, + "learning_rate": 4.279433421998324e-05, + "loss": 1.0641, + "step": 13071 + }, + { + "epoch": 0.9093881526313958, + "grad_norm": 0.890625, + "learning_rate": 4.272914522366511e-05, + "loss": 0.715, + "step": 13072 + }, + { + "epoch": 0.9094577202685311, + "grad_norm": 1.203125, + "learning_rate": 4.266400483273591e-05, + "loss": 1.082, + "step": 13073 + }, + { + "epoch": 0.9095272879056663, + "grad_norm": 1.0078125, + "learning_rate": 4.259891305050323e-05, + "loss": 0.4616, + "step": 13074 + }, + { + "epoch": 0.9095968555428015, + "grad_norm": 1.046875, + "learning_rate": 4.253386988027219e-05, + "loss": 0.7929, + "step": 13075 + }, + { + "epoch": 0.9096664231799367, + "grad_norm": 1.2265625, + "learning_rate": 4.246887532534516e-05, + "loss": 0.7197, + "step": 13076 + }, + { + "epoch": 0.9097359908170719, + "grad_norm": 1.1796875, + "learning_rate": 4.240392938902238e-05, + "loss": 1.2147, + "step": 13077 + }, + { + "epoch": 0.9098055584542071, + "grad_norm": 1.25, + "learning_rate": 4.2339032074601326e-05, + "loss": 1.0718, + "step": 13078 + }, + { + "epoch": 0.9098751260913424, + "grad_norm": 1.046875, + "learning_rate": 4.2274183385377476e-05, + "loss": 0.5959, + "step": 13079 + }, + { + "epoch": 0.9099446937284775, + "grad_norm": 1.0546875, + "learning_rate": 4.220938332464308e-05, + "loss": 0.9636, + "step": 13080 + }, + { + "epoch": 0.9100142613656127, + "grad_norm": 1.1953125, + "learning_rate": 4.214463189568874e-05, + "loss": 0.9016, + "step": 13081 + }, + { + "epoch": 0.9100838290027479, + "grad_norm": 1.203125, + "learning_rate": 4.20799291018018e-05, + "loss": 0.8073, + "step": 13082 + }, + { + "epoch": 0.9101533966398831, + "grad_norm": 1.1171875, + "learning_rate": 4.2015274946268115e-05, + "loss": 0.9264, + "step": 13083 + }, + { + "epoch": 0.9102229642770183, + "grad_norm": 1.3671875, + "learning_rate": 4.195066943236991e-05, + "loss": 0.7468, + "step": 13084 + }, + { + "epoch": 0.9102925319141535, + "grad_norm": 0.96875, + "learning_rate": 4.1886112563387924e-05, + "loss": 0.8731, + "step": 13085 + }, + { + "epoch": 0.9103620995512888, + "grad_norm": 1.25, + "learning_rate": 4.1821604342599854e-05, + "loss": 0.9215, + "step": 13086 + }, + { + "epoch": 0.9104316671884239, + "grad_norm": 1.03125, + "learning_rate": 4.175714477328108e-05, + "loss": 0.8136, + "step": 13087 + }, + { + "epoch": 0.9105012348255591, + "grad_norm": 0.9453125, + "learning_rate": 4.169273385870454e-05, + "loss": 0.7271, + "step": 13088 + }, + { + "epoch": 0.9105708024626944, + "grad_norm": 1.2265625, + "learning_rate": 4.162837160214095e-05, + "loss": 0.6549, + "step": 13089 + }, + { + "epoch": 0.9106403700998296, + "grad_norm": 1.2734375, + "learning_rate": 4.156405800685803e-05, + "loss": 1.0101, + "step": 13090 + }, + { + "epoch": 0.9107099377369647, + "grad_norm": 1.5625, + "learning_rate": 4.1499793076121285e-05, + "loss": 0.7849, + "step": 13091 + }, + { + "epoch": 0.9107795053741, + "grad_norm": 0.921875, + "learning_rate": 4.1435576813193765e-05, + "loss": 0.8024, + "step": 13092 + }, + { + "epoch": 0.9108490730112352, + "grad_norm": 1.8671875, + "learning_rate": 4.137140922133642e-05, + "loss": 0.7898, + "step": 13093 + }, + { + "epoch": 0.9109186406483704, + "grad_norm": 1.1875, + "learning_rate": 4.130729030380675e-05, + "loss": 0.732, + "step": 13094 + }, + { + "epoch": 0.9109882082855055, + "grad_norm": 0.890625, + "learning_rate": 4.1243220063860944e-05, + "loss": 0.779, + "step": 13095 + }, + { + "epoch": 0.9110577759226408, + "grad_norm": 1.0859375, + "learning_rate": 4.117919850475183e-05, + "loss": 0.6693, + "step": 13096 + }, + { + "epoch": 0.911127343559776, + "grad_norm": 1.21875, + "learning_rate": 4.111522562973025e-05, + "loss": 1.0803, + "step": 13097 + }, + { + "epoch": 0.9111969111969112, + "grad_norm": 1.046875, + "learning_rate": 4.1051301442044276e-05, + "loss": 0.9407, + "step": 13098 + }, + { + "epoch": 0.9112664788340464, + "grad_norm": 1.4375, + "learning_rate": 4.098742594493998e-05, + "loss": 0.7143, + "step": 13099 + }, + { + "epoch": 0.9113360464711816, + "grad_norm": 0.97265625, + "learning_rate": 4.092359914166033e-05, + "loss": 0.6816, + "step": 13100 + }, + { + "epoch": 0.9114056141083168, + "grad_norm": 1.109375, + "learning_rate": 4.0859821035445946e-05, + "loss": 0.7882, + "step": 13101 + }, + { + "epoch": 0.9114751817454521, + "grad_norm": 1.5703125, + "learning_rate": 4.079609162953568e-05, + "loss": 1.231, + "step": 13102 + }, + { + "epoch": 0.9115447493825872, + "grad_norm": 1.3046875, + "learning_rate": 4.0732410927165067e-05, + "loss": 0.9799, + "step": 13103 + }, + { + "epoch": 0.9116143170197224, + "grad_norm": 1.03125, + "learning_rate": 4.066877893156762e-05, + "loss": 0.7711, + "step": 13104 + }, + { + "epoch": 0.9116838846568577, + "grad_norm": 1.0, + "learning_rate": 4.0605195645974094e-05, + "loss": 0.527, + "step": 13105 + }, + { + "epoch": 0.9117534522939928, + "grad_norm": 0.87890625, + "learning_rate": 4.054166107361301e-05, + "loss": 0.6169, + "step": 13106 + }, + { + "epoch": 0.911823019931128, + "grad_norm": 1.1484375, + "learning_rate": 4.0478175217710466e-05, + "loss": 0.8007, + "step": 13107 + }, + { + "epoch": 0.9118925875682632, + "grad_norm": 1.125, + "learning_rate": 4.041473808148977e-05, + "loss": 0.6891, + "step": 13108 + }, + { + "epoch": 0.9119621552053985, + "grad_norm": 1.25, + "learning_rate": 4.035134966817211e-05, + "loss": 0.8601, + "step": 13109 + }, + { + "epoch": 0.9120317228425336, + "grad_norm": 1.359375, + "learning_rate": 4.0288009980975706e-05, + "loss": 1.1731, + "step": 13110 + }, + { + "epoch": 0.9121012904796688, + "grad_norm": 1.09375, + "learning_rate": 4.022471902311709e-05, + "loss": 0.7185, + "step": 13111 + }, + { + "epoch": 0.9121708581168041, + "grad_norm": 1.25, + "learning_rate": 4.0161476797809456e-05, + "loss": 0.9068, + "step": 13112 + }, + { + "epoch": 0.9122404257539393, + "grad_norm": 1.046875, + "learning_rate": 4.009828330826415e-05, + "loss": 0.6831, + "step": 13113 + }, + { + "epoch": 0.9123099933910744, + "grad_norm": 1.078125, + "learning_rate": 4.00351385576897e-05, + "loss": 0.6043, + "step": 13114 + }, + { + "epoch": 0.9123795610282097, + "grad_norm": 0.8984375, + "learning_rate": 3.997204254929232e-05, + "loss": 0.8852, + "step": 13115 + }, + { + "epoch": 0.9124491286653449, + "grad_norm": 1.046875, + "learning_rate": 3.9908995286275784e-05, + "loss": 0.8096, + "step": 13116 + }, + { + "epoch": 0.9125186963024801, + "grad_norm": 1.1796875, + "learning_rate": 3.984599677184131e-05, + "loss": 0.6992, + "step": 13117 + }, + { + "epoch": 0.9125882639396153, + "grad_norm": 0.859375, + "learning_rate": 3.978304700918755e-05, + "loss": 0.6913, + "step": 13118 + }, + { + "epoch": 0.9126578315767505, + "grad_norm": 1.1171875, + "learning_rate": 3.9720146001510746e-05, + "loss": 0.8249, + "step": 13119 + }, + { + "epoch": 0.9127273992138857, + "grad_norm": 1.0546875, + "learning_rate": 3.965729375200477e-05, + "loss": 0.7471, + "step": 13120 + }, + { + "epoch": 0.9127969668510209, + "grad_norm": 1.234375, + "learning_rate": 3.959449026386097e-05, + "loss": 1.1734, + "step": 13121 + }, + { + "epoch": 0.9128665344881561, + "grad_norm": 1.0546875, + "learning_rate": 3.953173554026801e-05, + "loss": 0.72, + "step": 13122 + }, + { + "epoch": 0.9129361021252913, + "grad_norm": 1.03125, + "learning_rate": 3.9469029584412676e-05, + "loss": 0.7768, + "step": 13123 + }, + { + "epoch": 0.9130056697624265, + "grad_norm": 1.1875, + "learning_rate": 3.940637239947831e-05, + "loss": 0.7127, + "step": 13124 + }, + { + "epoch": 0.9130752373995618, + "grad_norm": 1.0703125, + "learning_rate": 3.9343763988646807e-05, + "loss": 0.8563, + "step": 13125 + }, + { + "epoch": 0.9131448050366969, + "grad_norm": 1.4140625, + "learning_rate": 3.928120435509675e-05, + "loss": 1.2617, + "step": 13126 + }, + { + "epoch": 0.9132143726738321, + "grad_norm": 1.09375, + "learning_rate": 3.921869350200491e-05, + "loss": 0.9248, + "step": 13127 + }, + { + "epoch": 0.9132839403109674, + "grad_norm": 1.03125, + "learning_rate": 3.915623143254488e-05, + "loss": 0.7759, + "step": 13128 + }, + { + "epoch": 0.9133535079481025, + "grad_norm": 1.03125, + "learning_rate": 3.909381814988855e-05, + "loss": 0.7245, + "step": 13129 + }, + { + "epoch": 0.9134230755852377, + "grad_norm": 1.09375, + "learning_rate": 3.903145365720484e-05, + "loss": 0.8968, + "step": 13130 + }, + { + "epoch": 0.913492643222373, + "grad_norm": 1.21875, + "learning_rate": 3.896913795766033e-05, + "loss": 0.9318, + "step": 13131 + }, + { + "epoch": 0.9135622108595082, + "grad_norm": 0.94140625, + "learning_rate": 3.8906871054419034e-05, + "loss": 0.6763, + "step": 13132 + }, + { + "epoch": 0.9136317784966433, + "grad_norm": 1.0234375, + "learning_rate": 3.884465295064232e-05, + "loss": 0.7571, + "step": 13133 + }, + { + "epoch": 0.9137013461337785, + "grad_norm": 1.0390625, + "learning_rate": 3.878248364948978e-05, + "loss": 0.7489, + "step": 13134 + }, + { + "epoch": 0.9137709137709138, + "grad_norm": 1.3203125, + "learning_rate": 3.8720363154117755e-05, + "loss": 0.8554, + "step": 13135 + }, + { + "epoch": 0.913840481408049, + "grad_norm": 1.140625, + "learning_rate": 3.865829146768041e-05, + "loss": 0.7288, + "step": 13136 + }, + { + "epoch": 0.9139100490451841, + "grad_norm": 1.1953125, + "learning_rate": 3.859626859332965e-05, + "loss": 0.8952, + "step": 13137 + }, + { + "epoch": 0.9139796166823194, + "grad_norm": 1.1875, + "learning_rate": 3.853429453421442e-05, + "loss": 0.799, + "step": 13138 + }, + { + "epoch": 0.9140491843194546, + "grad_norm": 1.2890625, + "learning_rate": 3.847236929348163e-05, + "loss": 0.5849, + "step": 13139 + }, + { + "epoch": 0.9141187519565898, + "grad_norm": 1.4296875, + "learning_rate": 3.8410492874275335e-05, + "loss": 0.6744, + "step": 13140 + }, + { + "epoch": 0.914188319593725, + "grad_norm": 1.140625, + "learning_rate": 3.8348665279737684e-05, + "loss": 0.833, + "step": 13141 + }, + { + "epoch": 0.9142578872308602, + "grad_norm": 1.1015625, + "learning_rate": 3.828688651300749e-05, + "loss": 0.7386, + "step": 13142 + }, + { + "epoch": 0.9143274548679954, + "grad_norm": 1.1875, + "learning_rate": 3.822515657722181e-05, + "loss": 0.8128, + "step": 13143 + }, + { + "epoch": 0.9143970225051307, + "grad_norm": 1.109375, + "learning_rate": 3.816347547551524e-05, + "loss": 0.8064, + "step": 13144 + }, + { + "epoch": 0.9144665901422658, + "grad_norm": 1.3359375, + "learning_rate": 3.810184321101917e-05, + "loss": 0.9626, + "step": 13145 + }, + { + "epoch": 0.914536157779401, + "grad_norm": 1.125, + "learning_rate": 3.8040259786863315e-05, + "loss": 0.8708, + "step": 13146 + }, + { + "epoch": 0.9146057254165362, + "grad_norm": 1.1796875, + "learning_rate": 3.797872520617418e-05, + "loss": 0.6174, + "step": 13147 + }, + { + "epoch": 0.9146752930536715, + "grad_norm": 1.1875, + "learning_rate": 3.791723947207659e-05, + "loss": 0.8264, + "step": 13148 + }, + { + "epoch": 0.9147448606908066, + "grad_norm": 1.1171875, + "learning_rate": 3.785580258769239e-05, + "loss": 0.5879, + "step": 13149 + }, + { + "epoch": 0.9148144283279418, + "grad_norm": 0.91796875, + "learning_rate": 3.779441455614086e-05, + "loss": 0.5422, + "step": 13150 + }, + { + "epoch": 0.9148839959650771, + "grad_norm": 1.0703125, + "learning_rate": 3.773307538053916e-05, + "loss": 0.5574, + "step": 13151 + }, + { + "epoch": 0.9149535636022122, + "grad_norm": 1.1015625, + "learning_rate": 3.76717850640016e-05, + "loss": 0.6906, + "step": 13152 + }, + { + "epoch": 0.9150231312393474, + "grad_norm": 1.125, + "learning_rate": 3.7610543609640444e-05, + "loss": 0.8463, + "step": 13153 + }, + { + "epoch": 0.9150926988764827, + "grad_norm": 1.0859375, + "learning_rate": 3.754935102056489e-05, + "loss": 0.9383, + "step": 13154 + }, + { + "epoch": 0.9151622665136179, + "grad_norm": 1.0546875, + "learning_rate": 3.7488207299882336e-05, + "loss": 0.7741, + "step": 13155 + }, + { + "epoch": 0.915231834150753, + "grad_norm": 1.21875, + "learning_rate": 3.7427112450697075e-05, + "loss": 0.9069, + "step": 13156 + }, + { + "epoch": 0.9153014017878883, + "grad_norm": 1.15625, + "learning_rate": 3.736606647611141e-05, + "loss": 0.6133, + "step": 13157 + }, + { + "epoch": 0.9153709694250235, + "grad_norm": 1.1015625, + "learning_rate": 3.730506937922484e-05, + "loss": 0.7051, + "step": 13158 + }, + { + "epoch": 0.9154405370621587, + "grad_norm": 1.015625, + "learning_rate": 3.7244121163134584e-05, + "loss": 0.6315, + "step": 13159 + }, + { + "epoch": 0.9155101046992938, + "grad_norm": 0.953125, + "learning_rate": 3.718322183093503e-05, + "loss": 0.7364, + "step": 13160 + }, + { + "epoch": 0.9155796723364291, + "grad_norm": 1.109375, + "learning_rate": 3.7122371385718614e-05, + "loss": 0.6312, + "step": 13161 + }, + { + "epoch": 0.9156492399735643, + "grad_norm": 1.2265625, + "learning_rate": 3.706156983057496e-05, + "loss": 0.8725, + "step": 13162 + }, + { + "epoch": 0.9157188076106995, + "grad_norm": 1.359375, + "learning_rate": 3.700081716859116e-05, + "loss": 1.1978, + "step": 13163 + }, + { + "epoch": 0.9157883752478347, + "grad_norm": 1.203125, + "learning_rate": 3.694011340285208e-05, + "loss": 1.0957, + "step": 13164 + }, + { + "epoch": 0.9158579428849699, + "grad_norm": 1.5546875, + "learning_rate": 3.687945853643959e-05, + "loss": 0.6028, + "step": 13165 + }, + { + "epoch": 0.9159275105221051, + "grad_norm": 1.0078125, + "learning_rate": 3.6818852572434e-05, + "loss": 0.7117, + "step": 13166 + }, + { + "epoch": 0.9159970781592404, + "grad_norm": 0.9375, + "learning_rate": 3.6758295513912185e-05, + "loss": 0.6572, + "step": 13167 + }, + { + "epoch": 0.9160666457963755, + "grad_norm": 1.578125, + "learning_rate": 3.669778736394902e-05, + "loss": 0.7317, + "step": 13168 + }, + { + "epoch": 0.9161362134335107, + "grad_norm": 1.046875, + "learning_rate": 3.663732812561682e-05, + "loss": 0.729, + "step": 13169 + }, + { + "epoch": 0.916205781070646, + "grad_norm": 1.2109375, + "learning_rate": 3.6576917801985355e-05, + "loss": 0.8536, + "step": 13170 + }, + { + "epoch": 0.9162753487077812, + "grad_norm": 1.3125, + "learning_rate": 3.651655639612206e-05, + "loss": 0.9006, + "step": 13171 + }, + { + "epoch": 0.9163449163449163, + "grad_norm": 1.09375, + "learning_rate": 3.64562439110917e-05, + "loss": 0.6241, + "step": 13172 + }, + { + "epoch": 0.9164144839820515, + "grad_norm": 1.59375, + "learning_rate": 3.6395980349956616e-05, + "loss": 0.7321, + "step": 13173 + }, + { + "epoch": 0.9164840516191868, + "grad_norm": 1.2890625, + "learning_rate": 3.6335765715776684e-05, + "loss": 0.6549, + "step": 13174 + }, + { + "epoch": 0.916553619256322, + "grad_norm": 1.140625, + "learning_rate": 3.627560001160935e-05, + "loss": 0.8101, + "step": 13175 + }, + { + "epoch": 0.9166231868934571, + "grad_norm": 1.109375, + "learning_rate": 3.6215483240509604e-05, + "loss": 0.7974, + "step": 13176 + }, + { + "epoch": 0.9166927545305924, + "grad_norm": 1.15625, + "learning_rate": 3.61554154055298e-05, + "loss": 0.7702, + "step": 13177 + }, + { + "epoch": 0.9167623221677276, + "grad_norm": 1.59375, + "learning_rate": 3.6095396509719934e-05, + "loss": 0.8351, + "step": 13178 + }, + { + "epoch": 0.9168318898048627, + "grad_norm": 0.7734375, + "learning_rate": 3.603542655612702e-05, + "loss": 0.5195, + "step": 13179 + }, + { + "epoch": 0.916901457441998, + "grad_norm": 1.2578125, + "learning_rate": 3.5975505547796714e-05, + "loss": 0.8202, + "step": 13180 + }, + { + "epoch": 0.9169710250791332, + "grad_norm": 1.1484375, + "learning_rate": 3.591563348777127e-05, + "loss": 0.7548, + "step": 13181 + }, + { + "epoch": 0.9170405927162684, + "grad_norm": 1.0078125, + "learning_rate": 3.585581037909036e-05, + "loss": 0.615, + "step": 13182 + }, + { + "epoch": 0.9171101603534036, + "grad_norm": 1.3671875, + "learning_rate": 3.5796036224791884e-05, + "loss": 1.1456, + "step": 13183 + }, + { + "epoch": 0.9171797279905388, + "grad_norm": 0.9765625, + "learning_rate": 3.573631102791075e-05, + "loss": 0.747, + "step": 13184 + }, + { + "epoch": 0.917249295627674, + "grad_norm": 1.171875, + "learning_rate": 3.5676634791479535e-05, + "loss": 0.8888, + "step": 13185 + }, + { + "epoch": 0.9173188632648092, + "grad_norm": 1.1328125, + "learning_rate": 3.561700751852803e-05, + "loss": 0.8245, + "step": 13186 + }, + { + "epoch": 0.9173884309019444, + "grad_norm": 1.0546875, + "learning_rate": 3.555742921208427e-05, + "loss": 0.8364, + "step": 13187 + }, + { + "epoch": 0.9174579985390796, + "grad_norm": 0.9609375, + "learning_rate": 3.5497899875172935e-05, + "loss": 0.8607, + "step": 13188 + }, + { + "epoch": 0.9175275661762148, + "grad_norm": 1.109375, + "learning_rate": 3.5438419510816834e-05, + "loss": 0.9017, + "step": 13189 + }, + { + "epoch": 0.9175971338133501, + "grad_norm": 0.9765625, + "learning_rate": 3.537898812203621e-05, + "loss": 0.6674, + "step": 13190 + }, + { + "epoch": 0.9176667014504852, + "grad_norm": 0.84765625, + "learning_rate": 3.531960571184845e-05, + "loss": 0.7925, + "step": 13191 + }, + { + "epoch": 0.9177362690876204, + "grad_norm": 1.1171875, + "learning_rate": 3.526027228326867e-05, + "loss": 0.9025, + "step": 13192 + }, + { + "epoch": 0.9178058367247557, + "grad_norm": 0.94921875, + "learning_rate": 3.520098783930958e-05, + "loss": 0.6693, + "step": 13193 + }, + { + "epoch": 0.9178754043618909, + "grad_norm": 1.34375, + "learning_rate": 3.514175238298145e-05, + "loss": 0.8207, + "step": 13194 + }, + { + "epoch": 0.917944971999026, + "grad_norm": 1.328125, + "learning_rate": 3.508256591729198e-05, + "loss": 1.0026, + "step": 13195 + }, + { + "epoch": 0.9180145396361613, + "grad_norm": 1.1328125, + "learning_rate": 3.5023428445246085e-05, + "loss": 0.9401, + "step": 13196 + }, + { + "epoch": 0.9180841072732965, + "grad_norm": 1.265625, + "learning_rate": 3.496433996984682e-05, + "loss": 0.8634, + "step": 13197 + }, + { + "epoch": 0.9181536749104316, + "grad_norm": 1.265625, + "learning_rate": 3.4905300494094125e-05, + "loss": 0.6497, + "step": 13198 + }, + { + "epoch": 0.9182232425475668, + "grad_norm": 1.3984375, + "learning_rate": 3.4846310020985925e-05, + "loss": 0.7236, + "step": 13199 + }, + { + "epoch": 0.9182928101847021, + "grad_norm": 1.046875, + "learning_rate": 3.478736855351727e-05, + "loss": 0.8447, + "step": 13200 + }, + { + "epoch": 0.9183623778218373, + "grad_norm": 1.1015625, + "learning_rate": 3.4728476094681105e-05, + "loss": 0.8562, + "step": 13201 + }, + { + "epoch": 0.9184319454589724, + "grad_norm": 1.328125, + "learning_rate": 3.466963264746748e-05, + "loss": 0.723, + "step": 13202 + }, + { + "epoch": 0.9185015130961077, + "grad_norm": 1.2890625, + "learning_rate": 3.461083821486421e-05, + "loss": 0.9176, + "step": 13203 + }, + { + "epoch": 0.9185710807332429, + "grad_norm": 1.21875, + "learning_rate": 3.4552092799856826e-05, + "loss": 0.6648, + "step": 13204 + }, + { + "epoch": 0.9186406483703781, + "grad_norm": 1.1484375, + "learning_rate": 3.449339640542804e-05, + "loss": 0.5927, + "step": 13205 + }, + { + "epoch": 0.9187102160075133, + "grad_norm": 1.15625, + "learning_rate": 3.44347490345579e-05, + "loss": 1.0229, + "step": 13206 + }, + { + "epoch": 0.9187797836446485, + "grad_norm": 1.1796875, + "learning_rate": 3.4376150690224375e-05, + "loss": 0.9107, + "step": 13207 + }, + { + "epoch": 0.9188493512817837, + "grad_norm": 1.328125, + "learning_rate": 3.431760137540285e-05, + "loss": 0.7092, + "step": 13208 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 1.4453125, + "learning_rate": 3.425910109306618e-05, + "loss": 0.8211, + "step": 13209 + }, + { + "epoch": 0.9189884865560541, + "grad_norm": 1.1796875, + "learning_rate": 3.4200649846184654e-05, + "loss": 0.9294, + "step": 13210 + }, + { + "epoch": 0.9190580541931893, + "grad_norm": 1.296875, + "learning_rate": 3.4142247637726e-05, + "loss": 1.0419, + "step": 13211 + }, + { + "epoch": 0.9191276218303245, + "grad_norm": 1.0546875, + "learning_rate": 3.408389447065596e-05, + "loss": 0.9011, + "step": 13212 + }, + { + "epoch": 0.9191971894674598, + "grad_norm": 1.15625, + "learning_rate": 3.4025590347937066e-05, + "loss": 0.6634, + "step": 13213 + }, + { + "epoch": 0.9192667571045949, + "grad_norm": 1.0859375, + "learning_rate": 3.396733527252982e-05, + "loss": 0.9252, + "step": 13214 + }, + { + "epoch": 0.9193363247417301, + "grad_norm": 1.0703125, + "learning_rate": 3.390912924739209e-05, + "loss": 0.6012, + "step": 13215 + }, + { + "epoch": 0.9194058923788654, + "grad_norm": 1.28125, + "learning_rate": 3.385097227547929e-05, + "loss": 0.7775, + "step": 13216 + }, + { + "epoch": 0.9194754600160006, + "grad_norm": 1.1875, + "learning_rate": 3.379286435974438e-05, + "loss": 0.8743, + "step": 13217 + }, + { + "epoch": 0.9195450276531357, + "grad_norm": 1.1640625, + "learning_rate": 3.37348055031379e-05, + "loss": 0.9789, + "step": 13218 + }, + { + "epoch": 0.919614595290271, + "grad_norm": 1.0546875, + "learning_rate": 3.36767957086076e-05, + "loss": 0.7683, + "step": 13219 + }, + { + "epoch": 0.9196841629274062, + "grad_norm": 0.86328125, + "learning_rate": 3.361883497909901e-05, + "loss": 0.7352, + "step": 13220 + }, + { + "epoch": 0.9197537305645413, + "grad_norm": 1.375, + "learning_rate": 3.356092331755489e-05, + "loss": 0.8684, + "step": 13221 + }, + { + "epoch": 0.9198232982016766, + "grad_norm": 0.96484375, + "learning_rate": 3.350306072691611e-05, + "loss": 0.798, + "step": 13222 + }, + { + "epoch": 0.9198928658388118, + "grad_norm": 1.3984375, + "learning_rate": 3.3445247210120324e-05, + "loss": 0.9874, + "step": 13223 + }, + { + "epoch": 0.919962433475947, + "grad_norm": 1.1875, + "learning_rate": 3.338748277010295e-05, + "loss": 0.6423, + "step": 13224 + }, + { + "epoch": 0.9200320011130821, + "grad_norm": 1.1953125, + "learning_rate": 3.33297674097971e-05, + "loss": 0.7429, + "step": 13225 + }, + { + "epoch": 0.9201015687502174, + "grad_norm": 1.171875, + "learning_rate": 3.327210113213353e-05, + "loss": 0.9039, + "step": 13226 + }, + { + "epoch": 0.9201711363873526, + "grad_norm": 1.15625, + "learning_rate": 3.3214483940039894e-05, + "loss": 0.7173, + "step": 13227 + }, + { + "epoch": 0.9202407040244878, + "grad_norm": 1.1953125, + "learning_rate": 3.315691583644165e-05, + "loss": 0.6867, + "step": 13228 + }, + { + "epoch": 0.920310271661623, + "grad_norm": 1.078125, + "learning_rate": 3.309939682426222e-05, + "loss": 0.8921, + "step": 13229 + }, + { + "epoch": 0.9203798392987582, + "grad_norm": 0.94140625, + "learning_rate": 3.30419269064216e-05, + "loss": 0.7802, + "step": 13230 + }, + { + "epoch": 0.9204494069358934, + "grad_norm": 1.21875, + "learning_rate": 3.298450608583825e-05, + "loss": 0.9367, + "step": 13231 + }, + { + "epoch": 0.9205189745730287, + "grad_norm": 1.109375, + "learning_rate": 3.29271343654276e-05, + "loss": 0.7717, + "step": 13232 + }, + { + "epoch": 0.9205885422101638, + "grad_norm": 1.2421875, + "learning_rate": 3.286981174810266e-05, + "loss": 0.8901, + "step": 13233 + }, + { + "epoch": 0.920658109847299, + "grad_norm": 1.1640625, + "learning_rate": 3.281253823677388e-05, + "loss": 0.7818, + "step": 13234 + }, + { + "epoch": 0.9207276774844343, + "grad_norm": 1.15625, + "learning_rate": 3.275531383434938e-05, + "loss": 0.8075, + "step": 13235 + }, + { + "epoch": 0.9207972451215695, + "grad_norm": 1.0703125, + "learning_rate": 3.269813854373493e-05, + "loss": 0.793, + "step": 13236 + }, + { + "epoch": 0.9208668127587046, + "grad_norm": 1.1875, + "learning_rate": 3.264101236783346e-05, + "loss": 0.8248, + "step": 13237 + }, + { + "epoch": 0.9209363803958398, + "grad_norm": 1.1015625, + "learning_rate": 3.25839353095454e-05, + "loss": 0.7525, + "step": 13238 + }, + { + "epoch": 0.9210059480329751, + "grad_norm": 1.0859375, + "learning_rate": 3.2526907371768996e-05, + "loss": 0.5513, + "step": 13239 + }, + { + "epoch": 0.9210755156701103, + "grad_norm": 0.890625, + "learning_rate": 3.246992855739983e-05, + "loss": 0.7061, + "step": 13240 + }, + { + "epoch": 0.9211450833072454, + "grad_norm": 1.1171875, + "learning_rate": 3.2412998869331134e-05, + "loss": 0.7867, + "step": 13241 + }, + { + "epoch": 0.9212146509443807, + "grad_norm": 1.0, + "learning_rate": 3.235611831045304e-05, + "loss": 0.6275, + "step": 13242 + }, + { + "epoch": 0.9212842185815159, + "grad_norm": 0.9296875, + "learning_rate": 3.229928688365413e-05, + "loss": 0.5783, + "step": 13243 + }, + { + "epoch": 0.921353786218651, + "grad_norm": 1.4609375, + "learning_rate": 3.224250459181988e-05, + "loss": 0.6496, + "step": 13244 + }, + { + "epoch": 0.9214233538557863, + "grad_norm": 1.0078125, + "learning_rate": 3.218577143783341e-05, + "loss": 0.8002, + "step": 13245 + }, + { + "epoch": 0.9214929214929215, + "grad_norm": 1.015625, + "learning_rate": 3.212908742457532e-05, + "loss": 0.7586, + "step": 13246 + }, + { + "epoch": 0.9215624891300567, + "grad_norm": 1.5859375, + "learning_rate": 3.2072452554923746e-05, + "loss": 0.9243, + "step": 13247 + }, + { + "epoch": 0.921632056767192, + "grad_norm": 0.90625, + "learning_rate": 3.201586683175417e-05, + "loss": 0.7317, + "step": 13248 + }, + { + "epoch": 0.9217016244043271, + "grad_norm": 1.2734375, + "learning_rate": 3.1959330257939957e-05, + "loss": 0.9879, + "step": 13249 + }, + { + "epoch": 0.9217711920414623, + "grad_norm": 1.703125, + "learning_rate": 3.1902842836351696e-05, + "loss": 0.7883, + "step": 13250 + }, + { + "epoch": 0.9218407596785975, + "grad_norm": 1.171875, + "learning_rate": 3.184640456985755e-05, + "loss": 0.8505, + "step": 13251 + }, + { + "epoch": 0.9219103273157327, + "grad_norm": 1.03125, + "learning_rate": 3.179001546132298e-05, + "loss": 0.6454, + "step": 13252 + }, + { + "epoch": 0.9219798949528679, + "grad_norm": 1.0859375, + "learning_rate": 3.173367551361139e-05, + "loss": 0.7965, + "step": 13253 + }, + { + "epoch": 0.9220494625900031, + "grad_norm": 0.9375, + "learning_rate": 3.167738472958337e-05, + "loss": 0.6902, + "step": 13254 + }, + { + "epoch": 0.9221190302271384, + "grad_norm": 1.109375, + "learning_rate": 3.162114311209707e-05, + "loss": 0.9991, + "step": 13255 + }, + { + "epoch": 0.9221885978642735, + "grad_norm": 0.84765625, + "learning_rate": 3.1564950664007996e-05, + "loss": 0.5433, + "step": 13256 + }, + { + "epoch": 0.9222581655014087, + "grad_norm": 1.0546875, + "learning_rate": 3.1508807388169414e-05, + "loss": 0.781, + "step": 13257 + }, + { + "epoch": 0.922327733138544, + "grad_norm": 0.91015625, + "learning_rate": 3.1452713287432154e-05, + "loss": 0.7424, + "step": 13258 + }, + { + "epoch": 0.9223973007756792, + "grad_norm": 1.171875, + "learning_rate": 3.139666836464439e-05, + "loss": 0.8479, + "step": 13259 + }, + { + "epoch": 0.9224668684128143, + "grad_norm": 0.76953125, + "learning_rate": 3.13406726226515e-05, + "loss": 0.6565, + "step": 13260 + }, + { + "epoch": 0.9225364360499496, + "grad_norm": 1.515625, + "learning_rate": 3.128472606429689e-05, + "loss": 0.8064, + "step": 13261 + }, + { + "epoch": 0.9226060036870848, + "grad_norm": 1.0390625, + "learning_rate": 3.122882869242116e-05, + "loss": 0.7445, + "step": 13262 + }, + { + "epoch": 0.92267557132422, + "grad_norm": 1.0, + "learning_rate": 3.1172980509862504e-05, + "loss": 0.5144, + "step": 13263 + }, + { + "epoch": 0.9227451389613551, + "grad_norm": 1.125, + "learning_rate": 3.111718151945686e-05, + "loss": 0.8032, + "step": 13264 + }, + { + "epoch": 0.9228147065984904, + "grad_norm": 0.94921875, + "learning_rate": 3.106143172403708e-05, + "loss": 0.7622, + "step": 13265 + }, + { + "epoch": 0.9228842742356256, + "grad_norm": 1.0546875, + "learning_rate": 3.10057311264339e-05, + "loss": 0.5968, + "step": 13266 + }, + { + "epoch": 0.9229538418727607, + "grad_norm": 1.1484375, + "learning_rate": 3.095007972947572e-05, + "loss": 0.8642, + "step": 13267 + }, + { + "epoch": 0.923023409509896, + "grad_norm": 0.96484375, + "learning_rate": 3.089447753598806e-05, + "loss": 0.7421, + "step": 13268 + }, + { + "epoch": 0.9230929771470312, + "grad_norm": 1.09375, + "learning_rate": 3.083892454879433e-05, + "loss": 0.6603, + "step": 13269 + }, + { + "epoch": 0.9231625447841664, + "grad_norm": 0.95703125, + "learning_rate": 3.0783420770714834e-05, + "loss": 0.9264, + "step": 13270 + }, + { + "epoch": 0.9232321124213017, + "grad_norm": 1.2421875, + "learning_rate": 3.072796620456808e-05, + "loss": 0.7977, + "step": 13271 + }, + { + "epoch": 0.9233016800584368, + "grad_norm": 1.078125, + "learning_rate": 3.067256085316983e-05, + "loss": 0.7678, + "step": 13272 + }, + { + "epoch": 0.923371247695572, + "grad_norm": 0.90625, + "learning_rate": 3.0617204719333155e-05, + "loss": 0.7053, + "step": 13273 + }, + { + "epoch": 0.9234408153327073, + "grad_norm": 1.0859375, + "learning_rate": 3.056189780586871e-05, + "loss": 0.6626, + "step": 13274 + }, + { + "epoch": 0.9235103829698424, + "grad_norm": 2.109375, + "learning_rate": 3.0506640115584682e-05, + "loss": 1.0136, + "step": 13275 + }, + { + "epoch": 0.9235799506069776, + "grad_norm": 0.98046875, + "learning_rate": 3.0451431651286943e-05, + "loss": 0.6453, + "step": 13276 + }, + { + "epoch": 0.9236495182441128, + "grad_norm": 1.53125, + "learning_rate": 3.039627241577858e-05, + "loss": 0.9795, + "step": 13277 + }, + { + "epoch": 0.9237190858812481, + "grad_norm": 0.97265625, + "learning_rate": 3.0341162411860466e-05, + "loss": 0.7024, + "step": 13278 + }, + { + "epoch": 0.9237886535183832, + "grad_norm": 1.1328125, + "learning_rate": 3.02861016423307e-05, + "loss": 0.8726, + "step": 13279 + }, + { + "epoch": 0.9238582211555184, + "grad_norm": 0.98828125, + "learning_rate": 3.0231090109984814e-05, + "loss": 0.5164, + "step": 13280 + }, + { + "epoch": 0.9239277887926537, + "grad_norm": 0.8359375, + "learning_rate": 3.0176127817616138e-05, + "loss": 0.6693, + "step": 13281 + }, + { + "epoch": 0.9239973564297889, + "grad_norm": 1.390625, + "learning_rate": 3.0121214768015548e-05, + "loss": 0.7757, + "step": 13282 + }, + { + "epoch": 0.924066924066924, + "grad_norm": 1.0, + "learning_rate": 3.0066350963971145e-05, + "loss": 0.6186, + "step": 13283 + }, + { + "epoch": 0.9241364917040593, + "grad_norm": 1.0625, + "learning_rate": 3.0011536408268482e-05, + "loss": 0.8729, + "step": 13284 + }, + { + "epoch": 0.9242060593411945, + "grad_norm": 1.328125, + "learning_rate": 2.995677110369088e-05, + "loss": 1.0522, + "step": 13285 + }, + { + "epoch": 0.9242756269783297, + "grad_norm": 1.3828125, + "learning_rate": 2.9902055053019238e-05, + "loss": 0.9778, + "step": 13286 + }, + { + "epoch": 0.9243451946154649, + "grad_norm": 1.0, + "learning_rate": 2.984738825903155e-05, + "loss": 0.628, + "step": 13287 + }, + { + "epoch": 0.9244147622526001, + "grad_norm": 1.421875, + "learning_rate": 2.979277072450348e-05, + "loss": 0.7561, + "step": 13288 + }, + { + "epoch": 0.9244843298897353, + "grad_norm": 1.203125, + "learning_rate": 2.9738202452208263e-05, + "loss": 0.8658, + "step": 13289 + }, + { + "epoch": 0.9245538975268704, + "grad_norm": 1.15625, + "learning_rate": 2.9683683444916787e-05, + "loss": 0.6984, + "step": 13290 + }, + { + "epoch": 0.9246234651640057, + "grad_norm": 1.421875, + "learning_rate": 2.9629213705396953e-05, + "loss": 0.6508, + "step": 13291 + }, + { + "epoch": 0.9246930328011409, + "grad_norm": 0.96875, + "learning_rate": 2.9574793236414764e-05, + "loss": 0.6579, + "step": 13292 + }, + { + "epoch": 0.9247626004382761, + "grad_norm": 1.4453125, + "learning_rate": 2.952042204073324e-05, + "loss": 0.9784, + "step": 13293 + }, + { + "epoch": 0.9248321680754114, + "grad_norm": 1.2734375, + "learning_rate": 2.9466100121112947e-05, + "loss": 0.8942, + "step": 13294 + }, + { + "epoch": 0.9249017357125465, + "grad_norm": 1.3671875, + "learning_rate": 2.941182748031235e-05, + "loss": 0.9168, + "step": 13295 + }, + { + "epoch": 0.9249713033496817, + "grad_norm": 1.15625, + "learning_rate": 2.935760412108701e-05, + "loss": 1.0138, + "step": 13296 + }, + { + "epoch": 0.925040870986817, + "grad_norm": 1.515625, + "learning_rate": 2.9303430046190184e-05, + "loss": 0.614, + "step": 13297 + }, + { + "epoch": 0.9251104386239521, + "grad_norm": 1.0703125, + "learning_rate": 2.9249305258372437e-05, + "loss": 0.535, + "step": 13298 + }, + { + "epoch": 0.9251800062610873, + "grad_norm": 1.3671875, + "learning_rate": 2.9195229760382026e-05, + "loss": 0.9334, + "step": 13299 + }, + { + "epoch": 0.9252495738982226, + "grad_norm": 0.953125, + "learning_rate": 2.9141203554964745e-05, + "loss": 0.8297, + "step": 13300 + }, + { + "epoch": 0.9253191415353578, + "grad_norm": 1.21875, + "learning_rate": 2.9087226644863628e-05, + "loss": 0.8797, + "step": 13301 + }, + { + "epoch": 0.9253887091724929, + "grad_norm": 1.3359375, + "learning_rate": 2.903329903281926e-05, + "loss": 0.893, + "step": 13302 + }, + { + "epoch": 0.9254582768096281, + "grad_norm": 0.86328125, + "learning_rate": 2.8979420721569892e-05, + "loss": 0.7535, + "step": 13303 + }, + { + "epoch": 0.9255278444467634, + "grad_norm": 0.9375, + "learning_rate": 2.892559171385145e-05, + "loss": 0.6196, + "step": 13304 + }, + { + "epoch": 0.9255974120838986, + "grad_norm": 0.9140625, + "learning_rate": 2.8871812012396635e-05, + "loss": 0.6602, + "step": 13305 + }, + { + "epoch": 0.9256669797210337, + "grad_norm": 1.0546875, + "learning_rate": 2.88180816199366e-05, + "loss": 0.8892, + "step": 13306 + }, + { + "epoch": 0.925736547358169, + "grad_norm": 0.96484375, + "learning_rate": 2.876440053919904e-05, + "loss": 0.7827, + "step": 13307 + }, + { + "epoch": 0.9258061149953042, + "grad_norm": 0.8828125, + "learning_rate": 2.871076877291001e-05, + "loss": 0.6778, + "step": 13308 + }, + { + "epoch": 0.9258756826324394, + "grad_norm": 0.91796875, + "learning_rate": 2.8657186323792438e-05, + "loss": 0.6027, + "step": 13309 + }, + { + "epoch": 0.9259452502695746, + "grad_norm": 1.03125, + "learning_rate": 2.8603653194567036e-05, + "loss": 0.9155, + "step": 13310 + }, + { + "epoch": 0.9260148179067098, + "grad_norm": 1.0234375, + "learning_rate": 2.8550169387951852e-05, + "loss": 0.6383, + "step": 13311 + }, + { + "epoch": 0.926084385543845, + "grad_norm": 1.1171875, + "learning_rate": 2.8496734906662604e-05, + "loss": 0.7247, + "step": 13312 + }, + { + "epoch": 0.9261539531809803, + "grad_norm": 1.203125, + "learning_rate": 2.844334975341234e-05, + "loss": 0.9816, + "step": 13313 + }, + { + "epoch": 0.9262235208181154, + "grad_norm": 1.1875, + "learning_rate": 2.8390013930912008e-05, + "loss": 0.8588, + "step": 13314 + }, + { + "epoch": 0.9262930884552506, + "grad_norm": 1.265625, + "learning_rate": 2.8336727441869326e-05, + "loss": 0.7097, + "step": 13315 + }, + { + "epoch": 0.9263626560923858, + "grad_norm": 1.2890625, + "learning_rate": 2.828349028899002e-05, + "loss": 0.7123, + "step": 13316 + }, + { + "epoch": 0.926432223729521, + "grad_norm": 1.109375, + "learning_rate": 2.8230302474977376e-05, + "loss": 0.7025, + "step": 13317 + }, + { + "epoch": 0.9265017913666562, + "grad_norm": 1.1015625, + "learning_rate": 2.8177164002531897e-05, + "loss": 0.9708, + "step": 13318 + }, + { + "epoch": 0.9265713590037914, + "grad_norm": 1.6015625, + "learning_rate": 2.8124074874351646e-05, + "loss": 0.7133, + "step": 13319 + }, + { + "epoch": 0.9266409266409267, + "grad_norm": 1.1953125, + "learning_rate": 2.8071035093132247e-05, + "loss": 0.7549, + "step": 13320 + }, + { + "epoch": 0.9267104942780618, + "grad_norm": 0.89453125, + "learning_rate": 2.8018044661566768e-05, + "loss": 0.7371, + "step": 13321 + }, + { + "epoch": 0.926780061915197, + "grad_norm": 1.0, + "learning_rate": 2.796510358234583e-05, + "loss": 0.8252, + "step": 13322 + }, + { + "epoch": 0.9268496295523323, + "grad_norm": 1.0390625, + "learning_rate": 2.791221185815751e-05, + "loss": 0.6271, + "step": 13323 + }, + { + "epoch": 0.9269191971894675, + "grad_norm": 1.1640625, + "learning_rate": 2.7859369491687547e-05, + "loss": 0.7798, + "step": 13324 + }, + { + "epoch": 0.9269887648266026, + "grad_norm": 0.984375, + "learning_rate": 2.7806576485618683e-05, + "loss": 0.7248, + "step": 13325 + }, + { + "epoch": 0.9270583324637379, + "grad_norm": 1.109375, + "learning_rate": 2.7753832842631665e-05, + "loss": 0.6875, + "step": 13326 + }, + { + "epoch": 0.9271279001008731, + "grad_norm": 1.09375, + "learning_rate": 2.770113856540457e-05, + "loss": 0.8432, + "step": 13327 + }, + { + "epoch": 0.9271974677380083, + "grad_norm": 1.203125, + "learning_rate": 2.7648493656612926e-05, + "loss": 0.8636, + "step": 13328 + }, + { + "epoch": 0.9272670353751434, + "grad_norm": 1.4453125, + "learning_rate": 2.7595898118929706e-05, + "loss": 1.0768, + "step": 13329 + }, + { + "epoch": 0.9273366030122787, + "grad_norm": 1.109375, + "learning_rate": 2.7543351955025552e-05, + "loss": 0.8567, + "step": 13330 + }, + { + "epoch": 0.9274061706494139, + "grad_norm": 1.2578125, + "learning_rate": 2.749085516756833e-05, + "loss": 0.801, + "step": 13331 + }, + { + "epoch": 0.9274757382865491, + "grad_norm": 1.0625, + "learning_rate": 2.7438407759223793e-05, + "loss": 0.8172, + "step": 13332 + }, + { + "epoch": 0.9275453059236843, + "grad_norm": 0.859375, + "learning_rate": 2.7386009732654815e-05, + "loss": 0.6904, + "step": 13333 + }, + { + "epoch": 0.9276148735608195, + "grad_norm": 1.09375, + "learning_rate": 2.7333661090521932e-05, + "loss": 0.8203, + "step": 13334 + }, + { + "epoch": 0.9276844411979547, + "grad_norm": 0.97265625, + "learning_rate": 2.7281361835483022e-05, + "loss": 0.7293, + "step": 13335 + }, + { + "epoch": 0.92775400883509, + "grad_norm": 1.0703125, + "learning_rate": 2.7229111970193842e-05, + "loss": 0.8736, + "step": 13336 + }, + { + "epoch": 0.9278235764722251, + "grad_norm": 1.34375, + "learning_rate": 2.7176911497307166e-05, + "loss": 0.9797, + "step": 13337 + }, + { + "epoch": 0.9278931441093603, + "grad_norm": 1.4375, + "learning_rate": 2.7124760419473537e-05, + "loss": 0.8142, + "step": 13338 + }, + { + "epoch": 0.9279627117464956, + "grad_norm": 0.96484375, + "learning_rate": 2.7072658739340837e-05, + "loss": 0.6258, + "step": 13339 + }, + { + "epoch": 0.9280322793836308, + "grad_norm": 0.92578125, + "learning_rate": 2.702060645955473e-05, + "loss": 0.8679, + "step": 13340 + }, + { + "epoch": 0.9281018470207659, + "grad_norm": 1.234375, + "learning_rate": 2.696860358275799e-05, + "loss": 0.946, + "step": 13341 + }, + { + "epoch": 0.9281714146579011, + "grad_norm": 0.87109375, + "learning_rate": 2.691665011159117e-05, + "loss": 0.6146, + "step": 13342 + }, + { + "epoch": 0.9282409822950364, + "grad_norm": 1.5234375, + "learning_rate": 2.6864746048692156e-05, + "loss": 1.1254, + "step": 13343 + }, + { + "epoch": 0.9283105499321715, + "grad_norm": 1.2265625, + "learning_rate": 2.6812891396696294e-05, + "loss": 0.9415, + "step": 13344 + }, + { + "epoch": 0.9283801175693067, + "grad_norm": 1.015625, + "learning_rate": 2.676108615823658e-05, + "loss": 0.6591, + "step": 13345 + }, + { + "epoch": 0.928449685206442, + "grad_norm": 1.6015625, + "learning_rate": 2.670933033594358e-05, + "loss": 0.712, + "step": 13346 + }, + { + "epoch": 0.9285192528435772, + "grad_norm": 1.21875, + "learning_rate": 2.6657623932444975e-05, + "loss": 0.6956, + "step": 13347 + }, + { + "epoch": 0.9285888204807123, + "grad_norm": 1.203125, + "learning_rate": 2.66059669503661e-05, + "loss": 0.8038, + "step": 13348 + }, + { + "epoch": 0.9286583881178476, + "grad_norm": 1.109375, + "learning_rate": 2.6554359392329973e-05, + "loss": 0.7031, + "step": 13349 + }, + { + "epoch": 0.9287279557549828, + "grad_norm": 1.3046875, + "learning_rate": 2.6502801260957054e-05, + "loss": 0.8144, + "step": 13350 + }, + { + "epoch": 0.928797523392118, + "grad_norm": 1.125, + "learning_rate": 2.6451292558864915e-05, + "loss": 0.8768, + "step": 13351 + }, + { + "epoch": 0.9288670910292532, + "grad_norm": 1.046875, + "learning_rate": 2.639983328866935e-05, + "loss": 0.7542, + "step": 13352 + }, + { + "epoch": 0.9289366586663884, + "grad_norm": 0.98828125, + "learning_rate": 2.6348423452982717e-05, + "loss": 0.9279, + "step": 13353 + }, + { + "epoch": 0.9290062263035236, + "grad_norm": 1.2734375, + "learning_rate": 2.6297063054415705e-05, + "loss": 0.8012, + "step": 13354 + }, + { + "epoch": 0.9290757939406588, + "grad_norm": 1.2734375, + "learning_rate": 2.624575209557589e-05, + "loss": 0.9576, + "step": 13355 + }, + { + "epoch": 0.929145361577794, + "grad_norm": 1.34375, + "learning_rate": 2.6194490579068864e-05, + "loss": 0.7498, + "step": 13356 + }, + { + "epoch": 0.9292149292149292, + "grad_norm": 1.1875, + "learning_rate": 2.6143278507497203e-05, + "loss": 0.7359, + "step": 13357 + }, + { + "epoch": 0.9292844968520644, + "grad_norm": 1.3046875, + "learning_rate": 2.6092115883461054e-05, + "loss": 0.9154, + "step": 13358 + }, + { + "epoch": 0.9293540644891997, + "grad_norm": 1.1484375, + "learning_rate": 2.604100270955867e-05, + "loss": 1.0482, + "step": 13359 + }, + { + "epoch": 0.9294236321263348, + "grad_norm": 1.1484375, + "learning_rate": 2.5989938988384976e-05, + "loss": 0.7849, + "step": 13360 + }, + { + "epoch": 0.92949319976347, + "grad_norm": 1.3046875, + "learning_rate": 2.5938924722532788e-05, + "loss": 0.9157, + "step": 13361 + }, + { + "epoch": 0.9295627674006053, + "grad_norm": 1.09375, + "learning_rate": 2.5887959914592364e-05, + "loss": 0.8902, + "step": 13362 + }, + { + "epoch": 0.9296323350377405, + "grad_norm": 1.3671875, + "learning_rate": 2.5837044567151412e-05, + "loss": 0.8757, + "step": 13363 + }, + { + "epoch": 0.9297019026748756, + "grad_norm": 0.9140625, + "learning_rate": 2.5786178682795204e-05, + "loss": 0.6357, + "step": 13364 + }, + { + "epoch": 0.9297714703120109, + "grad_norm": 1.1875, + "learning_rate": 2.5735362264106442e-05, + "loss": 0.7472, + "step": 13365 + }, + { + "epoch": 0.9298410379491461, + "grad_norm": 1.015625, + "learning_rate": 2.5684595313665405e-05, + "loss": 0.7725, + "step": 13366 + }, + { + "epoch": 0.9299106055862812, + "grad_norm": 1.21875, + "learning_rate": 2.5633877834049578e-05, + "loss": 0.9519, + "step": 13367 + }, + { + "epoch": 0.9299801732234164, + "grad_norm": 1.34375, + "learning_rate": 2.5583209827834353e-05, + "loss": 0.8959, + "step": 13368 + }, + { + "epoch": 0.9300497408605517, + "grad_norm": 0.95703125, + "learning_rate": 2.5532591297592333e-05, + "loss": 0.807, + "step": 13369 + }, + { + "epoch": 0.9301193084976869, + "grad_norm": 1.2421875, + "learning_rate": 2.5482022245893578e-05, + "loss": 0.9299, + "step": 13370 + }, + { + "epoch": 0.930188876134822, + "grad_norm": 0.80859375, + "learning_rate": 2.543150267530592e-05, + "loss": 0.661, + "step": 13371 + }, + { + "epoch": 0.9302584437719573, + "grad_norm": 1.0234375, + "learning_rate": 2.538103258839408e-05, + "loss": 0.862, + "step": 13372 + }, + { + "epoch": 0.9303280114090925, + "grad_norm": 0.96875, + "learning_rate": 2.533061198772124e-05, + "loss": 0.7233, + "step": 13373 + }, + { + "epoch": 0.9303975790462277, + "grad_norm": 1.4921875, + "learning_rate": 2.5280240875847126e-05, + "loss": 0.6801, + "step": 13374 + }, + { + "epoch": 0.930467146683363, + "grad_norm": 1.0390625, + "learning_rate": 2.522991925532958e-05, + "loss": 0.9739, + "step": 13375 + }, + { + "epoch": 0.9305367143204981, + "grad_norm": 1.4453125, + "learning_rate": 2.5179647128723337e-05, + "loss": 0.9727, + "step": 13376 + }, + { + "epoch": 0.9306062819576333, + "grad_norm": 1.2109375, + "learning_rate": 2.5129424498581132e-05, + "loss": 0.6165, + "step": 13377 + }, + { + "epoch": 0.9306758495947686, + "grad_norm": 1.2109375, + "learning_rate": 2.507925136745315e-05, + "loss": 0.6481, + "step": 13378 + }, + { + "epoch": 0.9307454172319037, + "grad_norm": 1.0234375, + "learning_rate": 2.5029127737886793e-05, + "loss": 0.8426, + "step": 13379 + }, + { + "epoch": 0.9308149848690389, + "grad_norm": 1.25, + "learning_rate": 2.497905361242714e-05, + "loss": 0.7517, + "step": 13380 + }, + { + "epoch": 0.9308845525061741, + "grad_norm": 1.2421875, + "learning_rate": 2.4929028993616598e-05, + "loss": 0.7481, + "step": 13381 + }, + { + "epoch": 0.9309541201433094, + "grad_norm": 1.015625, + "learning_rate": 2.487905388399525e-05, + "loss": 0.612, + "step": 13382 + }, + { + "epoch": 0.9310236877804445, + "grad_norm": 1.484375, + "learning_rate": 2.482912828610062e-05, + "loss": 0.7068, + "step": 13383 + }, + { + "epoch": 0.9310932554175797, + "grad_norm": 1.390625, + "learning_rate": 2.4779252202467685e-05, + "loss": 0.823, + "step": 13384 + }, + { + "epoch": 0.931162823054715, + "grad_norm": 1.046875, + "learning_rate": 2.4729425635628634e-05, + "loss": 0.6847, + "step": 13385 + }, + { + "epoch": 0.9312323906918502, + "grad_norm": 1.234375, + "learning_rate": 2.4679648588113777e-05, + "loss": 0.8414, + "step": 13386 + }, + { + "epoch": 0.9313019583289853, + "grad_norm": 1.0078125, + "learning_rate": 2.462992106245043e-05, + "loss": 0.9564, + "step": 13387 + }, + { + "epoch": 0.9313715259661206, + "grad_norm": 1.375, + "learning_rate": 2.4580243061163466e-05, + "loss": 0.709, + "step": 13388 + }, + { + "epoch": 0.9314410936032558, + "grad_norm": 0.92578125, + "learning_rate": 2.453061458677519e-05, + "loss": 0.7071, + "step": 13389 + }, + { + "epoch": 0.931510661240391, + "grad_norm": 0.8984375, + "learning_rate": 2.448103564180548e-05, + "loss": 0.6645, + "step": 13390 + }, + { + "epoch": 0.9315802288775262, + "grad_norm": 1.828125, + "learning_rate": 2.4431506228771993e-05, + "loss": 0.8114, + "step": 13391 + }, + { + "epoch": 0.9316497965146614, + "grad_norm": 1.0078125, + "learning_rate": 2.438202635018938e-05, + "loss": 0.735, + "step": 13392 + }, + { + "epoch": 0.9317193641517966, + "grad_norm": 1.046875, + "learning_rate": 2.4332596008569853e-05, + "loss": 0.9012, + "step": 13393 + }, + { + "epoch": 0.9317889317889317, + "grad_norm": 1.25, + "learning_rate": 2.4283215206423514e-05, + "loss": 0.8712, + "step": 13394 + }, + { + "epoch": 0.931858499426067, + "grad_norm": 1.1171875, + "learning_rate": 2.4233883946257364e-05, + "loss": 0.8293, + "step": 13395 + }, + { + "epoch": 0.9319280670632022, + "grad_norm": 1.15625, + "learning_rate": 2.4184602230576613e-05, + "loss": 0.698, + "step": 13396 + }, + { + "epoch": 0.9319976347003374, + "grad_norm": 1.125, + "learning_rate": 2.4135370061883045e-05, + "loss": 0.7936, + "step": 13397 + }, + { + "epoch": 0.9320672023374726, + "grad_norm": 1.21875, + "learning_rate": 2.4086187442676766e-05, + "loss": 0.9763, + "step": 13398 + }, + { + "epoch": 0.9321367699746078, + "grad_norm": 0.91015625, + "learning_rate": 2.403705437545489e-05, + "loss": 0.7281, + "step": 13399 + }, + { + "epoch": 0.932206337611743, + "grad_norm": 1.3046875, + "learning_rate": 2.3987970862712204e-05, + "loss": 0.7393, + "step": 13400 + }, + { + "epoch": 0.9322759052488783, + "grad_norm": 1.390625, + "learning_rate": 2.3938936906940824e-05, + "loss": 0.9169, + "step": 13401 + }, + { + "epoch": 0.9323454728860134, + "grad_norm": 1.3203125, + "learning_rate": 2.3889952510630643e-05, + "loss": 0.7412, + "step": 13402 + }, + { + "epoch": 0.9324150405231486, + "grad_norm": 1.0234375, + "learning_rate": 2.3841017676268673e-05, + "loss": 0.6227, + "step": 13403 + }, + { + "epoch": 0.9324846081602839, + "grad_norm": 1.1328125, + "learning_rate": 2.3792132406339485e-05, + "loss": 1.0202, + "step": 13404 + }, + { + "epoch": 0.9325541757974191, + "grad_norm": 1.2265625, + "learning_rate": 2.3743296703325533e-05, + "loss": 0.7722, + "step": 13405 + }, + { + "epoch": 0.9326237434345542, + "grad_norm": 1.390625, + "learning_rate": 2.3694510569706285e-05, + "loss": 1.0787, + "step": 13406 + }, + { + "epoch": 0.9326933110716894, + "grad_norm": 0.8671875, + "learning_rate": 2.3645774007958754e-05, + "loss": 0.6709, + "step": 13407 + }, + { + "epoch": 0.9327628787088247, + "grad_norm": 0.9453125, + "learning_rate": 2.3597087020557628e-05, + "loss": 0.6799, + "step": 13408 + }, + { + "epoch": 0.9328324463459599, + "grad_norm": 1.1015625, + "learning_rate": 2.354844960997493e-05, + "loss": 0.726, + "step": 13409 + }, + { + "epoch": 0.932902013983095, + "grad_norm": 0.9375, + "learning_rate": 2.3499861778680463e-05, + "loss": 1.0161, + "step": 13410 + }, + { + "epoch": 0.9329715816202303, + "grad_norm": 1.265625, + "learning_rate": 2.3451323529140923e-05, + "loss": 0.8431, + "step": 13411 + }, + { + "epoch": 0.9330411492573655, + "grad_norm": 1.0625, + "learning_rate": 2.340283486382111e-05, + "loss": 0.8835, + "step": 13412 + }, + { + "epoch": 0.9331107168945006, + "grad_norm": 1.046875, + "learning_rate": 2.3354395785182836e-05, + "loss": 0.6826, + "step": 13413 + }, + { + "epoch": 0.9331802845316359, + "grad_norm": 1.1015625, + "learning_rate": 2.330600629568569e-05, + "loss": 0.8793, + "step": 13414 + }, + { + "epoch": 0.9332498521687711, + "grad_norm": 1.0546875, + "learning_rate": 2.3257666397786702e-05, + "loss": 0.9135, + "step": 13415 + }, + { + "epoch": 0.9333194198059063, + "grad_norm": 0.9765625, + "learning_rate": 2.320937609394025e-05, + "loss": 0.6617, + "step": 13416 + }, + { + "epoch": 0.9333889874430416, + "grad_norm": 1.3515625, + "learning_rate": 2.3161135386598255e-05, + "loss": 0.8328, + "step": 13417 + }, + { + "epoch": 0.9334585550801767, + "grad_norm": 1.5859375, + "learning_rate": 2.31129442782102e-05, + "loss": 0.822, + "step": 13418 + }, + { + "epoch": 0.9335281227173119, + "grad_norm": 1.1796875, + "learning_rate": 2.3064802771223026e-05, + "loss": 0.5763, + "step": 13419 + }, + { + "epoch": 0.9335976903544471, + "grad_norm": 1.0234375, + "learning_rate": 2.301671086808099e-05, + "loss": 0.7431, + "step": 13420 + }, + { + "epoch": 0.9336672579915823, + "grad_norm": 0.94140625, + "learning_rate": 2.2968668571226038e-05, + "loss": 0.7305, + "step": 13421 + }, + { + "epoch": 0.9337368256287175, + "grad_norm": 1.125, + "learning_rate": 2.292067588309732e-05, + "loss": 0.6679, + "step": 13422 + }, + { + "epoch": 0.9338063932658527, + "grad_norm": 1.359375, + "learning_rate": 2.287273280613211e-05, + "loss": 0.9048, + "step": 13423 + }, + { + "epoch": 0.933875960902988, + "grad_norm": 1.0390625, + "learning_rate": 2.282483934276436e-05, + "loss": 0.873, + "step": 13424 + }, + { + "epoch": 0.9339455285401231, + "grad_norm": 1.1640625, + "learning_rate": 2.2776995495425778e-05, + "loss": 0.6686, + "step": 13425 + }, + { + "epoch": 0.9340150961772583, + "grad_norm": 1.1328125, + "learning_rate": 2.2729201266545983e-05, + "loss": 0.7767, + "step": 13426 + }, + { + "epoch": 0.9340846638143936, + "grad_norm": 1.265625, + "learning_rate": 2.268145665855148e-05, + "loss": 0.8113, + "step": 13427 + }, + { + "epoch": 0.9341542314515288, + "grad_norm": 1.4296875, + "learning_rate": 2.2633761673866548e-05, + "loss": 0.8365, + "step": 13428 + }, + { + "epoch": 0.9342237990886639, + "grad_norm": 0.859375, + "learning_rate": 2.2586116314912807e-05, + "loss": 0.7165, + "step": 13429 + }, + { + "epoch": 0.9342933667257992, + "grad_norm": 1.3515625, + "learning_rate": 2.2538520584109766e-05, + "loss": 0.7445, + "step": 13430 + }, + { + "epoch": 0.9343629343629344, + "grad_norm": 1.0078125, + "learning_rate": 2.2490974483873715e-05, + "loss": 0.788, + "step": 13431 + }, + { + "epoch": 0.9344325020000696, + "grad_norm": 1.1640625, + "learning_rate": 2.2443478016618945e-05, + "loss": 0.8022, + "step": 13432 + }, + { + "epoch": 0.9345020696372047, + "grad_norm": 1.1171875, + "learning_rate": 2.2396031184757193e-05, + "loss": 0.8723, + "step": 13433 + }, + { + "epoch": 0.93457163727434, + "grad_norm": 1.015625, + "learning_rate": 2.234863399069753e-05, + "loss": 0.7293, + "step": 13434 + }, + { + "epoch": 0.9346412049114752, + "grad_norm": 1.1953125, + "learning_rate": 2.230128643684648e-05, + "loss": 0.659, + "step": 13435 + }, + { + "epoch": 0.9347107725486103, + "grad_norm": 1.1171875, + "learning_rate": 2.2253988525608004e-05, + "loss": 0.6691, + "step": 13436 + }, + { + "epoch": 0.9347803401857456, + "grad_norm": 1.390625, + "learning_rate": 2.2206740259383963e-05, + "loss": 1.1001, + "step": 13437 + }, + { + "epoch": 0.9348499078228808, + "grad_norm": 1.03125, + "learning_rate": 2.2159541640573212e-05, + "loss": 0.7743, + "step": 13438 + }, + { + "epoch": 0.934919475460016, + "grad_norm": 1.1640625, + "learning_rate": 2.2112392671572058e-05, + "loss": 0.9172, + "step": 13439 + }, + { + "epoch": 0.9349890430971513, + "grad_norm": 1.1640625, + "learning_rate": 2.2065293354774916e-05, + "loss": 0.8179, + "step": 13440 + }, + { + "epoch": 0.9350586107342864, + "grad_norm": 0.8515625, + "learning_rate": 2.201824369257288e-05, + "loss": 0.6874, + "step": 13441 + }, + { + "epoch": 0.9351281783714216, + "grad_norm": 0.9765625, + "learning_rate": 2.1971243687355034e-05, + "loss": 0.7957, + "step": 13442 + }, + { + "epoch": 0.9351977460085569, + "grad_norm": 1.2109375, + "learning_rate": 2.1924293341507804e-05, + "loss": 0.9496, + "step": 13443 + }, + { + "epoch": 0.935267313645692, + "grad_norm": 1.25, + "learning_rate": 2.1877392657415172e-05, + "loss": 0.8483, + "step": 13444 + }, + { + "epoch": 0.9353368812828272, + "grad_norm": 0.875, + "learning_rate": 2.1830541637458347e-05, + "loss": 0.7571, + "step": 13445 + }, + { + "epoch": 0.9354064489199624, + "grad_norm": 1.1328125, + "learning_rate": 2.1783740284016306e-05, + "loss": 0.9877, + "step": 13446 + }, + { + "epoch": 0.9354760165570977, + "grad_norm": 1.375, + "learning_rate": 2.173698859946538e-05, + "loss": 1.0809, + "step": 13447 + }, + { + "epoch": 0.9355455841942328, + "grad_norm": 1.875, + "learning_rate": 2.169028658617944e-05, + "loss": 0.8189, + "step": 13448 + }, + { + "epoch": 0.935615151831368, + "grad_norm": 1.1171875, + "learning_rate": 2.1643634246529597e-05, + "loss": 0.7207, + "step": 13449 + }, + { + "epoch": 0.9356847194685033, + "grad_norm": 1.125, + "learning_rate": 2.159703158288462e-05, + "loss": 0.608, + "step": 13450 + }, + { + "epoch": 0.9357542871056385, + "grad_norm": 1.1328125, + "learning_rate": 2.1550478597611055e-05, + "loss": 0.6458, + "step": 13451 + }, + { + "epoch": 0.9358238547427736, + "grad_norm": 1.171875, + "learning_rate": 2.1503975293072466e-05, + "loss": 0.7931, + "step": 13452 + }, + { + "epoch": 0.9358934223799089, + "grad_norm": 1.1171875, + "learning_rate": 2.1457521671629842e-05, + "loss": 0.8415, + "step": 13453 + }, + { + "epoch": 0.9359629900170441, + "grad_norm": 1.0390625, + "learning_rate": 2.1411117735642194e-05, + "loss": 0.7379, + "step": 13454 + }, + { + "epoch": 0.9360325576541793, + "grad_norm": 1.2734375, + "learning_rate": 2.136476348746541e-05, + "loss": 0.7582, + "step": 13455 + }, + { + "epoch": 0.9361021252913145, + "grad_norm": 1.1875, + "learning_rate": 2.1318458929453388e-05, + "loss": 0.8989, + "step": 13456 + }, + { + "epoch": 0.9361716929284497, + "grad_norm": 1.2578125, + "learning_rate": 2.1272204063957022e-05, + "loss": 0.571, + "step": 13457 + }, + { + "epoch": 0.9362412605655849, + "grad_norm": 1.109375, + "learning_rate": 2.1225998893324993e-05, + "loss": 0.6005, + "step": 13458 + }, + { + "epoch": 0.93631082820272, + "grad_norm": 1.46875, + "learning_rate": 2.117984341990331e-05, + "loss": 0.7056, + "step": 13459 + }, + { + "epoch": 0.9363803958398553, + "grad_norm": 1.1953125, + "learning_rate": 2.1133737646035544e-05, + "loss": 0.7055, + "step": 13460 + }, + { + "epoch": 0.9364499634769905, + "grad_norm": 0.984375, + "learning_rate": 2.1087681574062824e-05, + "loss": 0.6748, + "step": 13461 + }, + { + "epoch": 0.9365195311141257, + "grad_norm": 0.984375, + "learning_rate": 2.1041675206323498e-05, + "loss": 0.7552, + "step": 13462 + }, + { + "epoch": 0.936589098751261, + "grad_norm": 1.3828125, + "learning_rate": 2.0995718545153585e-05, + "loss": 0.9645, + "step": 13463 + }, + { + "epoch": 0.9366586663883961, + "grad_norm": 1.4765625, + "learning_rate": 2.094981159288656e-05, + "loss": 0.7061, + "step": 13464 + }, + { + "epoch": 0.9367282340255313, + "grad_norm": 1.546875, + "learning_rate": 2.0903954351853328e-05, + "loss": 0.7951, + "step": 13465 + }, + { + "epoch": 0.9367978016626666, + "grad_norm": 1.0546875, + "learning_rate": 2.085814682438225e-05, + "loss": 0.8461, + "step": 13466 + }, + { + "epoch": 0.9368673692998017, + "grad_norm": 1.015625, + "learning_rate": 2.0812389012799248e-05, + "loss": 0.7609, + "step": 13467 + }, + { + "epoch": 0.9369369369369369, + "grad_norm": 1.078125, + "learning_rate": 2.0766680919427682e-05, + "loss": 0.7872, + "step": 13468 + }, + { + "epoch": 0.9370065045740722, + "grad_norm": 1.6796875, + "learning_rate": 2.0721022546588362e-05, + "loss": 0.8846, + "step": 13469 + }, + { + "epoch": 0.9370760722112074, + "grad_norm": 1.3125, + "learning_rate": 2.0675413896599548e-05, + "loss": 0.7828, + "step": 13470 + }, + { + "epoch": 0.9371456398483425, + "grad_norm": 1.078125, + "learning_rate": 2.0629854971777053e-05, + "loss": 0.5559, + "step": 13471 + }, + { + "epoch": 0.9372152074854777, + "grad_norm": 1.2734375, + "learning_rate": 2.0584345774434243e-05, + "loss": 0.8942, + "step": 13472 + }, + { + "epoch": 0.937284775122613, + "grad_norm": 0.9296875, + "learning_rate": 2.053888630688161e-05, + "loss": 0.7171, + "step": 13473 + }, + { + "epoch": 0.9373543427597482, + "grad_norm": 1.34375, + "learning_rate": 2.0493476571427526e-05, + "loss": 0.8903, + "step": 13474 + }, + { + "epoch": 0.9374239103968833, + "grad_norm": 1.328125, + "learning_rate": 2.0448116570377596e-05, + "loss": 0.8405, + "step": 13475 + }, + { + "epoch": 0.9374934780340186, + "grad_norm": 1.078125, + "learning_rate": 2.0402806306034973e-05, + "loss": 0.5641, + "step": 13476 + }, + { + "epoch": 0.9375630456711538, + "grad_norm": 1.046875, + "learning_rate": 2.035754578070037e-05, + "loss": 0.7364, + "step": 13477 + }, + { + "epoch": 0.937632613308289, + "grad_norm": 1.3046875, + "learning_rate": 2.0312334996671734e-05, + "loss": 0.9378, + "step": 13478 + }, + { + "epoch": 0.9377021809454242, + "grad_norm": 1.03125, + "learning_rate": 2.0267173956244887e-05, + "loss": 0.7104, + "step": 13479 + }, + { + "epoch": 0.9377717485825594, + "grad_norm": 1.484375, + "learning_rate": 2.022206266171267e-05, + "loss": 0.8865, + "step": 13480 + }, + { + "epoch": 0.9378413162196946, + "grad_norm": 1.2890625, + "learning_rate": 2.017700111536558e-05, + "loss": 0.7749, + "step": 13481 + }, + { + "epoch": 0.9379108838568299, + "grad_norm": 1.0859375, + "learning_rate": 2.0131989319491784e-05, + "loss": 0.9203, + "step": 13482 + }, + { + "epoch": 0.937980451493965, + "grad_norm": 1.40625, + "learning_rate": 2.008702727637668e-05, + "loss": 1.0085, + "step": 13483 + }, + { + "epoch": 0.9380500191311002, + "grad_norm": 1.0703125, + "learning_rate": 2.0042114988303217e-05, + "loss": 0.7835, + "step": 13484 + }, + { + "epoch": 0.9381195867682354, + "grad_norm": 1.09375, + "learning_rate": 1.9997252457551685e-05, + "loss": 0.7453, + "step": 13485 + }, + { + "epoch": 0.9381891544053707, + "grad_norm": 1.0546875, + "learning_rate": 1.9952439686400148e-05, + "loss": 0.5907, + "step": 13486 + }, + { + "epoch": 0.9382587220425058, + "grad_norm": 0.9765625, + "learning_rate": 1.9907676677123898e-05, + "loss": 0.6764, + "step": 13487 + }, + { + "epoch": 0.938328289679641, + "grad_norm": 1.3984375, + "learning_rate": 1.9862963431995895e-05, + "loss": 0.912, + "step": 13488 + }, + { + "epoch": 0.9383978573167763, + "grad_norm": 1.0703125, + "learning_rate": 1.981829995328621e-05, + "loss": 0.8614, + "step": 13489 + }, + { + "epoch": 0.9384674249539114, + "grad_norm": 1.328125, + "learning_rate": 1.9773686243262924e-05, + "loss": 1.0099, + "step": 13490 + }, + { + "epoch": 0.9385369925910466, + "grad_norm": 1.0859375, + "learning_rate": 1.9729122304191104e-05, + "loss": 0.6065, + "step": 13491 + }, + { + "epoch": 0.9386065602281819, + "grad_norm": 1.546875, + "learning_rate": 1.9684608138333392e-05, + "loss": 0.9515, + "step": 13492 + }, + { + "epoch": 0.9386761278653171, + "grad_norm": 1.0390625, + "learning_rate": 1.9640143747950312e-05, + "loss": 0.6751, + "step": 13493 + }, + { + "epoch": 0.9387456955024522, + "grad_norm": 1.0, + "learning_rate": 1.959572913529928e-05, + "loss": 0.6923, + "step": 13494 + }, + { + "epoch": 0.9388152631395875, + "grad_norm": 1.2421875, + "learning_rate": 1.9551364302635377e-05, + "loss": 1.0487, + "step": 13495 + }, + { + "epoch": 0.9388848307767227, + "grad_norm": 1.3359375, + "learning_rate": 1.9507049252211472e-05, + "loss": 0.9824, + "step": 13496 + }, + { + "epoch": 0.9389543984138579, + "grad_norm": 1.15625, + "learning_rate": 1.9462783986277655e-05, + "loss": 0.9057, + "step": 13497 + }, + { + "epoch": 0.939023966050993, + "grad_norm": 0.85546875, + "learning_rate": 1.9418568507081346e-05, + "loss": 0.6693, + "step": 13498 + }, + { + "epoch": 0.9390935336881283, + "grad_norm": 1.0, + "learning_rate": 1.937440281686753e-05, + "loss": 0.5237, + "step": 13499 + }, + { + "epoch": 0.9391631013252635, + "grad_norm": 0.87890625, + "learning_rate": 1.933028691787886e-05, + "loss": 0.6226, + "step": 13500 + }, + { + "epoch": 0.9392326689623987, + "grad_norm": 1.2734375, + "learning_rate": 1.9286220812355317e-05, + "loss": 0.77, + "step": 13501 + }, + { + "epoch": 0.9393022365995339, + "grad_norm": 0.9453125, + "learning_rate": 1.9242204502534332e-05, + "loss": 0.6295, + "step": 13502 + }, + { + "epoch": 0.9393718042366691, + "grad_norm": 1.2578125, + "learning_rate": 1.9198237990650792e-05, + "loss": 0.6855, + "step": 13503 + }, + { + "epoch": 0.9394413718738043, + "grad_norm": 1.0234375, + "learning_rate": 1.9154321278937126e-05, + "loss": 0.7297, + "step": 13504 + }, + { + "epoch": 0.9395109395109396, + "grad_norm": 1.0390625, + "learning_rate": 1.911045436962322e-05, + "loss": 0.8111, + "step": 13505 + }, + { + "epoch": 0.9395805071480747, + "grad_norm": 1.0703125, + "learning_rate": 1.9066637264936293e-05, + "loss": 0.7445, + "step": 13506 + }, + { + "epoch": 0.9396500747852099, + "grad_norm": 1.1796875, + "learning_rate": 1.902286996710134e-05, + "loss": 0.8456, + "step": 13507 + }, + { + "epoch": 0.9397196424223452, + "grad_norm": 1.203125, + "learning_rate": 1.8979152478340588e-05, + "loss": 0.8239, + "step": 13508 + }, + { + "epoch": 0.9397892100594804, + "grad_norm": 1.1171875, + "learning_rate": 1.8935484800873702e-05, + "loss": 0.6998, + "step": 13509 + }, + { + "epoch": 0.9398587776966155, + "grad_norm": 1.0546875, + "learning_rate": 1.8891866936917913e-05, + "loss": 0.8455, + "step": 13510 + }, + { + "epoch": 0.9399283453337507, + "grad_norm": 1.140625, + "learning_rate": 1.8848298888688108e-05, + "loss": 0.7153, + "step": 13511 + }, + { + "epoch": 0.939997912970886, + "grad_norm": 1.0546875, + "learning_rate": 1.8804780658396303e-05, + "loss": 0.9088, + "step": 13512 + }, + { + "epoch": 0.9400674806080211, + "grad_norm": 1.0234375, + "learning_rate": 1.876131224825195e-05, + "loss": 0.6779, + "step": 13513 + }, + { + "epoch": 0.9401370482451563, + "grad_norm": 1.1796875, + "learning_rate": 1.8717893660462502e-05, + "loss": 0.6937, + "step": 13514 + }, + { + "epoch": 0.9402066158822916, + "grad_norm": 0.98046875, + "learning_rate": 1.8674524897232427e-05, + "loss": 0.6491, + "step": 13515 + }, + { + "epoch": 0.9402761835194268, + "grad_norm": 0.9609375, + "learning_rate": 1.863120596076373e-05, + "loss": 0.7033, + "step": 13516 + }, + { + "epoch": 0.9403457511565619, + "grad_norm": 1.15625, + "learning_rate": 1.858793685325577e-05, + "loss": 0.6249, + "step": 13517 + }, + { + "epoch": 0.9404153187936972, + "grad_norm": 1.2265625, + "learning_rate": 1.85447175769059e-05, + "loss": 0.7647, + "step": 13518 + }, + { + "epoch": 0.9404848864308324, + "grad_norm": 1.28125, + "learning_rate": 1.850154813390814e-05, + "loss": 0.8839, + "step": 13519 + }, + { + "epoch": 0.9405544540679676, + "grad_norm": 1.8203125, + "learning_rate": 1.845842852645474e-05, + "loss": 1.343, + "step": 13520 + }, + { + "epoch": 0.9406240217051027, + "grad_norm": 1.125, + "learning_rate": 1.8415358756735168e-05, + "loss": 0.6996, + "step": 13521 + }, + { + "epoch": 0.940693589342238, + "grad_norm": 1.3515625, + "learning_rate": 1.8372338826936007e-05, + "loss": 0.9822, + "step": 13522 + }, + { + "epoch": 0.9407631569793732, + "grad_norm": 1.0, + "learning_rate": 1.8329368739241625e-05, + "loss": 0.5543, + "step": 13523 + }, + { + "epoch": 0.9408327246165084, + "grad_norm": 1.1015625, + "learning_rate": 1.828644849583394e-05, + "loss": 0.9282, + "step": 13524 + }, + { + "epoch": 0.9409022922536436, + "grad_norm": 1.453125, + "learning_rate": 1.8243578098892322e-05, + "loss": 0.9974, + "step": 13525 + }, + { + "epoch": 0.9409718598907788, + "grad_norm": 1.2265625, + "learning_rate": 1.820075755059336e-05, + "loss": 0.7867, + "step": 13526 + }, + { + "epoch": 0.941041427527914, + "grad_norm": 1.1484375, + "learning_rate": 1.8157986853111208e-05, + "loss": 1.1717, + "step": 13527 + }, + { + "epoch": 0.9411109951650493, + "grad_norm": 1.1015625, + "learning_rate": 1.811526600861757e-05, + "loss": 0.8979, + "step": 13528 + }, + { + "epoch": 0.9411805628021844, + "grad_norm": 0.97265625, + "learning_rate": 1.8072595019281824e-05, + "loss": 0.5598, + "step": 13529 + }, + { + "epoch": 0.9412501304393196, + "grad_norm": 1.5078125, + "learning_rate": 1.8029973887270344e-05, + "loss": 1.0097, + "step": 13530 + }, + { + "epoch": 0.9413196980764549, + "grad_norm": 1.046875, + "learning_rate": 1.7987402614747296e-05, + "loss": 0.7811, + "step": 13531 + }, + { + "epoch": 0.94138926571359, + "grad_norm": 1.09375, + "learning_rate": 1.7944881203874162e-05, + "loss": 0.7663, + "step": 13532 + }, + { + "epoch": 0.9414588333507252, + "grad_norm": 1.421875, + "learning_rate": 1.7902409656810226e-05, + "loss": 0.9591, + "step": 13533 + }, + { + "epoch": 0.9415284009878604, + "grad_norm": 1.171875, + "learning_rate": 1.7859987975711644e-05, + "loss": 0.8446, + "step": 13534 + }, + { + "epoch": 0.9415979686249957, + "grad_norm": 1.2265625, + "learning_rate": 1.7817616162732587e-05, + "loss": 0.7937, + "step": 13535 + }, + { + "epoch": 0.9416675362621308, + "grad_norm": 1.234375, + "learning_rate": 1.777529422002444e-05, + "loss": 1.1306, + "step": 13536 + }, + { + "epoch": 0.941737103899266, + "grad_norm": 0.9921875, + "learning_rate": 1.7733022149735934e-05, + "loss": 0.5752, + "step": 13537 + }, + { + "epoch": 0.9418066715364013, + "grad_norm": 1.0390625, + "learning_rate": 1.769079995401357e-05, + "loss": 0.7649, + "step": 13538 + }, + { + "epoch": 0.9418762391735365, + "grad_norm": 1.515625, + "learning_rate": 1.76486276350013e-05, + "loss": 0.9554, + "step": 13539 + }, + { + "epoch": 0.9419458068106716, + "grad_norm": 1.140625, + "learning_rate": 1.7606505194840304e-05, + "loss": 0.9893, + "step": 13540 + }, + { + "epoch": 0.9420153744478069, + "grad_norm": 1.0859375, + "learning_rate": 1.7564432635669314e-05, + "loss": 0.7798, + "step": 13541 + }, + { + "epoch": 0.9420849420849421, + "grad_norm": 1.21875, + "learning_rate": 1.752240995962451e-05, + "loss": 0.8572, + "step": 13542 + }, + { + "epoch": 0.9421545097220773, + "grad_norm": 1.078125, + "learning_rate": 1.7480437168839847e-05, + "loss": 0.9295, + "step": 13543 + }, + { + "epoch": 0.9422240773592125, + "grad_norm": 1.1953125, + "learning_rate": 1.743851426544618e-05, + "loss": 0.7993, + "step": 13544 + }, + { + "epoch": 0.9422936449963477, + "grad_norm": 1.203125, + "learning_rate": 1.7396641251572364e-05, + "loss": 0.8911, + "step": 13545 + }, + { + "epoch": 0.9423632126334829, + "grad_norm": 1.3046875, + "learning_rate": 1.7354818129344253e-05, + "loss": 0.9248, + "step": 13546 + }, + { + "epoch": 0.942432780270618, + "grad_norm": 1.1484375, + "learning_rate": 1.731304490088581e-05, + "loss": 0.7356, + "step": 13547 + }, + { + "epoch": 0.9425023479077533, + "grad_norm": 1.0078125, + "learning_rate": 1.7271321568317677e-05, + "loss": 0.5402, + "step": 13548 + }, + { + "epoch": 0.9425719155448885, + "grad_norm": 1.0, + "learning_rate": 1.72296481337586e-05, + "loss": 0.5653, + "step": 13549 + }, + { + "epoch": 0.9426414831820237, + "grad_norm": 1.0625, + "learning_rate": 1.7188024599324448e-05, + "loss": 0.7083, + "step": 13550 + }, + { + "epoch": 0.942711050819159, + "grad_norm": 1.1328125, + "learning_rate": 1.7146450967128635e-05, + "loss": 0.8384, + "step": 13551 + }, + { + "epoch": 0.9427806184562941, + "grad_norm": 1.15625, + "learning_rate": 1.710492723928203e-05, + "loss": 0.6302, + "step": 13552 + }, + { + "epoch": 0.9428501860934293, + "grad_norm": 1.2890625, + "learning_rate": 1.7063453417893173e-05, + "loss": 0.7427, + "step": 13553 + }, + { + "epoch": 0.9429197537305646, + "grad_norm": 1.078125, + "learning_rate": 1.7022029505067816e-05, + "loss": 0.7552, + "step": 13554 + }, + { + "epoch": 0.9429893213676998, + "grad_norm": 1.0390625, + "learning_rate": 1.698065550290906e-05, + "loss": 0.6747, + "step": 13555 + }, + { + "epoch": 0.9430588890048349, + "grad_norm": 1.1015625, + "learning_rate": 1.693933141351789e-05, + "loss": 0.7567, + "step": 13556 + }, + { + "epoch": 0.9431284566419702, + "grad_norm": 1.0234375, + "learning_rate": 1.6898057238992625e-05, + "loss": 0.7553, + "step": 13557 + }, + { + "epoch": 0.9431980242791054, + "grad_norm": 1.109375, + "learning_rate": 1.6856832981428706e-05, + "loss": 0.7016, + "step": 13558 + }, + { + "epoch": 0.9432675919162405, + "grad_norm": 0.94140625, + "learning_rate": 1.681565864291934e-05, + "loss": 0.7851, + "step": 13559 + }, + { + "epoch": 0.9433371595533757, + "grad_norm": 1.0859375, + "learning_rate": 1.6774534225555194e-05, + "loss": 0.9058, + "step": 13560 + }, + { + "epoch": 0.943406727190511, + "grad_norm": 1.390625, + "learning_rate": 1.6733459731424594e-05, + "loss": 0.7591, + "step": 13561 + }, + { + "epoch": 0.9434762948276462, + "grad_norm": 1.09375, + "learning_rate": 1.6692435162612764e-05, + "loss": 0.6447, + "step": 13562 + }, + { + "epoch": 0.9435458624647813, + "grad_norm": 1.1796875, + "learning_rate": 1.66514605212027e-05, + "loss": 1.0391, + "step": 13563 + }, + { + "epoch": 0.9436154301019166, + "grad_norm": 1.4296875, + "learning_rate": 1.6610535809275185e-05, + "loss": 0.7364, + "step": 13564 + }, + { + "epoch": 0.9436849977390518, + "grad_norm": 1.0625, + "learning_rate": 1.6569661028908e-05, + "loss": 0.6518, + "step": 13565 + }, + { + "epoch": 0.943754565376187, + "grad_norm": 1.2109375, + "learning_rate": 1.6528836182176487e-05, + "loss": 0.8753, + "step": 13566 + }, + { + "epoch": 0.9438241330133222, + "grad_norm": 1.15625, + "learning_rate": 1.6488061271153653e-05, + "loss": 0.9546, + "step": 13567 + }, + { + "epoch": 0.9438937006504574, + "grad_norm": 0.859375, + "learning_rate": 1.6447336297909842e-05, + "loss": 0.7661, + "step": 13568 + }, + { + "epoch": 0.9439632682875926, + "grad_norm": 1.1484375, + "learning_rate": 1.6406661264512733e-05, + "loss": 0.8817, + "step": 13569 + }, + { + "epoch": 0.9440328359247279, + "grad_norm": 0.9375, + "learning_rate": 1.6366036173027676e-05, + "loss": 0.7684, + "step": 13570 + }, + { + "epoch": 0.944102403561863, + "grad_norm": 1.3671875, + "learning_rate": 1.6325461025517574e-05, + "loss": 0.9332, + "step": 13571 + }, + { + "epoch": 0.9441719711989982, + "grad_norm": 1.3671875, + "learning_rate": 1.6284935824042447e-05, + "loss": 0.7933, + "step": 13572 + }, + { + "epoch": 0.9442415388361334, + "grad_norm": 1.3671875, + "learning_rate": 1.624446057065987e-05, + "loss": 0.8663, + "step": 13573 + }, + { + "epoch": 0.9443111064732687, + "grad_norm": 1.3203125, + "learning_rate": 1.6204035267425088e-05, + "loss": 0.9225, + "step": 13574 + }, + { + "epoch": 0.9443806741104038, + "grad_norm": 1.4296875, + "learning_rate": 1.6163659916390794e-05, + "loss": 1.2551, + "step": 13575 + }, + { + "epoch": 0.944450241747539, + "grad_norm": 0.9921875, + "learning_rate": 1.61233345196069e-05, + "loss": 0.6812, + "step": 13576 + }, + { + "epoch": 0.9445198093846743, + "grad_norm": 1.0625, + "learning_rate": 1.6083059079121e-05, + "loss": 0.5759, + "step": 13577 + }, + { + "epoch": 0.9445893770218095, + "grad_norm": 1.2890625, + "learning_rate": 1.6042833596978e-05, + "loss": 0.8327, + "step": 13578 + }, + { + "epoch": 0.9446589446589446, + "grad_norm": 0.8984375, + "learning_rate": 1.600265807522039e-05, + "loss": 0.9751, + "step": 13579 + }, + { + "epoch": 0.9447285122960799, + "grad_norm": 1.1640625, + "learning_rate": 1.5962532515888086e-05, + "loss": 0.964, + "step": 13580 + }, + { + "epoch": 0.9447980799332151, + "grad_norm": 1.140625, + "learning_rate": 1.592245692101857e-05, + "loss": 0.7652, + "step": 13581 + }, + { + "epoch": 0.9448676475703502, + "grad_norm": 1.265625, + "learning_rate": 1.588243129264655e-05, + "loss": 1.1237, + "step": 13582 + }, + { + "epoch": 0.9449372152074855, + "grad_norm": 1.0390625, + "learning_rate": 1.5842455632804288e-05, + "loss": 0.6775, + "step": 13583 + }, + { + "epoch": 0.9450067828446207, + "grad_norm": 1.0390625, + "learning_rate": 1.5802529943521604e-05, + "loss": 0.6229, + "step": 13584 + }, + { + "epoch": 0.9450763504817559, + "grad_norm": 1.2265625, + "learning_rate": 1.576265422682577e-05, + "loss": 0.6356, + "step": 13585 + }, + { + "epoch": 0.945145918118891, + "grad_norm": 1.2890625, + "learning_rate": 1.5722828484741382e-05, + "loss": 0.9438, + "step": 13586 + }, + { + "epoch": 0.9452154857560263, + "grad_norm": 1.1796875, + "learning_rate": 1.5683052719290714e-05, + "loss": 0.9678, + "step": 13587 + }, + { + "epoch": 0.9452850533931615, + "grad_norm": 1.109375, + "learning_rate": 1.564332693249315e-05, + "loss": 0.9181, + "step": 13588 + }, + { + "epoch": 0.9453546210302967, + "grad_norm": 1.0390625, + "learning_rate": 1.560365112636608e-05, + "loss": 0.927, + "step": 13589 + }, + { + "epoch": 0.9454241886674319, + "grad_norm": 0.91796875, + "learning_rate": 1.556402530292389e-05, + "loss": 0.9889, + "step": 13590 + }, + { + "epoch": 0.9454937563045671, + "grad_norm": 1.2109375, + "learning_rate": 1.5524449464178413e-05, + "loss": 0.8351, + "step": 13591 + }, + { + "epoch": 0.9455633239417023, + "grad_norm": 0.94140625, + "learning_rate": 1.548492361213938e-05, + "loss": 0.7599, + "step": 13592 + }, + { + "epoch": 0.9456328915788376, + "grad_norm": 1.0859375, + "learning_rate": 1.5445447748813624e-05, + "loss": 0.7491, + "step": 13593 + }, + { + "epoch": 0.9457024592159727, + "grad_norm": 1.25, + "learning_rate": 1.5406021876205435e-05, + "loss": 0.8565, + "step": 13594 + }, + { + "epoch": 0.9457720268531079, + "grad_norm": 1.046875, + "learning_rate": 1.5366645996316764e-05, + "loss": 0.8743, + "step": 13595 + }, + { + "epoch": 0.9458415944902432, + "grad_norm": 1.15625, + "learning_rate": 1.5327320111146904e-05, + "loss": 0.8134, + "step": 13596 + }, + { + "epoch": 0.9459111621273784, + "grad_norm": 1.28125, + "learning_rate": 1.528804422269259e-05, + "loss": 0.8351, + "step": 13597 + }, + { + "epoch": 0.9459807297645135, + "grad_norm": 1.359375, + "learning_rate": 1.5248818332948e-05, + "loss": 1.1409, + "step": 13598 + }, + { + "epoch": 0.9460502974016487, + "grad_norm": 1.6171875, + "learning_rate": 1.5209642443905103e-05, + "loss": 0.7357, + "step": 13599 + }, + { + "epoch": 0.946119865038784, + "grad_norm": 1.1796875, + "learning_rate": 1.517051655755275e-05, + "loss": 0.8461, + "step": 13600 + }, + { + "epoch": 0.9461894326759192, + "grad_norm": 1.0390625, + "learning_rate": 1.5131440675877572e-05, + "loss": 0.7245, + "step": 13601 + }, + { + "epoch": 0.9462590003130543, + "grad_norm": 1.21875, + "learning_rate": 1.5092414800863763e-05, + "loss": 0.8758, + "step": 13602 + }, + { + "epoch": 0.9463285679501896, + "grad_norm": 1.03125, + "learning_rate": 1.5053438934492958e-05, + "loss": 0.8778, + "step": 13603 + }, + { + "epoch": 0.9463981355873248, + "grad_norm": 1.2265625, + "learning_rate": 1.5014513078743907e-05, + "loss": 0.8159, + "step": 13604 + }, + { + "epoch": 0.9464677032244599, + "grad_norm": 1.109375, + "learning_rate": 1.4975637235593253e-05, + "loss": 0.7748, + "step": 13605 + }, + { + "epoch": 0.9465372708615952, + "grad_norm": 1.140625, + "learning_rate": 1.493681140701475e-05, + "loss": 0.7375, + "step": 13606 + }, + { + "epoch": 0.9466068384987304, + "grad_norm": 1.0859375, + "learning_rate": 1.4898035594979931e-05, + "loss": 0.7438, + "step": 13607 + }, + { + "epoch": 0.9466764061358656, + "grad_norm": 1.0625, + "learning_rate": 1.4859309801457555e-05, + "loss": 0.7884, + "step": 13608 + }, + { + "epoch": 0.9467459737730008, + "grad_norm": 1.6796875, + "learning_rate": 1.4820634028414049e-05, + "loss": 0.7238, + "step": 13609 + }, + { + "epoch": 0.946815541410136, + "grad_norm": 1.015625, + "learning_rate": 1.4782008277812953e-05, + "loss": 0.937, + "step": 13610 + }, + { + "epoch": 0.9468851090472712, + "grad_norm": 0.96484375, + "learning_rate": 1.4743432551615698e-05, + "loss": 0.8037, + "step": 13611 + }, + { + "epoch": 0.9469546766844064, + "grad_norm": 0.99609375, + "learning_rate": 1.470490685178083e-05, + "loss": 0.7272, + "step": 13612 + }, + { + "epoch": 0.9470242443215416, + "grad_norm": 0.92578125, + "learning_rate": 1.4666431180264561e-05, + "loss": 0.777, + "step": 13613 + }, + { + "epoch": 0.9470938119586768, + "grad_norm": 1.125, + "learning_rate": 1.4628005539020551e-05, + "loss": 0.8623, + "step": 13614 + }, + { + "epoch": 0.947163379595812, + "grad_norm": 1.0859375, + "learning_rate": 1.458962992999957e-05, + "loss": 0.6876, + "step": 13615 + }, + { + "epoch": 0.9472329472329473, + "grad_norm": 1.0703125, + "learning_rate": 1.4551304355150396e-05, + "loss": 1.0869, + "step": 13616 + }, + { + "epoch": 0.9473025148700824, + "grad_norm": 1.28125, + "learning_rate": 1.4513028816419138e-05, + "loss": 0.9281, + "step": 13617 + }, + { + "epoch": 0.9473720825072176, + "grad_norm": 1.1015625, + "learning_rate": 1.4474803315748908e-05, + "loss": 0.7179, + "step": 13618 + }, + { + "epoch": 0.9474416501443529, + "grad_norm": 0.9765625, + "learning_rate": 1.443662785508082e-05, + "loss": 0.8745, + "step": 13619 + }, + { + "epoch": 0.9475112177814881, + "grad_norm": 1.0859375, + "learning_rate": 1.43985024363531e-05, + "loss": 0.6506, + "step": 13620 + }, + { + "epoch": 0.9475807854186232, + "grad_norm": 0.92578125, + "learning_rate": 1.4360427061501646e-05, + "loss": 0.6436, + "step": 13621 + }, + { + "epoch": 0.9476503530557585, + "grad_norm": 1.1953125, + "learning_rate": 1.432240173245969e-05, + "loss": 0.8159, + "step": 13622 + }, + { + "epoch": 0.9477199206928937, + "grad_norm": 1.0625, + "learning_rate": 1.4284426451158018e-05, + "loss": 0.9967, + "step": 13623 + }, + { + "epoch": 0.9477894883300289, + "grad_norm": 1.21875, + "learning_rate": 1.4246501219524754e-05, + "loss": 0.7271, + "step": 13624 + }, + { + "epoch": 0.947859055967164, + "grad_norm": 1.2109375, + "learning_rate": 1.4208626039485695e-05, + "loss": 0.832, + "step": 13625 + }, + { + "epoch": 0.9479286236042993, + "grad_norm": 1.0703125, + "learning_rate": 1.4170800912963744e-05, + "loss": 0.7337, + "step": 13626 + }, + { + "epoch": 0.9479981912414345, + "grad_norm": 1.15625, + "learning_rate": 1.4133025841879699e-05, + "loss": 0.7956, + "step": 13627 + }, + { + "epoch": 0.9480677588785696, + "grad_norm": 1.546875, + "learning_rate": 1.4095300828151358e-05, + "loss": 0.7781, + "step": 13628 + }, + { + "epoch": 0.9481373265157049, + "grad_norm": 1.4765625, + "learning_rate": 1.4057625873694191e-05, + "loss": 0.8593, + "step": 13629 + }, + { + "epoch": 0.9482068941528401, + "grad_norm": 1.0703125, + "learning_rate": 1.4020000980421554e-05, + "loss": 0.7329, + "step": 13630 + }, + { + "epoch": 0.9482764617899753, + "grad_norm": 1.21875, + "learning_rate": 1.3982426150243366e-05, + "loss": 0.8627, + "step": 13631 + }, + { + "epoch": 0.9483460294271105, + "grad_norm": 1.2421875, + "learning_rate": 1.3944901385067765e-05, + "loss": 0.7527, + "step": 13632 + }, + { + "epoch": 0.9484155970642457, + "grad_norm": 1.03125, + "learning_rate": 1.3907426686800007e-05, + "loss": 0.9456, + "step": 13633 + }, + { + "epoch": 0.9484851647013809, + "grad_norm": 1.0546875, + "learning_rate": 1.3870002057342679e-05, + "loss": 0.6164, + "step": 13634 + }, + { + "epoch": 0.9485547323385162, + "grad_norm": 1.109375, + "learning_rate": 1.3832627498596372e-05, + "loss": 0.8265, + "step": 13635 + }, + { + "epoch": 0.9486242999756513, + "grad_norm": 0.9453125, + "learning_rate": 1.379530301245857e-05, + "loss": 0.7207, + "step": 13636 + }, + { + "epoch": 0.9486938676127865, + "grad_norm": 1.4765625, + "learning_rate": 1.3758028600824313e-05, + "loss": 0.9727, + "step": 13637 + }, + { + "epoch": 0.9487634352499217, + "grad_norm": 1.3984375, + "learning_rate": 1.3720804265586417e-05, + "loss": 0.8802, + "step": 13638 + }, + { + "epoch": 0.948833002887057, + "grad_norm": 1.7109375, + "learning_rate": 1.3683630008634817e-05, + "loss": 1.1856, + "step": 13639 + }, + { + "epoch": 0.9489025705241921, + "grad_norm": 1.1953125, + "learning_rate": 1.3646505831857115e-05, + "loss": 0.7854, + "step": 13640 + }, + { + "epoch": 0.9489721381613273, + "grad_norm": 0.921875, + "learning_rate": 1.3609431737138356e-05, + "loss": 0.7218, + "step": 13641 + }, + { + "epoch": 0.9490417057984626, + "grad_norm": 1.2734375, + "learning_rate": 1.3572407726360703e-05, + "loss": 0.9039, + "step": 13642 + }, + { + "epoch": 0.9491112734355978, + "grad_norm": 1.0703125, + "learning_rate": 1.3535433801404317e-05, + "loss": 0.5143, + "step": 13643 + }, + { + "epoch": 0.9491808410727329, + "grad_norm": 1.203125, + "learning_rate": 1.3498509964146366e-05, + "loss": 0.6741, + "step": 13644 + }, + { + "epoch": 0.9492504087098682, + "grad_norm": 1.1328125, + "learning_rate": 1.3461636216461904e-05, + "loss": 0.7088, + "step": 13645 + }, + { + "epoch": 0.9493199763470034, + "grad_norm": 1.265625, + "learning_rate": 1.3424812560222987e-05, + "loss": 0.877, + "step": 13646 + }, + { + "epoch": 0.9493895439841386, + "grad_norm": 1.1796875, + "learning_rate": 1.3388038997299235e-05, + "loss": 0.8244, + "step": 13647 + }, + { + "epoch": 0.9494591116212738, + "grad_norm": 1.1875, + "learning_rate": 1.335131552955815e-05, + "loss": 0.7707, + "step": 13648 + }, + { + "epoch": 0.949528679258409, + "grad_norm": 0.9609375, + "learning_rate": 1.3314642158864132e-05, + "loss": 0.7041, + "step": 13649 + }, + { + "epoch": 0.9495982468955442, + "grad_norm": 1.0859375, + "learning_rate": 1.3278018887079247e-05, + "loss": 0.751, + "step": 13650 + }, + { + "epoch": 0.9496678145326793, + "grad_norm": 1.328125, + "learning_rate": 1.3241445716063227e-05, + "loss": 0.8735, + "step": 13651 + }, + { + "epoch": 0.9497373821698146, + "grad_norm": 0.8828125, + "learning_rate": 1.3204922647672813e-05, + "loss": 0.6172, + "step": 13652 + }, + { + "epoch": 0.9498069498069498, + "grad_norm": 1.1953125, + "learning_rate": 1.316844968376274e-05, + "loss": 0.9435, + "step": 13653 + }, + { + "epoch": 0.949876517444085, + "grad_norm": 0.9296875, + "learning_rate": 1.3132026826184751e-05, + "loss": 0.6585, + "step": 13654 + }, + { + "epoch": 0.9499460850812202, + "grad_norm": 1.375, + "learning_rate": 1.3095654076788254e-05, + "loss": 0.9507, + "step": 13655 + }, + { + "epoch": 0.9500156527183554, + "grad_norm": 1.109375, + "learning_rate": 1.3059331437420108e-05, + "loss": 0.8002, + "step": 13656 + }, + { + "epoch": 0.9500852203554906, + "grad_norm": 1.390625, + "learning_rate": 1.30230589099245e-05, + "loss": 0.8445, + "step": 13657 + }, + { + "epoch": 0.9501547879926259, + "grad_norm": 1.171875, + "learning_rate": 1.2986836496143295e-05, + "loss": 0.8644, + "step": 13658 + }, + { + "epoch": 0.950224355629761, + "grad_norm": 1.6015625, + "learning_rate": 1.2950664197915573e-05, + "loss": 0.6882, + "step": 13659 + }, + { + "epoch": 0.9502939232668962, + "grad_norm": 0.96875, + "learning_rate": 1.291454201707809e-05, + "loss": 0.7682, + "step": 13660 + }, + { + "epoch": 0.9503634909040315, + "grad_norm": 1.21875, + "learning_rate": 1.2878469955464712e-05, + "loss": 0.8046, + "step": 13661 + }, + { + "epoch": 0.9504330585411667, + "grad_norm": 1.28125, + "learning_rate": 1.2842448014907304e-05, + "loss": 0.7754, + "step": 13662 + }, + { + "epoch": 0.9505026261783018, + "grad_norm": 1.3359375, + "learning_rate": 1.280647619723474e-05, + "loss": 0.7069, + "step": 13663 + }, + { + "epoch": 0.950572193815437, + "grad_norm": 1.1640625, + "learning_rate": 1.2770554504273557e-05, + "loss": 0.9939, + "step": 13664 + }, + { + "epoch": 0.9506417614525723, + "grad_norm": 0.8671875, + "learning_rate": 1.273468293784752e-05, + "loss": 0.7638, + "step": 13665 + }, + { + "epoch": 0.9507113290897075, + "grad_norm": 1.1484375, + "learning_rate": 1.2698861499778058e-05, + "loss": 0.8706, + "step": 13666 + }, + { + "epoch": 0.9507808967268426, + "grad_norm": 1.2109375, + "learning_rate": 1.2663090191884164e-05, + "loss": 0.8316, + "step": 13667 + }, + { + "epoch": 0.9508504643639779, + "grad_norm": 1.203125, + "learning_rate": 1.2627369015981827e-05, + "loss": 0.9694, + "step": 13668 + }, + { + "epoch": 0.9509200320011131, + "grad_norm": 1.4140625, + "learning_rate": 1.2591697973885152e-05, + "loss": 0.9504, + "step": 13669 + }, + { + "epoch": 0.9509895996382483, + "grad_norm": 1.265625, + "learning_rate": 1.2556077067405026e-05, + "loss": 0.8707, + "step": 13670 + }, + { + "epoch": 0.9510591672753835, + "grad_norm": 1.375, + "learning_rate": 1.2520506298350332e-05, + "loss": 0.8861, + "step": 13671 + }, + { + "epoch": 0.9511287349125187, + "grad_norm": 0.9296875, + "learning_rate": 1.248498566852696e-05, + "loss": 0.5927, + "step": 13672 + }, + { + "epoch": 0.9511983025496539, + "grad_norm": 0.8515625, + "learning_rate": 1.244951517973858e-05, + "loss": 0.6532, + "step": 13673 + }, + { + "epoch": 0.9512678701867892, + "grad_norm": 1.15625, + "learning_rate": 1.2414094833786194e-05, + "loss": 0.7398, + "step": 13674 + }, + { + "epoch": 0.9513374378239243, + "grad_norm": 1.171875, + "learning_rate": 1.2378724632468253e-05, + "loss": 0.9259, + "step": 13675 + }, + { + "epoch": 0.9514070054610595, + "grad_norm": 1.0859375, + "learning_rate": 1.2343404577580764e-05, + "loss": 0.9632, + "step": 13676 + }, + { + "epoch": 0.9514765730981947, + "grad_norm": 1.03125, + "learning_rate": 1.230813467091707e-05, + "loss": 0.6046, + "step": 13677 + }, + { + "epoch": 0.95154614073533, + "grad_norm": 1.046875, + "learning_rate": 1.2272914914267963e-05, + "loss": 0.6549, + "step": 13678 + }, + { + "epoch": 0.9516157083724651, + "grad_norm": 1.1328125, + "learning_rate": 1.2237745309421567e-05, + "loss": 0.6816, + "step": 13679 + }, + { + "epoch": 0.9516852760096003, + "grad_norm": 1.8046875, + "learning_rate": 1.2202625858163896e-05, + "loss": 0.8623, + "step": 13680 + }, + { + "epoch": 0.9517548436467356, + "grad_norm": 1.09375, + "learning_rate": 1.216755656227797e-05, + "loss": 0.7623, + "step": 13681 + }, + { + "epoch": 0.9518244112838707, + "grad_norm": 1.546875, + "learning_rate": 1.2132537423544476e-05, + "loss": 1.0027, + "step": 13682 + }, + { + "epoch": 0.9518939789210059, + "grad_norm": 1.203125, + "learning_rate": 1.2097568443741547e-05, + "loss": 0.8778, + "step": 13683 + }, + { + "epoch": 0.9519635465581412, + "grad_norm": 0.93359375, + "learning_rate": 1.206264962464465e-05, + "loss": 0.8399, + "step": 13684 + }, + { + "epoch": 0.9520331141952764, + "grad_norm": 1.0546875, + "learning_rate": 1.2027780968026925e-05, + "loss": 0.8131, + "step": 13685 + }, + { + "epoch": 0.9521026818324115, + "grad_norm": 1.1953125, + "learning_rate": 1.199296247565862e-05, + "loss": 0.719, + "step": 13686 + }, + { + "epoch": 0.9521722494695468, + "grad_norm": 0.890625, + "learning_rate": 1.1958194149307767e-05, + "loss": 0.7673, + "step": 13687 + }, + { + "epoch": 0.952241817106682, + "grad_norm": 0.9609375, + "learning_rate": 1.1923475990739729e-05, + "loss": 0.5885, + "step": 13688 + }, + { + "epoch": 0.9523113847438172, + "grad_norm": 1.3203125, + "learning_rate": 1.1888808001717321e-05, + "loss": 0.7645, + "step": 13689 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.96484375, + "learning_rate": 1.1854190184000801e-05, + "loss": 0.6647, + "step": 13690 + }, + { + "epoch": 0.9524505200180876, + "grad_norm": 1.625, + "learning_rate": 1.1819622539347985e-05, + "loss": 0.9428, + "step": 13691 + }, + { + "epoch": 0.9525200876552228, + "grad_norm": 1.0, + "learning_rate": 1.1785105069513802e-05, + "loss": 0.7621, + "step": 13692 + }, + { + "epoch": 0.952589655292358, + "grad_norm": 0.81640625, + "learning_rate": 1.1750637776250961e-05, + "loss": 0.439, + "step": 13693 + }, + { + "epoch": 0.9526592229294932, + "grad_norm": 1.2578125, + "learning_rate": 1.171622066130973e-05, + "loss": 0.8382, + "step": 13694 + }, + { + "epoch": 0.9527287905666284, + "grad_norm": 1.1484375, + "learning_rate": 1.1681853726437376e-05, + "loss": 0.6706, + "step": 13695 + }, + { + "epoch": 0.9527983582037636, + "grad_norm": 1.1875, + "learning_rate": 1.1647536973379058e-05, + "loss": 0.7817, + "step": 13696 + }, + { + "epoch": 0.9528679258408989, + "grad_norm": 1.46875, + "learning_rate": 1.1613270403877163e-05, + "loss": 1.0401, + "step": 13697 + }, + { + "epoch": 0.952937493478034, + "grad_norm": 1.109375, + "learning_rate": 1.157905401967152e-05, + "loss": 0.9601, + "step": 13698 + }, + { + "epoch": 0.9530070611151692, + "grad_norm": 1.046875, + "learning_rate": 1.1544887822499517e-05, + "loss": 0.8719, + "step": 13699 + }, + { + "epoch": 0.9530766287523045, + "grad_norm": 1.1171875, + "learning_rate": 1.1510771814095989e-05, + "loss": 0.6628, + "step": 13700 + }, + { + "epoch": 0.9531461963894396, + "grad_norm": 0.95703125, + "learning_rate": 1.1476705996192993e-05, + "loss": 0.7796, + "step": 13701 + }, + { + "epoch": 0.9532157640265748, + "grad_norm": 1.2578125, + "learning_rate": 1.144269037052037e-05, + "loss": 0.9382, + "step": 13702 + }, + { + "epoch": 0.95328533166371, + "grad_norm": 1.046875, + "learning_rate": 1.1408724938805293e-05, + "loss": 0.927, + "step": 13703 + }, + { + "epoch": 0.9533548993008453, + "grad_norm": 0.8671875, + "learning_rate": 1.137480970277227e-05, + "loss": 0.7409, + "step": 13704 + }, + { + "epoch": 0.9534244669379804, + "grad_norm": 0.8515625, + "learning_rate": 1.1340944664143371e-05, + "loss": 0.5701, + "step": 13705 + }, + { + "epoch": 0.9534940345751156, + "grad_norm": 1.234375, + "learning_rate": 1.1307129824638108e-05, + "loss": 0.8807, + "step": 13706 + }, + { + "epoch": 0.9535636022122509, + "grad_norm": 0.8671875, + "learning_rate": 1.1273365185973328e-05, + "loss": 0.7222, + "step": 13707 + }, + { + "epoch": 0.9536331698493861, + "grad_norm": 1.0078125, + "learning_rate": 1.1239650749863662e-05, + "loss": 0.7902, + "step": 13708 + }, + { + "epoch": 0.9537027374865212, + "grad_norm": 1.1328125, + "learning_rate": 1.1205986518020738e-05, + "loss": 0.8234, + "step": 13709 + }, + { + "epoch": 0.9537723051236565, + "grad_norm": 1.2109375, + "learning_rate": 1.1172372492153859e-05, + "loss": 0.7902, + "step": 13710 + }, + { + "epoch": 0.9538418727607917, + "grad_norm": 0.8671875, + "learning_rate": 1.113880867396988e-05, + "loss": 0.6262, + "step": 13711 + }, + { + "epoch": 0.9539114403979269, + "grad_norm": 1.1171875, + "learning_rate": 1.1105295065172993e-05, + "loss": 0.7055, + "step": 13712 + }, + { + "epoch": 0.9539810080350621, + "grad_norm": 1.0859375, + "learning_rate": 1.1071831667464838e-05, + "loss": 0.7361, + "step": 13713 + }, + { + "epoch": 0.9540505756721973, + "grad_norm": 1.421875, + "learning_rate": 1.1038418482544387e-05, + "loss": 0.9626, + "step": 13714 + }, + { + "epoch": 0.9541201433093325, + "grad_norm": 1.2890625, + "learning_rate": 1.1005055512108508e-05, + "loss": 1.0645, + "step": 13715 + }, + { + "epoch": 0.9541897109464677, + "grad_norm": 1.390625, + "learning_rate": 1.0971742757850844e-05, + "loss": 0.8561, + "step": 13716 + }, + { + "epoch": 0.9542592785836029, + "grad_norm": 1.0859375, + "learning_rate": 1.0938480221463155e-05, + "loss": 0.8213, + "step": 13717 + }, + { + "epoch": 0.9543288462207381, + "grad_norm": 1.078125, + "learning_rate": 1.0905267904633975e-05, + "loss": 0.7922, + "step": 13718 + }, + { + "epoch": 0.9543984138578733, + "grad_norm": 1.03125, + "learning_rate": 1.087210580905007e-05, + "loss": 0.7726, + "step": 13719 + }, + { + "epoch": 0.9544679814950086, + "grad_norm": 1.078125, + "learning_rate": 1.083899393639498e-05, + "loss": 0.6426, + "step": 13720 + }, + { + "epoch": 0.9545375491321437, + "grad_norm": 1.59375, + "learning_rate": 1.0805932288350029e-05, + "loss": 0.9605, + "step": 13721 + }, + { + "epoch": 0.9546071167692789, + "grad_norm": 1.578125, + "learning_rate": 1.0772920866593983e-05, + "loss": 0.7406, + "step": 13722 + }, + { + "epoch": 0.9546766844064142, + "grad_norm": 1.4296875, + "learning_rate": 1.0739959672803057e-05, + "loss": 1.0407, + "step": 13723 + }, + { + "epoch": 0.9547462520435493, + "grad_norm": 1.015625, + "learning_rate": 1.070704870865058e-05, + "loss": 0.8359, + "step": 13724 + }, + { + "epoch": 0.9548158196806845, + "grad_norm": 1.1328125, + "learning_rate": 1.0674187975807659e-05, + "loss": 0.8006, + "step": 13725 + }, + { + "epoch": 0.9548853873178198, + "grad_norm": 1.4140625, + "learning_rate": 1.064137747594307e-05, + "loss": 1.2597, + "step": 13726 + }, + { + "epoch": 0.954954954954955, + "grad_norm": 1.015625, + "learning_rate": 1.0608617210722594e-05, + "loss": 0.8139, + "step": 13727 + }, + { + "epoch": 0.9550245225920901, + "grad_norm": 1.0078125, + "learning_rate": 1.0575907181809563e-05, + "loss": 0.667, + "step": 13728 + }, + { + "epoch": 0.9550940902292253, + "grad_norm": 1.09375, + "learning_rate": 1.0543247390864984e-05, + "loss": 0.7991, + "step": 13729 + }, + { + "epoch": 0.9551636578663606, + "grad_norm": 1.09375, + "learning_rate": 1.0510637839546977e-05, + "loss": 0.9141, + "step": 13730 + }, + { + "epoch": 0.9552332255034958, + "grad_norm": 1.265625, + "learning_rate": 1.0478078529511436e-05, + "loss": 0.9213, + "step": 13731 + }, + { + "epoch": 0.9553027931406309, + "grad_norm": 1.1171875, + "learning_rate": 1.0445569462411487e-05, + "loss": 0.9326, + "step": 13732 + }, + { + "epoch": 0.9553723607777662, + "grad_norm": 1.109375, + "learning_rate": 1.0413110639897916e-05, + "loss": 0.8153, + "step": 13733 + }, + { + "epoch": 0.9554419284149014, + "grad_norm": 1.1875, + "learning_rate": 1.038070206361852e-05, + "loss": 0.7108, + "step": 13734 + }, + { + "epoch": 0.9555114960520366, + "grad_norm": 0.8125, + "learning_rate": 1.034834373521909e-05, + "loss": 0.5579, + "step": 13735 + }, + { + "epoch": 0.9555810636891718, + "grad_norm": 0.9453125, + "learning_rate": 1.0316035656342537e-05, + "loss": 0.8924, + "step": 13736 + }, + { + "epoch": 0.955650631326307, + "grad_norm": 1.3125, + "learning_rate": 1.0283777828629437e-05, + "loss": 0.8648, + "step": 13737 + }, + { + "epoch": 0.9557201989634422, + "grad_norm": 0.8515625, + "learning_rate": 1.0251570253717369e-05, + "loss": 0.6623, + "step": 13738 + }, + { + "epoch": 0.9557897666005775, + "grad_norm": 0.90234375, + "learning_rate": 1.0219412933241911e-05, + "loss": 0.5868, + "step": 13739 + }, + { + "epoch": 0.9558593342377126, + "grad_norm": 1.0546875, + "learning_rate": 1.0187305868835872e-05, + "loss": 0.7931, + "step": 13740 + }, + { + "epoch": 0.9559289018748478, + "grad_norm": 1.046875, + "learning_rate": 1.015524906212939e-05, + "loss": 0.8052, + "step": 13741 + }, + { + "epoch": 0.955998469511983, + "grad_norm": 1.0078125, + "learning_rate": 1.0123242514750163e-05, + "loss": 0.7964, + "step": 13742 + }, + { + "epoch": 0.9560680371491183, + "grad_norm": 1.1484375, + "learning_rate": 1.0091286228323338e-05, + "loss": 0.9343, + "step": 13743 + }, + { + "epoch": 0.9561376047862534, + "grad_norm": 1.09375, + "learning_rate": 1.0059380204471503e-05, + "loss": 0.816, + "step": 13744 + }, + { + "epoch": 0.9562071724233886, + "grad_norm": 1.1953125, + "learning_rate": 1.0027524444814694e-05, + "loss": 0.917, + "step": 13745 + }, + { + "epoch": 0.9562767400605239, + "grad_norm": 1.015625, + "learning_rate": 9.995718950970289e-06, + "loss": 0.5941, + "step": 13746 + }, + { + "epoch": 0.956346307697659, + "grad_norm": 1.265625, + "learning_rate": 9.963963724553327e-06, + "loss": 0.8269, + "step": 13747 + }, + { + "epoch": 0.9564158753347942, + "grad_norm": 2.015625, + "learning_rate": 9.932258767176072e-06, + "loss": 0.8375, + "step": 13748 + }, + { + "epoch": 0.9564854429719295, + "grad_norm": 0.99609375, + "learning_rate": 9.900604080448461e-06, + "loss": 0.6939, + "step": 13749 + }, + { + "epoch": 0.9565550106090647, + "grad_norm": 1.328125, + "learning_rate": 9.868999665977763e-06, + "loss": 0.8623, + "step": 13750 + }, + { + "epoch": 0.9566245782461998, + "grad_norm": 1.0703125, + "learning_rate": 9.837445525368582e-06, + "loss": 1.0773, + "step": 13751 + }, + { + "epoch": 0.9566941458833351, + "grad_norm": 1.09375, + "learning_rate": 9.805941660223083e-06, + "loss": 0.8229, + "step": 13752 + }, + { + "epoch": 0.9567637135204703, + "grad_norm": 1.078125, + "learning_rate": 9.774488072140874e-06, + "loss": 0.7507, + "step": 13753 + }, + { + "epoch": 0.9568332811576055, + "grad_norm": 1.4765625, + "learning_rate": 9.743084762719235e-06, + "loss": 1.0015, + "step": 13754 + }, + { + "epoch": 0.9569028487947406, + "grad_norm": 1.3515625, + "learning_rate": 9.711731733552442e-06, + "loss": 0.9465, + "step": 13755 + }, + { + "epoch": 0.9569724164318759, + "grad_norm": 1.1875, + "learning_rate": 9.680428986232337e-06, + "loss": 0.9657, + "step": 13756 + }, + { + "epoch": 0.9570419840690111, + "grad_norm": 1.0625, + "learning_rate": 9.649176522348535e-06, + "loss": 0.9344, + "step": 13757 + }, + { + "epoch": 0.9571115517061463, + "grad_norm": 1.1953125, + "learning_rate": 9.617974343487878e-06, + "loss": 0.7727, + "step": 13758 + }, + { + "epoch": 0.9571811193432815, + "grad_norm": 1.2265625, + "learning_rate": 9.586822451234546e-06, + "loss": 0.7814, + "step": 13759 + }, + { + "epoch": 0.9572506869804167, + "grad_norm": 1.0625, + "learning_rate": 9.555720847170379e-06, + "loss": 0.5553, + "step": 13760 + }, + { + "epoch": 0.9573202546175519, + "grad_norm": 1.46875, + "learning_rate": 9.524669532874452e-06, + "loss": 0.8443, + "step": 13761 + }, + { + "epoch": 0.9573898222546872, + "grad_norm": 1.109375, + "learning_rate": 9.493668509923392e-06, + "loss": 1.0941, + "step": 13762 + }, + { + "epoch": 0.9574593898918223, + "grad_norm": 0.890625, + "learning_rate": 9.462717779891273e-06, + "loss": 0.6499, + "step": 13763 + }, + { + "epoch": 0.9575289575289575, + "grad_norm": 1.1015625, + "learning_rate": 9.431817344349835e-06, + "loss": 0.7202, + "step": 13764 + }, + { + "epoch": 0.9575985251660928, + "grad_norm": 1.015625, + "learning_rate": 9.400967204867827e-06, + "loss": 0.7296, + "step": 13765 + }, + { + "epoch": 0.957668092803228, + "grad_norm": 0.8984375, + "learning_rate": 9.370167363011662e-06, + "loss": 0.7224, + "step": 13766 + }, + { + "epoch": 0.9577376604403631, + "grad_norm": 1.4921875, + "learning_rate": 9.339417820345198e-06, + "loss": 0.8337, + "step": 13767 + }, + { + "epoch": 0.9578072280774983, + "grad_norm": 1.1875, + "learning_rate": 9.308718578429964e-06, + "loss": 1.0526, + "step": 13768 + }, + { + "epoch": 0.9578767957146336, + "grad_norm": 1.078125, + "learning_rate": 9.278069638824494e-06, + "loss": 0.7711, + "step": 13769 + }, + { + "epoch": 0.9579463633517687, + "grad_norm": 1.015625, + "learning_rate": 9.247471003084984e-06, + "loss": 0.5816, + "step": 13770 + }, + { + "epoch": 0.9580159309889039, + "grad_norm": 1.078125, + "learning_rate": 9.216922672765082e-06, + "loss": 0.752, + "step": 13771 + }, + { + "epoch": 0.9580854986260392, + "grad_norm": 1.0859375, + "learning_rate": 9.186424649416103e-06, + "loss": 0.8951, + "step": 13772 + }, + { + "epoch": 0.9581550662631744, + "grad_norm": 1.109375, + "learning_rate": 9.155976934586251e-06, + "loss": 0.7709, + "step": 13773 + }, + { + "epoch": 0.9582246339003095, + "grad_norm": 1.0390625, + "learning_rate": 9.125579529821736e-06, + "loss": 0.7626, + "step": 13774 + }, + { + "epoch": 0.9582942015374448, + "grad_norm": 1.390625, + "learning_rate": 9.09523243666599e-06, + "loss": 0.7321, + "step": 13775 + }, + { + "epoch": 0.95836376917458, + "grad_norm": 1.0625, + "learning_rate": 9.064935656659668e-06, + "loss": 0.763, + "step": 13776 + }, + { + "epoch": 0.9584333368117152, + "grad_norm": 1.0703125, + "learning_rate": 9.034689191341206e-06, + "loss": 0.59, + "step": 13777 + }, + { + "epoch": 0.9585029044488504, + "grad_norm": 1.03125, + "learning_rate": 9.004493042246487e-06, + "loss": 0.7677, + "step": 13778 + }, + { + "epoch": 0.9585724720859856, + "grad_norm": 1.09375, + "learning_rate": 8.974347210908729e-06, + "loss": 0.9117, + "step": 13779 + }, + { + "epoch": 0.9586420397231208, + "grad_norm": 1.2109375, + "learning_rate": 8.944251698858263e-06, + "loss": 0.752, + "step": 13780 + }, + { + "epoch": 0.958711607360256, + "grad_norm": 1.5703125, + "learning_rate": 8.914206507623535e-06, + "loss": 0.6105, + "step": 13781 + }, + { + "epoch": 0.9587811749973912, + "grad_norm": 1.203125, + "learning_rate": 8.884211638729877e-06, + "loss": 0.7613, + "step": 13782 + }, + { + "epoch": 0.9588507426345264, + "grad_norm": 1.1484375, + "learning_rate": 8.854267093700518e-06, + "loss": 0.9493, + "step": 13783 + }, + { + "epoch": 0.9589203102716616, + "grad_norm": 1.1875, + "learning_rate": 8.824372874055575e-06, + "loss": 0.7822, + "step": 13784 + }, + { + "epoch": 0.9589898779087969, + "grad_norm": 1.34375, + "learning_rate": 8.794528981313055e-06, + "loss": 1.0353, + "step": 13785 + }, + { + "epoch": 0.959059445545932, + "grad_norm": 1.3515625, + "learning_rate": 8.764735416988413e-06, + "loss": 1.0167, + "step": 13786 + }, + { + "epoch": 0.9591290131830672, + "grad_norm": 1.109375, + "learning_rate": 8.734992182594325e-06, + "loss": 0.9073, + "step": 13787 + }, + { + "epoch": 0.9591985808202025, + "grad_norm": 1.328125, + "learning_rate": 8.705299279640921e-06, + "loss": 0.8753, + "step": 13788 + }, + { + "epoch": 0.9592681484573377, + "grad_norm": 1.4453125, + "learning_rate": 8.675656709635882e-06, + "loss": 0.8206, + "step": 13789 + }, + { + "epoch": 0.9593377160944728, + "grad_norm": 1.3671875, + "learning_rate": 8.646064474084447e-06, + "loss": 1.1265, + "step": 13790 + }, + { + "epoch": 0.9594072837316081, + "grad_norm": 1.0390625, + "learning_rate": 8.616522574489083e-06, + "loss": 0.7658, + "step": 13791 + }, + { + "epoch": 0.9594768513687433, + "grad_norm": 1.1875, + "learning_rate": 8.5870310123497e-06, + "loss": 0.8042, + "step": 13792 + }, + { + "epoch": 0.9595464190058784, + "grad_norm": 0.9296875, + "learning_rate": 8.557589789163767e-06, + "loss": 0.6322, + "step": 13793 + }, + { + "epoch": 0.9596159866430136, + "grad_norm": 0.93359375, + "learning_rate": 8.528198906426198e-06, + "loss": 0.9129, + "step": 13794 + }, + { + "epoch": 0.9596855542801489, + "grad_norm": 0.87109375, + "learning_rate": 8.498858365629359e-06, + "loss": 0.5803, + "step": 13795 + }, + { + "epoch": 0.9597551219172841, + "grad_norm": 0.98828125, + "learning_rate": 8.469568168262943e-06, + "loss": 0.7071, + "step": 13796 + }, + { + "epoch": 0.9598246895544192, + "grad_norm": 1.0390625, + "learning_rate": 8.440328315814094e-06, + "loss": 0.928, + "step": 13797 + }, + { + "epoch": 0.9598942571915545, + "grad_norm": 0.94921875, + "learning_rate": 8.411138809767626e-06, + "loss": 0.55, + "step": 13798 + }, + { + "epoch": 0.9599638248286897, + "grad_norm": 1.0078125, + "learning_rate": 8.381999651605466e-06, + "loss": 0.8848, + "step": 13799 + }, + { + "epoch": 0.9600333924658249, + "grad_norm": 1.171875, + "learning_rate": 8.352910842807315e-06, + "loss": 0.9132, + "step": 13800 + }, + { + "epoch": 0.9601029601029601, + "grad_norm": 1.2109375, + "learning_rate": 8.323872384850106e-06, + "loss": 0.7374, + "step": 13801 + }, + { + "epoch": 0.9601725277400953, + "grad_norm": 1.5234375, + "learning_rate": 8.294884279208104e-06, + "loss": 1.0615, + "step": 13802 + }, + { + "epoch": 0.9602420953772305, + "grad_norm": 1.15625, + "learning_rate": 8.265946527353462e-06, + "loss": 0.7314, + "step": 13803 + }, + { + "epoch": 0.9603116630143658, + "grad_norm": 1.203125, + "learning_rate": 8.237059130755232e-06, + "loss": 0.5477, + "step": 13804 + }, + { + "epoch": 0.9603812306515009, + "grad_norm": 1.1796875, + "learning_rate": 8.208222090880346e-06, + "loss": 0.8335, + "step": 13805 + }, + { + "epoch": 0.9604507982886361, + "grad_norm": 1.4609375, + "learning_rate": 8.17943540919297e-06, + "loss": 0.7856, + "step": 13806 + }, + { + "epoch": 0.9605203659257713, + "grad_norm": 0.84765625, + "learning_rate": 8.150699087154712e-06, + "loss": 0.7394, + "step": 13807 + }, + { + "epoch": 0.9605899335629066, + "grad_norm": 1.1640625, + "learning_rate": 8.122013126224514e-06, + "loss": 0.9102, + "step": 13808 + }, + { + "epoch": 0.9606595012000417, + "grad_norm": 1.28125, + "learning_rate": 8.093377527859213e-06, + "loss": 0.7345, + "step": 13809 + }, + { + "epoch": 0.9607290688371769, + "grad_norm": 1.21875, + "learning_rate": 8.064792293512535e-06, + "loss": 0.8206, + "step": 13810 + }, + { + "epoch": 0.9607986364743122, + "grad_norm": 1.1796875, + "learning_rate": 8.036257424636096e-06, + "loss": 0.9497, + "step": 13811 + }, + { + "epoch": 0.9608682041114474, + "grad_norm": 1.0625, + "learning_rate": 8.007772922678514e-06, + "loss": 0.6294, + "step": 13812 + }, + { + "epoch": 0.9609377717485825, + "grad_norm": 1.0546875, + "learning_rate": 7.979338789086299e-06, + "loss": 0.5506, + "step": 13813 + }, + { + "epoch": 0.9610073393857178, + "grad_norm": 0.828125, + "learning_rate": 7.950955025303076e-06, + "loss": 0.7495, + "step": 13814 + }, + { + "epoch": 0.961076907022853, + "grad_norm": 1.125, + "learning_rate": 7.922621632770022e-06, + "loss": 0.6539, + "step": 13815 + }, + { + "epoch": 0.9611464746599881, + "grad_norm": 1.1796875, + "learning_rate": 7.894338612925877e-06, + "loss": 0.9119, + "step": 13816 + }, + { + "epoch": 0.9612160422971234, + "grad_norm": 1.0546875, + "learning_rate": 7.866105967206493e-06, + "loss": 0.6867, + "step": 13817 + }, + { + "epoch": 0.9612856099342586, + "grad_norm": 1.0390625, + "learning_rate": 7.837923697045613e-06, + "loss": 0.9962, + "step": 13818 + }, + { + "epoch": 0.9613551775713938, + "grad_norm": 1.1640625, + "learning_rate": 7.8097918038742e-06, + "loss": 0.8745, + "step": 13819 + }, + { + "epoch": 0.9614247452085289, + "grad_norm": 0.90625, + "learning_rate": 7.781710289120447e-06, + "loss": 0.9529, + "step": 13820 + }, + { + "epoch": 0.9614943128456642, + "grad_norm": 1.21875, + "learning_rate": 7.753679154210214e-06, + "loss": 0.8506, + "step": 13821 + }, + { + "epoch": 0.9615638804827994, + "grad_norm": 0.9765625, + "learning_rate": 7.725698400567026e-06, + "loss": 0.7563, + "step": 13822 + }, + { + "epoch": 0.9616334481199346, + "grad_norm": 1.5234375, + "learning_rate": 7.697768029611308e-06, + "loss": 1.1066, + "step": 13823 + }, + { + "epoch": 0.9617030157570698, + "grad_norm": 0.87890625, + "learning_rate": 7.669888042761475e-06, + "loss": 0.6832, + "step": 13824 + }, + { + "epoch": 0.961772583394205, + "grad_norm": 1.390625, + "learning_rate": 7.642058441432953e-06, + "loss": 0.8851, + "step": 13825 + }, + { + "epoch": 0.9618421510313402, + "grad_norm": 0.8984375, + "learning_rate": 7.614279227038834e-06, + "loss": 0.7575, + "step": 13826 + }, + { + "epoch": 0.9619117186684755, + "grad_norm": 0.921875, + "learning_rate": 7.5865504009895445e-06, + "loss": 0.6526, + "step": 13827 + }, + { + "epoch": 0.9619812863056106, + "grad_norm": 1.1640625, + "learning_rate": 7.558871964693181e-06, + "loss": 0.741, + "step": 13828 + }, + { + "epoch": 0.9620508539427458, + "grad_norm": 1.1484375, + "learning_rate": 7.531243919555064e-06, + "loss": 0.8455, + "step": 13829 + }, + { + "epoch": 0.9621204215798811, + "grad_norm": 1.5390625, + "learning_rate": 7.50366626697796e-06, + "loss": 0.8973, + "step": 13830 + }, + { + "epoch": 0.9621899892170163, + "grad_norm": 0.8984375, + "learning_rate": 7.4761390083619706e-06, + "loss": 0.6072, + "step": 13831 + }, + { + "epoch": 0.9622595568541514, + "grad_norm": 1.1796875, + "learning_rate": 7.4486621451052e-06, + "loss": 0.6326, + "step": 13832 + }, + { + "epoch": 0.9623291244912866, + "grad_norm": 1.0625, + "learning_rate": 7.421235678602423e-06, + "loss": 0.722, + "step": 13833 + }, + { + "epoch": 0.9623986921284219, + "grad_norm": 1.2890625, + "learning_rate": 7.3938596102463005e-06, + "loss": 0.7974, + "step": 13834 + }, + { + "epoch": 0.962468259765557, + "grad_norm": 1.2890625, + "learning_rate": 7.366533941426834e-06, + "loss": 1.1735, + "step": 13835 + }, + { + "epoch": 0.9625378274026922, + "grad_norm": 1.265625, + "learning_rate": 7.339258673531579e-06, + "loss": 0.6646, + "step": 13836 + }, + { + "epoch": 0.9626073950398275, + "grad_norm": 0.97265625, + "learning_rate": 7.3120338079454285e-06, + "loss": 0.8732, + "step": 13837 + }, + { + "epoch": 0.9626769626769627, + "grad_norm": 1.3515625, + "learning_rate": 7.28485934605072e-06, + "loss": 0.9918, + "step": 13838 + }, + { + "epoch": 0.9627465303140978, + "grad_norm": 1.0234375, + "learning_rate": 7.25773528922713e-06, + "loss": 0.54, + "step": 13839 + }, + { + "epoch": 0.9628160979512331, + "grad_norm": 1.140625, + "learning_rate": 7.230661638851887e-06, + "loss": 0.891, + "step": 13840 + }, + { + "epoch": 0.9628856655883683, + "grad_norm": 1.515625, + "learning_rate": 7.2036383962997835e-06, + "loss": 0.645, + "step": 13841 + }, + { + "epoch": 0.9629552332255035, + "grad_norm": 1.296875, + "learning_rate": 7.176665562942941e-06, + "loss": 0.7792, + "step": 13842 + }, + { + "epoch": 0.9630248008626388, + "grad_norm": 0.94921875, + "learning_rate": 7.149743140150711e-06, + "loss": 0.7466, + "step": 13843 + }, + { + "epoch": 0.9630943684997739, + "grad_norm": 1.2421875, + "learning_rate": 7.12287112929022e-06, + "loss": 0.9052, + "step": 13844 + }, + { + "epoch": 0.9631639361369091, + "grad_norm": 1.3671875, + "learning_rate": 7.096049531725823e-06, + "loss": 0.9388, + "step": 13845 + }, + { + "epoch": 0.9632335037740443, + "grad_norm": 1.3046875, + "learning_rate": 7.069278348819541e-06, + "loss": 0.8991, + "step": 13846 + }, + { + "epoch": 0.9633030714111795, + "grad_norm": 1.1875, + "learning_rate": 7.042557581930508e-06, + "loss": 0.9729, + "step": 13847 + }, + { + "epoch": 0.9633726390483147, + "grad_norm": 0.8828125, + "learning_rate": 7.015887232415419e-06, + "loss": 0.775, + "step": 13848 + }, + { + "epoch": 0.9634422066854499, + "grad_norm": 0.8828125, + "learning_rate": 6.989267301628632e-06, + "loss": 0.5252, + "step": 13849 + }, + { + "epoch": 0.9635117743225852, + "grad_norm": 0.953125, + "learning_rate": 6.9626977909217346e-06, + "loss": 0.7728, + "step": 13850 + }, + { + "epoch": 0.9635813419597203, + "grad_norm": 1.15625, + "learning_rate": 6.936178701643758e-06, + "loss": 0.7419, + "step": 13851 + }, + { + "epoch": 0.9636509095968555, + "grad_norm": 1.1796875, + "learning_rate": 6.909710035141292e-06, + "loss": 1.1057, + "step": 13852 + }, + { + "epoch": 0.9637204772339908, + "grad_norm": 1.296875, + "learning_rate": 6.883291792758151e-06, + "loss": 0.8733, + "step": 13853 + }, + { + "epoch": 0.963790044871126, + "grad_norm": 0.9765625, + "learning_rate": 6.856923975835705e-06, + "loss": 0.9535, + "step": 13854 + }, + { + "epoch": 0.9638596125082611, + "grad_norm": 1.3203125, + "learning_rate": 6.830606585712884e-06, + "loss": 0.9676, + "step": 13855 + }, + { + "epoch": 0.9639291801453964, + "grad_norm": 1.0859375, + "learning_rate": 6.804339623725842e-06, + "loss": 0.7737, + "step": 13856 + }, + { + "epoch": 0.9639987477825316, + "grad_norm": 1.0546875, + "learning_rate": 6.77812309120851e-06, + "loss": 1.0122, + "step": 13857 + }, + { + "epoch": 0.9640683154196668, + "grad_norm": 1.109375, + "learning_rate": 6.751956989491825e-06, + "loss": 0.8278, + "step": 13858 + }, + { + "epoch": 0.9641378830568019, + "grad_norm": 1.078125, + "learning_rate": 6.72584131990428e-06, + "loss": 0.5063, + "step": 13859 + }, + { + "epoch": 0.9642074506939372, + "grad_norm": 1.1953125, + "learning_rate": 6.699776083772257e-06, + "loss": 0.7343, + "step": 13860 + }, + { + "epoch": 0.9642770183310724, + "grad_norm": 1.5234375, + "learning_rate": 6.673761282418922e-06, + "loss": 0.8882, + "step": 13861 + }, + { + "epoch": 0.9643465859682075, + "grad_norm": 1.09375, + "learning_rate": 6.647796917165216e-06, + "loss": 0.6325, + "step": 13862 + }, + { + "epoch": 0.9644161536053428, + "grad_norm": 1.0546875, + "learning_rate": 6.621882989329531e-06, + "loss": 0.6688, + "step": 13863 + }, + { + "epoch": 0.964485721242478, + "grad_norm": 1.15625, + "learning_rate": 6.59601950022759e-06, + "loss": 0.8961, + "step": 13864 + }, + { + "epoch": 0.9645552888796132, + "grad_norm": 1.109375, + "learning_rate": 6.570206451172789e-06, + "loss": 1.0174, + "step": 13865 + }, + { + "epoch": 0.9646248565167485, + "grad_norm": 1.28125, + "learning_rate": 6.544443843475523e-06, + "loss": 0.8975, + "step": 13866 + }, + { + "epoch": 0.9646944241538836, + "grad_norm": 0.98046875, + "learning_rate": 6.518731678443968e-06, + "loss": 0.7626, + "step": 13867 + }, + { + "epoch": 0.9647639917910188, + "grad_norm": 1.234375, + "learning_rate": 6.493069957383857e-06, + "loss": 0.808, + "step": 13868 + }, + { + "epoch": 0.9648335594281541, + "grad_norm": 0.8125, + "learning_rate": 6.467458681597926e-06, + "loss": 0.6525, + "step": 13869 + }, + { + "epoch": 0.9649031270652892, + "grad_norm": 1.15625, + "learning_rate": 6.441897852386691e-06, + "loss": 1.021, + "step": 13870 + }, + { + "epoch": 0.9649726947024244, + "grad_norm": 1.0703125, + "learning_rate": 6.416387471047891e-06, + "loss": 0.7757, + "step": 13871 + }, + { + "epoch": 0.9650422623395596, + "grad_norm": 1.0859375, + "learning_rate": 6.390927538877045e-06, + "loss": 0.7479, + "step": 13872 + }, + { + "epoch": 0.9651118299766949, + "grad_norm": 1.03125, + "learning_rate": 6.365518057166564e-06, + "loss": 0.7607, + "step": 13873 + }, + { + "epoch": 0.96518139761383, + "grad_norm": 1.0078125, + "learning_rate": 6.340159027206971e-06, + "loss": 0.9347, + "step": 13874 + }, + { + "epoch": 0.9652509652509652, + "grad_norm": 1.46875, + "learning_rate": 6.3148504502855695e-06, + "loss": 1.1005, + "step": 13875 + }, + { + "epoch": 0.9653205328881005, + "grad_norm": 1.3515625, + "learning_rate": 6.289592327687554e-06, + "loss": 1.03, + "step": 13876 + }, + { + "epoch": 0.9653901005252357, + "grad_norm": 1.125, + "learning_rate": 6.264384660695343e-06, + "loss": 0.6541, + "step": 13877 + }, + { + "epoch": 0.9654596681623708, + "grad_norm": 1.1171875, + "learning_rate": 6.239227450588914e-06, + "loss": 1.0166, + "step": 13878 + }, + { + "epoch": 0.9655292357995061, + "grad_norm": 0.9140625, + "learning_rate": 6.214120698645575e-06, + "loss": 0.7993, + "step": 13879 + }, + { + "epoch": 0.9655988034366413, + "grad_norm": 1.2421875, + "learning_rate": 6.189064406140199e-06, + "loss": 0.723, + "step": 13880 + }, + { + "epoch": 0.9656683710737765, + "grad_norm": 1.2421875, + "learning_rate": 6.164058574344766e-06, + "loss": 0.5604, + "step": 13881 + }, + { + "epoch": 0.9657379387109117, + "grad_norm": 1.25, + "learning_rate": 6.139103204529372e-06, + "loss": 0.8618, + "step": 13882 + }, + { + "epoch": 0.9658075063480469, + "grad_norm": 1.2109375, + "learning_rate": 6.114198297960672e-06, + "loss": 0.741, + "step": 13883 + }, + { + "epoch": 0.9658770739851821, + "grad_norm": 1.078125, + "learning_rate": 6.089343855903651e-06, + "loss": 0.8718, + "step": 13884 + }, + { + "epoch": 0.9659466416223172, + "grad_norm": 1.09375, + "learning_rate": 6.064539879619968e-06, + "loss": 0.5796, + "step": 13885 + }, + { + "epoch": 0.9660162092594525, + "grad_norm": 1.0546875, + "learning_rate": 6.03978637036906e-06, + "loss": 0.8759, + "step": 13886 + }, + { + "epoch": 0.9660857768965877, + "grad_norm": 0.9296875, + "learning_rate": 6.015083329407922e-06, + "loss": 0.5381, + "step": 13887 + }, + { + "epoch": 0.9661553445337229, + "grad_norm": 1.1640625, + "learning_rate": 5.990430757990773e-06, + "loss": 1.0053, + "step": 13888 + }, + { + "epoch": 0.9662249121708582, + "grad_norm": 1.359375, + "learning_rate": 5.9658286573694984e-06, + "loss": 1.0386, + "step": 13889 + }, + { + "epoch": 0.9662944798079933, + "grad_norm": 1.4296875, + "learning_rate": 5.941277028792991e-06, + "loss": 1.0889, + "step": 13890 + }, + { + "epoch": 0.9663640474451285, + "grad_norm": 1.328125, + "learning_rate": 5.916775873508029e-06, + "loss": 0.8672, + "step": 13891 + }, + { + "epoch": 0.9664336150822638, + "grad_norm": 1.1640625, + "learning_rate": 5.89232519275873e-06, + "loss": 0.7844, + "step": 13892 + }, + { + "epoch": 0.966503182719399, + "grad_norm": 1.53125, + "learning_rate": 5.867924987786432e-06, + "loss": 1.0494, + "step": 13893 + }, + { + "epoch": 0.9665727503565341, + "grad_norm": 1.09375, + "learning_rate": 5.843575259830036e-06, + "loss": 0.5348, + "step": 13894 + }, + { + "epoch": 0.9666423179936694, + "grad_norm": 0.93359375, + "learning_rate": 5.819276010126107e-06, + "loss": 0.7969, + "step": 13895 + }, + { + "epoch": 0.9667118856308046, + "grad_norm": 1.3359375, + "learning_rate": 5.795027239908213e-06, + "loss": 0.6913, + "step": 13896 + }, + { + "epoch": 0.9667814532679397, + "grad_norm": 1.0703125, + "learning_rate": 5.7708289504077024e-06, + "loss": 0.6975, + "step": 13897 + }, + { + "epoch": 0.9668510209050749, + "grad_norm": 1.015625, + "learning_rate": 5.746681142853149e-06, + "loss": 0.6556, + "step": 13898 + }, + { + "epoch": 0.9669205885422102, + "grad_norm": 0.94921875, + "learning_rate": 5.722583818470795e-06, + "loss": 0.7655, + "step": 13899 + }, + { + "epoch": 0.9669901561793454, + "grad_norm": 0.94921875, + "learning_rate": 5.698536978484104e-06, + "loss": 0.6782, + "step": 13900 + }, + { + "epoch": 0.9670597238164805, + "grad_norm": 1.265625, + "learning_rate": 5.674540624113988e-06, + "loss": 0.7805, + "step": 13901 + }, + { + "epoch": 0.9671292914536158, + "grad_norm": 1.046875, + "learning_rate": 5.650594756579031e-06, + "loss": 0.7414, + "step": 13902 + }, + { + "epoch": 0.967198859090751, + "grad_norm": 0.99609375, + "learning_rate": 5.626699377094924e-06, + "loss": 0.8086, + "step": 13903 + }, + { + "epoch": 0.9672684267278862, + "grad_norm": 0.98828125, + "learning_rate": 5.6028544868749194e-06, + "loss": 0.7736, + "step": 13904 + }, + { + "epoch": 0.9673379943650214, + "grad_norm": 1.1484375, + "learning_rate": 5.579060087129939e-06, + "loss": 0.6967, + "step": 13905 + }, + { + "epoch": 0.9674075620021566, + "grad_norm": 1.109375, + "learning_rate": 5.555316179068015e-06, + "loss": 0.9138, + "step": 13906 + }, + { + "epoch": 0.9674771296392918, + "grad_norm": 1.1796875, + "learning_rate": 5.531622763894739e-06, + "loss": 0.837, + "step": 13907 + }, + { + "epoch": 0.9675466972764271, + "grad_norm": 0.90234375, + "learning_rate": 5.507979842813149e-06, + "loss": 0.7755, + "step": 13908 + }, + { + "epoch": 0.9676162649135622, + "grad_norm": 1.1953125, + "learning_rate": 5.48438741702384e-06, + "loss": 0.7594, + "step": 13909 + }, + { + "epoch": 0.9676858325506974, + "grad_norm": 1.0859375, + "learning_rate": 5.46084548772452e-06, + "loss": 0.816, + "step": 13910 + }, + { + "epoch": 0.9677554001878326, + "grad_norm": 1.0390625, + "learning_rate": 5.437354056110566e-06, + "loss": 0.5594, + "step": 13911 + }, + { + "epoch": 0.9678249678249679, + "grad_norm": 1.1015625, + "learning_rate": 5.413913123374914e-06, + "loss": 0.8616, + "step": 13912 + }, + { + "epoch": 0.967894535462103, + "grad_norm": 1.1640625, + "learning_rate": 5.390522690707611e-06, + "loss": 0.7617, + "step": 13913 + }, + { + "epoch": 0.9679641030992382, + "grad_norm": 1.3359375, + "learning_rate": 5.367182759296374e-06, + "loss": 0.8175, + "step": 13914 + }, + { + "epoch": 0.9680336707363735, + "grad_norm": 1.1640625, + "learning_rate": 5.343893330326255e-06, + "loss": 0.7259, + "step": 13915 + }, + { + "epoch": 0.9681032383735086, + "grad_norm": 0.9296875, + "learning_rate": 5.320654404979863e-06, + "loss": 0.6797, + "step": 13916 + }, + { + "epoch": 0.9681728060106438, + "grad_norm": 1.4375, + "learning_rate": 5.297465984437033e-06, + "loss": 0.8994, + "step": 13917 + }, + { + "epoch": 0.9682423736477791, + "grad_norm": 1.25, + "learning_rate": 5.274328069875156e-06, + "loss": 0.8834, + "step": 13918 + }, + { + "epoch": 0.9683119412849143, + "grad_norm": 1.1875, + "learning_rate": 5.251240662469181e-06, + "loss": 0.7321, + "step": 13919 + }, + { + "epoch": 0.9683815089220494, + "grad_norm": 1.0625, + "learning_rate": 5.228203763391392e-06, + "loss": 0.6997, + "step": 13920 + }, + { + "epoch": 0.9684510765591847, + "grad_norm": 1.2734375, + "learning_rate": 5.2052173738113e-06, + "loss": 0.7915, + "step": 13921 + }, + { + "epoch": 0.9685206441963199, + "grad_norm": 1.1484375, + "learning_rate": 5.18228149489608e-06, + "loss": 0.8359, + "step": 13922 + }, + { + "epoch": 0.9685902118334551, + "grad_norm": 1.1796875, + "learning_rate": 5.1593961278103566e-06, + "loss": 0.82, + "step": 13923 + }, + { + "epoch": 0.9686597794705902, + "grad_norm": 1.0859375, + "learning_rate": 5.136561273716201e-06, + "loss": 0.8778, + "step": 13924 + }, + { + "epoch": 0.9687293471077255, + "grad_norm": 1.1640625, + "learning_rate": 5.113776933772907e-06, + "loss": 0.8501, + "step": 13925 + }, + { + "epoch": 0.9687989147448607, + "grad_norm": 0.91015625, + "learning_rate": 5.09104310913755e-06, + "loss": 0.5983, + "step": 13926 + }, + { + "epoch": 0.9688684823819959, + "grad_norm": 1.2890625, + "learning_rate": 5.068359800964206e-06, + "loss": 0.8232, + "step": 13927 + }, + { + "epoch": 0.9689380500191311, + "grad_norm": 0.91015625, + "learning_rate": 5.045727010404733e-06, + "loss": 0.6038, + "step": 13928 + }, + { + "epoch": 0.9690076176562663, + "grad_norm": 1.484375, + "learning_rate": 5.023144738608321e-06, + "loss": 0.6608, + "step": 13929 + }, + { + "epoch": 0.9690771852934015, + "grad_norm": 1.0078125, + "learning_rate": 5.000612986721498e-06, + "loss": 0.6612, + "step": 13930 + }, + { + "epoch": 0.9691467529305368, + "grad_norm": 1.0703125, + "learning_rate": 4.9781317558884596e-06, + "loss": 0.6919, + "step": 13931 + }, + { + "epoch": 0.9692163205676719, + "grad_norm": 1.1484375, + "learning_rate": 4.955701047250516e-06, + "loss": 0.9562, + "step": 13932 + }, + { + "epoch": 0.9692858882048071, + "grad_norm": 1.421875, + "learning_rate": 4.933320861946866e-06, + "loss": 1.0431, + "step": 13933 + }, + { + "epoch": 0.9693554558419424, + "grad_norm": 0.9609375, + "learning_rate": 4.91099120111349e-06, + "loss": 0.7166, + "step": 13934 + }, + { + "epoch": 0.9694250234790776, + "grad_norm": 1.2265625, + "learning_rate": 4.888712065884482e-06, + "loss": 1.1322, + "step": 13935 + }, + { + "epoch": 0.9694945911162127, + "grad_norm": 0.97265625, + "learning_rate": 4.866483457390825e-06, + "loss": 0.738, + "step": 13936 + }, + { + "epoch": 0.9695641587533479, + "grad_norm": 1.046875, + "learning_rate": 4.844305376761393e-06, + "loss": 0.931, + "step": 13937 + }, + { + "epoch": 0.9696337263904832, + "grad_norm": 1.0390625, + "learning_rate": 4.822177825122176e-06, + "loss": 0.8767, + "step": 13938 + }, + { + "epoch": 0.9697032940276183, + "grad_norm": 1.125, + "learning_rate": 4.800100803596607e-06, + "loss": 0.6092, + "step": 13939 + }, + { + "epoch": 0.9697728616647535, + "grad_norm": 1.1484375, + "learning_rate": 4.778074313305791e-06, + "loss": 0.7926, + "step": 13940 + }, + { + "epoch": 0.9698424293018888, + "grad_norm": 1.21875, + "learning_rate": 4.756098355368055e-06, + "loss": 0.9556, + "step": 13941 + }, + { + "epoch": 0.969911996939024, + "grad_norm": 1.265625, + "learning_rate": 4.734172930899283e-06, + "loss": 0.7866, + "step": 13942 + }, + { + "epoch": 0.9699815645761591, + "grad_norm": 1.46875, + "learning_rate": 4.712298041012697e-06, + "loss": 0.9331, + "step": 13943 + }, + { + "epoch": 0.9700511322132944, + "grad_norm": 1.1796875, + "learning_rate": 4.690473686819075e-06, + "loss": 0.9383, + "step": 13944 + }, + { + "epoch": 0.9701206998504296, + "grad_norm": 1.0234375, + "learning_rate": 4.668699869426308e-06, + "loss": 0.6549, + "step": 13945 + }, + { + "epoch": 0.9701902674875648, + "grad_norm": 1.078125, + "learning_rate": 4.646976589940177e-06, + "loss": 0.7424, + "step": 13946 + }, + { + "epoch": 0.9702598351247, + "grad_norm": 1.1328125, + "learning_rate": 4.625303849463581e-06, + "loss": 0.8849, + "step": 13947 + }, + { + "epoch": 0.9703294027618352, + "grad_norm": 1.2109375, + "learning_rate": 4.6036816490970805e-06, + "loss": 0.8294, + "step": 13948 + }, + { + "epoch": 0.9703989703989704, + "grad_norm": 1.1484375, + "learning_rate": 4.582109989938465e-06, + "loss": 1.1237, + "step": 13949 + }, + { + "epoch": 0.9704685380361056, + "grad_norm": 1.0546875, + "learning_rate": 4.560588873082972e-06, + "loss": 0.8116, + "step": 13950 + }, + { + "epoch": 0.9705381056732408, + "grad_norm": 1.1015625, + "learning_rate": 4.539118299623391e-06, + "loss": 0.793, + "step": 13951 + }, + { + "epoch": 0.970607673310376, + "grad_norm": 1.0390625, + "learning_rate": 4.517698270649961e-06, + "loss": 0.8075, + "step": 13952 + }, + { + "epoch": 0.9706772409475112, + "grad_norm": 1.15625, + "learning_rate": 4.496328787250148e-06, + "loss": 0.7519, + "step": 13953 + }, + { + "epoch": 0.9707468085846465, + "grad_norm": 1.078125, + "learning_rate": 4.4750098505089705e-06, + "loss": 0.7596, + "step": 13954 + }, + { + "epoch": 0.9708163762217816, + "grad_norm": 1.390625, + "learning_rate": 4.45374146150912e-06, + "loss": 0.8662, + "step": 13955 + }, + { + "epoch": 0.9708859438589168, + "grad_norm": 0.98828125, + "learning_rate": 4.4325236213302865e-06, + "loss": 0.7271, + "step": 13956 + }, + { + "epoch": 0.9709555114960521, + "grad_norm": 1.1328125, + "learning_rate": 4.411356331049832e-06, + "loss": 1.0329, + "step": 13957 + }, + { + "epoch": 0.9710250791331873, + "grad_norm": 1.4375, + "learning_rate": 4.390239591742562e-06, + "loss": 0.7974, + "step": 13958 + }, + { + "epoch": 0.9710946467703224, + "grad_norm": 0.98828125, + "learning_rate": 4.369173404480731e-06, + "loss": 0.7028, + "step": 13959 + }, + { + "epoch": 0.9711642144074577, + "grad_norm": 1.25, + "learning_rate": 4.348157770333927e-06, + "loss": 0.7796, + "step": 13960 + }, + { + "epoch": 0.9712337820445929, + "grad_norm": 1.234375, + "learning_rate": 4.327192690369186e-06, + "loss": 0.7537, + "step": 13961 + }, + { + "epoch": 0.971303349681728, + "grad_norm": 1.2109375, + "learning_rate": 4.306278165651101e-06, + "loss": 0.9896, + "step": 13962 + }, + { + "epoch": 0.9713729173188632, + "grad_norm": 0.8828125, + "learning_rate": 4.2854141972414885e-06, + "loss": 0.9013, + "step": 13963 + }, + { + "epoch": 0.9714424849559985, + "grad_norm": 1.1640625, + "learning_rate": 4.2646007861997235e-06, + "loss": 0.8468, + "step": 13964 + }, + { + "epoch": 0.9715120525931337, + "grad_norm": 0.97265625, + "learning_rate": 4.243837933582739e-06, + "loss": 0.7459, + "step": 13965 + }, + { + "epoch": 0.9715816202302688, + "grad_norm": 1.078125, + "learning_rate": 4.2231256404446916e-06, + "loss": 0.856, + "step": 13966 + }, + { + "epoch": 0.9716511878674041, + "grad_norm": 1.046875, + "learning_rate": 4.202463907837184e-06, + "loss": 0.8179, + "step": 13967 + }, + { + "epoch": 0.9717207555045393, + "grad_norm": 0.89453125, + "learning_rate": 4.1818527368093775e-06, + "loss": 0.6342, + "step": 13968 + }, + { + "epoch": 0.9717903231416745, + "grad_norm": 1.171875, + "learning_rate": 4.161292128407767e-06, + "loss": 1.057, + "step": 13969 + }, + { + "epoch": 0.9718598907788097, + "grad_norm": 1.3046875, + "learning_rate": 4.140782083676409e-06, + "loss": 0.8391, + "step": 13970 + }, + { + "epoch": 0.9719294584159449, + "grad_norm": 1.0078125, + "learning_rate": 4.1203226036565785e-06, + "loss": 0.7712, + "step": 13971 + }, + { + "epoch": 0.9719990260530801, + "grad_norm": 0.796875, + "learning_rate": 4.099913689387114e-06, + "loss": 0.5111, + "step": 13972 + }, + { + "epoch": 0.9720685936902154, + "grad_norm": 1.4921875, + "learning_rate": 4.079555341904406e-06, + "loss": 0.9152, + "step": 13973 + }, + { + "epoch": 0.9721381613273505, + "grad_norm": 1.2578125, + "learning_rate": 4.059247562242074e-06, + "loss": 0.758, + "step": 13974 + }, + { + "epoch": 0.9722077289644857, + "grad_norm": 1.2109375, + "learning_rate": 4.038990351431182e-06, + "loss": 0.7742, + "step": 13975 + }, + { + "epoch": 0.9722772966016209, + "grad_norm": 1.1328125, + "learning_rate": 4.018783710500462e-06, + "loss": 0.8875, + "step": 13976 + }, + { + "epoch": 0.9723468642387562, + "grad_norm": 0.9765625, + "learning_rate": 3.998627640475649e-06, + "loss": 0.7069, + "step": 13977 + }, + { + "epoch": 0.9724164318758913, + "grad_norm": 0.99609375, + "learning_rate": 3.978522142380259e-06, + "loss": 0.6547, + "step": 13978 + }, + { + "epoch": 0.9724859995130265, + "grad_norm": 0.9609375, + "learning_rate": 3.958467217235362e-06, + "loss": 0.9462, + "step": 13979 + }, + { + "epoch": 0.9725555671501618, + "grad_norm": 1.3359375, + "learning_rate": 3.938462866059034e-06, + "loss": 0.5854, + "step": 13980 + }, + { + "epoch": 0.972625134787297, + "grad_norm": 1.0703125, + "learning_rate": 3.918509089867017e-06, + "loss": 1.0063, + "step": 13981 + }, + { + "epoch": 0.9726947024244321, + "grad_norm": 1.1171875, + "learning_rate": 3.898605889672391e-06, + "loss": 0.7507, + "step": 13982 + }, + { + "epoch": 0.9727642700615674, + "grad_norm": 0.97265625, + "learning_rate": 3.878753266486013e-06, + "loss": 0.7795, + "step": 13983 + }, + { + "epoch": 0.9728338376987026, + "grad_norm": 1.0625, + "learning_rate": 3.858951221315632e-06, + "loss": 0.865, + "step": 13984 + }, + { + "epoch": 0.9729034053358377, + "grad_norm": 0.96484375, + "learning_rate": 3.839199755166778e-06, + "loss": 0.7666, + "step": 13985 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.85546875, + "learning_rate": 3.819498869042315e-06, + "loss": 0.5805, + "step": 13986 + }, + { + "epoch": 0.9730425406101082, + "grad_norm": 1.09375, + "learning_rate": 3.7998485639426648e-06, + "loss": 0.776, + "step": 13987 + }, + { + "epoch": 0.9731121082472434, + "grad_norm": 1.1484375, + "learning_rate": 3.7802488408653635e-06, + "loss": 0.8188, + "step": 13988 + }, + { + "epoch": 0.9731816758843785, + "grad_norm": 1.15625, + "learning_rate": 3.7606997008058363e-06, + "loss": 0.9584, + "step": 13989 + }, + { + "epoch": 0.9732512435215138, + "grad_norm": 1.2890625, + "learning_rate": 3.741201144756512e-06, + "loss": 0.8802, + "step": 13990 + }, + { + "epoch": 0.973320811158649, + "grad_norm": 1.125, + "learning_rate": 3.7217531737073762e-06, + "loss": 0.7809, + "step": 13991 + }, + { + "epoch": 0.9733903787957842, + "grad_norm": 0.98046875, + "learning_rate": 3.7023557886460833e-06, + "loss": 0.7774, + "step": 13992 + }, + { + "epoch": 0.9734599464329194, + "grad_norm": 0.90234375, + "learning_rate": 3.6830089905575128e-06, + "loss": 0.9329, + "step": 13993 + }, + { + "epoch": 0.9735295140700546, + "grad_norm": 1.015625, + "learning_rate": 3.663712780423878e-06, + "loss": 0.582, + "step": 13994 + }, + { + "epoch": 0.9735990817071898, + "grad_norm": 1.09375, + "learning_rate": 3.644467159224951e-06, + "loss": 0.8051, + "step": 13995 + }, + { + "epoch": 0.9736686493443251, + "grad_norm": 0.87890625, + "learning_rate": 3.625272127938062e-06, + "loss": 0.745, + "step": 13996 + }, + { + "epoch": 0.9737382169814602, + "grad_norm": 1.2109375, + "learning_rate": 3.6061276875376527e-06, + "loss": 0.7743, + "step": 13997 + }, + { + "epoch": 0.9738077846185954, + "grad_norm": 1.4609375, + "learning_rate": 3.5870338389959454e-06, + "loss": 0.9294, + "step": 13998 + }, + { + "epoch": 0.9738773522557307, + "grad_norm": 1.3984375, + "learning_rate": 3.567990583282388e-06, + "loss": 0.7511, + "step": 13999 + }, + { + "epoch": 0.9739469198928659, + "grad_norm": 1.0234375, + "learning_rate": 3.5489979213638724e-06, + "loss": 0.8622, + "step": 14000 + }, + { + "epoch": 0.974016487530001, + "grad_norm": 1.03125, + "learning_rate": 3.530055854204739e-06, + "loss": 0.5685, + "step": 14001 + }, + { + "epoch": 0.9740860551671362, + "grad_norm": 1.078125, + "learning_rate": 3.5111643827667737e-06, + "loss": 0.6536, + "step": 14002 + }, + { + "epoch": 0.9741556228042715, + "grad_norm": 1.2578125, + "learning_rate": 3.4923235080092095e-06, + "loss": 1.0551, + "step": 14003 + }, + { + "epoch": 0.9742251904414067, + "grad_norm": 1.3828125, + "learning_rate": 3.473533230888726e-06, + "loss": 0.8098, + "step": 14004 + }, + { + "epoch": 0.9742947580785418, + "grad_norm": 1.0625, + "learning_rate": 3.4547935523593366e-06, + "loss": 0.6146, + "step": 14005 + }, + { + "epoch": 0.9743643257156771, + "grad_norm": 1.0390625, + "learning_rate": 3.436104473372503e-06, + "loss": 0.7175, + "step": 14006 + }, + { + "epoch": 0.9744338933528123, + "grad_norm": 1.3984375, + "learning_rate": 3.4174659948773554e-06, + "loss": 0.8898, + "step": 14007 + }, + { + "epoch": 0.9745034609899474, + "grad_norm": 1.203125, + "learning_rate": 3.3988781178201366e-06, + "loss": 0.7473, + "step": 14008 + }, + { + "epoch": 0.9745730286270827, + "grad_norm": 1.2578125, + "learning_rate": 3.380340843144536e-06, + "loss": 0.9942, + "step": 14009 + }, + { + "epoch": 0.9746425962642179, + "grad_norm": 1.015625, + "learning_rate": 3.3618541717919117e-06, + "loss": 0.7472, + "step": 14010 + }, + { + "epoch": 0.9747121639013531, + "grad_norm": 1.1484375, + "learning_rate": 3.343418104700957e-06, + "loss": 0.7854, + "step": 14011 + }, + { + "epoch": 0.9747817315384883, + "grad_norm": 1.015625, + "learning_rate": 3.3250326428077014e-06, + "loss": 0.7731, + "step": 14012 + }, + { + "epoch": 0.9748512991756235, + "grad_norm": 1.0546875, + "learning_rate": 3.3066977870456205e-06, + "loss": 0.7759, + "step": 14013 + }, + { + "epoch": 0.9749208668127587, + "grad_norm": 0.921875, + "learning_rate": 3.288413538345747e-06, + "loss": 0.6518, + "step": 14014 + }, + { + "epoch": 0.9749904344498939, + "grad_norm": 1.1796875, + "learning_rate": 3.2701798976364494e-06, + "loss": 0.8985, + "step": 14015 + }, + { + "epoch": 0.9750600020870291, + "grad_norm": 1.1015625, + "learning_rate": 3.2519968658435427e-06, + "loss": 0.7888, + "step": 14016 + }, + { + "epoch": 0.9751295697241643, + "grad_norm": 0.9375, + "learning_rate": 3.2338644438902887e-06, + "loss": 0.6723, + "step": 14017 + }, + { + "epoch": 0.9751991373612995, + "grad_norm": 1.0234375, + "learning_rate": 3.2157826326972837e-06, + "loss": 0.8776, + "step": 14018 + }, + { + "epoch": 0.9752687049984348, + "grad_norm": 1.5625, + "learning_rate": 3.197751433182572e-06, + "loss": 0.848, + "step": 14019 + }, + { + "epoch": 0.9753382726355699, + "grad_norm": 1.359375, + "learning_rate": 3.179770846261865e-06, + "loss": 1.0315, + "step": 14020 + }, + { + "epoch": 0.9754078402727051, + "grad_norm": 1.234375, + "learning_rate": 3.1618408728480985e-06, + "loss": 0.8892, + "step": 14021 + }, + { + "epoch": 0.9754774079098404, + "grad_norm": 1.171875, + "learning_rate": 3.1439615138515454e-06, + "loss": 1.0339, + "step": 14022 + }, + { + "epoch": 0.9755469755469756, + "grad_norm": 1.3671875, + "learning_rate": 3.126132770180146e-06, + "loss": 0.9649, + "step": 14023 + }, + { + "epoch": 0.9756165431841107, + "grad_norm": 0.984375, + "learning_rate": 3.108354642739064e-06, + "loss": 0.7231, + "step": 14024 + }, + { + "epoch": 0.975686110821246, + "grad_norm": 0.9453125, + "learning_rate": 3.090627132431023e-06, + "loss": 0.8066, + "step": 14025 + }, + { + "epoch": 0.9757556784583812, + "grad_norm": 0.96875, + "learning_rate": 3.0729502401561916e-06, + "loss": 0.5879, + "step": 14026 + }, + { + "epoch": 0.9758252460955164, + "grad_norm": 0.7421875, + "learning_rate": 3.0553239668120735e-06, + "loss": 0.5491, + "step": 14027 + }, + { + "epoch": 0.9758948137326515, + "grad_norm": 1.03125, + "learning_rate": 3.037748313293509e-06, + "loss": 0.7964, + "step": 14028 + }, + { + "epoch": 0.9759643813697868, + "grad_norm": 1.2421875, + "learning_rate": 3.020223280493228e-06, + "loss": 0.7861, + "step": 14029 + }, + { + "epoch": 0.976033949006922, + "grad_norm": 1.09375, + "learning_rate": 3.0027488693007422e-06, + "loss": 0.8609, + "step": 14030 + }, + { + "epoch": 0.9761035166440571, + "grad_norm": 1.21875, + "learning_rate": 2.9853250806033407e-06, + "loss": 0.7567, + "step": 14031 + }, + { + "epoch": 0.9761730842811924, + "grad_norm": 1.1796875, + "learning_rate": 2.9679519152859824e-06, + "loss": 0.8647, + "step": 14032 + }, + { + "epoch": 0.9762426519183276, + "grad_norm": 1.125, + "learning_rate": 2.950629374230518e-06, + "loss": 0.8666, + "step": 14033 + }, + { + "epoch": 0.9763122195554628, + "grad_norm": 1.2578125, + "learning_rate": 2.9333574583165767e-06, + "loss": 0.9188, + "step": 14034 + }, + { + "epoch": 0.976381787192598, + "grad_norm": 1.5234375, + "learning_rate": 2.916136168421124e-06, + "loss": 0.9102, + "step": 14035 + }, + { + "epoch": 0.9764513548297332, + "grad_norm": 1.1875, + "learning_rate": 2.8989655054186827e-06, + "loss": 0.6939, + "step": 14036 + }, + { + "epoch": 0.9765209224668684, + "grad_norm": 0.92578125, + "learning_rate": 2.881845470180999e-06, + "loss": 0.6198, + "step": 14037 + }, + { + "epoch": 0.9765904901040037, + "grad_norm": 1.0625, + "learning_rate": 2.864776063577268e-06, + "loss": 0.7109, + "step": 14038 + }, + { + "epoch": 0.9766600577411388, + "grad_norm": 0.9375, + "learning_rate": 2.8477572864744616e-06, + "loss": 0.7561, + "step": 14039 + }, + { + "epoch": 0.976729625378274, + "grad_norm": 1.0703125, + "learning_rate": 2.830789139736334e-06, + "loss": 0.6205, + "step": 14040 + }, + { + "epoch": 0.9767991930154092, + "grad_norm": 1.1171875, + "learning_rate": 2.8138716242247507e-06, + "loss": 0.882, + "step": 14041 + }, + { + "epoch": 0.9768687606525445, + "grad_norm": 0.99609375, + "learning_rate": 2.79700474079847e-06, + "loss": 0.9954, + "step": 14042 + }, + { + "epoch": 0.9769383282896796, + "grad_norm": 0.99609375, + "learning_rate": 2.7801884903141396e-06, + "loss": 0.8026, + "step": 14043 + }, + { + "epoch": 0.9770078959268148, + "grad_norm": 1.34375, + "learning_rate": 2.7634228736254097e-06, + "loss": 0.8356, + "step": 14044 + }, + { + "epoch": 0.9770774635639501, + "grad_norm": 0.91015625, + "learning_rate": 2.7467078915835996e-06, + "loss": 0.5747, + "step": 14045 + }, + { + "epoch": 0.9771470312010853, + "grad_norm": 0.9375, + "learning_rate": 2.730043545037364e-06, + "loss": 0.8995, + "step": 14046 + }, + { + "epoch": 0.9772165988382204, + "grad_norm": 1.578125, + "learning_rate": 2.7134298348330257e-06, + "loss": 0.9942, + "step": 14047 + }, + { + "epoch": 0.9772861664753557, + "grad_norm": 1.21875, + "learning_rate": 2.6968667618140207e-06, + "loss": 0.9477, + "step": 14048 + }, + { + "epoch": 0.9773557341124909, + "grad_norm": 1.1171875, + "learning_rate": 2.6803543268213436e-06, + "loss": 0.725, + "step": 14049 + }, + { + "epoch": 0.977425301749626, + "grad_norm": 1.0390625, + "learning_rate": 2.663892530693324e-06, + "loss": 0.843, + "step": 14050 + }, + { + "epoch": 0.9774948693867613, + "grad_norm": 1.1484375, + "learning_rate": 2.6474813742659587e-06, + "loss": 0.7772, + "step": 14051 + }, + { + "epoch": 0.9775644370238965, + "grad_norm": 1.1484375, + "learning_rate": 2.631120858372471e-06, + "loss": 0.8957, + "step": 14052 + }, + { + "epoch": 0.9776340046610317, + "grad_norm": 1.328125, + "learning_rate": 2.6148109838435297e-06, + "loss": 0.8659, + "step": 14053 + }, + { + "epoch": 0.9777035722981668, + "grad_norm": 1.0546875, + "learning_rate": 2.59855175150725e-06, + "loss": 0.8784, + "step": 14054 + }, + { + "epoch": 0.9777731399353021, + "grad_norm": 1.2109375, + "learning_rate": 2.5823431621893046e-06, + "loss": 0.7079, + "step": 14055 + }, + { + "epoch": 0.9778427075724373, + "grad_norm": 1.3828125, + "learning_rate": 2.566185216712591e-06, + "loss": 0.8376, + "step": 14056 + }, + { + "epoch": 0.9779122752095725, + "grad_norm": 1.4140625, + "learning_rate": 2.550077915897564e-06, + "loss": 0.9035, + "step": 14057 + }, + { + "epoch": 0.9779818428467077, + "grad_norm": 0.78125, + "learning_rate": 2.534021260562014e-06, + "loss": 0.597, + "step": 14058 + }, + { + "epoch": 0.9780514104838429, + "grad_norm": 0.984375, + "learning_rate": 2.5180152515212885e-06, + "loss": 0.8717, + "step": 14059 + }, + { + "epoch": 0.9781209781209781, + "grad_norm": 1.140625, + "learning_rate": 2.5020598895880706e-06, + "loss": 0.985, + "step": 14060 + }, + { + "epoch": 0.9781905457581134, + "grad_norm": 1.3671875, + "learning_rate": 2.486155175572491e-06, + "loss": 0.7367, + "step": 14061 + }, + { + "epoch": 0.9782601133952485, + "grad_norm": 1.296875, + "learning_rate": 2.470301110282236e-06, + "loss": 0.9985, + "step": 14062 + }, + { + "epoch": 0.9783296810323837, + "grad_norm": 1.140625, + "learning_rate": 2.4544976945219953e-06, + "loss": 0.7257, + "step": 14063 + }, + { + "epoch": 0.978399248669519, + "grad_norm": 0.90625, + "learning_rate": 2.438744929094461e-06, + "loss": 0.7439, + "step": 14064 + }, + { + "epoch": 0.9784688163066542, + "grad_norm": 0.90234375, + "learning_rate": 2.4230428147992146e-06, + "loss": 0.66, + "step": 14065 + }, + { + "epoch": 0.9785383839437893, + "grad_norm": 1.21875, + "learning_rate": 2.407391352433841e-06, + "loss": 0.7612, + "step": 14066 + }, + { + "epoch": 0.9786079515809245, + "grad_norm": 1.0078125, + "learning_rate": 2.3917905427929265e-06, + "loss": 0.649, + "step": 14067 + }, + { + "epoch": 0.9786775192180598, + "grad_norm": 0.9140625, + "learning_rate": 2.3762403866685046e-06, + "loss": 0.8349, + "step": 14068 + }, + { + "epoch": 0.978747086855195, + "grad_norm": 1.390625, + "learning_rate": 2.3607408848501655e-06, + "loss": 0.7158, + "step": 14069 + }, + { + "epoch": 0.9788166544923301, + "grad_norm": 1.1796875, + "learning_rate": 2.3452920381249466e-06, + "loss": 1.0078, + "step": 14070 + }, + { + "epoch": 0.9788862221294654, + "grad_norm": 0.95703125, + "learning_rate": 2.329893847277331e-06, + "loss": 0.9462, + "step": 14071 + }, + { + "epoch": 0.9789557897666006, + "grad_norm": 1.03125, + "learning_rate": 2.3145463130890276e-06, + "loss": 0.7862, + "step": 14072 + }, + { + "epoch": 0.9790253574037358, + "grad_norm": 1.0546875, + "learning_rate": 2.299249436339301e-06, + "loss": 0.9804, + "step": 14073 + }, + { + "epoch": 0.979094925040871, + "grad_norm": 1.3515625, + "learning_rate": 2.284003217804864e-06, + "loss": 0.9441, + "step": 14074 + }, + { + "epoch": 0.9791644926780062, + "grad_norm": 1.2109375, + "learning_rate": 2.268807658259986e-06, + "loss": 0.9997, + "step": 14075 + }, + { + "epoch": 0.9792340603151414, + "grad_norm": 1.296875, + "learning_rate": 2.2536627584761603e-06, + "loss": 1.0353, + "step": 14076 + }, + { + "epoch": 0.9793036279522767, + "grad_norm": 1.296875, + "learning_rate": 2.2385685192222173e-06, + "loss": 0.9707, + "step": 14077 + }, + { + "epoch": 0.9793731955894118, + "grad_norm": 1.2109375, + "learning_rate": 2.2235249412647653e-06, + "loss": 0.7933, + "step": 14078 + }, + { + "epoch": 0.979442763226547, + "grad_norm": 1.453125, + "learning_rate": 2.2085320253674155e-06, + "loss": 0.9587, + "step": 14079 + }, + { + "epoch": 0.9795123308636822, + "grad_norm": 0.92578125, + "learning_rate": 2.193589772291671e-06, + "loss": 0.6871, + "step": 14080 + }, + { + "epoch": 0.9795818985008174, + "grad_norm": 1.5703125, + "learning_rate": 2.178698182796146e-06, + "loss": 1.0512, + "step": 14081 + }, + { + "epoch": 0.9796514661379526, + "grad_norm": 1.0390625, + "learning_rate": 2.163857257636903e-06, + "loss": 0.6405, + "step": 14082 + }, + { + "epoch": 0.9797210337750878, + "grad_norm": 0.7734375, + "learning_rate": 2.1490669975674506e-06, + "loss": 0.6477, + "step": 14083 + }, + { + "epoch": 0.9797906014122231, + "grad_norm": 1.28125, + "learning_rate": 2.134327403338854e-06, + "loss": 0.9768, + "step": 14084 + }, + { + "epoch": 0.9798601690493582, + "grad_norm": 1.234375, + "learning_rate": 2.1196384756995145e-06, + "loss": 0.6931, + "step": 14085 + }, + { + "epoch": 0.9799297366864934, + "grad_norm": 1.2109375, + "learning_rate": 2.105000215395281e-06, + "loss": 0.8718, + "step": 14086 + }, + { + "epoch": 0.9799993043236287, + "grad_norm": 1.09375, + "learning_rate": 2.0904126231693355e-06, + "loss": 0.7593, + "step": 14087 + }, + { + "epoch": 0.9800688719607639, + "grad_norm": 1.125, + "learning_rate": 2.0758756997624194e-06, + "loss": 0.6853, + "step": 14088 + }, + { + "epoch": 0.980138439597899, + "grad_norm": 0.90625, + "learning_rate": 2.0613894459127204e-06, + "loss": 0.7454, + "step": 14089 + }, + { + "epoch": 0.9802080072350342, + "grad_norm": 0.96484375, + "learning_rate": 2.0469538623555385e-06, + "loss": 0.7557, + "step": 14090 + }, + { + "epoch": 0.9802775748721695, + "grad_norm": 1.09375, + "learning_rate": 2.032568949824065e-06, + "loss": 0.5998, + "step": 14091 + }, + { + "epoch": 0.9803471425093047, + "grad_norm": 1.0, + "learning_rate": 2.0182347090484944e-06, + "loss": 0.7196, + "step": 14092 + }, + { + "epoch": 0.9804167101464398, + "grad_norm": 1.2734375, + "learning_rate": 2.00395114075691e-06, + "loss": 0.9062, + "step": 14093 + }, + { + "epoch": 0.9804862777835751, + "grad_norm": 1.109375, + "learning_rate": 1.989718245674288e-06, + "loss": 0.8441, + "step": 14094 + }, + { + "epoch": 0.9805558454207103, + "grad_norm": 1.203125, + "learning_rate": 1.9755360245236055e-06, + "loss": 0.7855, + "step": 14095 + }, + { + "epoch": 0.9806254130578455, + "grad_norm": 1.015625, + "learning_rate": 1.9614044780246198e-06, + "loss": 0.9628, + "step": 14096 + }, + { + "epoch": 0.9806949806949807, + "grad_norm": 1.0234375, + "learning_rate": 1.9473236068950905e-06, + "loss": 0.8135, + "step": 14097 + }, + { + "epoch": 0.9807645483321159, + "grad_norm": 0.94140625, + "learning_rate": 1.93329341184989e-06, + "loss": 0.6462, + "step": 14098 + }, + { + "epoch": 0.9808341159692511, + "grad_norm": 1.0625, + "learning_rate": 1.9193138936014488e-06, + "loss": 0.5737, + "step": 14099 + }, + { + "epoch": 0.9809036836063864, + "grad_norm": 0.8828125, + "learning_rate": 1.9053850528595318e-06, + "loss": 0.6714, + "step": 14100 + }, + { + "epoch": 0.9809732512435215, + "grad_norm": 1.1328125, + "learning_rate": 1.8915068903313515e-06, + "loss": 0.8413, + "step": 14101 + }, + { + "epoch": 0.9810428188806567, + "grad_norm": 1.125, + "learning_rate": 1.8776794067216774e-06, + "loss": 0.6286, + "step": 14102 + }, + { + "epoch": 0.9811123865177919, + "grad_norm": 1.1484375, + "learning_rate": 1.8639026027325035e-06, + "loss": 0.8265, + "step": 14103 + }, + { + "epoch": 0.9811819541549271, + "grad_norm": 1.453125, + "learning_rate": 1.8501764790633814e-06, + "loss": 0.7367, + "step": 14104 + }, + { + "epoch": 0.9812515217920623, + "grad_norm": 0.953125, + "learning_rate": 1.8365010364113089e-06, + "loss": 0.8278, + "step": 14105 + }, + { + "epoch": 0.9813210894291975, + "grad_norm": 1.6875, + "learning_rate": 1.8228762754705086e-06, + "loss": 0.925, + "step": 14106 + }, + { + "epoch": 0.9813906570663328, + "grad_norm": 1.2265625, + "learning_rate": 1.8093021969328716e-06, + "loss": 0.7409, + "step": 14107 + }, + { + "epoch": 0.9814602247034679, + "grad_norm": 0.8671875, + "learning_rate": 1.7957788014877352e-06, + "loss": 0.7799, + "step": 14108 + }, + { + "epoch": 0.9815297923406031, + "grad_norm": 0.9375, + "learning_rate": 1.7823060898214395e-06, + "loss": 0.8179, + "step": 14109 + }, + { + "epoch": 0.9815993599777384, + "grad_norm": 1.21875, + "learning_rate": 1.7688840626184367e-06, + "loss": 0.7299, + "step": 14110 + }, + { + "epoch": 0.9816689276148736, + "grad_norm": 1.3203125, + "learning_rate": 1.7555127205598487e-06, + "loss": 0.8655, + "step": 14111 + }, + { + "epoch": 0.9817384952520087, + "grad_norm": 1.0546875, + "learning_rate": 1.74219206432491e-06, + "loss": 0.7925, + "step": 14112 + }, + { + "epoch": 0.981808062889144, + "grad_norm": 0.99609375, + "learning_rate": 1.7289220945898576e-06, + "loss": 0.6584, + "step": 14113 + }, + { + "epoch": 0.9818776305262792, + "grad_norm": 1.21875, + "learning_rate": 1.7157028120284857e-06, + "loss": 0.8806, + "step": 14114 + }, + { + "epoch": 0.9819471981634144, + "grad_norm": 0.9609375, + "learning_rate": 1.702534217312035e-06, + "loss": 0.8664, + "step": 14115 + }, + { + "epoch": 0.9820167658005495, + "grad_norm": 1.3125, + "learning_rate": 1.689416311109082e-06, + "loss": 0.7739, + "step": 14116 + }, + { + "epoch": 0.9820863334376848, + "grad_norm": 1.1953125, + "learning_rate": 1.6763490940856496e-06, + "loss": 0.6885, + "step": 14117 + }, + { + "epoch": 0.98215590107482, + "grad_norm": 0.890625, + "learning_rate": 1.6633325669054289e-06, + "loss": 0.6807, + "step": 14118 + }, + { + "epoch": 0.9822254687119552, + "grad_norm": 1.125, + "learning_rate": 1.6503667302290027e-06, + "loss": 0.8408, + "step": 14119 + }, + { + "epoch": 0.9822950363490904, + "grad_norm": 1.4140625, + "learning_rate": 1.6374515847149552e-06, + "loss": 0.8461, + "step": 14120 + }, + { + "epoch": 0.9823646039862256, + "grad_norm": 1.1328125, + "learning_rate": 1.6245871310190952e-06, + "loss": 0.9569, + "step": 14121 + }, + { + "epoch": 0.9824341716233608, + "grad_norm": 1.046875, + "learning_rate": 1.611773369794456e-06, + "loss": 0.8808, + "step": 14122 + }, + { + "epoch": 0.9825037392604961, + "grad_norm": 1.046875, + "learning_rate": 1.599010301691739e-06, + "loss": 0.6954, + "step": 14123 + }, + { + "epoch": 0.9825733068976312, + "grad_norm": 1.1015625, + "learning_rate": 1.5862979273588707e-06, + "loss": 1.0216, + "step": 14124 + }, + { + "epoch": 0.9826428745347664, + "grad_norm": 0.82421875, + "learning_rate": 1.5736362474415567e-06, + "loss": 0.7222, + "step": 14125 + }, + { + "epoch": 0.9827124421719017, + "grad_norm": 1.28125, + "learning_rate": 1.561025262582394e-06, + "loss": 0.8758, + "step": 14126 + }, + { + "epoch": 0.9827820098090368, + "grad_norm": 1.4296875, + "learning_rate": 1.5484649734219814e-06, + "loss": 0.7369, + "step": 14127 + }, + { + "epoch": 0.982851577446172, + "grad_norm": 1.0234375, + "learning_rate": 1.5359553805979198e-06, + "loss": 0.9374, + "step": 14128 + }, + { + "epoch": 0.9829211450833072, + "grad_norm": 1.0546875, + "learning_rate": 1.523496484745368e-06, + "loss": 0.8218, + "step": 14129 + }, + { + "epoch": 0.9829907127204425, + "grad_norm": 1.3359375, + "learning_rate": 1.5110882864970422e-06, + "loss": 0.7643, + "step": 14130 + }, + { + "epoch": 0.9830602803575776, + "grad_norm": 1.1171875, + "learning_rate": 1.4987307864828825e-06, + "loss": 0.8093, + "step": 14131 + }, + { + "epoch": 0.9831298479947128, + "grad_norm": 1.09375, + "learning_rate": 1.4864239853303873e-06, + "loss": 0.5871, + "step": 14132 + }, + { + "epoch": 0.9831994156318481, + "grad_norm": 1.0703125, + "learning_rate": 1.474167883664279e-06, + "loss": 0.7188, + "step": 14133 + }, + { + "epoch": 0.9832689832689833, + "grad_norm": 1.078125, + "learning_rate": 1.4619624821070599e-06, + "loss": 0.8958, + "step": 14134 + }, + { + "epoch": 0.9833385509061184, + "grad_norm": 1.2578125, + "learning_rate": 1.449807781278345e-06, + "loss": 0.8396, + "step": 14135 + }, + { + "epoch": 0.9834081185432537, + "grad_norm": 0.98046875, + "learning_rate": 1.4377037817954186e-06, + "loss": 0.7646, + "step": 14136 + }, + { + "epoch": 0.9834776861803889, + "grad_norm": 1.0234375, + "learning_rate": 1.425650484272678e-06, + "loss": 0.7525, + "step": 14137 + }, + { + "epoch": 0.9835472538175241, + "grad_norm": 1.265625, + "learning_rate": 1.4136478893221894e-06, + "loss": 0.7781, + "step": 14138 + }, + { + "epoch": 0.9836168214546593, + "grad_norm": 1.09375, + "learning_rate": 1.401695997553465e-06, + "loss": 0.831, + "step": 14139 + }, + { + "epoch": 0.9836863890917945, + "grad_norm": 1.03125, + "learning_rate": 1.3897948095733525e-06, + "loss": 0.9721, + "step": 14140 + }, + { + "epoch": 0.9837559567289297, + "grad_norm": 1.2265625, + "learning_rate": 1.3779443259860359e-06, + "loss": 0.7669, + "step": 14141 + }, + { + "epoch": 0.9838255243660649, + "grad_norm": 1.34375, + "learning_rate": 1.3661445473933664e-06, + "loss": 0.8089, + "step": 14142 + }, + { + "epoch": 0.9838950920032001, + "grad_norm": 0.9921875, + "learning_rate": 1.35439547439431e-06, + "loss": 0.674, + "step": 14143 + }, + { + "epoch": 0.9839646596403353, + "grad_norm": 0.7890625, + "learning_rate": 1.3426971075855e-06, + "loss": 0.5941, + "step": 14144 + }, + { + "epoch": 0.9840342272774705, + "grad_norm": 0.9609375, + "learning_rate": 1.3310494475609058e-06, + "loss": 0.6877, + "step": 14145 + }, + { + "epoch": 0.9841037949146058, + "grad_norm": 1.2578125, + "learning_rate": 1.3194524949119435e-06, + "loss": 0.6499, + "step": 14146 + }, + { + "epoch": 0.9841733625517409, + "grad_norm": 1.2421875, + "learning_rate": 1.3079062502275863e-06, + "loss": 0.8235, + "step": 14147 + }, + { + "epoch": 0.9842429301888761, + "grad_norm": 1.375, + "learning_rate": 1.2964107140938096e-06, + "loss": 0.8694, + "step": 14148 + }, + { + "epoch": 0.9843124978260114, + "grad_norm": 1.1015625, + "learning_rate": 1.2849658870945914e-06, + "loss": 0.9599, + "step": 14149 + }, + { + "epoch": 0.9843820654631465, + "grad_norm": 0.84765625, + "learning_rate": 1.2735717698107996e-06, + "loss": 0.5055, + "step": 14150 + }, + { + "epoch": 0.9844516331002817, + "grad_norm": 1.171875, + "learning_rate": 1.262228362821194e-06, + "loss": 1.1511, + "step": 14151 + }, + { + "epoch": 0.984521200737417, + "grad_norm": 1.0859375, + "learning_rate": 1.250935666701536e-06, + "loss": 0.7013, + "step": 14152 + }, + { + "epoch": 0.9845907683745522, + "grad_norm": 1.1328125, + "learning_rate": 1.2396936820252557e-06, + "loss": 0.8599, + "step": 14153 + }, + { + "epoch": 0.9846603360116873, + "grad_norm": 1.140625, + "learning_rate": 1.2285024093632303e-06, + "loss": 0.8528, + "step": 14154 + }, + { + "epoch": 0.9847299036488225, + "grad_norm": 0.9609375, + "learning_rate": 1.2173618492837823e-06, + "loss": 0.8547, + "step": 14155 + }, + { + "epoch": 0.9847994712859578, + "grad_norm": 1.109375, + "learning_rate": 1.2062720023523488e-06, + "loss": 1.0045, + "step": 14156 + }, + { + "epoch": 0.984869038923093, + "grad_norm": 0.9765625, + "learning_rate": 1.1952328691321457e-06, + "loss": 0.5509, + "step": 14157 + }, + { + "epoch": 0.9849386065602281, + "grad_norm": 1.40625, + "learning_rate": 1.1842444501837245e-06, + "loss": 0.9216, + "step": 14158 + }, + { + "epoch": 0.9850081741973634, + "grad_norm": 1.09375, + "learning_rate": 1.1733067460649727e-06, + "loss": 0.8681, + "step": 14159 + }, + { + "epoch": 0.9850777418344986, + "grad_norm": 0.91796875, + "learning_rate": 1.1624197573312234e-06, + "loss": 0.8702, + "step": 14160 + }, + { + "epoch": 0.9851473094716338, + "grad_norm": 1.0390625, + "learning_rate": 1.1515834845352568e-06, + "loss": 0.859, + "step": 14161 + }, + { + "epoch": 0.985216877108769, + "grad_norm": 1.2109375, + "learning_rate": 1.1407979282272996e-06, + "loss": 0.6349, + "step": 14162 + }, + { + "epoch": 0.9852864447459042, + "grad_norm": 1.2109375, + "learning_rate": 1.1300630889550245e-06, + "loss": 0.6855, + "step": 14163 + }, + { + "epoch": 0.9853560123830394, + "grad_norm": 1.0859375, + "learning_rate": 1.1193789672634402e-06, + "loss": 0.8024, + "step": 14164 + }, + { + "epoch": 0.9854255800201747, + "grad_norm": 1.078125, + "learning_rate": 1.1087455636951128e-06, + "loss": 0.6758, + "step": 14165 + }, + { + "epoch": 0.9854951476573098, + "grad_norm": 1.0234375, + "learning_rate": 1.0981628787898323e-06, + "loss": 0.612, + "step": 14166 + }, + { + "epoch": 0.985564715294445, + "grad_norm": 1.015625, + "learning_rate": 1.0876309130850582e-06, + "loss": 0.7018, + "step": 14167 + }, + { + "epoch": 0.9856342829315802, + "grad_norm": 1.1953125, + "learning_rate": 1.0771496671154736e-06, + "loss": 0.8467, + "step": 14168 + }, + { + "epoch": 0.9857038505687155, + "grad_norm": 1.2265625, + "learning_rate": 1.0667191414133192e-06, + "loss": 0.8525, + "step": 14169 + }, + { + "epoch": 0.9857734182058506, + "grad_norm": 1.4453125, + "learning_rate": 1.0563393365080609e-06, + "loss": 1.0241, + "step": 14170 + }, + { + "epoch": 0.9858429858429858, + "grad_norm": 0.97265625, + "learning_rate": 1.0460102529269432e-06, + "loss": 0.7085, + "step": 14171 + }, + { + "epoch": 0.9859125534801211, + "grad_norm": 1.546875, + "learning_rate": 1.0357318911943247e-06, + "loss": 0.9521, + "step": 14172 + }, + { + "epoch": 0.9859821211172562, + "grad_norm": 0.98046875, + "learning_rate": 1.0255042518320102e-06, + "loss": 0.8574, + "step": 14173 + }, + { + "epoch": 0.9860516887543914, + "grad_norm": 0.96875, + "learning_rate": 1.0153273353594727e-06, + "loss": 0.6841, + "step": 14174 + }, + { + "epoch": 0.9861212563915267, + "grad_norm": 1.015625, + "learning_rate": 1.0052011422932994e-06, + "loss": 0.5727, + "step": 14175 + }, + { + "epoch": 0.9861908240286619, + "grad_norm": 1.0703125, + "learning_rate": 9.951256731477453e-07, + "loss": 0.6811, + "step": 14176 + }, + { + "epoch": 0.986260391665797, + "grad_norm": 1.1953125, + "learning_rate": 9.851009284344016e-07, + "loss": 0.8459, + "step": 14177 + }, + { + "epoch": 0.9863299593029323, + "grad_norm": 1.0546875, + "learning_rate": 9.751269086620829e-07, + "loss": 0.642, + "step": 14178 + }, + { + "epoch": 0.9863995269400675, + "grad_norm": 1.5546875, + "learning_rate": 9.652036143374953e-07, + "loss": 0.965, + "step": 14179 + }, + { + "epoch": 0.9864690945772027, + "grad_norm": 1.2734375, + "learning_rate": 9.55331045964236e-07, + "loss": 0.9367, + "step": 14180 + }, + { + "epoch": 0.9865386622143378, + "grad_norm": 1.1328125, + "learning_rate": 9.455092040437929e-07, + "loss": 0.6349, + "step": 14181 + }, + { + "epoch": 0.9866082298514731, + "grad_norm": 0.98046875, + "learning_rate": 9.357380890747668e-07, + "loss": 0.5858, + "step": 14182 + }, + { + "epoch": 0.9866777974886083, + "grad_norm": 0.9375, + "learning_rate": 9.260177015533167e-07, + "loss": 0.6042, + "step": 14183 + }, + { + "epoch": 0.9867473651257435, + "grad_norm": 1.1015625, + "learning_rate": 9.163480419729365e-07, + "loss": 0.6278, + "step": 14184 + }, + { + "epoch": 0.9868169327628787, + "grad_norm": 1.0859375, + "learning_rate": 9.067291108246778e-07, + "loss": 0.9643, + "step": 14185 + }, + { + "epoch": 0.9868865004000139, + "grad_norm": 0.8671875, + "learning_rate": 8.971609085969279e-07, + "loss": 0.5668, + "step": 14186 + }, + { + "epoch": 0.9869560680371491, + "grad_norm": 1.171875, + "learning_rate": 8.876434357755203e-07, + "loss": 0.8407, + "step": 14187 + }, + { + "epoch": 0.9870256356742844, + "grad_norm": 1.1484375, + "learning_rate": 8.781766928436241e-07, + "loss": 0.8556, + "step": 14188 + }, + { + "epoch": 0.9870952033114195, + "grad_norm": 1.0390625, + "learning_rate": 8.687606802819659e-07, + "loss": 0.6536, + "step": 14189 + }, + { + "epoch": 0.9871647709485547, + "grad_norm": 1.453125, + "learning_rate": 8.593953985687186e-07, + "loss": 0.8054, + "step": 14190 + }, + { + "epoch": 0.98723433858569, + "grad_norm": 1.078125, + "learning_rate": 8.500808481792799e-07, + "loss": 0.6906, + "step": 14191 + }, + { + "epoch": 0.9873039062228252, + "grad_norm": 0.90625, + "learning_rate": 8.408170295866046e-07, + "loss": 0.6133, + "step": 14192 + }, + { + "epoch": 0.9873734738599603, + "grad_norm": 0.8828125, + "learning_rate": 8.316039432612055e-07, + "loss": 0.5521, + "step": 14193 + }, + { + "epoch": 0.9874430414970955, + "grad_norm": 1.359375, + "learning_rate": 8.224415896705972e-07, + "loss": 0.7895, + "step": 14194 + }, + { + "epoch": 0.9875126091342308, + "grad_norm": 1.046875, + "learning_rate": 8.133299692804075e-07, + "loss": 0.9231, + "step": 14195 + }, + { + "epoch": 0.987582176771366, + "grad_norm": 1.0234375, + "learning_rate": 8.042690825529331e-07, + "loss": 0.7716, + "step": 14196 + }, + { + "epoch": 0.9876517444085011, + "grad_norm": 1.0625, + "learning_rate": 7.952589299483615e-07, + "loss": 0.75, + "step": 14197 + }, + { + "epoch": 0.9877213120456364, + "grad_norm": 1.5, + "learning_rate": 7.862995119241045e-07, + "loss": 0.9462, + "step": 14198 + }, + { + "epoch": 0.9877908796827716, + "grad_norm": 0.96484375, + "learning_rate": 7.773908289352427e-07, + "loss": 0.8243, + "step": 14199 + }, + { + "epoch": 0.9878604473199067, + "grad_norm": 1.4921875, + "learning_rate": 7.685328814339698e-07, + "loss": 1.1487, + "step": 14200 + }, + { + "epoch": 0.987930014957042, + "grad_norm": 1.1875, + "learning_rate": 7.597256698701482e-07, + "loss": 0.9426, + "step": 14201 + }, + { + "epoch": 0.9879995825941772, + "grad_norm": 1.171875, + "learning_rate": 7.509691946908648e-07, + "loss": 0.785, + "step": 14202 + }, + { + "epoch": 0.9880691502313124, + "grad_norm": 1.125, + "learning_rate": 7.422634563407638e-07, + "loss": 0.989, + "step": 14203 + }, + { + "epoch": 0.9881387178684476, + "grad_norm": 1.0703125, + "learning_rate": 7.33608455261936e-07, + "loss": 0.9506, + "step": 14204 + }, + { + "epoch": 0.9882082855055828, + "grad_norm": 0.875, + "learning_rate": 7.250041918938077e-07, + "loss": 0.5925, + "step": 14205 + }, + { + "epoch": 0.988277853142718, + "grad_norm": 1.1640625, + "learning_rate": 7.164506666732518e-07, + "loss": 0.8583, + "step": 14206 + }, + { + "epoch": 0.9883474207798532, + "grad_norm": 1.296875, + "learning_rate": 7.079478800344763e-07, + "loss": 0.835, + "step": 14207 + }, + { + "epoch": 0.9884169884169884, + "grad_norm": 1.265625, + "learning_rate": 6.994958324093581e-07, + "loss": 0.7567, + "step": 14208 + }, + { + "epoch": 0.9884865560541236, + "grad_norm": 1.1328125, + "learning_rate": 6.910945242269983e-07, + "loss": 0.8193, + "step": 14209 + }, + { + "epoch": 0.9885561236912588, + "grad_norm": 1.25, + "learning_rate": 6.827439559140558e-07, + "loss": 0.8375, + "step": 14210 + }, + { + "epoch": 0.9886256913283941, + "grad_norm": 0.9140625, + "learning_rate": 6.744441278943025e-07, + "loss": 0.7245, + "step": 14211 + }, + { + "epoch": 0.9886952589655292, + "grad_norm": 1.34375, + "learning_rate": 6.661950405894013e-07, + "loss": 0.8492, + "step": 14212 + }, + { + "epoch": 0.9887648266026644, + "grad_norm": 1.4765625, + "learning_rate": 6.579966944180172e-07, + "loss": 0.8967, + "step": 14213 + }, + { + "epoch": 0.9888343942397997, + "grad_norm": 1.0703125, + "learning_rate": 6.498490897965948e-07, + "loss": 0.6543, + "step": 14214 + }, + { + "epoch": 0.9889039618769349, + "grad_norm": 1.125, + "learning_rate": 6.417522271386922e-07, + "loss": 0.7294, + "step": 14215 + }, + { + "epoch": 0.98897352951407, + "grad_norm": 1.25, + "learning_rate": 6.33706106855425e-07, + "loss": 0.7326, + "step": 14216 + }, + { + "epoch": 0.9890430971512053, + "grad_norm": 1.203125, + "learning_rate": 6.257107293554664e-07, + "loss": 0.9212, + "step": 14217 + }, + { + "epoch": 0.9891126647883405, + "grad_norm": 1.1875, + "learning_rate": 6.177660950446029e-07, + "loss": 0.9369, + "step": 14218 + }, + { + "epoch": 0.9891822324254756, + "grad_norm": 1.2421875, + "learning_rate": 6.098722043264005e-07, + "loss": 0.7213, + "step": 14219 + }, + { + "epoch": 0.9892518000626108, + "grad_norm": 1.125, + "learning_rate": 6.020290576015386e-07, + "loss": 0.8029, + "step": 14220 + }, + { + "epoch": 0.9893213676997461, + "grad_norm": 2.359375, + "learning_rate": 5.942366552683654e-07, + "loss": 0.724, + "step": 14221 + }, + { + "epoch": 0.9893909353368813, + "grad_norm": 1.203125, + "learning_rate": 5.864949977224532e-07, + "loss": 0.5431, + "step": 14222 + }, + { + "epoch": 0.9894605029740164, + "grad_norm": 1.2578125, + "learning_rate": 5.788040853568211e-07, + "loss": 0.772, + "step": 14223 + }, + { + "epoch": 0.9895300706111517, + "grad_norm": 1.0859375, + "learning_rate": 5.711639185621564e-07, + "loss": 0.7427, + "step": 14224 + }, + { + "epoch": 0.9895996382482869, + "grad_norm": 1.0390625, + "learning_rate": 5.635744977262603e-07, + "loss": 0.7589, + "step": 14225 + }, + { + "epoch": 0.9896692058854221, + "grad_norm": 0.98046875, + "learning_rate": 5.560358232344909e-07, + "loss": 0.716, + "step": 14226 + }, + { + "epoch": 0.9897387735225573, + "grad_norm": 1.25, + "learning_rate": 5.485478954697643e-07, + "loss": 1.0223, + "step": 14227 + }, + { + "epoch": 0.9898083411596925, + "grad_norm": 1.171875, + "learning_rate": 5.411107148119987e-07, + "loss": 0.9527, + "step": 14228 + }, + { + "epoch": 0.9898779087968277, + "grad_norm": 1.09375, + "learning_rate": 5.337242816391142e-07, + "loss": 0.7207, + "step": 14229 + }, + { + "epoch": 0.989947476433963, + "grad_norm": 1.21875, + "learning_rate": 5.263885963260329e-07, + "loss": 0.8742, + "step": 14230 + }, + { + "epoch": 0.9900170440710981, + "grad_norm": 1.2109375, + "learning_rate": 5.191036592451237e-07, + "loss": 0.6883, + "step": 14231 + }, + { + "epoch": 0.9900866117082333, + "grad_norm": 1.1484375, + "learning_rate": 5.11869470766424e-07, + "loss": 0.8554, + "step": 14232 + }, + { + "epoch": 0.9901561793453685, + "grad_norm": 1.234375, + "learning_rate": 5.046860312571955e-07, + "loss": 0.8186, + "step": 14233 + }, + { + "epoch": 0.9902257469825038, + "grad_norm": 1.2265625, + "learning_rate": 4.975533410821465e-07, + "loss": 1.1046, + "step": 14234 + }, + { + "epoch": 0.9902953146196389, + "grad_norm": 1.21875, + "learning_rate": 4.904714006035427e-07, + "loss": 0.7421, + "step": 14235 + }, + { + "epoch": 0.9903648822567741, + "grad_norm": 1.1171875, + "learning_rate": 4.834402101808743e-07, + "loss": 0.7434, + "step": 14236 + }, + { + "epoch": 0.9904344498939094, + "grad_norm": 1.4609375, + "learning_rate": 4.7645977017118926e-07, + "loss": 1.1464, + "step": 14237 + }, + { + "epoch": 0.9905040175310446, + "grad_norm": 1.0, + "learning_rate": 4.695300809288705e-07, + "loss": 0.9736, + "step": 14238 + }, + { + "epoch": 0.9905735851681797, + "grad_norm": 1.0703125, + "learning_rate": 4.626511428058588e-07, + "loss": 0.8671, + "step": 14239 + }, + { + "epoch": 0.990643152805315, + "grad_norm": 1.171875, + "learning_rate": 4.558229561513194e-07, + "loss": 0.6504, + "step": 14240 + }, + { + "epoch": 0.9907127204424502, + "grad_norm": 1.2890625, + "learning_rate": 4.4904552131197485e-07, + "loss": 0.8912, + "step": 14241 + }, + { + "epoch": 0.9907822880795853, + "grad_norm": 1.1484375, + "learning_rate": 4.423188386321053e-07, + "loss": 0.6163, + "step": 14242 + }, + { + "epoch": 0.9908518557167206, + "grad_norm": 1.1484375, + "learning_rate": 4.356429084531044e-07, + "loss": 0.8492, + "step": 14243 + }, + { + "epoch": 0.9909214233538558, + "grad_norm": 0.9921875, + "learning_rate": 4.2901773111392317e-07, + "loss": 0.8083, + "step": 14244 + }, + { + "epoch": 0.990990990990991, + "grad_norm": 1.1484375, + "learning_rate": 4.2244330695107024e-07, + "loss": 0.7564, + "step": 14245 + }, + { + "epoch": 0.9910605586281261, + "grad_norm": 1.1953125, + "learning_rate": 4.1591963629827867e-07, + "loss": 0.8627, + "step": 14246 + }, + { + "epoch": 0.9911301262652614, + "grad_norm": 1.15625, + "learning_rate": 4.09446719486839e-07, + "loss": 0.8946, + "step": 14247 + }, + { + "epoch": 0.9911996939023966, + "grad_norm": 1.0, + "learning_rate": 4.030245568453772e-07, + "loss": 0.7983, + "step": 14248 + }, + { + "epoch": 0.9912692615395318, + "grad_norm": 1.203125, + "learning_rate": 3.966531486998548e-07, + "loss": 0.8036, + "step": 14249 + }, + { + "epoch": 0.991338829176667, + "grad_norm": 1.0859375, + "learning_rate": 3.9033249537412384e-07, + "loss": 0.8449, + "step": 14250 + }, + { + "epoch": 0.9914083968138022, + "grad_norm": 1.0703125, + "learning_rate": 3.8406259718881673e-07, + "loss": 0.8014, + "step": 14251 + }, + { + "epoch": 0.9914779644509374, + "grad_norm": 0.96484375, + "learning_rate": 3.7784345446234545e-07, + "loss": 0.7806, + "step": 14252 + }, + { + "epoch": 0.9915475320880727, + "grad_norm": 1.015625, + "learning_rate": 3.716750675104574e-07, + "loss": 0.6997, + "step": 14253 + }, + { + "epoch": 0.9916170997252078, + "grad_norm": 1.234375, + "learning_rate": 3.6555743664645757e-07, + "loss": 0.707, + "step": 14254 + }, + { + "epoch": 0.991686667362343, + "grad_norm": 1.4296875, + "learning_rate": 3.594905621809863e-07, + "loss": 0.9648, + "step": 14255 + }, + { + "epoch": 0.9917562349994783, + "grad_norm": 1.1484375, + "learning_rate": 3.534744444220195e-07, + "loss": 0.6445, + "step": 14256 + }, + { + "epoch": 0.9918258026366135, + "grad_norm": 1.046875, + "learning_rate": 3.4750908367497946e-07, + "loss": 0.9463, + "step": 14257 + }, + { + "epoch": 0.9918953702737486, + "grad_norm": 1.2265625, + "learning_rate": 3.415944802428461e-07, + "loss": 0.9668, + "step": 14258 + }, + { + "epoch": 0.9919649379108838, + "grad_norm": 1.25, + "learning_rate": 3.3573063442582376e-07, + "loss": 0.9442, + "step": 14259 + }, + { + "epoch": 0.9920345055480191, + "grad_norm": 1.265625, + "learning_rate": 3.299175465217852e-07, + "loss": 0.9371, + "step": 14260 + }, + { + "epoch": 0.9921040731851543, + "grad_norm": 1.0625, + "learning_rate": 3.241552168257167e-07, + "loss": 0.8297, + "step": 14261 + }, + { + "epoch": 0.9921736408222894, + "grad_norm": 1.3515625, + "learning_rate": 3.1844364563038407e-07, + "loss": 0.9003, + "step": 14262 + }, + { + "epoch": 0.9922432084594247, + "grad_norm": 0.96875, + "learning_rate": 3.127828332257776e-07, + "loss": 0.8385, + "step": 14263 + }, + { + "epoch": 0.9923127760965599, + "grad_norm": 1.0859375, + "learning_rate": 3.07172779899223e-07, + "loss": 0.8694, + "step": 14264 + }, + { + "epoch": 0.992382343733695, + "grad_norm": 1.09375, + "learning_rate": 3.016134859354924e-07, + "loss": 0.7328, + "step": 14265 + }, + { + "epoch": 0.9924519113708303, + "grad_norm": 1.3203125, + "learning_rate": 2.961049516171377e-07, + "loss": 0.911, + "step": 14266 + }, + { + "epoch": 0.9925214790079655, + "grad_norm": 1.6171875, + "learning_rate": 2.906471772236019e-07, + "loss": 0.773, + "step": 14267 + }, + { + "epoch": 0.9925910466451007, + "grad_norm": 1.0859375, + "learning_rate": 2.852401630321078e-07, + "loss": 0.7979, + "step": 14268 + }, + { + "epoch": 0.992660614282236, + "grad_norm": 1.1015625, + "learning_rate": 2.798839093172134e-07, + "loss": 0.7917, + "step": 14269 + }, + { + "epoch": 0.9927301819193711, + "grad_norm": 1.3359375, + "learning_rate": 2.745784163508125e-07, + "loss": 0.6746, + "step": 14270 + }, + { + "epoch": 0.9927997495565063, + "grad_norm": 1.3125, + "learning_rate": 2.693236844023561e-07, + "loss": 0.9603, + "step": 14271 + }, + { + "epoch": 0.9928693171936415, + "grad_norm": 1.296875, + "learning_rate": 2.6411971373863086e-07, + "loss": 0.9019, + "step": 14272 + }, + { + "epoch": 0.9929388848307767, + "grad_norm": 1.4375, + "learning_rate": 2.5896650462386985e-07, + "loss": 0.8084, + "step": 14273 + }, + { + "epoch": 0.9930084524679119, + "grad_norm": 1.1328125, + "learning_rate": 2.5386405731964157e-07, + "loss": 0.8645, + "step": 14274 + }, + { + "epoch": 0.9930780201050471, + "grad_norm": 0.9453125, + "learning_rate": 2.4881237208518313e-07, + "loss": 0.7938, + "step": 14275 + }, + { + "epoch": 0.9931475877421824, + "grad_norm": 1.125, + "learning_rate": 2.4381144917695606e-07, + "loss": 0.8516, + "step": 14276 + }, + { + "epoch": 0.9932171553793175, + "grad_norm": 0.99609375, + "learning_rate": 2.3886128884875737e-07, + "loss": 0.743, + "step": 14277 + }, + { + "epoch": 0.9932867230164527, + "grad_norm": 0.80078125, + "learning_rate": 2.3396189135205248e-07, + "loss": 0.7701, + "step": 14278 + }, + { + "epoch": 0.993356290653588, + "grad_norm": 1.046875, + "learning_rate": 2.2911325693553142e-07, + "loss": 0.8914, + "step": 14279 + }, + { + "epoch": 0.9934258582907232, + "grad_norm": 1.296875, + "learning_rate": 2.2431538584544164e-07, + "loss": 0.5938, + "step": 14280 + }, + { + "epoch": 0.9934954259278583, + "grad_norm": 1.2265625, + "learning_rate": 2.1956827832536608e-07, + "loss": 0.8855, + "step": 14281 + }, + { + "epoch": 0.9935649935649936, + "grad_norm": 1.109375, + "learning_rate": 2.1487193461633415e-07, + "loss": 0.8047, + "step": 14282 + }, + { + "epoch": 0.9936345612021288, + "grad_norm": 1.1875, + "learning_rate": 2.1022635495682174e-07, + "loss": 0.9119, + "step": 14283 + }, + { + "epoch": 0.993704128839264, + "grad_norm": 0.82421875, + "learning_rate": 2.0563153958275128e-07, + "loss": 0.8005, + "step": 14284 + }, + { + "epoch": 0.9937736964763991, + "grad_norm": 1.015625, + "learning_rate": 2.0108748872726956e-07, + "loss": 0.9094, + "step": 14285 + }, + { + "epoch": 0.9938432641135344, + "grad_norm": 0.91015625, + "learning_rate": 1.9659420262130302e-07, + "loss": 0.8076, + "step": 14286 + }, + { + "epoch": 0.9939128317506696, + "grad_norm": 1.296875, + "learning_rate": 1.9215168149289143e-07, + "loss": 0.953, + "step": 14287 + }, + { + "epoch": 0.9939823993878047, + "grad_norm": 1.1328125, + "learning_rate": 1.8775992556752108e-07, + "loss": 0.8002, + "step": 14288 + }, + { + "epoch": 0.99405196702494, + "grad_norm": 1.1015625, + "learning_rate": 1.8341893506834684e-07, + "loss": 0.8113, + "step": 14289 + }, + { + "epoch": 0.9941215346620752, + "grad_norm": 1.15625, + "learning_rate": 1.7912871021574794e-07, + "loss": 0.7923, + "step": 14290 + }, + { + "epoch": 0.9941911022992104, + "grad_norm": 0.90234375, + "learning_rate": 1.7488925122743916e-07, + "loss": 0.8136, + "step": 14291 + }, + { + "epoch": 0.9942606699363457, + "grad_norm": 1.203125, + "learning_rate": 1.7070055831880372e-07, + "loss": 0.7789, + "step": 14292 + }, + { + "epoch": 0.9943302375734808, + "grad_norm": 1.015625, + "learning_rate": 1.6656263170244934e-07, + "loss": 0.9456, + "step": 14293 + }, + { + "epoch": 0.994399805210616, + "grad_norm": 1.140625, + "learning_rate": 1.6247547158854125e-07, + "loss": 0.9678, + "step": 14294 + }, + { + "epoch": 0.9944693728477513, + "grad_norm": 1.109375, + "learning_rate": 1.5843907818458015e-07, + "loss": 0.8439, + "step": 14295 + }, + { + "epoch": 0.9945389404848864, + "grad_norm": 1.1171875, + "learning_rate": 1.5445345169551316e-07, + "loss": 0.7758, + "step": 14296 + }, + { + "epoch": 0.9946085081220216, + "grad_norm": 1.046875, + "learning_rate": 1.5051859232373398e-07, + "loss": 0.7314, + "step": 14297 + }, + { + "epoch": 0.9946780757591568, + "grad_norm": 1.2890625, + "learning_rate": 1.4663450026897174e-07, + "loss": 1.0071, + "step": 14298 + }, + { + "epoch": 0.9947476433962921, + "grad_norm": 0.96484375, + "learning_rate": 1.4280117572840202e-07, + "loss": 0.6489, + "step": 14299 + }, + { + "epoch": 0.9948172110334272, + "grad_norm": 1.0390625, + "learning_rate": 1.3901861889686895e-07, + "loss": 0.7407, + "step": 14300 + }, + { + "epoch": 0.9948867786705624, + "grad_norm": 0.83203125, + "learning_rate": 1.352868299662191e-07, + "loss": 0.6603, + "step": 14301 + }, + { + "epoch": 0.9949563463076977, + "grad_norm": 0.9296875, + "learning_rate": 1.3160580912596753e-07, + "loss": 0.8934, + "step": 14302 + }, + { + "epoch": 0.9950259139448329, + "grad_norm": 1.640625, + "learning_rate": 1.2797555656318682e-07, + "loss": 0.9611, + "step": 14303 + }, + { + "epoch": 0.995095481581968, + "grad_norm": 1.21875, + "learning_rate": 1.2439607246195194e-07, + "loss": 0.6547, + "step": 14304 + }, + { + "epoch": 0.9951650492191033, + "grad_norm": 1.0390625, + "learning_rate": 1.2086735700422846e-07, + "loss": 0.9057, + "step": 14305 + }, + { + "epoch": 0.9952346168562385, + "grad_norm": 1.109375, + "learning_rate": 1.1738941036909535e-07, + "loss": 0.9191, + "step": 14306 + }, + { + "epoch": 0.9953041844933737, + "grad_norm": 1.515625, + "learning_rate": 1.1396223273307804e-07, + "loss": 0.7479, + "step": 14307 + }, + { + "epoch": 0.9953737521305089, + "grad_norm": 0.9609375, + "learning_rate": 1.1058582427025954e-07, + "loss": 0.7768, + "step": 14308 + }, + { + "epoch": 0.9954433197676441, + "grad_norm": 1.296875, + "learning_rate": 1.0726018515216929e-07, + "loss": 0.6844, + "step": 14309 + }, + { + "epoch": 0.9955128874047793, + "grad_norm": 1.2265625, + "learning_rate": 1.0398531554745017e-07, + "loss": 0.8366, + "step": 14310 + }, + { + "epoch": 0.9955824550419144, + "grad_norm": 1.6640625, + "learning_rate": 1.0076121562263563e-07, + "loss": 0.8562, + "step": 14311 + }, + { + "epoch": 0.9956520226790497, + "grad_norm": 1.359375, + "learning_rate": 9.758788554126152e-08, + "loss": 0.812, + "step": 14312 + }, + { + "epoch": 0.9957215903161849, + "grad_norm": 1.1484375, + "learning_rate": 9.446532546442121e-08, + "loss": 0.9063, + "step": 14313 + }, + { + "epoch": 0.9957911579533201, + "grad_norm": 1.015625, + "learning_rate": 9.139353555076557e-08, + "loss": 0.688, + "step": 14314 + }, + { + "epoch": 0.9958607255904554, + "grad_norm": 1.3828125, + "learning_rate": 8.837251595628093e-08, + "loss": 0.8599, + "step": 14315 + }, + { + "epoch": 0.9959302932275905, + "grad_norm": 1.1796875, + "learning_rate": 8.540226683428908e-08, + "loss": 0.8422, + "step": 14316 + }, + { + "epoch": 0.9959998608647257, + "grad_norm": 1.1015625, + "learning_rate": 8.248278833566936e-08, + "loss": 0.7857, + "step": 14317 + }, + { + "epoch": 0.996069428501861, + "grad_norm": 0.90625, + "learning_rate": 7.961408060852549e-08, + "loss": 0.6352, + "step": 14318 + }, + { + "epoch": 0.9961389961389961, + "grad_norm": 1.171875, + "learning_rate": 7.679614379862976e-08, + "loss": 0.8828, + "step": 14319 + }, + { + "epoch": 0.9962085637761313, + "grad_norm": 1.40625, + "learning_rate": 7.402897804908992e-08, + "loss": 0.9948, + "step": 14320 + }, + { + "epoch": 0.9962781314132666, + "grad_norm": 1.1640625, + "learning_rate": 7.13125835003492e-08, + "loss": 0.9744, + "step": 14321 + }, + { + "epoch": 0.9963476990504018, + "grad_norm": 1.078125, + "learning_rate": 6.864696029029727e-08, + "loss": 0.6612, + "step": 14322 + }, + { + "epoch": 0.9964172666875369, + "grad_norm": 1.140625, + "learning_rate": 6.603210855438136e-08, + "loss": 0.705, + "step": 14323 + }, + { + "epoch": 0.9964868343246721, + "grad_norm": 1.4140625, + "learning_rate": 6.34680284252731e-08, + "loss": 0.9975, + "step": 14324 + }, + { + "epoch": 0.9965564019618074, + "grad_norm": 1.109375, + "learning_rate": 6.095472003320169e-08, + "loss": 0.5625, + "step": 14325 + }, + { + "epoch": 0.9966259695989426, + "grad_norm": 1.140625, + "learning_rate": 5.849218350573171e-08, + "loss": 0.9337, + "step": 14326 + }, + { + "epoch": 0.9966955372360777, + "grad_norm": 1.03125, + "learning_rate": 5.6080418968096346e-08, + "loss": 0.665, + "step": 14327 + }, + { + "epoch": 0.996765104873213, + "grad_norm": 0.9609375, + "learning_rate": 5.371942654242012e-08, + "loss": 0.8357, + "step": 14328 + }, + { + "epoch": 0.9968346725103482, + "grad_norm": 1.1328125, + "learning_rate": 5.14092063489402e-08, + "loss": 0.7825, + "step": 14329 + }, + { + "epoch": 0.9969042401474834, + "grad_norm": 1.046875, + "learning_rate": 4.914975850467407e-08, + "loss": 0.9078, + "step": 14330 + }, + { + "epoch": 0.9969738077846186, + "grad_norm": 1.2421875, + "learning_rate": 4.6941083124529824e-08, + "loss": 1.0158, + "step": 14331 + }, + { + "epoch": 0.9970433754217538, + "grad_norm": 1.0078125, + "learning_rate": 4.478318032052897e-08, + "loss": 0.8284, + "step": 14332 + }, + { + "epoch": 0.997112943058889, + "grad_norm": 1.3671875, + "learning_rate": 4.267605020236154e-08, + "loss": 1.2325, + "step": 14333 + }, + { + "epoch": 0.9971825106960243, + "grad_norm": 1.28125, + "learning_rate": 4.061969287683098e-08, + "loss": 0.8964, + "step": 14334 + }, + { + "epoch": 0.9972520783331594, + "grad_norm": 0.84375, + "learning_rate": 3.8614108448520316e-08, + "loss": 0.7729, + "step": 14335 + }, + { + "epoch": 0.9973216459702946, + "grad_norm": 1.046875, + "learning_rate": 3.665929701923698e-08, + "loss": 0.966, + "step": 14336 + }, + { + "epoch": 0.9973912136074298, + "grad_norm": 0.8984375, + "learning_rate": 3.475525868823493e-08, + "loss": 0.5503, + "step": 14337 + }, + { + "epoch": 0.997460781244565, + "grad_norm": 0.95703125, + "learning_rate": 3.2901993552103546e-08, + "loss": 0.7011, + "step": 14338 + }, + { + "epoch": 0.9975303488817002, + "grad_norm": 1.078125, + "learning_rate": 3.109950170498976e-08, + "loss": 0.9463, + "step": 14339 + }, + { + "epoch": 0.9975999165188354, + "grad_norm": 1.09375, + "learning_rate": 2.934778323848697e-08, + "loss": 0.7466, + "step": 14340 + }, + { + "epoch": 0.9976694841559707, + "grad_norm": 1.140625, + "learning_rate": 2.764683824141301e-08, + "loss": 0.9995, + "step": 14341 + }, + { + "epoch": 0.9977390517931058, + "grad_norm": 0.91796875, + "learning_rate": 2.5996666800254253e-08, + "loss": 0.6633, + "step": 14342 + }, + { + "epoch": 0.997808619430241, + "grad_norm": 2.296875, + "learning_rate": 2.4397268998721523e-08, + "loss": 1.3606, + "step": 14343 + }, + { + "epoch": 0.9978781870673763, + "grad_norm": 1.0703125, + "learning_rate": 2.2848644917972116e-08, + "loss": 0.6953, + "step": 14344 + }, + { + "epoch": 0.9979477547045115, + "grad_norm": 1.21875, + "learning_rate": 2.1350794636831872e-08, + "loss": 0.9079, + "step": 14345 + }, + { + "epoch": 0.9980173223416466, + "grad_norm": 1.2421875, + "learning_rate": 1.990371823112902e-08, + "loss": 0.9467, + "step": 14346 + }, + { + "epoch": 0.9980868899787819, + "grad_norm": 1.109375, + "learning_rate": 1.850741577447135e-08, + "loss": 0.6985, + "step": 14347 + }, + { + "epoch": 0.9981564576159171, + "grad_norm": 1.0, + "learning_rate": 1.7161887337802108e-08, + "loss": 0.6278, + "step": 14348 + }, + { + "epoch": 0.9982260252530523, + "grad_norm": 1.5703125, + "learning_rate": 1.586713298928899e-08, + "loss": 0.6759, + "step": 14349 + }, + { + "epoch": 0.9982955928901874, + "grad_norm": 1.15625, + "learning_rate": 1.4623152794768224e-08, + "loss": 0.6557, + "step": 14350 + }, + { + "epoch": 0.9983651605273227, + "grad_norm": 1.390625, + "learning_rate": 1.3429946817300476e-08, + "loss": 0.7722, + "step": 14351 + }, + { + "epoch": 0.9984347281644579, + "grad_norm": 1.0390625, + "learning_rate": 1.2287515117725968e-08, + "loss": 0.8084, + "step": 14352 + }, + { + "epoch": 0.9985042958015931, + "grad_norm": 0.98046875, + "learning_rate": 1.1195857753776295e-08, + "loss": 0.7004, + "step": 14353 + }, + { + "epoch": 0.9985738634387283, + "grad_norm": 0.9609375, + "learning_rate": 1.0154974780962611e-08, + "loss": 0.6947, + "step": 14354 + }, + { + "epoch": 0.9986434310758635, + "grad_norm": 1.0234375, + "learning_rate": 9.164866252242554e-09, + "loss": 0.674, + "step": 14355 + }, + { + "epoch": 0.9987129987129987, + "grad_norm": 1.09375, + "learning_rate": 8.225532217687181e-09, + "loss": 0.8365, + "step": 14356 + }, + { + "epoch": 0.998782566350134, + "grad_norm": 1.1015625, + "learning_rate": 7.336972725147106e-09, + "loss": 0.6817, + "step": 14357 + }, + { + "epoch": 0.9988521339872691, + "grad_norm": 1.015625, + "learning_rate": 6.499187819808405e-09, + "loss": 0.6803, + "step": 14358 + }, + { + "epoch": 0.9989217016244043, + "grad_norm": 1.0390625, + "learning_rate": 5.712177543970576e-09, + "loss": 0.8999, + "step": 14359 + }, + { + "epoch": 0.9989912692615396, + "grad_norm": 1.015625, + "learning_rate": 4.975941937823691e-09, + "loss": 0.7464, + "step": 14360 + }, + { + "epoch": 0.9990608368986748, + "grad_norm": 1.8359375, + "learning_rate": 4.290481038560223e-09, + "loss": 0.9455, + "step": 14361 + }, + { + "epoch": 0.9991304045358099, + "grad_norm": 1.0, + "learning_rate": 3.655794881152197e-09, + "loss": 0.7925, + "step": 14362 + }, + { + "epoch": 0.9991999721729451, + "grad_norm": 1.0546875, + "learning_rate": 3.0718834976850575e-09, + "loss": 0.656, + "step": 14363 + }, + { + "epoch": 0.9992695398100804, + "grad_norm": 1.2109375, + "learning_rate": 2.538746917912782e-09, + "loss": 0.6469, + "step": 14364 + }, + { + "epoch": 0.9993391074472155, + "grad_norm": 1.265625, + "learning_rate": 2.0563851688137903e-09, + "loss": 0.8579, + "step": 14365 + }, + { + "epoch": 0.9994086750843507, + "grad_norm": 0.9609375, + "learning_rate": 1.624798274924011e-09, + "loss": 0.7825, + "step": 14366 + }, + { + "epoch": 0.999478242721486, + "grad_norm": 1.03125, + "learning_rate": 1.24398625822586e-09, + "loss": 0.834, + "step": 14367 + }, + { + "epoch": 0.9995478103586212, + "grad_norm": 1.34375, + "learning_rate": 9.139491379261955e-10, + "loss": 1.0088, + "step": 14368 + }, + { + "epoch": 0.9996173779957563, + "grad_norm": 1.359375, + "learning_rate": 6.346869309004078e-10, + "loss": 0.7419, + "step": 14369 + }, + { + "epoch": 0.9996869456328916, + "grad_norm": 1.015625, + "learning_rate": 4.06199651248329e-10, + "loss": 0.6799, + "step": 14370 + }, + { + "epoch": 0.9997565132700268, + "grad_norm": 1.046875, + "learning_rate": 2.284873106273011e-10, + "loss": 0.9677, + "step": 14371 + }, + { + "epoch": 0.999826080907162, + "grad_norm": 0.98046875, + "learning_rate": 1.0154991791910816e-10, + "loss": 0.6741, + "step": 14372 + }, + { + "epoch": 0.9998956485442972, + "grad_norm": 1.4453125, + "learning_rate": 2.5387479785088375e-11, + "loss": 0.7453, + "step": 14373 + }, + { + "epoch": 0.9999652161814324, + "grad_norm": 1.171875, + "learning_rate": 0.0, + "loss": 1.1312, + "step": 14374 + } + ], + "logging_steps": 1, + "max_steps": 14374, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.7122538767292826e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}